From e549e57b8fd7b029dd2577fa2c6cea3fb1ec9daf Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:48:07 +1000 Subject: [PATCH 001/222] Initial code inclusion, conversion from bash scripts --- bin/python-get-heroku-video-s3-acls.py | 67 ++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100755 bin/python-get-heroku-video-s3-acls.py diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py new file mode 100755 index 00000000..41086a87 --- /dev/null +++ b/bin/python-get-heroku-video-s3-acls.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Permissions required: +# heroku cli - access to app +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html + +import os +import subprocess + + +# Setup +# TODO See how difficult using native API calls would be. +HEROKU = "/usr/bin/heroku" +AWS = "/usr/local/bin/aws" + +TMPDIR = "/tmp/nzsl" +try: + os.makedirs(TMPDIR, exist_ok=True) +except OSError as err: + print(f"Error creating directory: {err}") + exit() + +NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" +NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" +S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" +S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" +S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" +S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" +for p in ( + NZSL_RAW_KEYS_FILE, + NZSL_COOKED_KEYS_FILE, + S3_BUCKET_RAW_KEYS_FILE, + S3_BUCKET_ERROR_KEYS_FILE, + S3_BUCKET_CONTENTS_FILE, + S3_KEYS_NOT_IN_NZSL +): + f = open(p, "a") + f.truncate() + f.close() + +RUN_MODE = "production" +if RUN_MODE == "production": + print("PRODUCTION") + NZSL_APP = "nzsl-signbank-production" + AWS_S3_BUCKET = "nzsl-signbank-media-production" +else: + print("STAGING") + NZSL_APP = "nzsl-signbank-uat" + AWS_S3_BUCKET = "nzsl-signbank-media-uat" + +new_env = os.environ.copy() +new_env["AWS_PROFILE"] = "nzsl" + + +# Get all keys from S3 +print("Getting raw S3 keys recursively ($AWS_S3_BUCKET) ...") +with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) +num_lines = sum(1 for _ in open(S3_BUCKET_RAW_KEYS_FILE)) +print(f"{num_lines} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + + + + From c07c66a160aafaf54b27676d57e2cd31432ff73e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:11:40 +1000 Subject: [PATCH 002/222] S3 retrieval working, NZSL Signbank retrieval working --- bin/python-get-heroku-video-s3-acls.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 41086a87..2a119fcc 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -54,14 +54,31 @@ # Get all keys from S3 -print("Getting raw S3 keys recursively ($AWS_S3_BUCKET) ...") +""" +print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) num_lines = sum(1 for _ in open(S3_BUCKET_RAW_KEYS_FILE)) 
print(f"{num_lines} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") +""" +# Get the video file keys from NZSL Signbank +print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") +with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", + "-c", "select videofile, is_public from video_glossvideo"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) +# Remove the first 2 and last 2 lines, as we cannot control pg:psql +with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: + lines = f_obj.readlines() + lines = lines[2:] + lines = lines[:-2] + for x in lines: + print(x) - +#num_lines = sum(1 for _ in open(NZSL_RAW_KEYS_FILE)) +#print(f"{num_lines} rows retrieved: {NZSL_RAW_KEYS_FILE}") From f2191b8a5e23a98d099a3d47906d2a15429d4efc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:15:48 +1000 Subject: [PATCH 003/222] pg:psql header and footer removed --- bin/python-get-heroku-video-s3-acls.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 2a119fcc..e748a575 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -71,14 +71,12 @@ "-c", "select videofile, is_public from video_glossvideo"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -# Remove the first 2 and last 2 lines, as we cannot control pg:psql +# Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: lines = f_obj.readlines() lines = lines[2:] lines = lines[:-2] - for x in lines: - print(x) - -#num_lines = sum(1 for _ in open(NZSL_RAW_KEYS_FILE)) -#print(f"{num_lines} rows retrieved: {NZSL_RAW_KEYS_FILE}") +with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + f_obj.writelines(lines) +print(f"{len(lines)} rows retrieved: {NZSL_RAW_KEYS_FILE}") From b5ed4b93bfb19a7230d0effd9b9179c8814999f1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:17:31 +1000 Subject: [PATCH 004/222] Sorting newlines --- bin/python-get-heroku-video-s3-acls.py | 53 +++++++++++++++++++++----- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index e748a575..41d3b897 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,9 +5,16 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# FIXME +# Currently pulling all data into text files the way the bash script +# this python script is based on did it. +# We may be able to get away with losing some of the files and doing +# most of it in memory. + + import os import subprocess - +from pprint import pprint # Setup # TODO See how difficult using native API calls would be. 
@@ -54,15 +61,26 @@ # Get all keys from S3 -""" print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -num_lines = sum(1 for _ in open(S3_BUCKET_RAW_KEYS_FILE)) -print(f"{num_lines} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") -""" + +# Get just the keys +# Put them in an in-memory list, stripping newlines +with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: + s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] +print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + +# Write them back to the file for completeness +with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + for line in s3_bucket_raw_keys_list: + f_obj.write(f"{line}\n") + +print(S3_BUCKET_RAW_KEYS_FILE) +print("DEBUG EXIT") +exit() # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") @@ -71,12 +89,27 @@ "-c", "select videofile, is_public from video_glossvideo"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) + +# Put them in an in-memory list, stripping newlines # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: - lines = f_obj.readlines() - lines = lines[2:] - lines = lines[:-2] + nzsl_raw_keys_list = [line.rstrip() for line in f_obj] + nzsl_raw_keys_list = nzsl_raw_keys_list[2:] + nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - f_obj.writelines(lines) -print(f"{len(lines)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + f_obj.writelines(nzsl_raw_keys_list) +print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") +#pprint(nzsl_raw_keys_list) + +# Write the NZSL keys to a dictionary so we can do fast operations on them +nzsl_raw_keys_dict = {} +for rawl in nzsl_raw_keys_list: + columns = rawl.split("|") + video_key = columns[0].strip() + is_public = columns[1].strip().lower() == 't' + nzsl_raw_keys_dict[video_key] = is_public +# Get the s3 keys present and absent from our NZSL keys +print("Getting S3 keys present and absent from NZSL Signbank ...") +nzsl_cooked_keys_list = [] +s3_keys_not_in_nzsl_list = [] From dfcba140e260e75d3a5a2daaf4873a462d261353 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 19 Aug 2024 17:01:21 +1000 Subject: [PATCH 005/222] Differencing working --- bin/python-get-heroku-video-s3-acls.py | 33 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 41d3b897..4390ebd9 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -59,49 +59,47 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = "nzsl" - # Get all keys from S3 print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") +#TODO Change this to a file-like object with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -# Get just the keys -# Put them in an in-memory list, stripping newlines +# Separate out just the keys (also strips newlines) +# Put them in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: 
s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") -# Write them back to the file for completeness +# Write the keys back to the file with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: for line in s3_bucket_raw_keys_list: f_obj.write(f"{line}\n") -print(S3_BUCKET_RAW_KEYS_FILE) -print("DEBUG EXIT") -exit() - # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") +#TODO Change this to a file-like object with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", "-c", "select videofile, is_public from video_glossvideo"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) -# Put them in an in-memory list, stripping newlines # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: - nzsl_raw_keys_list = [line.rstrip() for line in f_obj] + nzsl_raw_keys_list = f_obj.readlines() nzsl_raw_keys_list = nzsl_raw_keys_list[2:] nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] +print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + +# Put the raw lines back into the text file with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: f_obj.writelines(nzsl_raw_keys_list) -print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") -#pprint(nzsl_raw_keys_list) -# Write the NZSL keys to a dictionary so we can do fast operations on them +# Separate out the NZSL key columns +# Write them to a dictionary so we can do fast operations on them nzsl_raw_keys_dict = {} for rawl in nzsl_raw_keys_list: columns = rawl.split("|") @@ -113,3 +111,12 @@ print("Getting S3 keys present and absent from NZSL Signbank ...") nzsl_cooked_keys_list = [] s3_keys_not_in_nzsl_list = [] + +for video_key in s3_bucket_raw_keys_list: + if video_key in nzsl_raw_keys_dict: + nzsl_cooked_keys_list.append(video_key) + else: + s3_keys_not_in_nzsl_list.append(video_key) + +print(f"PRESENT: {len(nzsl_cooked_keys_list)} keys") +print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") From 4427cdffb196111d6e6a1b97ac11b79d4f5854ac Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 26 Aug 2024 12:33:44 +1000 Subject: [PATCH 006/222] Comparing is_public with ACL return --- bin/python-get-heroku-video-s3-acls.py | 198 ++++++++++++++----------- 1 file changed, 114 insertions(+), 84 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 4390ebd9..5f75b833 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -7,45 +7,23 @@ # FIXME # Currently pulling all data into text files the way the bash script -# this python script is based on did it. -# We may be able to get away with losing some of the files and doing -# most of it in memory. +# that this python script is based on did it. +# We may be able to get away with losing some the files and doing most +# if not all of it in memory. import os import subprocess from pprint import pprint + +DEBUG = True + # Setup # TODO See how difficult using native API calls would be. 
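# A minimal sketch of what the native-API route in the TODO above could look
# like for the database half, using psycopg2 in place of shelling out to
# heroku pg:psql. An illustration under assumed dependencies (psycopg2
# installed, DATABASE_URL set in the environment), not code from any commit
# in this series:
import os
import psycopg2

conn = psycopg2.connect(os.environ["DATABASE_URL"])
with conn, conn.cursor() as cur:
    cur.execute("select videofile, is_public from video_glossvideo")
    # Rows come back as native Python types: no header/footer stripping,
    # no "|" splitting, and is_public is already a bool
    nzsl_raw_keys_dict = {
        videofile.strip(): is_public for videofile, is_public in cur.fetchall()
    }
conn.close()
print(f"{len(nzsl_raw_keys_dict)} rows retrieved")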
HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" -TMPDIR = "/tmp/nzsl" -try: - os.makedirs(TMPDIR, exist_ok=True) -except OSError as err: - print(f"Error creating directory: {err}") - exit() - -NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" -NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" -S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" -S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" -S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" -S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" -for p in ( - NZSL_RAW_KEYS_FILE, - NZSL_COOKED_KEYS_FILE, - S3_BUCKET_RAW_KEYS_FILE, - S3_BUCKET_ERROR_KEYS_FILE, - S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL -): - f = open(p, "a") - f.truncate() - f.close() - RUN_MODE = "production" if RUN_MODE == "production": print("PRODUCTION") @@ -59,64 +37,116 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = "nzsl" -# Get all keys from S3 -print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") -#TODO Change this to a file-like object -with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) - -# Separate out just the keys (also strips newlines) -# Put them in an in-memory list -with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: - s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] -print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") - -# Write the keys back to the file -with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - for line in s3_bucket_raw_keys_list: - f_obj.write(f"{line}\n") - -# Get the video file keys from NZSL Signbank -print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") -#TODO Change this to a file-like object -with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", - "-c", "select videofile, is_public from video_glossvideo"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) - -# Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting -with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: - nzsl_raw_keys_list = f_obj.readlines() - nzsl_raw_keys_list = nzsl_raw_keys_list[2:] - nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] -print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") - -# Put the raw lines back into the text file -with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - f_obj.writelines(nzsl_raw_keys_list) +TMPDIR = "/tmp/nzsl" +try: + os.makedirs(TMPDIR, exist_ok=True) +except OSError as err: + print(f"Error creating directory: {err}") + exit() +NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" +NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" +S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" +S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" +S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" +S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" -# Separate out the NZSL key columns -# Write them to a dictionary so we can do fast operations on them nzsl_raw_keys_dict = {} -for rawl in nzsl_raw_keys_list: - columns = rawl.split("|") - video_key = columns[0].strip() - is_public = columns[1].strip().lower() == 't' - nzsl_raw_keys_dict[video_key] = is_public - -# Get the s3 keys present and absent from our NZSL keys -print("Getting S3 keys present and absent from NZSL Signbank ...") 
-nzsl_cooked_keys_list = [] +nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] -for video_key in s3_bucket_raw_keys_list: - if video_key in nzsl_raw_keys_dict: - nzsl_cooked_keys_list.append(video_key) - else: - s3_keys_not_in_nzsl_list.append(video_key) +if not DEBUG: + for p in ( + NZSL_RAW_KEYS_FILE, + NZSL_COOKED_KEYS_FILE, + S3_BUCKET_RAW_KEYS_FILE, + S3_BUCKET_ERROR_KEYS_FILE, + S3_BUCKET_CONTENTS_FILE, + S3_KEYS_NOT_IN_NZSL + ): + f = open(p, "a") + f.truncate() + f.close() + + # Get all keys from S3 + print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") + # TODO Change this to a file-like object + with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) + + # Separate out just the keys (also strips newlines) + # Put them in an in-memory list + with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: + s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] + print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + + # Write the keys back to the file + with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + for line in s3_bucket_raw_keys_list: + f_obj.write(f"{line}\n") + + # Get the video file keys from NZSL Signbank + print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") + with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", + "-c", "select videofile, is_public from video_glossvideo"], + env=new_env, shell=False, check=True, + text=True, stdout=f_obj) + + # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting + with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: + nzsl_raw_keys_list = f_obj.readlines() + nzsl_raw_keys_list = nzsl_raw_keys_list[2:] + nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] + print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + + # Put the raw lines back into the text file + with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + f_obj.writelines(nzsl_raw_keys_list) + + # Separate out the NZSL key columns + # Write them to a dictionary so we can do fast operations on them + for rawl in nzsl_raw_keys_list: + columns = rawl.split("|") + video_key = columns[0].strip() + is_public = columns[1].strip().lower() == 't' + nzsl_raw_keys_dict[video_key] = is_public + # for item in nzsl_raw_keys_dict.items(): + # print(item) + + # Get the s3 keys present and absent from our NZSL keys + print("Getting S3 keys present and absent from NZSL Signbank ...") + for video_key in s3_bucket_raw_keys_list: + if video_key in nzsl_raw_keys_dict: + nzsl_cooked_keys_dict[video_key] = nzsl_raw_keys_dict[video_key] + else: + s3_keys_not_in_nzsl_list.append(video_key) + print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") + print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") + # Write just the cooked keys back to a file + # This is mainly for Debug + with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: + for video_key, is_public in nzsl_cooked_keys_dict.items(): + f_obj.write(f"{video_key}, {str(is_public)}\n") + +if DEBUG: + # We used the ones we recorded on the last non-DEBUG run + with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: + for line in f_obj.readlines(): + video_key, is_public = line.strip().split(", ") + nzsl_cooked_keys_dict[video_key] = is_public + +# From the ones present, get all their ACL information +print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) 
...") +for video_key, is_public in nzsl_cooked_keys_dict.items(): + video_key = video_key.strip() + print(f"Key: {video_key}") + print(f"Public: {is_public}") + result = subprocess.run( + [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], + env=new_env, shell=False, check=True, + capture_output=True, text=True) + print(result.stdout) + -print(f"PRESENT: {len(nzsl_cooked_keys_list)} keys") -print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") From c4ecdcb26dcf6cbbf803a7502e6f8220577c5542 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:39:08 +1000 Subject: [PATCH 007/222] First pass at native boto s3 client use (messy) --- bin/python-get-heroku-video-s3-acls.py | 50 +++++++++++++++++--------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 5f75b833..28ddcf2f 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,22 +5,26 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# FIXME -# Currently pulling all data into text files the way the bash script -# that this python script is based on did it. -# We may be able to get away with losing some the files and doing most -# if not all of it in memory. - import os import subprocess +import boto3 from pprint import pprint +# Never store these in code +AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", None) +AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", None) + +print(AWS_ACCESS_KEY_ID) +print(AWS_SECRET_ACCESS_KEY) +# if DEBUG, we use the results stored in files and only process the ACLS online DEBUG = True # Setup # TODO See how difficult using native API calls would be. 
+# Answer: Heroku - no idea +# Answer: AWS - fairly simple HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" @@ -112,8 +116,6 @@ video_key = columns[0].strip() is_public = columns[1].strip().lower() == 't' nzsl_raw_keys_dict[video_key] = is_public - # for item in nzsl_raw_keys_dict.items(): - # print(item) # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") @@ -131,7 +133,7 @@ f_obj.write(f"{video_key}, {str(is_public)}\n") if DEBUG: - # We used the ones we recorded on the last non-DEBUG run + # We use the ones we recorded on the last non-DEBUG run with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): video_key, is_public = line.strip().split(", ") @@ -139,14 +141,30 @@ # From the ones present, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") +print("(Warning, this is a slow operation)") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() - print(f"Key: {video_key}") - print(f"Public: {is_public}") - result = subprocess.run( - [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, - capture_output=True, text=True) - print(result.stdout) + header = f"Key: {video_key}\nPublic: {is_public}" + + USE_S3_NATIVE = True + + if USE_S3_NATIVE: + # Be very careful, never write anything back + s3 = boto3.client( + "s3", + aws_access_key_id=AWS_ACCESS_KEY_ID, + aws_secret_access_key=AWS_SECRET_ACCESS_KEY, + ) + acl = s3.get_object_acl(Key=video_key, Bucket=AWS_S3_BUCKET) + print(header) + pprint(acl) + else: + result = subprocess.run( + [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], + env=new_env, shell=False, check=True, capture_output=True, text=True) + print(f"Key: {video_key}") + print(f"Public: {is_public}") + print(header) + print(result.stdout) From 00e2d657e728cc5a84488e1dcc70dfa3fb32e41c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 26 Aug 2024 17:05:34 +1000 Subject: [PATCH 008/222] Revert "First pass at native boto s3 client use (messy)" Turns out setting up to use either path based or native is very un-simple. Try if necessary another time. This reverts commit c4ecdcb26dcf6cbbf803a7502e6f8220577c5542. --- bin/python-get-heroku-video-s3-acls.py | 50 +++++++++----------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 28ddcf2f..5f75b833 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,26 +5,22 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# FIXME +# Currently pulling all data into text files the way the bash script +# that this python script is based on did it. +# We may be able to get away with losing some the files and doing most +# if not all of it in memory. 
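# For future reference on the boto3 attempt reverted above: the blocker was
# wiring in raw access keys (which the first pass even printed). boto3 can
# instead reuse the same configured profile the CLI calls rely on. A minimal
# read-only sketch under that assumption -- the object key is a placeholder,
# and none of this is code from the commits in this series:
import boto3

session = boto3.Session(profile_name="nzsl")
s3 = session.client("s3")
acl = s3.get_object_acl(Bucket="nzsl-signbank-media-uat", Key="videos/example.mp4")
for grant in acl["Grants"]:
    grantee = grant["Grantee"]
    # Group grantees carry a URI; canonical users carry an ID
    print(grantee.get("URI", grantee.get("ID")), grant["Permission"])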
+ import os import subprocess -import boto3 from pprint import pprint -# Never store these in code -AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", None) -AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", None) - -print(AWS_ACCESS_KEY_ID) -print(AWS_SECRET_ACCESS_KEY) -# if DEBUG, we use the results stored in files and only process the ACLS online DEBUG = True # Setup # TODO See how difficult using native API calls would be. -# Answer: Heroku - no idea -# Answer: AWS - fairly simple HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" @@ -116,6 +112,8 @@ video_key = columns[0].strip() is_public = columns[1].strip().lower() == 't' nzsl_raw_keys_dict[video_key] = is_public + # for item in nzsl_raw_keys_dict.items(): + # print(item) # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") @@ -133,7 +131,7 @@ f_obj.write(f"{video_key}, {str(is_public)}\n") if DEBUG: - # We use the ones we recorded on the last non-DEBUG run + # We used the ones we recorded on the last non-DEBUG run with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): video_key, is_public = line.strip().split(", ") @@ -141,30 +139,14 @@ # From the ones present, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") -print("(Warning, this is a slow operation)") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() - header = f"Key: {video_key}\nPublic: {is_public}" - - USE_S3_NATIVE = True - - if USE_S3_NATIVE: - # Be very careful, never write anything back - s3 = boto3.client( - "s3", - aws_access_key_id=AWS_ACCESS_KEY_ID, - aws_secret_access_key=AWS_SECRET_ACCESS_KEY, - ) - acl = s3.get_object_acl(Key=video_key, Bucket=AWS_S3_BUCKET) - print(header) - pprint(acl) - else: - result = subprocess.run( - [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, capture_output=True, text=True) - print(f"Key: {video_key}") - print(f"Public: {is_public}") - print(header) - print(result.stdout) + print(f"Key: {video_key}") + print(f"Public: {is_public}") + result = subprocess.run( + [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], + env=new_env, shell=False, check=True, + capture_output=True, text=True) + print(result.stdout) From a487f596639a9fc40f272914714b8c2f6d50c799 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:25:02 +1000 Subject: [PATCH 009/222] Rudimentary command line parsing --- bin/python-get-heroku-video-s3-acls.py | 45 +++++++++++++------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/python-get-heroku-video-s3-acls.py index 5f75b833..a2773423 100755 --- a/bin/python-get-heroku-video-s3-acls.py +++ b/bin/python-get-heroku-video-s3-acls.py @@ -5,22 +5,21 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# FIXME -# Currently pulling all data into text files the way the bash script -# that this python script is based on did it. -# We may be able to get away with losing some the files and doing most -# if not all of it in memory. 
- import os import subprocess +import argparse from pprint import pprint - -DEBUG = True +parser = argparse.ArgumentParser() +parser.add_argument("--cached", + default=False, + required=False, + action="store_true", + help="Use keys generated on a previous non-cache run") +args = parser.parse_args() # Setup -# TODO See how difficult using native API calls would be. HEROKU = "/usr/bin/heroku" AWS = "/usr/local/bin/aws" @@ -54,7 +53,18 @@ nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] -if not DEBUG: +if args.cached: + print("Using the video keys we recorded on the last non-cached run") + try: + with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: + for line in f_obj.readlines(): + video_key, is_public = line.strip().split(", ") + nzsl_cooked_keys_dict[video_key] = is_public + except FileNotFoundError: + print(f"File not found: {NZSL_COOKED_KEYS_FILE}") + exit() + print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") +else: for p in ( NZSL_RAW_KEYS_FILE, NZSL_COOKED_KEYS_FILE, @@ -69,7 +79,6 @@ # Get all keys from S3 print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") - # TODO Change this to a file-like object with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, @@ -130,14 +139,8 @@ for video_key, is_public in nzsl_cooked_keys_dict.items(): f_obj.write(f"{video_key}, {str(is_public)}\n") -if DEBUG: - # We used the ones we recorded on the last non-DEBUG run - with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: - for line in f_obj.readlines(): - video_key, is_public = line.strip().split(", ") - nzsl_cooked_keys_dict[video_key] = is_public -# From the ones present, get all their ACL information +# From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() @@ -145,8 +148,6 @@ print(f"Public: {is_public}") result = subprocess.run( [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, - capture_output=True, text=True) + env=new_env, shell=False, check=True, + capture_output=True, text=True) print(result.stdout) - - From b58f6bf37fc4b627a4058444c8f864b5cc5b1897 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:26:20 +1000 Subject: [PATCH 010/222] Rename --- bin/{python-get-heroku-video-s3-acls.py => get-video-s3-acls.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{python-get-heroku-video-s3-acls.py => get-video-s3-acls.py} (100%) diff --git a/bin/python-get-heroku-video-s3-acls.py b/bin/get-video-s3-acls.py similarity index 100% rename from bin/python-get-heroku-video-s3-acls.py rename to bin/get-video-s3-acls.py From 58d2a9aaf4f0d50dfb5931a90d6c665274304672 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:58:31 +1000 Subject: [PATCH 011/222] Better command line arguments --- bin/get-video-s3-acls.py | 45 +++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a2773423..e947cc46 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,36 +11,58 @@ import argparse from pprint import pprint -parser = argparse.ArgumentParser() +HEROKU = "/usr/bin/heroku" 
+AWS = "/usr/local/bin/aws" + +parser = argparse.ArgumentParser(epilog="You must have a configured AWS profile to use this app. See the --awsprofile argument.") parser.add_argument("--cached", default=False, required=False, action="store_true", - help="Use keys generated on a previous non-cache run") + help="Use keys generated on a previous non-cache run (default: False)") +parser.add_argument("--production", + default=False, + required=False, + action="store_true", + help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)") +parser.add_argument("--pgclient", + default=HEROKU, + required=False, + help=f"Postgres client path (default: {HEROKU})") +parser.add_argument("--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: 'nzsl')") +parser.add_argument("--s3client", + default=AWS, + required=False, + help=f"AWS S3 client path (default: {AWS})") args = parser.parse_args() -# Setup -HEROKU = "/usr/bin/heroku" -AWS = "/usr/local/bin/aws" +HEROKU = args.pgclient +AWS = args.s3client -RUN_MODE = "production" -if RUN_MODE == "production": - print("PRODUCTION") +if args.production: + print("Mode: PRODUCTION") NZSL_APP = "nzsl-signbank-production" AWS_S3_BUCKET = "nzsl-signbank-media-production" else: - print("STAGING") + print("Mode: STAGING") NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" new_env = os.environ.copy() -new_env["AWS_PROFILE"] = "nzsl" +new_env["AWS_PROFILE"] = args.awsprofile + +print(f"Target NZSL app: {NZSL_APP}") +print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") +print(f"AWS profile using: {new_env['AWS_PROFILE']}") TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: - print(f"Error creating directory: {err}") + print(f"Error creating temporary directory: {TMPDIR} {err}") exit() NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" @@ -139,7 +161,6 @@ for video_key, is_public in nzsl_cooked_keys_dict.items(): f_obj.write(f"{video_key}, {str(is_public)}\n") - # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): From da626ae1e198ecca8e22922c5be58865f5e9da5e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:31:16 +1000 Subject: [PATCH 012/222] Command line arguments and external apps codified --- bin/get-video-s3-acls.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e947cc46..832a0b28 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,10 +11,13 @@ import argparse from pprint import pprint -HEROKU = "/usr/bin/heroku" +PGCLIENT = "/usr/bin/psql" AWS = "/usr/local/bin/aws" parser = argparse.ArgumentParser(epilog="You must have a configured AWS profile to use this app. See the --awsprofile argument.") +parser.add_argument("--dburl", + required=True, + help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)") parser.add_argument("--cached", default=False, required=False, @@ -26,34 +29,36 @@ action="store_true", help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)") parser.add_argument("--pgclient", - default=HEROKU, + default=PGCLIENT, required=False, - help=f"Postgres client path (default: {HEROKU})") + help=f"Postgres client path (default: {PGCLIENT})") parser.add_argument("--awsprofile", default="nzsl", required=False, help=f"AWS configured profile to use (default: 'nzsl')") -parser.add_argument("--s3client", +parser.add_argument("--awsclient", default=AWS, required=False, - help=f"AWS S3 client path (default: {AWS})") + help=f"AWS client path (default: {AWS})") args = parser.parse_args() -HEROKU = args.pgclient -AWS = args.s3client +DATABASE_URL = args.dburl +PGCLIENT = args.pgclient +AWS = args.awsclient if args.production: - print("Mode: PRODUCTION") + MODE_STR = "PRODUCTION" NZSL_APP = "nzsl-signbank-production" AWS_S3_BUCKET = "nzsl-signbank-media-production" else: - print("Mode: STAGING") + MODE_STR = "STAGING" NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" new_env = os.environ.copy() new_env["AWS_PROFILE"] = args.awsprofile +print(f"Mode: {MODE_STR}") print(f"Target NZSL app: {NZSL_APP}") print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") print(f"AWS profile using: {new_env['AWS_PROFILE']}") @@ -76,7 +81,7 @@ s3_keys_not_in_nzsl_list = [] if args.cached: - print("Using the video keys we recorded on the last non-cached run") + print("Using the video keys we recorded on the last non-cached run.") try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): @@ -87,6 +92,7 @@ exit() print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") else: + print("Generating keys from scratch.") for p in ( NZSL_RAW_KEYS_FILE, NZSL_COOKED_KEYS_FILE, @@ -120,31 +126,26 @@ # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([HEROKU, "pg:psql", "DATABASE_URL", "--app", f"{NZSL_APP}", - "-c", "select videofile, is_public from video_glossvideo"], + result = subprocess.run([PGCLIENT, + "-t", + "-c", "select videofile, is_public from video_glossvideo", + f"{DATABASE_URL}"], env=new_env, shell=False, check=True, text=True, stdout=f_obj) - - # Remove the first 2 and last 2 lines, as we cannot control pg:psql's output formatting with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - nzsl_raw_keys_list = nzsl_raw_keys_list[2:] - nzsl_raw_keys_list = nzsl_raw_keys_list[:-2] print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") - # Put the raw lines back into the text file - with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - f_obj.writelines(nzsl_raw_keys_list) - # Separate out the NZSL key columns # Write them to a dictionary so we can do fast operations on them for rawl in nzsl_raw_keys_list: + rawl = rawl.strip() + if not rawl: + continue columns = rawl.split("|") video_key = columns[0].strip() is_public = columns[1].strip().lower() == 't' nzsl_raw_keys_dict[video_key] = is_public - # for item in nzsl_raw_keys_dict.items(): - # print(item) # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") From 3fdb0b21483b04858598e471e6f9fc695a8778ed Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:32:24 
+1000 Subject: [PATCH 013/222] Comments --- bin/get-video-s3-acls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 832a0b28..3b6498d1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,6 +11,9 @@ import argparse from pprint import pprint +# TODO +# We are using external apps just for the moment. +# These will be removed for native libraries. PGCLIENT = "/usr/bin/psql" AWS = "/usr/local/bin/aws" From 026c2fa98f6e991095738b650a8b9809035eb891 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:41:20 +1000 Subject: [PATCH 014/222] black --- bin/get-video-s3-acls.py | 129 +++++++++++++++++++++++++-------------- 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3b6498d1..e03327db 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,32 +17,44 @@ PGCLIENT = "/usr/bin/psql" AWS = "/usr/local/bin/aws" -parser = argparse.ArgumentParser(epilog="You must have a configured AWS profile to use this app. See the --awsprofile argument.") -parser.add_argument("--dburl", - required=True, - help=f"(REQUIRED) Database url (e.g. value of DATABASE_URL on Heroku)") -parser.add_argument("--cached", - default=False, - required=False, - action="store_true", - help="Use keys generated on a previous non-cache run (default: False)") -parser.add_argument("--production", - default=False, - required=False, - action="store_true", - help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)") -parser.add_argument("--pgclient", - default=PGCLIENT, - required=False, - help=f"Postgres client path (default: {PGCLIENT})") -parser.add_argument("--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: 'nzsl')") -parser.add_argument("--awsclient", - default=AWS, - required=False, - help=f"AWS client path (default: {AWS})") +parser = argparse.ArgumentParser( + epilog="You must have a configured AWS profile to use this app. See the --awsprofile " + "argument." +) +parser.add_argument( + "--dburl", + required=True, + help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)", +) +parser.add_argument( + "--cached", + default=False, + required=False, + action="store_true", + help="Use keys generated on a previous non-cache run (default: False)", +) +parser.add_argument( + "--production", + default=False, + required=False, + action="store_true", + help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)", +) +parser.add_argument( + "--pgclient", + default=PGCLIENT, + required=False, + help=f"Postgres client path (default: {PGCLIENT})", +) +parser.add_argument( + "--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: 'nzsl')", +) +parser.add_argument( + "--awsclient", default=AWS, required=False, help=f"AWS client path (default: {AWS})" +) args = parser.parse_args() DATABASE_URL = args.dburl @@ -97,12 +109,12 @@ else: print("Generating keys from scratch.") for p in ( - NZSL_RAW_KEYS_FILE, - NZSL_COOKED_KEYS_FILE, - S3_BUCKET_RAW_KEYS_FILE, - S3_BUCKET_ERROR_KEYS_FILE, - S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL + NZSL_RAW_KEYS_FILE, + NZSL_COOKED_KEYS_FILE, + S3_BUCKET_RAW_KEYS_FILE, + S3_BUCKET_ERROR_KEYS_FILE, + S3_BUCKET_CONTENTS_FILE, + S3_KEYS_NOT_IN_NZSL, ): f = open(p, "a") f.truncate() @@ -111,9 +123,14 @@ # Get all keys from S3 print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) + result = subprocess.run( + [AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + env=new_env, + shell=False, + check=True, + text=True, + stdout=f_obj, + ) # Separate out just the keys (also strips newlines) # Put them in an in-memory list @@ -129,12 +146,20 @@ # Get the video file keys from NZSL Signbank print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: - result = subprocess.run([PGCLIENT, - "-t", - "-c", "select videofile, is_public from video_glossvideo", - f"{DATABASE_URL}"], - env=new_env, shell=False, check=True, - text=True, stdout=f_obj) + result = subprocess.run( + [ + PGCLIENT, + "-t", + "-c", + "select videofile, is_public from video_glossvideo", + f"{DATABASE_URL}", + ], + env=new_env, + shell=False, + check=True, + text=True, + stdout=f_obj, + ) with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") @@ -147,7 +172,7 @@ continue columns = rawl.split("|") video_key = columns[0].strip() - is_public = columns[1].strip().lower() == 't' + is_public = columns[1].strip().lower() == "t" nzsl_raw_keys_dict[video_key] = is_public # Get the s3 keys present and absent from our NZSL keys @@ -172,7 +197,21 @@ print(f"Key: {video_key}") print(f"Public: {is_public}") result = subprocess.run( - [AWS, "s3api", "get-object-acl", "--output", "text", "--bucket", AWS_S3_BUCKET, "--key", video_key], - env=new_env, shell=False, check=True, - capture_output=True, text=True) + [ + AWS, + "s3api", + "get-object-acl", + "--output", + "text", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=new_env, + shell=False, + check=True, + capture_output=True, + text=True, + ) print(result.stdout) From cc6acb2e6d9f0fcd8536e6546cb2c8f3d5696cd9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:45:08 +1000 Subject: [PATCH 015/222] 
Better arguments --- bin/get-video-s3-acls.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e03327db..401981bd 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,11 +21,12 @@ epilog="You must have a configured AWS profile to use this app. See the --awsprofile " "argument." ) +# Positional args parser.add_argument( - "--dburl", - required=True, + "dburl", help=f"(REQUIRED) Database url (e.g. value of DATABASE_URL on Heroku)", ) +# Named args parser.add_argument( "--cached", default=False, From f9019fd79401b51e353a592c2ed97655f026c6b4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:58:27 +1000 Subject: [PATCH 016/222] Better args --- bin/get-video-s3-acls.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 401981bd..b1687607 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -32,29 +32,33 @@ default=False, required=False, action="store_true", - help="Use keys generated on a previous non-cache run (default: False)", + help="Use keys generated on a previous non-cache run (default: %(default)s) " + "(Don't mix PRODUCTION and STAGING!)", ) parser.add_argument( "--production", default=False, required=False, action="store_true", - help="Run in PRODUCTION mode (instead of STAGING) (default: False/STAGING)", + help="Run in PRODUCTION mode, instead of STAGING (default: %(default)s)", ) parser.add_argument( "--pgclient", default=PGCLIENT, required=False, - help=f"Postgres client path (default: {PGCLIENT})", + help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( - "--awsprofile", - default="nzsl", + "--awsclient", + default=AWS, required=False, - help=f"AWS configured profile to use (default: 'nzsl')", + help=f"AWS client path (default: %(default)s)", ) parser.add_argument( - "--awsclient", default=AWS, required=False, help=f"AWS client path (default: {AWS})" + "--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: '%(default)s')", ) args = parser.parse_args() @@ -195,8 +199,6 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() - print(f"Key: {video_key}") - print(f"Public: {is_public}") result = subprocess.run( [ AWS, @@ -215,4 +217,6 @@ capture_output=True, text=True, ) + print(f"Key: {video_key}") + print(f"Public: {is_public}") print(result.stdout) From 32cf39d425174b142b872921e70c855fc337d8c8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:01:43 +1000 Subject: [PATCH 017/222] Better arg help and ordering --- bin/get-video-s3-acls.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b1687607..b42f15ba 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -18,22 +18,20 @@ AWS = "/usr/local/bin/aws" parser = argparse.ArgumentParser( - epilog="You must have a configured AWS profile to use this app. See the --awsprofile " + description="You must have a configured AWS profile to use this app. See the --awsprofile " "argument." ) -# Positional args +# Positional arguments parser.add_argument( "dburl", help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)", ) -# Named args +# Optional arguments parser.add_argument( - "--cached", - default=False, + "--awsprofile", + default="nzsl", required=False, - action="store_true", - help="Use keys generated on a previous non-cache run (default: %(default)s) " - "(Don't mix PRODUCTION and STAGING!)", + help=f"AWS configured profile to use (default: '%(default)s')", ) parser.add_argument( "--production", @@ -42,6 +40,14 @@ action="store_true", help="Run in PRODUCTION mode, instead of STAGING (default: %(default)s)", ) +parser.add_argument( + "--cached", + default=False, + required=False, + action="store_true", + help="Use keys generated on a previous non-cached run (default: %(default)s) " + "(Don't mix PRODUCTION and STAGING!)", +) parser.add_argument( "--pgclient", default=PGCLIENT, @@ -54,12 +60,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: '%(default)s')", -) args = parser.parse_args() DATABASE_URL = args.dburl From 49ea76266e33cbb6a7b14bc1ebd4bb7b0793e19c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:52:18 +1000 Subject: [PATCH 018/222] Better cached handling --- bin/get-video-s3-acls.py | 65 +++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b42f15ba..a6142e35 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -15,23 +15,29 @@ # We are using external apps just for the moment. # These will be removed for native libraries. PGCLIENT = "/usr/bin/psql" -AWS = "/usr/local/bin/aws" +AWSCLIENT = "/usr/local/bin/aws" + +# NZSL: Is there a database url defined in the environment? +DATABASE_URL = os.getenv("DATABASE_URL", None) parser = argparse.ArgumentParser( - description="You must have a configured AWS profile to use this app. See the --awsprofile " + description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " "argument." ) # Positional arguments -parser.add_argument( - "dburl", - help=f"(REQUIRED) Database url (e.g. 
value of DATABASE_URL on Heroku)", -) +if DATABASE_URL: + print("DATABASE_URL defined in environment") +else: + parser.add_argument( + "dburl", + help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", + ) # Optional arguments parser.add_argument( "--awsprofile", default="nzsl", required=False, - help=f"AWS configured profile to use (default: '%(default)s')", + help=f"AWSCLIENT configured profile to use (default: '%(default)s')", ) parser.add_argument( "--production", @@ -56,16 +62,12 @@ ) parser.add_argument( "--awsclient", - default=AWS, + default=AWSCLIENT, required=False, - help=f"AWS client path (default: %(default)s)", + help=f"AWSCLIENT client path (default: %(default)s)", ) args = parser.parse_args() -DATABASE_URL = args.dburl -PGCLIENT = args.pgclient -AWS = args.awsclient - if args.production: MODE_STR = "PRODUCTION" NZSL_APP = "nzsl-signbank-production" @@ -78,10 +80,22 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = args.awsprofile +PGCLIENT = args.pgclient +AWSCLIENT = args.awsclient + +if not DATABASE_URL: + DATABASE_URL = args.dburl + +if args.cached: + print("Using the video keys we recorded on the last non-cached run.") + print(f"Mode: {MODE_STR}") print(f"Target NZSL app: {NZSL_APP}") print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") print(f"AWS profile using: {new_env['AWS_PROFILE']}") +print(f"PGCLIENT: {PGCLIENT}") +print(f"AWSCLIENT: {AWSCLIENT}") +print(f"DATABASE_URL:\n{DATABASE_URL}") TMPDIR = "/tmp/nzsl" try: @@ -94,14 +108,13 @@ S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" -S3_KEYS_NOT_IN_NZSL = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" +S3_KEYS_NOT_IN_NZSL_FILE = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" nzsl_raw_keys_dict = {} nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] if args.cached: - print("Using the video keys we recorded on the last non-cached run.") try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): @@ -110,7 +123,14 @@ except FileNotFoundError: print(f"File not found: {NZSL_COOKED_KEYS_FILE}") exit() + try: + with open(S3_KEYS_NOT_IN_NZSL_FILE, "r") as f_obj: + s3_keys_not_in_nzsl_list = [line.strip() for line in f_obj.readlines()] + except FileNotFoundError: + print(f"File not found: {S3_KEYS_NOT_IN_NZSL_FILE}") + exit() print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") + print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") else: print("Generating keys from scratch.") for p in ( @@ -119,7 +139,7 @@ S3_BUCKET_RAW_KEYS_FILE, S3_BUCKET_ERROR_KEYS_FILE, S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL, + S3_KEYS_NOT_IN_NZSL_FILE, ): f = open(p, "a") f.truncate() @@ -129,7 +149,7 @@ print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( - [AWS, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], + [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], env=new_env, shell=False, check=True, @@ -189,19 +209,24 @@ s3_keys_not_in_nzsl_list.append(video_key) print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") - # Write just the cooked keys back to a file - # This is mainly for Debug + + # Write the "cooked" (i.e. 
present) keys back to a file with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: for video_key, is_public in nzsl_cooked_keys_dict.items(): f_obj.write(f"{video_key}, {str(is_public)}\n") + # Write the absent keys back to a file + with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: + for video_key in s3_keys_not_in_nzsl_list: + f_obj.write(f"{video_key}\n") + # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, is_public in nzsl_cooked_keys_dict.items(): video_key = video_key.strip() result = subprocess.run( [ - AWS, + AWSCLIENT, "s3api", "get-object-acl", "--output", From e692cb9d146707675013ca32eb38a61669282488 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:52:49 +1000 Subject: [PATCH 019/222] Debug removed --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a6142e35..bdab8285 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -9,7 +9,6 @@ import os import subprocess import argparse -from pprint import pprint # TODO # We are using external apps just for the moment. From ed40d189af4ae7b7ebf4280069ad105bb7d7c108 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 17:55:32 +1000 Subject: [PATCH 020/222] Cut n pasted text fixed --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bdab8285..1d8de80a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -36,7 +36,7 @@ "--awsprofile", default="nzsl", required=False, - help=f"AWSCLIENT configured profile to use (default: '%(default)s')", + help=f"AWS configured profile to use (default: '%(default)s')", ) parser.add_argument( "--production", @@ -63,7 +63,7 @@ "--awsclient", default=AWSCLIENT, required=False, - help=f"AWSCLIENT client path (default: %(default)s)", + help=f"AWS client path (default: %(default)s)", ) args = parser.parse_args() From 29e0fda91387d62db83d9db613331a44822e73f1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:11:32 +1000 Subject: [PATCH 021/222] Incremental improvements --- bin/get-video-s3-acls.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1d8de80a..2a1bde6f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -13,8 +13,8 @@ # TODO # We are using external apps just for the moment. # These will be removed for native libraries. -PGCLIENT = "/usr/bin/psql" AWSCLIENT = "/usr/local/bin/aws" +PGCLIENT = "/usr/bin/psql" # NZSL: Is there a database url defined in the environment? 
DATABASE_URL = os.getenv("DATABASE_URL", None) @@ -79,21 +79,23 @@ new_env = os.environ.copy() new_env["AWS_PROFILE"] = args.awsprofile -PGCLIENT = args.pgclient AWSCLIENT = args.awsclient +PGCLIENT = args.pgclient if not DATABASE_URL: DATABASE_URL = args.dburl if args.cached: print("Using the video keys we recorded on the last non-cached run.") +else: + print("Generating keys from scratch.") -print(f"Mode: {MODE_STR}") -print(f"Target NZSL app: {NZSL_APP}") -print(f"Target AWS S3 bucket: {AWS_S3_BUCKET}") -print(f"AWS profile using: {new_env['AWS_PROFILE']}") -print(f"PGCLIENT: {PGCLIENT}") -print(f"AWSCLIENT: {AWSCLIENT}") +print(f"Mode: {MODE_STR}") +print(f"NZSL app: {NZSL_APP}") +print(f"AWS S3 bucket: {AWS_S3_BUCKET}") +print(f"AWS profile: {new_env['AWS_PROFILE']}") +print(f"AWSCLIENT: {AWSCLIENT}") +print(f"PGCLIENT: {PGCLIENT}") print(f"DATABASE_URL:\n{DATABASE_URL}") TMPDIR = "/tmp/nzsl" @@ -104,6 +106,7 @@ exit() NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" +COOKED_DELIMITER = ", " S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" @@ -114,10 +117,11 @@ s3_keys_not_in_nzsl_list = [] if args.cached: + # Pull all info from existing files try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - video_key, is_public = line.strip().split(", ") + video_key, is_public = line.strip().split(COOKED_DELIMITER) nzsl_cooked_keys_dict[video_key] = is_public except FileNotFoundError: print(f"File not found: {NZSL_COOKED_KEYS_FILE}") @@ -131,7 +135,7 @@ print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") else: - print("Generating keys from scratch.") + # Zero-out files for p in ( NZSL_RAW_KEYS_FILE, NZSL_COOKED_KEYS_FILE, @@ -144,8 +148,8 @@ f.truncate() f.close() - # Get all keys from S3 - print(f"Getting raw S3 keys recursively ({AWS_S3_BUCKET}) ...") + # Get all keys from AWS S3 + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...") with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], @@ -212,7 +216,7 @@ # Write the "cooked" (i.e. 
present) keys back to a file with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: for video_key, is_public in nzsl_cooked_keys_dict.items(): - f_obj.write(f"{video_key}, {str(is_public)}\n") + f_obj.write(f"{video_key}{COOKED_DELIMITER}{str(is_public)}\n") # Write the absent keys back to a file with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: From 516f6a7a367a4dee3ff900faa27f98d9fab0b674 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:06:24 +1000 Subject: [PATCH 022/222] Minor feedback text fix --- bin/get-video-s3-acls.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2a1bde6f..707fb78d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -90,12 +90,12 @@ else: print("Generating keys from scratch.") -print(f"Mode: {MODE_STR}") -print(f"NZSL app: {NZSL_APP}") -print(f"AWS S3 bucket: {AWS_S3_BUCKET}") -print(f"AWS profile: {new_env['AWS_PROFILE']}") -print(f"AWSCLIENT: {AWSCLIENT}") -print(f"PGCLIENT: {PGCLIENT}") +print(f"Mode: {MODE_STR}") +print(f" NZSL app: {NZSL_APP}") +print(f" S3 bucket: {AWS_S3_BUCKET}") +print(f"AWS profile: {new_env['AWS_PROFILE']}") +print(f"AWSCLIENT: {AWSCLIENT}") +print(f"PGCLIENT: {PGCLIENT}") print(f"DATABASE_URL:\n{DATABASE_URL}") TMPDIR = "/tmp/nzsl" From dbb1a43d40bc6650df5a40de51faa3bf6fe552f4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:36:01 +1000 Subject: [PATCH 023/222] set_public() functions renamed for clarity --- signbank/video/admin.py | 10 +++++----- signbank/video/models.py | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/signbank/video/admin.py b/signbank/video/admin.py index f2d62811..5c1282b7 100644 --- a/signbank/video/admin.py +++ b/signbank/video/admin.py @@ -75,18 +75,18 @@ def queryset(self, request, queryset): return queryset -def set_public(modeladmin, request, queryset): +def admin_set_public(modeladmin, request, queryset): for glossvideo in queryset.all(): glossvideo.set_public(True) -def set_hidden(modeladmin, request, queryset): +def admin_set_hidden(modeladmin, request, queryset): for glossvideo in queryset.all(): glossvideo.set_public(False) -set_public.short_description = _lazy("Set selected videos public") -set_hidden.short_description = _lazy("Set selected videos hidden") +admin_set_public.short_description = _lazy("Set selected videos public") +admin_set_hidden.short_description = _lazy("Set selected videos hidden") class GlossVideoAdmin(admin.ModelAdmin): @@ -98,7 +98,7 @@ class GlossVideoAdmin(admin.ModelAdmin): 'videofile', 'video_type', 'posterfile', 'id', 'version') list_filter = ('is_public', 'video_type', 'gloss__dataset', HasGlossFilter, 'dataset', HasPosterFilter, GlossesVideoCountFilter) - actions = [set_public, set_hidden] + actions = [admin_set_public, admin_set_hidden] def get_queryset(self, request): qs = super(GlossVideoAdmin, self).get_queryset(request) diff --git a/signbank/video/models.py b/signbank/video/models.py index 8de46671..e1617765 100644 --- a/signbank/video/models.py +++ b/signbank/video/models.py @@ -48,7 +48,7 @@ def public_url(self, name): return f'{domain}{path}' - def set_public(self, name, is_public): + def set_public_acl(self, name, is_public): """ Set the object ACL on the object. 
This is only supported for S3 storage, and is a no-op for local file storage """ @@ -62,8 +62,6 @@ def set_public(self, name, is_public): ) - - class GlossVideo(models.Model): """A video that represents a particular idgloss""" #: Descriptive title of the GlossVideo. @@ -247,7 +245,7 @@ def is_video(self): def set_public(self, is_public): self.is_public = is_public self.save() - self.videofile.storage.set_public(self.videofile.name, is_public) + self.videofile.storage.set_public_acl(self.videofile.name, is_public) True From 1c294a29fe3cd05058eea8708f2ded24ded82df5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 15:52:54 +1000 Subject: [PATCH 024/222] Basics of final output collection working --- bin/get-video-s3-acls.py | 67 +++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 707fb78d..5f587ec1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Permissions required: -# heroku cli - access to app +# psql - access to heroku app's postgres # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html @@ -9,6 +9,7 @@ import os import subprocess import argparse +from pprint import pprint # TODO # We are using external apps just for the moment. @@ -106,22 +107,28 @@ exit() NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" -COOKED_DELIMITER = ", " +CSV_DELIMITER = ", " S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" S3_KEYS_NOT_IN_NZSL_FILE = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" +ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" nzsl_raw_keys_dict = {} nzsl_cooked_keys_dict = {} s3_keys_not_in_nzsl_list = [] +# TODO This will replace everything +all_keys_dict = {} + if args.cached: + print("NOT READY!") + exit() # Pull all info from existing files try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - video_key, is_public = line.strip().split(COOKED_DELIMITER) + video_key, is_public = line.strip().split(CSV_DELIMITER) nzsl_cooked_keys_dict[video_key] = is_public except FileNotFoundError: print(f"File not found: {NZSL_COOKED_KEYS_FILE}") @@ -143,6 +150,7 @@ S3_BUCKET_ERROR_KEYS_FILE, S3_BUCKET_CONTENTS_FILE, S3_KEYS_NOT_IN_NZSL_FILE, + ALL_KEYS_FILE ): f = open(p, "a") f.truncate() @@ -160,26 +168,26 @@ stdout=f_obj, ) - # Separate out just the keys (also strips newlines) - # Put them in an in-memory list + # Separate out just the key (also strip newline) from date, time, size, key + # Put the keys in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") - # Write the keys back to the file + # Write the keys back to the file, for cleanliness with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: for line in s3_bucket_raw_keys_list: f_obj.write(f"{line}\n") - # Get the video file keys from NZSL Signbank - print(f"Getting raw video file keys from NZSL Signbank ({NZSL_APP}) ...") + # Get the video files info from NZSL Signbank + print(f"Getting raw list of video file info from NZSL Signbank 
({NZSL_APP}) ...") with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ PGCLIENT, "-t", "-c", - "select videofile, is_public from video_glossvideo", + "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", f"{DATABASE_URL}", ], env=new_env, @@ -193,40 +201,63 @@ print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") # Separate out the NZSL key columns - # Write them to a dictionary so we can do fast operations on them + # Write them to a dictionary, so we can do fast operations on them for rawl in nzsl_raw_keys_list: rawl = rawl.strip() if not rawl: continue columns = rawl.split("|") - video_key = columns[0].strip() - is_public = columns[1].strip().lower() == "t" - nzsl_raw_keys_dict[video_key] = is_public + db_id = columns[0].strip() + gloss_id = columns[1].strip() + is_public = columns[2].strip().lower() == "t" + # 'videofile' data is also the key for S3 + video_key = columns[3].strip() + # Each dictionary entry is all of these values + nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") + nkeys_present = 0 + nkeys_absent = 0 for video_key in s3_bucket_raw_keys_list: if video_key in nzsl_raw_keys_dict: + nkeys_present += 1 + # Add 'Present' column to start + all_keys_dict[video_key] = [True] + nzsl_raw_keys_dict[video_key] nzsl_cooked_keys_dict[video_key] = nzsl_raw_keys_dict[video_key] else: + nkeys_absent += 1 s3_keys_not_in_nzsl_list.append(video_key) - print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") - print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") + # Add 'Present' (absent) column to start + all_keys_dict[video_key] = [False, "", "", ""] + print(f"PRESENT: {nkeys_present} keys") + print(f"ABSENT: {nkeys_absent} keys") # Write the "cooked" (i.e. 
present) keys back to a file with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: for video_key, is_public in nzsl_cooked_keys_dict.items(): - f_obj.write(f"{video_key}{COOKED_DELIMITER}{str(is_public)}\n") + f_obj.write(f"{video_key}{CSV_DELIMITER}{str(is_public)}\n") # Write the absent keys back to a file with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: for video_key in s3_keys_not_in_nzsl_list: f_obj.write(f"{video_key}\n") + # Write all keys back to a file + with open(ALL_KEYS_FILE, "w") as f_obj: + for video_key, item_list in all_keys_dict.items(): + outstr = f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + f_obj.write(outstr) + # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") -for video_key, is_public in nzsl_cooked_keys_dict.items(): - video_key = video_key.strip() +for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): + if not is_present: + continue + + print("HUMPHREY") + print(video_key) + result = subprocess.run( [ AWSCLIENT, From eab29ba176017e32a484eae7d073598db3f4ad94 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:08:47 +1000 Subject: [PATCH 025/222] Basics of final output collection working --- bin/get-video-s3-acls.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5f587ec1..c72dd7ab 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -255,7 +255,6 @@ if not is_present: continue - print("HUMPHREY") print(video_key) result = subprocess.run( @@ -276,6 +275,8 @@ capture_output=True, text=True, ) - print(f"Key: {video_key}") - print(f"Public: {is_public}") + print(f"Key: {video_key}") + print(f"Public: {is_public}") + print(f"db_id: {db_id}") + print(f"gloss_id: {gloss_id}") print(result.stdout) From 54595ffe63f930175dd4447ef66263dd231e2c1a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:34:19 +1000 Subject: [PATCH 026/222] About to remove legacy files output --- bin/get-video-s3-acls.py | 43 +++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c72dd7ab..00a31dfb 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -105,9 +105,9 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}") exit() +CSV_DELIMITER = "," NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" -CSV_DELIMITER = ", " S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" @@ -120,11 +120,45 @@ # TODO This will replace everything all_keys_dict = {} +nkeys_present = 0 +nkeys_absent = 0 if args.cached: - print("NOT READY!") - exit() # Pull all info from existing files + try: + with open(ALL_KEYS_FILE, "r") as f_obj: + for line in f_obj.readlines(): + + print(line, end="") + + video_key, is_present_str, db_id_str, gloss_id_str, is_public_str = line.strip().split(CSV_DELIMITER) + + is_present = is_present_str.strip().lower() == "true" + if is_present: + nkeys_present += 1 + db_id = int(db_id_str) + try: + gloss_id = int(gloss_id_str) + except ValueError: + gloss_id = None + is_public = 
is_public_str.strip().lower() == "true" + else: + nkeys_absent += 1 + db_id = None + gloss_id = None + is_public = None + + all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + + print(video_key, end=" ") + pprint(all_keys_dict[video_key]) + + print(f"PRESENT: {nkeys_present} keys") + print(f"ABSENT: {nkeys_absent} keys") + except FileNotFoundError: + print(f"File not found: {ALL_KEYS_FILE}") + exit() + """ try: with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): @@ -141,6 +175,7 @@ exit() print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") + """ else: # Zero-out files for p in ( @@ -217,8 +252,6 @@ # Get the s3 keys present and absent from our NZSL keys print("Getting S3 keys present and absent from NZSL Signbank ...") - nkeys_present = 0 - nkeys_absent = 0 for video_key in s3_bucket_raw_keys_list: if video_key in nzsl_raw_keys_dict: nkeys_present += 1 From c8c51620f7b8841e1bcddf66e8074cc3f56d8859 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:39:39 +1000 Subject: [PATCH 027/222] Legacy output files removed --- bin/get-video-s3-acls.py | 44 ++-------------------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 00a31dfb..bd68d607 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -105,21 +105,15 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}") exit() + CSV_DELIMITER = "," NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" -NZSL_COOKED_KEYS_FILE = f"{TMPDIR}/nzsl_cooked_keys.txt" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" -S3_BUCKET_ERROR_KEYS_FILE = f"{TMPDIR}/s3_bucket_error_keys.csv" -S3_BUCKET_CONTENTS_FILE = f"{TMPDIR}/s3_bucket_contents.csv" -S3_KEYS_NOT_IN_NZSL_FILE = f"{TMPDIR}/s3_keys_not_in_nzsl.csv" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" nzsl_raw_keys_dict = {} -nzsl_cooked_keys_dict = {} -s3_keys_not_in_nzsl_list = [] - -# TODO This will replace everything all_keys_dict = {} + nkeys_present = 0 nkeys_absent = 0 @@ -158,33 +152,11 @@ except FileNotFoundError: print(f"File not found: {ALL_KEYS_FILE}") exit() - """ - try: - with open(NZSL_COOKED_KEYS_FILE, "r") as f_obj: - for line in f_obj.readlines(): - video_key, is_public = line.strip().split(CSV_DELIMITER) - nzsl_cooked_keys_dict[video_key] = is_public - except FileNotFoundError: - print(f"File not found: {NZSL_COOKED_KEYS_FILE}") - exit() - try: - with open(S3_KEYS_NOT_IN_NZSL_FILE, "r") as f_obj: - s3_keys_not_in_nzsl_list = [line.strip() for line in f_obj.readlines()] - except FileNotFoundError: - print(f"File not found: {S3_KEYS_NOT_IN_NZSL_FILE}") - exit() - print(f"PRESENT: {len(nzsl_cooked_keys_dict)} keys") - print(f"ABSENT: {len(s3_keys_not_in_nzsl_list)} keys") - """ else: # Zero-out files for p in ( NZSL_RAW_KEYS_FILE, - NZSL_COOKED_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, - S3_BUCKET_ERROR_KEYS_FILE, - S3_BUCKET_CONTENTS_FILE, - S3_KEYS_NOT_IN_NZSL_FILE, ALL_KEYS_FILE ): f = open(p, "a") @@ -257,25 +229,13 @@ nkeys_present += 1 # Add 'Present' column to start all_keys_dict[video_key] = [True] + nzsl_raw_keys_dict[video_key] - nzsl_cooked_keys_dict[video_key] = nzsl_raw_keys_dict[video_key] else: nkeys_absent += 1 - s3_keys_not_in_nzsl_list.append(video_key) # Add 'Present' (absent) column to start all_keys_dict[video_key] = [False, "", "", ""] print(f"PRESENT: {nkeys_present} keys") 
print(f"ABSENT: {nkeys_absent} keys") - # Write the "cooked" (i.e. present) keys back to a file - with open(NZSL_COOKED_KEYS_FILE, "w") as f_obj: - for video_key, is_public in nzsl_cooked_keys_dict.items(): - f_obj.write(f"{video_key}{CSV_DELIMITER}{str(is_public)}\n") - - # Write the absent keys back to a file - with open(S3_KEYS_NOT_IN_NZSL_FILE, "w") as f_obj: - for video_key in s3_keys_not_in_nzsl_list: - f_obj.write(f"{video_key}\n") - # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: for video_key, item_list in all_keys_dict.items(): From d2ebec3e7c32b20635c4ec31ece2f670af33b7dc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:44:48 +1000 Subject: [PATCH 028/222] cleanups --- bin/get-video-s3-acls.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bd68d607..3a30c637 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -107,7 +107,7 @@ exit() CSV_DELIMITER = "," -NZSL_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_raw_keys.txt" +NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.txt" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" @@ -118,19 +118,17 @@ nkeys_absent = 0 if args.cached: - # Pull all info from existing files + # Pull all info from existing file try: with open(ALL_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - - print(line, end="") - video_key, is_present_str, db_id_str, gloss_id_str, is_public_str = line.strip().split(CSV_DELIMITER) is_present = is_present_str.strip().lower() == "true" if is_present: nkeys_present += 1 db_id = int(db_id_str) + # Some don't have gloss_id's try: gloss_id = int(gloss_id_str) except ValueError: @@ -144,18 +142,15 @@ all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] - print(video_key, end=" ") - pprint(all_keys_dict[video_key]) - print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"ABSENT: {nkeys_absent} keys") except FileNotFoundError: print(f"File not found: {ALL_KEYS_FILE}") exit() else: # Zero-out files for p in ( - NZSL_RAW_KEYS_FILE, + NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE ): @@ -188,7 +183,7 @@ # Get the video files info from NZSL Signbank print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...") - with open(NZSL_RAW_KEYS_FILE, "w") as f_obj: + with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ PGCLIENT, @@ -203,9 +198,9 @@ text=True, stdout=f_obj, ) - with open(NZSL_RAW_KEYS_FILE, "r") as f_obj: + with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_RAW_KEYS_FILE}") + print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}") # Separate out the NZSL key columns # Write them to a dictionary, so we can do fast operations on them @@ -234,7 +229,7 @@ # Add 'Present' (absent) column to start all_keys_dict[video_key] = [False, "", "", ""] print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"ABSENT: {nkeys_absent} keys") # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: From ce51fa442a8097e86f3f089ff19c34239ee58c53 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:49:36 +1000 Subject: 
[PATCH 029/222] s3_bucket_raw_keys_list --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3a30c637..3b233382 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -112,6 +112,7 @@ ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] all_keys_dict = {} nkeys_present = 0 From fedf7e9544ac20495aa144a6cdc4725a2c3878ec Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 30 Aug 2024 18:19:06 +1000 Subject: [PATCH 030/222] Output changed to JSON -> py dict for processing --- bin/get-video-s3-acls.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3b233382..c4ac5919 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -9,6 +9,7 @@ import os import subprocess import argparse +import json from pprint import pprint # TODO @@ -203,8 +204,8 @@ nzsl_raw_keys_list = f_obj.readlines() print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}") - # Separate out the NZSL key columns - # Write them to a dictionary, so we can do fast operations on them + # Separate out the NZSL db columns + # Write them to a dictionary, so we can do fast operations for rawl in nzsl_raw_keys_list: rawl = rawl.strip() if not rawl: @@ -215,7 +216,7 @@ is_public = columns[2].strip().lower() == "t" # 'videofile' data is also the key for S3 video_key = columns[3].strip() - # Each dictionary entry is all of these values + # Each dictionary slot contains these values nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] # Get the s3 keys present and absent from our NZSL keys @@ -244,15 +245,13 @@ if not is_present: continue - print(video_key) - result = subprocess.run( [ AWSCLIENT, "s3api", "get-object-acl", "--output", - "text", + "json", "--bucket", AWS_S3_BUCKET, "--key", @@ -268,4 +267,6 @@ print(f"Public: {is_public}") print(f"db_id: {db_id}") print(f"gloss_id: {gloss_id}") - print(result.stdout) + + # Still figuring out how to make this into canned ACLS, shouldn't be hard + pprint(json.loads(result.stdout)) From 274ff5a1fb77133d68980cd219fa873d260076f9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:05:02 +1000 Subject: [PATCH 031/222] All fields represented and ACL logic working --- bin/get-video-s3-acls.py | 68 ++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c4ac5919..dad43695 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -242,31 +242,45 @@ # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): - if not is_present: - continue - - result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], - env=new_env, - shell=False, - check=True, - capture_output=True, - text=True, - ) + canned_acl = "" + canned_acl_expected = "" + if is_present: + canned_acl_expected = "public-read" if is_public else "private" + result = subprocess.run( + [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + 
video_key, + ], + env=new_env, + shell=False, + check=True, + capture_output=True, + text=True, + ) + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if acls_grants_json[0]["Permission"] == "FULL_CONTROL" and acls_grants_json[1]["Permission"] == "READ": + canned_acl = "public-read" + else: + canned_acl = "Unknown ACL" + else: + if acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" + else: + canned_acl = "Unknown ACL" print(f"Key: {video_key}") - print(f"Public: {is_public}") - print(f"db_id: {db_id}") - print(f"gloss_id: {gloss_id}") - - # Still figuring out how to make this into canned ACLS, shouldn't be hard - pprint(json.loads(result.stdout)) + print(f"Present: {is_present}") + print(f"db_id: {db_id if is_present else ''}") + print(f"gloss_id: {gloss_id if is_present else ''}") + print(f"Public: {is_public if is_present else ''}") + print(f"Expected: {canned_acl_expected}") + print(f"Got: {canned_acl}") + print(f"Match: {str(canned_acl_expected == canned_acl) if is_present else ''}") + print("--------------------------------------") From 8fc2e414b7f6f20d2da1184dedf7242e7f598cd6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:09:36 +1000 Subject: [PATCH 032/222] black --- bin/get-video-s3-acls.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index dad43695..eb82fc46 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -124,7 +124,13 @@ try: with open(ALL_KEYS_FILE, "r") as f_obj: for line in f_obj.readlines(): - video_key, is_present_str, db_id_str, gloss_id_str, is_public_str = line.strip().split(CSV_DELIMITER) + ( + video_key, + is_present_str, + db_id_str, + gloss_id_str, + is_public_str, + ) = line.strip().split(CSV_DELIMITER) is_present = is_present_str.strip().lower() == "true" if is_present: @@ -151,11 +157,7 @@ exit() else: # Zero-out files - for p in ( - NZSL_POSTGRES_RAW_KEYS_FILE, - S3_BUCKET_RAW_KEYS_FILE, - ALL_KEYS_FILE - ): + for p in (NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE): f = open(p, "a") f.truncate() f.close() @@ -236,11 +238,17 @@ # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: for video_key, item_list in all_keys_dict.items(): - outstr = f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + outstr = ( + f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + ) f_obj.write(outstr) # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") +# CSV header +print( + f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" +) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" canned_acl_expected = "" @@ -266,7 +274,10 @@ ) acls_grants_json = json.loads(result.stdout)["Grants"] if len(acls_grants_json) > 1: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL" and acls_grants_json[1]["Permission"] == "READ": + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): canned_acl = "public-read" else: canned_acl = "Unknown ACL" @@ -275,7 +286,8 @@ canned_acl = "private" else: canned_acl = "Unknown ACL" - print(f"Key: {video_key}") + # CSV columns + print(f"Key: 
{video_key}", end=CSV_DELIMITER) print(f"Present: {is_present}") print(f"db_id: {db_id if is_present else ''}") print(f"gloss_id: {gloss_id if is_present else ''}") From c143b2f92eb9fa5bb4198df82bfaf6feb05319ef Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:13:37 +1000 Subject: [PATCH 033/222] black --- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index eb82fc46..39b6a573 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -286,13 +286,13 @@ canned_acl = "private" else: canned_acl = "Unknown ACL" + # CSV columns - print(f"Key: {video_key}", end=CSV_DELIMITER) - print(f"Present: {is_present}") - print(f"db_id: {db_id if is_present else ''}") - print(f"gloss_id: {gloss_id if is_present else ''}") - print(f"Public: {is_public if is_present else ''}") - print(f"Expected: {canned_acl_expected}") - print(f"Got: {canned_acl}") - print(f"Match: {str(canned_acl_expected == canned_acl) if is_present else ''}") - print("--------------------------------------") + print(f"{video_key}", end=CSV_DELIMITER) + print(f"{is_present}", end=CSV_DELIMITER) + print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) + print(f"{canned_acl_expected}", end=CSV_DELIMITER) + print(f"{canned_acl}", end=CSV_DELIMITER) + print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") From 1c267ff958f4c91337cde99ab0e774f1f4b1c819 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:12:21 +1000 Subject: [PATCH 034/222] remove pprint --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 39b6a573..969c665a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -10,7 +10,6 @@ import subprocess import argparse import json -from pprint import pprint # TODO # We are using external apps just for the moment. 
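A note on the TODO carried in the hunk above ("We are using external apps just for the moment"): the psql subprocess that patch 024 sets up, together with its temp file, "-t" flag, and manual split on "|", maps onto a plain database driver in a few lines. The sketch below is illustrative only — it assumes psycopg2 as that driver (the project could equally use the Django ORM, this being a Signbank app) and the helper name is invented; the query and the [db_id, gloss_id, is_public] slot layout are taken straight from the script.

    # Illustrative sketch, not part of any patch: a native replacement for the
    # PGCLIENT/psql subprocess. Assumes psycopg2; the helper name is invented.
    import psycopg2

    def fetch_nzsl_raw_keys(database_url):
        nzsl_raw_keys_dict = {}
        with psycopg2.connect(database_url) as conn:
            with conn.cursor() as cur:
                # Same query the script passes to psql -c
                cur.execute(
                    "SELECT id, gloss_id, is_public, videofile FROM video_glossvideo"
                )
                for db_id, gloss_id, is_public, video_key in cur:
                    # 'videofile' doubles as the S3 key; is_public arrives as a
                    # real boolean, so the 't'/'f' string comparison disappears
                    nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public]
        return nzsl_raw_keys_dict

Heroku-style DATABASE_URL strings (postgres://...) are accepted directly by psycopg2.connect(), so the environment-variable handling the script already has would carry over unchanged.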
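Likewise on the S3 side: the Grants-reduction logic from patch 031 has a direct boto3 equivalent. Same caveats — boto3 and the helper name are assumptions, not anything a patch here introduces — but the grant-to-canned-ACL mapping below is exactly the one the patch implements, and credential resolution honours the same AWS_PROFILE the script requires.

    # Illustrative sketch: boto3 equivalent of the `aws s3api get-object-acl`
    # subprocess call. Credentials resolve via AWS_PROFILE, as the script
    # already requires.
    import boto3

    def fetch_canned_acl(bucket, key):
        grants = boto3.client("s3").get_object_acl(Bucket=bucket, Key=key)["Grants"]
        if len(grants) > 1:
            # Owner FULL_CONTROL plus a READ grant is what "public-read" leaves behind
            if (
                grants[0]["Permission"] == "FULL_CONTROL"
                and grants[1]["Permission"] == "READ"
            ):
                return "public-read"
            return "Unknown ACL"
        # A single FULL_CONTROL grant corresponds to "private"
        if grants[0]["Permission"] == "FULL_CONTROL":
            return "private"
        return "Unknown ACL"

The corresponding write-side call in the same client is put_object_acl(Bucket=..., Key=..., ACL="public-read") or ACL="private".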
From 1937bf3e65d22d4f0580c559eee2cfeacb350ce6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:26:45 +1000 Subject: [PATCH 035/222] Header writes to stderr --- bin/get-video-s3-acls.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 969c665a..298f5219 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -7,6 +7,7 @@ import os +import sys import subprocess import argparse import json @@ -26,7 +27,7 @@ ) # Positional arguments if DATABASE_URL: - print("DATABASE_URL defined in environment") + print("DATABASE_URL defined in environment", file=sys.stderr) else: parser.add_argument( "dburl", @@ -87,23 +88,23 @@ DATABASE_URL = args.dburl if args.cached: - print("Using the video keys we recorded on the last non-cached run.") + print("Using the video keys we recorded on the last non-cached run.", file=sys.stderr) else: - print("Generating keys from scratch.") + print("Generating keys from scratch.", file=sys.stderr) -print(f"Mode: {MODE_STR}") -print(f" NZSL app: {NZSL_APP}") -print(f" S3 bucket: {AWS_S3_BUCKET}") -print(f"AWS profile: {new_env['AWS_PROFILE']}") -print(f"AWSCLIENT: {AWSCLIENT}") -print(f"PGCLIENT: {PGCLIENT}") -print(f"DATABASE_URL:\n{DATABASE_URL}") +print(f"Mode: {MODE_STR}", file=sys.stderr) +print(f" NZSL app: {NZSL_APP}", file=sys.stderr) +print(f" S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) +print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) +print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +print(f"DATABASE_URL:\n{DATABASE_URL}", file=sys.stderr) TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: - print(f"Error creating temporary directory: {TMPDIR} {err}") + print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() CSV_DELIMITER = "," @@ -149,10 +150,10 @@ all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] - print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) + print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) except FileNotFoundError: - print(f"File not found: {ALL_KEYS_FILE}") + print(f"File not found: {ALL_KEYS_FILE}", file=sys.stderr) exit() else: # Zero-out files @@ -162,7 +163,7 @@ f.close() # Get all keys from AWS S3 - print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...") + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], @@ -177,7 +178,7 @@ # Put the keys in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] - print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}") + print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", file=sys.stderr) # Write the keys back to the file, for cleanliness with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: @@ -185,7 +186,7 @@ f_obj.write(f"{line}\n") # Get the video files info from NZSL Signbank - print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...") + print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", file=sys.stderr) with 
open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ @@ -203,7 +204,7 @@ ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}") + print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", file=sys.stderr) # Separate out the NZSL db columns # Write them to a dictionary, so we can do fast operations @@ -221,7 +222,7 @@ nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] # Get the s3 keys present and absent from our NZSL keys - print("Getting S3 keys present and absent from NZSL Signbank ...") + print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) for video_key in s3_bucket_raw_keys_list: if video_key in nzsl_raw_keys_dict: nkeys_present += 1 @@ -231,8 +232,8 @@ nkeys_absent += 1 # Add 'Present' (absent) column to start all_keys_dict[video_key] = [False, "", "", ""] - print(f"PRESENT: {nkeys_present} keys") - print(f"ABSENT: {nkeys_absent} keys") + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) + print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) # Write all keys back to a file with open(ALL_KEYS_FILE, "w") as f_obj: @@ -243,7 +244,7 @@ f_obj.write(outstr) # From the keys present in NZSL, get all their ACL information -print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...") +print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" From 79e1361705951cbcc6ab0346c7a6539e28933331 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:26:53 +1000 Subject: [PATCH 036/222] black --- bin/get-video-s3-acls.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 298f5219..128a3955 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -88,7 +88,9 @@ DATABASE_URL = args.dburl if args.cached: - print("Using the video keys we recorded on the last non-cached run.", file=sys.stderr) + print( + "Using the video keys we recorded on the last non-cached run.", file=sys.stderr + ) else: print("Generating keys from scratch.", file=sys.stderr) @@ -178,7 +180,10 @@ # Put the keys in an in-memory list with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] - print(f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", file=sys.stderr) + print( + f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", + file=sys.stderr, + ) # Write the keys back to the file, for cleanliness with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: @@ -186,7 +191,10 @@ f_obj.write(f"{line}\n") # Get the video files info from NZSL Signbank - print(f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", file=sys.stderr) + print( + f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", + file=sys.stderr, + ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: result = subprocess.run( [ @@ -204,7 +212,10 @@ ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() - print(f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", file=sys.stderr) + print( + 
f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", + file=sys.stderr, + ) # Separate out the NZSL db columns # Write them to a dictionary, so we can do fast operations From 08afe68f998c569ee36b48ac2ddef26869f8adfb Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:28:41 +1000 Subject: [PATCH 037/222] Long line that black missed --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 128a3955..249b2c8d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -258,7 +258,8 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( - f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" + f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}" + "Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" From 4b64642600519841d9875a238c6e1c04ca718c01 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:20:44 +1000 Subject: [PATCH 038/222] AWS_PROFILE purely environment var --- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 249b2c8d..b20011e6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,10 +21,16 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) +# AWS: Is there an AWS_PROFILE defined in the environment? +AWS_PROFILE = os.getenv("AWS_PROFILE", None) +if not AWS_PROFILE: + print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") + exit() + parser = argparse.ArgumentParser( - description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " - "argument." + description="You must define, in the environment: AWS_PROFILE" ) + # Positional arguments if DATABASE_URL: print("DATABASE_URL defined in environment", file=sys.stderr) @@ -34,12 +40,6 @@ help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", ) # Optional arguments -parser.add_argument( - "--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: '%(default)s')", -) parser.add_argument( "--production", default=False, @@ -78,8 +78,8 @@ NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" +# Get the environment new_env = os.environ.copy() -new_env["AWS_PROFILE"] = args.awsprofile AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From e907e2e02f70a9923e4a1047462d181bc5418be9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:21:26 +1000 Subject: [PATCH 039/222] Revert "AWS_PROFILE purely environment var" Jumped the gun, wrong variable. This reverts commit 4b64642600519841d9875a238c6e1c04ca718c01. 
--- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b20011e6..249b2c8d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,16 +21,10 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) -# AWS: Is there an AWS_PROFILE defined in the environment? -AWS_PROFILE = os.getenv("AWS_PROFILE", None) -if not AWS_PROFILE: - print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") - exit() - parser = argparse.ArgumentParser( - description="You must define, in the environment: AWS_PROFILE" + description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " + "argument." ) - # Positional arguments if DATABASE_URL: print("DATABASE_URL defined in environment", file=sys.stderr) @@ -40,6 +34,12 @@ help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", ) # Optional arguments +parser.add_argument( + "--awsprofile", + default="nzsl", + required=False, + help=f"AWS configured profile to use (default: '%(default)s')", +) parser.add_argument( "--production", default=False, @@ -78,8 +78,8 @@ NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" -# Get the environment new_env = os.environ.copy() +new_env["AWS_PROFILE"] = args.awsprofile AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From 1f8f70bfe6d54d897a83de969157e076112d44ce Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:27:42 +1000 Subject: [PATCH 040/222] Revert "Revert "AWS_PROFILE purely environment var"" No, I actually had it right the first time. This reverts commit e907e2e02f70a9923e4a1047462d181bc5418be9. --- bin/get-video-s3-acls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 249b2c8d..b20011e6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,10 +21,16 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) +# AWS: Is there an AWS_PROFILE defined in the environment? +AWS_PROFILE = os.getenv("AWS_PROFILE", None) +if not AWS_PROFILE: + print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") + exit() + parser = argparse.ArgumentParser( - description="You must have a configured AWSCLIENT profile to use this app. See the --awsprofile " - "argument." 
+ description="You must define, in the environment: AWS_PROFILE" ) + # Positional arguments if DATABASE_URL: print("DATABASE_URL defined in environment", file=sys.stderr) @@ -34,12 +40,6 @@ help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", ) # Optional arguments -parser.add_argument( - "--awsprofile", - default="nzsl", - required=False, - help=f"AWS configured profile to use (default: '%(default)s')", -) parser.add_argument( "--production", default=False, @@ -78,8 +78,8 @@ NZSL_APP = "nzsl-signbank-uat" AWS_S3_BUCKET = "nzsl-signbank-media-uat" +# Get the environment new_env = os.environ.copy() -new_env["AWS_PROFILE"] = args.awsprofile AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From 5100287ed042508fa355479578c696aee2800d82 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:32:47 +1000 Subject: [PATCH 041/222] DATABASE_URL purely environment var. Missing stderrs added. --- bin/get-video-s3-acls.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b20011e6..6cba5805 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -20,25 +20,20 @@ # NZSL: Is there a database url defined in the environment? DATABASE_URL = os.getenv("DATABASE_URL", None) +if not DATABASE_URL: + print("You must define DATABASE_URL in the environment.", file=sys.stderr) + exit() # AWS: Is there an AWS_PROFILE defined in the environment? AWS_PROFILE = os.getenv("AWS_PROFILE", None) if not AWS_PROFILE: - print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'") + print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'", file=sys.stderr) exit() parser = argparse.ArgumentParser( - description="You must define, in the environment: AWS_PROFILE" + description="You must define, in the environment: AWS_PROFILE, DATABASE_URL" ) -# Positional arguments -if DATABASE_URL: - print("DATABASE_URL defined in environment", file=sys.stderr) -else: - parser.add_argument( - "dburl", - help=f"(REQUIRED) Database url (Overridden by DATABASE_URL environment variable)", - ) # Optional arguments parser.add_argument( "--production", @@ -84,9 +79,6 @@ AWSCLIENT = args.awsclient PGCLIENT = args.pgclient -if not DATABASE_URL: - DATABASE_URL = args.dburl - if args.cached: print( "Using the video keys we recorded on the last non-cached run.", file=sys.stderr @@ -100,7 +92,7 @@ print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -print(f"DATABASE_URL:\n{DATABASE_URL}", file=sys.stderr) +print(f"DATABASE_URL:\n{new_env['DATABASE_URL']}", file=sys.stderr) TMPDIR = "/tmp/nzsl" try: From 114d5c49ccc97d0d9f61f62ace6dd4d59f897985 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:39:57 +1000 Subject: [PATCH 042/222] Production/UAT mode changed to string --- bin/get-video-s3-acls.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 6cba5805..ff91972f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -36,11 +36,10 @@ # Optional arguments parser.add_argument( - "--production", - default=False, + "--mode", + default="uat", required=False, - action="store_true", - help="Run in PRODUCTION mode, instead of STAGING 
(default: %(default)s)", + help="Mode to run in, eg 'production, 'uat', etc (default: '%(default)s')", ) parser.add_argument( "--cached", @@ -64,14 +63,8 @@ ) args = parser.parse_args() -if args.production: - MODE_STR = "PRODUCTION" - NZSL_APP = "nzsl-signbank-production" - AWS_S3_BUCKET = "nzsl-signbank-media-production" -else: - MODE_STR = "STAGING" - NZSL_APP = "nzsl-signbank-uat" - AWS_S3_BUCKET = "nzsl-signbank-media-uat" +NZSL_APP = f"nzsl-signbank-{args.mode}" +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" # Get the environment new_env = os.environ.copy() @@ -86,7 +79,7 @@ else: print("Generating keys from scratch.", file=sys.stderr) -print(f"Mode: {MODE_STR}", file=sys.stderr) +print(f"Mode: {args.mode}", file=sys.stderr) print(f" NZSL app: {NZSL_APP}", file=sys.stderr) print(f" S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) From dd94fb2e4fd9c92d427351c660893d731e06eda1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:44:58 +1000 Subject: [PATCH 043/222] Better column names --- bin/get-video-s3-acls.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ff91972f..9435b471 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -243,8 +243,9 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( - f"Key{CSV_DELIMITER}Present{CSV_DELIMITER}db_id{CSV_DELIMITER}gloss_id{CSV_DELIMITER}" - "Public{CSV_DELIMITER}Expected{CSV_DELIMITER}Got{CSV_DELIMITER}Match" + f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" + f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" + f"{CSV_DELIMITER}Correct Canned ACL?" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" From c477058c619adc6216acdc0a1db252a89a3e6d05 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:46:07 +1000 Subject: [PATCH 044/222] black --- bin/get-video-s3-acls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9435b471..2d05b523 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -27,7 +27,10 @@ # AWS: Is there an AWS_PROFILE defined in the environment? AWS_PROFILE = os.getenv("AWS_PROFILE", None) if not AWS_PROFILE: - print("You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'", file=sys.stderr) + print( + "You must define AWS_PROFILE in the environment. Eg. 
AWS_PROFILE='nzsl'", + file=sys.stderr, + ) exit() parser = argparse.ArgumentParser( From e4ca16bb226ccfe7063e19759f19334173496631 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:01:14 +1000 Subject: [PATCH 045/222] Output raw ACL data as well --- bin/get-video-s3-acls.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2d05b523..43cd9eca 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -248,11 +248,12 @@ print( f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" - f"{CSV_DELIMITER}Correct Canned ACL?" + f"{CSV_DELIMITER}Correct Canned ACL?{CSV_DELIMITER}Raw ACL data" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" canned_acl_expected = "" + raw_acl = "" if is_present: canned_acl_expected = "public-read" if is_public else "private" result = subprocess.run( @@ -273,6 +274,7 @@ capture_output=True, text=True, ) + raw_acl = result.stdout.replace("\n", " ") acls_grants_json = json.loads(result.stdout)["Grants"] if len(acls_grants_json) > 1: if ( @@ -296,4 +298,5 @@ print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) print(f"{canned_acl_expected}", end=CSV_DELIMITER) print(f"{canned_acl}", end=CSV_DELIMITER) - print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") + print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}", end=CSV_DELIMITER) + print(f"{raw_acl}") From 9e24507d7488173a8fa94d98b0c29d3c3811a2a5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 2 Sep 2024 17:06:49 +1000 Subject: [PATCH 046/222] Comment showing where canned ACLs are set in main app --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 43cd9eca..1f0a6e49 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -255,6 +255,7 @@ canned_acl_expected = "" raw_acl = "" if is_present: + # See signbank/video/models.py, line 59, in function set_public_acl() canned_acl_expected = "public-read" if is_public else "private" result = subprocess.run( [ From 15af142829818966becc9c2893fb0eec95edb249 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 11:43:13 +1000 Subject: [PATCH 047/222] Raw ACL data and header removed again --- bin/get-video-s3-acls.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1f0a6e49..d73c911a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -248,7 +248,7 @@ print( f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" - f"{CSV_DELIMITER}Correct Canned ACL?{CSV_DELIMITER}Raw ACL data" + f"{CSV_DELIMITER}Correct Canned ACL?" 
) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" @@ -275,7 +275,6 @@ capture_output=True, text=True, ) - raw_acl = result.stdout.replace("\n", " ") acls_grants_json = json.loads(result.stdout)["Grants"] if len(acls_grants_json) > 1: if ( @@ -299,5 +298,4 @@ print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) print(f"{canned_acl_expected}", end=CSV_DELIMITER) print(f"{canned_acl}", end=CSV_DELIMITER) - print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}", end=CSV_DELIMITER) - print(f"{raw_acl}") + print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") From e85309a66bbcc25ca5673585ad60e758f44c5cbd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:00:50 +1000 Subject: [PATCH 048/222] Extraneous columns removed --- bin/get-video-s3-acls.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index d73c911a..b5d4e466 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -246,9 +246,8 @@ print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) # CSV header print( - f"Video S3 Key{CSV_DELIMITER}Present in Signbank?{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" + f"Video S3 Key{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" - f"{CSV_DELIMITER}Correct Canned ACL?" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" @@ -292,10 +291,8 @@ # CSV columns print(f"{video_key}", end=CSV_DELIMITER) - print(f"{is_present}", end=CSV_DELIMITER) print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) print(f"{canned_acl_expected}", end=CSV_DELIMITER) - print(f"{canned_acl}", end=CSV_DELIMITER) - print(f"{str(canned_acl_expected == canned_acl) if is_present else ''}") + print(f"{canned_acl}") From 9eed309168bf7b92062f2079ea42d84a5eb4fa11 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:03:52 +1000 Subject: [PATCH 049/222] NSZL_APP removed, no longer needed --- bin/get-video-s3-acls.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b5d4e466..e7564912 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -5,7 +5,6 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html - import os import sys import subprocess @@ -66,7 +65,6 @@ ) args = parser.parse_args() -NZSL_APP = f"nzsl-signbank-{args.mode}" AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" # Get the environment @@ -83,8 +81,7 @@ print("Generating keys from scratch.", file=sys.stderr) print(f"Mode: {args.mode}", file=sys.stderr) -print(f" NZSL app: {NZSL_APP}", file=sys.stderr) -print(f" S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) @@ -180,7 +177,7 @@ # Get the video files info from NZSL Signbank print( - 
f"Getting raw list of video file info from NZSL Signbank ({NZSL_APP}) ...", + f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: From 0e48d3a472a0446277d69f09f23655136f6aa377 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:09:29 +1000 Subject: [PATCH 050/222] AWS_PROFILE requirement removed --- bin/get-video-s3-acls.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e7564912..9354c55d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -23,17 +23,8 @@ print("You must define DATABASE_URL in the environment.", file=sys.stderr) exit() -# AWS: Is there an AWS_PROFILE defined in the environment? -AWS_PROFILE = os.getenv("AWS_PROFILE", None) -if not AWS_PROFILE: - print( - "You must define AWS_PROFILE in the environment. Eg. AWS_PROFILE='nzsl'", - file=sys.stderr, - ) - exit() - parser = argparse.ArgumentParser( - description="You must define, in the environment: AWS_PROFILE, DATABASE_URL" + description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable., DATABASE_URL" ) # Optional arguments From 4aa6add07d63826610cf8b297a5221cfa72e868c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:30:21 +1000 Subject: [PATCH 051/222] Removed question mark --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9354c55d..28108ce0 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -235,7 +235,7 @@ # CSV header print( f"Video S3 Key{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" - f"Signbank is_public?{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" + f"Signbank is_public{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" ) for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" From d7af5055582fbb5de13820d3c0ead240723f4eb6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:23:07 +1000 Subject: [PATCH 052/222] Header refactored --- bin/get-video-s3-acls.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 28108ce0..b448b090 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -232,11 +232,18 @@ # From the keys present in NZSL, get all their ACL information print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) + # CSV header -print( - f"Video S3 Key{CSV_DELIMITER}Postgres ID{CSV_DELIMITER}Gloss ID{CSV_DELIMITER}" - f"Signbank is_public{CSV_DELIMITER}Expected S3 Canned ACL{CSV_DELIMITER}Actual S3 Canned ACL" -) +csv_header_list = [ + "Video S3 Key", + "Postgres ID", + "Gloss ID", + "Signbank Public", + "Expected S3 Canned ACL", + "Actual S3 Canned ACL", +] +print(CSV_DELIMITER.join(csv_header_list)) + for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): canned_acl = "" canned_acl_expected = "" From 32b20425bc198b83cc672fffb083c2c63d47dd5e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:26:41 +1000 Subject: [PATCH 053/222] AWS_PROFILE printing 
conditional --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b448b090..8b1799ff 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -73,7 +73,8 @@ print(f"Mode: {args.mode}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) +if "AWS_PROFILE" in new_env: + print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) print(f"DATABASE_URL:\n{new_env['DATABASE_URL']}", file=sys.stderr) From 4bc617c5d56807d564adfb8a7ebfdb67127567e7 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:30:22 +1000 Subject: [PATCH 054/222] Tidy up --- bin/get-video-s3-acls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 8b1799ff..830e8f21 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -57,13 +57,13 @@ args = parser.parse_args() AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +AWSCLIENT = args.awsclient +PGCLIENT = args.pgclient # Get the environment new_env = os.environ.copy() -AWSCLIENT = args.awsclient -PGCLIENT = args.pgclient - if args.cached: print( "Using the video keys we recorded on the last non-cached run.", file=sys.stderr From ea689f4daa090e63aaffa066d27a567be78257ea Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:03:55 +1000 Subject: [PATCH 055/222] duplicated line removed --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 830e8f21..53d2eead 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -56,7 +56,6 @@ ) args = parser.parse_args() -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" AWSCLIENT = args.awsclient PGCLIENT = args.pgclient From 53c4154b84d1995fb88fd09a2cc873f55a9bf182 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:58:53 +1000 Subject: [PATCH 056/222] Initial organisation into functions, and cleanup --- bin/get-video-s3-acls.py | 290 +++++++++++++++++++++------------------ 1 file changed, 160 insertions(+), 130 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 53d2eead..3f809514 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,23 +11,20 @@ import argparse import json -# TODO -# We are using external apps just for the moment. -# These will be removed for native libraries. -AWSCLIENT = "/usr/local/bin/aws" -PGCLIENT = "/usr/bin/psql" - -# NZSL: Is there a database url defined in the environment? +# Globals DATABASE_URL = os.getenv("DATABASE_URL", None) if not DATABASE_URL: print("You must define DATABASE_URL in the environment.", file=sys.stderr) exit() +NEW_ENV = os.environ.copy() +CSV_DELIMITER = "," +nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] +all_keys_dict = {} parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. 
AWS_PROFILE environment variable., DATABASE_URL" ) - -# Optional arguments parser.add_argument( "--mode", default="uat", @@ -44,63 +41,49 @@ ) parser.add_argument( "--pgclient", - default=PGCLIENT, + default="/usr/bin/psql", required=False, help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( "--awsclient", - default=AWSCLIENT, + default="/usr/local/bin/aws", required=False, help=f"AWS client path (default: %(default)s)", ) args = parser.parse_args() -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" AWSCLIENT = args.awsclient PGCLIENT = args.pgclient +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" -# Get the environment -new_env = os.environ.copy() - -if args.cached: - print( - "Using the video keys we recorded on the last non-cached run.", file=sys.stderr - ) -else: - print("Generating keys from scratch.", file=sys.stderr) - -print(f"Mode: {args.mode}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -if "AWS_PROFILE" in new_env: - print(f"AWS profile: {new_env['AWS_PROFILE']}", file=sys.stderr) -print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) -print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -print(f"DATABASE_URL:\n{new_env['DATABASE_URL']}", file=sys.stderr) - +# Files TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() - -CSV_DELIMITER = "," NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.txt" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} +# Truncate files, creating them if necessary +def init_files(files_list): + # Zero-out files + for p in files_list: + f = open(p, "a") + f.truncate() + f.close() -nkeys_present = 0 -nkeys_absent = 0 -if args.cached: - # Pull all info from existing file +# Pull all info from existing file +def get_keys_from_cache_file(cache_file): + nkeys_present = 0 + nkeys_absent = 0 + this_all_keys_dict = {} try: - with open(ALL_KEYS_FILE, "r") as f_obj: + with open(cache_file, "r") as f_obj: for line in f_obj.readlines(): ( video_key, @@ -126,26 +109,25 @@ gloss_id = None is_public = None - all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) + + return this_all_keys_dict + except FileNotFoundError: - print(f"File not found: {ALL_KEYS_FILE}", file=sys.stderr) + print(f"File not found: {cache_file}", file=sys.stderr) exit() -else: - # Zero-out files - for p in (NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE): - f = open(p, "a") - f.truncate() - f.close() - # Get all keys from AWS S3 - print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) - with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: + +# Get all keys from AWS S3 +def get_keys_from_s3(s3_bucket, keys_file): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + with open(keys_file, "w") as f_obj: result = subprocess.run( - [AWSCLIENT, "s3", "ls", f"s3://{AWS_S3_BUCKET}", "--recursive"], - env=new_env, + [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], + env=NEW_ENV, shell=False, check=True, text=True, @@ -154,24 +136,29 @@ # Separate out just the key (also strip newline) from date, time, size, key # Put the 
keys in an in-memory list - with open(S3_BUCKET_RAW_KEYS_FILE, "r") as f_obj: - s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] + with open(keys_file, "r") as f_obj: + this_s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] print( - f"{len(s3_bucket_raw_keys_list)} rows retrieved: {S3_BUCKET_RAW_KEYS_FILE}", + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved: {keys_file}", file=sys.stderr, ) # Write the keys back to the file, for cleanliness - with open(S3_BUCKET_RAW_KEYS_FILE, "w") as f_obj: - for line in s3_bucket_raw_keys_list: + with open(keys_file, "w") as f_obj: + for line in this_s3_bucket_raw_keys_list: f_obj.write(f"{line}\n") - # Get the video files info from NZSL Signbank + return this_s3_bucket_raw_keys_list + + +# Get the video files info from NZSL Signbank +def get_keys_from_nzsl(keys_file): + this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) - with open(NZSL_POSTGRES_RAW_KEYS_FILE, "w") as f_obj: + with open(keys_file, "w") as f_obj: result = subprocess.run( [ PGCLIENT, @@ -180,16 +167,16 @@ "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", f"{DATABASE_URL}", ], - env=new_env, + env=NEW_ENV, shell=False, check=True, text=True, stdout=f_obj, ) - with open(NZSL_POSTGRES_RAW_KEYS_FILE, "r") as f_obj: + with open(keys_file, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() print( - f"{len(nzsl_raw_keys_list)} rows retrieved: {NZSL_POSTGRES_RAW_KEYS_FILE}", + f"{len(nzsl_raw_keys_list)} rows retrieved: {keys_file}", file=sys.stderr, ) @@ -206,88 +193,131 @@ # 'videofile' data is also the key for S3 video_key = columns[3].strip() # Each dictionary slot contains these values - nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] + this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] + + return this_nzsl_raw_keys_dict + - # Get the s3 keys present and absent from our NZSL keys +# Get the s3 keys present and absent from our NZSL keys +def create_all_keys_dict( + this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file +): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) - for video_key in s3_bucket_raw_keys_list: - if video_key in nzsl_raw_keys_dict: + nkeys_present = 0 + nkeys_absent = 0 + this_all_keys_dict = {} + for video_key in this_s3_bucket_raw_keys_list: + if video_key in this_nzsl_raw_keys_dict: nkeys_present += 1 # Add 'Present' column to start - all_keys_dict[video_key] = [True] + nzsl_raw_keys_dict[video_key] + this_all_keys_dict[video_key] = [True] + this_nzsl_raw_keys_dict[video_key] else: nkeys_absent += 1 # Add 'Present' (absent) column to start - all_keys_dict[video_key] = [False, "", "", ""] + this_all_keys_dict[video_key] = [False, "", "", ""] print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) # Write all keys back to a file - with open(ALL_KEYS_FILE, "w") as f_obj: - for video_key, item_list in all_keys_dict.items(): + with open(all_keys_file, "w") as f_obj: + for video_key, item_list in this_all_keys_dict.items(): outstr = ( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) f_obj.write(outstr) + return this_all_keys_dict + + # From the keys present in NZSL, get all their ACL information -print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) - -# CSV header -csv_header_list = [ - "Video S3 Key", - "Postgres ID", - "Gloss ID", - "Signbank Public", - "Expected S3 
Canned ACL", - "Actual S3 Canned ACL", -] -print(CSV_DELIMITER.join(csv_header_list)) - -for video_key, [is_present, db_id, gloss_id, is_public] in all_keys_dict.items(): - canned_acl = "" - canned_acl_expected = "" - raw_acl = "" - if is_present: - # See signbank/video/models.py, line 59, in function set_public_acl() - canned_acl_expected = "public-read" if is_public else "private" - result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], - env=new_env, - shell=False, - check=True, - capture_output=True, - text=True, - ) - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: - if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" - ): - canned_acl = "public-read" - else: - canned_acl = "Unknown ACL" - else: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL": - canned_acl = "private" +def output_csv(this_all_keys_dict): + print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) + + # CSV header + csv_header_list = [ + "Video S3 Key", + "Postgres ID", + "Gloss ID", + "Signbank Public", + "Expected S3 Canned ACL", + "Actual S3 Canned ACL", + ] + print(CSV_DELIMITER.join(csv_header_list)) + + for video_key, [ + is_present, + db_id, + gloss_id, + is_public, + ] in this_all_keys_dict.items(): + canned_acl = "" + canned_acl_expected = "" + raw_acl = "" + if is_present: + # See signbank/video/models.py, line 59, in function set_public_acl() + canned_acl_expected = "public-read" if is_public else "private" + result = subprocess.run( + [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=NEW_ENV, + shell=False, + check=True, + capture_output=True, + text=True, + ) + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): + canned_acl = "public-read" + else: + canned_acl = "Unknown ACL" else: - canned_acl = "Unknown ACL" - - # CSV columns - print(f"{video_key}", end=CSV_DELIMITER) - print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) - print(f"{canned_acl_expected}", end=CSV_DELIMITER) - print(f"{canned_acl}") + if acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" + else: + canned_acl = "Unknown ACL" + + # CSV columns + print(f"{video_key}", end=CSV_DELIMITER) + print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) + print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) + print(f"{canned_acl_expected}", end=CSV_DELIMITER) + print(f"{canned_acl}") + + +print(f"Mode: {args.mode}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +if "AWS_PROFILE" in NEW_ENV: + print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) +print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) +print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) + +if args.cached: + print( + "Using the video keys we recorded on the last non-cached run.", file=sys.stderr + ) + all_keys_dict = get_keys_from_cache_file(ALL_KEYS_FILE) +else: + print("Generating keys from 
scratch.", file=sys.stderr) + init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) + s3_bucket_raw_keys_list = get_keys_from_s3(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) + nzsl_raw_keys_dict = get_keys_from_nzsl(NZSL_POSTGRES_RAW_KEYS_FILE) + all_keys_dict = create_all_keys_dict( + s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE + ) + +output_csv(all_keys_dict) From bb6a53e1d2b83f54ad96643f206215aa11359fc1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 3 Sep 2024 18:06:46 +1000 Subject: [PATCH 057/222] Tidy ups and renaming --- bin/get-video-s3-acls.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3f809514..7b65cd3a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,16 +11,6 @@ import argparse import json -# Globals -DATABASE_URL = os.getenv("DATABASE_URL", None) -if not DATABASE_URL: - print("You must define DATABASE_URL in the environment.", file=sys.stderr) - exit() -NEW_ENV = os.environ.copy() -CSV_DELIMITER = "," -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable., DATABASE_URL" @@ -53,9 +43,21 @@ ) args = parser.parse_args() +# Globals AWSCLIENT = args.awsclient PGCLIENT = args.pgclient +DATABASE_URL = os.getenv("DATABASE_URL", None) +if not DATABASE_URL: + print("You must define DATABASE_URL in the environment.", file=sys.stderr) + exit() +NEW_ENV = os.environ.copy() +CSV_DELIMITER = "," + +# Vars AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] +all_keys_dict = {} # Files TMPDIR = "/tmp/nzsl" @@ -68,9 +70,9 @@ S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" + # Truncate files, creating them if necessary def init_files(files_list): - # Zero-out files for p in files_list: f = open(p, "a") f.truncate() @@ -122,10 +124,10 @@ def get_keys_from_cache_file(cache_file): # Get all keys from AWS S3 -def get_keys_from_s3(s3_bucket, keys_file): +def get_s3_bucket_raw_keys_list(s3_bucket, keys_file): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: - result = subprocess.run( + subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], env=NEW_ENV, shell=False, @@ -152,14 +154,14 @@ def get_keys_from_s3(s3_bucket, keys_file): # Get the video files info from NZSL Signbank -def get_keys_from_nzsl(keys_file): +def get_nzsl_raw_keys_dict(keys_file): this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) with open(keys_file, "w") as f_obj: - result = subprocess.run( + subprocess.run( [ PGCLIENT, "-t", @@ -314,8 +316,8 @@ def output_csv(this_all_keys_dict): else: print("Generating keys from scratch.", file=sys.stderr) init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) - s3_bucket_raw_keys_list = get_keys_from_s3(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) - nzsl_raw_keys_dict = get_keys_from_nzsl(NZSL_POSTGRES_RAW_KEYS_FILE) + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) + nzsl_raw_keys_dict = get_nzsl_raw_keys_dict(NZSL_POSTGRES_RAW_KEYS_FILE) all_keys_dict = create_all_keys_dict( 
s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE ) From 1b93c81bb0de2969f5ab405bea0c9179a5daa2b9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 08:50:50 +1000 Subject: [PATCH 058/222] DATABASE_URL warning message removed --- bin/get-video-s3-acls.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7b65cd3a..9a02ebbd 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,9 +11,9 @@ import argparse import json - parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable., DATABASE_URL" + description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable." + "Postgres access details, eg. DATABASE_URL" ) parser.add_argument( "--mode", @@ -47,9 +47,6 @@ AWSCLIENT = args.awsclient PGCLIENT = args.pgclient DATABASE_URL = os.getenv("DATABASE_URL", None) -if not DATABASE_URL: - print("You must define DATABASE_URL in the environment.", file=sys.stderr) - exit() NEW_ENV = os.environ.copy() CSV_DELIMITER = "," @@ -316,7 +313,9 @@ def output_csv(this_all_keys_dict): else: print("Generating keys from scratch.", file=sys.stderr) init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE) + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list( + AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE + ) nzsl_raw_keys_dict = get_nzsl_raw_keys_dict(NZSL_POSTGRES_RAW_KEYS_FILE) all_keys_dict = create_all_keys_dict( s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE From ea751e5e8497d7869dc816af833046e3c751d6cc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:21:39 +1000 Subject: [PATCH 059/222] whitespace --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9a02ebbd..ba90eb59 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,7 +12,7 @@ import json parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable." + description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable. " "Postgres access details, eg. DATABASE_URL" ) parser.add_argument( From 0b3ce8db0b26317a1f63a308bf95862376f0cac0 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:23:31 +1000 Subject: [PATCH 060/222] Adding OSV ignores just to silence warnings. Remove later. 
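These ignores are a stopgap to quieten CI and are meant to be revisited.
As a rough audit aid, the silenced advisories can be listed alongside their
advisory URLs with a sketch like the following (illustrative only, not part
of this patch; assumes PyYAML is installed):

    import yaml

    with open(".osv-detector.yml") as f:
        config = yaml.safe_load(f)

    # Inline comments in the YAML are dropped by the parser; only the IDs remain.
    for advisory_id in config.get("ignore", []):
        print(f"{advisory_id}: https://github.com/advisories/{advisory_id}")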
--- .osv-detector.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.osv-detector.yml b/.osv-detector.yml index 794fb52a..4b9be1f1 100644 --- a/.osv-detector.yml +++ b/.osv-detector.yml @@ -6,3 +6,5 @@ ignore: - GHSA-257q-pv89-v3xv # GHSA says affected versions are jQuery v.2.2.0 until v.3.5.0 - GHSA-vm8q-m57g-pff3 - GHSA-w3h3-4rj7-4ph4 + - GHSA-248v-346w-9cwc # Certifi removes GLOBALTRUST root certificate (https://github.com/advisories/GHSA-248v-346w-9cwc) + - GHSA-g92j-qhmh-64v2 # Sentry's Python SDK unintentionally exposes environment variables to subprocesses (https://github.com/advisories/GHSA-g92j-qhmh-64v2) From fbe33feeb00869c9b35793ad78d179d9154a4adb Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:25:06 +1000 Subject: [PATCH 061/222] More OSV ignores --- .osv-detector.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.osv-detector.yml b/.osv-detector.yml index 4b9be1f1..2d4acb90 100644 --- a/.osv-detector.yml +++ b/.osv-detector.yml @@ -8,3 +8,4 @@ ignore: - GHSA-w3h3-4rj7-4ph4 - GHSA-248v-346w-9cwc # Certifi removes GLOBALTRUST root certificate (https://github.com/advisories/GHSA-248v-346w-9cwc) - GHSA-g92j-qhmh-64v2 # Sentry's Python SDK unintentionally exposes environment variables to subprocesses (https://github.com/advisories/GHSA-g92j-qhmh-64v2) + - GHSA-9mvj-f7w8-pvh2 # Bootstrap Cross-Site Scripting (XSS) vulnerability (https://github.com/advisories/GHSA-9mvj-f7w8-pvh2) From 765963287eaebd6d4a53ee19a25acd39f297549b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:27:58 +1000 Subject: [PATCH 062/222] tidy ups --- bin/get-video-s3-acls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ba90eb59..c8e78a07 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,8 +12,8 @@ import json parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE environment variable. " - "Postgres access details, eg. DATABASE_URL" + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
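# NB: the two description strings above are adjacent literals, which Python
# concatenates at compile time; the trailing space on the first keeps the
# joined sentence readable.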
) parser.add_argument( "--mode", @@ -46,7 +46,7 @@ # Globals AWSCLIENT = args.awsclient PGCLIENT = args.pgclient -DATABASE_URL = os.getenv("DATABASE_URL", None) +DATABASE_URL = os.getenv("DATABASE_URL", "") NEW_ENV = os.environ.copy() CSV_DELIMITER = "," From cb2b3a03bca237522ba0956000fce3f1e3d5e5b4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:36:37 +1000 Subject: [PATCH 063/222] File names hidden --- bin/get-video-s3-acls.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c8e78a07..9c459015 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -69,7 +69,7 @@ # Truncate files, creating them if necessary -def init_files(files_list): +def init_files(files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE)): for p in files_list: f = open(p, "a") f.truncate() @@ -77,7 +77,7 @@ def init_files(files_list): # Pull all info from existing file -def get_keys_from_cache_file(cache_file): +def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} @@ -121,7 +121,7 @@ def get_keys_from_cache_file(cache_file): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket, keys_file): +def get_s3_bucket_raw_keys_list(s3_bucket, keys_file=S3_BUCKET_RAW_KEYS_FILE): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( @@ -151,7 +151,7 @@ def get_s3_bucket_raw_keys_list(s3_bucket, keys_file): # Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(keys_file): +def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", @@ -199,7 +199,7 @@ def get_nzsl_raw_keys_dict(keys_file): # Get the s3 keys present and absent from our NZSL keys def create_all_keys_dict( - this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file + this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file=ALL_KEYS_FILE ): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 @@ -309,16 +309,14 @@ def output_csv(this_all_keys_dict): print( "Using the video keys we recorded on the last non-cached run.", file=sys.stderr ) - all_keys_dict = get_keys_from_cache_file(ALL_KEYS_FILE) + all_keys_dict = get_keys_from_cache_file() else: print("Generating keys from scratch.", file=sys.stderr) - init_files([NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE]) - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list( - AWS_S3_BUCKET, S3_BUCKET_RAW_KEYS_FILE - ) - nzsl_raw_keys_dict = get_nzsl_raw_keys_dict(NZSL_POSTGRES_RAW_KEYS_FILE) + init_files() + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET) + nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict( - s3_bucket_raw_keys_list, nzsl_raw_keys_dict, ALL_KEYS_FILE + s3_bucket_raw_keys_list, nzsl_raw_keys_dict ) output_csv(all_keys_dict) From 9b502dde632da60b087c00b0a99ff0fe52f3a978 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:38:17 +1000 Subject: [PATCH 064/222] Bunch of things could be made global, starting here --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9c459015..58857446 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -121,7 +121,7 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket, keys_file=S3_BUCKET_RAW_KEYS_FILE): +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( @@ -313,7 +313,7 @@ def output_csv(this_all_keys_dict): else: print("Generating keys from scratch.", file=sys.stderr) init_files() - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list(AWS_S3_BUCKET) + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict( s3_bucket_raw_keys_list, nzsl_raw_keys_dict From a4d978c2ca5fab82a94c2f9c6a8c38d5ddab8477 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:41:07 +1000 Subject: [PATCH 065/222] More tidying --- bin/get-video-s3-acls.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 58857446..a9a46a1d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -49,14 +49,7 @@ DATABASE_URL = os.getenv("DATABASE_URL", "") NEW_ENV = os.environ.copy() CSV_DELIMITER = "," - -# Vars AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} - -# Files TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) @@ -67,6 +60,10 @@ S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" +# Vars +nzsl_raw_keys_dict = {} +s3_bucket_raw_keys_list = [] +all_keys_dict = {} # Truncate files, creating them if necessary def init_files(files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE)): From 8364354f2e0d7b15869886b00e52e96fb102ce6e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:41:26 +1000 Subject: [PATCH 066/222] black --- bin/get-video-s3-acls.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a9a46a1d..0d81756f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -65,8 +65,11 @@ s3_bucket_raw_keys_list = [] all_keys_dict = {} + # Truncate files, creating them if necessary -def init_files(files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE)): +def init_files( + files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE) +): for p in files_list: f = open(p, "a") f.truncate() @@ -118,7 +121,9 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE): +def get_s3_bucket_raw_keys_list( + s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE +): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( @@ -312,8 +317,6 @@ def output_csv(this_all_keys_dict): init_files() s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() - all_keys_dict = create_all_keys_dict( 
- s3_bucket_raw_keys_list, nzsl_raw_keys_dict - ) + all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From 15f5443b93939f31c685f6dda8874ce958b476c5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:56:51 +1000 Subject: [PATCH 067/222] Simpler and cleaner --- bin/get-video-s3-acls.py | 92 ++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 0d81756f..a5efe3c4 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -251,52 +251,53 @@ def output_csv(this_all_keys_dict): gloss_id, is_public, ] in this_all_keys_dict.items(): - canned_acl = "" - canned_acl_expected = "" - raw_acl = "" - if is_present: - # See signbank/video/models.py, line 59, in function set_public_acl() - canned_acl_expected = "public-read" if is_public else "private" - result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], - env=NEW_ENV, - shell=False, - check=True, - capture_output=True, - text=True, - ) - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: - if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" - ): - canned_acl = "public-read" - else: - canned_acl = "Unknown ACL" - else: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL": - canned_acl = "private" - else: - canned_acl = "Unknown ACL" + + if not is_present: + print(f"{video_key},,,,,") + continue + + # See signbank/video/models.py, line 59, in function set_public_acl() + canned_acl_expected = "public-read" if is_public else "private" + result = subprocess.run( + [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=NEW_ENV, + shell=False, + check=True, + capture_output=True, + text=True, + ) + canned_acl = "unknown" + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): + canned_acl = "public-read" + else: + if acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" # CSV columns - print(f"{video_key}", end=CSV_DELIMITER) - print(f"{db_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{gloss_id if is_present else ''}", end=CSV_DELIMITER) - print(f"{is_public if is_present else ''}", end=CSV_DELIMITER) - print(f"{canned_acl_expected}", end=CSV_DELIMITER) - print(f"{canned_acl}") + csv_column_list = [ + f"{video_key}", + f"{db_id}", + f"{gloss_id}", + f"{is_public}", + f"{canned_acl_expected}", + f"{canned_acl}", + ] + print(CSV_DELIMITER.join(csv_column_list)) print(f"Mode: {args.mode}", file=sys.stderr) @@ -305,7 +306,8 @@ def output_csv(this_all_keys_dict): print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) +if "DATABASE_URL" in NEW_ENV: + print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) if args.cached: print( From e4189fe48f0a173e7f348d23508364d10ba34e14 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 
Sep 2024 11:59:02 +1000 Subject: [PATCH 068/222] Output text --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a5efe3c4..aa904fcd 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -302,10 +302,10 @@ def output_csv(this_all_keys_dict): print(f"Mode: {args.mode}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -if "AWS_PROFILE" in NEW_ENV: - print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +if "AWS_PROFILE" in NEW_ENV: + print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) if "DATABASE_URL" in NEW_ENV: print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) From 0d13787b688f377d53c24bba2e303c2d683373e0 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:48:28 +1000 Subject: [PATCH 069/222] DATABASE_URL output removed as security issue --- bin/get-video-s3-acls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index aa904fcd..adf3ae5b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -306,8 +306,6 @@ def output_csv(this_all_keys_dict): print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) if "AWS_PROFILE" in NEW_ENV: print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) -if "DATABASE_URL" in NEW_ENV: - print(f"DATABASE_URL:\n{NEW_ENV['DATABASE_URL']}", file=sys.stderr) if args.cached: print( From ec62dc64063380d53842d015c564b8e0d92701da Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:51:29 +1000 Subject: [PATCH 070/222] os.environ used everywhere --- bin/get-video-s3-acls.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index adf3ae5b..0e44ff21 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -47,7 +47,6 @@ AWSCLIENT = args.awsclient PGCLIENT = args.pgclient DATABASE_URL = os.getenv("DATABASE_URL", "") -NEW_ENV = os.environ.copy() CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" TMPDIR = "/tmp/nzsl" @@ -128,7 +127,7 @@ def get_s3_bucket_raw_keys_list( with open(keys_file, "w") as f_obj: subprocess.run( [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], - env=NEW_ENV, + env=os.environ, shell=False, check=True, text=True, @@ -168,7 +167,7 @@ def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", f"{DATABASE_URL}", ], - env=NEW_ENV, + env=os.environ, shell=False, check=True, text=True, @@ -270,7 +269,7 @@ def output_csv(this_all_keys_dict): "--key", video_key, ], - env=NEW_ENV, + env=os.environ, shell=False, check=True, capture_output=True, @@ -304,8 +303,8 @@ def output_csv(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) -if "AWS_PROFILE" in NEW_ENV: - print(f"AWS profile: {NEW_ENV['AWS_PROFILE']}", file=sys.stderr) +if "AWS_PROFILE" in os.environ: + print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) if args.cached: print( From 5c0fb65132cfdd95f91e86d6d6951bfc8391378f Mon Sep 17 00:00:00 2001 From: jonholdsworth 
<82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 12:53:19 +1000 Subject: [PATCH 071/222] Exception test removed --- bin/get-video-s3-acls.py | 55 ++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 0e44ff21..2602a735 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -80,44 +80,39 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} - try: - with open(cache_file, "r") as f_obj: - for line in f_obj.readlines(): - ( - video_key, - is_present_str, - db_id_str, - gloss_id_str, - is_public_str, - ) = line.strip().split(CSV_DELIMITER) - - is_present = is_present_str.strip().lower() == "true" - if is_present: - nkeys_present += 1 - db_id = int(db_id_str) - # Some don't have gloss_id's - try: - gloss_id = int(gloss_id_str) - except ValueError: - gloss_id = None - is_public = is_public_str.strip().lower() == "true" - else: - nkeys_absent += 1 - db_id = None + with open(cache_file, "r") as f_obj: + for line in f_obj.readlines(): + ( + video_key, + is_present_str, + db_id_str, + gloss_id_str, + is_public_str, + ) = line.strip().split(CSV_DELIMITER) + + is_present = is_present_str.strip().lower() == "true" + if is_present: + nkeys_present += 1 + db_id = int(db_id_str) + # Some don't have gloss_id's + try: + gloss_id = int(gloss_id_str) + except ValueError: gloss_id = None - is_public = None + is_public = is_public_str.strip().lower() == "true" + else: + nkeys_absent += 1 + db_id = None + gloss_id = None + is_public = None - this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) return this_all_keys_dict - except FileNotFoundError: - print(f"File not found: {cache_file}", file=sys.stderr) - exit() - # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list( From 1a3a61254aa595557a5ad502677b450dcbe97654 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:26:33 +1000 Subject: [PATCH 072/222] PSQL client works smarter using COPY --- bin/get-video-s3-acls.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2602a735..dd2370de 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -55,7 +55,7 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.txt" +NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.csv" S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" @@ -154,12 +154,13 @@ def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): file=sys.stderr, ) with open(keys_file, "w") as f_obj: + # In theory postgres COPY could output directly to our file, but subprocess.run throws an error subprocess.run( [ PGCLIENT, - "-t", "-c", - "select id as db_id, gloss_id, is_public, videofile from video_glossvideo", + "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " + "TO STDOUT WITH (FORMAT CSV)", f"{DATABASE_URL}", ], env=os.environ, @@ -168,6 +169,7 @@ def 
get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): text=True, stdout=f_obj, ) + with open(keys_file, "r") as f_obj: nzsl_raw_keys_list = f_obj.readlines() print( @@ -181,14 +183,8 @@ def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): rawl = rawl.strip() if not rawl: continue - columns = rawl.split("|") - db_id = columns[0].strip() - gloss_id = columns[1].strip() - is_public = columns[2].strip().lower() == "t" - # 'videofile' data is also the key for S3 - video_key = columns[3].strip() - # Each dictionary slot contains these values - this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public] + [db_id, gloss_id, is_public, video_key] = rawl.split(",") + this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public.lower() == "t"] return this_nzsl_raw_keys_dict From 8ff16e3acc4b3b95d172acf1c76d124a43d7760f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 5 Sep 2024 18:09:04 +1000 Subject: [PATCH 073/222] Output canned ACL even if video_key absent from NZSL Signback postgres database. Intermediate step using re.split() --- bin/get-video-s3-acls.py | 47 +++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index dd2370de..50b4159d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -10,6 +10,7 @@ import subprocess import argparse import json +import re parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -121,7 +122,15 @@ def get_s3_bucket_raw_keys_list( print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) with open(keys_file, "w") as f_obj: subprocess.run( - [AWSCLIENT, "s3", "ls", f"s3://{s3_bucket}", "--recursive"], + [ + AWSCLIENT, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + "--output", + "json", + ], env=os.environ, shell=False, check=True, @@ -132,7 +141,9 @@ def get_s3_bucket_raw_keys_list( # Separate out just the key (also strip newline) from date, time, size, key # Put the keys in an in-memory list with open(keys_file, "r") as f_obj: - this_s3_bucket_raw_keys_list = [line.split()[3] for line in f_obj] + this_s3_bucket_raw_keys_list = [ + re.split(r"\s+", line, 3)[3].strip() for line in f_obj + ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved: {keys_file}", file=sys.stderr, @@ -242,24 +253,22 @@ def output_csv(this_all_keys_dict): is_public, ] in this_all_keys_dict.items(): - if not is_present: - print(f"{video_key},,,,,") - continue - # See signbank/video/models.py, line 59, in function set_public_acl() canned_acl_expected = "public-read" if is_public else "private" + run_array = [ + AWSCLIENT, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + result = subprocess.run( - [ - AWSCLIENT, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ], + run_array, env=os.environ, shell=False, check=True, @@ -278,6 +287,10 @@ def output_csv(this_all_keys_dict): if acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" + if not is_present: + print(f"{video_key},,,,,{canned_acl}") + continue + # CSV columns csv_column_list = [ f"{video_key}", From 05618312822d3259c769d2328e15058b4d00f2b9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:13:21 +1000 Subject: [PATCH 
074/222] S3 intermediate file removed --- bin/get-video-s3-acls.py | 54 ++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 50b4159d..b6379f3b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -57,7 +57,6 @@ print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.csv" -S3_BUCKET_RAW_KEYS_FILE = f"{TMPDIR}/s3_bucket_raw_keys.txt" ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" # Vars @@ -68,7 +67,7 @@ # Truncate files, creating them if necessary def init_files( - files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, S3_BUCKET_RAW_KEYS_FILE, ALL_KEYS_FILE) + files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, ALL_KEYS_FILE) ): for p in files_list: f = open(p, "a") @@ -117,43 +116,33 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list( - s3_bucket=AWS_S3_BUCKET, keys_file=S3_BUCKET_RAW_KEYS_FILE + s3_bucket=AWS_S3_BUCKET ): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - with open(keys_file, "w") as f_obj: - subprocess.run( - [ - AWSCLIENT, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - "--output", - "json", - ], - env=os.environ, - shell=False, - check=True, - text=True, - stdout=f_obj, - ) + result = subprocess.run( + [ + AWSCLIENT, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) - # Separate out just the key (also strip newline) from date, time, size, key - # Put the keys in an in-memory list - with open(keys_file, "r") as f_obj: - this_s3_bucket_raw_keys_list = [ - re.split(r"\s+", line, 3)[3].strip() for line in f_obj - ] + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split('\n'): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved: {keys_file}", + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", file=sys.stderr, ) - # Write the keys back to the file, for cleanliness - with open(keys_file, "w") as f_obj: - for line in this_s3_bucket_raw_keys_list: - f_obj.write(f"{line}\n") - return this_s3_bucket_raw_keys_list @@ -266,7 +255,6 @@ def output_csv(this_all_keys_dict): "--key", video_key, ] - result = subprocess.run( run_array, env=os.environ, From 47494caf733bd35f6478ce0be99c95cd9567910b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:31:04 +1000 Subject: [PATCH 075/222] Intermediate files gone. Cache file only. Tidy up. 
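The pattern used throughout after this change, as a standalone sketch
(illustrative only; the bucket name is a placeholder and the AWS CLI is
assumed to be installed and configured):

    import re
    import subprocess

    result = subprocess.run(
        ["aws", "s3", "ls", "s3://example-bucket", "--recursive"],
        capture_output=True,
        check=True,
        text=True,
    )
    # Each row is "date time size key"; maxsplit=3 keeps keys containing spaces intact.
    keys = [re.split(r"\s+", line, 3)[3] for line in result.stdout.splitlines() if line]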
--- bin/get-video-s3-acls.py | 75 +++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b6379f3b..25974631 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -56,8 +56,7 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -NZSL_POSTGRES_RAW_KEYS_FILE = f"{TMPDIR}/nzsl_postgres_raw_keys.csv" -ALL_KEYS_FILE = f"{TMPDIR}/all_keys.csv" +ALL_KEYS_CACHE_FILE = f"{TMPDIR}/all_keys_cache.csv" # Vars nzsl_raw_keys_dict = {} @@ -66,17 +65,15 @@ # Truncate files, creating them if necessary -def init_files( - files_list=(NZSL_POSTGRES_RAW_KEYS_FILE, ALL_KEYS_FILE) -): +def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): for p in files_list: f = open(p, "a") f.truncate() f.close() -# Pull all info from existing file -def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): +# Pull all info from existing cache file +def get_keys_from_cache_file(cache_file=ALL_KEYS_CACHE_FILE): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} @@ -115,9 +112,7 @@ def get_keys_from_cache_file(cache_file=ALL_KEYS_FILE): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list( - s3_bucket=AWS_S3_BUCKET -): +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) result = subprocess.run( [ @@ -135,9 +130,10 @@ def get_s3_bucket_raw_keys_list( # Separate out just the key from date, time, size, key this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split('\n'): + for line in result.stdout.split("\n"): if line: this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", file=sys.stderr, @@ -147,52 +143,45 @@ def get_s3_bucket_raw_keys_list( # Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(keys_file=NZSL_POSTGRES_RAW_KEYS_FILE): +def get_nzsl_raw_keys_dict(): this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) - with open(keys_file, "w") as f_obj: - # In theory postgres COPY could output directly to our file, but subprocess.run throws an error - subprocess.run( - [ - PGCLIENT, - "-c", - "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " - "TO STDOUT WITH (FORMAT CSV)", - f"{DATABASE_URL}", - ], - env=os.environ, - shell=False, - check=True, - text=True, - stdout=f_obj, - ) - - with open(keys_file, "r") as f_obj: - nzsl_raw_keys_list = f_obj.readlines() - print( - f"{len(nzsl_raw_keys_list)} rows retrieved: {keys_file}", - file=sys.stderr, + result = subprocess.run( + [ + PGCLIENT, + "-c", + "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " + "TO STDOUT WITH (FORMAT CSV)", + f"{DATABASE_URL}", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, ) # Separate out the NZSL db columns # Write them to a dictionary, so we can do fast operations - for rawl in nzsl_raw_keys_list: + for rawl in result.stdout.split("\n"): rawl = rawl.strip() if not rawl: continue [db_id, gloss_id, is_public, video_key] = rawl.split(",") this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public.lower() == "t"] + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + return this_nzsl_raw_keys_dict # Get the s3 keys present and absent from our NZSL keys -def create_all_keys_dict( - 
this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict, all_keys_file=ALL_KEYS_FILE -): +def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 nkeys_absent = 0 @@ -209,8 +198,8 @@ def create_all_keys_dict( print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) - # Write all keys back to a file - with open(all_keys_file, "w") as f_obj: + # Write all keys back to a cache file + with open(ALL_KEYS_CACHE_FILE, "w") as f_obj: for video_key, item_list in this_all_keys_dict.items(): outstr = ( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" @@ -299,12 +288,10 @@ def output_csv(this_all_keys_dict): print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) if args.cached: - print( - "Using the video keys we recorded on the last non-cached run.", file=sys.stderr - ) + print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) all_keys_dict = get_keys_from_cache_file() else: - print("Generating keys from scratch.", file=sys.stderr) + print("Generating video keys from scratch.", file=sys.stderr) init_files() s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() From 81d77897d44290b8224a96ac55c33106c5b26525 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:37:21 +1000 Subject: [PATCH 076/222] Text, whitespace --- bin/get-video-s3-acls.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 25974631..3c7010a4 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -27,17 +27,17 @@ default=False, required=False, action="store_true", - help="Use keys generated on a previous non-cached run (default: %(default)s) " - "(Don't mix PRODUCTION and STAGING!)", + help="Use video keys generated on a previous non-cached run (default: %(default)s) " + "(Do not mix production and staging!)", ) parser.add_argument( - "--pgclient", + "--pgcli", default="/usr/bin/psql", required=False, help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( - "--awsclient", + "--awscli", default="/usr/local/bin/aws", required=False, help=f"AWS client path (default: %(default)s)", @@ -45,8 +45,8 @@ args = parser.parse_args() # Globals -AWSCLIENT = args.awsclient -PGCLIENT = args.pgclient +AWSCLI = args.awscli +PGCLI = args.pgcli DATABASE_URL = os.getenv("DATABASE_URL", "") CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" @@ -116,7 +116,7 @@ def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) result = subprocess.run( [ - AWSCLIENT, + AWSCLI, "s3", "ls", f"s3://{s3_bucket}", @@ -151,7 +151,7 @@ def get_nzsl_raw_keys_dict(): ) result = subprocess.run( [ - PGCLIENT, + PGCLI, "-c", "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " "TO STDOUT WITH (FORMAT CSV)", @@ -195,6 +195,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): nkeys_absent += 1 # Add 'Present' (absent) column to start this_all_keys_dict[video_key] = [False, "", "", ""] + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) @@ -234,7 +235,7 @@ def 
output_csv(this_all_keys_dict): # See signbank/video/models.py, line 59, in function set_public_acl() canned_acl_expected = "public-read" if is_public else "private" run_array = [ - AWSCLIENT, + AWSCLI, "s3api", "get-object-acl", "--output", @@ -280,10 +281,10 @@ def output_csv(this_all_keys_dict): print(CSV_DELIMITER.join(csv_column_list)) -print(f"Mode: {args.mode}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLIENT: {AWSCLIENT}", file=sys.stderr) -print(f"PGCLIENT: {PGCLIENT}", file=sys.stderr) +print(f"Mode: {args.mode}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 0fe589bfdbdc8b9583574ebc896577b692a45448 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:39:40 +1000 Subject: [PATCH 077/222] Cache file made global --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 3c7010a4..de3548fc 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -73,11 +73,11 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # Pull all info from existing cache file -def get_keys_from_cache_file(cache_file=ALL_KEYS_CACHE_FILE): +def get_keys_from_cache_file(): nkeys_present = 0 nkeys_absent = 0 this_all_keys_dict = {} - with open(cache_file, "r") as f_obj: + with open(ALL_KEYS_CACHE_FILE, "r") as f_obj: for line in f_obj.readlines(): ( video_key, From 278cb49350e96cf453ad862380651a0adc53bc05 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:47:44 +1000 Subject: [PATCH 078/222] Unbuffered python output --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index de3548fc..913c041f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env -S python3 -u +# Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres # aws s3 - NZSL IAM access From 666c8de798a7474c02b8383d0fd78c86577098df Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:34:34 +1000 Subject: [PATCH 079/222] Text change: mode -> env --- bin/get-video-s3-acls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 913c041f..7eadeec3 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -18,10 +18,10 @@ "Postgres access details, eg. DATABASE_URL env var." 
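A note on the "env -S" shebang that PATCH 078 introduces: it relies on env supporting option splitting, which GNU coreutils gained in 8.30 (the BSD and macOS env has it too); on an older env the interpreter line fails outright. A minimal in-script fallback with much the same effect for line-oriented progress output, assuming Python 3.7+ where reconfigure() exists on the standard text streams:

    import sys

    # Roughly what "python3 -u" gives print()-style output: flush at each
    # newline instead of waiting for a block buffer to fill when piped.
    sys.stdout.reconfigure(line_buffering=True)
    sys.stderr.reconfigure(line_buffering=True)

    print("progress message", file=sys.stderr)  # appears immediately, even through a pipe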
) parser.add_argument( - "--mode", + "--env", default="uat", required=False, - help="Mode to run in, eg 'production, 'uat', etc (default: '%(default)s')", + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", ) parser.add_argument( "--cached", @@ -50,7 +50,7 @@ PGCLI = args.pgcli DATABASE_URL = os.getenv("DATABASE_URL", "") CSV_DELIMITER = "," -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.mode}" +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" TMPDIR = "/tmp/nzsl" try: os.makedirs(TMPDIR, exist_ok=True) @@ -282,7 +282,7 @@ def output_csv(this_all_keys_dict): print(CSV_DELIMITER.join(csv_column_list)) -print(f"Mode: {args.mode}", file=sys.stderr) +print(f"Mode: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) From a884e3babbab52700c9f2e8a581cf0227695dbee Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:06:55 +1000 Subject: [PATCH 080/222] Cache file written in same loop as keys dictionary --- bin/get-video-s3-acls.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7eadeec3..bf69d26b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -186,27 +186,27 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 nkeys_absent = 0 + item_list = [] this_all_keys_dict = {} - for video_key in this_s3_bucket_raw_keys_list: - if video_key in this_nzsl_raw_keys_dict: - nkeys_present += 1 - # Add 'Present' column to start - this_all_keys_dict[video_key] = [True] + this_nzsl_raw_keys_dict[video_key] - else: - nkeys_absent += 1 - # Add 'Present' (absent) column to start - this_all_keys_dict[video_key] = [False, "", "", ""] - - print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) - print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) + with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: + for video_key in this_s3_bucket_raw_keys_list: + if video_key in this_nzsl_raw_keys_dict: + nkeys_present += 1 + # Add 'Present' column to start + item_list = [True] + this_nzsl_raw_keys_dict[video_key] + else: + nkeys_absent += 1 + # Add 'Present' (absent) column to start + item_list = [False, "", "", ""] + this_all_keys_dict[video_key] = item_list - # Write all keys back to a cache file - with open(ALL_KEYS_CACHE_FILE, "w") as f_obj: - for video_key, item_list in this_all_keys_dict.items(): - outstr = ( + # Write all keys back to a cache file + cache_file.write( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) - f_obj.write(outstr) + + print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) + print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) return this_all_keys_dict From e0e7a8e45cd791ad806497d99c34fa6423f11c49 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:08:40 +1000 Subject: [PATCH 081/222] Simplified conditional --- bin/get-video-s3-acls.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bf69d26b..8a7ad557 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -262,9 +262,8 @@ def output_csv(this_all_keys_dict): and 
acls_grants_json[1]["Permission"] == "READ" ): canned_acl = "public-read" - else: - if acls_grants_json[0]["Permission"] == "FULL_CONTROL": - canned_acl = "private" + elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" if not is_present: print(f"{video_key},,,,,{canned_acl}") From 7249def8d08db329bb79e257c30e48e4d9347b1d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:47:30 +1000 Subject: [PATCH 082/222] CSV construction deconstructed --- bin/get-video-s3-acls.py | 129 ++++++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 55 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 8a7ad557..c0e68c89 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -211,20 +211,76 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): return this_all_keys_dict +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "Video S3 Key", + "Postgres ID", + "Gloss ID", + "Signbank Public", + "Expected S3 Canned ACL", + "Actual S3 Canned ACL", + ] + ) + + +def build_csv_row( + video_key, is_present=False, db_id=None, gloss_id=None, is_public=False +): + + run_array = [ + AWSCLI, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + result = subprocess.run( + run_array, + env=os.environ, + shell=False, + check=True, + capture_output=True, + text=True, + ) + canned_acl = "unknown" + acls_grants_json = json.loads(result.stdout)["Grants"] + if len(acls_grants_json) > 1: + if ( + acls_grants_json[0]["Permission"] == "FULL_CONTROL" + and acls_grants_json[1]["Permission"] == "READ" + ): + canned_acl = "public-read" + elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": + canned_acl = "private" + + # See signbank/video/models.py, line 59, in function set_public_acl() + if is_present: + canned_acl_expected = "public-read" if is_public else "private" + else: + canned_acl_expected = "" + + return CSV_DELIMITER.join( + [ + f"{video_key}", + f"{db_id}", + f"{gloss_id}", + f"{is_public}", + f"{canned_acl_expected}", + f"{canned_acl}", + ] + ) + + # From the keys present in NZSL, get all their ACL information def output_csv(this_all_keys_dict): print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) - # CSV header - csv_header_list = [ - "Video S3 Key", - "Postgres ID", - "Gloss ID", - "Signbank Public", - "Expected S3 Canned ACL", - "Actual S3 Canned ACL", - ] - print(CSV_DELIMITER.join(csv_header_list)) + print(build_csv_header()) for video_key, [ is_present, @@ -233,52 +289,15 @@ def output_csv(this_all_keys_dict): is_public, ] in this_all_keys_dict.items(): - # See signbank/video/models.py, line 59, in function set_public_acl() - canned_acl_expected = "public-read" if is_public else "private" - run_array = [ - AWSCLI, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - result = subprocess.run( - run_array, - env=os.environ, - shell=False, - check=True, - capture_output=True, - text=True, + print( + build_csv_row( + video_key, + is_present, + db_id, + gloss_id, + is_public, + ) ) - canned_acl = "unknown" - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: - if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" - ): - canned_acl = "public-read" - elif acls_grants_json[0]["Permission"] 
== "FULL_CONTROL": - canned_acl = "private" - - if not is_present: - print(f"{video_key},,,,,{canned_acl}") - continue - - # CSV columns - csv_column_list = [ - f"{video_key}", - f"{db_id}", - f"{gloss_id}", - f"{is_public}", - f"{canned_acl_expected}", - f"{canned_acl}", - ] - print(CSV_DELIMITER.join(csv_column_list)) print(f"Mode: {args.env}", file=sys.stderr) From 70d65fa036dee6e5b5ea355f3b835789d35f40e8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:07:36 +1000 Subject: [PATCH 083/222] Superfluous variable removed --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c0e68c89..cde4175b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -186,7 +186,6 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) nkeys_present = 0 nkeys_absent = 0 - item_list = [] this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: for video_key in this_s3_bucket_raw_keys_list: From 1c558b4fcd88fdf352e5478197bb6c1ad020d7cf Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:36:20 +1000 Subject: [PATCH 084/222] First approximation of bidirectional matching --- bin/get-video-s3-acls.py | 67 ++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index cde4175b..b269a3f0 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,6 +17,14 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) +# This debug will be removed +parser.add_argument( + "--debug", + default=False, + required=False, + action="store_true", + help="Turn on some debug actions (default: %(default)s) " +) parser.add_argument( "--env", default="uat", @@ -181,32 +189,52 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get the s3 keys present and absent from our NZSL keys +# Get the s3 keys present and absent from our NZSL keys, to dictionary: +# video_key(str) -> in_nzsl(bool), in_s3(bool), db_id(int), gloss_id(int), is_public(bool) def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): - print("Getting S3 keys present and absent from NZSL Signbank ...", file=sys.stderr) - nkeys_present = 0 - nkeys_absent = 0 + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: + + # Debug, we inject fake keys: grep for 'This_' + if args.debug: + this_nzsl_raw_keys_dict["This_key_is_in_both"] = [0, 1, True] + this_s3_bucket_raw_keys_list.append("This_key_is_in_both") + this_nzsl_raw_keys_dict["This_nzsl_key_is_not_in_s3"] = [0, 1, True] + this_s3_bucket_raw_keys_list.append("This_s3_key_is_not_in_nzsl") + + # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - nkeys_present += 1 - # Add 'Present' column to start - item_list = [True] + this_nzsl_raw_keys_dict[video_key] + if args.debug: + print(f"'{video_key}' in BOTH NZSL and S3") + # NZSL PRESENT, S3 PRESENT + this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ + video_key + ] else: - nkeys_absent += 1 - # Add 'Present' (absent) column to start - item_list = [False, "", "", ""] - this_all_keys_dict[video_key] = item_list - - # Write all keys back to a cache file + if args.debug: + print(f"'{video_key}' NOT in NZSL, but in S3") + # NZSL Absent, S3 PRESENT + this_all_keys_dict[video_key] = [False, True, "", "", ""] + + # Find NZSL keys that are absent from S3 (present handled already above) + for video_key, item_list in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + if args.debug: + print(f"'{video_key}' in NZSL, but NOT in S3") + # NZSL PRESENT, S3 Absent + this_all_keys_dict[video_key] = [True, False] + item_list + + # Write all keys back to a cache file + for video_key, item_list in this_all_keys_dict.items(): cache_file.write( f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) - print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) - print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) - return this_all_keys_dict @@ -308,7 +336,9 @@ def output_csv(this_all_keys_dict): if args.cached: print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) - all_keys_dict = get_keys_from_cache_file() + print("We are not yet worthy.") + exit() + # all_keys_dict = get_keys_from_cache_file() else: print("Generating video keys from scratch.", file=sys.stderr) init_files() @@ -316,4 +346,7 @@ def output_csv(this_all_keys_dict): nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) +print("DEBUG EXIT") +exit() + output_csv(all_keys_dict) From 1ad888cd732ae3ff144201fc42dfd14a71c19413 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:52:13 +1000 Subject: [PATCH 085/222] Presence/Absence S3 
vs NZSL now bi-directional --- bin/get-video-s3-acls.py | 64 +++++++++++++--------------------------- 1 file changed, 20 insertions(+), 44 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b269a3f0..780e5bc3 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,14 +17,6 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." ) -# This debug will be removed -parser.add_argument( - "--debug", - default=False, - required=False, - action="store_true", - help="Turn on some debug actions (default: %(default)s) " -) parser.add_argument( "--env", default="uat", @@ -83,22 +75,21 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # Pull all info from existing cache file def get_keys_from_cache_file(): - nkeys_present = 0 - nkeys_absent = 0 this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "r") as f_obj: for line in f_obj.readlines(): ( video_key, - is_present_str, + key_in_nzsl_str, + key_in_s3_str, db_id_str, gloss_id_str, is_public_str, ) = line.strip().split(CSV_DELIMITER) - is_present = is_present_str.strip().lower() == "true" - if is_present: - nkeys_present += 1 + # If possible, get NZSL db info + key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" + if key_in_nzsl: db_id = int(db_id_str) # Some don't have gloss_id's try: @@ -107,15 +98,13 @@ def get_keys_from_cache_file(): gloss_id = None is_public = is_public_str.strip().lower() == "true" else: - nkeys_absent += 1 - db_id = None - gloss_id = None - is_public = None + db_id = "" + gloss_id = "" + is_public = "" - this_all_keys_dict[video_key] = [is_present, db_id, gloss_id, is_public] + key_in_s3 = key_in_s3_str.strip().lower() == "true" - print(f"PRESENT: {nkeys_present} keys", file=sys.stderr) - print(f"ABSENT: {nkeys_absent} keys", file=sys.stderr) + this_all_keys_dict[video_key] = [key_in_nzsl, key_in_s3, db_id, gloss_id, is_public] return this_all_keys_dict @@ -199,33 +188,20 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): this_all_keys_dict = {} with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: - # Debug, we inject fake keys: grep for 'This_' - if args.debug: - this_nzsl_raw_keys_dict["This_key_is_in_both"] = [0, 1, True] - this_s3_bucket_raw_keys_list.append("This_key_is_in_both") - this_nzsl_raw_keys_dict["This_nzsl_key_is_not_in_s3"] = [0, 1, True] - this_s3_bucket_raw_keys_list.append("This_s3_key_is_not_in_nzsl") - # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - if args.debug: - print(f"'{video_key}' in BOTH NZSL and S3") # NZSL PRESENT, S3 PRESENT this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ video_key ] else: - if args.debug: - print(f"'{video_key}' NOT in NZSL, but in S3") # NZSL Absent, S3 PRESENT this_all_keys_dict[video_key] = [False, True, "", "", ""] # Find NZSL keys that are absent from S3 (present handled already above) for video_key, item_list in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: - if args.debug: - print(f"'{video_key}' in NZSL, but NOT in S3") # NZSL PRESENT, S3 Absent this_all_keys_dict[video_key] = [True, False] + item_list @@ -252,9 +228,12 @@ def build_csv_header(): def build_csv_row( - video_key, is_present=False, db_id=None, gloss_id=None, is_public=False + video_key, key_in_nzsl=False, key_in_s3=False, db_id=None, gloss_id=None, is_public=False ): + if not key_in_s3: + 
return + run_array = [ AWSCLI, "s3api", @@ -286,7 +265,7 @@ def build_csv_row( canned_acl = "private" # See signbank/video/models.py, line 59, in function set_public_acl() - if is_present: + if key_in_nzsl: canned_acl_expected = "public-read" if is_public else "private" else: canned_acl_expected = "" @@ -310,7 +289,8 @@ def output_csv(this_all_keys_dict): print(build_csv_header()) for video_key, [ - is_present, + key_in_nzsl, + key_in_s3, db_id, gloss_id, is_public, @@ -319,7 +299,8 @@ def output_csv(this_all_keys_dict): print( build_csv_row( video_key, - is_present, + key_in_nzsl, + key_in_s3, db_id, gloss_id, is_public, @@ -336,9 +317,7 @@ def output_csv(this_all_keys_dict): if args.cached: print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) - print("We are not yet worthy.") - exit() - # all_keys_dict = get_keys_from_cache_file() + all_keys_dict = get_keys_from_cache_file() else: print("Generating video keys from scratch.", file=sys.stderr) init_files() @@ -346,7 +325,4 @@ def output_csv(this_all_keys_dict): nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) -print("DEBUG EXIT") -exit() - output_csv(all_keys_dict) From 2c7d3752059e43150031955d756a6f233a27e44f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 17:03:34 +1000 Subject: [PATCH 086/222] NZSL Present S3 Absent case now outputs to CSV --- bin/get-video-s3-acls.py | 65 ++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 780e5bc3..fb17a0b6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -104,7 +104,13 @@ def get_keys_from_cache_file(): key_in_s3 = key_in_s3_str.strip().lower() == "true" - this_all_keys_dict[video_key] = [key_in_nzsl, key_in_s3, db_id, gloss_id, is_public] + this_all_keys_dict[video_key] = [ + key_in_nzsl, + key_in_s3, + db_id, + gloss_id, + is_public, + ] return this_all_keys_dict @@ -228,25 +234,46 @@ def build_csv_header(): def build_csv_row( - video_key, key_in_nzsl=False, key_in_s3=False, db_id=None, gloss_id=None, is_public=False + video_key, + key_in_nzsl=False, + key_in_s3=False, + db_id=None, + gloss_id=None, + is_public=False, ): + # See signbank/video/models.py, line 59, in function set_public_acl() + if key_in_nzsl: + canned_acl_expected = "public-read" if is_public else "private" + else: + canned_acl_expected = "" + + # If key not in S3, just return its NZSL info if not key_in_s3: - return - - run_array = [ - AWSCLI, - "s3api", - "get-object-acl", - "--output", - "json", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] + return CSV_DELIMITER.join( + [ + f"{video_key}", + f"{db_id}", + f"{gloss_id}", + f"{is_public}", + f"{canned_acl_expected}", + "", + ] + ) + + # Get S3 object's ACL result = subprocess.run( - run_array, + [ + AWSCLI, + "s3api", + "get-object-acl", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], env=os.environ, shell=False, check=True, @@ -264,12 +291,6 @@ def build_csv_row( elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" - # See signbank/video/models.py, line 59, in function set_public_acl() - if key_in_nzsl: - canned_acl_expected = "public-read" if is_public else "private" - else: - canned_acl_expected = "" - return CSV_DELIMITER.join( [ f"{video_key}", From 3438dd9c869d13da16be3eb6ae28323e4b444bca 
Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Sep 2024 17:08:05 +1000 Subject: [PATCH 087/222] Added --tmpdir argument --- bin/get-video-s3-acls.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index fb17a0b6..a5590027 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -43,6 +43,12 @@ required=False, help=f"AWS client path (default: %(default)s)", ) +parser.add_argument( + "--tmpdir", + default="/tmp/nzsl", + required=False, + help=f"Temp dir path (default: %(default)s)", +) args = parser.parse_args() # Globals @@ -51,7 +57,7 @@ DATABASE_URL = os.getenv("DATABASE_URL", "") CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" -TMPDIR = "/tmp/nzsl" +TMPDIR = args.tmpdir try: os.makedirs(TMPDIR, exist_ok=True) except OSError as err: From f13b7037129f6d70b76f2f201829b105b89339c4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:54:00 +1000 Subject: [PATCH 088/222] Minor tidy-ups --- bin/get-video-s3-acls.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a5590027..63bbff46 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -52,10 +52,10 @@ args = parser.parse_args() # Globals +CSV_DELIMITER = "," +DATABASE_URL = os.getenv("DATABASE_URL", "") AWSCLI = args.awscli PGCLI = args.pgcli -DATABASE_URL = os.getenv("DATABASE_URL", "") -CSV_DELIMITER = "," AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" TMPDIR = args.tmpdir try: @@ -82,8 +82,8 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # Pull all info from existing cache file def get_keys_from_cache_file(): this_all_keys_dict = {} - with open(ALL_KEYS_CACHE_FILE, "r") as f_obj: - for line in f_obj.readlines(): + with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: + for line in cache_file.readlines(): ( video_key, key_in_nzsl_str, @@ -97,12 +97,12 @@ def get_keys_from_cache_file(): key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" if key_in_nzsl: db_id = int(db_id_str) + is_public = is_public_str.strip().lower() == "true" # Some don't have gloss_id's try: gloss_id = int(gloss_id_str) except ValueError: gloss_id = None - is_public = is_public_str.strip().lower() == "true" else: db_id = "" gloss_id = "" @@ -154,11 +154,11 @@ def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): - this_nzsl_raw_keys_dict = {} print( f"Getting raw list of video file info from NZSL Signbank ...", file=sys.stderr, ) + this_nzsl_raw_keys_dict = {} result = subprocess.run( [ PGCLI, @@ -190,7 +190,7 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get the s3 keys present and absent from our NZSL keys, to dictionary: +# Get the keys present and absent across NZSL Signbank and S3, to dictionary: # video_key(str) -> in_nzsl(bool), in_s3(bool), db_id(int), gloss_id(int), is_public(bool) def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( From fb5c8b728beed5a8b3604f9d021b5e26dc287dca Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:00:11 +1000 Subject: [PATCH 089/222] Initial code for new columns --- bin/get-video-s3-acls.py | 142 +++++++++++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 37 
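PATCH 089 below widens the Signbank query into a two-table JOIN and moves to a '|' delimiter, since gloss names can contain commas. The query can be smoke-tested on its own; a sketch that mirrors the patch's SQL, assuming DATABASE_URL is set and psql sits at the script's default path:

    import os
    import subprocess

    QUERY = (
        "COPY ("
        "SELECT "
        "dg.id AS gloss_id, "
        "dg.idgloss AS gloss_idgloss, "
        "dg.created_at, "
        "dg.published AS gloss_public, "
        "vg.is_public AS video_public, "
        "vg.id AS video_id, "
        "vg.videofile AS video_key "
        "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id"
        ") TO STDOUT WITH DELIMITER AS '|'"
    )

    result = subprocess.run(
        ["/usr/bin/psql", "-c", QUERY, os.environ["DATABASE_URL"]],
        capture_output=True,
        check=True,
        text=True,
    )
    for raw in result.stdout.split("\n"):
        if raw.strip():
            # Seven pipe-separated columns, video key last
            print(raw.split("|"))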
deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 63bbff46..bdff1f24 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -79,6 +79,21 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): f.close() +# DICTIONARY and CACHE FILE format +# This is used at several points +# Essentially video_key + in_nzsl + in_s3 + nzsl_raw_keys_dict +# video_key(str) -> +# in_nzsl(bool), +# in_s3(bool), +# gloss_id(int), +# gloss_idgloss(str), +# created_at(str), +# gloss_public(bool), +# video_public(bool) +# video_id(int) +# TODO For cache file format maybe move the video key to the end of the row, for consistency + + # Pull all info from existing cache file def get_keys_from_cache_file(): this_all_keys_dict = {} @@ -88,34 +103,40 @@ def get_keys_from_cache_file(): video_key, key_in_nzsl_str, key_in_s3_str, - db_id_str, gloss_id_str, - is_public_str, + gloss_idgloss, + created_at, + gloss_public_str, + video_public_str, + video_id_str, ) = line.strip().split(CSV_DELIMITER) # If possible, get NZSL db info key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" if key_in_nzsl: - db_id = int(db_id_str) - is_public = is_public_str.strip().lower() == "true" + video_id = int(video_id_str) + gloss_public = gloss_public_str.strip().lower() == "true" + video_public = video_public_str.strip().lower() == "true" # Some don't have gloss_id's try: gloss_id = int(gloss_id_str) except ValueError: gloss_id = None else: - db_id = "" gloss_id = "" - is_public = "" + video_id = "" + gloss_public = "" + video_public = "" key_in_s3 = key_in_s3_str.strip().lower() == "true" this_all_keys_dict[video_key] = [ key_in_nzsl, key_in_s3, - db_id, + video_id, gloss_id, - is_public, + gloss_public, + video_public, ] return this_all_keys_dict @@ -159,12 +180,23 @@ def get_nzsl_raw_keys_dict(): file=sys.stderr, ) this_nzsl_raw_keys_dict = {} + # Column renaming is purely for readability + # We use a specific delimiter because columns might contain commas result = subprocess.run( [ PGCLI, "-c", - "COPY (SELECT id AS db_id, gloss_id, is_public, videofile FROM video_glossvideo) " - "TO STDOUT WITH (FORMAT CSV)", + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH DELIMITER AS '|'", f"{DATABASE_URL}", ], env=os.environ, @@ -173,14 +205,33 @@ def get_nzsl_raw_keys_dict(): text=True, ) - # Separate out the NZSL db columns + from pprint import pprint + + # Separate the NZSL db columns # Write them to a dictionary, so we can do fast operations for rawl in result.stdout.split("\n"): rawl = rawl.strip() + print(f">>>{rawl}<<<") + pprint(rawl.split(",")) if not rawl: continue - [db_id, gloss_id, is_public, video_key] = rawl.split(",") - this_nzsl_raw_keys_dict[video_key] = [db_id, gloss_id, is_public.lower() == "t"] + [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + this_nzsl_raw_keys_dict[video_key] = [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public.lower() == "t", + video_public.lower() == "t", + video_id, + ] print( f"{len(this_nzsl_raw_keys_dict)} rows retrieved", @@ -190,8 +241,8 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get the keys present and absent across NZSL Signbank and S3, to dictionary: -# 
video_key(str) -> in_nzsl(bool), in_s3(bool), db_id(int), gloss_id(int), is_public(bool) +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +# See DICTIONARY and CACHE FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( "Getting keys present and absent across NZSL Signbank and S3 ...", @@ -209,7 +260,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): ] else: # NZSL Absent, S3 PRESENT - this_all_keys_dict[video_key] = [False, True, "", "", ""] + this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] # Find NZSL keys that are absent from S3 (present handled already above) for video_key, item_list in this_nzsl_raw_keys_dict.items(): @@ -229,28 +280,34 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): def build_csv_header(): return CSV_DELIMITER.join( [ - "Video S3 Key", - "Postgres ID", "Gloss ID", - "Signbank Public", - "Expected S3 Canned ACL", - "Actual S3 Canned ACL", + "Gloss", + "Created at", + "Gloss public", + "Vido public", + "Video ID", + "Video key", + "Expected Canned ACL", + "Actual Canned ACL", ] ) def build_csv_row( - video_key, key_in_nzsl=False, key_in_s3=False, - db_id=None, gloss_id=None, - is_public=False, + gloss_idgloss=None, + created_at=None, + gloss_public=False, + video_public=False, + video_id=None, + video_key=None, ): # See signbank/video/models.py, line 59, in function set_public_acl() if key_in_nzsl: - canned_acl_expected = "public-read" if is_public else "private" + canned_acl_expected = "public-read" if video_public else "private" else: canned_acl_expected = "" @@ -258,10 +315,13 @@ def build_csv_row( if not key_in_s3: return CSV_DELIMITER.join( [ - f"{video_key}", - f"{db_id}", f"{gloss_id}", - f"{is_public}", + f"{gloss_idgloss}", + f"{created_at}", + f"{gloss_public}", + f"{video_public}", + f"{video_id}", + f"{video_key}", f"{canned_acl_expected}", "", ] @@ -299,10 +359,13 @@ def build_csv_row( return CSV_DELIMITER.join( [ - f"{video_key}", - f"{db_id}", f"{gloss_id}", - f"{is_public}", + f"{gloss_idgloss}", + f"{created_at}", + f"{gloss_public}", + f"{video_public}", + f"{video_id}", + f"{video_key}", f"{canned_acl_expected}", f"{canned_acl}", ] @@ -314,23 +377,28 @@ def output_csv(this_all_keys_dict): print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) - for video_key, [ key_in_nzsl, key_in_s3, - db_id, gloss_id, - is_public, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, ] in this_all_keys_dict.items(): print( build_csv_row( - video_key, key_in_nzsl, key_in_s3, - db_id, gloss_id, - is_public, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + video_key, ) ) From abc430fd497632f1ef500036caa89de758a5ad28 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:38:59 +1000 Subject: [PATCH 090/222] Debug removed, tmpdir announced --- bin/get-video-s3-acls.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bdff1f24..f8b3cbe5 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -211,8 +211,6 @@ def get_nzsl_raw_keys_dict(): # Write them to a dictionary, so we can do fast operations for rawl in result.stdout.split("\n"): rawl = rawl.strip() - print(f">>>{rawl}<<<") - pprint(rawl.split(",")) if not rawl: continue [ @@ -407,6 +405,7 @@ def 
output_csv(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"TMPDIR: {TMPDIR}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 70027e38e72fa848953de92ea9f57baf8acac605 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:20:16 +1000 Subject: [PATCH 091/222] typo --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index f8b3cbe5..ec9d6d4e 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -282,7 +282,7 @@ def build_csv_header(): "Gloss", "Created at", "Gloss public", - "Vido public", + "Video public", "Video ID", "Video key", "Expected Canned ACL", From f9727f26976327bc8022486479003095b0471f95 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:23:07 +1000 Subject: [PATCH 092/222] Debug removed --- bin/get-video-s3-acls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ec9d6d4e..ab1f9e7f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -205,8 +205,6 @@ def get_nzsl_raw_keys_dict(): text=True, ) - from pprint import pprint - # Separate the NZSL db columns # Write them to a dictionary, so we can do fast operations for rawl in result.stdout.split("\n"): From 336241cfed18e3f1c8e5754e771155b559bd8936 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 18 Sep 2024 15:39:02 +1000 Subject: [PATCH 093/222] Tidy ups --- bin/get-video-s3-acls.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ab1f9e7f..bd748e75 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -111,25 +111,23 @@ def get_keys_from_cache_file(): video_id_str, ) = line.strip().split(CSV_DELIMITER) - # If possible, get NZSL db info key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" + key_in_s3 = key_in_s3_str.strip().lower() == "true" if key_in_nzsl: video_id = int(video_id_str) - gloss_public = gloss_public_str.strip().lower() == "true" - video_public = video_public_str.strip().lower() == "true" - # Some don't have gloss_id's + # Some have no gloss_id try: gloss_id = int(gloss_id_str) except ValueError: gloss_id = None + gloss_public = gloss_public_str.strip().lower() == "true" + video_public = video_public_str.strip().lower() == "true" else: - gloss_id = "" video_id = "" + gloss_id = "" gloss_public = "" video_public = "" - key_in_s3 = key_in_s3_str.strip().lower() == "true" - this_all_keys_dict[video_key] = [ key_in_nzsl, key_in_s3, @@ -181,7 +179,7 @@ def get_nzsl_raw_keys_dict(): ) this_nzsl_raw_keys_dict = {} # Column renaming is purely for readability - # We use a specific delimiter because columns might contain commas + # Special delimiter because columns might contain commas result = subprocess.run( [ PGCLI, From f6ffc184c2f5be3dcc400b9c777ac69cd9d65de0 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:27:34 +1000 Subject: [PATCH 094/222] Video key moved, functions reordered, gloss quoting hardened --- bin/get-video-s3-acls.py | 76 
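The gloss quoting hardening named in PATCH 094's subject strips the CSV delimiter out of gloss names (gloss_idgloss.replace(CSV_DELIMITER, "")) because the report lines are built by plain string joins. The alternative trade-off, sketched here with the stdlib csv writer, is to quote awkward fields instead of mutating them (the row values are made up for illustration):

    import csv
    import sys

    writer = csv.writer(sys.stdout)
    writer.writerow(["Gloss ID", "Gloss", "Video key"])
    # A gloss containing the delimiter survives intact because csv quotes it:
    # 1234,"example, with comma",glossvideo/example.mp4
    writer.writerow([1234, "example, with comma", "glossvideo/example.mp4"])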
+++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bd748e75..cc9f793b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -91,7 +91,6 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # gloss_public(bool), # video_public(bool) # video_id(int) -# TODO For cache file format maybe move the video key to the end of the row, for consistency # Pull all info from existing cache file @@ -100,7 +99,6 @@ def get_keys_from_cache_file(): with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: for line in cache_file.readlines(): ( - video_key, key_in_nzsl_str, key_in_s3_str, gloss_id_str, @@ -109,6 +107,7 @@ def get_keys_from_cache_file(): gloss_public_str, video_public_str, video_id_str, + video_key, ) = line.strip().split(CSV_DELIMITER) key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" @@ -140,37 +139,6 @@ def get_keys_from_cache_file(): return this_all_keys_dict -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( - [ - AWSCLI, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -179,7 +147,6 @@ def get_nzsl_raw_keys_dict(): ) this_nzsl_raw_keys_dict = {} # Column renaming is purely for readability - # Special delimiter because columns might contain commas result = subprocess.run( [ PGCLI, @@ -193,8 +160,8 @@ def get_nzsl_raw_keys_dict(): "vg.is_public AS video_public, " "vg.id AS video_id, " "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH DELIMITER AS '|'", + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id " + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|') ", f"{DATABASE_URL}", ], env=os.environ, @@ -220,7 +187,7 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ gloss_id, - gloss_idgloss, + gloss_idgloss.replace(CSV_DELIMITER, ""), created_at, gloss_public.lower() == "t", video_public.lower() == "t", @@ -235,6 +202,37 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = subprocess.run( + [ + AWSCLI, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + # Get the keys present and absent across NZSL Signbank and S3, to dictionary # See DICTIONARY and CACHE 
FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): @@ -265,7 +263,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Write all keys back to a cache file for video_key, item_list in this_all_keys_dict.items(): cache_file.write( - f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" + f"{CSV_DELIMITER.join(map(str, item_list))}{CSV_DELIMITER}{video_key}\n" ) return this_all_keys_dict @@ -411,8 +409,8 @@ def output_csv(this_all_keys_dict): else: print("Generating video keys from scratch.", file=sys.stderr) init_files() - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() + s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From 76966ab9770fad531760e114e3f58d5143594afe Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:49:46 +1000 Subject: [PATCH 095/222] Revert "Video key moved, functions reordered, gloss quoting hardened" Total mess, doing it another way. This reverts commit f6ffc184c2f5be3dcc400b9c777ac69cd9d65de0. --- bin/get-video-s3-acls.py | 76 +++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index cc9f793b..bd748e75 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -91,6 +91,7 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # gloss_public(bool), # video_public(bool) # video_id(int) +# TODO For cache file format maybe move the video key to the end of the row, for consistency # Pull all info from existing cache file @@ -99,6 +100,7 @@ def get_keys_from_cache_file(): with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: for line in cache_file.readlines(): ( + video_key, key_in_nzsl_str, key_in_s3_str, gloss_id_str, @@ -107,7 +109,6 @@ def get_keys_from_cache_file(): gloss_public_str, video_public_str, video_id_str, - video_key, ) = line.strip().split(CSV_DELIMITER) key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" @@ -139,6 +140,37 @@ def get_keys_from_cache_file(): return this_all_keys_dict +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = subprocess.run( + [ + AWSCLI, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -147,6 +179,7 @@ def get_nzsl_raw_keys_dict(): ) this_nzsl_raw_keys_dict = {} # Column renaming is purely for readability + # Special delimiter because columns might contain commas result = subprocess.run( [ PGCLI, @@ -160,8 +193,8 @@ def get_nzsl_raw_keys_dict(): "vg.is_public AS video_public, " "vg.id AS video_id, " "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id " - ") TO STDOUT WITH (FORMAT CSV, DELIMITER 
'|') ", + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH DELIMITER AS '|'", f"{DATABASE_URL}", ], env=os.environ, @@ -187,7 +220,7 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ gloss_id, - gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_idgloss, created_at, gloss_public.lower() == "t", video_public.lower() == "t", @@ -202,37 +235,6 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( - [ - AWSCLI, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - # Get the keys present and absent across NZSL Signbank and S3, to dictionary # See DICTIONARY and CACHE FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): @@ -263,7 +265,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Write all keys back to a cache file for video_key, item_list in this_all_keys_dict.items(): cache_file.write( - f"{CSV_DELIMITER.join(map(str, item_list))}{CSV_DELIMITER}{video_key}\n" + f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" ) return this_all_keys_dict @@ -409,8 +411,8 @@ def output_csv(this_all_keys_dict): else: print("Generating video keys from scratch.", file=sys.stderr) init_files() - nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() + nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From afcfa813072d136959e145a66fb661d596930ec1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:01:12 +1000 Subject: [PATCH 096/222] Cache file removed Too difficult to maintain --- bin/get-video-s3-acls.py | 126 ++++++++------------------------------- 1 file changed, 25 insertions(+), 101 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index bd748e75..57f8605a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,6 +12,7 @@ import argparse import json import re +from pprint import pprint parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" @@ -23,14 +24,6 @@ required=False, help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", ) -parser.add_argument( - "--cached", - default=False, - required=False, - action="store_true", - help="Use video keys generated on a previous non-cached run (default: %(default)s) " - "(Do not mix production and staging!)", -) parser.add_argument( "--pgcli", default="/usr/bin/psql", @@ -63,23 +56,13 @@ except OSError as err: print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -ALL_KEYS_CACHE_FILE = f"{TMPDIR}/all_keys_cache.csv" # Vars nzsl_raw_keys_dict = {} s3_bucket_raw_keys_list = [] all_keys_dict = {} - -# Truncate files, creating them if necessary -def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): - for p in files_list: - f = open(p, "a") - f.truncate() - f.close() - - -# DICTIONARY and CACHE FILE format +# DICTIONARY format # This is used at several points # Essentially video_key + in_nzsl + in_s3 + nzsl_raw_keys_dict # video_key(str) -> @@ -91,53 +74,6 @@ def init_files(files_list=(ALL_KEYS_CACHE_FILE,)): # gloss_public(bool), # video_public(bool) # video_id(int) -# TODO For cache file format maybe move the video key to the end of the row, for consistency - - -# Pull all info from existing cache file -def get_keys_from_cache_file(): - this_all_keys_dict = {} - with open(ALL_KEYS_CACHE_FILE, "r") as cache_file: - for line in cache_file.readlines(): - ( - video_key, - key_in_nzsl_str, - key_in_s3_str, - gloss_id_str, - gloss_idgloss, - created_at, - gloss_public_str, - video_public_str, - video_id_str, - ) = line.strip().split(CSV_DELIMITER) - - key_in_nzsl = key_in_nzsl_str.strip().lower() == "true" - key_in_s3 = key_in_s3_str.strip().lower() == "true" - if key_in_nzsl: - video_id = int(video_id_str) - # Some have no gloss_id - try: - gloss_id = int(gloss_id_str) - except ValueError: - gloss_id = None - gloss_public = gloss_public_str.strip().lower() == "true" - video_public = video_public_str.strip().lower() == "true" - else: - video_id = "" - gloss_id = "" - gloss_public = "" - video_public = "" - - this_all_keys_dict[video_key] = [ - key_in_nzsl, - key_in_s3, - video_id, - gloss_id, - gloss_public, - video_public, - ] - - return this_all_keys_dict # Get all keys from AWS S3 @@ -194,7 +130,7 @@ def get_nzsl_raw_keys_dict(): "vg.id AS video_id, " "vg.videofile AS video_key " "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH DELIMITER AS '|'", + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", f"{DATABASE_URL}", ], env=os.environ, @@ -220,7 +156,7 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ gloss_id, - gloss_idgloss, + gloss_idgloss.replace(CSV_DELIMITER, ""), created_at, gloss_public.lower() == "t", video_public.lower() == "t", @@ -243,30 +179,23 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): file=sys.stderr, ) this_all_keys_dict = {} - with open(ALL_KEYS_CACHE_FILE, "w") as cache_file: - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - if video_key in this_nzsl_raw_keys_dict: - # NZSL PRESENT, S3 PRESENT - this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ - video_key - ] - else: - # NZSL Absent, S3 PRESENT - this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] - - # Find NZSL keys that are absent from S3 (present handled already above) - for video_key, item_list in this_nzsl_raw_keys_dict.items(): - 
if video_key not in this_s3_bucket_raw_keys_list: - # NZSL PRESENT, S3 Absent - this_all_keys_dict[video_key] = [True, False] + item_list - - # Write all keys back to a cache file - for video_key, item_list in this_all_keys_dict.items(): - cache_file.write( - f"{video_key}{CSV_DELIMITER}{CSV_DELIMITER.join(map(str, item_list))}\n" - ) + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + if video_key in this_nzsl_raw_keys_dict: + # NZSL PRESENT, S3 PRESENT + this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ + video_key + ] + else: + # NZSL Absent, S3 PRESENT + this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] + + # Find NZSL keys that are absent from S3 (present handled already above) + for video_key, item_list in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + # NZSL PRESENT, S3 Absent + this_all_keys_dict[video_key] = [True, False] + item_list return this_all_keys_dict @@ -371,6 +300,7 @@ def output_csv(this_all_keys_dict): print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) + for video_key, [ key_in_nzsl, key_in_s3, @@ -405,14 +335,8 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -if args.cached: - print(f"Using video keys from cache file ({ALL_KEYS_CACHE_FILE}).", file=sys.stderr) - all_keys_dict = get_keys_from_cache_file() -else: - print("Generating video keys from scratch.", file=sys.stderr) - init_files() - s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() - nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() - all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) +s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() +nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() +all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From a5ed6328b1b6556c53bfc7f68c621394f6d33478 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:32:56 +1000 Subject: [PATCH 097/222] Fields broken out for clarity --- bin/get-video-s3-acls.py | 58 +++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 57f8605a..ff7bb19f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -172,7 +172,6 @@ def get_nzsl_raw_keys_dict(): # Get the keys present and absent across NZSL Signbank and S3, to dictionary -# See DICTIONARY and CACHE FILE format def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( "Getting keys present and absent across NZSL Signbank and S3 ...", @@ -183,19 +182,60 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - # NZSL PRESENT, S3 PRESENT - this_all_keys_dict[video_key] = [True, True] + this_nzsl_raw_keys_dict[ - video_key + + # This is split out purely for human readability + [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + ] = this_nzsl_raw_keys_dict[video_key] + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + 
video_id, ] else: - # NZSL Absent, S3 PRESENT - this_all_keys_dict[video_key] = [False, True, "", "", "", "", "", ""] + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + "", # gloss_id + "", # gloss_idgloss, + "", # created_at, + "", # gloss_public, + "", # video_public, + "" # video_id, + ] # Find NZSL keys that are absent from S3 (present handled already above) - for video_key, item_list in this_nzsl_raw_keys_dict.items(): + for (video_key, + [ + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + ] + ) in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: - # NZSL PRESENT, S3 Absent - this_all_keys_dict[video_key] = [True, False] + item_list + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + gloss_id, + gloss_idgloss, + created_at, + gloss_public, + video_public, + video_id, + ] return this_all_keys_dict From b5bec417ef1787e945caac7fc493cb63d0ffdd4f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:36:01 +1000 Subject: [PATCH 098/222] Reordered function declarations --- bin/get-video-s3-acls.py | 75 +++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ff7bb19f..15942b31 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -62,50 +62,6 @@ s3_bucket_raw_keys_list = [] all_keys_dict = {} -# DICTIONARY format -# This is used at several points -# Essentially video_key + in_nzsl + in_s3 + nzsl_raw_keys_dict -# video_key(str) -> -# in_nzsl(bool), -# in_s3(bool), -# gloss_id(int), -# gloss_idgloss(str), -# created_at(str), -# gloss_public(bool), -# video_public(bool) -# video_id(int) - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( - [ - AWSCLI, - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): @@ -171,6 +127,37 @@ def get_nzsl_raw_keys_dict(): return this_nzsl_raw_keys_dict +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = subprocess.run( + [ + AWSCLI, + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + # Get the keys present and absent across NZSL Signbank and S3, to dictionary def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): print( From 
7b7f111efa8cde94f0bf099a6ee803dee42361f4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:39:11 +1000 Subject: [PATCH 099/222] Renamed created_at prior to S3 replacement --- bin/get-video-s3-acls.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 15942b31..c6ef4967 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -80,7 +80,7 @@ def get_nzsl_raw_keys_dict(): "SELECT " "dg.id AS gloss_id, " "dg.idgloss AS gloss_idgloss, " - "dg.created_at, " + "dg.created_at AS gloss_created_at, " "dg.published AS gloss_public, " "vg.is_public AS video_public, " "vg.id AS video_id, " @@ -104,7 +104,7 @@ def get_nzsl_raw_keys_dict(): [ gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -113,7 +113,7 @@ def get_nzsl_raw_keys_dict(): this_nzsl_raw_keys_dict[video_key] = [ gloss_id, gloss_idgloss.replace(CSV_DELIMITER, ""), - created_at, + gloss_created_at, gloss_public.lower() == "t", video_public.lower() == "t", video_id, @@ -174,7 +174,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): [ gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -184,7 +184,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): True, # S3 PRESENT gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -195,7 +195,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): True, # S3 PRESENT "", # gloss_id "", # gloss_idgloss, - "", # created_at, + "", # gloss_created_at, "", # gloss_public, "", # video_public, "" # video_id, @@ -206,7 +206,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): [ gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -218,7 +218,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): False, # S3 Absent gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -232,7 +232,7 @@ def build_csv_header(): [ "Gloss ID", "Gloss", - "Created at", + "Gloss created at", "Gloss public", "Video public", "Video ID", @@ -248,7 +248,7 @@ def build_csv_row( key_in_s3=False, gloss_id=None, gloss_idgloss=None, - created_at=None, + gloss_created_at=None, gloss_public=False, video_public=False, video_id=None, @@ -267,7 +267,7 @@ def build_csv_row( [ f"{gloss_id}", f"{gloss_idgloss}", - f"{created_at}", + f"{gloss_created_at}", f"{gloss_public}", f"{video_public}", f"{video_id}", @@ -311,7 +311,7 @@ def build_csv_row( [ f"{gloss_id}", f"{gloss_idgloss}", - f"{created_at}", + f"{gloss_created_at}", f"{gloss_public}", f"{video_public}", f"{video_id}", @@ -333,7 +333,7 @@ def output_csv(this_all_keys_dict): key_in_s3, gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, @@ -345,7 +345,7 @@ def output_csv(this_all_keys_dict): key_in_s3, gloss_id, gloss_idgloss, - created_at, + gloss_created_at, gloss_public, video_public, video_id, From 6360bf641bc0b4e7d02e1035aec4976f9b8b3205 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:02:58 +1000 Subject: [PATCH 100/222] Reformatted --- bin/get-video-s3-acls.py | 46 
++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c6ef4967..1672078c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -70,7 +70,7 @@ def get_nzsl_raw_keys_dict(): file=sys.stderr, ) this_nzsl_raw_keys_dict = {} - # Column renaming is purely for readability + # Column renaming is for readability # Special delimiter because columns might contain commas result = subprocess.run( [ @@ -110,6 +110,7 @@ def get_nzsl_raw_keys_dict(): video_id, video_key, ] = rawl.split("|") + this_nzsl_raw_keys_dict[video_key] = [ gloss_id, gloss_idgloss.replace(CSV_DELIMITER, ""), @@ -170,7 +171,7 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): for video_key in this_s3_bucket_raw_keys_list: if video_key in this_nzsl_raw_keys_dict: - # This is split out purely for human readability + # Split out for readability [ gloss_id, gloss_idgloss, @@ -179,9 +180,10 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): video_public, video_id, ] = this_nzsl_raw_keys_dict[video_key] + this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT + True, # NZSL PRESENT + True, # S3 PRESENT gloss_id, gloss_idgloss, gloss_created_at, @@ -192,29 +194,27 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): else: this_all_keys_dict[video_key] = [ False, # NZSL Absent - True, # S3 PRESENT - "", # gloss_id - "", # gloss_idgloss, - "", # gloss_created_at, - "", # gloss_public, - "", # video_public, - "" # video_id, + True, # S3 PRESENT + "", # gloss_id + "", # gloss_idgloss, + "", # gloss_created_at, + "", # gloss_public, + "", # video_public, + "", # video_id, ] - # Find NZSL keys that are absent from S3 (present handled already above) - for (video_key, - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - ] - ) in this_nzsl_raw_keys_dict.items(): + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT + True, # NZSL PRESENT False, # S3 Absent gloss_id, gloss_idgloss, From f985cd8a8e6bfb9e08e331e8dde4403fba3b1889 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:21:47 +1000 Subject: [PATCH 101/222] Fields rearranged (still gloss created_at) --- bin/get-video-s3-acls.py | 70 ++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1672078c..7316c74b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -112,12 +112,12 @@ def get_nzsl_raw_keys_dict(): ] = rawl.split("|") this_nzsl_raw_keys_dict[video_key] = [ - gloss_id, gloss_idgloss.replace(CSV_DELIMITER, ""), gloss_created_at, + gloss_id, + video_id, gloss_public.lower() == "t", video_public.lower() == "t", - video_id, ] print( @@ -173,55 +173,55 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): # Split out for readability [ - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] = this_nzsl_raw_keys_dict[video_key] this_all_keys_dict[video_key] = [ 
True, # NZSL PRESENT True, # S3 PRESENT - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] else: this_all_keys_dict[video_key] = [ False, # NZSL Absent True, # S3 PRESENT - "", # gloss_id "", # gloss_idgloss, "", # gloss_created_at, + "", # gloss_id + "", # video_id, "", # gloss_public, "", # video_public, - "", # video_id, ] # Find NZSL keys that are absent from S3 (present handled above) for video_key, [ - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ True, # NZSL PRESENT False, # S3 Absent - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] return this_all_keys_dict @@ -230,15 +230,15 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): def build_csv_header(): return CSV_DELIMITER.join( [ - "Gloss ID", + "Video key", "Gloss", "Gloss created at", - "Gloss public", - "Video public", - "Video ID", - "Video key", "Expected Canned ACL", "Actual Canned ACL", + "Gloss ID", + "Video ID", + "Gloss public", + "Video public", ] ) @@ -246,13 +246,13 @@ def build_csv_header(): def build_csv_row( key_in_nzsl=False, key_in_s3=False, - gloss_id=None, + video_key=None, gloss_idgloss=None, gloss_created_at=None, + gloss_id=None, + video_id=None, gloss_public=False, video_public=False, - video_id=None, - video_key=None, ): # See signbank/video/models.py, line 59, in function set_public_acl() @@ -265,15 +265,15 @@ def build_csv_row( if not key_in_s3: return CSV_DELIMITER.join( [ - f"{gloss_id}", + f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", + f"{canned_acl_expected}", + "", # Actual Canned ACL + f"{gloss_id}", + f"{video_id}", f"{gloss_public}", f"{video_public}", - f"{video_id}", - f"{video_key}", - f"{canned_acl_expected}", - "", ] ) @@ -309,15 +309,15 @@ def build_csv_row( return CSV_DELIMITER.join( [ - f"{gloss_id}", + f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", - f"{gloss_public}", - f"{video_public}", - f"{video_id}", - f"{video_key}", f"{canned_acl_expected}", f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", ] ) @@ -331,25 +331,25 @@ def output_csv(this_all_keys_dict): for video_key, [ key_in_nzsl, key_in_s3, - gloss_id, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, ] in this_all_keys_dict.items(): print( build_csv_row( key_in_nzsl, key_in_s3, - gloss_id, + video_key, gloss_idgloss, gloss_created_at, + gloss_id, + video_id, gloss_public, video_public, - video_id, - video_key, ) ) From 8387571dd727cdea3d478346caa62ce135743c7b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:24:54 +1000 Subject: [PATCH 102/222] Comments --- bin/get-video-s3-acls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7316c74b..887b68ea 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -255,7 +255,7 @@ def build_csv_row( video_public=False, ): - # See signbank/video/models.py, line 59, in function set_public_acl() + # See signbank/video/models.py, line 59, function set_public_acl() if key_in_nzsl: canned_acl_expected = "public-read" if video_public else "private" else: @@ -307,6 +307,9 @@ def 
build_csv_row( elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" + # TODO Get S3 object's LastModified date/time + # --- + return CSV_DELIMITER.join( [ f"{video_key}", From 51caea740f48cde34fc9aea779616109a7cb0912 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:50:16 +1000 Subject: [PATCH 103/222] S3 Lastmodified datetime. Reordering. Column names updated. --- bin/get-video-s3-acls.py | 48 ++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 887b68ea..4da8e49b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -230,15 +230,16 @@ def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): def build_csv_header(): return CSV_DELIMITER.join( [ - "Video key", - "Gloss", - "Gloss created at", - "Expected Canned ACL", - "Actual Canned ACL", - "Gloss ID", - "Video ID", - "Gloss public", - "Video public", + "S3 Video key", + "Sbank Gloss", + "Sbank Gloss created at", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", ] ) @@ -307,14 +308,35 @@ def build_csv_row( elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": canned_acl = "private" - # TODO Get S3 object's LastModified date/time - # --- + # Get S3 object's LastModified date/time + result = subprocess.run( + [ + AWSCLI, + "s3api", + "get-object-attributes", + "--object-attributes", + "ObjectParts", + "--output", + "json", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ], + env=os.environ, + shell=False, + check=True, + capture_output=True, + text=True, + ) + s3_lastmodified = json.loads(result.stdout)["LastModified"] return CSV_DELIMITER.join( [ f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", + f"{s3_lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", @@ -365,8 +387,10 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() + +s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() + all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) output_csv(all_keys_dict) From 0e92d203d624fbfaf50451c8587efdb6d3607d8e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:52:37 +1000 Subject: [PATCH 104/222] Message updated --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4da8e49b..cddde0c4 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -349,7 +349,7 @@ def build_csv_row( # From the keys present in NZSL, get all their ACL information def output_csv(this_all_keys_dict): - print(f"Getting ACLs for keys from S3 ({AWS_S3_BUCKET}) ...", file=sys.stderr) + print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) From c26b9aac0f6c22e56721387e2493f56e73a6360d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:54:56 +1000 Subject: [PATCH 105/222] Message --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
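Patch 103's canned-ACL inference works because S3 materialises canned ACLs as grant lists: the bucket owner always holds FULL_CONTROL, and a public-read object carries one extra READ grant for the AllUsers group. A minimal sketch of the same check against boto3 rather than the shelled-out CLI (guess_canned_acl is an illustrative name, not part of this series, and credentials are assumed to be configured already, e.g. via AWS_PROFILE):

    import boto3

    def guess_canned_acl(bucket, key):
        # Same rule as patch 103: owner FULL_CONTROL plus a READ grant
        # reads back as public-read; a lone FULL_CONTROL as private.
        s3 = boto3.client("s3")
        grants = s3.get_object_acl(Bucket=bucket, Key=key)["Grants"]
        permissions = [g["Permission"] for g in grants]
        if permissions[:2] == ["FULL_CONTROL", "READ"]:
            return "public-read"
        if permissions == ["FULL_CONTROL"]:
            return "private"
        return "unknown"

For LastModified, boto3's head_object(Bucket=bucket, Key=key)["LastModified"] returns the same timestamp that patch 103 extracts from the get-object-attributes JSON; patch 122 later makes the equivalent switch to head-object on the CLI side.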
diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index cddde0c4..83a2c8b8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -379,7 +379,7 @@ def output_csv(this_all_keys_dict): ) -print(f"Mode: {args.env}", file=sys.stderr) +print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) From 86e4d34a21359d8708d9ad403d6f1abaa26ca4e9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:03:26 +1000 Subject: [PATCH 106/222] Rearranged. Intermediate variables removed. Black. --- bin/get-video-s3-acls.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 83a2c8b8..dabadd43 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -57,11 +57,6 @@ print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) exit() -# Vars -nzsl_raw_keys_dict = {} -s3_bucket_raw_keys_list = [] -all_keys_dict = {} - # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): @@ -160,7 +155,7 @@ def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): # Get the keys present and absent across NZSL Signbank and S3, to dictionary -def create_all_keys_dict(this_s3_bucket_raw_keys_list, this_nzsl_raw_keys_dict): +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): print( "Getting keys present and absent across NZSL Signbank and S3 ...", file=sys.stderr, @@ -387,10 +382,6 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -nzsl_raw_keys_dict = get_nzsl_raw_keys_dict() - -s3_bucket_raw_keys_list = get_s3_bucket_raw_keys_list() - -all_keys_dict = create_all_keys_dict(s3_bucket_raw_keys_list, nzsl_raw_keys_dict) - -output_csv(all_keys_dict) +output_csv( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) From 0d564a9f161f3f0982e03d6903dc4668e76a41ce Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:06:59 +1000 Subject: [PATCH 107/222] Comment --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index dabadd43..e270dc41 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -342,7 +342,7 @@ def build_csv_row( ) -# From the keys present in NZSL, get all their ACL information +# From the keys present in NZSL, get all their S3 information def output_csv(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) From 45064bad067f25e8a5fe66cd65914e26634722c8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 12:08:14 +1000 Subject: [PATCH 108/222] Missing empty LastModified for case not in S3 --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e270dc41..fe886862 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -264,6 +264,7 @@ def build_csv_row( f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", + "", # S3 LastModified f"{canned_acl_expected}", "", # Actual Canned ACL f"{gloss_id}", From 
4cff3da932afbcee24d9f569c9ac4ed01893f1b4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 20 Sep 2024 13:58:37 +1000 Subject: [PATCH 109/222] TMPDIR removed --- bin/get-video-s3-acls.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index fe886862..5fd14e0a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -50,12 +50,6 @@ AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" -TMPDIR = args.tmpdir -try: - os.makedirs(TMPDIR, exist_ok=True) -except OSError as err: - print(f"Error creating temporary directory: {TMPDIR} {err}", file=sys.stderr) - exit() # Get the video files info from NZSL Signbank @@ -379,7 +373,6 @@ def output_csv(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"TMPDIR: {TMPDIR}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 7795405ecfcd1cf8760885901ff2fda16ea18a87 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:00:36 +1000 Subject: [PATCH 110/222] More efficient dictionary lookup --- bin/get-video-s3-acls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5fd14e0a..817d7cb2 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -158,8 +158,8 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): # Find S3 keys that are present in NZSL, or absent for video_key in this_s3_bucket_raw_keys_list: - if video_key in this_nzsl_raw_keys_dict: - + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: # Split out for readability [ gloss_idgloss, @@ -168,7 +168,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): video_id, gloss_public, video_public, - ] = this_nzsl_raw_keys_dict[video_key] + ] = dict_row this_all_keys_dict[video_key] = [ True, # NZSL PRESENT From a68b89bb2d470cc9b9d095fee93c285bb74c969e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 26 Sep 2024 19:18:56 +1000 Subject: [PATCH 111/222] comments --- bin/get-video-s3-acls.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 817d7cb2..88589d0e 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -321,6 +321,32 @@ def build_csv_row( ) s3_lastmodified = json.loads(result.stdout)["LastModified"] + + #TODO Logic goes here + # We decide what to do + # We mark what to do, but we don't actually do it + # If we are in 'Go' mode, and there is a previous mark, we do what that mark says + # Then we change the mark to past tense + + # Cases + + # Only a Signbank entry, no S3 + # - nothing to do, no action, NOOP + + # Only an S3 entry, no Signbank + # - DELETE the S3 entry + + # Both: + + # Private gloss + # - set S3 PRIVATE, that's it + + # Public gloss, video private, S3 public + # - set S3 PRIVATE + + # Public gloss, video public, S3 private + # - set S3 PUBLIC + return CSV_DELIMITER.join( [ f"{video_key}", From 6a5514ab681634731e2be4237604161f06f812c8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 3 
Oct 2024 12:25:12 +1000 Subject: [PATCH 112/222] Basic 'action' recommendation working --- bin/get-video-s3-acls.py | 64 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 88589d0e..c0a12c0b 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -18,6 +18,14 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." ) +# 'Go' mode, args.do_actions +parser.add_argument( + "--do-actions", + action='store_true', + default=False, + required=False, + help="Actually perform Delete objects or change ACLs (DESTRUCTIVE operation)", +) parser.add_argument( "--env", default="uat", @@ -183,7 +191,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): else: this_all_keys_dict[video_key] = [ False, # NZSL Absent - True, # S3 PRESENT + True, # S3 PRESENT "", # gloss_idgloss, "", # gloss_created_at, "", # gloss_id @@ -203,7 +211,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT + True, # NZSL PRESENT False, # S3 Absent gloss_idgloss, gloss_created_at, @@ -229,6 +237,7 @@ def build_csv_header(): "Sbank Video ID", "Sbank Gloss public", "Sbank Video public", + "Action", ] ) @@ -245,13 +254,29 @@ def build_csv_row( video_public=False, ): + action = "" + # Cases + # In S3 In NZSL Action + # Is No Delete! + # Is Is Check ACL + # Not Is Review + # (F F impossible) + + if key_in_s3: + if key_in_nzsl: + action = "Check ACL" + else: + action = "Delete" + else: + action = "Review" + # See signbank/video/models.py, line 59, function set_public_acl() if key_in_nzsl: canned_acl_expected = "public-read" if video_public else "private" else: canned_acl_expected = "" - # If key not in S3, just return its NZSL info + # If key not in S3, just return its NZSL info and action if not key_in_s3: return CSV_DELIMITER.join( [ @@ -265,6 +290,7 @@ def build_csv_row( f"{video_id}", f"{gloss_public}", f"{video_public}", + action, ] ) @@ -321,32 +347,6 @@ def build_csv_row( ) s3_lastmodified = json.loads(result.stdout)["LastModified"] - - #TODO Logic goes here - # We decide what to do - # We mark what to do, but we don't actually do it - # If we are in 'Go' mode, and there is a previous mark, we do what that mark says - # Then we change the mark to past tense - - # Cases - - # Only a Signbank entry, no S3 - # - nothing to do, no action, NOOP - - # Only an S3 entry, no Signbank - # - DELETE the S3 entry - - # Both: - - # Private gloss - # - set S3 PRIVATE, that's it - - # Public gloss, video private, S3 public - # - set S3 PRIVATE - - # Public gloss, video public, S3 private - # - set S3 PUBLIC - return CSV_DELIMITER.join( [ f"{video_key}", @@ -359,12 +359,14 @@ def build_csv_row( f"{video_id}", f"{gloss_public}", f"{video_public}", + action, ] ) # From the keys present in NZSL, get all their S3 information -def output_csv(this_all_keys_dict): +# If we are in 'Go' mode, perform actions +def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) print(build_csv_header()) @@ -402,6 +404,6 @@ def output_csv(this_all_keys_dict): if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) -output_csv( +process_keys( 
create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 27b389d9015ff54dcf0c10b9f22bede2373273c6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 14:18:42 +1100 Subject: [PATCH 113/222] Pass whole row to build_csv_header() --- bin/get-video-s3-acls.py | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c0a12c0b..743dc960 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -243,9 +243,9 @@ def build_csv_header(): def build_csv_row( + video_key, key_in_nzsl=False, key_in_s3=False, - video_key=None, gloss_idgloss=None, gloss_created_at=None, gloss_id=None, @@ -371,36 +371,14 @@ def process_keys(this_all_keys_dict): print(build_csv_header()) - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in this_all_keys_dict.items(): - - print( - build_csv_row( - key_in_nzsl, - key_in_s3, - video_key, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ) - ) + for video_key, values in this_all_keys_dict.items(): + print(build_csv_row(video_key, *values)) -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) if "AWS_PROFILE" in os.environ: print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) From 777dac0000ec9417a718c38a556a6c726413b39f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 15:14:31 +1100 Subject: [PATCH 114/222] tweaks --- bin/get-video-s3-acls.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 743dc960..818003ae 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -21,7 +21,7 @@ # 'Go' mode, args.do_actions parser.add_argument( "--do-actions", - action='store_true', + action="store_true", default=False, required=False, help="Actually perform Delete objects or change ACLs (DESTRUCTIVE operation)", @@ -191,7 +191,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): else: this_all_keys_dict[video_key] = [ False, # NZSL Absent - True, # S3 PRESENT + True, # S3 PRESENT "", # gloss_idgloss, "", # gloss_created_at, "", # gloss_id @@ -211,7 +211,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): ] in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT + True, # NZSL PRESENT False, # S3 Absent gloss_idgloss, gloss_created_at, @@ -379,8 +379,7 @@ def process_keys(this_all_keys_dict): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) -if "AWS_PROFILE" in os.environ: - print(f"AWS profile: {os.environ['AWS_PROFILE']}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) From 
5b2715f2cfc205674c51bcea5da15fc86855dc4d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 15:40:57 +1100 Subject: [PATCH 115/222] Actions in a function, rearranged --- bin/get-video-s3-acls.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 818003ae..2d3c85ea 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -242,6 +242,20 @@ def build_csv_header(): ) +# Cases +# In S3 In NZSL Action +# Is Not Delete S3 Object +# Is Is Update ACL +# Not Is Review +def get_action(key_in_nzsl, key_in_s3): + if key_in_s3: + if key_in_nzsl: + return "Update ACL" + else: + return "Delete S3 Object" + return "Review" + + def build_csv_row( video_key, key_in_nzsl=False, @@ -254,21 +268,7 @@ def build_csv_row( video_public=False, ): - action = "" - # Cases - # In S3 In NZSL Action - # Is No Delete! - # Is Is Check ACL - # Not Is Review - # (F F impossible) - - if key_in_s3: - if key_in_nzsl: - action = "Check ACL" - else: - action = "Delete" - else: - action = "Review" + action = get_action(key_in_nzsl, key_in_s3) # See signbank/video/models.py, line 59, function set_public_acl() if key_in_nzsl: From 57ef78005ba076384accaedea9ebbb9c284c458b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 16:01:05 +1100 Subject: [PATCH 116/222] LastModified retrieved via query path --- bin/get-video-s3-acls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2d3c85ea..1d193cf1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -332,6 +332,8 @@ def build_csv_row( "get-object-attributes", "--object-attributes", "ObjectParts", + "--query", + "LastModified", "--output", "json", "--bucket", @@ -345,7 +347,7 @@ def build_csv_row( capture_output=True, text=True, ) - s3_lastmodified = json.loads(result.stdout)["LastModified"] + s3_lastmodified = result.stdout.strip("\n\"") return CSV_DELIMITER.join( [ From 35021d6e9d87b753bb406b3dfcbab2f882e39236 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:19:40 +1100 Subject: [PATCH 117/222] Lots of refactoring --- bin/get-video-s3-acls.py | 97 +++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 1d193cf1..a22a9070 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -256,52 +256,17 @@ def get_action(key_in_nzsl, key_in_s3): return "Review" -def build_csv_row( - video_key, - key_in_nzsl=False, - key_in_s3=False, - gloss_idgloss=None, - gloss_created_at=None, - gloss_id=None, - video_id=None, - gloss_public=False, - video_public=False, -): - - action = get_action(key_in_nzsl, key_in_s3) - - # See signbank/video/models.py, line 59, function set_public_acl() - if key_in_nzsl: - canned_acl_expected = "public-read" if video_public else "private" - else: - canned_acl_expected = "" - - # If key not in S3, just return its NZSL info and action - if not key_in_s3: - return CSV_DELIMITER.join( - [ - f"{video_key}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - "", # S3 LastModified - f"{canned_acl_expected}", - "", # Actual Canned ACL - f"{gloss_id}", - f"{video_id}", - f"{gloss_public}", - f"{video_public}", - action, - ] - ) - - # Get S3 object's ACL 
+# Get S3 object's ACL +def get_s3_canned_acl(video_key): result = subprocess.run( [ AWSCLI, "s3api", "get-object-acl", "--output", - "json", + "text", + "--query", + "Grants[*].Permission", "--bucket", AWS_S3_BUCKET, "--key", @@ -313,18 +278,23 @@ def build_csv_row( capture_output=True, text=True, ) + acls_grants = result.stdout.strip().split("\t") + canned_acl = "unknown" - acls_grants_json = json.loads(result.stdout)["Grants"] - if len(acls_grants_json) > 1: + if len(acls_grants) > 1: if ( - acls_grants_json[0]["Permission"] == "FULL_CONTROL" - and acls_grants_json[1]["Permission"] == "READ" + acls_grants[0] == "FULL_CONTROL" + and acls_grants[1] == "READ" ): canned_acl = "public-read" - elif acls_grants_json[0]["Permission"] == "FULL_CONTROL": + elif acls_grants[0] == "FULL_CONTROL": canned_acl = "private" - # Get S3 object's LastModified date/time + return canned_acl + + +# Get S3 object's LastModified date/time +def get_s3_lastmodified(video_key): result = subprocess.run( [ AWSCLI, @@ -332,10 +302,10 @@ def build_csv_row( "get-object-attributes", "--object-attributes", "ObjectParts", + "--output", + "text", "--query", "LastModified", - "--output", - "json", "--bucket", AWS_S3_BUCKET, "--key", @@ -347,14 +317,41 @@ def build_csv_row( capture_output=True, text=True, ) - s3_lastmodified = result.stdout.strip("\n\"") + return result.stdout.strip() + + +def build_csv_row( + video_key, + key_in_nzsl=False, + key_in_s3=False, + gloss_idgloss=None, + gloss_created_at=None, + gloss_id=None, + video_id=None, + gloss_public=False, + video_public=False, +): + # See signbank/video/models.py, line 59, function set_public_acl() + if key_in_nzsl: + canned_acl_expected = "public-read" if video_public else "private" + else: + canned_acl_expected = "" + + if key_in_s3: + lastmodified = get_s3_lastmodified(video_key) + canned_acl = get_s3_canned_acl(video_key) + else: + lastmodified = "" + canned_acl = "" + + action = get_action(key_in_nzsl, key_in_s3) return CSV_DELIMITER.join( [ f"{video_key}", f"{gloss_idgloss}", f"{gloss_created_at}", - f"{s3_lastmodified}", + f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", From 63979e1bbb8b5d1e4a934686a07ae0e5d8c3b60b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:21:40 +1100 Subject: [PATCH 118/222] Default shell=false redundancy --- bin/get-video-s3-acls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a22a9070..90cad367 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -273,7 +273,6 @@ def get_s3_canned_acl(video_key): video_key, ], env=os.environ, - shell=False, check=True, capture_output=True, text=True, @@ -312,7 +311,6 @@ def get_s3_lastmodified(video_key): video_key, ], env=os.environ, - shell=False, check=True, capture_output=True, text=True, From f4a858ac0d11f7294096a75ea27c049c57a71997 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:39:57 +1100 Subject: [PATCH 119/222] subprocess.run wrapped for PGCLI --- bin/get-video-s3-acls.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 90cad367..8efab63c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -60,6 +60,25 @@ AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +def pg_cli(cmd): + return subprocess.run( + [ + PGCLI, + "-c", 
+ cmd, + f"{DATABASE_URL}", + ], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + + +def aws_cli(): + pass + + # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -69,10 +88,7 @@ def get_nzsl_raw_keys_dict(): this_nzsl_raw_keys_dict = {} # Column renaming is for readability # Special delimiter because columns might contain commas - result = subprocess.run( - [ - PGCLI, - "-c", + result = pg_cli( "COPY (" "SELECT " "dg.id AS gloss_id, " @@ -84,12 +100,6 @@ def get_nzsl_raw_keys_dict(): "vg.videofile AS video_key " "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - f"{DATABASE_URL}", - ], - env=os.environ, - capture_output=True, - check=True, - text=True, ) # Separate the NZSL db columns From 31b41f21c90f47ffd035f2e8a9aaec7027d4aa80 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:51:38 +1100 Subject: [PATCH 120/222] subprocess.run wrapped for AWSCLI. Better argument handling. --- bin/get-video-s3-acls.py | 68 ++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 8efab63c..2e5b1308 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -60,14 +60,11 @@ AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" -def pg_cli(cmd): +def pg_cli(args_list): + if not isinstance(args_list, list): + args_list = [args_list] return subprocess.run( - [ - PGCLI, - "-c", - cmd, - f"{DATABASE_URL}", - ], + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], env=os.environ, capture_output=True, check=True, @@ -75,8 +72,16 @@ def pg_cli(cmd): ) -def aws_cli(): - pass +def aws_cli(args_list): + if not isinstance(args_list, list): + args_list = [args_list] + return subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) # Get the video files info from NZSL Signbank @@ -89,17 +94,17 @@ def get_nzsl_raw_keys_dict(): # Column renaming is for readability # Special delimiter because columns might contain commas result = pg_cli( - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", ) # Separate the NZSL db columns @@ -138,18 +143,13 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = subprocess.run( + result = aws_cli( [ - AWSCLI, "s3", "ls", f"s3://{s3_bucket}", "--recursive", ], - env=os.environ, - capture_output=True, - check=True, - text=True, ) # Separate out just the key from date, time, size, key @@ -268,9 +268,8 @@ def get_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def 
get_s3_canned_acl(video_key): - result = subprocess.run( + result = aws_cli( [ - AWSCLI, "s3api", "get-object-acl", "--output", @@ -281,20 +280,13 @@ def get_s3_canned_acl(video_key): AWS_S3_BUCKET, "--key", video_key, - ], - env=os.environ, - check=True, - capture_output=True, - text=True, + ] ) acls_grants = result.stdout.strip().split("\t") - + canned_acl = "unknown" if len(acls_grants) > 1: - if ( - acls_grants[0] == "FULL_CONTROL" - and acls_grants[1] == "READ" - ): + if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": canned_acl = "public-read" elif acls_grants[0] == "FULL_CONTROL": canned_acl = "private" From 975540e7c07e437955653aa99b2476afb480a9e9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 12:52:53 +1100 Subject: [PATCH 121/222] Another AWSCLI wrap. --- bin/get-video-s3-acls.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 2e5b1308..c8b99b00 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -283,7 +283,7 @@ def get_s3_canned_acl(video_key): ] ) acls_grants = result.stdout.strip().split("\t") - + canned_acl = "unknown" if len(acls_grants) > 1: if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": @@ -296,9 +296,8 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - result = subprocess.run( + result = aws_cli( [ - AWSCLI, "s3api", "get-object-attributes", "--object-attributes", @@ -311,11 +310,7 @@ def get_s3_lastmodified(video_key): AWS_S3_BUCKET, "--key", video_key, - ], - env=os.environ, - check=True, - capture_output=True, - text=True, + ] ) return result.stdout.strip() From 6370e2124e28a69e8a3629fc822136607d54b5e4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:01:22 +1100 Subject: [PATCH 122/222] get-object-attributes -> head-object for LastModified --- bin/get-video-s3-acls.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c8b99b00..f98f46c1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -299,9 +299,7 @@ def get_s3_lastmodified(video_key): result = aws_cli( [ "s3api", - "get-object-attributes", - "--object-attributes", - "ObjectParts", + "head-object", "--output", "text", "--query", From a773a6381817484bf10560d34f2d2ef0dfa19b43 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:03:14 +1100 Subject: [PATCH 123/222] Removed json module --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index f98f46c1..9c04fe0a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -10,7 +10,6 @@ import sys import subprocess import argparse -import json import re from pprint import pprint From 1adbe925ea67bf36d19ed4ff28b76ad3d937bbe3 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:30:50 +1100 Subject: [PATCH 124/222] Moved header fn closer to row fn --- bin/get-video-s3-acls.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 9c04fe0a..4dbccf7b 100755 --- a/bin/get-video-s3-acls.py +++ 
b/bin/get-video-s3-acls.py @@ -233,24 +233,6 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): return this_all_keys_dict -def build_csv_header(): - return CSV_DELIMITER.join( - [ - "S3 Video key", - "Sbank Gloss", - "Sbank Gloss created at", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Action", - ] - ) - - # Cases # In S3 In NZSL Action # Is Not Delete S3 Object @@ -312,6 +294,24 @@ def get_s3_lastmodified(video_key): return result.stdout.strip() +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "S3 Video key", + "Sbank Gloss", + "Sbank Gloss created at", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Action", + ] + ) + + def build_csv_row( video_key, key_in_nzsl=False, From 2dd442fd1a03137274d98589aecf538a62e5b1d9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:26:55 +1100 Subject: [PATCH 125/222] Reorder fields --- bin/get-video-s3-acls.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4dbccf7b..e5748d93 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -297,17 +297,17 @@ def get_s3_lastmodified(video_key): def build_csv_header(): return CSV_DELIMITER.join( [ + "Action", "S3 Video key", - "Sbank Gloss", "Sbank Gloss created at", - "S3 LastModified", "S3 Expected Canned ACL", "S3 Actual Canned ACL", "Sbank Gloss ID", "Sbank Video ID", "Sbank Gloss public", "Sbank Video public", - "Action", + "Sbank Gloss", + "S3 LastModified", ] ) @@ -340,17 +340,17 @@ def build_csv_row( return CSV_DELIMITER.join( [ + action, f"{video_key}", - f"{gloss_idgloss}", f"{gloss_created_at}", - f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", f"{video_id}", f"{gloss_public}", f"{video_public}", - action, + f"{gloss_idgloss}", + f"{lastmodified}", ] ) From 6f38ac0b26e967f956d4c22731b85b79201541a8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:33:39 +1100 Subject: [PATCH 126/222] Canned ACL variable removed --- bin/get-video-s3-acls.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e5748d93..32b4a936 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -265,14 +265,13 @@ def get_s3_canned_acl(video_key): ) acls_grants = result.stdout.strip().split("\t") - canned_acl = "unknown" if len(acls_grants) > 1: if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": - canned_acl = "public-read" + return "public-read" elif acls_grants[0] == "FULL_CONTROL": - canned_acl = "private" + return "private" - return canned_acl + return "unknown" # Get S3 object's LastModified date/time From 92988c4bb8cd8dac0b5ddffe7a1b8c447be2f7bc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:35:47 +1100 Subject: [PATCH 127/222] Renamed action function --- bin/get-video-s3-acls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 32b4a936..71e2bf66 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -238,7 
+238,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): # Is Not Delete S3 Object # Is Is Update ACL # Not Is Review -def get_action(key_in_nzsl, key_in_s3): +def get_recommended_action(key_in_nzsl, key_in_s3): if key_in_s3: if key_in_nzsl: return "Update ACL" @@ -335,7 +335,7 @@ def build_csv_row( lastmodified = "" canned_acl = "" - action = get_action(key_in_nzsl, key_in_s3) + action = get_recommended_action(key_in_nzsl, key_in_s3) return CSV_DELIMITER.join( [ From 438a83e31cf5d23415b866bc83082bbb3a0c96ca Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:40:16 +1100 Subject: [PATCH 128/222] Else's defaulted --- bin/get-video-s3-acls.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 71e2bf66..e36df4c1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -323,17 +323,15 @@ def build_csv_row( video_public=False, ): # See signbank/video/models.py, line 59, function set_public_acl() + canned_acl_expected = "" if key_in_nzsl: canned_acl_expected = "public-read" if video_public else "private" - else: - canned_acl_expected = "" + lastmodified = "" + canned_acl = "" if key_in_s3: lastmodified = get_s3_lastmodified(video_key) canned_acl = get_s3_canned_acl(video_key) - else: - lastmodified = "" - canned_acl = "" action = get_recommended_action(key_in_nzsl, key_in_s3) From ea5d6bb490a49a9196314a24584e15b5597618e8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:44:34 +1100 Subject: [PATCH 129/222] Guard code removed. --- bin/get-video-s3-acls.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e36df4c1..7d88ccb8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -60,8 +60,6 @@ def pg_cli(args_list): - if not isinstance(args_list, list): - args_list = [args_list] return subprocess.run( [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], env=os.environ, @@ -72,8 +70,6 @@ def pg_cli(args_list): def aws_cli(args_list): - if not isinstance(args_list, list): - args_list = [args_list] return subprocess.run( [AWSCLI] + args_list, env=os.environ, From c214dde0220f7144c45addf1038083ab5cac324b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:45:47 +1100 Subject: [PATCH 130/222] Debug removed. --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7d88ccb8..6b404ba2 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,7 +11,6 @@ import subprocess import argparse import re -from pprint import pprint parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" From e342fac27d8232c5ab3a85098e6922960b7e57de Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:51:22 +1100 Subject: [PATCH 131/222] Array argument fix --- bin/get-video-s3-acls.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 6b404ba2..6974727d 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -88,17 +88,19 @@ def get_nzsl_raw_keys_dict(): # Column renaming is for readability # Special delimiter because columns might contain commas result = pg_cli( - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] ) # Separate the NZSL db columns From b64e1adb5413c2e05d25f35c8e4433f323ff7191 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 23 Oct 2024 10:33:53 +1100 Subject: [PATCH 132/222] Swapped created_at and lastmodified columns --- bin/get-video-s3-acls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 6974727d..573de55a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -295,7 +295,7 @@ def build_csv_header(): [ "Action", "S3 Video key", - "Sbank Gloss created at", + "S3 LastModified", "S3 Expected Canned ACL", "S3 Actual Canned ACL", "Sbank Gloss ID", @@ -303,7 +303,7 @@ def build_csv_header(): "Sbank Gloss public", "Sbank Video public", "Sbank Gloss", - "S3 LastModified", + "Sbank Gloss created at", ] ) @@ -336,7 +336,7 @@ def build_csv_row( [ action, f"{video_key}", - f"{gloss_created_at}", + f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", f"{gloss_id}", @@ -344,7 +344,7 @@ def build_csv_row( f"{gloss_public}", f"{video_public}", f"{gloss_idgloss}", - f"{lastmodified}", + f"{gloss_created_at}", ] ) From ba137dcd07d56497f718b54a91c224549b8868c2 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 12:53:26 +1100 Subject: [PATCH 133/222] FULL JOIN from INNER JOIN --- bin/get-video-s3-acls.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 573de55a..4aa4b469 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -98,7 +98,8 @@ def get_nzsl_raw_keys_dict(): "vg.is_public AS video_public, " "vg.id AS video_id, " "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", ] ) From a50455418fb42b46d4ffa5a4e1a60137249d32f8 Mon Sep 17 00:00:00 2001 From: jonholdsworth 
<82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:16:52 +1100 Subject: [PATCH 134/222] Internal changes --- bin/get-video-s3-acls.py | 47 +++++----------------------------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4aa4b469..c27adee7 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -175,58 +175,23 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: - # Split out for readability - [ - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] = dict_row - this_all_keys_dict[video_key] = [ True, # NZSL PRESENT True, # S3 PRESENT - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] + ] + dict_row else: this_all_keys_dict[video_key] = [ False, # NZSL Absent True, # S3 PRESENT - "", # gloss_idgloss, - "", # gloss_created_at, - "", # gloss_id - "", # video_id, - "", # gloss_public, - "", # video_public, - ] + ] + [""] * 6 # Find NZSL keys that are absent from S3 (present handled above) - for video_key, [ - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in this_nzsl_raw_keys_dict.items(): + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: this_all_keys_dict[video_key] = [ True, # NZSL PRESENT False, # S3 Absent - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] + ] + dict_row return this_all_keys_dict @@ -357,8 +322,8 @@ def process_keys(this_all_keys_dict): print(build_csv_header()) - for video_key, values in this_all_keys_dict.items(): - print(build_csv_row(video_key, *values)) + for video_key, dict_row in this_all_keys_dict.items(): + print(build_csv_row(video_key, *dict_row)) print(f"Env: {args.env}", file=sys.stderr) From 75107c98e91dffc4434f5568f6ceb32e0279f06d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:20:22 +1100 Subject: [PATCH 135/222] Comments --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c27adee7..f8a888ca 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -120,6 +120,7 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") + # This sets the initial field ordering in the dictionary this_nzsl_raw_keys_dict[video_key] = [ gloss_idgloss.replace(CSV_DELIMITER, ""), gloss_created_at, From 66b5db1e15283ae79ace7c1613e7e45772b62ed6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:21:35 +1100 Subject: [PATCH 136/222] Comments --- bin/get-video-s3-acls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index f8a888ca..a329ff07 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -120,7 +120,7 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") - # This sets the initial field ordering in the dictionary + # This sets the initial field ordering in the all_keys dictionary row this_nzsl_raw_keys_dict[video_key] = [ gloss_idgloss.replace(CSV_DELIMITER, ""), gloss_created_at, From e166694114f35c6e2d01d91cce4569c0983d4a76 Mon Sep 
17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 24 Oct 2024 13:27:05 +1100 Subject: [PATCH 137/222] Comments --- bin/get-video-s3-acls.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index a329ff07..89d952f1 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -317,7 +317,6 @@ def build_csv_row( # From the keys present in NZSL, get all their S3 information -# If we are in 'Go' mode, perform actions def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) From 0642a259b58d86058b3b94e1826e35e73ebcd309 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:29:50 +1100 Subject: [PATCH 138/222] Added forced retries to AWS command --- bin/get-video-s3-acls.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 89d952f1..b28598bf 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -11,6 +11,7 @@ import subprocess import argparse import re +from time import sleep parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -69,13 +70,26 @@ def pg_cli(args_list): def aws_cli(args_list): - return subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output # Get the video files info from NZSL Signbank From 97b559a43040486f5c29e6ca9da8f3f03daae52c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 14:04:56 +1100 Subject: [PATCH 139/222] Superfluous arguments removed --- bin/get-video-s3-acls.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index b28598bf..4328cd08 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,14 +17,6 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) -# 'Go' mode, args.do_actions -parser.add_argument( - "--do-actions", - action="store_true", - default=False, - required=False, - help="Actually perform Delete objects or change ACLs (DESTRUCTIVE operation)", -) parser.add_argument( "--env", default="uat", @@ -43,12 +35,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--tmpdir", - default="/tmp/nzsl", - required=False, - help=f"Temp dir path (default: %(default)s)", -) args = parser.parse_args() # Globals From a588af28e660e29c16478ee1c0a22c335c3c314f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 15:32:11 +1100 Subject: [PATCH 140/222] Experimental Django code --- bin/get-video-s3-acls.py | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 4328cd08..be20313c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,6 +12,22 @@ import argparse import re from time import sleep +from pprint import pprint +import boto3 +import django + +# Magic required to allow this script to use Signbank Django classes +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +pprint(sys.path) +os.environ.setdefault( + "DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application +get_wsgi_application() +from django.contrib.auth import get_user_model +User = get_user_model() + +# Test +from signbank.dictionary.models import FieldChoice, Gloss parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -35,11 +51,23 @@ required=False, help=f"AWS client path (default: %(default)s)", ) +parser.add_argument( + "--tests", + action="store_true", + default=False, + required=False, + help="Run remote tests instead of generating CSV output", +) + args = parser.parse_args() # Globals CSV_DELIMITER = "," -DATABASE_URL = os.getenv("DATABASE_URL", "") +DATABASE_URL = ( + "postgres://postgres:postgres@localhost:5432/postgres" + if args.tests + else os.getenv("DATABASE_URL", "") +) AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -316,6 +344,20 @@ def build_csv_row( ) +# Run some tests against the remote endpoints +# This is a test-harness for now +# Takes advantage of the fact we have a lot of setup infrastructure in this script already +def do_tests(): + # Debugging safety + if args.env != "dev": + print("Error: tests must be in 'dev' environment") + exit() + print(f"DATABASE_URL:{DATABASE_URL}") + print("Running tests") + s3 = boto3.client("s3") + #pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) + #get_nzsl_raw_keys_dict() + # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) @@ -332,6 +374,9 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) +if args.tests: + do_tests() +else: + process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) + ) From 5d11500ca02f68fb5aa902e76d1ce46041444509 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 
2024 16:29:50 +1100 Subject: [PATCH 141/222] Experimental Django code --- bin/get-video-s3-acls.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index be20313c..82e82638 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -26,9 +26,6 @@ from django.contrib.auth import get_user_model User = get_user_model() -# Test -from signbank.dictionary.models import FieldChoice, Gloss - parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." @@ -347,6 +344,7 @@ def build_csv_row( # Run some tests against the remote endpoints # This is a test-harness for now # Takes advantage of the fact we have a lot of setup infrastructure in this script already +from signbank.dictionary.models import FieldChoice, Gloss def do_tests(): # Debugging safety if args.env != "dev": From d430c59e2a13ebc7c81be947b5aeefd069b34847 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:32:51 +1100 Subject: [PATCH 142/222] black --- bin/get-video-s3-acls.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 82e82638..ec4b9f4c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -19,11 +19,12 @@ # Magic required to allow this script to use Signbank Django classes sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) pprint(sys.path) -os.environ.setdefault( - "DJANGO_SETTINGS_MODULE", "signbank.settings.development") +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") from django.core.wsgi import get_wsgi_application + get_wsgi_application() from django.contrib.auth import get_user_model + User = get_user_model() parser = argparse.ArgumentParser( @@ -345,6 +346,8 @@ def build_csv_row( # This is a test-harness for now # Takes advantage of the fact we have a lot of setup infrastructure in this script already from signbank.dictionary.models import FieldChoice, Gloss + + def do_tests(): # Debugging safety if args.env != "dev": @@ -353,8 +356,9 @@ def do_tests(): print(f"DATABASE_URL:{DATABASE_URL}") print("Running tests") s3 = boto3.client("s3") - #pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) - #get_nzsl_raw_keys_dict() + # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) + # get_nzsl_raw_keys_dict() + # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): From 566ad2a75b7a6d9fe268ad2b2670f3f7f596ba82 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:33:55 +1100 Subject: [PATCH 143/222] black --- bin/get-video-s3-acls.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index ec4b9f4c..37f5b73f 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -14,11 +14,9 @@ from time import sleep from pprint import pprint import boto3 -import django # Magic required to allow this script to use Signbank Django classes sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -pprint(sys.path) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") from django.core.wsgi import get_wsgi_application @@ -26,6 +24,7 @@ from django.contrib.auth import get_user_model User = get_user_model() 
+from signbank.dictionary.models import FieldChoice, Gloss parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -345,9 +344,6 @@ def build_csv_row( # Run some tests against the remote endpoints # This is a test-harness for now # Takes advantage of the fact we have a lot of setup infrastructure in this script already -from signbank.dictionary.models import FieldChoice, Gloss - - def do_tests(): # Debugging safety if args.env != "dev": From 4e62dbd6ac6459377cd043e99a02827c95985899 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:09:31 +1100 Subject: [PATCH 144/222] Django imports only occur under tests (requires a virtualenv). Postgres call handles and reports exceptions informatively. --- bin/get-video-s3-acls.py | 46 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 37f5b73f..c537abfe 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -15,16 +15,6 @@ from pprint import pprint import boto3 -# Magic required to allow this script to use Signbank Django classes -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() -from django.contrib.auth import get_user_model - -User = get_user_model() -from signbank.dictionary.models import FieldChoice, Gloss parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -58,6 +48,22 @@ args = parser.parse_args() + +if args.tests: + # Magic required to allow this script to use Signbank Django classes + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth import get_user_model + + User = get_user_model() + + from signbank.dictionary.models import FieldChoice, Gloss + + # Globals CSV_DELIMITER = "," DATABASE_URL = ( @@ -71,13 +77,19 @@ def pg_cli(args_list): - return subprocess.run( - [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) def aws_cli(args_list): From 7c997a662573790caa02875fc1d0353c16358f74 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:13:17 +1100 Subject: [PATCH 145/222] exit on postgres exception --- bin/get-video-s3-acls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index c537abfe..5203b34a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -90,6 +90,7 @@ def pg_cli(args_list): print(e.cmd, file=sys.stderr) print(e.stdout, file=sys.stderr) print(e.stderr, file=sys.stderr) + exit() def aws_cli(args_list): From a1e10a71a2122699903da236be2cde1064a7233f 
Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:39:25 +1100 Subject: [PATCH 146/222] Postgres tests safety guard --- bin/get-video-s3-acls.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5203b34a..7625239a 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -61,16 +61,20 @@ User = get_user_model() - from signbank.dictionary.models import FieldChoice, Gloss - + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) # Globals CSV_DELIMITER = "," -DATABASE_URL = ( - "postgres://postgres:postgres@localhost:5432/postgres" - if args.tests - else os.getenv("DATABASE_URL", "") -) +DATABASE_URL = os.getenv("DATABASE_URL", "") AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -363,10 +367,15 @@ def do_tests(): print("Error: tests must be in 'dev' environment") exit() print(f"DATABASE_URL:{DATABASE_URL}") + if DATABASE_URL.find("@localhost") < 0: + print("Error: database url must contain '@localhost'") + exit() + print("Running tests") s3 = boto3.client("s3") # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) # get_nzsl_raw_keys_dict() + pprint(Gloss.objects.all()) # From the keys present in NZSL, get all their S3 information @@ -387,7 +396,8 @@ def process_keys(this_all_keys_dict): if args.tests: do_tests() -else: - process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) - ) + exit() + +process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) From a05eb003fca162354b76ea95c05c492b4f8ad606 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:39:31 +1100 Subject: [PATCH 147/222] Experimental refactor csv_import --- signbank/dictionary/csv_import.py | 884 ++++++++++++++++++------------ 1 file changed, 538 insertions(+), 346 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0bfdff69..0685d701 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +from pprint import pprint + import codecs import csv import datetime @@ -26,8 +28,16 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, - ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) +from .models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -35,7 +45,7 @@ @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -44,31 +54,53 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. 
- if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] - if request.method == 'POST': + if request.method == "POST": form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data['dataset'] - if 'view_dataset' not in get_perms(request.user, dataset): + dataset = form.cleaned_data["dataset"] + if "view_dataset" not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _("You do not have permissions to import glosses to this lexicon.") + msg = _( + "You do not have permissions to import glosses to this lexicon." + ) messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') + glossreader = csv.reader( + codecs.iterdecode(form.cleaned_data["file"], "utf-8"), + delimiter=",", + quotechar='"', + ) except csv.Error as e: # Can't open file, remove session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. - messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("File must be UTF-8 encoded!") + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) for row in glossreader: if glossreader.line_num == 1: @@ -87,74 +119,113 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session['dataset_id'] = dataset.id - request.session['glosses_new'] = glosses_new - - return render(request, 'dictionary/import_gloss_csv_confirmation.html', - {'glosses_new': glosses_new, - 'glosses_exists': glosses_exists, - 'dataset': dataset, }) + request.session["dataset_id"] = dataset.id + request.session["glosses_new"] = glosses_new + + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + { + "glosses_new": glosses_new, + "glosses_exists": glosses_exists, + "dataset": dataset, + }, + ) else: # If form is not valid, set a error message and return to the original form. 
- messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' - 'or there is some other problem.')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": form}, + ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') + allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_gloss_csv.html", - {'import_csv_form': csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": csv_form}, + ) @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == 'POST': - if 'cancel' in request.POST: + if request.method == "POST": + if "cancel" in request.POST: # If user cancels adding data, flush session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + messages.add_message( + request, messages.WARNING, _("Cancelled adding CSV data.") + ) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) - elif 'confirm' in request.POST: + elif "confirm" in request.POST: glosses_added = [] dataset = None - if 'glosses_new' and 'dataset_id' in request.session: - dataset = Dataset.objects.get(id=request.session['dataset_id']) - for gloss in request.session['glosses_new']: + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + for gloss in request.session["glosses_new"]: # If the Gloss does not already exist, continue adding. 
- if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): + if not Gloss.objects.filter( + dataset=dataset, idgloss=gloss[0] + ).exists(): try: - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + idgloss_mi=gloss[1], + created_by=request.user, + updated_by=request.user, + ) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + created_by=request.user, + updated_by=request.user, + ) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session['glosses_new'] - del request.session['dataset_id'] + del request.session["glosses_new"] + del request.session["dataset_id"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) - return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, - 'dataset': dataset.name}) + messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset.name}, + ) else: - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) share_csv_header_list = [ @@ -191,20 +262,32 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." 
+ ), + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, + ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -218,7 +301,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"' + quotechar='"', ) skipped_existing_glosses = [] @@ -254,29 +337,40 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) # Store dataset's id and the list of glosses to be added in session. request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses - }) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses, + }, + ) def update_retrieval_videos(videos, gloss_data): - """ prep videos, illustrations and usage example for video retrieval """ + """prep videos, illustrations and usage example for video retrieval""" gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -284,16 +378,14 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = ( - f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" - ) + file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0 + "version": 0, } videos.append(glossvideo) @@ -309,7 +401,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i + "version": i, } videos.append(glossvideo) @@ -325,14 +417,18 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i + "version": i, } videos.append(glossvideo) + @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): + + pprint(request.session.__dict__) + """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -348,6 +444,31 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not 
"confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) + if "glosses_new" and "dataset_id" in request.session: + [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( + request.session["glosses_new"], request.session["dataset_id"] + ) + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. + messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset_name}, + ) + + +def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): + """Does the thing""" + + print("IN CONFIRM INNER") + glosses_added = [] dataset = None translations = [] @@ -362,49 +483,49 @@ def confirm_import_nzsl_share_gloss_csv(request): bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter( - field="semantic_field" - ).values_list("english_name", "pk") - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) - - for row_num, gloss_data in enumerate(request.session["glosses_new"]): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass + dataset = Dataset.objects.get(id=session_dataset_id) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( + "english_name", "pk" + ) + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", 
flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append(Gloss( + for row_num, gloss_data in enumerate(session_glosses_new): + # will iterate over these glosses again after bulk creating + # and to ensure we get the correct gloss_data for words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass + + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append( + Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -415,183 +536,174 @@ def confirm_import_nzsl_share_gloss_csv(request): created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - )) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: + ) + ) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: + new_machine_value = random.randint(0, 99999999) + while new_machine_value in existing_machine_values: new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: - new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append(FieldChoice( + existing_machine_values.append(new_machine_value) + create_signers.append( + FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value - )) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic] - ) - ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: + machine_value=new_machine_value, + ) + ) + new_signers = 
FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"] + fieldchoice_id=semantic_fields_dict[topic], ) ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: + bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict["Miscellaneous"], + ) + ) - # create GlossTranslations for english and maori words - translations.append(GlossTranslations( + # create GlossTranslations for english and maori words + translations.append( + GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None) - )) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, - language=language_mi, - translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None), + ) + ) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - translations.append(translation) + translation = GlossTranslations( + gloss=gloss, language=language_mi, translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + translations.append(translation) - # Create comment for gloss_data notes - comments.append(Comment( + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) + + # Create comment for gloss_data notes + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - submit_date=comment_submit_date - )) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except 
IndexError: - comment_content = comment - user_name = "Unknown" - comments.append(Comment( + submit_date=comment_submit_date, + ) + ) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = comment_content[1] + except IndexError: + comment_content = comment + user_name = "Unknown" + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date - )) + submit_date=comment_submit_date, + ) + ) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append(ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append( + ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]) - )) - - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) - - glosses_added.append(gloss) - - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=nzsl_share_tag - - )) - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=not_public_tag - - )) + disagrees=int(gloss_data["disagrees"]), + ) + ) - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + glosses_added.append(gloss) - # start Thread to process gloss video retrieval in the background - t = threading.Thread( - target=retrieve_videos_for_glosses, - args=[videos], - daemon=True + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag + ) + ) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag + ) ) - t.start() - del request.session["glosses_new"] - del request.session["dataset_id"] + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) - # Set a message to be shown so that the user knows what is going on. 
- messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) - return render( - request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_added": glosses_added, - "dataset": dataset.name - } - ) + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + # start Thread to process gloss video retrieval in the background + t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) + t.start() + + return [glosses_added, dataset.name] @login_required @@ -608,18 +720,29 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, + ) validation_records = [] skipped_rows = [] @@ -627,7 +750,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"' + quotechar='"', ) question_numbers = [] @@ -669,22 +792,33 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render(request, "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}) + return render( + request, + "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}, + ) @login_required @@ -714,13 +848,21 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: + if ( + "validation_records" + and "question_numbers" + and "question_glossvideo_map" in request.session + ): # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( + glossvideo_pk_list + ) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) + ready_for_validation_tag = Tag.objects.get( + name=settings.TAG_READY_FOR_VALIDATION + ) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -740,35 +882,43 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss - validation_records_added.append(ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - )) + gloss = glossvideo_dict[ + question_glossvideo_map[question_number] + ].gloss + validation_records_added.append( + ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + ) + ) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ - question_number] + missing_gloss_pk_question_pairs[question_number] = ( + question_glossvideo_map[question_number] + ) for gloss_pk in gloss_pks: - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag - - )) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag, + ) + ) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) + ValidationRecord.objects.bulk_create( + validation_records_added, ignore_conflicts=True + ) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag + tag=ready_for_validation_tag, ).delete() del request.session["validation_records"] @@ -776,17 +926,19 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_qualtrics_csv_confirmation.html", + request, + "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs - } + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, + }, ) @@ -815,18 +967,29 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, + ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -837,29 +1000,38 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments" + "comments", ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"' + quotechar='"', + ) + missing_headers = set(required_headers) - set( + validation_record_reader.fieldnames ) - missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
- messages.add_message(request, messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}")) - return render(request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, + messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}"), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) + _check_row_can_be_converted_to_integer( + row, ["yes", "no", "abstain or not sure"] + ) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -868,35 +1040,49 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("File contains non-compliant data:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count) - } + "group_gloss_count": dict(group_gloss_count), + }, ) @@ -940,14 +1126,18 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append(ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, - comments=comments - )) + manual_validation_aggregations.append( + ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=( + int(sign_seen_not_sure) if sign_seen_not_sure else 0 + ), + comments=comments, + ) + ) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -955,13 +1145,15 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses - } + "missing_glosses": missing_glosses, + }, ) From 00a58b6e9ae1b43465cf013c3c16e04b2c5a336b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:40:27 +1100 Subject: [PATCH 148/222] Revert "Experimental refactor csv_import" Inadvertently left debug behind. This reverts commit a05eb003fca162354b76ea95c05c492b4f8ad606. 
--- signbank/dictionary/csv_import.py | 884 ++++++++++++------------------ 1 file changed, 346 insertions(+), 538 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0685d701..0bfdff69 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from pprint import pprint - import codecs import csv import datetime @@ -28,16 +26,8 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) +from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, + ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -45,7 +35,7 @@ @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -54,53 +44,31 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] - if request.method == "POST": + if request.method == 'POST': form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data["dataset"] - if "view_dataset" not in get_perms(request.user, dataset): + dataset = form.cleaned_data['dataset'] + if 'view_dataset' not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _( - "You do not have permissions to import glosses to this lexicon." - ) + msg = _("You do not have permissions to import glosses to this lexicon.") messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader( - codecs.iterdecode(form.cleaned_data["file"], "utf-8"), - delimiter=",", - quotechar='"', - ) + glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') except csv.Error as e: # Can't open file, remove session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. 
- messages.add_message( - request, messages.ERROR, _("File must be UTF-8 encoded!") - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) for row in glossreader: if glossreader.line_num == 1: @@ -119,113 +87,74 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session["dataset_id"] = dataset.id - request.session["glosses_new"] = glosses_new - - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - { - "glosses_new": glosses_new, - "glosses_exists": glosses_exists, - "dataset": dataset, - }, - ) + request.session['dataset_id'] = dataset.id + request.session['glosses_new'] = glosses_new + + return render(request, 'dictionary/import_gloss_csv_confirmation.html', + {'glosses_new': glosses_new, + 'glosses_exists': glosses_exists, + 'dataset': dataset, }) else: # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' + 'or there is some other problem.')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") + allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_gloss_csv.html", + {'import_csv_form': csv_form}, ) @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == "POST": - if "cancel" in request.POST: + if request.method == 'POST': + if 'cancel' in request.POST: # If user cancels adding data, flush session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, messages.WARNING, _("Cancelled adding CSV data.") - ) - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) - elif "confirm" in request.POST: + elif 'confirm' in request.POST: glosses_added = [] dataset = None - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - for gloss in request.session["glosses_new"]: + if 'glosses_new' and 'dataset_id' in request.session: + dataset = Dataset.objects.get(id=request.session['dataset_id']) + for gloss in request.session['glosses_new']: # If the Gloss does not already exist, continue adding. - if not Gloss.objects.filter( - dataset=dataset, idgloss=gloss[0] - ).exists(): + if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): try: - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - idgloss_mi=gloss[1], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], + created_by=request.user, updated_by=request.user) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], + created_by=request.user, updated_by=request.user) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session["glosses_new"] - del request.session["dataset_id"] + del request.session['glosses_new'] + del request.session['dataset_id'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset.name}, - ) + messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) + return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, + 'dataset': dataset.name}) else: - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) share_csv_header_list = [ @@ -262,32 +191,20 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. 
- csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -301,7 +218,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"', + quotechar='"' ) skipped_existing_glosses = [] @@ -337,40 +254,29 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) # Store dataset's id and the list of glosses to be added in session. 
request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses, - }, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses + }) def update_retrieval_videos(videos, gloss_data): - """prep videos, illustrations and usage example for video retrieval""" + """ prep videos, illustrations and usage example for video retrieval """ gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -378,14 +284,16 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + file_name = ( + f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + ) glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0, + "version": 0 } videos.append(glossvideo) @@ -401,7 +309,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i, + "version": i } videos.append(glossvideo) @@ -417,18 +325,14 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i, + "version": i } videos.append(glossvideo) - @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): - - pprint(request.session.__dict__) - """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -444,31 +348,6 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not "confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) - if "glosses_new" and "dataset_id" in request.session: - [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( - request.session["glosses_new"], request.session["dataset_id"] - ) - - del request.session["glosses_new"] - del request.session["dataset_id"] - - # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset_name}, - ) - - -def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Does the thing""" - - print("IN CONFIRM INNER") - glosses_added = [] dataset = None translations = [] @@ -483,49 +362,49 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - dataset = Dataset.objects.get(id=session_dataset_id) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( - "english_name", "pk" - ) - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter( + field="semantic_field" + ).values_list("english_name", "pk") + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) + + for row_num, gloss_data in enumerate(request.session["glosses_new"]): + # will iterate over these glosses again after bulk creating + # and to ensure we get the correct gloss_data for words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass - for row_num, gloss_data in enumerate(session_glosses_new): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words 
that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass - - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append( - Gloss( + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append(Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -536,174 +415,183 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - ) - ) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: - new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: + )) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append( - FieldChoice( + while new_machine_value in existing_machine_values: + new_machine_value = random.randint(0, 99999999) + existing_machine_values.append(new_machine_value) + create_signers.append(FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value, - ) - ) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): + machine_value=new_machine_value + )) + new_signers = FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): + 
bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict[topic] + ) + ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic], + fieldchoice_id=semantic_fields_dict["Miscellaneous"] ) ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"], - ) - ) - # create GlossTranslations for english and maori words - translations.append( - GlossTranslations( + # create GlossTranslations for english and maori words + translations.append(GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None), - ) - ) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, language=language_mi, translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None) + )) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" + + translation = GlossTranslations( + gloss=gloss, + language=language_mi, + translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - translations.append(translation) + translations.append(translation) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) - # Create comment for gloss_data notes - comments.append( - Comment( + # Create comment for gloss_data notes + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - submit_date=comment_submit_date, - ) - ) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except IndexError: - comment_content = comment - user_name = "Unknown" - comments.append( - Comment( + submit_date=comment_submit_date + )) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = 
comment_content[1] + except IndexError: + comment_content = comment + user_name = "Unknown" + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date, - ) - ) + submit_date=comment_submit_date + )) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append( - ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append(ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]), - ) - ) + disagrees=int(gloss_data["disagrees"]) + )) - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - glosses_added.append(gloss) + glosses_added.append(gloss) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag - ) - ) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=nzsl_share_tag - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + )) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=not_public_tag - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + )) - # start Thread to process gloss video retrieval in the background - t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) - t.start() + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + + # start Thread to process gloss video retrieval in the background + t = threading.Thread( + target=retrieve_videos_for_glosses, + args=[videos], + daemon=True + ) + t.start() + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. 
+ messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) + return render( + request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_added": glosses_added, + "dataset": dataset.name + } + ) - return [glosses_added, dataset.name] @login_required @@ -720,29 +608,18 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, ) validation_records = [] skipped_rows = [] @@ -750,7 +627,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"', + quotechar='"' ) question_numbers = [] @@ -792,33 +669,22 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}, - ) + return render(request, "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}) @login_required @@ -848,21 +714,13 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if ( - "validation_records" - and "question_numbers" - and "question_glossvideo_map" in request.session - ): + if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( - glossvideo_pk_list - ) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get( - name=settings.TAG_READY_FOR_VALIDATION - ) + ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -882,43 +740,35 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[ - question_glossvideo_map[question_number] - ].gloss - validation_records_added.append( - ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - ) - ) + gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss + validation_records_added.append(ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + )) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = ( - question_glossvideo_map[question_number] - ) + missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ + question_number] for gloss_pk in gloss_pks: - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag, - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag + + )) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create( - validation_records_added, ignore_conflicts=True - ) + ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag, + tag=ready_for_validation_tag ).delete() del request.session["validation_records"] @@ -926,19 +776,17 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", + request, "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, - }, + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs + } ) @@ -967,29 +815,18 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -1000,38 +837,29 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments", + "comments" ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"', - ) - missing_headers = set(required_headers) - set( - validation_record_reader.fieldnames + quotechar='"' ) + missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, - messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}"), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}")) + return render(request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer( - row, ["yes", "no", "abstain or not sure"] - ) + _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -1040,49 +868,35 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("File contains non-compliant data:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count), - }, + "group_gloss_count": dict(group_gloss_count) + } ) @@ -1126,18 +940,14 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append( - ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=( - int(sign_seen_not_sure) if sign_seen_not_sure else 0 - ), - comments=comments, - ) - ) + manual_validation_aggregations.append(ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, + comments=comments + )) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -1145,15 +955,13 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses, - }, + "missing_glosses": missing_glosses + } ) From 5c5fcc60ecef0ea2d62e00987ea5fa62a92d0266 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:41:36 +1100 Subject: [PATCH 149/222] Revert "Revert "Experimental refactor csv_import"" This reverts commit 00a58b6e9ae1b43465cf013c3c16e04b2c5a336b. 
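Note: re-applying the refactor restores the extraction of the bulk-import body into confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id), which takes plain session values rather than a request object. One practical payoff of that split is testability; the following is a rough, hypothetical sketch (assuming a pytest-style setup in which a dataset fixture exists and the importer user, tags, languages, and field choices the helper looks up are already seeded):

# Hypothetical test made possible by the extracted helper: no HttpRequest,
# session middleware, or messages framework is needed to drive the import.
def test_share_import_creates_gloss(dataset):
    glosses_added, dataset_name = confirm_import_nzsl_share_gloss_csv_inner(
        session_glosses_new=[{
            "id": "123",                       # NZSL Share id
            "word": "example",                 # English gloss word
            "contributor_username": "someone",
            "agrees": "1",
            "disagrees": "0",
        }],
        session_dataset_id=dataset.id,
    )
    assert dataset_name == dataset.name
    assert len(glosses_added) == 1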
--- signbank/dictionary/csv_import.py | 884 ++++++++++++++++++------------ 1 file changed, 538 insertions(+), 346 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0bfdff69..0685d701 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +from pprint import pprint + import codecs import csv import datetime @@ -26,8 +28,16 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, - ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) +from .models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -35,7 +45,7 @@ @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -44,31 +54,53 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] - if request.method == 'POST': + if request.method == "POST": form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data['dataset'] - if 'view_dataset' not in get_perms(request.user, dataset): + dataset = form.cleaned_data["dataset"] + if "view_dataset" not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _("You do not have permissions to import glosses to this lexicon.") + msg = _( + "You do not have permissions to import glosses to this lexicon." + ) messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') + glossreader = csv.reader( + codecs.iterdecode(form.cleaned_data["file"], "utf-8"), + delimiter=",", + quotechar='"', + ) except csv.Error as e: # Can't open file, remove session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. 
- messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("File must be UTF-8 encoded!") + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) for row in glossreader: if glossreader.line_num == 1: @@ -87,74 +119,113 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session['dataset_id'] = dataset.id - request.session['glosses_new'] = glosses_new - - return render(request, 'dictionary/import_gloss_csv_confirmation.html', - {'glosses_new': glosses_new, - 'glosses_exists': glosses_exists, - 'dataset': dataset, }) + request.session["dataset_id"] = dataset.id + request.session["glosses_new"] = glosses_new + + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + { + "glosses_new": glosses_new, + "glosses_exists": glosses_exists, + "dataset": dataset, + }, + ) else: # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' - 'or there is some other problem.')) - return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": form}, + ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') + allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_gloss_csv.html", - {'import_csv_form': csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_gloss_csv.html", + {"import_csv_form": csv_form}, + ) @login_required -@permission_required('dictionary.import_csv') +@permission_required("dictionary.import_csv") def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == 'POST': - if 'cancel' in request.POST: + if request.method == "POST": + if "cancel" in request.POST: # If user cancels adding data, flush session variables - if 'dataset_id' in request.session: del request.session['dataset_id'] - if 'glosses_new' in request.session: del request.session['glosses_new'] + if "dataset_id" in request.session: + del request.session["dataset_id"] + if "glosses_new" in request.session: + del request.session["glosses_new"] # Set a message to be shown so that the user knows what is going on. 
- messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + messages.add_message( + request, messages.WARNING, _("Cancelled adding CSV data.") + ) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) - elif 'confirm' in request.POST: + elif "confirm" in request.POST: glosses_added = [] dataset = None - if 'glosses_new' and 'dataset_id' in request.session: - dataset = Dataset.objects.get(id=request.session['dataset_id']) - for gloss in request.session['glosses_new']: + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + for gloss in request.session["glosses_new"]: # If the Gloss does not already exist, continue adding. - if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): + if not Gloss.objects.filter( + dataset=dataset, idgloss=gloss[0] + ).exists(): try: - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + idgloss_mi=gloss[1], + created_by=request.user, + updated_by=request.user, + ) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], - created_by=request.user, updated_by=request.user) + new_gloss = Gloss( + dataset=dataset, + idgloss=gloss[0], + created_by=request.user, + updated_by=request.user, + ) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session['glosses_new'] - del request.session['dataset_id'] + del request.session["glosses_new"] + del request.session["dataset_id"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) - return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, - 'dataset': dataset.name}) + messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + return render( + request, + "dictionary/import_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset.name}, + ) else: - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) + return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) share_csv_header_list = [ @@ -191,20 +262,32 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. 
- csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( - id__in=[x.id for x in allowed_datasets]) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, ) + csv_form.fields["dataset"].queryset = csv_form.fields[ + "dataset" + ].queryset.filter(id__in=[x.id for x in allowed_datasets]) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, + ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -218,7 +301,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"' + quotechar='"', ) skipped_existing_glosses = [] @@ -254,29 +337,40 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, ) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, + ) # Store dataset's id and the list of glosses to be added in session. 
request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses - }) + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses, + }, + ) def update_retrieval_videos(videos, gloss_data): - """ prep videos, illustrations and usage example for video retrieval """ + """prep videos, illustrations and usage example for video retrieval""" gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -284,16 +378,14 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = ( - f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" - ) + file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0 + "version": 0, } videos.append(glossvideo) @@ -309,7 +401,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i + "version": i, } videos.append(glossvideo) @@ -325,14 +417,18 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i + "version": i, } videos.append(glossvideo) + @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): + + pprint(request.session.__dict__) + """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -348,6 +444,31 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not "confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) + if "glosses_new" and "dataset_id" in request.session: + [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( + request.session["glosses_new"], request.session["dataset_id"] + ) + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. 
+ messages.add_message( + request, messages.SUCCESS, _("Glosses were added successfully.") + ) + + return render( + request, + "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + {"glosses_added": glosses_added, "dataset": dataset_name}, + ) + + +def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): + """Does the thing""" + + print("IN CONFIRM INNER") + glosses_added = [] dataset = None translations = [] @@ -362,49 +483,49 @@ def confirm_import_nzsl_share_gloss_csv(request): bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter( - field="semantic_field" - ).values_list("english_name", "pk") - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) - - for row_num, gloss_data in enumerate(request.session["glosses_new"]): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass + dataset = Dataset.objects.get(id=session_dataset_id) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( + "english_name", "pk" + ) + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append(Gloss( + for row_num, gloss_data in enumerate(session_glosses_new): + # will iterate over these glosses again after bulk creating + # and to 
ensure we get the correct gloss_data for words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass + + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append( + Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -415,183 +536,174 @@ def confirm_import_nzsl_share_gloss_csv(request): created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - )) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: + ) + ) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: + new_machine_value = random.randint(0, 99999999) + while new_machine_value in existing_machine_values: new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: - new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append(FieldChoice( + existing_machine_values.append(new_machine_value) + create_signers.append( + FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value - )) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic] - ) - ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: + machine_value=new_machine_value, + ) + ) + new_signers = FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + 
cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"] + fieldchoice_id=semantic_fields_dict[topic], ) ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: + bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict["Miscellaneous"], + ) + ) - # create GlossTranslations for english and maori words - translations.append(GlossTranslations( + # create GlossTranslations for english and maori words + translations.append( + GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None) - )) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, - language=language_mi, - translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None), + ) + ) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - translations.append(translation) + translation = GlossTranslations( + gloss=gloss, language=language_mi, translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + translations.append(translation) - # Create comment for gloss_data notes - comments.append(Comment( + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) + + # Create comment for gloss_data notes + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - submit_date=comment_submit_date - )) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except IndexError: - comment_content = comment - user_name = "Unknown" - comments.append(Comment( + submit_date=comment_submit_date, + ) + ) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = comment_content[1] + except IndexError: + comment_content = 
comment + user_name = "Unknown" + comments.append( + Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date - )) + submit_date=comment_submit_date, + ) + ) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append(ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append( + ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]) - )) - - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) - - glosses_added.append(gloss) - - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=nzsl_share_tag - - )) - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss.pk, - tag=not_public_tag - - )) + disagrees=int(gloss_data["disagrees"]), + ) + ) - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + glosses_added.append(gloss) - # start Thread to process gloss video retrieval in the background - t = threading.Thread( - target=retrieve_videos_for_glosses, - args=[videos], - daemon=True + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag + ) + ) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag + ) ) - t.start() - del request.session["glosses_new"] - del request.session["dataset_id"] + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) - # Set a message to be shown so that the user knows what is going on. 
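# NOTE: a minimal, self-contained sketch of the stream-decode-and-parse
# pattern shared by import_qualtrics_csv above and the other import views
# (the function and variable names in the sketch are illustrative, not
# taken from this module):
#
#     import codecs
#     import csv
#
#     def read_rows(uploaded_file):
#         """Decode an uploaded binary file chunk by chunk and parse as CSV."""
#         reader = csv.DictReader(
#             codecs.iterdecode(uploaded_file, "utf-8"),
#             delimiter=",",
#             quotechar='"',
#         )
#         # csv.Error and UnicodeDecodeError propagate to the caller, which
#         # mirrors the try/except blocks used by the views here.
#         return list(reader)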
- messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) - return render( - request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_added": glosses_added, - "dataset": dataset.name - } - ) + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + # start Thread to process gloss video retrieval in the background + t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) + t.start() + + return [glosses_added, dataset.name] @login_required @@ -608,18 +720,29 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, + ) validation_records = [] skipped_rows = [] @@ -627,7 +750,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"' + quotechar='"', ) question_numbers = [] @@ -669,22 +792,33 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
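# NOTE: these import views share a two-step shape: parse the CSV, stage the
# parsed rows in the session, then bulk-create in the confirm view once the
# user approves. Django's default session serializer is JSON, so everything
# staged this way has to stay JSON-serializable (lists and dicts of plain
# strings and numbers, as below). A sketch of the round trip, with
# illustrative names:
#
#     request.session["rows"] = rows           # import view: stage
#     ...
#     rows = request.session.pop("rows")       # confirm view: consume once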
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render(request, "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}) + return render( + request, + "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}, + ) @login_required @@ -714,13 +848,21 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: + if ( + "validation_records" + and "question_numbers" + and "question_glossvideo_map" in request.session + ): # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( + glossvideo_pk_list + ) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) + ready_for_validation_tag = Tag.objects.get( + name=settings.TAG_READY_FOR_VALIDATION + ) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -740,35 +882,43 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss - validation_records_added.append(ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - )) + gloss = glossvideo_dict[ + question_glossvideo_map[question_number] + ].gloss + validation_records_added.append( + ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + ) + ) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ - question_number] + missing_gloss_pk_question_pairs[question_number] = ( + question_glossvideo_map[question_number] + ) for gloss_pk in gloss_pks: - bulk_tagged_items.append(TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag - - )) + bulk_tagged_items.append( + TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag, + ) + ) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) + ValidationRecord.objects.bulk_create( + validation_records_added, ignore_conflicts=True + ) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag + tag=ready_for_validation_tag, ).delete() del request.session["validation_records"] @@ -776,17 +926,19 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_qualtrics_csv_confirmation.html", + request, + "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs - } + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, + }, ) @@ -815,18 +967,29 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, + ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message(request, messages.ERROR, - _("The provided CSV-file does not meet the requirements " - "or there is some other problem.")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, ) + messages.add_message( + request, + messages.ERROR, + _( + "The provided CSV-file does not meet the requirements " + "or there is some other problem." + ), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, + ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -837,29 +1000,38 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments" + "comments", ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"' + quotechar='"', + ) + missing_headers = set(required_headers) - set( + validation_record_reader.fieldnames ) - missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
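# NOTE: the "utf-8-sig" codec used for the manual-validation reader above
# strips a leading UTF-8 byte order mark, which spreadsheet exports often
# prepend. With plain "utf-8" the BOM would stay glued to the first header
# name and trip the set-difference check, for example:
#
#     >>> set(["group"]) - set(["\ufeffgroup"])
#     {'group'}    # reported missing even though the column is present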
- messages.add_message(request, messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}")) - return render(request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, + messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}"), + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) + _check_row_can_be_converted_to_integer( + row, ["yes", "no", "abstain or not sure"] + ) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -868,35 +1040,49 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("File contains non-compliant data:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + messages.add_message( + request, messages.ERROR, _("Cannot open the file:" + str(e)) + ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render(request, "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, ) + return render( + request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, + ) # Store dataset's id and the list of glosses to be added in session. 
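# NOTE: _check_row_can_be_converted_to_integer is not shown in this patch;
# judging from the row loop and the ValueError handler above, it presumably
# verifies that the named count columns hold integers. A sketch under that
# assumption (the signature and the empty-cell handling are guesses):
#
#     def _check_row_can_be_converted_to_integer(row, fields):
#         """Raise ValueError if any of the given fields is non-numeric."""
#         for field in fields:
#             if row[field]:           # empty cells are treated as 0 later
#                 int(row[field])      # raises ValueError on bad data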
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count) - } + "group_gloss_count": dict(group_gloss_count), + }, ) @@ -940,14 +1126,18 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append(ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, - comments=comments - )) + manual_validation_aggregations.append( + ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=( + int(sign_seen_not_sure) if sign_seen_not_sure else 0 + ), + comments=comments, + ) + ) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -955,13 +1145,15 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message(request, messages.SUCCESS, - _("ValidationRecords were added successfully.")) + messages.add_message( + request, messages.SUCCESS, _("ValidationRecords were added successfully.") + ) return render( - request, "dictionary/import_manual_validation_csv_confirmation.html", + request, + "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses - } + "missing_glosses": missing_glosses, + }, ) From 3de214c844c138d568d9c5e36c234a1b1e6d692e Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:43:23 +1100 Subject: [PATCH 150/222] Debug removed --- signbank/dictionary/csv_import.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0685d701..ec6fed32 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from pprint import pprint - import codecs import csv import datetime @@ -426,9 +424,6 @@ def update_retrieval_videos(videos, gloss_data): @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): - - pprint(request.session.__dict__) - """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -465,10 +460,7 @@ def confirm_import_nzsl_share_gloss_csv(request): def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Does the thing""" - - print("IN CONFIRM INNER") - + """Performs CSV import actions""" glosses_added = [] dataset = None translations = [] From 335f07dbbf4c129620eb9d60a2b2e3afbcdbd0d3 Mon Sep 17 
00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:45:56 +1100 Subject: [PATCH 151/222] Revert "Debug removed" This reverts commit 3de214c844c138d568d9c5e36c234a1b1e6d692e. --- signbank/dictionary/csv_import.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index ec6fed32..0685d701 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +from pprint import pprint + import codecs import csv import datetime @@ -424,6 +426,9 @@ def update_retrieval_videos(videos, gloss_data): @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): + + pprint(request.session.__dict__) + """This view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -460,7 +465,10 @@ def confirm_import_nzsl_share_gloss_csv(request): def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Performs CSV import actions""" + """Does the thing""" + + print("IN CONFIRM INNER") + glosses_added = [] dataset = None translations = [] From 65de3ca1d902598de75c42d87ccf3a411df9895f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:46:00 +1100 Subject: [PATCH 152/222] Revert "Revert "Revert "Experimental refactor csv_import""" This reverts commit 5c5fcc60ecef0ea2d62e00987ea5fa62a92d0266. --- signbank/dictionary/csv_import.py | 884 ++++++++++++------------------ 1 file changed, 346 insertions(+), 538 deletions(-) diff --git a/signbank/dictionary/csv_import.py b/signbank/dictionary/csv_import.py index 0685d701..0bfdff69 100644 --- a/signbank/dictionary/csv_import.py +++ b/signbank/dictionary/csv_import.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from pprint import pprint - import codecs import csv import datetime @@ -28,16 +26,8 @@ from tagging.models import Tag, TaggedItem from .forms import CSVFileOnlyUpload, CSVUploadForm -from .models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) +from .models import (Dataset, FieldChoice, Gloss, GlossTranslations, Language, + ManualValidationAggregation, ShareValidationAggregation, ValidationRecord) from .tasks import retrieve_videos_for_glosses from ..video.models import GlossVideo @@ -45,7 +35,7 @@ @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def import_gloss_csv(request): """ Check which objects exist and which not. Then show the user a list of glosses that will be added if user confirms. @@ -54,53 +44,31 @@ def import_gloss_csv(request): glosses_new = [] glosses_exists = [] # Make sure that the session variables are flushed before using this view. 
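# NOTE: the membership-test-then-del flush just below has a one-line
# equivalent that the newer views in this module already use:
#
#     request.session.pop("dataset_id", None)
#     request.session.pop("glosses_new", None)
#
# pop() with a default is safe whether or not the key exists.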
- if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] - if request.method == "POST": + if request.method == 'POST': form = CSVUploadForm(request.POST, request.FILES) if form.is_valid(): - dataset = form.cleaned_data["dataset"] - if "view_dataset" not in get_perms(request.user, dataset): + dataset = form.cleaned_data['dataset'] + if 'view_dataset' not in get_perms(request.user, dataset): # If user has no permissions to dataset, raise PermissionDenied to show 403 template. - msg = _( - "You do not have permissions to import glosses to this lexicon." - ) + msg = _("You do not have permissions to import glosses to this lexicon.") messages.error(request, msg) raise PermissionDenied(msg) try: - glossreader = csv.reader( - codecs.iterdecode(form.cleaned_data["file"], "utf-8"), - delimiter=",", - quotechar='"', - ) + glossreader = csv.reader(codecs.iterdecode(form.cleaned_data['file'], 'utf-8'), delimiter=',', quotechar='"') except csv.Error as e: # Can't open file, remove session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('Cannot open the file:' + str(e))) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. - messages.add_message( - request, messages.ERROR, _("File must be UTF-8 encoded!") - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _('File must be UTF-8 encoded!')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': CSVUploadForm()}, ) for row in glossreader: if glossreader.line_num == 1: @@ -119,113 +87,74 @@ def import_gloss_csv(request): continue # Store dataset's id and the list of glosses to be added in session. - request.session["dataset_id"] = dataset.id - request.session["glosses_new"] = glosses_new - - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - { - "glosses_new": glosses_new, - "glosses_exists": glosses_exists, - "dataset": dataset, - }, - ) + request.session['dataset_id'] = dataset.id + request.session['glosses_new'] = glosses_new + + return render(request, 'dictionary/import_gloss_csv_confirmation.html', + {'glosses_new': glosses_new, + 'glosses_exists': glosses_exists, + 'dataset': dataset, }) else: # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." 
- ), - ) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, _('The provided CSV-file does not meet the requirements ' + 'or there is some other problem.')) + return render(request, 'dictionary/import_gloss_csv.html', {'import_csv_form': form}, ) else: # If request type is not POST, return to the original form. csv_form = CSVUploadForm() - allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") + allowed_datasets = get_objects_for_user(request.user, 'dictionary.view_dataset') # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_gloss_csv.html", + {'import_csv_form': csv_form}, ) @login_required -@permission_required("dictionary.import_csv") +@permission_required('dictionary.import_csv') def confirm_import_gloss_csv(request): """This view adds the data to database if the user confirms the action""" - if request.method == "POST": - if "cancel" in request.POST: + if request.method == 'POST': + if 'cancel' in request.POST: # If user cancels adding data, flush session variables - if "dataset_id" in request.session: - del request.session["dataset_id"] - if "glosses_new" in request.session: - del request.session["glosses_new"] + if 'dataset_id' in request.session: del request.session['dataset_id'] + if 'glosses_new' in request.session: del request.session['glosses_new'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.WARNING, _("Cancelled adding CSV data.") - ) - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + messages.add_message(request, messages.WARNING, _('Cancelled adding CSV data.')) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) - elif "confirm" in request.POST: + elif 'confirm' in request.POST: glosses_added = [] dataset = None - if "glosses_new" and "dataset_id" in request.session: - dataset = Dataset.objects.get(id=request.session["dataset_id"]) - for gloss in request.session["glosses_new"]: + if 'glosses_new' and 'dataset_id' in request.session: + dataset = Dataset.objects.get(id=request.session['dataset_id']) + for gloss in request.session['glosses_new']: # If the Gloss does not already exist, continue adding. 
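# NOTE: the exists()-then-save() sequence below leaves a small race window
# between the check and the insert if two imports run concurrently.
# Django's get_or_create() is the usual single-call idiom; a sketch only,
# with `row` standing in for the CSV row (the real loop also handles an
# optional idgloss_mi column, which could ride along in defaults):
#
#     gloss, created = Gloss.objects.get_or_create(
#         dataset=dataset,
#         idgloss=row[0],
#         defaults={"created_by": request.user, "updated_by": request.user},
#     )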
- if not Gloss.objects.filter( - dataset=dataset, idgloss=gloss[0] - ).exists(): + if not Gloss.objects.filter(dataset=dataset, idgloss=gloss[0]).exists(): try: - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - idgloss_mi=gloss[1], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], idgloss_mi=gloss[1], + created_by=request.user, updated_by=request.user) except IndexError: # If we get IndexError, idgloss_mi was probably not provided - new_gloss = Gloss( - dataset=dataset, - idgloss=gloss[0], - created_by=request.user, - updated_by=request.user, - ) + new_gloss = Gloss(dataset=dataset, idgloss=gloss[0], + created_by=request.user, updated_by=request.user) new_gloss.save() glosses_added.append((new_gloss.idgloss, new_gloss.idgloss_mi)) # Flush request.session['glosses_new'] and request.session['dataset'] - del request.session["glosses_new"] - del request.session["dataset_id"] + del request.session['glosses_new'] + del request.session['dataset_id'] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - return render( - request, - "dictionary/import_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset.name}, - ) + messages.add_message(request, messages.SUCCESS, _('Glosses were added successfully.')) + return render(request, "dictionary/import_gloss_csv_confirmation.html", {'glosses_added': glosses_added, + 'dataset': dataset.name}) else: - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) else: # If request method is not POST, redirect to the import form - return HttpResponseRedirect(reverse("dictionary:import_gloss_csv")) + return HttpResponseRedirect(reverse('dictionary:import_gloss_csv')) share_csv_header_list = [ @@ -262,32 +191,20 @@ def import_nzsl_share_gloss_csv(request): csv_form = CSVUploadForm() allowed_datasets = get_objects_for_user(request.user, "dictionary.view_dataset") # Make sure we only list datasets the user has permissions to. - csv_form.fields["dataset"].queryset = csv_form.fields[ - "dataset" - ].queryset.filter(id__in=[x.id for x in allowed_datasets]) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": csv_form}, - ) + csv_form.fields["dataset"].queryset = csv_form.fields["dataset"].queryset.filter( + id__in=[x.id for x in allowed_datasets]) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": csv_form}, ) form = CSVUploadForm(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." 
- ), - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": form}, ) new_glosses = [] dataset = form.cleaned_data["dataset"] @@ -301,7 +218,7 @@ def import_nzsl_share_gloss_csv(request): codecs.iterdecode(form.cleaned_data["file"], "utf-8"), fieldnames=share_csv_header_list, delimiter=",", - quotechar='"', + quotechar='"' ) skipped_existing_glosses = [] @@ -337,40 +254,29 @@ def import_nzsl_share_gloss_csv(request): request.session.pop("dataset_id", None) request.session.pop("glosses_new", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_nzsl_share_gloss_csv.html", - {"import_csv_form": CSVUploadForm()}, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv.html", + {"import_csv_form": CSVUploadForm()}, ) # Store dataset's id and the list of glosses to be added in session. request.session["dataset_id"] = dataset.id request.session["glosses_new"] = new_glosses - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - { - "glosses_new": new_glosses, - "dataset": dataset, - "skipped_existing_glosses": skipped_existing_glosses, - }, - ) + return render(request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_new": new_glosses, + "dataset": dataset, + "skipped_existing_glosses": skipped_existing_glosses + }) def update_retrieval_videos(videos, gloss_data): - """prep videos, illustrations and usage example for video retrieval""" + """ prep videos, illustrations and usage example for video retrieval """ gloss_pk = gloss_data["gloss"].pk gloss_word = gloss_data["word"] @@ -378,14 +284,16 @@ def update_retrieval_videos(videos, gloss_data): if gloss_data.get("videos", None): video_url = gloss_data["videos"] extension = video_url[-3:] - file_name = f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + file_name = ( + f"{gloss_pk}-{gloss_word}.{gloss_pk}_video.{extension}" + ) glossvideo = { "url": video_url, "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": 0, + "version": 0 } videos.append(glossvideo) @@ -401,7 +309,7 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": "main", - "version": i, + "version": i } videos.append(glossvideo) @@ -417,18 +325,14 @@ def update_retrieval_videos(videos, gloss_data): "file_name": file_name, "gloss_pk": gloss_pk, "video_type": f"finalexample{i + 1}", - "version": i, + "version": i } videos.append(glossvideo) - @login_required @permission_required("dictionary.import_csv") @transaction.atomic() def confirm_import_nzsl_share_gloss_csv(request): - - pprint(request.session.__dict__) - """This 
view adds the data to database if the user confirms the action""" if not request.method == "POST": # If request method is not POST, redirect to the import form @@ -444,31 +348,6 @@ def confirm_import_nzsl_share_gloss_csv(request): elif not "confirm" in request.POST: return HttpResponseRedirect(reverse("dictionary:import_nzsl_share_gloss_csv")) - if "glosses_new" and "dataset_id" in request.session: - [glosses_added, dataset_name] = confirm_import_nzsl_share_gloss_csv_inner( - request.session["glosses_new"], request.session["dataset_id"] - ) - - del request.session["glosses_new"] - del request.session["dataset_id"] - - # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("Glosses were added successfully.") - ) - - return render( - request, - "dictionary/import_nzsl_share_gloss_csv_confirmation.html", - {"glosses_added": glosses_added, "dataset": dataset_name}, - ) - - -def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_dataset_id): - """Does the thing""" - - print("IN CONFIRM INNER") - glosses_added = [] dataset = None translations = [] @@ -483,49 +362,49 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas bulk_share_validation_aggregations = [] video_import_only_glosses_data = [] - dataset = Dataset.objects.get(id=session_dataset_id) - language_en = Language.objects.get(name="English") - language_mi = Language.objects.get(name="Māori") - gloss_content_type = ContentType.objects.get_for_model(Gloss) - site = Site.objects.get_current() - comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) - semantic_fields = FieldChoice.objects.filter(field="semantic_field").values_list( - "english_name", "pk" - ) - semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} - signers = FieldChoice.objects.filter(field="signer") - signer_dict = {signer.english_name: signer for signer in signers} - existing_machine_values = [ - mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) - ] - not_public_tag = Tag.objects.get(name="not public") - nzsl_share_tag = Tag.objects.get(name="nzsl-share") - import_user = User.objects.get( - username="nzsl_share_importer", - first_name="Importer", - last_name="NZSL Share", - ) + if "glosses_new" and "dataset_id" in request.session: + dataset = Dataset.objects.get(id=request.session["dataset_id"]) + language_en = Language.objects.get(name="English") + language_mi = Language.objects.get(name="Māori") + gloss_content_type = ContentType.objects.get_for_model(Gloss) + site = Site.objects.get_current() + comment_submit_date = datetime.datetime.now(tz=get_current_timezone()) + semantic_fields = FieldChoice.objects.filter( + field="semantic_field" + ).values_list("english_name", "pk") + semantic_fields_dict = {field[0]: field[1] for field in semantic_fields} + signers = FieldChoice.objects.filter(field="signer") + signer_dict = {signer.english_name: signer for signer in signers} + existing_machine_values = [ + mv for mv in FieldChoice.objects.all().values_list("machine_value", flat=True) + ] + not_public_tag = Tag.objects.get(name="not public") + nzsl_share_tag = Tag.objects.get(name="nzsl-share") + import_user = User.objects.get( + username="nzsl_share_importer", + first_name="Importer", + last_name="NZSL Share", + ) + + for row_num, gloss_data in enumerate(request.session["glosses_new"]): + # will iterate over these glosses again after bulk creating + # and to ensure we get the correct gloss_data for 
words that appear multiple + # times we'll use the row_num as the identifier for the gloss data + + # if the gloss already exists at this point, it can only mean that + # it has no videos and we want to import videos for it + # try-except saves us a db call + try: + gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() + gloss_data_copy = gloss_data.copy() + gloss_data_copy["gloss"] = gloss + video_import_only_glosses_data.append(gloss_data_copy) + continue + except Gloss.DoesNotExist: + pass - for row_num, gloss_data in enumerate(session_glosses_new): - # will iterate over these glosses again after bulk creating - # and to ensure we get the correct gloss_data for words that appear multiple - # times we'll use the row_num as the identifier for the gloss data - - # if the gloss already exists at this point, it can only mean that - # it has no videos and we want to import videos for it - # try-except saves us a db call - try: - gloss = Gloss.objects.filter(nzsl_share_id=gloss_data["id"]).get() - gloss_data_copy = gloss_data.copy() - gloss_data_copy["gloss"] = gloss - video_import_only_glosses_data.append(gloss_data_copy) - continue - except Gloss.DoesNotExist: - pass - - new_glosses[str(row_num)] = gloss_data - bulk_create_gloss.append( - Gloss( + new_glosses[str(row_num)] = gloss_data + bulk_create_gloss.append(Gloss( dataset=dataset, nzsl_share_id=gloss_data["id"], # need to make idgloss unique in dataset, @@ -536,174 +415,183 @@ def confirm_import_nzsl_share_gloss_csv_inner(session_glosses_new, session_datas created_by=import_user, updated_by=import_user, exclude_from_ecv=True, - ) - ) - contributors.append(gloss_data["contributor_username"]) - - bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) - - # Create new signers for contributors that do not exist as signers yet - contributors = set(contributors) - create_signers = [] - signers = signer_dict.keys() - for contributor in contributors: - if contributor not in signers: - new_machine_value = random.randint(0, 99999999) - while new_machine_value in existing_machine_values: + )) + contributors.append(gloss_data["contributor_username"]) + + bulk_created = Gloss.objects.bulk_create(bulk_create_gloss) + + # Create new signers for contributors that do not exist as signers yet + contributors = set(contributors) + create_signers = [] + signers = signer_dict.keys() + for contributor in contributors: + if contributor not in signers: new_machine_value = random.randint(0, 99999999) - existing_machine_values.append(new_machine_value) - create_signers.append( - FieldChoice( + while new_machine_value in existing_machine_values: + new_machine_value = random.randint(0, 99999999) + existing_machine_values.append(new_machine_value) + create_signers.append(FieldChoice( field="signer", english_name=contributor, - machine_value=new_machine_value, - ) - ) - new_signers = FieldChoice.objects.bulk_create(create_signers) - for signer in new_signers: - signer_dict[signer.english_name] = signer - - for gloss in bulk_created: - word_en, row_num = gloss.idgloss.split("_row") - gloss_data = new_glosses[row_num] - gloss_data["gloss"] = gloss - - # get semantic fields for gloss_data topics - if gloss_data.get("topic_names", None): - gloss_topics = gloss_data["topic_names"].split("|") - # ignore all signs and All signs - cleaned_gloss_topics = [ - x for x in gloss_topics if x not in ["all signs", "All signs"] - ] - add_miscellaneous = False - - for topic in cleaned_gloss_topics: - if topic in semantic_fields_dict.keys(): + 
machine_value=new_machine_value + )) + new_signers = FieldChoice.objects.bulk_create(create_signers) + for signer in new_signers: + signer_dict[signer.english_name] = signer + + for gloss in bulk_created: + word_en, row_num = gloss.idgloss.split("_row") + gloss_data = new_glosses[row_num] + gloss_data["gloss"] = gloss + + # get semantic fields for gloss_data topics + if gloss_data.get("topic_names", None): + gloss_topics = gloss_data["topic_names"].split("|") + # ignore all signs and All signs + cleaned_gloss_topics = [ + x for x in gloss_topics if x not in ["all signs", "All signs"] + ] + add_miscellaneous = False + + for topic in cleaned_gloss_topics: + if topic in semantic_fields_dict.keys(): + bulk_semantic_fields.append( + Gloss.semantic_field.through( + gloss_id=gloss.id, + fieldchoice_id=semantic_fields_dict[topic] + ) + ) + else: + # add the miscellaneous semantic field if a topic does not exist + add_miscellaneous = True + + if add_miscellaneous: bulk_semantic_fields.append( Gloss.semantic_field.through( gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict[topic], + fieldchoice_id=semantic_fields_dict["Miscellaneous"] ) ) - else: - # add the miscellaneous semantic field if a topic does not exist - add_miscellaneous = True - - if add_miscellaneous: - bulk_semantic_fields.append( - Gloss.semantic_field.through( - gloss_id=gloss.id, - fieldchoice_id=semantic_fields_dict["Miscellaneous"], - ) - ) - # create GlossTranslations for english and maori words - translations.append( - GlossTranslations( + # create GlossTranslations for english and maori words + translations.append(GlossTranslations( gloss=gloss, language=language_en, translations=gloss_data["word"], - translations_secondary=gloss_data.get("secondary", None), - ) - ) - if gloss_data.get("maori", None): - # There is potentially several comma separated maori words - maori_words = gloss_data["maori"].split(", ") - - # Update idgloss_mi using first maori word, then create translation - gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" - - translation = GlossTranslations( - gloss=gloss, language=language_mi, translations=maori_words[0] - ) - if len(maori_words) > 1: - translation.translations_secondary = ", ".join(maori_words[1:]) + translations_secondary=gloss_data.get("secondary", None) + )) + if gloss_data.get("maori", None): + # There is potentially several comma separated maori words + maori_words = gloss_data["maori"].split(", ") + + # Update idgloss_mi using first maori word, then create translation + gloss.idgloss_mi = f"{maori_words[0]}:{gloss.pk}" + + translation = GlossTranslations( + gloss=gloss, + language=language_mi, + translations=maori_words[0] + ) + if len(maori_words) > 1: + translation.translations_secondary = ", ".join(maori_words[1:]) - translations.append(translation) + translations.append(translation) - # Prepare new idgloss and signer fields for bulk update - gloss.idgloss = f"{word_en}:{gloss.pk}" - gloss.signer = signer_dict[gloss_data["contributor_username"]] - bulk_update_glosses.append(gloss) + # Prepare new idgloss and signer fields for bulk update + gloss.idgloss = f"{word_en}:{gloss.pk}" + gloss.signer = signer_dict[gloss_data["contributor_username"]] + bulk_update_glosses.append(gloss) - # Create comment for gloss_data notes - comments.append( - Comment( + # Create comment for gloss_data notes + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=gloss_data.get("contributor_username", ""), comment=gloss_data.get("notes", ""), site=site, is_public=False, - 
submit_date=comment_submit_date, - ) - ) - if gloss_data.get("sign_comments", None): - # create Comments for all gloss_data sign_comments - for comment in gloss_data["sign_comments"].split("|"): - try: - comment_content = comment.split(":") - user_name = comment_content[0] - comment_content = comment_content[1] - except IndexError: - comment_content = comment - user_name = "Unknown" - comments.append( - Comment( + submit_date=comment_submit_date + )) + if gloss_data.get("sign_comments", None): + # create Comments for all gloss_data sign_comments + for comment in gloss_data["sign_comments"].split("|"): + try: + comment_content = comment.split(":") + user_name = comment_content[0] + comment_content = comment_content[1] + except IndexError: + comment_content = comment + user_name = "Unknown" + comments.append(Comment( content_type=gloss_content_type, object_pk=gloss.pk, user_name=user_name, comment=comment_content, site=site, is_public=False, - submit_date=comment_submit_date, - ) - ) + submit_date=comment_submit_date + )) - # Add ShareValidationAggregation - bulk_share_validation_aggregations.append( - ShareValidationAggregation( + # Add ShareValidationAggregation + bulk_share_validation_aggregations.append(ShareValidationAggregation( gloss=gloss, agrees=int(gloss_data["agrees"]), - disagrees=int(gloss_data["disagrees"]), - ) - ) + disagrees=int(gloss_data["disagrees"]) + )) - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, gloss_data) + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, gloss_data) - glosses_added.append(gloss) + glosses_added.append(gloss) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=nzsl_share_tag - ) - ) - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, object_id=gloss.pk, tag=not_public_tag - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=nzsl_share_tag - # Bulk create entities related to the gloss, and bulk update the glosses' idgloss - Comment.objects.bulk_create(comments) - GlossTranslations.objects.bulk_create(translations) - Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) - Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) - TaggedItem.objects.bulk_create(bulk_tagged_items) - ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + )) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss.pk, + tag=not_public_tag - # Add the video-update only glosses - for video_import_gloss_data in video_import_only_glosses_data: - # prep videos, illustrations and usage example for video retrieval - update_retrieval_videos(videos, video_import_gloss_data) - glosses_added.append(video_import_gloss_data["gloss"]) + )) - # start Thread to process gloss video retrieval in the background - t = threading.Thread(target=retrieve_videos_for_glosses, args=[videos], daemon=True) - t.start() + # Bulk create entities related to the gloss, and bulk update the glosses' idgloss + Comment.objects.bulk_create(comments) + GlossTranslations.objects.bulk_create(translations) + Gloss.objects.bulk_update(bulk_update_glosses, ["idgloss", "idgloss_mi", "signer"]) + Gloss.semantic_field.through.objects.bulk_create(bulk_semantic_fields) + TaggedItem.objects.bulk_create(bulk_tagged_items) + 
ShareValidationAggregation.objects.bulk_create(bulk_share_validation_aggregations) + + # Add the video-update only glosses + for video_import_gloss_data in video_import_only_glosses_data: + # prep videos, illustrations and usage example for video retrieval + update_retrieval_videos(videos, video_import_gloss_data) + glosses_added.append(video_import_gloss_data["gloss"]) + + # start Thread to process gloss video retrieval in the background + t = threading.Thread( + target=retrieve_videos_for_glosses, + args=[videos], + daemon=True + ) + t.start() + + del request.session["glosses_new"] + del request.session["dataset_id"] + + # Set a message to be shown so that the user knows what is going on. + messages.add_message(request, messages.SUCCESS, _("Glosses were added successfully.")) + return render( + request, "dictionary/import_nzsl_share_gloss_csv_confirmation.html", + { + "glosses_added": glosses_added, + "dataset": dataset.name + } + ) - return [glosses_added, dataset.name] @login_required @@ -720,29 +608,18 @@ def import_qualtrics_csv(request): if not request.method == "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": form}, ) validation_records = [] skipped_rows = [] @@ -750,7 +627,7 @@ def import_qualtrics_csv(request): validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8"), delimiter=",", - quotechar='"', + quotechar='"' ) question_numbers = [] @@ -792,33 +669,22 @@ def import_qualtrics_csv(request): request.session.pop("question_numbers", None) request.session.pop("question_gloss_map", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_qualtrics_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_qualtrics_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
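# NOTE: the retrieval thread started in confirm_import_nzsl_share_gloss_csv
# above is a daemon thread launched inside a @transaction.atomic() view.
# Daemon threads are killed abruptly when the process exits, so in-flight
# downloads can be lost on worker shutdown, and a thread that starts before
# the transaction commits may not yet see the glosses created in this
# request. One common remedy (a sketch, not part of this patch) is to defer
# the kickoff until after commit:
#
#     from django.db import transaction
#
#     transaction.on_commit(
#         lambda: threading.Thread(
#             target=retrieve_videos_for_glosses, args=[videos], daemon=True
#         ).start()
#     )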
request.session["validation_records"] = validation_records request.session["question_numbers"] = question_numbers request.session["question_glossvideo_map"] = question_to_glossvideo_map - return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", - {"validation_records": validation_records, "skipped_rows": skipped_rows}, - ) + return render(request, "dictionary/import_qualtrics_csv_confirmation.html", + {"validation_records": validation_records, "skipped_rows": skipped_rows}) @login_required @@ -848,21 +714,13 @@ def confirm_import_qualtrics_csv(request): bulk_tagged_items = [] gloss_pks = set() - if ( - "validation_records" - and "question_numbers" - and "question_glossvideo_map" in request.session - ): + if "validation_records" and "question_numbers" and "question_glossvideo_map" in request.session: # Retrieve glosses glossvideo_pk_list = request.session["question_glossvideo_map"].values() - glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk( - glossvideo_pk_list - ) + glossvideo_dict = GlossVideo.objects.select_related("gloss").in_bulk(glossvideo_pk_list) gloss_content_type = ContentType.objects.get_for_model(Gloss) check_result_tag = Tag.objects.get(name=settings.TAG_VALIDATION_CHECK_RESULTS) - ready_for_validation_tag = Tag.objects.get( - name=settings.TAG_READY_FOR_VALIDATION - ) + ready_for_validation_tag = Tag.objects.get(name=settings.TAG_READY_FOR_VALIDATION) questions_numbers = request.session["question_numbers"] question_glossvideo_map = request.session["question_glossvideo_map"] @@ -882,43 +740,35 @@ def confirm_import_qualtrics_csv(request): sign_seen = ValidationRecord.SignSeenChoices.NOT_SURE.value try: - gloss = glossvideo_dict[ - question_glossvideo_map[question_number] - ].gloss - validation_records_added.append( - ValidationRecord( - gloss=gloss, - sign_seen=ValidationRecord.SignSeenChoices(sign_seen), - response_id=response_id, - respondent_first_name=respondent_first_name, - respondent_last_name=respondent_last_name, - comment=record.get(f"{question_number}_Q2_5_TEXT", ""), - ) - ) + gloss = glossvideo_dict[question_glossvideo_map[question_number]].gloss + validation_records_added.append(ValidationRecord( + gloss=gloss, + sign_seen=ValidationRecord.SignSeenChoices(sign_seen), + response_id=response_id, + respondent_first_name=respondent_first_name, + respondent_last_name=respondent_last_name, + comment=record.get(f"{question_number}_Q2_5_TEXT", ""), + )) gloss_pks.add(gloss.pk) except KeyError: - missing_gloss_pk_question_pairs[question_number] = ( - question_glossvideo_map[question_number] - ) + missing_gloss_pk_question_pairs[question_number] = question_glossvideo_map[ + question_number] for gloss_pk in gloss_pks: - bulk_tagged_items.append( - TaggedItem( - content_type=gloss_content_type, - object_id=gloss_pk, - tag=check_result_tag, - ) - ) + bulk_tagged_items.append(TaggedItem( + content_type=gloss_content_type, + object_id=gloss_pk, + tag=check_result_tag + + )) # ignoring conflicts so the unique together on the model filters out potential duplicates - ValidationRecord.objects.bulk_create( - validation_records_added, ignore_conflicts=True - ) + ValidationRecord.objects.bulk_create(validation_records_added, ignore_conflicts=True) TaggedItem.objects.bulk_create(bulk_tagged_items, ignore_conflicts=True) TaggedItem.objects.filter( content_type=gloss_content_type, object_id__in=gloss_pks, - tag=ready_for_validation_tag, + tag=ready_for_validation_tag ).delete() del request.session["validation_records"] @@ -926,19 +776,17 @@ def 
confirm_import_qualtrics_csv(request): del request.session["question_glossvideo_map"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_qualtrics_csv_confirmation.html", + request, "dictionary/import_qualtrics_csv_confirmation.html", { "validation_records_added": validation_records_added, "validation_record_count": len(validation_records_added), "responses_count": len(validation_records), "gloss_count": len(gloss_pks), - "missing_gloss_question_pairs": missing_gloss_pk_question_pairs, - }, + "missing_gloss_question_pairs": missing_gloss_pk_question_pairs + } ) @@ -967,29 +815,18 @@ def import_manual_validation(request): if request.method != "POST": # If request type is not POST, return to the original form. csv_form = CSVFileOnlyUpload() - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": csv_form}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": csv_form}, ) form = CSVFileOnlyUpload(request.POST, request.FILES) if not form.is_valid(): # If form is not valid, set a error message and return to the original form. - messages.add_message( - request, - messages.ERROR, - _( - "The provided CSV-file does not meet the requirements " - "or there is some other problem." - ), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": form}, - ) + messages.add_message(request, messages.ERROR, + _("The provided CSV-file does not meet the requirements " + "or there is some other problem.")) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": form}, ) group_row_map = defaultdict(list) group_gloss_count = defaultdict(int) @@ -1000,38 +837,29 @@ def import_manual_validation(request): "yes", "no", "abstain or not sure", - "comments", + "comments" ] try: validation_record_reader = csv.DictReader( codecs.iterdecode(form.cleaned_data["file"], "utf-8-sig"), delimiter=",", - quotechar='"', - ) - missing_headers = set(required_headers) - set( - validation_record_reader.fieldnames + quotechar='"' ) + missing_headers = set(required_headers) - set(validation_record_reader.fieldnames) if missing_headers != set(): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. 
- messages.add_message( - request, - messages.ERROR, - _(f"CSV is missing required columns: {missing_headers}"), - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, + _(f"CSV is missing required columns: {missing_headers}")) + return render(request, + "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) for row in validation_record_reader: if validation_record_reader.line_num == 1: continue - _check_row_can_be_converted_to_integer( - row, ["yes", "no", "abstain or not sure"] - ) + _check_row_can_be_converted_to_integer(row, ["yes", "no", "abstain or not sure"]) group_row_map[row["group"]].append(row) group_gloss_count[row["group"]] += 1 glosses.append(row["idgloss"].split(":")[1]) @@ -1040,49 +868,35 @@ def import_manual_validation(request): request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("File contains non-compliant data:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("File contains non-compliant data:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except csv.Error as e: # Can't open file, remove session variables request.session.pop("group_row_map", None) request.session.pop("glosses", None) # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.ERROR, _("Cannot open the file:" + str(e)) - ) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + messages.add_message(request, messages.ERROR, _("Cannot open the file:" + str(e))) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) except UnicodeDecodeError as e: # File is not UTF-8 encoded. messages.add_message(request, messages.ERROR, _("File must be UTF-8 encoded!")) - return render( - request, - "dictionary/import_manual_validation_csv.html", - {"import_csv_form": CSVFileOnlyUpload()}, - ) + return render(request, "dictionary/import_manual_validation_csv.html", + {"import_csv_form": CSVFileOnlyUpload()}, ) # Store dataset's id and the list of glosses to be added in session. 
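     # (group_row_map is a defaultdict(list) here; the JSON session serializer
     # writes it out as a plain dict, so the confirm view reads back an
     # ordinary dict.)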
request.session["group_row_map"] = group_row_map request.session["glosses"] = list(set(glosses)) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { # iterating over defaultdicts causes issues in template rendering "group_row_map": dict(group_row_map), - "group_gloss_count": dict(group_gloss_count), - }, + "group_gloss_count": dict(group_gloss_count) + } ) @@ -1126,18 +940,14 @@ def confirm_import_manual_validation(request): sign_seen_no = row["no"] sign_seen_not_sure = row["abstain or not sure"] comments = row["comments"] - manual_validation_aggregations.append( - ManualValidationAggregation( - gloss=gloss, - group=group, - sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, - sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, - sign_seen_not_sure=( - int(sign_seen_not_sure) if sign_seen_not_sure else 0 - ), - comments=comments, - ) - ) + manual_validation_aggregations.append(ManualValidationAggregation( + gloss=gloss, + group=group, + sign_seen_yes=int(sign_seen_yes) if sign_seen_yes else 0, + sign_seen_no=int(sign_seen_no) if sign_seen_no else 0, + sign_seen_not_sure=int(sign_seen_not_sure) if sign_seen_not_sure else 0, + comments=comments + )) ManualValidationAggregation.objects.bulk_create(manual_validation_aggregations) @@ -1145,15 +955,13 @@ def confirm_import_manual_validation(request): del request.session["glosses"] # Set a message to be shown so that the user knows what is going on. - messages.add_message( - request, messages.SUCCESS, _("ValidationRecords were added successfully.") - ) + messages.add_message(request, messages.SUCCESS, + _("ValidationRecords were added successfully.")) return render( - request, - "dictionary/import_manual_validation_csv_confirmation.html", + request, "dictionary/import_manual_validation_csv_confirmation.html", { "manual_validation_aggregations": manual_validation_aggregations, "manual_validation_aggregations_count": len(manual_validation_aggregations), - "missing_glosses": missing_glosses, - }, + "missing_glosses": missing_glosses + } ) From 95c31790487ce6e7083bffbb0c363cb936c1c1d1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:45:57 +1100 Subject: [PATCH 153/222] More experimental client code --- bin/get-video-s3-acls.py | 81 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 7625239a..60199b1c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -14,6 +14,8 @@ from time import sleep from pprint import pprint import boto3 +import copy +import csv parser = argparse.ArgumentParser( @@ -57,6 +59,7 @@ get_wsgi_application() + from django.contrib.auth.models import Permission from django.contrib.auth import get_user_model User = get_user_model() @@ -71,6 +74,11 @@ ShareValidationAggregation, ValidationRecord, ) + from signbank.video.models import GlossVideo + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError # Globals CSV_DELIMITER = "," @@ -366,16 +374,83 @@ def do_tests(): if args.env != "dev": print("Error: tests must be in 'dev' environment") exit() - print(f"DATABASE_URL:{DATABASE_URL}") if DATABASE_URL.find("@localhost") < 0: print("Error: database url must contain '@localhost'") exit() + 
print(f"DATABASE_URL:{DATABASE_URL}") print("Running tests") - s3 = boto3.client("s3") + #s3 = boto3.client("s3") # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) # get_nzsl_raw_keys_dict() - pprint(Gloss.objects.all()) + # pprint(Gloss.objects.all()) + + # This is a cut and paste of the mock tests, but we're doing it "live" on dev + _csv_content = { + "id": "111", + "word": "Test", + "maori": "maori, maori 2", + "secondary": "test", + "notes": "a note", + "created_at": "2023-09-12 22:37:59 UTC", + "contributor_email": "ops@ackama.com", + "contributor_username": "Ackama Ops", + "agrees": "0", + "disagrees": "1", + "topic_names": "Test Topic|Test", + "videos": "/VID_20170815_153446275.mp4", + "illustrations": "/kiwifruit-2-6422.png", + "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", + "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), + } + file_name = "test.csv" + csv_content = [copy.deepcopy(_csv_content)] + csv_content[0]["id"] = "12345" + with open(file_name, "w") as file: + writer = csv.writer(file) + writer.writerow(csv_content[0].keys()) + for row in csv_content: + writer.writerow(row.values()) + data = open(file_name, "rb") + file = SimpleUploadedFile( + content=data.read(), name=data.name, content_type="content/multipart" + ) + dataset = Dataset.objects.get(name="NZSL") + + try: + Gloss.objects.get(idgloss="Share:11").delete() + except ValueError: + pass + Gloss.objects.create( + dataset=dataset, + idgloss="Share:11", + nzsl_share_id="12345", + ) + + # Create user and add permissions + try: + user = User.objects.create_user(username="test", email=None, password="test") + csv_permission = Permission.objects.get(codename='import_csv') + user.user_permissions.add(csv_permission) + except IntegrityError: + user = User.objects.get(username="test") + + # Create client with change_gloss permission. + client = Client() + client.force_login(user) + s = client.session + s.update({ + "dataset_id": dataset.pk, + "glosses_new": csv_content + }) + s.save() + pprint("CLIENT SESSION") + pprint(client.session.items()) + response = client.post( + reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), + {"confirm": True} + ) + pprint(response.__dict__) # From the keys present in NZSL, get all their S3 information From 11dffb62cab3f3aaa2805083e6db307d225e58f9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:06:47 +1100 Subject: [PATCH 154/222] Forking video tests away from ACL script --- bin/test-videos-s3.py | 478 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100755 bin/test-videos-s3.py diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py new file mode 100755 index 00000000..9dbc6359 --- /dev/null +++ b/bin/test-videos-s3.py @@ -0,0 +1,478 @@ +#!/usr/bin/env -S python3 -u +# Bang line above passes '-u' to python, for unbuffered output +# Permissions required: +# psql - access to heroku app's postgres +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html + +import os +import sys +import subprocess +import argparse +import re +from time import sleep +from pprint import pprint +import boto3 +import copy +import csv + + +parser = argparse.ArgumentParser( + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +parser.add_argument( + "--tests", + action="store_true", + default=False, + required=False, + help="Run remote tests instead of generating CSV output", +) + +args = parser.parse_args() + + +if args.tests: + # Magic required to allow this script to use Signbank Django classes + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + +# Globals +CSV_DELIMITER = "," +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + 
gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +# Cases +# In S3 In NZSL Action +# Is Not Delete S3 Object +# Is Is Update ACL +# Not Is Review +def get_recommended_action(key_in_nzsl, key_in_s3): + if key_in_s3: + if key_in_nzsl: + return "Update ACL" + else: + return "Delete S3 Object" + return "Review" + + +# Get S3 object's ACL +def get_s3_canned_acl(video_key): + result = aws_cli( + [ + "s3api", + "get-object-acl", + "--output", + "text", + "--query", + "Grants[*].Permission", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + acls_grants = result.stdout.strip().split("\t") + + if len(acls_grants) > 1: + if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": + return "public-read" + elif acls_grants[0] == "FULL_CONTROL": + return "private" + + return "unknown" + + +# Get S3 object's LastModified date/time +def get_s3_lastmodified(video_key): + result = aws_cli( + [ + "s3api", + "head-object", + "--output", + "text", + "--query", + "LastModified", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + return result.stdout.strip() + + +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "Action", + "S3 Video key", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Sbank Gloss", + "Sbank Gloss created at", + ] + ) + + +def build_csv_row( + video_key, + key_in_nzsl=False, + key_in_s3=False, + gloss_idgloss=None, + gloss_created_at=None, + gloss_id=None, + 
video_id=None, + gloss_public=False, + video_public=False, +): + # See signbank/video/models.py, line 59, function set_public_acl() + canned_acl_expected = "" + if key_in_nzsl: + canned_acl_expected = "public-read" if video_public else "private" + + lastmodified = "" + canned_acl = "" + if key_in_s3: + lastmodified = get_s3_lastmodified(video_key) + canned_acl = get_s3_canned_acl(video_key) + + action = get_recommended_action(key_in_nzsl, key_in_s3) + + return CSV_DELIMITER.join( + [ + action, + f"{video_key}", + f"{lastmodified}", + f"{canned_acl_expected}", + f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", + f"{gloss_idgloss}", + f"{gloss_created_at}", + ] + ) + + +# Run some tests against the remote endpoints +# This is a test-harness for now +# Takes advantage of the fact we have a lot of setup infrastructure in this script already +def do_tests(): + # Debugging safety + if args.env != "dev": + print("Error: tests must be in 'dev' environment") + exit() + if DATABASE_URL.find("@localhost") < 0: + print("Error: database url must contain '@localhost'") + exit() + print(f"DATABASE_URL:{DATABASE_URL}") + + print("Running tests") + #s3 = boto3.client("s3") + # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) + # get_nzsl_raw_keys_dict() + # pprint(Gloss.objects.all()) + + # This is a cut and paste of the mock tests, but we're doing it "live" on dev + _csv_content = { + "id": "111", + "word": "Test", + "maori": "maori, maori 2", + "secondary": "test", + "notes": "a note", + "created_at": "2023-09-12 22:37:59 UTC", + "contributor_email": "ops@ackama.com", + "contributor_username": "Ackama Ops", + "agrees": "0", + "disagrees": "1", + "topic_names": "Test Topic|Test", + "videos": "/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBc2pFIiwiZXhwIjoiMjAyNC0xMS0wM1QyMzoyNzo1Ni4yNDNaIiwicHVyIjoiYmxvYl9pZCJ9fQ==--53448dc4efcf056e7ba7fe6b711d6b1ae551d171/Zimbabwe.mp4", + "illustrations": "/kiwifruit-2-6422.png", + "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", + "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), + } + file_name = "test.csv" + csv_content = [copy.deepcopy(_csv_content)] + csv_content[0]["id"] = "12345" + with open(file_name, "w") as file: + writer = csv.writer(file) + writer.writerow(csv_content[0].keys()) + for row in csv_content: + writer.writerow(row.values()) + data = open(file_name, "rb") + file = SimpleUploadedFile( + content=data.read(), name=data.name, content_type="content/multipart" + ) + dataset = Dataset.objects.get(name="NZSL") + + try: + Gloss.objects.get(idgloss="Share:11").delete() + except ValueError: + pass + Gloss.objects.create( + dataset=dataset, + idgloss="Share:11", + nzsl_share_id="12345", + ) + + # Create user and add permissions + try: + user = User.objects.create_user(username="test", email=None, password="test") + csv_permission = Permission.objects.get(codename='import_csv') + user.user_permissions.add(csv_permission) + except IntegrityError: + user = User.objects.get(username="test") + + # Create client with change_gloss permission. 
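+    # (Django test-Client gotcha: every access to client.session constructs a
+    # new SessionStore, so the session has to be captured in a variable,
+    # mutated, then save()d, as below, or the POST will not see
+    # dataset_id/glosses_new.)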
+ client = Client() + client.force_login(user) + s = client.session + s.update({ + "dataset_id": dataset.pk, + "glosses_new": csv_content + }) + s.save() + response = client.post( + reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), + {"confirm": True} + ) + + # test to see if we have to wait for thread + sleep(20) + + +# From the keys present in NZSL, get all their S3 information +def process_keys(this_all_keys_dict): + print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) + + print(build_csv_header()) + + for video_key, dict_row in this_all_keys_dict.items(): + print(build_csv_row(video_key, *dict_row)) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +if args.tests: + do_tests() + exit() + +process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) From 3f63e4206483066ec453f07a96919add098bb1dd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:17:50 +1100 Subject: [PATCH 155/222] Moved video tests out of this script --- bin/get-video-s3-acls.py | 131 --------------------------------------- 1 file changed, 131 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 60199b1c..30e2dca6 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -13,11 +13,9 @@ import re from time import sleep from pprint import pprint -import boto3 import copy import csv - parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
@@ -40,46 +38,8 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--tests", - action="store_true", - default=False, - required=False, - help="Run remote tests instead of generating CSV output", -) - args = parser.parse_args() - -if args.tests: - # Magic required to allow this script to use Signbank Django classes - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - # Globals CSV_DELIMITER = "," DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -366,93 +326,6 @@ def build_csv_row( ) -# Run some tests against the remote endpoints -# This is a test-harness for now -# Takes advantage of the fact we have a lot of setup infrastructure in this script already -def do_tests(): - # Debugging safety - if args.env != "dev": - print("Error: tests must be in 'dev' environment") - exit() - if DATABASE_URL.find("@localhost") < 0: - print("Error: database url must contain '@localhost'") - exit() - print(f"DATABASE_URL:{DATABASE_URL}") - - print("Running tests") - #s3 = boto3.client("s3") - # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) - # get_nzsl_raw_keys_dict() - # pprint(Gloss.objects.all()) - - # This is a cut and paste of the mock tests, but we're doing it "live" on dev - _csv_content = { - "id": "111", - "word": "Test", - "maori": "maori, maori 2", - "secondary": "test", - "notes": "a note", - "created_at": "2023-09-12 22:37:59 UTC", - "contributor_email": "ops@ackama.com", - "contributor_username": "Ackama Ops", - "agrees": "0", - "disagrees": "1", - "topic_names": "Test Topic|Test", - "videos": "/VID_20170815_153446275.mp4", - "illustrations": "/kiwifruit-2-6422.png", - "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", - "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), - } - file_name = "test.csv" - csv_content = [copy.deepcopy(_csv_content)] - csv_content[0]["id"] = "12345" - with open(file_name, "w") as file: - writer = csv.writer(file) - writer.writerow(csv_content[0].keys()) - for row in csv_content: - writer.writerow(row.values()) - data = open(file_name, "rb") - file = SimpleUploadedFile( - content=data.read(), name=data.name, content_type="content/multipart" - ) - dataset = Dataset.objects.get(name="NZSL") - - try: - Gloss.objects.get(idgloss="Share:11").delete() - except ValueError: - pass - Gloss.objects.create( - dataset=dataset, - idgloss="Share:11", - nzsl_share_id="12345", - ) - - # Create user and add permissions - try: - user = User.objects.create_user(username="test", email=None, password="test") - csv_permission = Permission.objects.get(codename='import_csv') - user.user_permissions.add(csv_permission) - except IntegrityError: - user = User.objects.get(username="test") - - # Create client with change_gloss permission. 
- client = Client() - client.force_login(user) - s = client.session - s.update({ - "dataset_id": dataset.pk, - "glosses_new": csv_content - }) - s.save() - pprint("CLIENT SESSION") - pprint(client.session.items()) - response = client.post( - reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), - {"confirm": True} - ) - pprint(response.__dict__) - - # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) @@ -469,10 +342,6 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.tests: - do_tests() - exit() - process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 68310617f5a88ed89da474ac7e6053eab92fca11 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:30:49 +1100 Subject: [PATCH 156/222] Minimum functionality --- bin/test-videos-s3.py | 331 +++++------------------------------------- 1 file changed, 34 insertions(+), 297 deletions(-) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index 9dbc6359..b145227a 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -17,14 +17,42 @@ import copy import csv +# Magic required to allow this script to use Signbank Django classes +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = get_user_model() + +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + description="You need to run this in a venv that has all the right Python site-packages. You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
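+# (The tests in this script refuse to run unless --env resolves to 'dev' and
+# DATABASE_URL contains '@localhost', hence the safer default that follows.)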
) parser.add_argument( "--env", - default="uat", + default="dev", required=False, help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", ) @@ -40,46 +68,9 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--tests", - action="store_true", - default=False, - required=False, - help="Run remote tests instead of generating CSV output", -) - args = parser.parse_args() -if args.tests: - # Magic required to allow this script to use Signbank Django classes - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - # Globals CSV_DELIMITER = "," DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -128,247 +119,7 @@ def aws_cli(args_list): return output -# Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(): - print( - f"Getting raw list of video file info from NZSL Signbank ...", - file=sys.stderr, - ) - this_nzsl_raw_keys_dict = {} - # Column renaming is for readability - # Special delimiter because columns might contain commas - result = pg_cli( - [ - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg " - "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - ] - ) - - # Separate the NZSL db columns - # Write them to a dictionary, so we can do fast operations - for rawl in result.stdout.split("\n"): - rawl = rawl.strip() - if not rawl: - continue - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - video_key, - ] = rawl.split("|") - - # This sets the initial field ordering in the all_keys dictionary row - this_nzsl_raw_keys_dict[video_key] = [ - gloss_idgloss.replace(CSV_DELIMITER, ""), - gloss_created_at, - gloss_id, - video_id, - gloss_public.lower() == "t", - video_public.lower() == "t", - ] - - print( - f"{len(this_nzsl_raw_keys_dict)} rows retrieved", - file=sys.stderr, - ) - - return this_nzsl_raw_keys_dict - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - -# Get the keys present and absent across NZSL 
Signbank and S3, to dictionary -def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): - print( - "Getting keys present and absent across NZSL Signbank and S3 ...", - file=sys.stderr, - ) - this_all_keys_dict = {} - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - dict_row = this_nzsl_raw_keys_dict.get(video_key, None) - if dict_row: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT - ] + dict_row - else: - this_all_keys_dict[video_key] = [ - False, # NZSL Absent - True, # S3 PRESENT - ] + [""] * 6 - - # Find NZSL keys that are absent from S3 (present handled above) - for video_key, dict_row in this_nzsl_raw_keys_dict.items(): - if video_key not in this_s3_bucket_raw_keys_list: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - False, # S3 Absent - ] + dict_row - - return this_all_keys_dict - - -# Cases -# In S3 In NZSL Action -# Is Not Delete S3 Object -# Is Is Update ACL -# Not Is Review -def get_recommended_action(key_in_nzsl, key_in_s3): - if key_in_s3: - if key_in_nzsl: - return "Update ACL" - else: - return "Delete S3 Object" - return "Review" - - -# Get S3 object's ACL -def get_s3_canned_acl(video_key): - result = aws_cli( - [ - "s3api", - "get-object-acl", - "--output", - "text", - "--query", - "Grants[*].Permission", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - acls_grants = result.stdout.strip().split("\t") - - if len(acls_grants) > 1: - if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": - return "public-read" - elif acls_grants[0] == "FULL_CONTROL": - return "private" - - return "unknown" - - -# Get S3 object's LastModified date/time -def get_s3_lastmodified(video_key): - result = aws_cli( - [ - "s3api", - "head-object", - "--output", - "text", - "--query", - "LastModified", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - return result.stdout.strip() - - -def build_csv_header(): - return CSV_DELIMITER.join( - [ - "Action", - "S3 Video key", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Sbank Gloss", - "Sbank Gloss created at", - ] - ) - - -def build_csv_row( - video_key, - key_in_nzsl=False, - key_in_s3=False, - gloss_idgloss=None, - gloss_created_at=None, - gloss_id=None, - video_id=None, - gloss_public=False, - video_public=False, -): - # See signbank/video/models.py, line 59, function set_public_acl() - canned_acl_expected = "" - if key_in_nzsl: - canned_acl_expected = "public-read" if video_public else "private" - - lastmodified = "" - canned_acl = "" - if key_in_s3: - lastmodified = get_s3_lastmodified(video_key) - canned_acl = get_s3_canned_acl(video_key) - - action = get_recommended_action(key_in_nzsl, key_in_s3) - - return CSV_DELIMITER.join( - [ - action, - f"{video_key}", - f"{lastmodified}", - f"{canned_acl_expected}", - f"{canned_acl}", - f"{gloss_id}", - f"{video_id}", - f"{gloss_public}", - f"{video_public}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - ] - ) - - # Run some tests against the remote endpoints -# This is a test-harness for now -# Takes advantage of the fact we have a lot of setup infrastructure in this script already def do_tests(): # Debugging safety if args.env != "dev": @@ -450,17 +201,9 @@ def do_tests(): ) # test to see if we have to wait for thread - sleep(20) - - -# From the keys present in NZSL, get all their S3 information -def 
process_keys(this_all_keys_dict): - print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) - - print(build_csv_header()) - - for video_key, dict_row in this_all_keys_dict.items(): - print(build_csv_row(video_key, *dict_row)) + X_SECONDS=20 + print(f"Sleeping {X_SECONDS} seconds to allow threads to complete ...") + sleep(X_SECONDS) print(f"Env: {args.env}", file=sys.stderr) @@ -469,10 +212,4 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.tests: - do_tests() - exit() - -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) +do_tests() From 19a08811e52d0fa299459ea3be4697b665ff45d7 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:13:51 +1100 Subject: [PATCH 157/222] Fake key to handle FULL JOIN absent video keys --- bin/get-video-s3-acls.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 30e2dca6..453a3229 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,9 +12,8 @@ import argparse import re from time import sleep +from uuid import uuid4 from pprint import pprint -import copy -import csv parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -38,10 +37,18 @@ required=False, help=f"AWS client path (default: %(default)s)", ) +parser.add_argument( + "--dumpnzsl", + default=False, + required=False, + action="store_true", + help=f"Dump raw NZSL database output", +) args = parser.parse_args() # Globals CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") AWSCLI = args.awscli PGCLI = args.pgcli @@ -88,6 +95,15 @@ def aws_cli(args_list): return output +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + # Get the video files info from NZSL Signbank def get_nzsl_raw_keys_dict(): print( @@ -130,6 +146,9 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + # This sets the initial field ordering in the all_keys dictionary row this_nzsl_raw_keys_dict[video_key] = [ gloss_idgloss.replace(CSV_DELIMITER, ""), @@ -212,6 +231,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): # Is Not Delete S3 Object # Is Is Update ACL # Not Is Review +# Other Review def get_recommended_action(key_in_nzsl, key_in_s3): if key_in_s3: if key_in_nzsl: @@ -312,7 +332,7 @@ def build_csv_row( return CSV_DELIMITER.join( [ action, - f"{video_key}", + f"{filter_fakekey(video_key)}", f"{lastmodified}", f"{canned_acl_expected}", f"{canned_acl}", @@ -342,6 +362,10 @@ def process_keys(this_all_keys_dict): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +if args.dumpnzsl: + pprint(get_nzsl_raw_keys_dict()) + exit() + process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 078b479cbe5389a5596f5cbbef322f8de8c3f337 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 
16:55:05 +1100 Subject: [PATCH 158/222] black --- bin/test-videos-s3.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index b145227a..c3fa9f89 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -131,7 +131,7 @@ def do_tests(): print(f"DATABASE_URL:{DATABASE_URL}") print("Running tests") - #s3 = boto3.client("s3") + # s3 = boto3.client("s3") # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) # get_nzsl_raw_keys_dict() # pprint(Gloss.objects.all()) @@ -181,7 +181,7 @@ def do_tests(): # Create user and add permissions try: user = User.objects.create_user(username="test", email=None, password="test") - csv_permission = Permission.objects.get(codename='import_csv') + csv_permission = Permission.objects.get(codename="import_csv") user.user_permissions.add(csv_permission) except IntegrityError: user = User.objects.get(username="test") @@ -190,18 +190,14 @@ def do_tests(): client = Client() client.force_login(user) s = client.session - s.update({ - "dataset_id": dataset.pk, - "glosses_new": csv_content - }) + s.update({"dataset_id": dataset.pk, "glosses_new": csv_content}) s.save() response = client.post( - reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), - {"confirm": True} + reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), {"confirm": True} ) # test to see if we have to wait for thread - X_SECONDS=20 + X_SECONDS = 20 print(f"Sleeping {X_SECONDS} seconds to allow threads to complete ...") sleep(X_SECONDS) From fa4689d6c04ba3970b9c0d4f2a859d06250676b5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 30 Oct 2024 16:57:07 +1100 Subject: [PATCH 159/222] Rearranging --- bin/test-videos-s3.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index c3fa9f89..918cd980 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -1,4 +1,5 @@ #!/usr/bin/env -S python3 -u +# You need to run this in a venv that has all the right Python site-packages. # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres @@ -30,6 +31,12 @@ User = get_user_model() +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError + + from signbank.dictionary.models import ( Dataset, FieldChoice, @@ -41,10 +48,6 @@ ValidationRecord, ) from signbank.video.models import GlossVideo -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError parser = argparse.ArgumentParser( description="You need to run this in a venv that has all the right Python site-packages. You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" From 0aa68e01b1f8c8b94e87e3d2e4fd2dd751af1f43 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Thu, 31 Oct 2024 15:56:49 +1100 Subject: [PATCH 160/222] S3 orphan resolution next pass --- bin/get-video-s3-acls.py | 118 +++++++++++++++++++++++++++++++++++++++ bin/test-videos-s3.py | 5 +- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 453a3229..dc291178 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -5,6 +5,8 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# For some commands you need to run this in a venv that has all the right Python site-packages. +# TODO Convert this script to a Django Management Command import os import sys @@ -15,6 +17,7 @@ from uuid import uuid4 from pprint import pprint + parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." @@ -44,8 +47,54 @@ action="store_true", help=f"Dump raw NZSL database output", ) +parser.add_argument( + "--pyenv", + default=False, + required=False, + action="store_true", + help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", +) +parser.add_argument( + "--orphans", + default=False, + required=False, + action="store_true", + help=f"Try to identify and match-up S3 orphans (requires --pyenv)", +) args = parser.parse_args() +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print ("Importing site-packages environment") + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + # Globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" @@ -355,6 +404,68 @@ def process_keys(this_all_keys_dict): for video_key, dict_row in this_all_keys_dict.items(): print(build_csv_row(video_key, *dict_row)) +def process_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + # NOTE This may actually be the wrong way around, we may want to go from + # orphaned S3 objects _back_ to glosses, but it depends on what Micky says + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record 
already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = Gloss.objects.get(id=gloss_id) + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # If these had S3 video candidates they should not have made it this far + # These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + gloss_name = gloss.idgloss.split(":")[0].strip() + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz + for test_key, [ key_nzsl_yes, key_s3_yes, *_ ] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + print(f"{gloss_id} {gloss.idgloss}") + print(test_key) + + + + + + print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) @@ -366,6 +477,13 @@ def process_keys(this_all_keys_dict): pprint(get_nzsl_raw_keys_dict()) exit() +if args.orphans: + if args.pyenv: + process_orphans() + else: + print("Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv") + exit() + process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py index 918cd980..2eb27e24 100755 --- a/bin/test-videos-s3.py +++ b/bin/test-videos-s3.py @@ -35,8 +35,6 @@ from django.core.files.uploadedfile import SimpleUploadedFile from django.urls import reverse from django.db.utils import IntegrityError - - from signbank.dictionary.models import ( Dataset, FieldChoice, @@ -50,7 +48,8 @@ from signbank.video.models import GlossVideo parser = argparse.ArgumentParser( - description="You need to run this in a venv that has all the right Python site-packages. You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + description="You need to run this in a venv that has all the right Python site-packages. " + "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
)
parser.add_argument(
From 09e124b494d0633c64ae2b0894d64fbbd35397a0 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 15:57:09 +1100
Subject: [PATCH 161/222] black

---
 bin/get-video-s3-acls.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index dc291178..4ecd327e 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -66,7 +66,7 @@ if args.pyenv:
     # Magic required to allow this script to use Signbank Django classes
     # This goes away if this script becomes a Django Management Command
-    print ("Importing site-packages environment")
+    print("Importing site-packages environment")
     print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
     sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development")
@@ -404,6 +404,7 @@ def process_keys(this_all_keys_dict):
     for video_key, dict_row in this_all_keys_dict.items():
         print(build_csv_row(video_key, *dict_row))
 
+
 def process_orphans():
     all_keys_dict = create_all_keys_dict(
         get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
     )
@@ -449,7 +450,7 @@ def process_orphans():
 
         # We try to find the orphaned S3 object, if it exists
        # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz
-        for test_key, [ key_nzsl_yes, key_s3_yes, *_ ] in all_keys_dict.items():
+        for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items():
             if gloss_name in test_key:
                 if str(gloss_id) in test_key:
                     if key_nzsl_yes:
@@ -462,11 +463,6 @@ def process_orphans():
                     print(test_key)
 
 
-
-
-
-
-
 print(f"Env: {args.env}", file=sys.stderr)
 print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr)
 print(f"AWSCLI: {AWSCLI}", file=sys.stderr)
@@ -481,7 +477,9 @@ def process_orphans():
     if args.pyenv:
         process_orphans()
     else:
-        print("Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv")
+        print(
+            "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv"
+        )
     exit()
 
 process_keys(

From 12a309bb64227d835aa020a31be158bf56fa73f3 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 16:01:11 +1100
Subject: [PATCH 162/222] Comment

---
 bin/get-video-s3-acls.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index 4ecd327e..3e9a9ce8 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -411,8 +411,6 @@ def process_orphans():
     )
 
     # Traverse all the NZSL Signbank glosses that are missing S3 objects
-    # NOTE This may actually be the wrong way around, we may want to go from
-    # orphaned S3 objects _back_ to glosses, but it depends on what Micky says
     for video_key, [
         key_in_nzsl,
         key_in_s3,

From fd62f81a52d0942358a3c48c2b7cee88f31b6ea9 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 16:13:50 +1100
Subject: [PATCH 163/222] CSV orphans

---
 bin/get-video-s3-acls.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index 3e9a9ce8..7599182b 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -66,8 +66,8 @@ if args.pyenv:
     # Magic required to allow this script to use Signbank Django classes
     # This goes away if this script becomes a Django Management Command
-    print("Importing site-packages environment")
-    print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+    print("Importing site-packages environment", file=sys.stderr)
+    print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr)
     sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development")
     from django.core.wsgi import get_wsgi_application
@@ -410,6 +410,8 @@ def process_orphans():
         get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
     )
 
+    print("Gloss ID,Gloss,Suggested Video key")
+
     # Traverse all the NZSL Signbank glosses that are missing S3 objects
     for video_key, [
         key_in_nzsl,
@@ -446,6 +448,8 @@ def process_orphans():
 
         gloss_name = gloss.idgloss.split(":")[0].strip()
 
+        csv_rows = []
+
         # We try to find the orphaned S3 object, if it exists
         # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz
         for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items():
@@ -457,8 +461,10 @@ def process_orphans():
             if not key_s3_yes:
                 print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr)
                 continue
-            print(f"{gloss_id} {gloss.idgloss}")
-            print(test_key)
+            csv_rows.append([gloss_id,gloss.idgloss,test_key])
+    if csv_rows:
+        for c_row in csv_rows:
+            print(CSV_DELIMITER.join(c_row))

From 83d1c82da553789b73723bd7409c678958190402 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Thu, 31 Oct 2024 16:14:02 +1100
Subject: [PATCH 164/222] black

---
 bin/get-video-s3-acls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py
index 7599182b..efe041cd 100755
--- a/bin/get-video-s3-acls.py
+++ b/bin/get-video-s3-acls.py
@@ -461,7 +461,7 @@ def process_orphans():
             if not key_s3_yes:
                 print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr)
                 continue
-            csv_rows.append([gloss_id,gloss.idgloss,test_key])
+            csv_rows.append([gloss_id, gloss.idgloss, test_key])
     if csv_rows:
         for c_row in csv_rows:
             print(CSV_DELIMITER.join(c_row))

From 2418d96a5c51494e735bc8d0f2e1cc2c4bae3955 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Fri, 1 Nov 2024 12:14:35 +1100
Subject: [PATCH 165/222] Script splitting

---
 bin/get-orphaned-videos.py | 491 +++++++++++++++++++++++++++++++++++++
 bin/test-videos-s3.py      | 213 ----------------
 2 files changed, 491 insertions(+), 213 deletions(-)
 create mode 100755 bin/get-orphaned-videos.py
 delete mode 100755 bin/test-videos-s3.py

diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py
new file mode 100755
index 00000000..efe041cd
--- /dev/null
+++ b/bin/get-orphaned-videos.py
@@ -0,0 +1,491 @@
+#!/usr/bin/env -S python3 -u
+# Bang line above passes '-u' to python, for unbuffered output
+# Permissions required:
+# psql - access to heroku app's postgres
+# aws s3 - NZSL IAM access
+# s3:GetObjectAcl permissions or READ_ACP access to the object
+# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html
+# For some commands you need to run this in a venv that has all the right Python site-packages.
+# TODO Convert this script to a Django Management Command
+
+import os
+import sys
+import subprocess
+import argparse
+import re
+from time import sleep
+from uuid import uuid4
+from pprint import pprint
+
+
+parser = argparse.ArgumentParser(
+    description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. "
+    "Postgres access details, eg. DATABASE_URL env var."
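A note on the pg_cli() / COPY pattern this new script is built around: psql prints the COPY rows to stdout with the chosen '|' delimiter, and the script later splits them by hand. A minimal sketch of the same parse using the stdlib csv module instead of str.split("|"), assuming only a DATABASE_URL env var and a psql binary on the path (the helper name is illustrative):

import csv
import io
import os
import subprocess

def run_copy_query(sql):
    # psql -c "COPY (...) TO STDOUT ..." emits one delimited row per line
    result = subprocess.run(
        ["psql", "-c", sql, os.environ["DATABASE_URL"]],
        capture_output=True,
        check=True,
        text=True,
    )
    # csv.reader copes with quoted fields that a plain split("|") would mishandle
    return list(csv.reader(io.StringIO(result.stdout), delimiter="|"))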
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +parser.add_argument( + "--dumpnzsl", + default=False, + required=False, + action="store_true", + help=f"Dump raw NZSL database output", +) +parser.add_argument( + "--pyenv", + default=False, + required=False, + action="store_true", + help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", +) +parser.add_argument( + "--orphans", + default=False, + required=False, + action="store_true", + help=f"Try to identify and match-up S3 orphans (requires --pyenv)", +) +args = parser.parse_args() + +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print("Importing site-packages environment", file=sys.stderr) + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + +# Globals +CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info 
from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +# Cases +# In S3 In NZSL Action +# Is Not Delete S3 Object +# Is Is Update ACL +# Not Is Review +# Other Review +def get_recommended_action(key_in_nzsl, key_in_s3): + if key_in_s3: + if key_in_nzsl: + return "Update ACL" + else: + return "Delete S3 Object" + return "Review" + + +# Get S3 object's ACL +def get_s3_canned_acl(video_key): + result = aws_cli( + [ + "s3api", + "get-object-acl", + "--output", + "text", + "--query", + "Grants[*].Permission", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + acls_grants = result.stdout.strip().split("\t") + + if 
len(acls_grants) > 1: + if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": + return "public-read" + elif acls_grants[0] == "FULL_CONTROL": + return "private" + + return "unknown" + + +# Get S3 object's LastModified date/time +def get_s3_lastmodified(video_key): + result = aws_cli( + [ + "s3api", + "head-object", + "--output", + "text", + "--query", + "LastModified", + "--bucket", + AWS_S3_BUCKET, + "--key", + video_key, + ] + ) + return result.stdout.strip() + + +def build_csv_header(): + return CSV_DELIMITER.join( + [ + "Action", + "S3 Video key", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Sbank Gloss", + "Sbank Gloss created at", + ] + ) + + +def build_csv_row( + video_key, + key_in_nzsl=False, + key_in_s3=False, + gloss_idgloss=None, + gloss_created_at=None, + gloss_id=None, + video_id=None, + gloss_public=False, + video_public=False, +): + # See signbank/video/models.py, line 59, function set_public_acl() + canned_acl_expected = "" + if key_in_nzsl: + canned_acl_expected = "public-read" if video_public else "private" + + lastmodified = "" + canned_acl = "" + if key_in_s3: + lastmodified = get_s3_lastmodified(video_key) + canned_acl = get_s3_canned_acl(video_key) + + action = get_recommended_action(key_in_nzsl, key_in_s3) + + return CSV_DELIMITER.join( + [ + action, + f"{filter_fakekey(video_key)}", + f"{lastmodified}", + f"{canned_acl_expected}", + f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", + f"{gloss_idgloss}", + f"{gloss_created_at}", + ] + ) + + +# From the keys present in NZSL, get all their S3 information +def process_keys(this_all_keys_dict): + print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) + + print(build_csv_header()) + + for video_key, dict_row in this_all_keys_dict.items(): + print(build_csv_row(video_key, *dict_row)) + + +def process_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + print("Gloss ID,Gloss,Suggested Video key") + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = Gloss.objects.get(id=gloss_id) + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # If these had S3 video candidates they should not have made it this far + # These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + gloss_name = gloss.idgloss.split(":")[0].strip() + + csv_rows = [] + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. 
rapidfuzz + for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + csv_rows.append([gloss_id, gloss.idgloss, test_key]) + if csv_rows: + for c_row in csv_rows: + print(CSV_DELIMITER.join(c_row)) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +if args.dumpnzsl: + pprint(get_nzsl_raw_keys_dict()) + exit() + +if args.orphans: + if args.pyenv: + process_orphans() + else: + print( + "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" + ) + exit() + +process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) +) diff --git a/bin/test-videos-s3.py b/bin/test-videos-s3.py deleted file mode 100755 index 2eb27e24..00000000 --- a/bin/test-videos-s3.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env -S python3 -u -# You need to run this in a venv that has all the right Python site-packages. -# Bang line above passes '-u' to python, for unbuffered output -# Permissions required: -# psql - access to heroku app's postgres -# aws s3 - NZSL IAM access -# s3:GetObjectAcl permissions or READ_ACP access to the object -# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html - -import os -import sys -import subprocess -import argparse -import re -from time import sleep -from pprint import pprint -import boto3 -import copy -import csv - -# Magic required to allow this script to use Signbank Django classes -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth.models import Permission -from django.contrib.auth import get_user_model - -User = get_user_model() - -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError -from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) -from signbank.video.models import GlossVideo - -parser = argparse.ArgumentParser( - description="You need to run this in a venv that has all the right Python site-packages. " - "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." 
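Both the new script and the test script being deleted here define aws_cli() to retry failed AWS CLI calls forever with a fixed one-second sleep. A bounded variant with exponential backoff, sketched on the same subprocess pattern (the helper name and limits are illustrative, not from the patch):

import subprocess
import sys
from time import sleep

def aws_cli_bounded(args_list, max_tries=5):
    # Double the delay after each failure instead of retrying indefinitely
    delay = 1
    for attempt in range(1, max_tries + 1):
        try:
            return subprocess.run(
                ["aws"] + args_list,
                capture_output=True,
                check=True,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Attempt {attempt} failed with code {e.returncode}", file=sys.stderr)
            if attempt == max_tries:
                raise
            sleep(delay)
            delay *= 2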
-) -parser.add_argument( - "--env", - default="dev", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) -args = parser.parse_args() - - -# Globals -CSV_DELIMITER = "," -DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" - - -def pg_cli(args_list): - try: - return subprocess.run( - [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - exit() - - -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - -# Run some tests against the remote endpoints -def do_tests(): - # Debugging safety - if args.env != "dev": - print("Error: tests must be in 'dev' environment") - exit() - if DATABASE_URL.find("@localhost") < 0: - print("Error: database url must contain '@localhost'") - exit() - print(f"DATABASE_URL:{DATABASE_URL}") - - print("Running tests") - # s3 = boto3.client("s3") - # pprint(s3.list_objects(Bucket=AWS_S3_BUCKET)) - # get_nzsl_raw_keys_dict() - # pprint(Gloss.objects.all()) - - # This is a cut and paste of the mock tests, but we're doing it "live" on dev - _csv_content = { - "id": "111", - "word": "Test", - "maori": "maori, maori 2", - "secondary": "test", - "notes": "a note", - "created_at": "2023-09-12 22:37:59 UTC", - "contributor_email": "ops@ackama.com", - "contributor_username": "Ackama Ops", - "agrees": "0", - "disagrees": "1", - "topic_names": "Test Topic|Test", - "videos": "/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBc2pFIiwiZXhwIjoiMjAyNC0xMS0wM1QyMzoyNzo1Ni4yNDNaIiwicHVyIjoiYmxvYl9pZCJ9fQ==--53448dc4efcf056e7ba7fe6b711d6b1ae551d171/Zimbabwe.mp4", - "illustrations": "/kiwifruit-2-6422.png", - "usage_examples": "/fire.1923.finalexample1.mb.r480x360.mp4", - "sign_comments": ("contribution_limit_test_1: Comment 0|Comment 33"), - } - file_name = "test.csv" - csv_content = [copy.deepcopy(_csv_content)] - csv_content[0]["id"] = "12345" - with open(file_name, "w") as file: - writer = csv.writer(file) - writer.writerow(csv_content[0].keys()) - for row in csv_content: - writer.writerow(row.values()) - data = open(file_name, "rb") - file = SimpleUploadedFile( - content=data.read(), name=data.name, content_type="content/multipart" - ) - dataset = Dataset.objects.get(name="NZSL") - - try: - Gloss.objects.get(idgloss="Share:11").delete() - except ValueError: - pass - Gloss.objects.create( - dataset=dataset, - idgloss="Share:11", - nzsl_share_id="12345", - ) - - # Create user and add permissions - try: - user = 
User.objects.create_user(username="test", email=None, password="test") - csv_permission = Permission.objects.get(codename="import_csv") - user.user_permissions.add(csv_permission) - except IntegrityError: - user = User.objects.get(username="test") - - # Create client with change_gloss permission. - client = Client() - client.force_login(user) - s = client.session - s.update({"dataset_id": dataset.pk, "glosses_new": csv_content}) - s.save() - response = client.post( - reverse("dictionary:confirm_import_nzsl_share_gloss_csv"), {"confirm": True} - ) - - # test to see if we have to wait for thread - X_SECONDS = 20 - print(f"Sleeping {X_SECONDS} seconds to allow threads to complete ...") - sleep(X_SECONDS) - - -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) - -do_tests() From 7c8d4612031e8209b192e11288d6c0ed74c05203 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:28:08 +1100 Subject: [PATCH 166/222] Orphan-detection code removed --- bin/get-video-s3-acls.py | 118 --------------------------------------- 1 file changed, 118 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index efe041cd..eb5436be 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -17,7 +17,6 @@ from uuid import uuid4 from pprint import pprint - parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." @@ -47,54 +46,8 @@ action="store_true", help=f"Dump raw NZSL database output", ) -parser.add_argument( - "--pyenv", - default=False, - required=False, - action="store_true", - help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", -) -parser.add_argument( - "--orphans", - default=False, - required=False, - action="store_true", - help=f"Try to identify and match-up S3 orphans (requires --pyenv)", -) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo - # Globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" @@ -405,68 +358,6 @@ def process_keys(this_all_keys_dict): print(build_csv_row(video_key, *dict_row)) -def process_orphans(): - all_keys_dict = create_all_keys_dict( - 
get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() - ) - - print("Gloss ID,Gloss,Suggested Video key") - - # Traverse all the NZSL Signbank glosses that are missing S3 objects - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in all_keys_dict.items(): - - if not key_in_nzsl: - # This is an S3 object, not a Signbank record - continue - - if key_in_s3: - # This Signbank record already has an S3 object, all is well - continue - - # Business rule - if int(gloss_id) < 8000: - continue - - # The gloss_id is the only reliable retrieval key at the Signbank end - gloss = Gloss.objects.get(id=gloss_id) - video_path = gloss.get_video_path() - - # Skip any that already have a video path - # If these had S3 video candidates they should not have made it this far - # These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: - continue - - gloss_name = gloss.idgloss.split(":")[0].strip() - - csv_rows = [] - - # We try to find the orphaned S3 object, if it exists - # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz - for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): - if gloss_name in test_key: - if str(gloss_id) in test_key: - if key_nzsl_yes: - print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) - continue - if not key_s3_yes: - print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) - continue - csv_rows.append([gloss_id, gloss.idgloss, test_key]) - if csv_rows: - for c_row in csv_rows: - print(CSV_DELIMITER.join(c_row)) - - print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) @@ -477,15 +368,6 @@ def process_orphans(): pprint(get_nzsl_raw_keys_dict()) exit() -if args.orphans: - if args.pyenv: - process_orphans() - else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. 
See --pyenv" - ) - exit() - process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) From 7ef56b8b08bad189166f615784efcf1ac97fc2b9 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:01:29 +1100 Subject: [PATCH 167/222] Orphan detection script separated --- bin/get-orphaned-videos.py | 168 ++----------------------------------- 1 file changed, 7 insertions(+), 161 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index efe041cd..f18b9d3f 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -40,13 +40,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--dumpnzsl", - default=False, - required=False, - action="store_true", - help=f"Dump raw NZSL database output", -) parser.add_argument( "--pyenv", default=False, @@ -54,13 +47,6 @@ action="store_true", help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", ) -parser.add_argument( - "--orphans", - default=False, - required=False, - action="store_true", - help=f"Try to identify and match-up S3 orphans (requires --pyenv)", -) args = parser.parse_args() if args.pyenv: @@ -275,137 +261,7 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): return this_all_keys_dict -# Cases -# In S3 In NZSL Action -# Is Not Delete S3 Object -# Is Is Update ACL -# Not Is Review -# Other Review -def get_recommended_action(key_in_nzsl, key_in_s3): - if key_in_s3: - if key_in_nzsl: - return "Update ACL" - else: - return "Delete S3 Object" - return "Review" - - -# Get S3 object's ACL -def get_s3_canned_acl(video_key): - result = aws_cli( - [ - "s3api", - "get-object-acl", - "--output", - "text", - "--query", - "Grants[*].Permission", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - acls_grants = result.stdout.strip().split("\t") - - if len(acls_grants) > 1: - if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": - return "public-read" - elif acls_grants[0] == "FULL_CONTROL": - return "private" - - return "unknown" - - -# Get S3 object's LastModified date/time -def get_s3_lastmodified(video_key): - result = aws_cli( - [ - "s3api", - "head-object", - "--output", - "text", - "--query", - "LastModified", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - return result.stdout.strip() - - -def build_csv_header(): - return CSV_DELIMITER.join( - [ - "Action", - "S3 Video key", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Sbank Gloss", - "Sbank Gloss created at", - ] - ) - - -def build_csv_row( - video_key, - key_in_nzsl=False, - key_in_s3=False, - gloss_idgloss=None, - gloss_created_at=None, - gloss_id=None, - video_id=None, - gloss_public=False, - video_public=False, -): - # See signbank/video/models.py, line 59, function set_public_acl() - canned_acl_expected = "" - if key_in_nzsl: - canned_acl_expected = "public-read" if video_public else "private" - - lastmodified = "" - canned_acl = "" - if key_in_s3: - lastmodified = get_s3_lastmodified(video_key) - canned_acl = get_s3_canned_acl(video_key) - - action = get_recommended_action(key_in_nzsl, key_in_s3) - - return CSV_DELIMITER.join( - [ - action, - f"{filter_fakekey(video_key)}", - f"{lastmodified}", - f"{canned_acl_expected}", - f"{canned_acl}", - f"{gloss_id}", - f"{video_id}", - 
f"{gloss_public}", - f"{video_public}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - ] - ) - - -# From the keys present in NZSL, get all their S3 information -def process_keys(this_all_keys_dict): - print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) - - print(build_csv_header()) - - for video_key, dict_row in this_all_keys_dict.items(): - print(build_csv_row(video_key, *dict_row)) - - -def process_orphans(): +def find_orphans(): all_keys_dict = create_all_keys_dict( get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() ) @@ -473,19 +329,9 @@ def process_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.dumpnzsl: - pprint(get_nzsl_raw_keys_dict()) - exit() - -if args.orphans: - if args.pyenv: - process_orphans() - else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" - ) - exit() - -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) +if args.pyenv: + find_orphans() +else: + print( + "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" + ) From fcbde529cf54d61c8fed77a5e1dd5c63f53ff46f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:38:12 +1100 Subject: [PATCH 168/222] Removed --pyenv requirement, prior to management command --- bin/get-orphaned-videos.py | 68 +++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index f18b9d3f..97e4aabf 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -49,37 +49,36 @@ ) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model 
+ +User = get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," @@ -329,9 +328,4 @@ def find_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.pyenv: - find_orphans() -else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" - ) +find_orphans() From bfe08487de13bd13004c3493c8bd7131099ad726 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:42:21 +1100 Subject: [PATCH 169/222] Moved to management dir --- .../commands/get-orphaned-videos.py | 331 ++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100755 signbank/dictionary/management/commands/get-orphaned-videos.py diff --git a/signbank/dictionary/management/commands/get-orphaned-videos.py b/signbank/dictionary/management/commands/get-orphaned-videos.py new file mode 100755 index 00000000..97e4aabf --- /dev/null +++ b/signbank/dictionary/management/commands/get-orphaned-videos.py @@ -0,0 +1,331 @@ +#!/usr/bin/env -S python3 -u +# Bang line above passes '-u' to python, for unbuffered output +# Permissions required: +# psql - access to heroku app's postgres +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# For some commands you need to run this in a venv that has all the right Python site-packages. +# TODO Convert this script to a Django Management Command + +import os +import sys +import subprocess +import argparse +import re +from time import sleep +from uuid import uuid4 +from pprint import pprint + + +parser = argparse.ArgumentParser( + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
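This next commit copies the script verbatim into the management commands directory, but a file named get-orphaned-videos.py cannot be imported as a Python module (hyphens are invalid in module names), so Django would never discover it as a command; that is presumably part of why the move is reverted two commits later. The header's TODO points at the conventional shape instead, roughly sketched below (the module path, command name and option are illustrative, not from the patches):

# signbank/dictionary/management/commands/find_orphaned_videos.py (hypothetical)
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Find NZSL Signbank glosses whose video objects are missing from S3"

    def add_arguments(self, parser):
        parser.add_argument("--env", default="uat")

    def handle(self, *args, **options):
        # Django settings and models are already loaded here, so none of the
        # sys.path / WSGI bootstrap above is needed
        self.stdout.write(f"Running against {options['env']}")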
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +parser.add_argument( + "--pyenv", + default=False, + required=False, + action="store_true", + help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", +) +args = parser.parse_args() + +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo + +# Globals +CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS 
gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +def find_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + print("Gloss ID,Gloss,Suggested Video key") + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = Gloss.objects.get(id=gloss_id) + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # If these had S3 video candidates they should not have made it this far + # 
These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + gloss_name = gloss.idgloss.split(":")[0].strip() + + csv_rows = [] + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz + for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + csv_rows.append([gloss_id, gloss.idgloss, test_key]) + if csv_rows: + for c_row in csv_rows: + print(CSV_DELIMITER.join(c_row)) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +find_orphans() From 068244d7ccf207dbccb6e54deda4d8c3e5804248 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:00:02 +1100 Subject: [PATCH 170/222] Revert "Moved to management dir" This reverts commit bfe08487de13bd13004c3493c8bd7131099ad726. --- .../commands/get-orphaned-videos.py | 331 ------------------ 1 file changed, 331 deletions(-) delete mode 100755 signbank/dictionary/management/commands/get-orphaned-videos.py diff --git a/signbank/dictionary/management/commands/get-orphaned-videos.py b/signbank/dictionary/management/commands/get-orphaned-videos.py deleted file mode 100755 index 97e4aabf..00000000 --- a/signbank/dictionary/management/commands/get-orphaned-videos.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env -S python3 -u -# Bang line above passes '-u' to python, for unbuffered output -# Permissions required: -# psql - access to heroku app's postgres -# aws s3 - NZSL IAM access -# s3:GetObjectAcl permissions or READ_ACP access to the object -# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. -# TODO Convert this script to a Django Management Command - -import os -import sys -import subprocess -import argparse -import re -from time import sleep -from uuid import uuid4 -from pprint import pprint - - -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." 
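Worth noting while this copy is deleted: the maybe_fakekey() / filter_fakekey() pair that every version of the script carries exists because the merged dictionary is keyed on video_key, and the FULL JOIN yields empty keys for glosses that never had a video; without unique placeholders those rows would all collapse onto a single empty-string entry. A self-contained sketch of the trick:

from uuid import uuid4

FAKEKEY_PREFIX = "this_is_not_a_key_"

def maybe_fakekey(key):
    # Keyless rows get a unique placeholder so they survive as dict entries
    return key if key else FAKEKEY_PREFIX + str(uuid4())

def filter_fakekey(key):
    # Placeholders are rendered back as empty strings on output
    return "" if key.startswith(FAKEKEY_PREFIX) else key

rows = [("", "gloss 1"), ("", "gloss 2"), ("foo/bar.mp4", "gloss 3")]
merged = {maybe_fakekey(k): v for k, v in rows}
assert len(merged) == 3  # without placeholders the two empty keys would collide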
-) -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) -parser.add_argument( - "--pyenv", - default=False, - required=False, - action="store_true", - help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", -) -args = parser.parse_args() - -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth.models import Permission -from django.contrib.auth import get_user_model - -User = get_user_model() - -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError -from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) -from signbank.video.models import GlossVideo - -# Globals -CSV_DELIMITER = "," -FAKEKEY_PREFIX = "this_is_not_a_key_" -DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" - - -def pg_cli(args_list): - try: - return subprocess.run( - [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - exit() - - -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - -# Fake key is a hack to handle FULL JOIN -def maybe_fakekey(instring): - return instring if instring else FAKEKEY_PREFIX + str(uuid4()) - - -def filter_fakekey(instring): - return "" if instring.startswith(FAKEKEY_PREFIX) else instring - - -# Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(): - print( - f"Getting raw list of video file info from NZSL Signbank ...", - file=sys.stderr, - ) - this_nzsl_raw_keys_dict = {} - # Column renaming is for readability - # Special delimiter because columns might contain commas - result = pg_cli( - [ - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS 
gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg " - "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - ] - ) - - # Separate the NZSL db columns - # Write them to a dictionary, so we can do fast operations - for rawl in result.stdout.split("\n"): - rawl = rawl.strip() - if not rawl: - continue - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - video_key, - ] = rawl.split("|") - - # Hack to handle FULL JOIN - video_key = maybe_fakekey(video_key.strip()) - - # This sets the initial field ordering in the all_keys dictionary row - this_nzsl_raw_keys_dict[video_key] = [ - gloss_idgloss.replace(CSV_DELIMITER, ""), - gloss_created_at, - gloss_id, - video_id, - gloss_public.lower() == "t", - video_public.lower() == "t", - ] - - print( - f"{len(this_nzsl_raw_keys_dict)} rows retrieved", - file=sys.stderr, - ) - - return this_nzsl_raw_keys_dict - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - -# Get the keys present and absent across NZSL Signbank and S3, to dictionary -def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): - print( - "Getting keys present and absent across NZSL Signbank and S3 ...", - file=sys.stderr, - ) - this_all_keys_dict = {} - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - dict_row = this_nzsl_raw_keys_dict.get(video_key, None) - if dict_row: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT - ] + dict_row - else: - this_all_keys_dict[video_key] = [ - False, # NZSL Absent - True, # S3 PRESENT - ] + [""] * 6 - - # Find NZSL keys that are absent from S3 (present handled above) - for video_key, dict_row in this_nzsl_raw_keys_dict.items(): - if video_key not in this_s3_bucket_raw_keys_list: - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - False, # S3 Absent - ] + dict_row - - return this_all_keys_dict - - -def find_orphans(): - all_keys_dict = create_all_keys_dict( - get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() - ) - - print("Gloss ID,Gloss,Suggested Video key") - - # Traverse all the NZSL Signbank glosses that are missing S3 objects - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in all_keys_dict.items(): - - if not key_in_nzsl: - # This is an S3 object, not a Signbank record - continue - - if key_in_s3: - # This Signbank record already has an S3 object, all is well - continue - - # Business rule - if int(gloss_id) < 8000: - continue - - # The gloss_id is the only reliable retrieval key at the Signbank end - gloss = Gloss.objects.get(id=gloss_id) - video_path = gloss.get_video_path() - - # Skip any that already have a video path - # If these had S3 video candidates they should not have made it this far - # 
These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: - continue - - gloss_name = gloss.idgloss.split(":")[0].strip() - - csv_rows = [] - - # We try to find the orphaned S3 object, if it exists - # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz - for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): - if gloss_name in test_key: - if str(gloss_id) in test_key: - if key_nzsl_yes: - print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) - continue - if not key_s3_yes: - print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) - continue - csv_rows.append([gloss_id, gloss.idgloss, test_key]) - if csv_rows: - for c_row in csv_rows: - print(CSV_DELIMITER.join(c_row)) - - -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) - -find_orphans() From a49f9df16347d3622e9ce075093722e737121799 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:00:06 +1100 Subject: [PATCH 171/222] Revert "Removed --pyenv requirement, prior to management command" This reverts commit fcbde529cf54d61c8fed77a5e1dd5c63f53ff46f. --- bin/get-orphaned-videos.py | 68 +++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index 97e4aabf..f18b9d3f 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -49,36 +49,37 @@ ) args = parser.parse_args() -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth.models import Permission -from django.contrib.auth import get_user_model - -User = get_user_model() - -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError -from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, -) -from signbank.video.models import GlossVideo +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print("Importing site-packages environment", file=sys.stderr) + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile 
import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," @@ -328,4 +329,9 @@ def find_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -find_orphans() +if args.pyenv: + find_orphans() +else: + print( + "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" + ) From 61158e981407a829ffc134b71da0081451734ce5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:37:56 +1100 Subject: [PATCH 172/222] Comments --- bin/get-orphaned-videos.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index f18b9d3f..1d53619e 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -240,19 +240,25 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: + # NZSL glossvideo record for this S3 key this_all_keys_dict[video_key] = [ True, # NZSL PRESENT True, # S3 PRESENT ] + dict_row else: + # S3 key with no corresponding NZSL glossvideo record this_all_keys_dict[video_key] = [ False, # NZSL Absent True, # S3 PRESENT ] + [""] * 6 - # Find NZSL keys that are absent from S3 (present handled above) + # Find NZSL keys that are absent from S3 (present in both handled above) for video_key, dict_row in this_nzsl_raw_keys_dict.items(): if video_key not in this_s3_bucket_raw_keys_list: + # gloss/glossvideo record with no corresponding S3 key + # Either: + # video_key is real, but the S3 object is missing + # video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object this_all_keys_dict[video_key] = [ True, # NZSL PRESENT False, # S3 Absent From 8f5b88afaa4b46d3c3ad594672020ee456815063 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:40:22 +1100 Subject: [PATCH 173/222] Comment --- bin/get-orphaned-videos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index 1d53619e..eeb7189e 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -302,8 +302,8 @@ def find_orphans(): gloss = Gloss.objects.get(id=gloss_id) video_path = gloss.get_video_path() - # Skip any that already have a video path - # If these had S3 video candidates they should not have made it this far + # Skip any that already have a video path. + # These should have an S3 object but don't. For some reason the video never made it to S3. 
# These will have to have their videos reinstated (separate operation) if len(video_path) > 0: continue From 917e9ad4968b4d1672db407780ff5d3fd6ae87cd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:44:58 +1100 Subject: [PATCH 174/222] refactor --- bin/get-orphaned-videos.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index eeb7189e..b5dae6ea 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -300,18 +300,16 @@ def find_orphans(): # The gloss_id is the only reliable retrieval key at the Signbank end gloss = Gloss.objects.get(id=gloss_id) + gloss_name = gloss.idgloss.split(":")[0].strip() video_path = gloss.get_video_path() - # Skip any that already have a video path. - # These should have an S3 object but don't. For some reason the video never made it to S3. + # Skip any that already have a video path + # These should have an S3 object but don't. For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) + # TODO If it's worth it, make a --param to output these if len(video_path) > 0: continue - gloss_name = gloss.idgloss.split(":")[0].strip() - - csv_rows = [] - # We try to find the orphaned S3 object, if it exists # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): @@ -323,10 +321,7 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - csv_rows.append([gloss_id, gloss.idgloss, test_key]) - if csv_rows: - for c_row in csv_rows: - print(CSV_DELIMITER.join(c_row)) + print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) print(f"Env: {args.env}", file=sys.stderr) From f7485239f6da127efd11c9b6d3f30695023924dc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:48:05 +1100 Subject: [PATCH 175/222] Comment --- bin/get-orphaned-videos.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index b5dae6ea..627200bc 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -306,7 +306,6 @@ def find_orphans(): # Skip any that already have a video path # These should have an S3 object but don't. For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) - # TODO If it's worth it, make a --param to output these if len(video_path) > 0: continue From ad2733dfc11cf8c06f407ef5034a5f7f3189e901 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:52:46 +1100 Subject: [PATCH 176/222] Cleanups --- bin/get-orphaned-videos.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/bin/get-orphaned-videos.py b/bin/get-orphaned-videos.py index 627200bc..74d9e3ee 100755 --- a/bin/get-orphaned-videos.py +++ b/bin/get-orphaned-videos.py @@ -1,4 +1,7 @@ #!/usr/bin/env -S python3 -u +# +# This script needs to be run in a pyenv virtualenv with the Django project installed. 
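+# For example (a sketch; the virtualenv and Heroku app names below are assumptions):
+#   pyenv activate signbank
+#   export AWS_PROFILE=nzsl
+#   export DATABASE_URL="$(heroku config:get DATABASE_URL --app nzsl-signbank-uat)"
+#   bin/get-orphaned-videos.py --env uat > orphans.csv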
+# # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres @@ -40,13 +43,6 @@ required=False, help=f"AWS client path (default: %(default)s)", ) -parser.add_argument( - "--pyenv", - default=False, - required=False, - action="store_true", - help=f"Yes, we are running in a pyenv virtualenv that has all the right site-packages installed", -) args = parser.parse_args() if args.pyenv: @@ -304,7 +300,7 @@ def find_orphans(): video_path = gloss.get_video_path() # Skip any that already have a video path - # These should have an S3 object but don't. For some reason the video never made it to S3 + # These should have an S3 object but don't: For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) if len(video_path) > 0: continue @@ -329,9 +325,4 @@ def find_orphans(): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.pyenv: - find_orphans() -else: - print( - "Error: You need to tell us you're in an environment with all needed site-packages. See --pyenv" - ) +find_orphans() From 12ab098b4c66e4aae54817d90f4eb53d6911c658 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:07:14 +1100 Subject: [PATCH 177/222] rename --- bin/{get-orphaned-videos.py => find-orphaned-videos.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{get-orphaned-videos.py => find-orphaned-videos.py} (100%) diff --git a/bin/get-orphaned-videos.py b/bin/find-orphaned-videos.py similarity index 100% rename from bin/get-orphaned-videos.py rename to bin/find-orphaned-videos.py From 75f0a8f29b85dc3ed5c85eadc805a71c84978729 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:08:16 +1100 Subject: [PATCH 178/222] initial commit of orphan video repair script --- bin/repair-orphaned-videos.py | 328 ++++++++++++++++++++++++++++++++++ 1 file changed, 328 insertions(+) create mode 100755 bin/repair-orphaned-videos.py diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py new file mode 100755 index 00000000..74d9e3ee --- /dev/null +++ b/bin/repair-orphaned-videos.py @@ -0,0 +1,328 @@ +#!/usr/bin/env -S python3 -u +# +# This script needs to be run in a pyenv virtualenv with the Django project installed. +# +# Bang line above passes '-u' to python, for unbuffered output +# Permissions required: +# psql - access to heroku app's postgres +# aws s3 - NZSL IAM access +# s3:GetObjectAcl permissions or READ_ACP access to the object +# https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html +# For some commands you need to run this in a venv that has all the right Python site-packages. +# TODO Convert this script to a Django Management Command + +import os +import sys +import subprocess +import argparse +import re +from time import sleep +from uuid import uuid4 +from pprint import pprint + + +parser = argparse.ArgumentParser( + description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
+) +parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", +) +parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", +) +parser.add_argument( + "--awscli", + default="/usr/local/bin/aws", + required=False, + help=f"AWS client path (default: %(default)s)", +) +args = parser.parse_args() + +if args.pyenv: + # Magic required to allow this script to use Signbank Django classes + # This goes away if this script becomes a Django Management Command + print("Importing site-packages environment", file=sys.stderr) + print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") + from django.core.wsgi import get_wsgi_application + + get_wsgi_application() + + from django.contrib.auth.models import Permission + from django.contrib.auth import get_user_model + + User = get_user_model() + + from django.test import Client + from django.core.files.uploadedfile import SimpleUploadedFile + from django.urls import reverse + from django.db.utils import IntegrityError + from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, + ) + from signbank.video.models import GlossVideo + +# Globals +CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" +DATABASE_URL = os.getenv("DATABASE_URL", "") +AWSCLI = args.awscli +PGCLI = args.pgcli +AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" + + +def pg_cli(args_list): + try: + return subprocess.run( + [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"], + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + exit() + + +def aws_cli(args_list): + # Try indefinitely + output = None + while not output: + try: + output = subprocess.run( + [AWSCLI] + args_list, + env=os.environ, + capture_output=True, + check=True, + text=True, + ) + except subprocess.CalledProcessError as e: + print( + f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr + ) + print(e.cmd, file=sys.stderr) + print(e.stdout, file=sys.stderr) + print(e.stderr, file=sys.stderr) + sleep(1) + return output + + +# Fake key is a hack to handle FULL JOIN +def maybe_fakekey(instring): + return instring if instring else FAKEKEY_PREFIX + str(uuid4()) + + +def filter_fakekey(instring): + return "" if instring.startswith(FAKEKEY_PREFIX) else instring + + +# Get the video files info from NZSL Signbank +def get_nzsl_raw_keys_dict(): + print( + f"Getting raw list of video file info from NZSL Signbank ...", + file=sys.stderr, + ) + this_nzsl_raw_keys_dict = {} + # Column renaming is for readability + # Special delimiter because columns might contain commas + result = pg_cli( + [ + "COPY (" + "SELECT " + "dg.id AS gloss_id, " + "dg.idgloss AS gloss_idgloss, " + "dg.created_at AS gloss_created_at, " + "dg.published AS gloss_public, " + "vg.is_public AS video_public, " + "vg.id AS video_id, " + "vg.videofile AS video_key " + "FROM dictionary_gloss AS dg " + "FULL JOIN 
video_glossvideo AS vg ON vg.gloss_id = dg.id" + ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", + ] + ) + + # Separate the NZSL db columns + # Write them to a dictionary, so we can do fast operations + for rawl in result.stdout.split("\n"): + rawl = rawl.strip() + if not rawl: + continue + [ + gloss_id, + gloss_idgloss, + gloss_created_at, + gloss_public, + video_public, + video_id, + video_key, + ] = rawl.split("|") + + # Hack to handle FULL JOIN + video_key = maybe_fakekey(video_key.strip()) + + # This sets the initial field ordering in the all_keys dictionary row + this_nzsl_raw_keys_dict[video_key] = [ + gloss_idgloss.replace(CSV_DELIMITER, ""), + gloss_created_at, + gloss_id, + video_id, + gloss_public.lower() == "t", + video_public.lower() == "t", + ] + + print( + f"{len(this_nzsl_raw_keys_dict)} rows retrieved", + file=sys.stderr, + ) + + return this_nzsl_raw_keys_dict + + +# Get all keys from AWS S3 +def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): + print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) + result = aws_cli( + [ + "s3", + "ls", + f"s3://{s3_bucket}", + "--recursive", + ], + ) + + # Separate out just the key from date, time, size, key + this_s3_bucket_raw_keys_list = [] + for line in result.stdout.split("\n"): + if line: + this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + + print( + f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", + file=sys.stderr, + ) + + return this_s3_bucket_raw_keys_list + + +# Get the keys present and absent across NZSL Signbank and S3, to dictionary +def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): + print( + "Getting keys present and absent across NZSL Signbank and S3 ...", + file=sys.stderr, + ) + this_all_keys_dict = {} + + # Find S3 keys that are present in NZSL, or absent + for video_key in this_s3_bucket_raw_keys_list: + dict_row = this_nzsl_raw_keys_dict.get(video_key, None) + if dict_row: + # NZSL glossvideo record for this S3 key + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + True, # S3 PRESENT + ] + dict_row + else: + # S3 key with no corresponding NZSL glossvideo record + this_all_keys_dict[video_key] = [ + False, # NZSL Absent + True, # S3 PRESENT + ] + [""] * 6 + + # Find NZSL keys that are absent from S3 (present in both handled above) + for video_key, dict_row in this_nzsl_raw_keys_dict.items(): + if video_key not in this_s3_bucket_raw_keys_list: + # gloss/glossvideo record with no corresponding S3 key + # Either: + # video_key is real, but the S3 object is missing + # video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object + this_all_keys_dict[video_key] = [ + True, # NZSL PRESENT + False, # S3 Absent + ] + dict_row + + return this_all_keys_dict + + +def find_orphans(): + all_keys_dict = create_all_keys_dict( + get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() + ) + + print("Gloss ID,Gloss,Suggested Video key") + + # Traverse all the NZSL Signbank glosses that are missing S3 objects + for video_key, [ + key_in_nzsl, + key_in_s3, + gloss_idgloss, + gloss_created_at, + gloss_id, + video_id, + gloss_public, + video_public, + ] in all_keys_dict.items(): + + if not key_in_nzsl: + # This is an S3 object, not a Signbank record + continue + + if key_in_s3: + # This Signbank record already has an S3 object, all is well + continue + + # Business rule + if int(gloss_id) < 8000: + continue + + # The gloss_id is the only reliable retrieval key at the Signbank end + gloss = 
Gloss.objects.get(id=gloss_id) + gloss_name = gloss.idgloss.split(":")[0].strip() + video_path = gloss.get_video_path() + + # Skip any that already have a video path + # These should have an S3 object but don't: For some reason the video never made it to S3 + # These will have to have their videos reinstated (separate operation) + if len(video_path) > 0: + continue + + # We try to find the orphaned S3 object, if it exists + # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz + for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if gloss_name in test_key: + if str(gloss_id) in test_key: + if key_nzsl_yes: + print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) + continue + if not key_s3_yes: + print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) + continue + print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) + + +print(f"Env: {args.env}", file=sys.stderr) +print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) +print(f"AWSCLI: {AWSCLI}", file=sys.stderr) +print(f"PGCLI: {PGCLI}", file=sys.stderr) +print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + +find_orphans() From f16ca20adc60747a86cb0dd5d75d7371a6375db1 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:10:56 +1100 Subject: [PATCH 179/222] pyenv whoops --- bin/find-orphaned-videos.py | 61 +++++++++++++++++------------------ bin/repair-orphaned-videos.py | 61 +++++++++++++++++------------------ 2 files changed, 60 insertions(+), 62 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 74d9e3ee..3066bd20 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -45,37 +45,36 @@ ) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = 
get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 74d9e3ee..3066bd20 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -45,37 +45,36 @@ ) args = parser.parse_args() -if args.pyenv: - # Magic required to allow this script to use Signbank Django classes - # This goes away if this script becomes a Django Management Command - print("Importing site-packages environment", file=sys.stderr) - print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) - sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") - from django.core.wsgi import get_wsgi_application - - get_wsgi_application() - - from django.contrib.auth.models import Permission - from django.contrib.auth import get_user_model - - User = get_user_model() - - from django.test import Client - from django.core.files.uploadedfile import SimpleUploadedFile - from django.urls import reverse - from django.db.utils import IntegrityError - from signbank.dictionary.models import ( - Dataset, - FieldChoice, - Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, - ) - from signbank.video.models import GlossVideo +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth.models import Permission +from django.contrib.auth import get_user_model + +User = get_user_model() + +from django.test import Client +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django.db.utils import IntegrityError +from signbank.dictionary.models import ( + Dataset, + FieldChoice, + Gloss, + GlossTranslations, + Language, + ManualValidationAggregation, + ShareValidationAggregation, + ValidationRecord, +) +from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," From 4951b52a298c12e3b16b50e74008402d5e3c5547 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:12:14 +1100 Subject: [PATCH 180/222] Repair script stripped --- bin/repair-orphaned-videos.py | 194 ---------------------------------- 1 file changed, 194 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 3066bd20..5b97a4b6 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -125,203 +125,9 @@ def aws_cli(args_list): return output -# Fake key is a hack to handle FULL JOIN -def maybe_fakekey(instring): - return instring if 
instring else FAKEKEY_PREFIX + str(uuid4()) - - -def filter_fakekey(instring): - return "" if instring.startswith(FAKEKEY_PREFIX) else instring - - -# Get the video files info from NZSL Signbank -def get_nzsl_raw_keys_dict(): - print( - f"Getting raw list of video file info from NZSL Signbank ...", - file=sys.stderr, - ) - this_nzsl_raw_keys_dict = {} - # Column renaming is for readability - # Special delimiter because columns might contain commas - result = pg_cli( - [ - "COPY (" - "SELECT " - "dg.id AS gloss_id, " - "dg.idgloss AS gloss_idgloss, " - "dg.created_at AS gloss_created_at, " - "dg.published AS gloss_public, " - "vg.is_public AS video_public, " - "vg.id AS video_id, " - "vg.videofile AS video_key " - "FROM dictionary_gloss AS dg " - "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id" - ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')", - ] - ) - - # Separate the NZSL db columns - # Write them to a dictionary, so we can do fast operations - for rawl in result.stdout.split("\n"): - rawl = rawl.strip() - if not rawl: - continue - [ - gloss_id, - gloss_idgloss, - gloss_created_at, - gloss_public, - video_public, - video_id, - video_key, - ] = rawl.split("|") - - # Hack to handle FULL JOIN - video_key = maybe_fakekey(video_key.strip()) - - # This sets the initial field ordering in the all_keys dictionary row - this_nzsl_raw_keys_dict[video_key] = [ - gloss_idgloss.replace(CSV_DELIMITER, ""), - gloss_created_at, - gloss_id, - video_id, - gloss_public.lower() == "t", - video_public.lower() == "t", - ] - - print( - f"{len(this_nzsl_raw_keys_dict)} rows retrieved", - file=sys.stderr, - ) - - return this_nzsl_raw_keys_dict - - -# Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) - - print( - f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", - file=sys.stderr, - ) - - return this_s3_bucket_raw_keys_list - - -# Get the keys present and absent across NZSL Signbank and S3, to dictionary -def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): - print( - "Getting keys present and absent across NZSL Signbank and S3 ...", - file=sys.stderr, - ) - this_all_keys_dict = {} - - # Find S3 keys that are present in NZSL, or absent - for video_key in this_s3_bucket_raw_keys_list: - dict_row = this_nzsl_raw_keys_dict.get(video_key, None) - if dict_row: - # NZSL glossvideo record for this S3 key - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - True, # S3 PRESENT - ] + dict_row - else: - # S3 key with no corresponding NZSL glossvideo record - this_all_keys_dict[video_key] = [ - False, # NZSL Absent - True, # S3 PRESENT - ] + [""] * 6 - - # Find NZSL keys that are absent from S3 (present in both handled above) - for video_key, dict_row in this_nzsl_raw_keys_dict.items(): - if video_key not in this_s3_bucket_raw_keys_list: - # gloss/glossvideo record with no corresponding S3 key - # Either: - # video_key is real, but the S3 object is missing - # video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object - this_all_keys_dict[video_key] = [ - True, # NZSL PRESENT - False, # S3 Absent - ] + dict_row - - return 
this_all_keys_dict - - -def find_orphans(): - all_keys_dict = create_all_keys_dict( - get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() - ) - - print("Gloss ID,Gloss,Suggested Video key") - - # Traverse all the NZSL Signbank glosses that are missing S3 objects - for video_key, [ - key_in_nzsl, - key_in_s3, - gloss_idgloss, - gloss_created_at, - gloss_id, - video_id, - gloss_public, - video_public, - ] in all_keys_dict.items(): - - if not key_in_nzsl: - # This is an S3 object, not a Signbank record - continue - - if key_in_s3: - # This Signbank record already has an S3 object, all is well - continue - - # Business rule - if int(gloss_id) < 8000: - continue - - # The gloss_id is the only reliable retrieval key at the Signbank end - gloss = Gloss.objects.get(id=gloss_id) - gloss_name = gloss.idgloss.split(":")[0].strip() - video_path = gloss.get_video_path() - - # Skip any that already have a video path - # These should have an S3 object but don't: For some reason the video never made it to S3 - # These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: - continue - - # We try to find the orphaned S3 object, if it exists - # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz - for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): - if gloss_name in test_key: - if str(gloss_id) in test_key: - if key_nzsl_yes: - print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr) - continue - if not key_s3_yes: - print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) - continue - print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) - - print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -find_orphans() From aae1bd74e5fa4c4bf7a20400f1562496c6f51be4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:29:38 +1100 Subject: [PATCH 181/222] Import cleanup --- bin/find-orphaned-videos.py | 13 ------------- bin/repair-orphaned-videos.py | 25 +++++++++++++------------ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 3066bd20..55221552 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -55,26 +55,13 @@ get_wsgi_application() -from django.contrib.auth.models import Permission from django.contrib.auth import get_user_model User = get_user_model() -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError from signbank.dictionary.models import ( - Dataset, - FieldChoice, Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, ) -from signbank.video.models import GlossVideo # Globals CSV_DELIMITER = "," diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 5b97a4b6..762f1fcd 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -13,6 +13,7 @@ import os import sys +import csv import subprocess import argparse import re @@ -25,6 +26,14 @@ description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) + +# Positional arguments +parser.add_argument( + "csv_filename", + help="Name of CSV file" +) + +# Optional arguments parser.add_argument( "--env", default="uat", @@ -55,24 +64,12 @@ get_wsgi_application() -from django.contrib.auth.models import Permission from django.contrib.auth import get_user_model User = get_user_model() -from django.test import Client -from django.core.files.uploadedfile import SimpleUploadedFile -from django.urls import reverse -from django.db.utils import IntegrityError from signbank.dictionary.models import ( - Dataset, - FieldChoice, Gloss, - GlossTranslations, - Language, - ManualValidationAggregation, - ShareValidationAggregation, - ValidationRecord, ) from signbank.video.models import GlossVideo @@ -125,6 +122,10 @@ def aws_cli(args_list): return output +def read_csv(csv_filename): + pass + + print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) From 44a1bd8e5dc830e9871d5fd4332610f5d7401544 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:41:15 +1100 Subject: [PATCH 182/222] Syncing headers --- bin/find-orphaned-videos.py | 3 ++- bin/repair-orphaned-videos.py | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 55221552..467440c4 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -64,6 +64,7 @@ ) # Globals +GLOBAL_COLUMN_HEADINGS = ["Gloss ID", "Gloss", "Suggested Video key"] # Keep synced with other scripts CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -254,7 +255,7 @@ def find_orphans(): get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() ) - print("Gloss ID,Gloss,Suggested Video key") + print(CSV_DELIMITER.join(GLOBAL_COLUMN_HEADINGS)) # Traverse all the NZSL Signbank glosses that are missing S3 objects for video_key, [ diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 762f1fcd..d8bf5400 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -16,9 +16,7 @@ import csv import subprocess import argparse -import re from time import sleep -from uuid import uuid4 from pprint import pprint @@ -28,10 +26,7 @@ ) # Positional arguments -parser.add_argument( - "csv_filename", - help="Name of CSV file" -) +parser.add_argument("csv_filename", help="Name of CSV file") # Optional arguments parser.add_argument( @@ -74,6 +69,11 @@ from signbank.video.models import GlossVideo # Globals +GLOBAL_COLUMN_HEADINGS = [ + "Gloss ID", + "Gloss", + "Suggested Video key", +] # Keep synced with other scripts CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") @@ -123,7 +123,14 @@ def aws_cli(args_list): def read_csv(csv_filename): - pass + if csv_filename == "-": + f = sys.stdin.read().splitlines() + else: + f = open(csv_filename, "r") + csv_dict = csv.DictReader(f) + for row in csv_dict: + pprint(row) + # print(dict(row)) print(f"Env: {args.env}", file=sys.stderr) @@ -132,3 +139,4 @@ def read_csv(csv_filename): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +read_csv(args.csv_filename) From a23eadb4e918f5ae434a8331e02409fbab2c9bbc Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:47:20 +1100 
Subject: [PATCH 183/222] cleanups --- bin/find-orphaned-videos.py | 9 +++++++-- bin/repair-orphaned-videos.py | 13 +++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index 467440c4..c592fd77 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -63,8 +63,13 @@ Gloss, ) -# Globals -GLOBAL_COLUMN_HEADINGS = ["Gloss ID", "Gloss", "Suggested Video key"] # Keep synced with other scripts +# Keep synced with other scripts +GLOSS_ID_COLUMN = "Gloss ID" +GLOSS_COLUMN = "Gloss" +GLOSS_VIDEO_COLUMN = "Suggested Video key" +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] + +# Other globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index d8bf5400..b60e1021 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -68,12 +68,13 @@ ) from signbank.video.models import GlossVideo -# Globals -GLOBAL_COLUMN_HEADINGS = [ - "Gloss ID", - "Gloss", - "Suggested Video key", -] # Keep synced with other scripts +# Keep synced with other scripts +GLOSS_ID_COLUMN = "Gloss ID" +GLOSS_COLUMN = "Gloss" +GLOSS_VIDEO_COLUMN = "Suggested Video key" +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] + +# Other globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") From bc48b268618c993fa91d9497914543c3a431305c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:49:10 +1100 Subject: [PATCH 184/222] help message --- bin/repair-orphaned-videos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index b60e1021..5425121d 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -26,7 +26,7 @@ ) # Positional arguments -parser.add_argument("csv_filename", help="Name of CSV file") +parser.add_argument("csv_filename", help="Name of CSV file, or '-' for STDIN") # Optional arguments parser.add_argument( @@ -130,7 +130,7 @@ def read_csv(csv_filename): f = open(csv_filename, "r") csv_dict = csv.DictReader(f) for row in csv_dict: - pprint(row) + pprint(row[GLOSS_COLUMN]) # print(dict(row)) From 8b2aa15f03f45a60355b3d06d4c56a710f2750bf Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:54:51 +1100 Subject: [PATCH 185/222] refactor --- bin/repair-orphaned-videos.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 5425121d..7842d3bf 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -123,15 +123,24 @@ def aws_cli(args_list): return output +# Returns a list of dictionaries, one for each CSV row def read_csv(csv_filename): if csv_filename == "-": f = sys.stdin.read().splitlines() else: f = open(csv_filename, "r") - csv_dict = csv.DictReader(f) - for row in csv_dict: - pprint(row[GLOSS_COLUMN]) - # print(dict(row)) + return csv.DictReader(f) + + +def process_csv(): + csv_rows = read_csv(args.csv_filename) + for csv_row in csv_rows: + gloss_id = int(csv_row[GLOSS_ID_COLUMN]) + gloss_idgloss = csv_row[GLOSS_COLUMN] + video_key = csv_row[GLOSS_VIDEO_COLUMN] + print(gloss_id) + print(gloss_idgloss) + 
print(video_key) print(f"Env: {args.env}", file=sys.stderr) @@ -140,4 +149,5 @@ def read_csv(csv_filename): print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -read_csv(args.csv_filename) + +process_csv() From 860235f84766506bfca8402dec383c43b7eec6f2 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:06:23 +1100 Subject: [PATCH 186/222] Basics working --- bin/repair-orphaned-videos.py | 70 +++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 7842d3bf..10120eea 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -19,6 +19,28 @@ from time import sleep from pprint import pprint +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth import get_user_model + +User = get_user_model() + +from signbank.dictionary.models import ( + FieldChoice, + Gloss, +) +from signbank.video.models import GlossVideo + +from django.core.exceptions import ObjectDoesNotExist + parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -49,25 +71,6 @@ ) args = parser.parse_args() -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth import get_user_model - -User = get_user_model() - -from signbank.dictionary.models import ( - Gloss, -) -from signbank.video.models import GlossVideo - # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" @@ -133,14 +136,35 @@ def read_csv(csv_filename): def process_csv(): + main_video_type = FieldChoice.objects.filter(field="video_type", english_name="main").first() + csv_rows = read_csv(args.csv_filename) for csv_row in csv_rows: - gloss_id = int(csv_row[GLOSS_ID_COLUMN]) + gloss_id = csv_row[GLOSS_ID_COLUMN] gloss_idgloss = csv_row[GLOSS_COLUMN] video_key = csv_row[GLOSS_VIDEO_COLUMN] - print(gloss_id) - print(gloss_idgloss) - print(video_key) + print(CSV_DELIMITER.join([gloss_id, gloss_idgloss, video_key])) + gloss_id = int(gloss_id) + + try: + gloss = Gloss.objects.get(id=gloss_id) + print(gloss) + except ObjectDoesNotExist as e: + print(e) + continue + + gloss_video = GlossVideo( + gloss=gloss, + dataset=gloss.dataset, + videofile=video_key, + title=video_key, + version=0, + is_public=False, + video_type=main_video_type + ) + print(gloss_video) + + print(f"Env: {args.env}", file=sys.stderr) From 81ef0cc25d9113c7d3e1b249e3734a7fe473cbc7 Mon Sep 17 
00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:08:08 +1100 Subject: [PATCH 187/222] More import cleanups --- bin/find-orphaned-videos.py | 36 +++++++++++++++++------------------ bin/repair-orphaned-videos.py | 8 ++++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/bin/find-orphaned-videos.py b/bin/find-orphaned-videos.py index c592fd77..353015b5 100755 --- a/bin/find-orphaned-videos.py +++ b/bin/find-orphaned-videos.py @@ -20,6 +20,24 @@ from uuid import uuid4 from pprint import pprint +# Magic required to allow this script to use Signbank Django classes +# This goes away if this script becomes a Django Management Command +print("Importing site-packages environment", file=sys.stderr) +print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") +from django.core.wsgi import get_wsgi_application + +get_wsgi_application() + +from django.contrib.auth import get_user_model + +User = get_user_model() + +from signbank.dictionary.models import ( + Gloss, +) + parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -45,24 +63,6 @@ ) args = parser.parse_args() -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth import get_user_model - -User = get_user_model() - -from signbank.dictionary.models import ( - Gloss, -) - # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 10120eea..1f768a5a 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -136,7 +136,9 @@ def read_csv(csv_filename): def process_csv(): - main_video_type = FieldChoice.objects.filter(field="video_type", english_name="main").first() + main_video_type = FieldChoice.objects.filter( + field="video_type", english_name="main" + ).first() csv_rows = read_csv(args.csv_filename) for csv_row in csv_rows: @@ -160,13 +162,11 @@ def process_csv(): title=video_key, version=0, is_public=False, - video_type=main_video_type + video_type=main_video_type, ) print(gloss_video) - - print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"AWSCLI: {AWSCLI}", file=sys.stderr) From 39bb18f7089354b7b28d8cee4be2955e1b44ef44 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:12:45 +1100 Subject: [PATCH 188/222] Warnings --- bin/repair-orphaned-videos.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 1f768a5a..1fec0e93 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -155,6 +155,13 @@ def process_csv(): print(e) continue + try: + GlossVideo.objects.get(videofile=video_key) + 
print(f"Error: GlossVideo already exists: {video_key}") + continue + except ObjectDoesNotExist: + pass + gloss_video = GlossVideo( gloss=gloss, dataset=gloss.dataset, @@ -166,6 +173,11 @@ def process_csv(): ) print(gloss_video) + # At this point the repair should be complete + # WARNING, it tries to save to the current storage medium, so this needs sorting out! + # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? + #gloss_video.save() + print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) From 9e6d570481146389771e510a19240f5f3d9f3217 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:16:02 +1100 Subject: [PATCH 189/222] Notes --- bin/repair-orphaned-videos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 1fec0e93..be8b23ab 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -173,8 +173,9 @@ def process_csv(): ) print(gloss_video) - # At this point the repair should be complete + # At this point we complete the repair # WARNING, it tries to save to the current storage medium, so this needs sorting out! + # save() is overridden in the GlossVideo model # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? #gloss_video.save() From f94518b376b6f6e5ec0cd99477813e3fe3aea6e6 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:18:30 +1100 Subject: [PATCH 190/222] Notes --- bin/repair-orphaned-videos.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index be8b23ab..20430b56 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -175,8 +175,10 @@ def process_csv(): # At this point we complete the repair # WARNING, it tries to save to the current storage medium, so this needs sorting out! + # We absolutely DO NOT want it to try and save! # save() is overridden in the GlossVideo model # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? + # Yeah, starting to think that's the way to go, IF postgres will allow us to do so (constraints) #gloss_video.save() From d7c3bc28c1a8ff9322b46747f9bbefa78c1a049b Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 1 Nov 2024 18:48:10 +1100 Subject: [PATCH 191/222] First success --- bin/repair-orphaned-videos.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index 20430b56..aadcbd36 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -40,6 +40,7 @@ from signbank.video.models import GlossVideo from django.core.exceptions import ObjectDoesNotExist +from django.db import models parser = argparse.ArgumentParser( @@ -172,6 +173,8 @@ def process_csv(): video_type=main_video_type, ) print(gloss_video) + # HOLY ****, this works! + gloss_video.save = models.Model.save # At this point we complete the repair # WARNING, it tries to save to the current storage medium, so this needs sorting out! @@ -179,7 +182,8 @@ def process_csv(): # save() is overridden in the GlossVideo model # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? 
# Yeah, starting to think that's the way to go, IF postgres will allow us to do so (constraints) - #gloss_video.save() + # HOLY ****, this works! + gloss_video.save(gloss_video) print(f"Env: {args.env}", file=sys.stderr) From 9684c694292e2dd487021d3d82bd1b659979386a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:17:06 +1100 Subject: [PATCH 192/222] Uses bulk_create() so that save() does not run --- bin/repair-orphaned-videos.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/bin/repair-orphaned-videos.py b/bin/repair-orphaned-videos.py index aadcbd36..590cceab 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-orphaned-videos.py @@ -158,7 +158,7 @@ def process_csv(): try: GlossVideo.objects.get(videofile=video_key) - print(f"Error: GlossVideo already exists: {video_key}") + print(f"Ignoring: GlossVideo already exists: {video_key}") continue except ObjectDoesNotExist: pass @@ -173,17 +173,12 @@ def process_csv(): video_type=main_video_type, ) print(gloss_video) - # HOLY ****, this works! - gloss_video.save = models.Model.save - # At this point we complete the repair - # WARNING, it tries to save to the current storage medium, so this needs sorting out! - # We absolutely DO NOT want it to try and save! - # save() is overridden in the GlossVideo model - # Hm, maybe we SHOULD just write to the database after all, and hope Django copes? - # Yeah, starting to think that's the way to go, IF postgres will allow us to do so (constraints) - # HOLY ****, this works! - gloss_video.save(gloss_video) + # We cannot allow the GlossVideo save() method to run, as it has side-effects including + # trying to save the video file to the current storage medium (eg. S3) + createds = GlossVideo.objects.bulk_create([gloss_video]) + if len(createds) < 1: + print(f"Error: could not create {gloss_video}") print(f"Env: {args.env}", file=sys.stderr) From cd0143a65ee3487113acfea8fee2efa27a614219 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:23:46 +1100 Subject: [PATCH 193/222] Neatening and rename --- bin/{find-orphaned-videos.py => find-fixable-orphans.py} | 0 ...{repair-orphaned-videos.py => repair-fixable-orphans.py} | 6 ++---- 2 files changed, 2 insertions(+), 4 deletions(-) rename bin/{find-orphaned-videos.py => find-fixable-orphans.py} (100%) rename bin/{repair-orphaned-videos.py => repair-fixable-orphans.py} (95%) diff --git a/bin/find-orphaned-videos.py b/bin/find-fixable-orphans.py similarity index 100% rename from bin/find-orphaned-videos.py rename to bin/find-fixable-orphans.py diff --git a/bin/repair-orphaned-videos.py b/bin/repair-fixable-orphans.py similarity index 95% rename from bin/repair-orphaned-videos.py rename to bin/repair-fixable-orphans.py index 590cceab..b99830ef 100755 --- a/bin/repair-orphaned-videos.py +++ b/bin/repair-fixable-orphans.py @@ -174,10 +174,8 @@ def process_csv(): ) print(gloss_video) # At this point we complete the repair - # We cannot allow the GlossVideo save() method to run, as it has side-effects including - # trying to save the video file to the current storage medium (eg. 
S3) - createds = GlossVideo.objects.bulk_create([gloss_video]) - if len(createds) < 1: + # We use bulk_create() because we cannot allow save() to run + if len(GlossVideo.objects.bulk_create([gloss_video])) < 1: print(f"Error: could not create {gloss_video}") From 36d251e0da290b0c1ef66314e34c265a65133eb7 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:26:48 +1100 Subject: [PATCH 194/222] Comments --- bin/find-fixable-orphans.py | 3 +++ bin/repair-fixable-orphans.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 353015b5..c649293b 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -2,6 +2,9 @@ # # This script needs to be run in a pyenv virtualenv with the Django project installed. # +# Finds orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects. +# Essentially finds one form of import error. +# # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index b99830ef..3e98bd82 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -2,6 +2,10 @@ # # This script needs to be run in a pyenv virtualenv with the Django project installed. # +# Given a CSV file containing S3 objects that can be matched back to NZSL entries. +# Updates the database to repair the NZSL entries. +# Essentially repairs one form of import error. +# # Bang line above passes '-u' to python, for unbuffered output # Permissions required: # psql - access to heroku app's postgres From 964321f9d729e515cc6e6999a0a118be746c241d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 15:06:31 +1100 Subject: [PATCH 195/222] Added S3 dumper --- bin/get-video-s3-acls.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index eb5436be..5ffbefe8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -46,6 +46,13 @@ action="store_true", help=f"Dump raw NZSL database output", ) +parser.add_argument( + "--dumps3", + default=False, + required=False, + action="store_true", + help=f"Dump raw S3 keys output", +) args = parser.parse_args() # Globals @@ -368,6 +375,11 @@ def process_keys(this_all_keys_dict): pprint(get_nzsl_raw_keys_dict()) exit() +if args.dumps3: + pprint(get_s3_bucket_raw_keys_list()) + exit() + process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) + From bd6f86d369f30e934449ed696bbc366dac01a120 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:34:44 +1100 Subject: [PATCH 196/222] Boto3 conversion of get-video-s3-acls --- bin/get-video-s3-acls.py | 86 +++++----------------------------------- 1 file changed, 9 insertions(+), 77 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5ffbefe8..5021f17c 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -16,6 +16,7 @@ from time import sleep from uuid import uuid4 from pprint import pprint +import boto3 parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. 
" @@ -33,12 +34,6 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) parser.add_argument( "--dumpnzsl", default=False, @@ -59,7 +54,6 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -81,29 +75,6 @@ def pg_cli(args_list): exit() -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - # Fake key is a hack to handle FULL JOIN def maybe_fakekey(instring): return instring if instring else FAKEKEY_PREFIX + str(uuid4()) @@ -179,20 +150,10 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + s3_resource = boto3.resource('s3') + s3_resource_bucket = s3_resource.Bucket(s3_bucket) + this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", @@ -252,26 +213,12 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): - result = aws_cli( - [ - "s3api", - "get-object-acl", - "--output", - "text", - "--query", - "Grants[*].Permission", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - acls_grants = result.stdout.strip().split("\t") - + s3_client = boto3.client("s3") + acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)["Grants"] if len(acls_grants) > 1: - if acls_grants[0] == "FULL_CONTROL" and acls_grants[1] == "READ": + if acls_grants[0]["Permission"] == "FULL_CONTROL" and acls_grants[1]["Permission"] == "READ": return "public-read" - elif acls_grants[0] == "FULL_CONTROL": + elif acls_grants[0]["Permission"] == "FULL_CONTROL": return "private" return "unknown" @@ -279,21 +226,7 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - result = aws_cli( - [ - "s3api", - "head-object", - "--output", - "text", - "--query", - "LastModified", - "--bucket", - AWS_S3_BUCKET, - "--key", - video_key, - ] - ) - return result.stdout.strip() + return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)["LastModified"] def build_csv_header(): @@ -367,7 +300,6 @@ def process_keys(this_all_keys_dict): print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) From 
4b37c9346349b63fba7b09581537acc6ba9bf47c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:48:32 +1100 Subject: [PATCH 197/222] black --- bin/get-video-s3-acls.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 5021f17c..316f8e82 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -151,9 +151,11 @@ def get_nzsl_raw_keys_dict(): def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - s3_resource = boto3.resource('s3') + s3_resource = boto3.resource("s3") s3_resource_bucket = s3_resource.Bucket(s3_bucket) - this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] + this_s3_bucket_raw_keys_list = [ + s3_object.key for s3_object in s3_resource_bucket.objects.all() + ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", @@ -214,9 +216,14 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): s3_client = boto3.client("s3") - acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)["Grants"] + acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ + "Grants" + ] if len(acls_grants) > 1: - if acls_grants[0]["Permission"] == "FULL_CONTROL" and acls_grants[1]["Permission"] == "READ": + if ( + acls_grants[0]["Permission"] == "FULL_CONTROL" + and acls_grants[1]["Permission"] == "READ" + ): return "public-read" elif acls_grants[0]["Permission"] == "FULL_CONTROL": return "private" @@ -226,7 +233,9 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)["LastModified"] + return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)[ + "LastModified" + ] def build_csv_header(): @@ -314,4 +323,3 @@ def process_keys(this_all_keys_dict): process_keys( create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) ) - From 01ce2dd579c873d4cdd1be404224f84a1d58393f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:48:48 +1100 Subject: [PATCH 198/222] Boto3 conversion: find-fixable-orphans --- bin/find-fixable-orphans.py | 50 +++++-------------------------------- 1 file changed, 6 insertions(+), 44 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index c649293b..ca36c435 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -21,6 +21,7 @@ import re from time import sleep from uuid import uuid4 +import boto3 from pprint import pprint # Magic required to allow this script to use Signbank Django classes @@ -58,12 +59,6 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) args = parser.parse_args() # Keep synced with other scripts @@ -76,7 +71,6 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -98,29 +92,6 @@ def pg_cli(args_list): exit() -def aws_cli(args_list): - # Try indefinitely - output = None - while not 
output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - # Fake key is a hack to handle FULL JOIN def maybe_fakekey(instring): return instring if instring else FAKEKEY_PREFIX + str(uuid4()) @@ -196,20 +167,12 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - result = aws_cli( - [ - "s3", - "ls", - f"s3://{s3_bucket}", - "--recursive", - ], - ) - # Separate out just the key from date, time, size, key - this_s3_bucket_raw_keys_list = [] - for line in result.stdout.split("\n"): - if line: - this_s3_bucket_raw_keys_list.append(re.split(r"\s+", line, 3)[3]) + s3_resource = boto3.resource("s3") + s3_resource_bucket = s3_resource.Bucket(s3_bucket) + this_s3_bucket_raw_keys_list = [ + s3_object.key for s3_object in s3_resource_bucket.objects.all() + ] print( f"{len(this_s3_bucket_raw_keys_list)} rows retrieved", @@ -316,7 +279,6 @@ def find_orphans(): print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) From ab294e57705d56240f08896959f06ff2992b64d8 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 4 Nov 2024 16:54:45 +1100 Subject: [PATCH 199/222] Boto3 conversion: repair-fixable-orphans.py --- bin/repair-fixable-orphans.py | 36 +---------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index 3e98bd82..144e1171 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -20,8 +20,6 @@ import csv import subprocess import argparse -from time import sleep -from pprint import pprint # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command @@ -68,12 +66,6 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) -parser.add_argument( - "--awscli", - default="/usr/local/bin/aws", - required=False, - help=f"AWS client path (default: %(default)s)", -) args = parser.parse_args() # Keep synced with other scripts @@ -86,7 +78,6 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -AWSCLI = args.awscli PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" @@ -108,29 +99,6 @@ def pg_cli(args_list): exit() -def aws_cli(args_list): - # Try indefinitely - output = None - while not output: - try: - output = subprocess.run( - [AWSCLI] + args_list, - env=os.environ, - capture_output=True, - check=True, - text=True, - ) - except subprocess.CalledProcessError as e: - print( - f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr - ) - print(e.cmd, file=sys.stderr) - print(e.stdout, file=sys.stderr) - print(e.stderr, file=sys.stderr) - sleep(1) - return output - - # Returns a list of dictionaries, one for each CSV row def read_csv(csv_filename): if csv_filename == "-": @@ 
-155,7 +123,6 @@ def process_csv(): try: gloss = Gloss.objects.get(id=gloss_id) - print(gloss) except ObjectDoesNotExist as e: print(e) continue @@ -176,6 +143,7 @@ def process_csv(): is_public=False, video_type=main_video_type, ) + print(gloss) print(gloss_video) # At this point we complete the repair # We use bulk_create() because we cannot allow save() to run @@ -185,9 +153,7 @@ def process_csv(): print(f"Env: {args.env}", file=sys.stderr) print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"AWSCLI: {AWSCLI}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) - process_csv() From 57e0b4611f196c8d6b28d484922140a2778f03f3 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:07:48 +1100 Subject: [PATCH 200/222] Added a public/published boolean column --- bin/find-fixable-orphans.py | 5 +++-- bin/repair-fixable-orphans.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index ca36c435..7f6b4b13 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -64,8 +64,9 @@ # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" +GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] # Other globals CSV_DELIMITER = "," @@ -274,7 +275,7 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, test_key])) + print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, str(gloss_public), test_key])) print(f"Env: {args.env}", file=sys.stderr) diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index 144e1171..e795d03a 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -71,8 +71,9 @@ # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" +GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] # Other globals CSV_DELIMITER = "," From 587e01c1ac83f813651463cf43398d645a816d2a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:27:55 +1100 Subject: [PATCH 201/222] message --- bin/find-fixable-orphans.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 7f6b4b13..403d23ca 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -226,6 +226,7 @@ def find_orphans(): all_keys_dict = create_all_keys_dict( get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list() ) + print("Finding fixable orphans", file=sys.stderr) print(CSV_DELIMITER.join(GLOBAL_COLUMN_HEADINGS)) From a240a3d2a9bbb5b02e4a5f3c4da624a065f049fd Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:49:20 +1100 Subject: [PATCH 202/222] comments/black --- bin/find-fixable-orphans.py | 13 +++++++++++-- bin/get-video-s3-acls.py | 2 ++ bin/repair-fixable-orphans.py | 7 ++++++- 
3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 403d23ca..31f51fe4 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -66,7 +66,12 @@ GLOSS_COLUMN = "Gloss" GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [ + GLOSS_ID_COLUMN, + GLOSS_COLUMN, + GLOSS_PUBLIC_COLUMN, + GLOSS_VIDEO_COLUMN, +] # Other globals CSV_DELIMITER = "," @@ -276,7 +281,11 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - print(CSV_DELIMITER.join([gloss_id, gloss.idgloss, str(gloss_public), test_key])) + print( + CSV_DELIMITER.join( + [gloss_id, gloss.idgloss, str(gloss_public), test_key] + ) + ) print(f"Env: {args.env}", file=sys.stderr) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 316f8e82..656de197 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -215,6 +215,7 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): + # TODO pass in a boto client instead of recreating one each time s3_client = boto3.client("s3") acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ "Grants" @@ -233,6 +234,7 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): + # TODO pass in a boto client instead of recreating one each time return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)[ "LastModified" ] diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-orphans.py index e795d03a..ce948be2 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-orphans.py @@ -73,7 +73,12 @@ GLOSS_COLUMN = "Gloss" GLOSS_PUBLIC_COLUMN = "Gloss public" GLOSS_VIDEO_COLUMN = "Suggested Video key" -GLOBAL_COLUMN_HEADINGS = [GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN] +GLOBAL_COLUMN_HEADINGS = [ + GLOSS_ID_COLUMN, + GLOSS_COLUMN, + GLOSS_PUBLIC_COLUMN, + GLOSS_VIDEO_COLUMN, +] # Other globals CSV_DELIMITER = "," From 76e81b5a5719fbe699186e22f9299ee92e470bee Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:50:26 +1100 Subject: [PATCH 203/222] Unused imports removed --- bin/find-fixable-orphans.py | 3 --- bin/get-video-s3-acls.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-orphans.py index 31f51fe4..90555cfc 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-orphans.py @@ -18,11 +18,8 @@ import sys import subprocess import argparse -import re -from time import sleep from uuid import uuid4 import boto3 -from pprint import pprint # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 656de197..e0851953 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -12,8 +12,6 @@ import sys import subprocess import argparse -import re -from time import sleep from uuid import uuid4 from pprint import pprint import boto3 From 1d2a86a197ef020df3366464747a458833ad6886 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:44:11 +1100 
Subject: [PATCH 204/222] Initial review commits/black --- bin/get-video-s3-acls.py | 48 ++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index e0851953..00b820f9 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -54,6 +54,8 @@ DATABASE_URL = os.getenv("DATABASE_URL", "") PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +S3_CLIENT = boto3.client("s3") +S3_RESOURCE = boto3.resource("s3") def pg_cli(args_list): @@ -124,7 +126,23 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") - # Hack to handle FULL JOIN + """ + Hack to handle FULL JOIN. + We are storing data rows in a dictionary, indexed by video_key. + Because we are doing a FULL JOIN on the NZSL Signbank database, + we also get rows where there are gloss entries that do not have + a corresponding video_glossvideo. + (These are erroneous and one of the reasons this script exists, + to find them.) + Consequently there is no video_key, and we cannot use it to index + the data row. + Instead, we create a fake video_key that is unique and, theoretically, + impossible for anything else to try and use. It also has a 'safe', + easily filtered prefix, which means later code can easily tell + a fake key from a real key. + Always having a key, in this way, means that code, eg. loops, + that depends on there being a dictionary key axis will not break. + """ video_key = maybe_fakekey(video_key.strip()) # This sets the initial field ordering in the all_keys dictionary row @@ -149,8 +167,7 @@ def get_nzsl_raw_keys_dict(): def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) - s3_resource = boto3.resource("s3") - s3_resource_bucket = s3_resource.Bucket(s3_bucket) + s3_resource_bucket = S3_RESOURCE.Bucket(s3_bucket) this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] @@ -172,6 +189,9 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): this_all_keys_dict = {} # Find S3 keys that are present in NZSL, or absent + # TODO This could be changed to use pop(), so that on each pass we are left + # with a smaller subset of the rows, which we can search faster. If the + # database becomes very large in future this could save a lot of processing. 
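(Aside, not part of any patch in this series: a minimal sketch of the pop() variant the TODO above is suggesting. The helper name is hypothetical, and it assumes the [key_in_nzsl, key_in_s3, *data] row layout these scripts use elsewhere; popping each matched row shrinks the NZSL dict as the S3 pass proceeds, so whatever is left afterwards is exactly the set of NZSL-only keys.)

    # Hypothetical sketch only, not part of this patch series
    def create_all_keys_dict_via_pop(nzsl_raw_keys_dict, s3_bucket_raw_keys_list):
        all_keys_dict = {}
        remaining = dict(nzsl_raw_keys_dict)  # copy, so the caller's dict is untouched
        for video_key in s3_bucket_raw_keys_list:
            dict_row = remaining.pop(video_key, None)  # match once, then discard
            if dict_row:
                all_keys_dict[video_key] = [True, True] + dict_row  # in NZSL and in S3
            else:
                all_keys_dict[video_key] = [False, True]  # in S3 only
        # Anything never popped was in NZSL but not in S3
        for video_key, dict_row in remaining.items():
            all_keys_dict[video_key] = [True, False] + dict_row
        return all_keys_dict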
for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: @@ -196,13 +216,14 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): return this_all_keys_dict -# Cases -# In S3 In NZSL Action -# Is Not Delete S3 Object -# Is Is Update ACL -# Not Is Review -# Other Review def get_recommended_action(key_in_nzsl, key_in_s3): + """ + Cases + In S3 In NZSL Action + Is Not Delete S3 Object + Is Is Update ACL + Not -- Review + """ if key_in_s3: if key_in_nzsl: return "Update ACL" @@ -213,9 +234,7 @@ def get_recommended_action(key_in_nzsl, key_in_s3): # Get S3 object's ACL def get_s3_canned_acl(video_key): - # TODO pass in a boto client instead of recreating one each time - s3_client = boto3.client("s3") - acls_grants = s3_client.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ + acls_grants = S3_CLIENT.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[ "Grants" ] if len(acls_grants) > 1: @@ -232,10 +251,7 @@ def get_s3_canned_acl(video_key): # Get S3 object's LastModified date/time def get_s3_lastmodified(video_key): - # TODO pass in a boto client instead of recreating one each time - return boto3.client("s3").head_object(Bucket=AWS_S3_BUCKET, Key=video_key)[ - "LastModified" - ] + return S3_CLIENT.head_object(Bucket=AWS_S3_BUCKET, Key=video_key)["LastModified"] def build_csv_header(): From 749bb20d3a902a5c865c537875ca55f132ab7045 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:44:34 +1100 Subject: [PATCH 205/222] Script renamings --- ...-orphans.py => find-fixable-s3-orphans.py} | 35 +++++++++++++------ ...rphans.py => repair-fixable-s3-orphans.py} | 12 +++++++ 2 files changed, 37 insertions(+), 10 deletions(-) rename bin/{find-fixable-orphans.py => find-fixable-s3-orphans.py} (88%) rename bin/{repair-fixable-orphans.py => repair-fixable-s3-orphans.py} (95%) diff --git a/bin/find-fixable-orphans.py b/bin/find-fixable-s3-orphans.py similarity index 88% rename from bin/find-fixable-orphans.py rename to bin/find-fixable-s3-orphans.py index 90555cfc..0b886714 100755 --- a/bin/find-fixable-orphans.py +++ b/bin/find-fixable-s3-orphans.py @@ -23,8 +23,6 @@ # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") from django.core.wsgi import get_wsgi_application @@ -72,11 +70,14 @@ # Other globals CSV_DELIMITER = "," -FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") PGCLI = args.pgcli AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +# Hack to handle FULL JOIN +# See get_nzsl_raw_keys_dict() +FAKEKEY_PREFIX = "this_is_not_a_key_" + def pg_cli(args_list): try: @@ -146,7 +147,23 @@ def get_nzsl_raw_keys_dict(): video_key, ] = rawl.split("|") - # Hack to handle FULL JOIN + """ + Hack to handle FULL JOIN. + We are storing data rows in a dictionary, indexed by video_key. + Because we are doing a FULL JOIN on the NZSL Signbank database, + we also get rows where there are gloss entries that do not have + a corresponding video_glossvideo. + (These are erroneous and one of the reasons this script exists, + to find them.) 
+ Consequently there is no video_key, and we cannot use it to index + the data row. + Instead, we create a fake video_key that is unique and, theoretically, + impossible for anything else to try and use. It also has a 'safe', + easily filtered prefix, which means later code can easily tell + a fake key from a real key. + Always having a key, in this way, means that code, eg. loops, + that depends on there being a dictionary key axis will not break. + """ video_key = maybe_fakekey(video_key.strip()) # This sets the initial field ordering in the all_keys dictionary row @@ -194,6 +211,9 @@ def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list): this_all_keys_dict = {} # Find S3 keys that are present in NZSL, or absent + # TODO This could be changed to use pop(), so that on each pass we are left + # with a smaller subset of the rows, which we can search faster. If the + # database becomes very large in future this could save a lot of processing. for video_key in this_s3_bucket_raw_keys_list: dict_row = this_nzsl_raw_keys_dict.get(video_key, None) if dict_row: @@ -252,19 +272,14 @@ def find_orphans(): # This Signbank record already has an S3 object, all is well continue - # Business rule - if int(gloss_id) < 8000: - continue - # The gloss_id is the only reliable retrieval key at the Signbank end gloss = Gloss.objects.get(id=gloss_id) gloss_name = gloss.idgloss.split(":")[0].strip() - video_path = gloss.get_video_path() # Skip any that already have a video path # These should have an S3 object but don't: For some reason the video never made it to S3 # These will have to have their videos reinstated (separate operation) - if len(video_path) > 0: + if gloss.glossvideo_set.exists(): continue # We try to find the orphaned S3 object, if it exists diff --git a/bin/repair-fixable-orphans.py b/bin/repair-fixable-s3-orphans.py similarity index 95% rename from bin/repair-fixable-orphans.py rename to bin/repair-fixable-s3-orphans.py index ce948be2..84648b49 100755 --- a/bin/repair-fixable-orphans.py +++ b/bin/repair-fixable-s3-orphans.py @@ -66,6 +66,13 @@ required=False, help=f"Postgres client path (default: %(default)s)", ) +parser.add_argument( + "--dryrun", + default=False, + required=False, + action="store_true", + help=f"Don't actually make any changes, just output what would happen", +) args = parser.parse_args() # Keep synced with other scripts @@ -151,6 +158,11 @@ def process_csv(): ) print(gloss) print(gloss_video) + + if args.dryrun: + print("Dry run, no changes") + continue + # At this point we complete the repair # We use bulk_create() because we cannot allow save() to run if len(GlossVideo.objects.bulk_create([gloss_video])) < 1: From 48a320755b3c444932f2185a55b95f334cfa4182 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:11:43 +1100 Subject: [PATCH 206/222] OSV ignore GHSA-rrqc-c2jx-6jgv to suppress build warnings (We have a Django upgrade in progress anyway that will address this vuln) --- .osv-detector.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.osv-detector.yml b/.osv-detector.yml index 2d4acb90..59841218 100644 --- a/.osv-detector.yml +++ b/.osv-detector.yml @@ -9,3 +9,4 @@ ignore: - GHSA-248v-346w-9cwc # Certifi removes GLOBALTRUST root certificate (https://github.com/advisories/GHSA-248v-346w-9cwc) - GHSA-g92j-qhmh-64v2 # Sentry's Python SDK unintentionally exposes environment variables to subprocesses (https://github.com/advisories/GHSA-g92j-qhmh-64v2) - GHSA-9mvj-f7w8-pvh2 # 
Bootstrap Cross-Site Scripting (XSS) vulnerability (https://github.com/advisories/GHSA-9mvj-f7w8-pvh2) + - GHSA-rrqc-c2jx-6jgv # Django allows enumeration of user e-mail addresses From 760dd8e0e5c8097e098856cc4bda2b5be4bd9d42 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:24:49 +1100 Subject: [PATCH 207/222] Do not orphan-test fake keys --- bin/find-fixable-s3-orphans.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/find-fixable-s3-orphans.py b/bin/find-fixable-s3-orphans.py index 0b886714..28494514 100755 --- a/bin/find-fixable-s3-orphans.py +++ b/bin/find-fixable-s3-orphans.py @@ -285,6 +285,8 @@ def find_orphans(): # We try to find the orphaned S3 object, if it exists # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items(): + if test_key.startswith(FAKEKEY_PREFIX): + continue if gloss_name in test_key: if str(gloss_id) in test_key: if key_nzsl_yes: From 781beddf34e91c3f8a670553486684458820e82c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:45:50 +1100 Subject: [PATCH 208/222] Use csv.writer() for get_ script --- bin/get-video-s3-acls.py | 62 +++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/bin/get-video-s3-acls.py b/bin/get-video-s3-acls.py index 00b820f9..ee6d70d8 100755 --- a/bin/get-video-s3-acls.py +++ b/bin/get-video-s3-acls.py @@ -15,6 +15,7 @@ from uuid import uuid4 from pprint import pprint import boto3 +import csv parser = argparse.ArgumentParser( description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " @@ -255,21 +256,19 @@ def get_s3_lastmodified(video_key): def build_csv_header(): - return CSV_DELIMITER.join( - [ - "Action", - "S3 Video key", - "S3 LastModified", - "S3 Expected Canned ACL", - "S3 Actual Canned ACL", - "Sbank Gloss ID", - "Sbank Video ID", - "Sbank Gloss public", - "Sbank Video public", - "Sbank Gloss", - "Sbank Gloss created at", - ] - ) + return [ + "Action", + "S3 Video key", + "S3 LastModified", + "S3 Expected Canned ACL", + "S3 Actual Canned ACL", + "Sbank Gloss ID", + "Sbank Video ID", + "Sbank Gloss public", + "Sbank Video public", + "Sbank Gloss", + "Sbank Gloss created at", + ] def build_csv_row( @@ -296,31 +295,30 @@ def build_csv_row( action = get_recommended_action(key_in_nzsl, key_in_s3) - return CSV_DELIMITER.join( - [ - action, - f"{filter_fakekey(video_key)}", - f"{lastmodified}", - f"{canned_acl_expected}", - f"{canned_acl}", - f"{gloss_id}", - f"{video_id}", - f"{gloss_public}", - f"{video_public}", - f"{gloss_idgloss}", - f"{gloss_created_at}", - ] - ) + return [ + action, + f"{filter_fakekey(video_key)}", + f"{lastmodified}", + f"{canned_acl_expected}", + f"{canned_acl}", + f"{gloss_id}", + f"{video_id}", + f"{gloss_public}", + f"{video_public}", + f"{gloss_idgloss}", + f"{gloss_created_at}", + ] # From the keys present in NZSL, get all their S3 information def process_keys(this_all_keys_dict): print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr) - print(build_csv_header()) + out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) + out.writerow(build_csv_header()) for video_key, dict_row in this_all_keys_dict.items(): - print(build_csv_row(video_key, *dict_row)) + out.writerow(build_csv_row(video_key, *dict_row)) print(f"Env: {args.env}", file=sys.stderr) From 
04c1cc986fdd89bb0bb562691c055d5670dfe333 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:57:14 +1100 Subject: [PATCH 209/222] Other scripts now using csv.writerow() also --- bin/find-fixable-s3-orphans.py | 10 ++++------ bin/repair-fixable-s3-orphans.py | 7 +++++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/find-fixable-s3-orphans.py b/bin/find-fixable-s3-orphans.py index 28494514..2be5967f 100755 --- a/bin/find-fixable-s3-orphans.py +++ b/bin/find-fixable-s3-orphans.py @@ -20,6 +20,7 @@ import argparse from uuid import uuid4 import boto3 +import csv # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command @@ -250,7 +251,8 @@ def find_orphans(): ) print("Finding fixable orphans", file=sys.stderr) - print(CSV_DELIMITER.join(GLOBAL_COLUMN_HEADINGS)) + out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) + out.writerow(GLOBAL_COLUMN_HEADINGS) # Traverse all the NZSL Signbank glosses that are missing S3 objects for video_key, [ @@ -295,11 +297,7 @@ def find_orphans(): if not key_s3_yes: print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr) continue - print( - CSV_DELIMITER.join( - [gloss_id, gloss.idgloss, str(gloss_public), test_key] - ) - ) + out.writerow([gloss_id, gloss.idgloss, str(gloss_public), test_key]) print(f"Env: {args.env}", file=sys.stderr) diff --git a/bin/repair-fixable-s3-orphans.py b/bin/repair-fixable-s3-orphans.py index 84648b49..67c7321d 100755 --- a/bin/repair-fixable-s3-orphans.py +++ b/bin/repair-fixable-s3-orphans.py @@ -17,9 +17,9 @@ import os import sys -import csv import subprocess import argparse +import csv # Magic required to allow this script to use Signbank Django classes # This goes away if this script becomes a Django Management Command @@ -127,11 +127,14 @@ def process_csv(): ).first() csv_rows = read_csv(args.csv_filename) + + out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) + for csv_row in csv_rows: gloss_id = csv_row[GLOSS_ID_COLUMN] gloss_idgloss = csv_row[GLOSS_COLUMN] video_key = csv_row[GLOSS_VIDEO_COLUMN] - print(CSV_DELIMITER.join([gloss_id, gloss_idgloss, video_key])) + out.writerow([gloss_id, gloss_idgloss, video_key]) gloss_id = int(gloss_id) try: From 87db6a24008ae4ca678e12809dce8e779443e49a Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:03:30 +1100 Subject: [PATCH 210/222] Dry run mode made default, flag changed to --commit --- bin/repair-fixable-s3-orphans.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/repair-fixable-s3-orphans.py b/bin/repair-fixable-s3-orphans.py index 67c7321d..32c76b96 100755 --- a/bin/repair-fixable-s3-orphans.py +++ b/bin/repair-fixable-s3-orphans.py @@ -67,11 +67,11 @@ help=f"Postgres client path (default: %(default)s)", ) parser.add_argument( - "--dryrun", + "--commit", default=False, required=False, action="store_true", - help=f"Don't actually make any changes, just output what would happen", + help=f"Actually make changes, instead of just outputting what would happen (default)", ) args = parser.parse_args() @@ -162,8 +162,8 @@ def process_csv(): print(gloss) print(gloss_video) - if args.dryrun: - print("Dry run, no changes") + if not args.commit: + print("Dry run, no changes (use --commit flag to make changes)") continue # At this point we complete the repair @@ -176,5 
+176,6 @@ def process_csv(): print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) print(f"PGCLI: {PGCLI}", file=sys.stderr) print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +print(f"Mode: {'Commit' if args.commit else 'Dry-run'}") process_csv() From 556e709fec8af3d56e62c766b5e423bfa4b02f2f Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:51:31 +1100 Subject: [PATCH 211/222] moved get script --- .../dictionary/management/commands}/get-video-s3-acls.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {bin => signbank/dictionary/management/commands}/get-video-s3-acls.py (100%) diff --git a/bin/get-video-s3-acls.py b/signbank/dictionary/management/commands/get-video-s3-acls.py similarity index 100% rename from bin/get-video-s3-acls.py rename to signbank/dictionary/management/commands/get-video-s3-acls.py From da0befbdd45828ff5994df3bb35a0edd4a0f5e74 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:52:02 +1100 Subject: [PATCH 212/222] rename get script for consistency --- .../commands/{get-video-s3-acls.py => get_video_s3_acls.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename signbank/dictionary/management/commands/{get-video-s3-acls.py => get_video_s3_acls.py} (100%) diff --git a/signbank/dictionary/management/commands/get-video-s3-acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py similarity index 100% rename from signbank/dictionary/management/commands/get-video-s3-acls.py rename to signbank/dictionary/management/commands/get_video_s3_acls.py From 20f8bf428347976f982eb4e37a53ddb13949e633 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:52:44 +1100 Subject: [PATCH 213/222] changed permissions on get script for consistency --- signbank/dictionary/management/commands/get_video_s3_acls.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 signbank/dictionary/management/commands/get_video_s3_acls.py diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py old mode 100755 new mode 100644 From 695b398702736635e74f977a8b8eacdca0c559aa Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:34:29 +1100 Subject: [PATCH 214/222] get_video_s3_acls -> Management Command --- .../management/commands/get_video_s3_acls.py | 107 +++++++++--------- 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py index ee6d70d8..9804759d 100644 --- a/signbank/dictionary/management/commands/get_video_s3_acls.py +++ b/signbank/dictionary/management/commands/get_video_s3_acls.py @@ -6,57 +6,25 @@ # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html # For some commands you need to run this in a venv that has all the right Python site-packages. 
-# TODO Convert this script to a Django Management Command +from django.core.management.base import BaseCommand import os import sys import subprocess -import argparse from uuid import uuid4 from pprint import pprint import boto3 import csv -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." -) -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--dumpnzsl", - default=False, - required=False, - action="store_true", - help=f"Dump raw NZSL database output", -) -parser.add_argument( - "--dumps3", - default=False, - required=False, - action="store_true", - help=f"Dump raw S3 keys output", -) -args = parser.parse_args() # Globals CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" S3_CLIENT = boto3.client("s3") S3_RESOURCE = boto3.resource("s3") +PGCLI = "/usr/bin/psql" +AWS_S3_BUCKET = "" def pg_cli(args_list): @@ -165,10 +133,10 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) +def get_s3_bucket_raw_keys_list(): + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) - s3_resource_bucket = S3_RESOURCE.Bucket(s3_bucket) + s3_resource_bucket = S3_RESOURCE.Bucket(AWS_S3_BUCKET) this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] @@ -321,19 +289,56 @@ def process_keys(this_all_keys_dict): out.writerow(build_csv_row(video_key, *dict_row)) -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +class Command(BaseCommand): + help = "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
+ + def add_arguments(self, parser): + parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", + ) + parser.add_argument( + "--pgcli", + default=PGCLI, + required=False, + help=f"Postgres client path (default: %(default)s)", + ) + parser.add_argument( + "--dumpnzsl", + default=False, + required=False, + action="store_true", + help=f"Dump raw NZSL database output", + ) + parser.add_argument( + "--dumps3", + default=False, + required=False, + action="store_true", + help=f"Dump raw S3 keys output", + ) + + def handle(self, *args, **options): + global PGCLI, AWS_S3_BUCKET + PGCLI = options["pgcli"] + AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}" + + print(f"Env: {options['env']}", file=sys.stderr) + print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) + print(f"PGCLI: {PGCLI}", file=sys.stderr) + print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -if args.dumpnzsl: - pprint(get_nzsl_raw_keys_dict()) - exit() + if options["dumpnzsl"]: + pprint(get_nzsl_raw_keys_dict()) + exit() -if args.dumps3: - pprint(get_s3_bucket_raw_keys_list()) - exit() + if options["dumps3"]: + pprint(get_s3_bucket_raw_keys_list()) + exit() -process_keys( - create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) -) + process_keys( + create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()) + ) From 4d79d329d5855ec10cfa83883e6e9f135523d81d Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:38:54 +1100 Subject: [PATCH 215/222] Comments --- .../management/commands/get_video_s3_acls.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py index 9804759d..f4273f97 100644 --- a/signbank/dictionary/management/commands/get_video_s3_acls.py +++ b/signbank/dictionary/management/commands/get_video_s3_acls.py @@ -5,7 +5,6 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. from django.core.management.base import BaseCommand import os @@ -290,8 +289,11 @@ def process_keys(this_all_keys_dict): class Command(BaseCommand): - help = "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." + help = ( + "Gets all S3 bucket video objects and recommends actions for them. " + "You must setup: (1) An AWS auth means, eg. AWS_PROFILE env var. " + "(2) Postgres access details, eg. DATABASE_URL env var." 
+    )
 
     def add_arguments(self, parser):
         parser.add_argument(
@@ -340,5 +342,7 @@ def handle(self, *args, **options):
             exit()
 
         process_keys(
-            create_all_keys_dict(get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list())
+            create_all_keys_dict(
+                get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
+            )
         )

From 4caa11a027a69c649fe4ca31d2216cfced30958b Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:52:06 +1100
Subject: [PATCH 216/222] Comments

---
 signbank/dictionary/management/commands/get_video_s3_acls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py
index f4273f97..fa23ebd7 100644
--- a/signbank/dictionary/management/commands/get_video_s3_acls.py
+++ b/signbank/dictionary/management/commands/get_video_s3_acls.py
@@ -188,8 +188,8 @@ def get_recommended_action(key_in_nzsl, key_in_s3):
     """
     Cases
     In S3   In NZSL   Action
-    Is      Not       Delete S3 Object
     Is      Is        Update ACL
+    Is      Not       Delete S3 Object
     Not     --        Review
     """
     if key_in_s3:

From 75e82cfb8f47974ff94c50adbc7564de168980e0 Mon Sep 17 00:00:00 2001
From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:55:15 +1100
Subject: [PATCH 217/222] Comments

---
 signbank/dictionary/management/commands/get_video_s3_acls.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py
index fa23ebd7..51396c01 100644
--- a/signbank/dictionary/management/commands/get_video_s3_acls.py
+++ b/signbank/dictionary/management/commands/get_video_s3_acls.py
@@ -290,7 +290,7 @@ class Command(BaseCommand):
     help = (
-        "Gets all S3 bucket video objects and recommends actions for them. "
+        "Get all S3 bucket video objects and recommend actions for them. "
         "You must setup: (1) An AWS auth means, eg. AWS_PROFILE env var. "
         "(2) Postgres access details, eg. DATABASE_URL env var."
) From 967daaf13c0de995b56c8ea5cb83a17b2e47491c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:23:36 +1100 Subject: [PATCH 218/222] Moved remaining commands --- .../dictionary/management/commands}/find-fixable-s3-orphans.py | 0 .../dictionary/management/commands}/repair-fixable-s3-orphans.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {bin => signbank/dictionary/management/commands}/find-fixable-s3-orphans.py (100%) rename {bin => signbank/dictionary/management/commands}/repair-fixable-s3-orphans.py (100%) diff --git a/bin/find-fixable-s3-orphans.py b/signbank/dictionary/management/commands/find-fixable-s3-orphans.py similarity index 100% rename from bin/find-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/find-fixable-s3-orphans.py diff --git a/bin/repair-fixable-s3-orphans.py b/signbank/dictionary/management/commands/repair-fixable-s3-orphans.py similarity index 100% rename from bin/repair-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/repair-fixable-s3-orphans.py From 2ae11cd46093b163ffbe45e93bfb4394e5eab79c Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:24:48 +1100 Subject: [PATCH 219/222] Renamed --- ...-orphans.py => find_fixable_s3_orphans.py} | 73 +++++++++---------- ...rphans.py => repair_fixable_s3_orphans.py} | 0 2 files changed, 34 insertions(+), 39 deletions(-) rename signbank/dictionary/management/commands/{find-fixable-s3-orphans.py => find_fixable_s3_orphans.py} (84%) rename signbank/dictionary/management/commands/{repair-fixable-s3-orphans.py => repair_fixable_s3_orphans.py} (100%) diff --git a/signbank/dictionary/management/commands/find-fixable-s3-orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py similarity index 84% rename from signbank/dictionary/management/commands/find-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/find_fixable_s3_orphans.py index 2be5967f..308ac5e2 100755 --- a/signbank/dictionary/management/commands/find-fixable-s3-orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -1,7 +1,5 @@ #!/usr/bin/env -S python3 -u # -# This script needs to be run in a pyenv virtualenv with the Django project installed. -# # Finds orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects. # Essentially finds one form of import error. # @@ -11,24 +9,15 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. 
-# TODO Convert this script to a Django Management Command +from django.core.management.base import BaseCommand import os import sys import subprocess -import argparse from uuid import uuid4 import boto3 import csv -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() from django.contrib.auth import get_user_model @@ -39,23 +28,6 @@ ) -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." -) -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -args = parser.parse_args() # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" @@ -72,8 +44,8 @@ # Other globals CSV_DELIMITER = "," DATABASE_URL = os.getenv("DATABASE_URL", "") -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +PGCLI = "/usr/bin/psql" +AWS_S3_BUCKET = "" # Hack to handle FULL JOIN # See get_nzsl_raw_keys_dict() @@ -186,11 +158,11 @@ def get_nzsl_raw_keys_dict(): # Get all keys from AWS S3 -def get_s3_bucket_raw_keys_list(s3_bucket=AWS_S3_BUCKET): - print(f"Getting raw AWS S3 keys recursively ({s3_bucket}) ...", file=sys.stderr) +def get_s3_bucket_raw_keys_list(): + print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr) s3_resource = boto3.resource("s3") - s3_resource_bucket = s3_resource.Bucket(s3_bucket) + s3_resource_bucket = s3_resource.Bucket(AWS_S3_BUCKET) this_s3_bucket_raw_keys_list = [ s3_object.key for s3_object in s3_resource_bucket.objects.all() ] @@ -300,9 +272,32 @@ def find_orphans(): out.writerow([gloss_id, gloss.idgloss, str(gloss_public), test_key]) -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) +class Command(BaseCommand): + help = ( "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." 
) + + def add_arguments(self, parser): + parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", + ) + parser.add_argument( + "--pgcli", + default=PGCLI, + required=False, + help=f"Postgres client path (default: %(default)s)", + ) + + def handle(self, *args, **options): + global PGCLI, AWS_S3_BUCKET + PGCLI = options["pgcli"] + AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}" + + print(f"Env: {options['env']}", file=sys.stderr) + print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) + print(f"PGCLI: {PGCLI}", file=sys.stderr) + print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -find_orphans() + find_orphans() diff --git a/signbank/dictionary/management/commands/repair-fixable-s3-orphans.py b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py similarity index 100% rename from signbank/dictionary/management/commands/repair-fixable-s3-orphans.py rename to signbank/dictionary/management/commands/repair_fixable_s3_orphans.py From cbe56c8465684f54d4cf27426a74486a7a4477c4 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:25:17 +1100 Subject: [PATCH 220/222] black --- .../management/commands/find_fixable_s3_orphans.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py index 308ac5e2..f11df71c 100755 --- a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -28,7 +28,6 @@ ) - # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" GLOSS_COLUMN = "Gloss" @@ -273,8 +272,10 @@ def find_orphans(): class Command(BaseCommand): - help = ( "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." ) + help = ( + "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " + "Postgres access details, eg. DATABASE_URL env var." + ) def add_arguments(self, parser): parser.add_argument( From 1fb9978d26f4b345cec343fd4a46d3e51ea5ec41 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:31:05 +1100 Subject: [PATCH 221/222] find_fixable_s3_orphans.py -> Management Command --- .../management/commands/find_fixable_s3_orphans.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py index f11df71c..081e9622 100755 --- a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -17,15 +17,7 @@ from uuid import uuid4 import boto3 import csv - - -from django.contrib.auth import get_user_model - -User = get_user_model() - -from signbank.dictionary.models import ( - Gloss, -) +from signbank.dictionary.models import Gloss # Keep synced with other scripts @@ -273,6 +265,7 @@ def find_orphans(): class Command(BaseCommand): help = ( + "Find orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects. " "You must setup: An AWS auth means, eg. AWS_PROFILE env var. " "Postgres access details, eg. DATABASE_URL env var." 
) From 4f1934a770aa0dafb2afa1bbc98228ae9d1809e5 Mon Sep 17 00:00:00 2001 From: jonholdsworth <82071930+jonholdsworth@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:02:02 +1100 Subject: [PATCH 222/222] black and cleanups --- .../commands/find_fixable_s3_orphans.py | 5 +- .../commands/repair_fixable_s3_orphans.py | 119 +++++++++--------- 2 files changed, 58 insertions(+), 66 deletions(-) diff --git a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py index 081e9622..6fcc73c2 100755 --- a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py @@ -34,14 +34,11 @@ # Other globals CSV_DELIMITER = "," +FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") PGCLI = "/usr/bin/psql" AWS_S3_BUCKET = "" -# Hack to handle FULL JOIN -# See get_nzsl_raw_keys_dict() -FAKEKEY_PREFIX = "this_is_not_a_key_" - def pg_cli(args_list): try: diff --git a/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py index 32c76b96..06085051 100755 --- a/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py +++ b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py @@ -1,7 +1,5 @@ #!/usr/bin/env -S python3 -u # -# This script needs to be run in a pyenv virtualenv with the Django project installed. -# # Given a CSV file containing S3 objects that can be matched back to NZSL entries. # Updates the database to repair the NZSL entries. # Essentially repairs one form of import error. @@ -12,68 +10,19 @@ # aws s3 - NZSL IAM access # s3:GetObjectAcl permissions or READ_ACP access to the object # https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html -# For some commands you need to run this in a venv that has all the right Python site-packages. -# TODO Convert this script to a Django Management Command +from django.core.management.base import BaseCommand import os import sys import subprocess -import argparse import csv - -# Magic required to allow this script to use Signbank Django classes -# This goes away if this script becomes a Django Management Command -print("Importing site-packages environment", file=sys.stderr) -print(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), file=sys.stderr) -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "signbank.settings.development") -from django.core.wsgi import get_wsgi_application - -get_wsgi_application() - -from django.contrib.auth import get_user_model - -User = get_user_model() - from signbank.dictionary.models import ( FieldChoice, Gloss, ) from signbank.video.models import GlossVideo - from django.core.exceptions import ObjectDoesNotExist -from django.db import models - - -parser = argparse.ArgumentParser( - description="You must setup: An AWS auth means, eg. AWS_PROFILE env var. " - "Postgres access details, eg. DATABASE_URL env var." 
-) -# Positional arguments -parser.add_argument("csv_filename", help="Name of CSV file, or '-' for STDIN") - -# Optional arguments -parser.add_argument( - "--env", - default="uat", - required=False, - help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", -) -parser.add_argument( - "--pgcli", - default="/usr/bin/psql", - required=False, - help=f"Postgres client path (default: %(default)s)", -) -parser.add_argument( - "--commit", - default=False, - required=False, - action="store_true", - help=f"Actually make changes, instead of just outputting what would happen (default)", -) -args = parser.parse_args() # Keep synced with other scripts GLOSS_ID_COLUMN = "Gloss ID" @@ -91,8 +40,10 @@ CSV_DELIMITER = "," FAKEKEY_PREFIX = "this_is_not_a_key_" DATABASE_URL = os.getenv("DATABASE_URL", "") -PGCLI = args.pgcli -AWS_S3_BUCKET = f"nzsl-signbank-media-{args.env}" +PGCLI = "/usr/bin/psql" +AWS_S3_BUCKET = "" +DO_COMMIT = False +CSV_INPUT_FILENAME = "-" def pg_cli(args_list): @@ -126,7 +77,7 @@ def process_csv(): field="video_type", english_name="main" ).first() - csv_rows = read_csv(args.csv_filename) + csv_rows = read_csv(CSV_INPUT_FILENAME) out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE) @@ -162,7 +113,7 @@ def process_csv(): print(gloss) print(gloss_video) - if not args.commit: + if not DO_COMMIT: print("Dry run, no changes (use --commit flag to make changes)") continue @@ -172,10 +123,54 @@ def process_csv(): print(f"Error: could not create {gloss_video}") -print(f"Env: {args.env}", file=sys.stderr) -print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) -print(f"PGCLI: {PGCLI}", file=sys.stderr) -print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) -print(f"Mode: {'Commit' if args.commit else 'Dry-run'}") +class Command(BaseCommand): + help = ( + f"Given a CSV file containing S3 objects that can be matched back to NZSL entries: " + f"Update the database to repair the NZSL entries. " + f"CSV Column headings {GLOBAL_COLUMN_HEADINGS}. " + f"You must have setup: An AWS auth means, eg. AWS_PROFILE env var. " + f"Postgres access details, eg. DATABASE_URL env var." + ) + + def add_arguments(self, parser): + # Positional arguments + parser.add_argument( + "csv_filename", help="Name of CSV input file, or '-' for STDIN" + ) + + # Optional arguments + parser.add_argument( + "--env", + default="uat", + required=False, + help="Environment to run against, eg 'production, 'uat', etc (default: '%(default)s')", + ) + parser.add_argument( + "--pgcli", + default="/usr/bin/psql", + required=False, + help=f"Postgres client path (default: %(default)s)", + ) + parser.add_argument( + "--commit", + default=DO_COMMIT, + required=False, + action="store_true", + help=f"Actually make changes, instead of just outputting what would happen (default)", + ) -process_csv() + def handle(self, *args, **options): + global PGCLI, AWS_S3_BUCKET, CSV_INPUT_FILENAME, DO_COMMIT + PGCLI = options["pgcli"] + AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}" + CSV_INPUT_FILENAME = options["csv_filename"] + DO_COMMIT = options["commit"] + + print(f"Env: {options['env']}", file=sys.stderr) + print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr) + print(f"PGCLI: {PGCLI}", file=sys.stderr) + print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr) + print(f"Input file: {options['csv_filename']}", file=sys.stderr) + print(f"Mode: {'Commit' if DO_COMMIT else 'Dry-run'}", file=sys.stderr) + + process_csv()
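(Aside: for reference, a hypothetical end-to-end run of the three management commands as they stand at the end of this series. Command names, flags, and the dry-run default are as defined in the patches above; the project's manage.py entry point, the file names, and the redirections are illustrative assumptions. DATABASE_URL and AWS_PROFILE must be set; status messages go to stderr and CSV rows to stdout, so the redirects capture only the CSV.)

    python manage.py get_video_s3_acls --env production > s3-acls.csv
    python manage.py find_fixable_s3_orphans --env production > orphans.csv
    python manage.py repair_fixable_s3_orphans orphans.csv --env production           # dry run (the default)
    python manage.py repair_fixable_s3_orphans orphans.csv --env production --commit  # actually repair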