diff --git a/.osv-detector.yml b/.osv-detector.yml
index 794fb52a..59841218 100644
--- a/.osv-detector.yml
+++ b/.osv-detector.yml
@@ -6,3 +6,7 @@ ignore:
 - GHSA-257q-pv89-v3xv # GHSA says affected versions are jQuery v.2.2.0 until v.3.5.0
 - GHSA-vm8q-m57g-pff3
 - GHSA-w3h3-4rj7-4ph4
+- GHSA-248v-346w-9cwc # Certifi removes GLOBALTRUST root certificate (https://github.com/advisories/GHSA-248v-346w-9cwc)
+- GHSA-g92j-qhmh-64v2 # Sentry's Python SDK unintentionally exposes environment variables to subprocesses (https://github.com/advisories/GHSA-g92j-qhmh-64v2)
+- GHSA-9mvj-f7w8-pvh2 # Bootstrap Cross-Site Scripting (XSS) vulnerability (https://github.com/advisories/GHSA-9mvj-f7w8-pvh2)
+- GHSA-rrqc-c2jx-6jgv # Django allows enumeration of user e-mail addresses
diff --git a/signbank/dictionary/management/commands/find_fixable_s3_orphans.py b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py
new file mode 100755
index 00000000..6fcc73c2
--- /dev/null
+++ b/signbank/dictionary/management/commands/find_fixable_s3_orphans.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env -S python3 -u
+#
+# Finds orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects.
+# Essentially finds one form of import error.
+#
+# Bang line above passes '-u' to python, for unbuffered output
+# Permissions required:
+#   psql - access to heroku app's postgres
+#   aws s3 - NZSL IAM access
+#     s3:GetObjectAcl permissions or READ_ACP access to the object
+#     https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html
+
+from django.core.management.base import BaseCommand
+import os
+import sys
+import subprocess
+from uuid import uuid4
+import boto3
+import csv
+from signbank.dictionary.models import Gloss
+
+
+# Keep synced with other scripts
+GLOSS_ID_COLUMN = "Gloss ID"
+GLOSS_COLUMN = "Gloss"
+GLOSS_PUBLIC_COLUMN = "Gloss public"
+GLOSS_VIDEO_COLUMN = "Suggested Video key"
+GLOBAL_COLUMN_HEADINGS = [
+    GLOSS_ID_COLUMN,
+    GLOSS_COLUMN,
+    GLOSS_PUBLIC_COLUMN,
+    GLOSS_VIDEO_COLUMN,
+]
+
+# Other globals
+CSV_DELIMITER = ","
+FAKEKEY_PREFIX = "this_is_not_a_key_"
+DATABASE_URL = os.getenv("DATABASE_URL", "")
+PGCLI = "/usr/bin/psql"
+AWS_S3_BUCKET = ""
+
+
+def pg_cli(args_list):
+    try:
+        return subprocess.run(
+            [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"],
+            env=os.environ,
+            capture_output=True,
+            check=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr)
+        print(e.cmd, file=sys.stderr)
+        print(e.stdout, file=sys.stderr)
+        print(e.stderr, file=sys.stderr)
+        sys.exit(1)
+
+
+# Fake key is a hack to handle FULL JOIN
+def maybe_fakekey(instring):
+    return instring if instring else FAKEKEY_PREFIX + str(uuid4())
+
+
+def filter_fakekey(instring):
+    return "" if instring.startswith(FAKEKEY_PREFIX) else instring
+
+
+# Get the video files info from NZSL Signbank
+def get_nzsl_raw_keys_dict():
+    print(
+        "Getting raw list of video file info from NZSL Signbank ...",
+        file=sys.stderr,
+    )
+    this_nzsl_raw_keys_dict = {}
+    # Column renaming is for readability
+    # Special delimiter because columns might contain commas
+    result = pg_cli(
+        [
+            "COPY ("
+            "SELECT "
+            "dg.id AS gloss_id, "
+            "dg.idgloss AS gloss_idgloss, "
+            "dg.created_at AS gloss_created_at, "
+            "dg.published AS gloss_public, "
+            "vg.is_public AS video_public, "
+            "vg.id AS video_id, "
+            "vg.videofile AS video_key "
+            "FROM dictionary_gloss AS dg "
+            "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id"
+            ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')",
+        ]
+    )
+
+    # Separate the NZSL db columns
+    # Write them to a dictionary, so we can do fast operations
+    for rawl in result.stdout.split("\n"):
+        rawl = rawl.strip()
+        if not rawl:
+            continue
+        [
+            gloss_id,
+            gloss_idgloss,
+            gloss_created_at,
+            gloss_public,
+            video_public,
+            video_id,
+            video_key,
+        ] = rawl.split("|")
+
+        """
+        Hack to handle FULL JOIN.
+        We are storing data rows in a dictionary, indexed by video_key.
+        Because we are doing a FULL JOIN on the NZSL Signbank database,
+        we also get rows where there are gloss entries that do not have
+        a corresponding video_glossvideo.
+        (These are erroneous and one of the reasons this script exists,
+        to find them.)
+        Consequently there is no video_key, and we cannot use it to index
+        the data row.
+        Instead, we create a fake video_key that is unique and, theoretically,
+        impossible for anything else to try and use. It also has a 'safe',
+        easily filtered prefix, which means later code can easily tell
+        a fake key from a real key.
+        Always having a key, in this way, means that code, eg. loops,
+        that depends on there being a dictionary key axis will not break.
+        """
+        video_key = maybe_fakekey(video_key.strip())
+
+        # This sets the initial field ordering in the all_keys dictionary row
+        this_nzsl_raw_keys_dict[video_key] = [
+            gloss_idgloss.replace(CSV_DELIMITER, ""),
+            gloss_created_at,
+            gloss_id,
+            video_id,
+            gloss_public.lower() == "t",
+            video_public.lower() == "t",
+        ]
+
+    print(
+        f"{len(this_nzsl_raw_keys_dict)} rows retrieved",
+        file=sys.stderr,
+    )
+
+    return this_nzsl_raw_keys_dict
+
+
+# Get all keys from AWS S3
+def get_s3_bucket_raw_keys_list():
+    print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr)
+
+    s3_resource = boto3.resource("s3")
+    s3_resource_bucket = s3_resource.Bucket(AWS_S3_BUCKET)
+    this_s3_bucket_raw_keys_list = [
+        s3_object.key for s3_object in s3_resource_bucket.objects.all()
+    ]
+
+    print(
+        f"{len(this_s3_bucket_raw_keys_list)} rows retrieved",
+        file=sys.stderr,
+    )
+
+    return this_s3_bucket_raw_keys_list
+
+
+# Get the keys present and absent across NZSL Signbank and S3, to dictionary
+def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list):
+    print(
+        "Getting keys present and absent across NZSL Signbank and S3 ...",
+        file=sys.stderr,
+    )
+    this_all_keys_dict = {}
+
+    # Find S3 keys that are present in NZSL, or absent
+    # TODO This could be changed to use pop(), so that on each pass we are left
+    # with a smaller subset of the rows, which we can search faster. If the
+    # database becomes very large in future this could save a lot of processing.
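+    # One possible shape for that pop() idea, left here as an untested sketch
+    # ('remaining' is a name invented purely for the illustration):
+    #
+    #     remaining = dict(this_nzsl_raw_keys_dict)
+    #     for video_key in this_s3_bucket_raw_keys_list:
+    #         dict_row = remaining.pop(video_key, None)
+    #         ...
+    #
+    # After the loop, 'remaining' would hold exactly the NZSL keys absent from
+    # S3, replacing the linear 'not in' scan in the second loop below.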
+    for video_key in this_s3_bucket_raw_keys_list:
+        dict_row = this_nzsl_raw_keys_dict.get(video_key, None)
+        if dict_row:
+            # NZSL glossvideo record for this S3 key
+            this_all_keys_dict[video_key] = [
+                True,  # NZSL PRESENT
+                True,  # S3 PRESENT
+            ] + dict_row
+        else:
+            # S3 key with no corresponding NZSL glossvideo record
+            this_all_keys_dict[video_key] = [
+                False,  # NZSL Absent
+                True,   # S3 PRESENT
+            ] + [""] * 6
+
+    # Find NZSL keys that are absent from S3 (present in both handled above)
+    for video_key, dict_row in this_nzsl_raw_keys_dict.items():
+        if video_key not in this_s3_bucket_raw_keys_list:
+            # gloss/glossvideo record with no corresponding S3 key
+            # Either:
+            #   video_key is real, but the S3 object is missing
+            #   video_key is fake (to handle the FULL JOIN) and this gloss/glossvideo never had an S3 object
+            this_all_keys_dict[video_key] = [
+                True,   # NZSL PRESENT
+                False,  # S3 Absent
+            ] + dict_row
+
+    return this_all_keys_dict
+
+
+def find_orphans():
+    all_keys_dict = create_all_keys_dict(
+        get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
+    )
+    print("Finding fixable orphans", file=sys.stderr)
+
+    out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE)
+    out.writerow(GLOBAL_COLUMN_HEADINGS)
+
+    # Traverse all the NZSL Signbank glosses that are missing S3 objects
+    for video_key, [
+        key_in_nzsl,
+        key_in_s3,
+        gloss_idgloss,
+        gloss_created_at,
+        gloss_id,
+        video_id,
+        gloss_public,
+        video_public,
+    ] in all_keys_dict.items():
+
+        if not key_in_nzsl:
+            # This is an S3 object, not a Signbank record
+            continue
+
+        if key_in_s3:
+            # This Signbank record already has an S3 object, all is well
+            continue
+
+        # The gloss_id is the only reliable retrieval key at the Signbank end
+        gloss = Gloss.objects.get(id=gloss_id)
+        gloss_name = gloss.idgloss.split(":")[0].strip()
+
+        # Skip any that already have a video path
+        # These should have an S3 object but don't: for some reason the video never made it to S3
+        # These will have to have their videos reinstated (separate operation)
+        if gloss.glossvideo_set.exists():
+            continue
+
+        # We try to find the orphaned S3 object, if it exists
+        # TODO We could improve on brute-force by installing new libraries eg. rapidfuzz
+        for test_key, [key_nzsl_yes, key_s3_yes, *_] in all_keys_dict.items():
+            if test_key.startswith(FAKEKEY_PREFIX):
+                continue
+            if gloss_name in test_key and str(gloss_id) in test_key:
+                if key_nzsl_yes:
+                    print(f"Anomaly (in NZSL): {gloss.idgloss}", file=sys.stderr)
+                    continue
+                if not key_s3_yes:
+                    print(f"Anomaly (not in S3): {gloss.idgloss}", file=sys.stderr)
+                    continue
+                out.writerow([gloss_id, gloss.idgloss, str(gloss_public), test_key])
+
+
+class Command(BaseCommand):
+    help = (
+        "Find orphaned S3 objects that can be matched back to NZSL entries that are missing S3 objects. "
+        "You must set up: (1) A means of AWS auth, eg. the AWS_PROFILE env var. "
+        "(2) Postgres access details, eg. the DATABASE_URL env var."
+    )
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--env",
+            default="uat",
+            required=False,
+            help="Environment to run against, eg. 'production', 'uat', etc (default: '%(default)s')",
+        )
+        parser.add_argument(
+            "--pgcli",
+            default=PGCLI,
+            required=False,
+            help="Postgres client path (default: %(default)s)",
+        )
+
+    def handle(self, *args, **options):
+        global PGCLI, AWS_S3_BUCKET
+        PGCLI = options["pgcli"]
+        AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}"
+
+        print(f"Env: {options['env']}", file=sys.stderr)
+        print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr)
+        print(f"PGCLI: {PGCLI}", file=sys.stderr)
+        print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr)
+
+        find_orphans()
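
The finder's CSV columns (GLOBAL_COLUMN_HEADINGS) are deliberately kept in sync with the repair command below, which reads '-' as stdin, so the two commands can be chained. A minimal driver for that chaining is sketched here; the helper itself is hypothetical (not part of this change), and it assumes DATABASE_URL and AWS auth are already configured:

    # run_orphan_pipeline.py (hypothetical helper, untested sketch)
    import io
    import sys
    from contextlib import redirect_stdout
    from django.core.management import call_command

    buf = io.StringIO()
    with redirect_stdout(buf):  # both commands write their CSV to stdout
        call_command("find_fixable_s3_orphans", env="uat")

    buf.seek(0)
    old_stdin, sys.stdin = sys.stdin, buf  # repair command reads '-' as stdin
    try:
        # Dry run by default; pass commit=True to actually write
        call_command("repair_fixable_s3_orphans", "-", env="uat")
    finally:
        sys.stdin = old_stdin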
diff --git a/signbank/dictionary/management/commands/get_video_s3_acls.py b/signbank/dictionary/management/commands/get_video_s3_acls.py
new file mode 100644
index 00000000..51396c01
--- /dev/null
+++ b/signbank/dictionary/management/commands/get_video_s3_acls.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env -S python3 -u
+# Bang line above passes '-u' to python, for unbuffered output
+# Permissions required:
+#   psql - access to heroku app's postgres
+#   aws s3 - NZSL IAM access
+#     s3:GetObjectAcl permissions or READ_ACP access to the object
+#     https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html
+
+from django.core.management.base import BaseCommand
+import os
+import sys
+import subprocess
+from uuid import uuid4
+from pprint import pprint
+import boto3
+import csv
+
+
+# Globals
+CSV_DELIMITER = ","
+FAKEKEY_PREFIX = "this_is_not_a_key_"
+DATABASE_URL = os.getenv("DATABASE_URL", "")
+S3_CLIENT = boto3.client("s3")
+S3_RESOURCE = boto3.resource("s3")
+PGCLI = "/usr/bin/psql"
+AWS_S3_BUCKET = ""
+
+
+def pg_cli(args_list):
+    try:
+        return subprocess.run(
+            [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"],
+            env=os.environ,
+            capture_output=True,
+            check=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr)
+        print(e.cmd, file=sys.stderr)
+        print(e.stdout, file=sys.stderr)
+        print(e.stderr, file=sys.stderr)
+        sys.exit(1)
+
+
+# Fake key is a hack to handle FULL JOIN
+def maybe_fakekey(instring):
+    return instring if instring else FAKEKEY_PREFIX + str(uuid4())
+
+
+def filter_fakekey(instring):
+    return "" if instring.startswith(FAKEKEY_PREFIX) else instring
+
+
+# Get the video files info from NZSL Signbank
+def get_nzsl_raw_keys_dict():
+    print(
+        "Getting raw list of video file info from NZSL Signbank ...",
+        file=sys.stderr,
+    )
+    this_nzsl_raw_keys_dict = {}
+    # Column renaming is for readability
+    # Special delimiter because columns might contain commas
+    result = pg_cli(
+        [
+            "COPY ("
+            "SELECT "
+            "dg.id AS gloss_id, "
+            "dg.idgloss AS gloss_idgloss, "
+            "dg.created_at AS gloss_created_at, "
+            "dg.published AS gloss_public, "
+            "vg.is_public AS video_public, "
+            "vg.id AS video_id, "
+            "vg.videofile AS video_key "
+            "FROM dictionary_gloss AS dg "
+            "FULL JOIN video_glossvideo AS vg ON vg.gloss_id = dg.id"
+            ") TO STDOUT WITH (FORMAT CSV, DELIMITER '|')",
+        ]
+    )
+
+    # Separate the NZSL db columns
+    # Write them to a dictionary, so we can do fast operations
+    for rawl in result.stdout.split("\n"):
+        rawl = rawl.strip()
+        if not rawl:
+            continue
+        [
+            gloss_id,
+            gloss_idgloss,
+            gloss_created_at,
+            gloss_public,
+            video_public,
+            video_id,
+            video_key,
+        ] = rawl.split("|")
+
+        """
+        Hack to handle FULL JOIN.
+        We are storing data rows in a dictionary, indexed by video_key.
+        Because we are doing a FULL JOIN on the NZSL Signbank database,
+        we also get rows where there are gloss entries that do not have
+        a corresponding video_glossvideo.
+        (These are erroneous and one of the reasons this script exists,
+        to find them.)
+        Consequently there is no video_key, and we cannot use it to index
+        the data row.
+        Instead, we create a fake video_key that is unique and, theoretically,
+        impossible for anything else to try and use. It also has a 'safe',
+        easily filtered prefix, which means later code can easily tell
+        a fake key from a real key.
+        Always having a key, in this way, means that code, eg. loops,
+        that depends on there being a dictionary key axis will not break.
+        """
+        video_key = maybe_fakekey(video_key.strip())
+
+        # This sets the initial field ordering in the all_keys dictionary row
+        this_nzsl_raw_keys_dict[video_key] = [
+            gloss_idgloss.replace(CSV_DELIMITER, ""),
+            gloss_created_at,
+            gloss_id,
+            video_id,
+            gloss_public.lower() == "t",
+            video_public.lower() == "t",
+        ]
+
+    print(
+        f"{len(this_nzsl_raw_keys_dict)} rows retrieved",
+        file=sys.stderr,
+    )
+
+    return this_nzsl_raw_keys_dict
+
+
+# Get all keys from AWS S3
+def get_s3_bucket_raw_keys_list():
+    print(f"Getting raw AWS S3 keys recursively ({AWS_S3_BUCKET}) ...", file=sys.stderr)
+
+    s3_resource_bucket = S3_RESOURCE.Bucket(AWS_S3_BUCKET)
+    this_s3_bucket_raw_keys_list = [
+        s3_object.key for s3_object in s3_resource_bucket.objects.all()
+    ]
+
+    print(
+        f"{len(this_s3_bucket_raw_keys_list)} rows retrieved",
+        file=sys.stderr,
+    )
+
+    return this_s3_bucket_raw_keys_list
+
+
+# Get the keys present and absent across NZSL Signbank and S3, to dictionary
+def create_all_keys_dict(this_nzsl_raw_keys_dict, this_s3_bucket_raw_keys_list):
+    print(
+        "Getting keys present and absent across NZSL Signbank and S3 ...",
+        file=sys.stderr,
+    )
+    this_all_keys_dict = {}
+
+    # Find S3 keys that are present in NZSL, or absent
+    # TODO This could be changed to use pop(), so that on each pass we are left
+    # with a smaller subset of the rows, which we can search faster. If the
+    # database becomes very large in future this could save a lot of processing.
+    for video_key in this_s3_bucket_raw_keys_list:
+        dict_row = this_nzsl_raw_keys_dict.get(video_key, None)
+        if dict_row:
+            this_all_keys_dict[video_key] = [
+                True,  # NZSL PRESENT
+                True,  # S3 PRESENT
+            ] + dict_row
+        else:
+            this_all_keys_dict[video_key] = [
+                False,  # NZSL Absent
+                True,   # S3 PRESENT
+            ] + [""] * 6
+
+    # Find NZSL keys that are absent from S3 (present handled above)
+    for video_key, dict_row in this_nzsl_raw_keys_dict.items():
+        if video_key not in this_s3_bucket_raw_keys_list:
+            this_all_keys_dict[video_key] = [
+                True,   # NZSL PRESENT
+                False,  # S3 Absent
+            ] + dict_row
+
+    return this_all_keys_dict
+
+
+def get_recommended_action(key_in_nzsl, key_in_s3):
+    """
+    Cases:
+        In S3   In NZSL     Action
+        Is      Is          Update ACL
+        Is      Not         Delete S3 Object
+        Not     --          Review
+    """
+    if key_in_s3:
+        if key_in_nzsl:
+            return "Update ACL"
+        else:
+            return "Delete S3 Object"
+    return "Review"
+
+
+# Get S3 object's ACL
+def get_s3_canned_acl(video_key):
+    acls_grants = S3_CLIENT.get_object_acl(Bucket=AWS_S3_BUCKET, Key=video_key)[
+        "Grants"
+    ]
+    if len(acls_grants) > 1:
+        if (
+            acls_grants[0]["Permission"] == "FULL_CONTROL"
+            and acls_grants[1]["Permission"] == "READ"
+        ):
+            return "public-read"
+    elif acls_grants[0]["Permission"] == "FULL_CONTROL":
+        return "private"
+
+    return "unknown"
+
+
+# Get S3 object's LastModified date/time
+def get_s3_lastmodified(video_key):
+    return S3_CLIENT.head_object(Bucket=AWS_S3_BUCKET, Key=video_key)["LastModified"]
+
+
+def build_csv_header():
+    return [
+        "Action",
+        "S3 Video key",
+        "S3 LastModified",
+        "S3 Expected Canned ACL",
+        "S3 Actual Canned ACL",
+        "Sbank Gloss ID",
+        "Sbank Video ID",
+        "Sbank Gloss public",
+        "Sbank Video public",
+        "Sbank Gloss",
+        "Sbank Gloss created at",
+    ]
+
+
+def build_csv_row(
+    video_key,
+    key_in_nzsl=False,
+    key_in_s3=False,
+    gloss_idgloss=None,
+    gloss_created_at=None,
+    gloss_id=None,
+    video_id=None,
+    gloss_public=False,
+    video_public=False,
+):
+    # See signbank/video/models.py, line 59, function set_public_acl()
+    canned_acl_expected = ""
+    if key_in_nzsl:
+        canned_acl_expected = "public-read" if video_public else "private"
+
+    lastmodified = ""
+    canned_acl = ""
+    if key_in_s3:
+        lastmodified = get_s3_lastmodified(video_key)
+        canned_acl = get_s3_canned_acl(video_key)
+
+    action = get_recommended_action(key_in_nzsl, key_in_s3)
+
+    return [
+        action,
+        f"{filter_fakekey(video_key)}",
+        f"{lastmodified}",
+        f"{canned_acl_expected}",
+        f"{canned_acl}",
+        f"{gloss_id}",
+        f"{video_id}",
+        f"{gloss_public}",
+        f"{video_public}",
+        f"{gloss_idgloss}",
+        f"{gloss_created_at}",
+    ]
+
+
+# From the keys present in NZSL, get all their S3 information
+def process_keys(this_all_keys_dict):
+    print(f"Getting detailed S3 data for keys ({AWS_S3_BUCKET}) ...", file=sys.stderr)
+
+    out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE)
+    out.writerow(build_csv_header())
+
+    for video_key, dict_row in this_all_keys_dict.items():
+        out.writerow(build_csv_row(video_key, *dict_row))
+
+
+class Command(BaseCommand):
+    help = (
+        "Gets all S3 bucket video objects and recommends actions for them. "
+        "You must set up: (1) A means of AWS auth, eg. the AWS_PROFILE env var. "
+        "(2) Postgres access details, eg. the DATABASE_URL env var."
+    )
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--env",
+            default="uat",
+            required=False,
+            help="Environment to run against, eg. 'production', 'uat', etc (default: '%(default)s')",
+        )
+        parser.add_argument(
+            "--pgcli",
+            default=PGCLI,
+            required=False,
+            help="Postgres client path (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--dumpnzsl",
+            default=False,
+            required=False,
+            action="store_true",
+            help="Dump raw NZSL database output",
+        )
+        parser.add_argument(
+            "--dumps3",
+            default=False,
+            required=False,
+            action="store_true",
+            help="Dump raw S3 keys output",
+        )
+
+    def handle(self, *args, **options):
+        global PGCLI, AWS_S3_BUCKET
+        PGCLI = options["pgcli"]
+        AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}"
+
+        print(f"Env: {options['env']}", file=sys.stderr)
+        print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr)
+        print(f"PGCLI: {PGCLI}", file=sys.stderr)
+        print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr)
+
+        if options["dumpnzsl"]:
+            pprint(get_nzsl_raw_keys_dict())
+            return
+
+        if options["dumps3"]:
+            pprint(get_s3_bucket_raw_keys_list())
+            return
+
+        process_keys(
+            create_all_keys_dict(
+                get_nzsl_raw_keys_dict(), get_s3_bucket_raw_keys_list()
+            )
+        )
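
Downstream of this report, the "Action" column is the one that matters. As an illustration only (this helper is not part of the change), tallying the recommended actions takes a few lines; pipe the command's CSV output into it on stdin:

    # tally_actions.py (hypothetical): counts of each recommended action
    import csv
    import sys
    from collections import Counter

    counts = Counter(row["Action"] for row in csv.DictReader(sys.stdin))
    for action, total in counts.most_common():
        print(f"{action}: {total}")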
diff --git a/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py
new file mode 100755
index 00000000..06085051
--- /dev/null
+++ b/signbank/dictionary/management/commands/repair_fixable_s3_orphans.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env -S python3 -u
+#
+# Given a CSV file containing S3 objects that can be matched back to NZSL entries,
+# updates the database to repair those NZSL entries.
+# Essentially repairs one form of import error.
+#
+# Bang line above passes '-u' to python, for unbuffered output
+# Permissions required:
+#   psql - access to heroku app's postgres
+#   aws s3 - NZSL IAM access
+#     s3:GetObjectAcl permissions or READ_ACP access to the object
+#     https://docs.aws.amazon.com/cli/latest/reference/s3api/get-object-acl.html
+
+from django.core.management.base import BaseCommand
+import os
+import sys
+import subprocess
+import csv
+from signbank.dictionary.models import (
+    FieldChoice,
+    Gloss,
+)
+from signbank.video.models import GlossVideo
+from django.core.exceptions import ObjectDoesNotExist
+
+
+# Keep synced with other scripts
+GLOSS_ID_COLUMN = "Gloss ID"
+GLOSS_COLUMN = "Gloss"
+GLOSS_PUBLIC_COLUMN = "Gloss public"
+GLOSS_VIDEO_COLUMN = "Suggested Video key"
+GLOBAL_COLUMN_HEADINGS = [
+    GLOSS_ID_COLUMN,
+    GLOSS_COLUMN,
+    GLOSS_PUBLIC_COLUMN,
+    GLOSS_VIDEO_COLUMN,
+]
+
+# Other globals
+CSV_DELIMITER = ","
+FAKEKEY_PREFIX = "this_is_not_a_key_"
+DATABASE_URL = os.getenv("DATABASE_URL", "")
+PGCLI = "/usr/bin/psql"
+AWS_S3_BUCKET = ""
+DO_COMMIT = False
+CSV_INPUT_FILENAME = "-"
+
+
+def pg_cli(args_list):
+    try:
+        return subprocess.run(
+            [PGCLI, "-c"] + args_list + [f"{DATABASE_URL}"],
+            env=os.environ,
+            capture_output=True,
+            check=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        print(f"Error: subprocess.run returned code {e.returncode}", file=sys.stderr)
+        print(e.cmd, file=sys.stderr)
+        print(e.stdout, file=sys.stderr)
+        print(e.stderr, file=sys.stderr)
+        sys.exit(1)
+
+
+# Returns an iterator of dictionaries (csv.DictReader), one per CSV row
+def read_csv(csv_filename):
+    if csv_filename == "-":
+        f = sys.stdin.read().splitlines()
+    else:
+        f = open(csv_filename, "r")
+    return csv.DictReader(f)
+
+
+def process_csv():
+    main_video_type = FieldChoice.objects.filter(
+        field="video_type", english_name="main"
+    ).first()
+
+    csv_rows = read_csv(CSV_INPUT_FILENAME)
+
+    out = csv.writer(sys.stdout, delimiter=CSV_DELIMITER, quoting=csv.QUOTE_NONE)
+
+    for csv_row in csv_rows:
+        gloss_id = csv_row[GLOSS_ID_COLUMN]
+        gloss_idgloss = csv_row[GLOSS_COLUMN]
+        video_key = csv_row[GLOSS_VIDEO_COLUMN]
+        out.writerow([gloss_id, gloss_idgloss, video_key])
+        gloss_id = int(gloss_id)
+
+        try:
+            gloss = Gloss.objects.get(id=gloss_id)
+        except ObjectDoesNotExist as e:
+            print(e)
+            continue
+
+        try:
+            GlossVideo.objects.get(videofile=video_key)
+            print(f"Ignoring: GlossVideo already exists: {video_key}")
+            continue
+        except ObjectDoesNotExist:
+            pass
+
+        gloss_video = GlossVideo(
+            gloss=gloss,
+            dataset=gloss.dataset,
+            videofile=video_key,
+            title=video_key,
+            version=0,
+            is_public=False,
+            video_type=main_video_type,
+        )
+        print(gloss)
+        print(gloss_video)
+
+        if not DO_COMMIT:
+            print("Dry run, no changes (use --commit flag to make changes)")
+            continue
+
+        # At this point we complete the repair
+        # We use bulk_create() because we cannot allow save() to run:
+        # it inserts rows without calling the model's save() or sending save signals
+        if len(GlossVideo.objects.bulk_create([gloss_video])) < 1:
+            print(f"Error: could not create {gloss_video}")
+
+
+class Command(BaseCommand):
+    help = (
+        "Given a CSV file of S3 objects that can be matched back to NZSL entries, "
+        "updates the database to repair those entries. "
+        f"CSV column headings: {GLOBAL_COLUMN_HEADINGS}. "
+        "You must set up: (1) A means of AWS auth, eg. the AWS_PROFILE env var. "
+        "(2) Postgres access details, eg. the DATABASE_URL env var."
+    )
+
+    def add_arguments(self, parser):
+        # Positional arguments
+        parser.add_argument(
+            "csv_filename", help="Name of CSV input file, or '-' for STDIN"
+        )
+
+        # Optional arguments
+        parser.add_argument(
+            "--env",
+            default="uat",
+            required=False,
+            help="Environment to run against, eg. 'production', 'uat', etc (default: '%(default)s')",
+        )
+        parser.add_argument(
+            "--pgcli",
+            default=PGCLI,
+            required=False,
+            help="Postgres client path (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--commit",
+            default=DO_COMMIT,
+            required=False,
+            action="store_true",
+            help="Actually make changes, instead of just outputting what would happen (default: dry run)",
+        )
+
+    def handle(self, *args, **options):
+        global PGCLI, AWS_S3_BUCKET, CSV_INPUT_FILENAME, DO_COMMIT
+        PGCLI = options["pgcli"]
+        AWS_S3_BUCKET = f"nzsl-signbank-media-{options['env']}"
+        CSV_INPUT_FILENAME = options["csv_filename"]
+        DO_COMMIT = options["commit"]
+
+        print(f"Env: {options['env']}", file=sys.stderr)
+        print(f"S3 bucket: {AWS_S3_BUCKET}", file=sys.stderr)
+        print(f"PGCLI: {PGCLI}", file=sys.stderr)
+        print(f"AWS profile: {os.environ.get('AWS_PROFILE', '')}", file=sys.stderr)
+        print(f"Input file: {options['csv_filename']}", file=sys.stderr)
+        print(f"Mode: {'Commit' if DO_COMMIT else 'Dry-run'}", file=sys.stderr)
+
+        process_csv()
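
For reference, the repair command expects the finder's shared column layout (GLOSS_ID_COLUMN, GLOSS_COLUMN, GLOSS_PUBLIC_COLUMN, GLOSS_VIDEO_COLUMN). An illustrative input file, with entirely made-up values:

    Gloss ID,Gloss,Gloss public,Suggested Video key
    12345,example:12345,False,glossvideo/example.12345.mp4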
public") -set_hidden.short_description = _lazy("Set selected videos hidden") +admin_set_public.short_description = _lazy("Set selected videos public") +admin_set_hidden.short_description = _lazy("Set selected videos hidden") class GlossVideoAdmin(admin.ModelAdmin): @@ -98,7 +98,7 @@ class GlossVideoAdmin(admin.ModelAdmin): 'videofile', 'video_type', 'posterfile', 'id', 'version') list_filter = ('is_public', 'video_type', 'gloss__dataset', HasGlossFilter, 'dataset', HasPosterFilter, GlossesVideoCountFilter) - actions = [set_public, set_hidden] + actions = [admin_set_public, admin_set_hidden] def get_queryset(self, request): qs = super(GlossVideoAdmin, self).get_queryset(request) diff --git a/signbank/video/models.py b/signbank/video/models.py index 8de46671..e1617765 100644 --- a/signbank/video/models.py +++ b/signbank/video/models.py @@ -48,7 +48,7 @@ def public_url(self, name): return f'{domain}{path}' - def set_public(self, name, is_public): + def set_public_acl(self, name, is_public): """ Set the object ACL on the object. This is only supported for S3 storage, and is a no-op for local file storage """ @@ -62,8 +62,6 @@ def set_public(self, name, is_public): ) - - class GlossVideo(models.Model): """A video that represents a particular idgloss""" #: Descriptive title of the GlossVideo. @@ -247,7 +245,7 @@ def is_video(self): def set_public(self, is_public): self.is_public = is_public self.save() - self.videofile.storage.set_public(self.videofile.name, is_public) + self.videofile.storage.set_public_acl(self.videofile.name, is_public) True