From 24fad9c77e5611786806e76c6240f1a72f3afba8 Mon Sep 17 00:00:00 2001 From: Aaron shiel <57824522+aaronshiel@users.noreply.github.com> Date: Fri, 18 Feb 2022 09:22:01 -0800 Subject: [PATCH] Trim existing video fix (#54) * upload entrypoint receives hasEditedTranscript, and will not transcribe if true * regen vtts brought over to new blueprints --- .../src/mentor_upload_api/api.py | 66 +++++++++++++- .../blueprints/upload/answer.py | 1 + .../blueprints/upload/answer_queue.py | 90 +++++++++++++++++-- .../src/mentor_upload_api/media_tools.py | 83 +++++++++++++++++ 4 files changed, 230 insertions(+), 10 deletions(-) create mode 100644 mentor_upload_api/src/mentor_upload_api/media_tools.py diff --git a/mentor_upload_api/src/mentor_upload_api/api.py b/mentor_upload_api/src/mentor_upload_api/api.py index 5101a11..0efe643 100644 --- a/mentor_upload_api/src/mentor_upload_api/api.py +++ b/mentor_upload_api/src/mentor_upload_api/api.py @@ -11,7 +11,7 @@ from os import environ from typing import TypedDict, List -from mentor_upload_api.helpers import validate_json +from mentor_upload_api.helpers import validate_json, exec_graphql_with_json_validation log = logging.getLogger() @@ -227,9 +227,10 @@ def upload_answer_and_task_req_gql( variables["questionId"] = answer_req.question variables["answer"] = { - "transcript": answer_req.transcript, "media": answer_req.media, } + if answer_req.transcript: + variables["answer"]["transcript"] = (answer_req.transcript,) if answer_req.has_edited_transcript is not None: variables["answer"]["hasEditedTranscript"] = answer_req.has_edited_transcript @@ -260,3 +261,64 @@ def upload_answer_and_task_update( tdjson = res.json() if "errors" in tdjson: raise Exception(json.dumps(tdjson.get("errors"))) + + +def fetch_answer_transcript_and_media_gql(mentor: str, question: str) -> GQLQueryBody: + return { + "query": """query Answer($mentor: ID!, $question: ID!) 
{ + answer(mentor: $mentor, question: $question){ + transcript + media { + type + tag + url + } + } + }""", + "variables": {"mentor": mentor, "question": question}, + } + + +fetch_answer_transcript_media_json_schema = { + "type": "object", + "properties": { + "data": { + "type": "object", + "properties": { + "answer": { + "type": "object", + "properties": { + "transcript": {"type": "string"}, + "media": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": {"type": "string"}, + "tag": {"type": "string"}, + "url": {"type": "string"}, + }, + "required": ["type", "tag", "url"], + }, + }, + }, + "required": ["transcript", "media"], + } + }, + "required": ["answer"], + }, + }, + "required": ["data"], +} + + +def fetch_answer_transcript_and_media(mentor: str, question: str): + headers = {"mentor-graphql-req": "true", "Authorization": f"bearer {get_api_key()}"} + gql_query = fetch_answer_transcript_and_media_gql(mentor, question) + json_res = exec_graphql_with_json_validation( + gql_query, fetch_answer_transcript_media_json_schema, headers=headers + ) + return ( + json_res["data"]["answer"]["transcript"], + json_res["data"]["answer"]["media"], + ) diff --git a/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer.py b/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer.py index 79d0d7e..fea5edc 100644 --- a/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer.py +++ b/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer.py @@ -152,6 +152,7 @@ def trim_existing_upload(body): }, "required": ["start", "end"], }, + "hasEditedTranscript": {"type": "boolean"}, }, "required": ["mentor", "question"], "additionalProperties": False, diff --git a/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer_queue.py b/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer_queue.py index 0d39f4d..6699fa8 100644 --- a/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer_queue.py +++ 
b/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer_queue.py @@ -5,6 +5,7 @@ # The full terms of this copyright and license should always be found in the root directory of this software deliverable as "license.txt" and if these terms are not found with this software, please contact the USC Stevens Center for the full license. # from datetime import datetime +import tempfile from dateutil import tz import json import logging @@ -21,13 +22,18 @@ UploadTaskRequest, is_upload_in_progress, upload_answer_and_task_update, + fetch_answer_transcript_and_media, ) from mentor_upload_api.blueprints.upload.answer import video_upload_json_schema from mentor_upload_api.helpers import ( validate_form_payload_decorator, + validate_json_payload_decorator, ValidateFormJsonBody, ) -from mentor_upload_api.authorization_decorator import authorize_to_manage_content +from mentor_upload_api.authorization_decorator import ( + authorize_to_manage_content, + authorize_to_edit_mentor, +) from pymediainfo import MediaInfo from werkzeug.exceptions import BadRequest from flask_wtf import FlaskForm @@ -35,6 +41,8 @@ from wtforms.validators import DataRequired from flask_wtf.file import FileRequired, FileAllowed, FileField +from mentor_upload_api.media_tools import transcript_to_vtt + log = logging.getLogger() answer_queue_blueprint = Blueprint("answer-queue", __name__) @@ -118,7 +126,7 @@ def submit_job(req): log.info("sns message published %s", json.dumps(sns_msg)) -def create_task_list(trim): +def create_task_list(trim, has_edited_transcript): task_list = [] if trim: task_list.append( @@ -143,9 +151,14 @@ def create_task_list(trim): "status": "QUEUED", } ) - task_list.append( - {"task_name": "transcribing", "task_id": str(uuid.uuid4()), "status": "QUEUED"} - ) + if not has_edited_transcript: + task_list.append( + { + "task_name": "transcribing", + "task_id": str(uuid.uuid4()), + "status": "QUEUED", + } + ) return task_list @@ -199,6 +212,7 @@ def upload(body): mentor = 
body.get("mentor") question = body.get("question") + has_edited_transcript = body.get("hasEditedTranscript") verify_no_upload_in_progress(mentor, question) trim = body.get("trim") upload_file = request.files["video"] @@ -237,7 +251,7 @@ def upload(body): s3_path = f"videos/{mentor}/{question}" upload_to_s3(file_path, s3_path) - task_list = create_task_list(trim) + task_list = create_task_list(trim, has_edited_transcript) req = { "request": { @@ -256,14 +270,14 @@ def upload(body): mentor=mentor, question=question, transcript="", - media=[{"type": "video", "tag": "web", "url": original_video_url}], + media=[{"type": "video", "tag": "original", "url": original_video_url}], ), UploadTaskRequest( mentor=mentor, question=question, task_list=task_list, transcript="", - media=[{"type": "video", "tag": "web", "url": original_video_url}], + media=[{"type": "video", "tag": "original", "url": original_video_url}], ), ) submit_job(req) @@ -346,3 +360,63 @@ def download_mounted_file(file_name: str): f"failed to find video file {file_name} in folder {file_directory}" ) logging.exception(x) + + +regen_vtt_json_schema = { + "type": "object", + "properties": { + "mentor": {"type": "string", "maxLength": 60, "minLength": 5}, + "question": {"type": "string", "maxLength": 60, "minLength": 5}, + }, + "required": ["mentor", "question"], + "additionalProperties": False, +} + + +@answer_queue_blueprint.route("/regen_vtt/", methods=["POST"]) +@answer_queue_blueprint.route("/regen_vtt", methods=["POST"]) +@validate_json_payload_decorator(json_schema=regen_vtt_json_schema) +@authorize_to_edit_mentor +def regen_vtt(body): + mentor = body.get("mentor") + question = body.get("question") + result = _regen_vtt(mentor, question) + return jsonify({"data": result}) + + +def _regen_vtt(mentor: str, question: str): + with tempfile.TemporaryDirectory() as tmp_dir: + try: + vtt_file_path = os.path.join(tmp_dir, "en.vtt") + ( + transcript, + answer_media, + ) = fetch_answer_transcript_and_media(mentor, 
question) + web_media = next((x for x in answer_media if x["tag"] == "web"), None) + if not web_media: + raise Exception( + f"failed to find answer media for mentor: {mentor} and question: {question}" + ) + transcript_to_vtt(web_media["url"], vtt_file_path, transcript) + video_path_base = f"videos/{mentor}/{question}/" + if path.isfile(vtt_file_path): + item_path = f"{video_path_base}en.vtt" + s3_client.upload_file( + str(vtt_file_path), + static_s3_bucket, + item_path, + ExtraArgs={"ContentType": "text/vtt"}, + ) + else: + import logging + + logging.error(f"Failed to find file at {vtt_file_path}") + return {"regen_vtt": True} + except Exception as x: + import logging + + logging.error( + f"failed to regenerate vtt for mentor {mentor} and question {question}" + ) + logging.exception(x) + return {"regen_vtt": False} diff --git a/mentor_upload_api/src/mentor_upload_api/media_tools.py b/mentor_upload_api/src/mentor_upload_api/media_tools.py new file mode 100644 index 0000000..c2862b4 --- /dev/null +++ b/mentor_upload_api/src/mentor_upload_api/media_tools.py @@ -0,0 +1,83 @@ +# +# This software is Copyright ©️ 2020 The University of Southern California. All Rights Reserved. +# Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and subject to the full license file found in the root of this software deliverable. Permission to make commercial use of this software may be obtained by contacting: USC Stevens Center for Innovation University of Southern California 1150 S. 
Olive Street, Suite 2300, Los Angeles, CA 90115, USA Email: accounting@stevens.usc.edu +# +# The full terms of this copyright and license should always be found in the root directory of this software deliverable as "license.txt" and if these terms are not found with this software, please contact the USC Stevens Center for the full license. +# +import logging +import os +import re +import math +from pymediainfo import MediaInfo + +log = logging.getLogger() + + +def find_duration(audio_or_video_file: str) -> float: + log.info(audio_or_video_file) + media_info = MediaInfo.parse(audio_or_video_file) + for t in media_info.tracks: + if t.track_type in ["Video", "Audio"]: + try: + log.debug(t) + return float(t.duration / 1000) + except Exception: + pass + return -1.0 + + +def find( + s: str, ch: str +): # gives indexes of all of the spaces so we don't split words apart + return [i for i, ltr in enumerate(s) if ltr == ch] + + +def transcript_to_vtt( + audio_or_video_file_or_url: str, vtt_file: str, transcript: str +) -> str: + log.info("%s, %s, %s", audio_or_video_file_or_url, vtt_file, transcript) + + if not os.path.exists(audio_or_video_file_or_url) and not re.search( + "^https?", audio_or_video_file_or_url + ): + raise Exception( + f"ERROR: Can't generate vtt, {audio_or_video_file_or_url} doesn't exist or is not a valid url" + ) + duration = find_duration(audio_or_video_file_or_url) + log.debug(duration) + if duration <= 0: + log.warning(f"video duration for {audio_or_video_file_or_url} returned 0") + return "" + piece_length = 68 + word_indexes = find(transcript, " ") + split_index = [0] + for k in range(1, len(word_indexes)): + for el in range(1, len(word_indexes)): + if word_indexes[el] > piece_length * k: + split_index.append(word_indexes[el]) + break + split_index.append(len(transcript)) + log.debug(split_index) + amount_of_chunks = math.ceil(len(transcript) / piece_length) + log.debug(amount_of_chunks) + vtt_str = "WEBVTT FILE:\n\n" + for j in 
range(len(split_index) - 1): # this uses a constant piece length + seconds_start = round((duration / amount_of_chunks) * j, 2) + 0.85 + seconds_end = round((duration / amount_of_chunks) * (j + 1), 2) + 0.85 + output_start = ( + str(math.floor(seconds_start / 60)).zfill(2) + + ":" + + ("%.3f" % (seconds_start % 60)).zfill(6) + ) + output_end = ( + str(math.floor(seconds_end / 60)).zfill(2) + + ":" + + ("%.3f" % (seconds_end % 60)).zfill(6) + ) + vtt_str += f"00:{output_start} --> 00:{output_end}\n" + vtt_str += f"{transcript[split_index[j] : split_index[j + 1]]}\n\n" + os.makedirs(os.path.dirname(vtt_file), exist_ok=True) + with open(vtt_file, "w") as f: + f.write(vtt_str) + log.debug(vtt_str) + return vtt_str