Trim existing video fix (#54)

* upload entrypoint receives hasEditedTranscript, and will not transcribe if true * regen vtts brought over to new blueprints
mentorpal · Feb 18, 2022 · 24fad9c · 24fad9c
1 parent 46002df
commit 24fad9c
Show file tree

Hide file tree

Showing 4 changed files with 230 additions and 10 deletions.
diff --git a/mentor_upload_api/src/mentor_upload_api/api.py b/mentor_upload_api/src/mentor_upload_api/api.py
@@ -11,7 +11,7 @@
 from os import environ
 from typing import TypedDict, List
 
-from mentor_upload_api.helpers import validate_json
+from mentor_upload_api.helpers import validate_json, exec_graphql_with_json_validation
 
 log = logging.getLogger()
 
@@ -227,9 +227,10 @@ def upload_answer_and_task_req_gql(
     variables["questionId"] = answer_req.question
 
     variables["answer"] = {
-        "transcript": answer_req.transcript,
         "media": answer_req.media,
     }
+    if answer_req.transcript:
+        variables["answer"]["transcript"] = (answer_req.transcript,)
     if answer_req.has_edited_transcript is not None:
         variables["answer"]["hasEditedTranscript"] = answer_req.has_edited_transcript
 
@@ -260,3 +261,64 @@ def upload_answer_and_task_update(
     tdjson = res.json()
     if "errors" in tdjson:
         raise Exception(json.dumps(tdjson.get("errors")))
+
+
+def fetch_answer_transcript_and_media_gql(mentor: str, question: str) -> GQLQueryBody:
+    """Build the GraphQL request body asking for one answer's transcript and media.
+
+    The answer is selected by the given mentor and question ids; for every media
+    entry the query requests its type, tag and url.
+    """
+    answer_query = """query Answer($mentor: ID!, $question: ID!) {
+            answer(mentor: $mentor, question: $question){
+                transcript
+                media {
+                type
+                tag
+                url
+              }
+            }
+        }"""
+    query_variables = {"mentor": mentor, "question": question}
+    return {"query": answer_query, "variables": query_variables}
+
+
+# JSON schema passed along with the answer transcript/media query: the response must
+# carry data.answer with a string "transcript" and a "media" array whose items each
+# provide string "type", "tag" and "url" fields.
+fetch_answer_transcript_media_json_schema = {
+    "type": "object",
+    "properties": {
+        "data": {
+            "type": "object",
+            "properties": {
+                "answer": {
+                    "type": "object",
+                    "properties": {
+                        "transcript": {"type": "string"},
+                        "media": {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "properties": {
+                                    "type": {"type": "string"},
+                                    "tag": {"type": "string"},
+                                    "url": {"type": "string"},
+                                },
+                                "required": ["type", "tag", "url"],
+                            },
+                        },
+                    },
+                    "required": ["transcript", "media"],
+                }
+            },
+            "required": ["answer"],
+        },
+    },
+    "required": ["data"],
+}
+
+
+def fetch_answer_transcript_and_media(mentor: str, question: str):
+    """Fetch the stored transcript and media list of a mentor's answer to a question.
+
+    The query is sent through ``exec_graphql_with_json_validation`` together with
+    ``fetch_answer_transcript_media_json_schema``.
+
+    Returns a ``(transcript, media)`` tuple taken from ``data.answer`` of the response.
+    """
+    request_headers = {
+        "mentor-graphql-req": "true",
+        "Authorization": f"bearer {get_api_key()}",
+    }
+    validated = exec_graphql_with_json_validation(
+        fetch_answer_transcript_and_media_gql(mentor, question),
+        fetch_answer_transcript_media_json_schema,
+        headers=request_headers,
+    )
+    answer = validated["data"]["answer"]
+    return answer["transcript"], answer["media"]
diff --git a/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer.py b/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer.py
@@ -152,6 +152,7 @@ def trim_existing_upload(body):
             },
             "required": ["start", "end"],
         },
+        "hasEditedTranscript": {"type": "boolean"},
     },
     "required": ["mentor", "question"],
     "additionalProperties": False,

diff --git a/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer_queue.py b/mentor_upload_api/src/mentor_upload_api/blueprints/upload/answer_queue.py
@@ -5,6 +5,7 @@
 # The full terms of this copyright and license should always be found in the root directory of this software deliverable as "license.txt" and if these terms are not found with this software, please contact the USC Stevens Center for the full license.
 #
 from datetime import datetime
+import tempfile
 from dateutil import tz
 import json
 import logging
@@ -21,20 +22,27 @@
     UploadTaskRequest,
     is_upload_in_progress,
     upload_answer_and_task_update,
+    fetch_answer_transcript_and_media,
 )
 from mentor_upload_api.blueprints.upload.answer import video_upload_json_schema
 from mentor_upload_api.helpers import (
     validate_form_payload_decorator,
+    validate_json_payload_decorator,
     ValidateFormJsonBody,
 )
-from mentor_upload_api.authorization_decorator import authorize_to_manage_content
+from mentor_upload_api.authorization_decorator import (
+    authorize_to_manage_content,
+    authorize_to_edit_mentor,
+)
 from pymediainfo import MediaInfo
 from werkzeug.exceptions import BadRequest
 from flask_wtf import FlaskForm
 from wtforms import StringField
 from wtforms.validators import DataRequired
 from flask_wtf.file import FileRequired, FileAllowed, FileField
 
+from mentor_upload_api.media_tools import transcript_to_vtt
+
 log = logging.getLogger()
 answer_queue_blueprint = Blueprint("answer-queue", __name__)
 
@@ -118,7 +126,7 @@ def submit_job(req):
     log.info("sns message published %s", json.dumps(sns_msg))
 
 
-def create_task_list(trim):
+def create_task_list(trim, has_edited_transcript):
     task_list = []
     if trim:
         task_list.append(
@@ -143,9 +151,14 @@ def create_task_list(trim):
             "status": "QUEUED",
         }
     )
-    task_list.append(
-        {"task_name": "transcribing", "task_id": str(uuid.uuid4()), "status": "QUEUED"}
-    )
+    if not has_edited_transcript:
+        task_list.append(
+            {
+                "task_name": "transcribing",
+                "task_id": str(uuid.uuid4()),
+                "status": "QUEUED",
+            }
+        )
 
     return task_list
 
@@ -199,6 +212,7 @@ def upload(body):
 
     mentor = body.get("mentor")
     question = body.get("question")
+    has_edited_transcript = body.get("hasEditedTranscript")
     verify_no_upload_in_progress(mentor, question)
     trim = body.get("trim")
     upload_file = request.files["video"]
@@ -237,7 +251,7 @@ def upload(body):
     s3_path = f"videos/{mentor}/{question}"
     upload_to_s3(file_path, s3_path)
 
-    task_list = create_task_list(trim)
+    task_list = create_task_list(trim, has_edited_transcript)
 
     req = {
         "request": {
@@ -256,14 +270,14 @@ def upload(body):
             mentor=mentor,
             question=question,
             transcript="",
-            media=[{"type": "video", "tag": "web", "url": original_video_url}],
+            media=[{"type": "video", "tag": "original", "url": original_video_url}],
         ),
         UploadTaskRequest(
             mentor=mentor,
             question=question,
             task_list=task_list,
             transcript="",
-            media=[{"type": "video", "tag": "web", "url": original_video_url}],
+            media=[{"type": "video", "tag": "original", "url": original_video_url}],
         ),
     )
     submit_job(req)
@@ -346,3 +360,63 @@ def download_mounted_file(file_name: str):
             f"failed to find video file {file_name} in folder {file_directory}"
         )
         logging.exception(x)
+
+
+# JSON schema for the regen_vtt request body: exactly the two string fields "mentor"
+# and "question" (5 to 60 characters each), both required, nothing else allowed.
+regen_vtt_json_schema = {
+    "type": "object",
+    "properties": {
+        "mentor": {"type": "string", "maxLength": 60, "minLength": 5},
+        "question": {"type": "string", "maxLength": 60, "minLength": 5},
+    },
+    "required": ["mentor", "question"],
+    "additionalProperties": False,
+}
+
+
+@answer_queue_blueprint.route("/regen_vtt/", methods=["POST"])
+@answer_queue_blueprint.route("/regen_vtt", methods=["POST"])
+@validate_json_payload_decorator(json_schema=regen_vtt_json_schema)
+@authorize_to_edit_mentor
+def regen_vtt(body):
+    """POST endpoint that regenerates the vtt file for one mentor answer.
+
+    Reads ``mentor`` and ``question`` from the request body and responds with the
+    outcome of ``_regen_vtt`` wrapped under the ``data`` key.
+    """
+    outcome = _regen_vtt(body.get("mentor"), body.get("question"))
+    return jsonify({"data": outcome})
+
+
+def _regen_vtt(mentor: str, question: str):
+    """Regenerate and upload the ``en.vtt`` captions for a mentor's answer.
+
+    Fetches the answer's transcript and media, builds ``en.vtt`` in a temporary
+    directory from the media entry tagged ``web`` and uploads it to
+    ``videos/{mentor}/{question}/en.vtt`` in ``static_s3_bucket``.
+
+    Returns ``{"regen_vtt": True}`` only when a vtt file was actually generated and
+    uploaded. Every failure is logged and reported as ``{"regen_vtt": False}``
+    instead of being raised to the caller.
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        try:
+            vtt_file_path = os.path.join(tmp_dir, "en.vtt")
+            transcript, answer_media = fetch_answer_transcript_and_media(
+                mentor, question
+            )
+            web_media = next((x for x in answer_media if x["tag"] == "web"), None)
+            if not web_media:
+                raise Exception(
+                    f"failed to find answer media for mentor: {mentor} and question: {question}"
+                )
+            transcript_to_vtt(web_media["url"], vtt_file_path, transcript)
+            if not os.path.isfile(vtt_file_path):
+                # transcript_to_vtt returns without writing a file when it cannot
+                # determine the media duration; nothing was uploaded, so do not
+                # report success.
+                log.error("Failed to find file at %s", vtt_file_path)
+                return {"regen_vtt": False}
+            s3_client.upload_file(
+                str(vtt_file_path),
+                static_s3_bucket,
+                f"videos/{mentor}/{question}/en.vtt",
+                ExtraArgs={"ContentType": "text/vtt"},
+            )
+            return {"regen_vtt": True}
+        except Exception as x:
+            # Deliberate best-effort boundary: the endpoint answers with a failure
+            # flag rather than an error response, so log everything here.
+            log.error(
+                "failed to regenerate vtt for mentor %s and question %s",
+                mentor,
+                question,
+            )
+            log.exception(x)
+            return {"regen_vtt": False}
diff --git a/mentor_upload_api/src/mentor_upload_api/media_tools.py b/mentor_upload_api/src/mentor_upload_api/media_tools.py
@@ -0,0 +1,83 @@
+#
+# This software is Copyright ©️ 2020 The University of Southern California. All Rights Reserved.
+# Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and subject to the full license file found in the root of this software deliverable. Permission to make commercial use of this software may be obtained by contacting:  USC Stevens Center for Innovation University of Southern California 1150 S. Olive Street, Suite 2300, Los Angeles, CA 90115, USA Email: [email protected]
+#
+# The full terms of this copyright and license should always be found in the root directory of this software deliverable as "license.txt" and if these terms are not found with this software, please contact the USC Stevens Center for the full license.
+#
+import logging
+import os
+import re
+import math
+from pymediainfo import MediaInfo
+
+log = logging.getLogger()
+
+
+def find_duration(audio_or_video_file: str) -> float:
+    """Return the duration, in seconds, of the first video/audio track reporting one.
+
+    The track's ``duration`` value is divided by 1000 (i.e. it is treated as
+    milliseconds). Tracks whose duration is missing or not numeric are skipped.
+
+    Returns ``-1.0`` when no video or audio track yields a usable duration.
+    """
+    log.info(audio_or_video_file)
+    media_info = MediaInfo.parse(audio_or_video_file)
+    for t in media_info.tracks:
+        if t.track_type not in ("Video", "Audio"):
+            continue
+        log.debug(t)
+        try:
+            return float(t.duration) / 1000
+        except (TypeError, ValueError):
+            # No duration (None) or a non-numeric one on this track: keep looking
+            # at the remaining tracks instead of failing the whole lookup.
+            log.debug("no usable duration on %s track", t.track_type)
+    return -1.0
+
+
+def find(s: str, ch: str):
+    """Return the indexes of every occurrence of the character *ch* in *s*.
+
+    Used with a space as *ch* to locate word boundaries so that words are not
+    split apart.
+    """
+    positions = []
+    for index, letter in enumerate(s):
+        if letter == ch:
+            positions.append(index)
+    return positions
+
+
+def _format_vtt_timestamp(total_seconds: float) -> str:
+    """Format a non-negative number of seconds as a ``hh:mm:ss.ttt`` vtt timestamp."""
+    hours, remainder = divmod(total_seconds, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return f"{int(hours):02d}:{int(minutes):02d}:{seconds:06.3f}"
+
+
+def transcript_to_vtt(
+    audio_or_video_file_or_url: str, vtt_file: str, transcript: str
+) -> str:
+    """Write *transcript* to *vtt_file* as vtt cues spread over the media duration.
+
+    The transcript is cut at spaces into pieces of roughly 68 characters. Each
+    piece is given an equal slice of the media duration, with every timestamp
+    shifted by 0.85 seconds. Pieces without any text produce no cue, so an empty
+    transcript yields a file containing only the header.
+
+    Parameters:
+        audio_or_video_file_or_url: existing local media path, or a url starting
+            with http/https, whose duration drives the cue timing.
+        vtt_file: path of the vtt file to write; its directory is created if needed.
+        transcript: text to split into cues.
+
+    Returns:
+        The generated vtt text, or ``""`` (no file is written) when the media
+        duration could not be determined.
+
+    Raises:
+        Exception: when the media argument is neither an existing path nor a url.
+    """
+    log.info("%s, %s, %s", audio_or_video_file_or_url, vtt_file, transcript)
+
+    if not os.path.exists(audio_or_video_file_or_url) and not re.search(
+        "^https?", audio_or_video_file_or_url
+    ):
+        raise Exception(
+            f"ERROR: Can't generate vtt, {audio_or_video_file_or_url} doesn't exist or is not a valid url"
+        )
+    duration = find_duration(audio_or_video_file_or_url)
+    log.debug(duration)
+    if duration <= 0:
+        log.warning("video duration for %s returned 0", audio_or_video_file_or_url)
+        return ""
+    piece_length = 68
+    word_indexes = find(transcript, " ")
+    split_index = [0]
+    # For each multiple of piece_length, split at the first space located after it.
+    # The thresholds only grow, so one forward-moving cursor finds every split in
+    # a single pass. The cursor starts at 1: the very first space is never used
+    # as a split point.
+    el = 1
+    for k in range(1, len(word_indexes)):
+        while el < len(word_indexes) and word_indexes[el] <= piece_length * k:
+            el += 1
+        if el == len(word_indexes):
+            # No space beyond this threshold, hence none beyond any later one.
+            break
+        split_index.append(word_indexes[el])
+    split_index.append(len(transcript))
+    log.debug(split_index)
+    # At least one chunk, so an empty transcript cannot cause a division by zero.
+    amount_of_chunks = max(1, math.ceil(len(transcript) / piece_length))
+    log.debug(amount_of_chunks)
+    seconds_per_chunk = duration / amount_of_chunks
+    cues = []
+    for j, (piece_start, piece_end) in enumerate(zip(split_index, split_index[1:])):
+        cue_text = transcript[piece_start:piece_end]
+        if not cue_text:
+            continue
+        seconds_start = round(seconds_per_chunk * j, 2) + 0.85
+        seconds_end = round(seconds_per_chunk * (j + 1), 2) + 0.85
+        cues.append(
+            f"{_format_vtt_timestamp(seconds_start)} --> "
+            f"{_format_vtt_timestamp(seconds_end)}\n{cue_text}\n\n"
+        )
+    vtt_str = "WEBVTT FILE:\n\n" + "".join(cues)
+    # os.makedirs("") raises, so only create a directory when the path has one.
+    vtt_dir = os.path.dirname(vtt_file)
+    if vtt_dir:
+        os.makedirs(vtt_dir, exist_ok=True)
+    # WebVTT files are UTF-8; do not depend on the platform default encoding.
+    with open(vtt_file, "w", encoding="utf-8") as f:
+        f.write(vtt_str)
+    log.debug(vtt_str)
+    return vtt_str