Skip to content

Commit

Permalink
Trim existing video fix (#54)
Browse files Browse the repository at this point in the history
* upload entrypoint receives hasEditedTranscript, and will not transcribe if true

* regen vtts brought over to new blueprints
  • Loading branch information
aaronshiel authored Feb 18, 2022
1 parent 46002df commit 24fad9c
Show file tree
Hide file tree
Showing 4 changed files with 230 additions and 10 deletions.
66 changes: 64 additions & 2 deletions mentor_upload_api/src/mentor_upload_api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from os import environ
from typing import TypedDict, List

from mentor_upload_api.helpers import validate_json
from mentor_upload_api.helpers import validate_json, exec_graphql_with_json_validation

log = logging.getLogger()

Expand Down Expand Up @@ -227,9 +227,10 @@ def upload_answer_and_task_req_gql(
variables["questionId"] = answer_req.question

variables["answer"] = {
"transcript": answer_req.transcript,
"media": answer_req.media,
}
if answer_req.transcript:
variables["answer"]["transcript"] = (answer_req.transcript,)
if answer_req.has_edited_transcript is not None:
variables["answer"]["hasEditedTranscript"] = answer_req.has_edited_transcript

Expand Down Expand Up @@ -260,3 +261,64 @@ def upload_answer_and_task_update(
tdjson = res.json()
if "errors" in tdjson:
raise Exception(json.dumps(tdjson.get("errors")))


def fetch_answer_transcript_and_media_gql(mentor: str, question: str) -> GQLQueryBody:
    """Build the GraphQL request body that reads one answer's transcript and media.

    :param mentor: id passed as the ``$mentor`` query variable
    :param question: id passed as the ``$question`` query variable
    :return: a dict with the ``query`` text and its ``variables``
    """
    # The query selects the transcript plus type/tag/url of every media entry.
    query_lines = [
        "query Answer($mentor: ID!, $question: ID!) {",
        "answer(mentor: $mentor, question: $question){",
        "transcript",
        "media {",
        "type",
        "tag",
        "url",
        "}",
        "}",
        "}",
    ]
    query_variables = {"mentor": mentor, "question": question}
    return {"query": "\n".join(query_lines), "variables": query_variables}


# JSON schema used to validate the response of the "Answer" GraphQL query:
# the payload must contain data.answer with a string transcript and a list of
# media entries, each carrying string "type", "tag" and "url" fields.
fetch_answer_transcript_media_json_schema = {
    "type": "object",
    "properties": {
        "data": {
            "type": "object",
            "properties": {
                "answer": {
                    "type": "object",
                    "properties": {
                        "transcript": {"type": "string"},
                        "media": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "type": {"type": "string"},
                                    "tag": {"type": "string"},
                                    "url": {"type": "string"},
                                },
                                # every media entry must be fully described
                                "required": ["type", "tag", "url"],
                            },
                        },
                    },
                    # both fields are read unconditionally by the caller
                    "required": ["transcript", "media"],
                }
            },
            "required": ["answer"],
        },
    },
    "required": ["data"],
}


def fetch_answer_transcript_and_media(mentor: str, question: str):
    """Fetch the stored transcript and media list of one answer over GraphQL.

    The response is validated against
    ``fetch_answer_transcript_media_json_schema`` before it is unpacked.

    :param mentor: id of the mentor owning the answer
    :param question: id of the answered question
    :return: tuple ``(transcript, media)`` taken from ``data.answer``
    """
    request_headers = {
        "mentor-graphql-req": "true",
        "Authorization": f"bearer {get_api_key()}",
    }
    response = exec_graphql_with_json_validation(
        fetch_answer_transcript_and_media_gql(mentor, question),
        fetch_answer_transcript_media_json_schema,
        headers=request_headers,
    )
    answer = response["data"]["answer"]
    return answer["transcript"], answer["media"]
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def trim_existing_upload(body):
},
"required": ["start", "end"],
},
"hasEditedTranscript": {"type": "boolean"},
},
"required": ["mentor", "question"],
"additionalProperties": False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# The full terms of this copyright and license should always be found in the root directory of this software deliverable as "license.txt" and if these terms are not found with this software, please contact the USC Stevens Center for the full license.
#
from datetime import datetime
import tempfile
from dateutil import tz
import json
import logging
Expand All @@ -21,20 +22,27 @@
UploadTaskRequest,
is_upload_in_progress,
upload_answer_and_task_update,
fetch_answer_transcript_and_media,
)
from mentor_upload_api.blueprints.upload.answer import video_upload_json_schema
from mentor_upload_api.helpers import (
validate_form_payload_decorator,
validate_json_payload_decorator,
ValidateFormJsonBody,
)
from mentor_upload_api.authorization_decorator import authorize_to_manage_content
from mentor_upload_api.authorization_decorator import (
authorize_to_manage_content,
authorize_to_edit_mentor,
)
from pymediainfo import MediaInfo
from werkzeug.exceptions import BadRequest
from flask_wtf import FlaskForm
from wtforms import StringField
from wtforms.validators import DataRequired
from flask_wtf.file import FileRequired, FileAllowed, FileField

from mentor_upload_api.media_tools import transcript_to_vtt

log = logging.getLogger()
answer_queue_blueprint = Blueprint("answer-queue", __name__)

Expand Down Expand Up @@ -118,7 +126,7 @@ def submit_job(req):
log.info("sns message published %s", json.dumps(sns_msg))


def create_task_list(trim):
def create_task_list(trim, has_edited_transcript):
task_list = []
if trim:
task_list.append(
Expand All @@ -143,9 +151,14 @@ def create_task_list(trim):
"status": "QUEUED",
}
)
task_list.append(
{"task_name": "transcribing", "task_id": str(uuid.uuid4()), "status": "QUEUED"}
)
if not has_edited_transcript:
task_list.append(
{
"task_name": "transcribing",
"task_id": str(uuid.uuid4()),
"status": "QUEUED",
}
)

return task_list

Expand Down Expand Up @@ -199,6 +212,7 @@ def upload(body):

mentor = body.get("mentor")
question = body.get("question")
has_edited_transcript = body.get("hasEditedTranscript")
verify_no_upload_in_progress(mentor, question)
trim = body.get("trim")
upload_file = request.files["video"]
Expand Down Expand Up @@ -237,7 +251,7 @@ def upload(body):
s3_path = f"videos/{mentor}/{question}"
upload_to_s3(file_path, s3_path)

task_list = create_task_list(trim)
task_list = create_task_list(trim, has_edited_transcript)

req = {
"request": {
Expand All @@ -256,14 +270,14 @@ def upload(body):
mentor=mentor,
question=question,
transcript="",
media=[{"type": "video", "tag": "web", "url": original_video_url}],
media=[{"type": "video", "tag": "original", "url": original_video_url}],
),
UploadTaskRequest(
mentor=mentor,
question=question,
task_list=task_list,
transcript="",
media=[{"type": "video", "tag": "web", "url": original_video_url}],
media=[{"type": "video", "tag": "original", "url": original_video_url}],
),
)
submit_job(req)
Expand Down Expand Up @@ -346,3 +360,63 @@ def download_mounted_file(file_name: str):
f"failed to find video file {file_name} in folder {file_directory}"
)
logging.exception(x)


# JSON schema for the /regen_vtt request body: "mentor" and "question" are
# both required strings of 5 to 60 characters, and no other keys are accepted.
regen_vtt_json_schema = {
    "type": "object",
    "properties": {
        "mentor": {"type": "string", "maxLength": 60, "minLength": 5},
        "question": {"type": "string", "maxLength": 60, "minLength": 5},
    },
    "required": ["mentor", "question"],
    "additionalProperties": False,
}


@answer_queue_blueprint.route("/regen_vtt/", methods=["POST"])
@answer_queue_blueprint.route("/regen_vtt", methods=["POST"])
@validate_json_payload_decorator(json_schema=regen_vtt_json_schema)
@authorize_to_edit_mentor
def regen_vtt(body):
    """POST endpoint that regenerates the VTT captions of one answer.

    :param body: request payload already validated against
        ``regen_vtt_json_schema`` (keys ``mentor`` and ``question``)
    :return: JSON response wrapping the ``_regen_vtt`` result under ``data``
    """
    outcome = _regen_vtt(body.get("mentor"), body.get("question"))
    return jsonify({"data": outcome})


def _regen_vtt(mentor: str, question: str):
    """Rebuild the ``en.vtt`` captions of an answer and upload them to S3.

    The answer's transcript and media are fetched over GraphQL, a VTT file is
    generated in a temporary directory from the media entry tagged ``web``,
    and the file is uploaded to ``videos/{mentor}/{question}/en.vtt`` in the
    static bucket.

    :param mentor: id of the mentor owning the answer
    :param question: id of the answered question
    :return: ``{"regen_vtt": True}`` only when a VTT file was generated and
        uploaded; ``{"regen_vtt": False}`` when no file was produced or any
        step raised (the failure is logged, never propagated)
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        try:
            vtt_file_path = os.path.join(tmp_dir, "en.vtt")
            transcript, answer_media = fetch_answer_transcript_and_media(
                mentor, question
            )
            web_media = next(
                (media for media in answer_media if media["tag"] == "web"), None
            )
            if not web_media:
                raise Exception(
                    f"failed to find answer media for mentor: {mentor} and question: {question}"
                )
            transcript_to_vtt(web_media["url"], vtt_file_path, transcript)
            if not os.path.isfile(vtt_file_path):
                # transcript_to_vtt writes nothing when it cannot determine the
                # media duration; report that as a failure instead of claiming
                # success without having uploaded anything.
                log.error("Failed to find file at %s", vtt_file_path)
                return {"regen_vtt": False}
            s3_client.upload_file(
                str(vtt_file_path),
                static_s3_bucket,
                f"videos/{mentor}/{question}/en.vtt",
                ExtraArgs={"ContentType": "text/vtt"},
            )
            return {"regen_vtt": True}
        except Exception:
            # Endpoint boundary: log with traceback and answer with a failure
            # flag rather than letting the request error out.
            log.exception(
                "failed to regenerate vtt for mentor %s and question %s",
                mentor,
                question,
            )
            return {"regen_vtt": False}
83 changes: 83 additions & 0 deletions mentor_upload_api/src/mentor_upload_api/media_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#
# This software is Copyright ©️ 2020 The University of Southern California. All Rights Reserved.
# Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and subject to the full license file found in the root of this software deliverable. Permission to make commercial use of this software may be obtained by contacting: USC Stevens Center for Innovation University of Southern California 1150 S. Olive Street, Suite 2300, Los Angeles, CA 90115, USA Email: [email protected]
#
# The full terms of this copyright and license should always be found in the root directory of this software deliverable as "license.txt" and if these terms are not found with this software, please contact the USC Stevens Center for the full license.
#
import logging
import os
import re
import math
from pymediainfo import MediaInfo

log = logging.getLogger()


def find_duration(audio_or_video_file: str) -> float:
    """Return the duration of a media file as reported by MediaInfo.

    The first ``Video`` or ``Audio`` track whose ``duration`` can be divided
    by 1000 and converted to ``float`` wins.
    NOTE(review): presumably MediaInfo reports milliseconds, making the
    result seconds - confirm against pymediainfo.

    :param audio_or_video_file: path (or other locator accepted by
        ``MediaInfo.parse``) of the media to inspect
    :return: the converted duration, or ``-1.0`` when no track yields one
    """
    log.info(audio_or_video_file)
    parsed = MediaInfo.parse(audio_or_video_file)
    for track in parsed.tracks:
        if track.track_type not in ["Video", "Audio"]:
            continue
        try:
            log.debug(track)
            return float(track.duration / 1000)
        except Exception:
            # best effort: a track without a usable duration is skipped
            continue
    return -1.0


def find(s: str, ch: str):
    """Return every index of ``s`` whose character equals ``ch``.

    Used to locate all spaces in a transcript so that captions are only
    split between words, never inside one.
    """
    positions = []
    for index, letter in enumerate(s):
        if letter == ch:
            positions.append(index)
    return positions


def _vtt_timestamp(seconds: float) -> str:
    """Format an offset in seconds as a WebVTT ``HH:MM:SS.mmm`` timestamp."""
    # Work in whole milliseconds so float noise can never render "60.000"
    # seconds, and carry into the hour field instead of hard-coding "00".
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


def _caption_split_points(transcript: str, piece_length: int) -> list:
    """Return the ascending character offsets at which the transcript is cut.

    The list always starts with 0 and ends with ``len(transcript)``. For each
    multiple of ``piece_length`` the cut is placed on the first space found
    after it, so words are never split apart.
    """
    spaces = find(transcript, " ")
    points = [0]
    # The very first space is never used as a cut point (behaviour kept from
    # the original implementation). The cursor only moves forward because
    # both the space offsets and the thresholds are ascending.
    cursor = 1
    for k in range(1, len(spaces)):
        threshold = piece_length * k
        while cursor < len(spaces) and spaces[cursor] <= threshold:
            cursor += 1
        if cursor == len(spaces):
            break  # no space beyond this threshold, nor beyond later ones
        if spaces[cursor] != points[-1]:
            # a very long word can satisfy several thresholds with the same
            # space; skipping the repeat avoids emitting an empty cue
            points.append(spaces[cursor])
    points.append(len(transcript))
    return points


def transcript_to_vtt(
    audio_or_video_file_or_url: str, vtt_file: str, transcript: str
) -> str:
    """Write a WebVTT caption file that spreads a transcript over a media file.

    The transcript is cut into pieces of roughly 68 characters (on word
    boundaries) and every piece receives an equal share of the media
    duration, shifted by a fixed 0.85 second offset.

    :param audio_or_video_file_or_url: existing local path or http(s) url of
        the media whose duration drives the cue timing
    :param vtt_file: destination path of the VTT file; missing parent
        directories are created
    :param transcript: text to caption; an empty transcript produces a file
        holding only the WEBVTT header
    :return: the VTT content written, or ``""`` (nothing written) when the
        media duration could not be determined
    :raises Exception: when the media is neither an existing path nor an
        http(s) url
    """
    log.info("%s, %s, %s", audio_or_video_file_or_url, vtt_file, transcript)

    if not os.path.exists(audio_or_video_file_or_url) and not re.search(
        "^https?", audio_or_video_file_or_url
    ):
        raise Exception(
            f"ERROR: Can't generate vtt, {audio_or_video_file_or_url} doesn't exist or is not a valid url"
        )
    duration = find_duration(audio_or_video_file_or_url)
    log.debug(duration)
    if duration <= 0:
        log.warning(f"video duration for {audio_or_video_file_or_url} returned 0")
        return ""
    piece_length = 68
    # NOTE(review): fixed shift applied to every cue boundary; presumably a
    # lead-in before speech starts - confirm the intent of 0.85.
    cue_offset = 0.85
    vtt_parts = ["WEBVTT FILE:\n\n"]
    if transcript:  # an empty transcript has no cues (and no zero division)
        split_index = _caption_split_points(transcript, piece_length)
        log.debug(split_index)
        # Divide the duration by the number of cues actually produced so the
        # last cue always ends at the end of the media.
        amount_of_chunks = len(split_index) - 1
        log.debug(amount_of_chunks)
        seconds_per_chunk = duration / amount_of_chunks
        pieces = zip(split_index, split_index[1:])
        for j, (piece_start, piece_end) in enumerate(pieces):
            seconds_start = round(seconds_per_chunk * j, 2) + cue_offset
            seconds_end = round(seconds_per_chunk * (j + 1), 2) + cue_offset
            vtt_parts.append(
                f"{_vtt_timestamp(seconds_start)} --> {_vtt_timestamp(seconds_end)}\n"
            )
            vtt_parts.append(f"{transcript[piece_start:piece_end]}\n\n")
    vtt_str = "".join(vtt_parts)
    vtt_dir = os.path.dirname(vtt_file)
    if vtt_dir:  # os.makedirs("") raises for a bare file name
        os.makedirs(vtt_dir, exist_ok=True)
    # WebVTT files must be UTF-8 regardless of the platform default encoding.
    with open(vtt_file, "w", encoding="utf-8") as f:
        f.write(vtt_str)
    log.debug(vtt_str)
    return vtt_str

0 comments on commit 24fad9c

Please sign in to comment.