From 56db424f44ffb67e36da96636c2db9441b27c058 Mon Sep 17 00:00:00 2001
From: Josh Allmann
Date: Tue, 30 Jul 2019 18:38:24 +0000
Subject: [PATCH] ffmpeg: Reuse transcode session for GPU encoding.

---
 ffmpeg/api_test.go                           |  90 ++
 ffmpeg/ffmpeg.go                             |  64 +-
 ffmpeg/ffmpeg_errors.go                      |   1 +
 ffmpeg/lpms_ffmpeg.c                         | 971 +++++++++++++------
 ffmpeg/lpms_ffmpeg.h                         |  11 +
 ffmpeg/nvidia_test.go                        | 297 ++
 transcoder/ffmpeg_segment_transcoder_test.go |   2 +-
 7 files changed, 1113 insertions(+), 323 deletions(-)
 create mode 100644 ffmpeg/api_test.go

diff --git a/ffmpeg/api_test.go b/ffmpeg/api_test.go
new file mode 100644
index 0000000000..0823d4fbfb
--- /dev/null
+++ b/ffmpeg/api_test.go
@@ -0,0 +1,90 @@
+package ffmpeg
+
+import (
+	"testing"
+)
+
+func TestTranscoderAPI_InvalidFile(t *testing.T) {
+	// Test the following file open results on input: fail, success, fail, success
+
+	tc := NewTranscoder()
+	defer tc.StopTranscoder()
+	in := &TranscodeOptionsIn{}
+	out := []TranscodeOptions{TranscodeOptions{
+		Oname:        "-",
+		AudioEncoder: ComponentOptions{Name: "copy"},
+		VideoEncoder: ComponentOptions{Name: "drop"},
+		Muxer:        ComponentOptions{Name: "null"},
+	}}
+
+	// fail # 1
+	in.Fname = "none"
+	_, err := tc.Transcode(in, out)
+	if err == nil || err.Error() != "No such file or directory" {
+		t.Error("Expected 'No such file or directory', got ", err)
+	}
+
+	// success # 1
+	in.Fname = "../transcoder/test.ts"
+	_, err = tc.Transcode(in, out)
+	if err != nil {
+		t.Error(err)
+	}
+
+	// fail # 2
+	in.Fname = "none"
+	_, err = tc.Transcode(in, out)
+	if err == nil || err.Error() != "No such file or directory" {
+		t.Error("Expected 'No such file or directory', got ", err)
+	}
+
+	// success # 2
+	in.Fname = "../transcoder/test.ts"
+	_, err = tc.Transcode(in, out)
+	if err != nil {
+		t.Error(err)
+	}
+
+	// Now check invalid output filename
+	out[0].Muxer = ComponentOptions{Name: "md5"}
+	out[0].Oname = "/not/really/anywhere"
+	_, err = tc.Transcode(in, out)
+	if err == nil {
+		t.Error("Expected error for invalid output filename")
+	}
+
+}
+
+func TestTranscoderAPI_Stopped(t *testing.T) {
+
+	// Test stopped transcoder
+	tc := NewTranscoder()
+	tc.StopTranscoder()
+	in := &TranscodeOptionsIn{}
+	_, err := tc.Transcode(in, nil)
+	if err != ErrTranscoderStp {
+		t.Errorf("Unexpected error; wanted %v but got %v", ErrTranscoderStp, err)
+	}
+
+	// test somehow munged transcoder handle
+	tc2 := NewTranscoder()
+	tc2.handle = nil // technically this leaks memory ...
OK for test + _, err = tc2.Transcode(in, nil) + if err != ErrTranscoderStp { + t.Errorf("Unexpected error; wanted %v but got %v", ErrTranscoderStp, err) + } +} + +func TestTranscoderAPI_TooManyOutputs(t *testing.T) { + + out := make([]TranscodeOptions, 11) + for i, _ := range out { + out[i].VideoEncoder = ComponentOptions{Name: "drop"} + } + in := &TranscodeOptionsIn{} + tc := NewTranscoder() + _, err := tc.Transcode(in, out) + if err == nil || err.Error() != "Too many outputs" { + t.Error("Expected 'Too many outputs', got ", err) + } +} diff --git a/ffmpeg/ffmpeg.go b/ffmpeg/ffmpeg.go index 4f5bd9d9ed..18ceb88e50 100644 --- a/ffmpeg/ffmpeg.go +++ b/ffmpeg/ffmpeg.go @@ -8,6 +8,7 @@ import ( "path/filepath" "strconv" "strings" + "sync" "unsafe" ) @@ -19,6 +20,7 @@ import "C" var ErrTranscoderRes = errors.New("TranscoderInvalidResolution") var ErrTranscoderHw = errors.New("TranscoderInvalidHardware") var ErrTranscoderInp = errors.New("TranscoderInvalidInput") +var ErrTranscoderStp = errors.New("TranscoderStopped") type Acceleration int @@ -33,6 +35,12 @@ type ComponentOptions struct { Opts map[string]string } +type Transcoder struct { + handle *C.struct_transcode_thread + stopped bool + mu *sync.Mutex +} + type TranscodeOptionsIn struct { Fname string Accel Acceleration @@ -155,6 +163,17 @@ func Transcode2(input *TranscodeOptionsIn, ps []TranscodeOptions) error { } func Transcode3(input *TranscodeOptionsIn, ps []TranscodeOptions) (*TranscodeResults, error) { + t := NewTranscoder() + defer t.StopTranscoder() + return t.Transcode(input, ps) +} + +func (t *Transcoder) Transcode(input *TranscodeOptionsIn, ps []TranscodeOptions) (*TranscodeResults, error) { + t.mu.Lock() + defer t.mu.Unlock() + if t.stopped || t.handle == nil { + return nil, ErrTranscoderStp + } if input == nil { return nil, ErrTranscoderInp } @@ -198,7 +217,7 @@ func Transcode3(input *TranscodeOptionsIn, ps []TranscodeOptions) (*TranscodeRes filters += fmt.Sprintf("%s='w=if(gte(iw,ih),%d,-2):h=if(lt(iw,ih),%d,-2)'", scale_filter, w, h) if input.Accel != Software && p.Accel == Software { // needed for hw dec -> hw rescale -> sw enc - filters = filters + ":format=yuv420p,hwdownload" + filters = filters + ",hwdownload,format=nv12" } muxOpts := C.component_opts{ opts: newAVOpts(p.Muxer.Opts), // don't free this bc of avformat_write_header API @@ -207,6 +226,12 @@ func Transcode3(input *TranscodeOptionsIn, ps []TranscodeOptions) (*TranscodeRes muxOpts.name = C.CString(p.Muxer.Name) defer C.free(unsafe.Pointer(muxOpts.name)) } + // Set some default encoding options + if len(p.VideoEncoder.Name) <= 0 && len(p.VideoEncoder.Opts) <= 0 { + p.VideoEncoder.Opts = map[string]string{ + "forced-idr": "1", + } + } vidOpts := C.component_opts{ name: C.CString(encoder), opts: newAVOpts(p.VideoEncoder.Opts), @@ -230,13 +255,28 @@ func Transcode3(input *TranscodeOptionsIn, ps []TranscodeOptions) (*TranscodeRes params[i] = C.output_params{fname: oname, fps: fps, w: C.int(w), h: C.int(h), bitrate: C.int(bitrate), muxer: muxOpts, audio: audioOpts, video: vidOpts, vfilters: vfilt} + defer func(param *C.output_params) { + // Work around the ownership rules: + // ffmpeg normally takes ownership of the following AVDictionary options + // However, if we don't pass these opts to ffmpeg, then we need to free + if param.muxer.opts != nil { + C.av_dict_free(¶m.muxer.opts) + } + if param.audio.opts != nil { + C.av_dict_free(¶m.audio.opts) + } + if param.video.opts != nil { + C.av_dict_free(¶m.video.opts) + } + }(¶ms[i]) } var device *C.char if input.Device != 
"" { device = C.CString(input.Device) defer C.free(unsafe.Pointer(device)) } - inp := &C.input_params{fname: fname, hw_type: hw_type, device: device} + inp := &C.input_params{fname: fname, hw_type: hw_type, device: device, + handle: t.handle} results := make([]C.output_results, len(ps)) decoded := &C.output_results{} var ( @@ -249,7 +289,7 @@ func Transcode3(input *TranscodeOptionsIn, ps []TranscodeOptions) (*TranscodeRes } ret := int(C.lpms_transcode(inp, paramsPointer, resultsPointer, C.int(len(params)), decoded)) if 0 != ret { - glog.Error("Transcoder Return : ", Strerror(ret)) + glog.Error("Transcoder Return : ", ErrorMap[ret]) return nil, ErrorMap[ret] } tr := make([]MediaInfo, len(ps)) @@ -266,6 +306,24 @@ func Transcode3(input *TranscodeOptionsIn, ps []TranscodeOptions) (*TranscodeRes return &TranscodeResults{Encoded: tr, Decoded: dec}, nil } +func NewTranscoder() *Transcoder { + return &Transcoder{ + handle: C.lpms_transcode_new(), + mu: &sync.Mutex{}, + } +} + +func (t *Transcoder) StopTranscoder() { + t.mu.Lock() + defer t.mu.Unlock() + if t.stopped { + return + } + C.lpms_transcode_stop(t.handle) + t.handle = nil // prevent accidental reuse + t.stopped = true +} + func InitFFmpeg() { C.lpms_init() } diff --git a/ffmpeg/ffmpeg_errors.go b/ffmpeg/ffmpeg_errors.go index e87143a291..a3a61e74d6 100644 --- a/ffmpeg/ffmpeg_errors.go +++ b/ffmpeg/ffmpeg_errors.go @@ -35,6 +35,7 @@ func error_map() map[int]error { }{ {code: C.lpms_ERR_INPUT_PIXFMT, desc: "Unsupported input pixel format"}, {code: C.lpms_ERR_FILTERS, desc: "Error initializing filtergraph"}, + {code: C.lpms_ERR_OUTPUTS, desc: "Too many outputs"}, } for _, v := range lpmsErrors { m[int(v.code)] = errors.New(v.desc) diff --git a/ffmpeg/lpms_ffmpeg.c b/ffmpeg/lpms_ffmpeg.c index 7bf5a92417..09e8ae063a 100644 --- a/ffmpeg/lpms_ffmpeg.c +++ b/ffmpeg/lpms_ffmpeg.c @@ -1,15 +1,21 @@ #include "lpms_ffmpeg.h" +#include + #include #include #include #include #include +#include + +#include // Not great to appropriate internal API like this... 
const int lpms_ERR_INPUT_PIXFMT = FFERRTAG('I','N','P','X'); const int lpms_ERR_FILTERS = FFERRTAG('F','L','T','R'); const int lpms_ERR_PACKET_ONLY = FFERRTAG('P','K','O','N'); +const int lpms_ERR_OUTPUTS = FFERRTAG('O','U','T','P'); // // Internal transcoder data structures @@ -24,8 +30,13 @@ struct input_ctx { // Hardware decoding support AVBufferRef *hw_device_ctx; enum AVHWDeviceType hw_type; + char *device; int64_t next_pts_a, next_pts_v; + + // Decoder flush + AVPacket *first_pkt; + int flushed; }; struct filter_ctx { @@ -50,6 +61,9 @@ struct output_ctx { int dv, da; // flags whether to drop video or audio struct filter_ctx vf, af; + // Optional hardware encoding support + enum AVHWDeviceType hw_type; + // muxer and encoder information (name + options) component_opts *muxer; component_opts *video; @@ -58,6 +72,19 @@ struct output_ctx { int64_t drop_ts; // preroll audio ts to drop output_results *res; // data to return for this output + +}; + +#define MAX_OUTPUT_SIZE 10 + +struct transcode_thread { + int initialized; + + struct input_ctx ictx; + struct output_ctx outputs[MAX_OUTPUT_SIZE]; + + int nb_outputs; + }; void lpms_init() @@ -185,7 +212,7 @@ static void free_output(struct output_ctx *octx) avformat_free_context(octx->oc); octx->oc = NULL; } - if (octx->vc) avcodec_free_context(&octx->vc); + if (octx->vc && AV_HWDEVICE_TYPE_NONE == octx->hw_type) avcodec_free_context(&octx->vc); if (octx->ac) avcodec_free_context(&octx->ac); free_filter(&octx->vf); free_filter(&octx->af); @@ -205,6 +232,23 @@ static int needs_decoder(char *encoder) { return !(is_copy(encoder) || is_drop(encoder)); } +static int is_flush_frame(AVFrame *frame) +{ + return -1 == frame->pts; +} + +static void send_first_pkt(struct input_ctx *ictx) +{ + if (ictx->flushed || !ictx->first_pkt) return; + + int ret = avcodec_send_packet(ictx->vc, ictx->first_pkt); + if (ret < 0) { + char errstr[AV_ERROR_MAX_STRING_SIZE]; + av_strerror(ret, errstr, sizeof errstr); + fprintf(stderr, "Error sending flush packet : %s\n", errstr); + } +} + static enum AVPixelFormat hw2pixfmt(AVCodecContext *ctx) { const AVCodec *decoder = ctx->codec; @@ -223,241 +267,49 @@ static enum AVPixelFormat hw2pixfmt(AVCodecContext *ctx) return AV_PIX_FMT_NONE; } -static enum AVPixelFormat get_hw_pixfmt(AVCodecContext *ctx, const enum AVPixelFormat *pix_fmts) +static enum AVPixelFormat get_hw_pixfmt(AVCodecContext *vc, const enum AVPixelFormat *pix_fmts) { - // XXX see avcodec_get_hw_frames_parameters if fmt changes mid-stream - return hw2pixfmt(ctx); -} - -static int open_output(struct output_ctx *octx, struct input_ctx *ictx) -{ -#define em_err(msg) { \ - if (!ret) ret = -1; \ - fprintf(stderr, msg); \ - goto open_output_err; \ -} - int ret = 0, inp_has_stream; - - AVOutputFormat *fmt = NULL; - AVFormatContext *oc = NULL; - AVCodecContext *vc = NULL; - AVCodecContext *ac = NULL; - AVCodec *codec = NULL; - AVStream *st = NULL; - - // open muxer - fmt = av_guess_format(octx->muxer->name, octx->fname, NULL); - if (!fmt) em_err("Unable to guess output format\n"); - ret = avformat_alloc_output_context2(&oc, fmt, NULL, octx->fname); - if (ret < 0) em_err("Unable to alloc output context\n"); - octx->oc = oc; - - // add video encoder if a decoder exists and this output requires one - if (ictx->vc && needs_decoder(octx->video->name)) { - codec = avcodec_find_encoder_by_name(octx->video->name); - if (!codec) em_err("Unable to find encoder"); - - // open video encoder - // XXX use avoptions rather than manual enumeration - vc = avcodec_alloc_context3(codec); 
- if (!vc) em_err("Unable to alloc video encoder\n"); - octx->vc = vc; - vc->width = av_buffersink_get_w(octx->vf.sink_ctx); - vc->height = av_buffersink_get_h(octx->vf.sink_ctx); - if (octx->fps.den) vc->framerate = av_buffersink_get_frame_rate(octx->vf.sink_ctx); - else vc->framerate = ictx->vc->framerate; - if (octx->fps.den) vc->time_base = av_buffersink_get_time_base(octx->vf.sink_ctx); - else if (ictx->vc->time_base.num && ictx->vc->time_base.den) vc->time_base = ictx->vc->time_base; - else vc->time_base = ictx->ic->streams[ictx->vi]->time_base; - if (octx->bitrate) vc->rc_min_rate = vc->rc_max_rate = vc->rc_buffer_size = octx->bitrate; - if (av_buffersink_get_hw_frames_ctx(octx->vf.sink_ctx)) { - vc->hw_frames_ctx = - av_buffer_ref(av_buffersink_get_hw_frames_ctx(octx->vf.sink_ctx)); - } - vc->pix_fmt = av_buffersink_get_format(octx->vf.sink_ctx); // XXX select based on encoder + input support - if (fmt->flags & AVFMT_GLOBALHEADER) vc->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; - ret = avcodec_open2(vc, codec, &octx->video->opts); - av_dict_free(&octx->video->opts); // avcodec_open2 replaces this - if (ret < 0) em_err("Error opening video encoder\n"); - } - - // add video stream if input contains video - inp_has_stream = ictx->vi >= 0; - if (inp_has_stream && !octx->dv) { - // video stream in muxer - st = avformat_new_stream(oc, NULL); - if (!st) em_err("Unable to alloc video stream\n"); - octx->vi = st->index; - st->avg_frame_rate = octx->fps; - if (is_copy(octx->video->name)) { - AVStream *ist = ictx->ic->streams[ictx->vi]; - st->time_base = ist->time_base; - ret = avcodec_parameters_copy(st->codecpar, ist->codecpar); - if (ret < 0) em_err("Error copying video params from input stream\n"); - // Sometimes the codec tag is wonky for some reason, so correct it - ret = av_codec_get_tag2(fmt->codec_tag, st->codecpar->codec_id, &st->codecpar->codec_tag); - //if (!ret) fprintf(stderr, "Video codec tag not found. 
Continuing anyway\n"); - avformat_transfer_internal_stream_timing_info(fmt, st, ist, AVFMT_TBCF_DEMUXER); - } else if (vc) { - st->time_base = vc->time_base; - ret = avcodec_parameters_from_context(st->codecpar, vc); - if (ret < 0) em_err("Error setting video params from encoder\n"); - } else em_err("No video encoder, not a copy; what is this?\n"); - } - - // add audio encoder if a decoder exists and this output requires one - if (ictx->ac && needs_decoder(octx->audio->name)) { - codec = avcodec_find_encoder_by_name(octx->audio->name); - if (!codec) em_err("Unable to find audio encoder\n"); - // open audio encoder - ac = avcodec_alloc_context3(codec); - if (!ac) em_err("Unable to alloc audio encoder\n"); - octx->ac = ac; - ac->sample_fmt = av_buffersink_get_format(octx->af.sink_ctx); - ac->channel_layout = av_buffersink_get_channel_layout(octx->af.sink_ctx); - ac->channels = av_buffersink_get_channels(octx->af.sink_ctx); - ac->sample_rate = av_buffersink_get_sample_rate(octx->af.sink_ctx); - ac->time_base = av_buffersink_get_time_base(octx->af.sink_ctx); - if (fmt->flags & AVFMT_GLOBALHEADER) ac->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; - ret = avcodec_open2(ac, codec, &octx->audio->opts); - av_dict_free(&octx->audio->opts); // avcodec_open2 replaces this - if (ret < 0) em_err("Error opening audio encoder\n"); - av_buffersink_set_frame_size(octx->af.sink_ctx, ac->frame_size); - } - - // add audio stream if input contains audio - inp_has_stream = ictx->ai >= 0; - if (inp_has_stream && !octx->da) { - // audio stream in muxer - st = avformat_new_stream(oc, NULL); - if (!st) em_err("Unable to alloc audio stream\n"); - if (is_copy(octx->audio->name)) { - AVStream *ist = ictx->ic->streams[ictx->ai]; - st->time_base = ist->time_base; - ret = avcodec_parameters_copy(st->codecpar, ist->codecpar); - if (ret < 0) em_err("Error copying audio params from input stream\n"); - // Sometimes the codec tag is wonky for some reason, so correct it - ret = av_codec_get_tag2(fmt->codec_tag, st->codecpar->codec_id, &st->codecpar->codec_tag); - //if (!ret) fprintf(stderr, "Audio codec tag not found. Continuing anyway\n"); - avformat_transfer_internal_stream_timing_info(fmt, st, ist, AVFMT_TBCF_DEMUXER); - } else if (ac) { - st->time_base = ac->time_base; - ret = avcodec_parameters_from_context(st->codecpar, ac); - if (ret < 0) em_err("Error setting audio params from encoder\n"); - } else em_err("No audio encoder; not a copy; what is this?\n"); - octx->ai = st->index; - - // signal whether to drop preroll audio - if (st->codecpar->initial_padding) octx->drop_ts = AV_NOPTS_VALUE; + AVHWFramesContext *frames; + int ret; + + // XXX Ideally this would be auto initialized by the HW device ctx + // However the initialization doesn't occur in time to set up filters + // So we do it here. Also see avcodec_get_hw_frames_parameters + av_buffer_unref(&vc->hw_frames_ctx); + vc->hw_frames_ctx = av_hwframe_ctx_alloc(vc->hw_device_ctx); + if (!vc->hw_frames_ctx) { + fprintf(stderr, "Unable to allocate hwframe context for decoding\n"); + return AV_PIX_FMT_NONE; } - if (!(fmt->flags & AVFMT_NOFILE)) { - ret = avio_open(&octx->oc->pb, octx->fname, AVIO_FLAG_WRITE); - if (ret < 0) em_err("Error opening output file\n"); + frames = (AVHWFramesContext*)vc->hw_frames_ctx->data; + frames->format = hw2pixfmt(vc); + frames->sw_format = vc->sw_pix_fmt; + frames->width = vc->width; + frames->height = vc->height; + + // May want to allocate extra HW frames if we encounter samples where + // the defaults are insufficient. 
Raising this increases GPU memory usage + // For now, the defaults seems OK. + //vc->extra_hw_frames = 16 + 1; // H.264 max refs + + ret = av_hwframe_ctx_init(vc->hw_frames_ctx); + if (AVERROR(ENOSYS) == ret) ret = lpms_ERR_INPUT_PIXFMT; // most likely + if (ret < 0) { + fprintf(stderr,"Unable to initialize a hardware frame pool\n"); + return AV_PIX_FMT_NONE; } - ret = avformat_write_header(oc, &octx->muxer->opts); - av_dict_free(&octx->muxer->opts); // avformat_write_header replaces this - if (ret < 0) em_err("Error writing header\n"); - - return 0; - -open_output_err: - free_output(octx); - return ret; -} - -static void free_input(struct input_ctx *inctx) -{ - if (inctx->ic) avformat_close_input(&inctx->ic); - if (inctx->vc) avcodec_free_context(&inctx->vc); - if (inctx->ac) avcodec_free_context(&inctx->ac); - if (inctx->hw_device_ctx) av_buffer_unref(&inctx->hw_device_ctx); -} - -static int open_input(input_params *params, struct input_ctx *ctx) -{ -#define dd_err(msg) { \ - if (!ret) ret = -1; \ - fprintf(stderr, msg); \ - goto open_input_err; \ +/* +fprintf(stderr, "selected format: hw %s sw %s\n", +av_get_pix_fmt_name(frames->format), av_get_pix_fmt_name(frames->sw_format)); +const enum AVPixelFormat *p; +for (p = pix_fmts; *p != -1; p++) { +fprintf(stderr,"possible format: %s\n", av_get_pix_fmt_name(*p)); } - AVCodec *codec = NULL; - AVFormatContext *ic = NULL; - char *inp = params->fname; - - // open demuxer - int ret = avformat_open_input(&ic, inp, NULL, NULL); - if (ret < 0) dd_err("demuxer: Unable to open input\n"); - ctx->ic = ic; - ret = avformat_find_stream_info(ic, NULL); - if (ret < 0) dd_err("Unable to find input info\n"); - - // open video decoder - ctx->vi = av_find_best_stream(ic, AVMEDIA_TYPE_VIDEO, -1, -1, &codec, 0); - if (ctx->dv) ; // skip decoding video - else if (ctx->vi < 0) { - fprintf(stderr, "No video stream found in input\n"); - } else { - AVCodecContext *vc = avcodec_alloc_context3(codec); - if (!vc) dd_err("Unable to alloc video codec\n"); - ctx->vc = vc; - ret = avcodec_parameters_to_context(vc, ic->streams[ctx->vi]->codecpar); - if (ret < 0) dd_err("Unable to assign video params\n"); - if (params->hw_type != AV_HWDEVICE_TYPE_NONE) { - // First set the hw device then set the hw frame - AVHWFramesContext *frames; - ret = av_hwdevice_ctx_create(&ctx->hw_device_ctx, params->hw_type, params->device, NULL, 0); - if (ret < 0) dd_err("Unable to open hardware context for decoding\n") - ctx->hw_type = params->hw_type; - vc->hw_device_ctx = av_buffer_ref(ctx->hw_device_ctx); - vc->get_format = get_hw_pixfmt; - vc->opaque = (void*)ctx; - // XXX Ideally this would be auto initialized by the HW device ctx - // However the initialization doesn't occur in time to set up filters - // So we do it here. Also see avcodec_get_hw_frames_parameters - vc->hw_frames_ctx = av_hwframe_ctx_alloc(vc->hw_device_ctx); - if (!vc->hw_frames_ctx) dd_err("Unable to allocate hwframe context for decoding\n") - frames = (AVHWFramesContext*)vc->hw_frames_ctx->data; - frames->format = hw2pixfmt(vc); - frames->sw_format = vc->pix_fmt; - frames->width = vc->width; - frames->height = vc->height; - - // May want to allocate extra HW frames if we encounter samples where - // the defaults are insufficient. Raising this increases GPU memory usage - // For now, the defaults seems OK. 
- //vc->extra_hw_frames = 16 + 1; // H.264 max refs - - ret = av_hwframe_ctx_init(vc->hw_frames_ctx); - if (AVERROR(ENOSYS) == ret) ret = lpms_ERR_INPUT_PIXFMT; // most likely - if (ret < 0) dd_err("Unable to initialize a hardware frame pool\n") - } - ret = avcodec_open2(vc, codec, NULL); - if (ret < 0) dd_err("Unable to open video decoder\n"); - } - - // open audio decoder - ctx->ai = av_find_best_stream(ic, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0); - if (ctx->da) ; // skip decoding audio - else if (ctx->ai < 0) { - fprintf(stderr, "No audio stream found in input\n"); - } else { - AVCodecContext * ac = avcodec_alloc_context3(codec); - if (!ac) dd_err("Unable to alloc audio codec\n"); - ctx->ac = ac; - ret = avcodec_parameters_to_context(ac, ic->streams[ctx->ai]->codecpar); - if (ret < 0) dd_err("Unable to assign audio params\n"); - ret = avcodec_open2(ac, codec, NULL); - if (ret < 0) dd_err("Unable to open audio decoder\n"); - } +*/ - return 0; - -open_input_err: - free_input(ctx); - return ret; -#undef dd_err + return frames->format; } static int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx) @@ -563,8 +415,7 @@ static int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx) } -static int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx, - char *filters_descr) +static int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx) { #define af_err(msg) { \ if (!ret) ret = -1; \ @@ -573,6 +424,7 @@ static int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx, } int ret = 0; char args[512]; + char filters_descr[256]; const AVFilter *buffersrc = avfilter_get_by_name("abuffer"); const AVFilter *buffersink = avfilter_get_by_name("abuffersink"); AVFilterInOut *outputs = avfilter_inout_alloc(); @@ -595,6 +447,11 @@ static int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx, ictx->ac->sample_rate, ictx->ac->sample_fmt, ictx->ac->channel_layout, ictx->ac->channels, time_base.num, time_base.den); + // TODO set sample format and rate based on encoder support, + // rather than hardcoding + snprintf(filters_descr, sizeof filters_descr, + "aformat=sample_fmts=fltp:channel_layouts=stereo:sample_rates=44100"); + ret = avfilter_graph_create_filter(&af->src_ctx, buffersrc, "in", args, NULL, af->graph); if (ret < 0) af_err("Cannot create audio buffer source\n"); @@ -650,6 +507,351 @@ static int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx, #undef af_err } + +static int add_video_stream(struct output_ctx *octx, struct input_ctx *ictx) +{ +#define vs_err(msg) { \ + if (!ret) ret = -1; \ + fprintf(stderr, "Error adding video stream: " msg); \ + goto add_video_err; \ +} + + // video stream to muxer + int ret = 0; + AVStream *st = avformat_new_stream(octx->oc, NULL); + if (!st) vs_err("Unable to alloc video stream\n"); + octx->vi = st->index; + st->avg_frame_rate = octx->fps; + if (is_copy(octx->video->name)) { + AVStream *ist = ictx->ic->streams[ictx->vi]; + if (ictx->vi < 0 || !ist) vs_err("Input video stream does not exist\n"); + st->time_base = ist->time_base; + ret = avcodec_parameters_copy(st->codecpar, ist->codecpar); + if (ret < 0) vs_err("Error copying video params from input stream\n"); + // Sometimes the codec tag is wonky for some reason, so correct it + ret = av_codec_get_tag2(octx->oc->oformat->codec_tag, st->codecpar->codec_id, &st->codecpar->codec_tag); + avformat_transfer_internal_stream_timing_info(octx->oc->oformat, st, ist, AVFMT_TBCF_DEMUXER); + } else if (octx->vc) { + 
st->time_base = octx->vc->time_base; + ret = avcodec_parameters_from_context(st->codecpar, octx->vc); + if (ret < 0) vs_err("Error setting video params from encoder\n"); + } else vs_err("No video encoder, not a copy; what is this?\n"); + return 0; + +add_video_err: + // XXX free anything here? + return ret; +#undef vs_err +} + +static int add_audio_stream(struct input_ctx *ictx, struct output_ctx *octx) +{ +#define as_err(msg) { \ + if (!ret) ret = -1; \ + fprintf(stderr, "Error adding audio stream: " msg); \ + goto add_audio_err; \ +} + + if (ictx->ai < 0 || octx->da) { + // Don't need to add an audio stream if no input audio exists, + // or we're dropping the output audio stream + return 0; + } + + // audio stream to muxer + int ret = 0; + AVStream *st = avformat_new_stream(octx->oc, NULL); + if (!st) as_err("Unable to alloc audio stream\n"); + if (is_copy(octx->audio->name)) { + AVStream *ist = ictx->ic->streams[ictx->ai]; + if (ictx->ai < 0 || !ist) as_err("Input audio stream does not exist\n"); + st->time_base = ist->time_base; + ret = avcodec_parameters_copy(st->codecpar, ist->codecpar); + if (ret < 0) as_err("Error copying audio params from input stream\n"); + // Sometimes the codec tag is wonky for some reason, so correct it + ret = av_codec_get_tag2(octx->oc->oformat->codec_tag, st->codecpar->codec_id, &st->codecpar->codec_tag); + avformat_transfer_internal_stream_timing_info(octx->oc->oformat, st, ist, AVFMT_TBCF_DEMUXER); + } else if (octx->ac) { + st->time_base = octx->ac->time_base; + ret = avcodec_parameters_from_context(st->codecpar, octx->ac); + if (ret < 0) as_err("Error setting audio params from encoder\n"); + } else if (is_drop(octx->audio->name)) { + // Supposed to exit this function early if there's a drop + as_err("Shouldn't ever happen here\n"); + } else { + as_err("No audio encoder; not a copy; what is this?\n"); + } + octx->ai = st->index; + + // signal whether to drop preroll audio + if (st->codecpar->initial_padding) octx->drop_ts = AV_NOPTS_VALUE; + return 0; + +add_audio_err: + // XXX free anything here? 
+ return ret; +#undef as_err +} + +static int open_audio_output(struct input_ctx *ictx, struct output_ctx *octx, + AVOutputFormat *fmt) +{ +#define ao_err(msg) { \ + if (!ret) ret = -1; \ + fprintf(stderr, msg"\n"); \ + goto audio_output_err; \ +} + + int ret = 0; + AVCodec *codec = NULL; + AVCodecContext *ac = NULL; + + // add audio encoder if a decoder exists and this output requires one + if (ictx->ac && needs_decoder(octx->audio->name)) { + + // initialize audio filters + ret = init_audio_filters(ictx, octx); + if (ret < 0) ao_err("Unable to open audio filter") + + // open encoder + codec = avcodec_find_encoder_by_name(octx->audio->name); + if (!codec) ao_err("Unable to find audio encoder"); + // open audio encoder + ac = avcodec_alloc_context3(codec); + if (!ac) ao_err("Unable to alloc audio encoder"); + octx->ac = ac; + ac->sample_fmt = av_buffersink_get_format(octx->af.sink_ctx); + ac->channel_layout = av_buffersink_get_channel_layout(octx->af.sink_ctx); + ac->channels = av_buffersink_get_channels(octx->af.sink_ctx); + ac->sample_rate = av_buffersink_get_sample_rate(octx->af.sink_ctx); + ac->time_base = av_buffersink_get_time_base(octx->af.sink_ctx); + if (fmt->flags & AVFMT_GLOBALHEADER) ac->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + ret = avcodec_open2(ac, codec, &octx->audio->opts); + if (ret < 0) ao_err("Error opening audio encoder"); + av_buffersink_set_frame_size(octx->af.sink_ctx, ac->frame_size); + } + + ret = add_audio_stream(ictx, octx); + if (ret < 0) ao_err("Error adding audio stream") + +audio_output_err: + // TODO clean up anything here? + return ret; + +#undef ao_err +} + + +static int open_output(struct output_ctx *octx, struct input_ctx *ictx) +{ +#define em_err(msg) { \ + if (!ret) ret = -1; \ + fprintf(stderr, msg); \ + goto open_output_err; \ +} + int ret = 0, inp_has_stream; + + AVOutputFormat *fmt = NULL; + AVFormatContext *oc = NULL; + AVCodecContext *vc = NULL; + AVCodec *codec = NULL; + + // open muxer + fmt = av_guess_format(octx->muxer->name, octx->fname, NULL); + if (!fmt) em_err("Unable to guess output format\n"); + ret = avformat_alloc_output_context2(&oc, fmt, NULL, octx->fname); + if (ret < 0) em_err("Unable to alloc output context\n"); + octx->oc = oc; + + // add video encoder if a decoder exists and this output requires one + if (ictx->vc && needs_decoder(octx->video->name)) { + ret = init_video_filters(ictx, octx); + if (ret < 0) em_err("Unable to open video filter"); + + codec = avcodec_find_encoder_by_name(octx->video->name); + if (!codec) em_err("Unable to find encoder"); + + // open video encoder + // XXX use avoptions rather than manual enumeration + vc = avcodec_alloc_context3(codec); + if (!vc) em_err("Unable to alloc video encoder\n"); + octx->vc = vc; + vc->width = av_buffersink_get_w(octx->vf.sink_ctx); + vc->height = av_buffersink_get_h(octx->vf.sink_ctx); + if (octx->fps.den) vc->framerate = av_buffersink_get_frame_rate(octx->vf.sink_ctx); + else vc->framerate = ictx->vc->framerate; + if (octx->fps.den) vc->time_base = av_buffersink_get_time_base(octx->vf.sink_ctx); + else if (ictx->vc->time_base.num && ictx->vc->time_base.den) vc->time_base = ictx->vc->time_base; + else vc->time_base = ictx->ic->streams[ictx->vi]->time_base; + if (octx->bitrate) vc->rc_min_rate = vc->rc_max_rate = vc->rc_buffer_size = octx->bitrate; + if (av_buffersink_get_hw_frames_ctx(octx->vf.sink_ctx)) { + vc->hw_frames_ctx = + av_buffer_ref(av_buffersink_get_hw_frames_ctx(octx->vf.sink_ctx)); + } + vc->pix_fmt = av_buffersink_get_format(octx->vf.sink_ctx); // XXX 
select based on encoder + input support + if (fmt->flags & AVFMT_GLOBALHEADER) vc->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; + ret = avcodec_open2(vc, codec, &octx->video->opts); + if (ret < 0) em_err("Error opening video encoder\n"); + octx->hw_type = ictx->hw_type; + } + + // add video stream if input contains video + inp_has_stream = ictx->vi >= 0; + if (inp_has_stream && !octx->dv) { + ret = add_video_stream(octx, ictx); + if (ret < 0) em_err("Error adding video stream\n"); + } + + ret = open_audio_output(ictx, octx, fmt); + if (ret < 0) em_err("Error opening audio output\n"); + + if (!(fmt->flags & AVFMT_NOFILE)) { + ret = avio_open(&octx->oc->pb, octx->fname, AVIO_FLAG_WRITE); + if (ret < 0) em_err("Error opening output file\n"); + } + + ret = avformat_write_header(oc, &octx->muxer->opts); + if (ret < 0) em_err("Error writing header\n"); + + return 0; + +open_output_err: + free_output(octx); + return ret; +} + +static void free_input(struct input_ctx *inctx) +{ + if (inctx->ic) avformat_close_input(&inctx->ic); + if (inctx->vc) { + if (inctx->vc->hw_device_ctx) av_buffer_unref(&inctx->vc->hw_device_ctx); + avcodec_free_context(&inctx->vc); + } + if (inctx->ac) avcodec_free_context(&inctx->ac); + if (inctx->hw_device_ctx) av_buffer_unref(&inctx->hw_device_ctx); +} + +static int open_video_decoder(input_params *params, struct input_ctx *ctx) +{ +#define dd_err(msg) { \ + if (!ret) ret = -1; \ + fprintf(stderr, msg); \ + goto open_decoder_err; \ +} + int ret = 0; + AVCodec *codec = NULL; + AVFormatContext *ic = ctx->ic; + + // open video decoder + ctx->vi = av_find_best_stream(ic, AVMEDIA_TYPE_VIDEO, -1, -1, &codec, 0); + if (ctx->dv) ; // skip decoding video + else if (ctx->vi < 0) { + fprintf(stderr, "No video stream found in input\n"); + } else { + if (AV_CODEC_ID_H264 == codec->id && + AV_HWDEVICE_TYPE_CUDA == params->hw_type) { + AVCodec *c = avcodec_find_decoder_by_name("h264_cuvid"); + if (c) codec = c; + else fprintf(stderr, "Cuvid decoder not found; defaulting to software\n"); + } + AVCodecContext *vc = avcodec_alloc_context3(codec); + if (!vc) dd_err("Unable to alloc video codec\n"); + ctx->vc = vc; + ret = avcodec_parameters_to_context(vc, ic->streams[ctx->vi]->codecpar); + if (ret < 0) dd_err("Unable to assign video params\n"); + vc->opaque = (void*)ctx; + // XXX Could this break if the original device falls out of scope in golang? 
+ if (params->hw_type != AV_HWDEVICE_TYPE_NONE) { + // First set the hw device then set the hw frame + ret = av_hwdevice_ctx_create(&ctx->hw_device_ctx, params->hw_type, params->device, NULL, 0); + if (ret < 0) dd_err("Unable to open hardware context for decoding\n") + ctx->hw_type = params->hw_type; + vc->hw_device_ctx = av_buffer_ref(ctx->hw_device_ctx); + vc->get_format = get_hw_pixfmt; + } + vc->pkt_timebase = ic->streams[ctx->vi]->time_base; + ret = avcodec_open2(vc, codec, NULL); + if (ret < 0) dd_err("Unable to open video decoder\n"); + } + + return 0; + +open_decoder_err: + free_input(ctx); + return ret; +#undef dd_err +} + +static int open_audio_decoder(input_params *params, struct input_ctx *ctx) +{ +#define ad_err(msg) { \ + if (!ret) ret = -1; \ + fprintf(stderr, msg); \ + goto open_audio_err; \ +} + int ret = 0; + AVCodec *codec = NULL; + AVFormatContext *ic = ctx->ic; + + // open audio decoder + ctx->ai = av_find_best_stream(ic, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0); + if (ctx->da) ; // skip decoding audio + else if (ctx->ai < 0) { + fprintf(stderr, "No audio stream found in input\n"); + } else { + AVCodecContext * ac = avcodec_alloc_context3(codec); + if (!ac) ad_err("Unable to alloc audio codec\n"); + if (ctx->ac) fprintf(stderr, "Audio context already open! %p\n", ctx->ac); + ctx->ac = ac; + ret = avcodec_parameters_to_context(ac, ic->streams[ctx->ai]->codecpar); + if (ret < 0) ad_err("Unable to assign audio params\n"); + ret = avcodec_open2(ac, codec, NULL); + if (ret < 0) ad_err("Unable to open audio decoder\n"); + } + + return 0; + +open_audio_err: + free_input(ctx); + return ret; +#undef ad_err +} + +static int open_input(input_params *params, struct input_ctx *ctx) +{ +#define dd_err(msg) { \ + if (!ret) ret = -1; \ + fprintf(stderr, msg); \ + goto open_input_err; \ +} + AVFormatContext *ic = NULL; + char *inp = params->fname; + int ret = 0; + + // open demuxer + ic = avformat_alloc_context(); + if (!ic) dd_err("demuxer: Unable to alloc context\n"); + ret = avio_open(&ic->pb, inp, AVIO_FLAG_READ); + if (ret < 0) dd_err("demuxer: Unable to open file\n"); + ret = avformat_open_input(&ic, NULL, NULL, NULL); + if (ret < 0) dd_err("demuxer: Unable to open input\n"); + ctx->ic = ic; + ret = avformat_find_stream_info(ic, NULL); + if (ret < 0) dd_err("Unable to find input info\n"); + ret = open_video_decoder(params, ctx); + if (ret < 0) dd_err("Unable to open video decoder\n") + ret = open_audio_decoder(params, ctx); + if (ret < 0) dd_err("Unable to open audio decoder\n") + + return 0; + +open_input_err: +fprintf(stderr, "Freeing input based on OPEN INPUT error\n"); + free_input(ctx); + return ret; +#undef dd_err +} + int process_in(struct input_ctx *ictx, AVFrame *frame, AVPacket *pkt) { #define dec_err(msg) { \ @@ -673,6 +875,11 @@ int process_in(struct input_ctx *ictx, AVFrame *frame, AVPacket *pkt) else if (pkt->stream_index == ictx->vi || pkt->stream_index == ictx->ai) break; else dec_err("Could not find decoder or stream\n"); + if (!ictx->first_pkt && pkt->flags & AV_PKT_FLAG_KEY && decoder == ictx->vc) { + ictx->first_pkt = av_packet_clone(pkt); + ictx->first_pkt->pts = -1; + } + ret = avcodec_send_packet(decoder, pkt); if (ret < 0) dec_err("Error sending packet to decoder\n"); ret = avcodec_receive_frame(decoder, frame); @@ -690,23 +897,32 @@ int process_in(struct input_ctx *ictx, AVFrame *frame, AVPacket *pkt) return ret; dec_flush: + + // Flush and close decoder for non-CUDA + if (ictx->vc) { - avcodec_send_packet(ictx->vc, NULL); + send_first_pkt(ictx); + + // XXX 
sanity check this works as expected with SW encoding! ret = avcodec_receive_frame(ictx->vc, frame); - pkt->stream_index = ictx->vi; // XXX ugly? - if (!ret) return ret; + pkt->stream_index = ictx->vi; + if (!ret) { + if (is_flush_frame(frame)) ictx->flushed = 1; + return ret; + } } if (ictx->ac) { avcodec_send_packet(ictx->ac, NULL); ret = avcodec_receive_frame(ictx->ac, frame); - pkt->stream_index = ictx->ai; // XXX ugly? + pkt->stream_index = ictx->ai; + if (!ret) return ret; } - return ret; + return AVERROR_EOF; #undef dec_err } -int mux(AVPacket *pkt, AVRational tb, struct output_ctx *octx, AVStream *ost) +static int mux(AVPacket *pkt, AVRational tb, struct output_ctx *octx, AVStream *ost) { pkt->stream_index = ost->index; if (av_cmp_q(tb, ost->time_base)) { @@ -729,20 +945,34 @@ int encode(AVCodecContext* encoder, AVFrame *frame, struct output_ctx* octx, AVS char errstr[AV_ERROR_MAX_STRING_SIZE] = {0}; \ if (!ret) { fprintf(stderr, "should not happen\n"); ret = AVERROR(ENOMEM); } \ if (ret < -1) av_strerror(ret, errstr, sizeof errstr); \ - fprintf(stderr, "%s: %s", msg, errstr); \ + fprintf(stderr, "%s: %s\n", msg, errstr); \ goto encode_cleanup; \ } + int ret = 0; AVPacket pkt = {0}; if (AVMEDIA_TYPE_VIDEO == ost->codecpar->codec_type && frame) { + if (!octx->res->frames) { + frame->pict_type = AV_PICTURE_TYPE_I; + } octx->res->frames++; octx->res->pixels += encoder->width * encoder->height; } - int ret = avcodec_send_frame(encoder, frame); - if (AVERROR_EOF == ret) ; // continue ; drain encoder - else if (ret < 0) encode_err("Error sending frame to encoder"); + + // We don't want to send NULL frames for HW encoding + // because that closes the encoder: not something we want + if (AV_HWDEVICE_TYPE_NONE == octx->hw_type || frame) { + ret = avcodec_send_frame(encoder, frame); + if (AVERROR_EOF == ret) ; // continue ; drain encoder + else if (ret < 0) encode_err("Error sending frame to encoder"); + } + + if (AVMEDIA_TYPE_VIDEO == ost->codecpar->codec_type && + AV_HWDEVICE_TYPE_CUDA == octx->hw_type && !frame) { + avcodec_flush_buffers(encoder); + } while (1) { av_init_packet(&pkt); @@ -768,7 +998,7 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext char errstr[AV_ERROR_MAX_STRING_SIZE] = {0}; \ if (!ret) { fprintf(stderr, "u done messed up\n"); ret = AVERROR(ENOMEM); } \ if (ret < -1) av_strerror(ret, errstr, sizeof errstr); \ - fprintf(stderr, "%s: %s", msg, errstr); \ + fprintf(stderr, "%s: %s\n", msg, errstr); \ goto proc_cleanup; \ } int ret = 0; @@ -792,7 +1022,7 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext } if (inf) { ret = av_buffersrc_write_frame(filter->src_ctx, inf); - if (ret < 0) proc_err("Error feeding the filtergraph\n"); + if (ret < 0) proc_err("Error feeding the filtergraph"); } else { // We need to set the pts at EOF to the *end* of the last packet // in order to avoid discarding any queued packets @@ -815,6 +1045,10 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext } else if (ret < 0) proc_err("Error consuming the filtergraph\n"); ret = encode(encoder, frame, octx, ost); av_frame_unref(frame); + // For HW we keep the encoder open so will only get EAGAIN. 
+ // Return EOF in place of EAGAIN for to terminate the flush + if (frame == NULL && AV_HWDEVICE_TYPE_NONE != octx->hw_type && + AVERROR(EAGAIN) == ret && !inf) return AVERROR_EOF; if (frame == NULL) return ret; } @@ -823,67 +1057,107 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext #undef proc_err } -#define MAX_OUTPUT_SIZE 10 +int flush_outputs(struct input_ctx *ictx, struct output_ctx *octx) +{ + // only issue w this flushing method is it's not necessarily sequential + // wrt all the outputs; might want to iterate on each output per frame? + int ret = 0; + if (octx->vc) { // flush video + while (!ret || ret == AVERROR(EAGAIN)) { + ret = process_out(ictx, octx, octx->vc, octx->oc->streams[octx->vi], &octx->vf, NULL); + } + } + ret = 0; + if (octx->ac) { // flush audio + while (!ret || ret == AVERROR(EAGAIN)) { + ret = process_out(ictx, octx, octx->ac, octx->oc->streams[octx->ai], &octx->af, NULL); + } + } + av_interleaved_write_frame(octx->oc, NULL); // flush muxer + return av_write_trailer(octx->oc); +} -int lpms_transcode(input_params *inp, output_params *params, - output_results *results, int nb_outputs, output_results *decoded_results) + +int transcode(struct transcode_thread *h, + input_params *inp, output_params *params, + output_results *results, output_results *decoded_results) { #define main_err(msg) { \ + char errstr[AV_ERROR_MAX_STRING_SIZE] = {0}; \ if (!ret) ret = AVERROR(EINVAL); \ - fprintf(stderr, msg); \ + if (ret < -1) av_strerror(ret, errstr, sizeof errstr); \ + fprintf(stderr, "%s: %s\n", msg, errstr); \ goto transcode_cleanup; \ } int ret = 0, i = 0; - int decode_a = 0, decode_v = 0; - struct input_ctx ictx; + struct input_ctx *ictx = &h->ictx; + struct output_ctx *outputs = h->outputs; + int nb_outputs = h->nb_outputs; AVPacket ipkt; - struct output_ctx outputs[MAX_OUTPUT_SIZE]; AVFrame *dframe = NULL; - memset(&ictx, 0, sizeof ictx); - memset(outputs, 0, sizeof outputs); - if (!inp) main_err("transcoder: Missing input params\n") - if (nb_outputs > MAX_OUTPUT_SIZE) main_err("transcoder: Too many outputs\n"); - // Check to see if we can skip decoding - for (i = 0; i < nb_outputs; i++) { - if (!needs_decoder(params[i].video.name)) ictx.dv = ++decode_v == nb_outputs; - if (!needs_decoder(params[i].audio.name)) ictx.da = ++decode_a == nb_outputs; + if (!ictx->ic->pb) { + ret = avio_open(&ictx->ic->pb, inp->fname, AVIO_FLAG_READ); + if (ret < 0) main_err("Unable to reopen file"); + // XXX check to see if we can also reuse decoder for sw decoding + if (AV_HWDEVICE_TYPE_CUDA != ictx->hw_type) { + ret = open_video_decoder(inp, ictx); + if (ret < 0) main_err("Unable to reopen video decoder"); + } + ret = open_audio_decoder(inp, ictx); + if (ret < 0) main_err("Unable to reopen audio decoder") } - // populate input context - ret = open_input(inp, &ictx); - if (ret < 0) main_err("transcoder: Unable to open input\n"); - // populate output contexts - for (i = 0; i < nb_outputs; i++) { - struct output_ctx *octx = &outputs[i]; - octx->fname = params[i].fname; - octx->width = params[i].w; - octx->height = params[i].h; - octx->muxer = ¶ms[i].muxer; - octx->audio = ¶ms[i].audio; - octx->video = ¶ms[i].video; - octx->vfilters = params[i].vfilters; - if (params[i].bitrate) octx->bitrate = params[i].bitrate; - if (params[i].fps.den) octx->fps = params[i].fps; - octx->dv = ictx.vi < 0 || is_drop(octx->video->name); - octx->da = ictx.ai < 0 || is_drop(octx->audio->name); - octx->res = &results[i]; - if (ictx.vc) { - ret = init_video_filters(&ictx, octx); - 
if (ret < 0) main_err("Unable to open video filter"); - } - if (ictx.ac) { - char filter_str[256]; - //snprintf(filter_str, sizeof filter_str, "aformat=sample_fmts=s16:channel_layouts=stereo:sample_rates=44100,asetnsamples=n=1152,aresample"); - snprintf(filter_str, sizeof filter_str, "aformat=sample_fmts=fltp:channel_layouts=stereo:sample_rates=44100"); // set sample format and rate based on encoder support - ret = init_audio_filters(&ictx, octx, filter_str); - if (ret < 0) main_err("Unable to open audio filter"); - } - ret = open_output(octx, &ictx); - if (ret < 0) main_err("transcoder: Unable to open output\n"); + for (i = 0; i < nb_outputs; i++) { + struct output_ctx *octx = &outputs[i]; + octx->fname = params[i].fname; + octx->width = params[i].w; + octx->height = params[i].h; + octx->muxer = ¶ms[i].muxer; + octx->audio = ¶ms[i].audio; + octx->video = ¶ms[i].video; + octx->vfilters = params[i].vfilters; + if (params[i].bitrate) octx->bitrate = params[i].bitrate; + if (params[i].fps.den) octx->fps = params[i].fps; + octx->dv = ictx->vi < 0 || is_drop(octx->video->name); + octx->da = ictx->ai < 0 || is_drop(octx->audio->name); + octx->res = &results[i]; + + // XXX valgrind this line up + if (!h->initialized || AV_HWDEVICE_TYPE_NONE == octx->hw_type) { + ret = open_output(octx, ictx); + if (ret < 0) main_err("transcoder: Unable to open output"); + continue; + } + + // reopen output for HW encoding + + AVOutputFormat *fmt = av_guess_format(octx->muxer->name, octx->fname, NULL); + if (!fmt) main_err("Unable to guess format for reopen\n"); + ret = avformat_alloc_output_context2(&octx->oc, fmt, NULL, octx->fname); + if (ret < 0) main_err("Unable to alloc reopened out context\n"); + + // re-attach video encoder + if (octx->vc) { + ret = add_video_stream(octx, ictx); + if (ret < 0) main_err("Unable to re-add video stream\n"); + ret = init_video_filters(ictx, octx); + if (ret < 0) main_err("Unable to re-open video filter\n") + } else fprintf(stderr, "no video stream\n"); + + // re-attach audio encoder + ret = open_audio_output(ictx, octx, fmt); + if (ret < 0) main_err("Unable to re-add audio stream\n"); + + if (!(fmt->flags & AVFMT_NOFILE)) { + ret = avio_open(&octx->oc->pb, octx->fname, AVIO_FLAG_WRITE); + if (ret < 0) main_err("Error re-opening output file\n"); + } + ret = avformat_write_header(octx->oc, NULL); + if (ret < 0) main_err("Error re-writing header\n"); } av_init_packet(&ipkt); @@ -894,21 +1168,32 @@ int lpms_transcode(input_params *inp, output_params *params, int has_frame = 0; AVStream *ist = NULL; av_frame_unref(dframe); - ret = process_in(&ictx, dframe, &ipkt); + ret = process_in(ictx, dframe, &ipkt); if (ret == AVERROR_EOF) break; // Bail out on streams that appear to be broken else if (lpms_ERR_PACKET_ONLY == ret) ; // keep going for stream copy else if (ret < 0) main_err("transcoder: Could not decode; stopping\n"); - ist = ictx.ic->streams[ipkt.stream_index]; + ist = ictx->ic->streams[ipkt.stream_index]; has_frame = lpms_ERR_PACKET_ONLY != ret; if (AVMEDIA_TYPE_VIDEO == ist->codecpar->codec_type) { + if (is_flush_frame(dframe)) goto whileloop_end; // width / height will be zero for pure streamcopy (no decoding) decoded_results->frames += dframe->width && dframe->height; decoded_results->pixels += dframe->width * dframe->height; - if (has_frame) ictx.next_pts_v = dframe->pts + dframe->pkt_duration; + if (has_frame) { + int64_t dur = 0; + if (dframe->pkt_duration) dur = dframe->pkt_duration; + else if (ist->avg_frame_rate.den) { + dur = av_rescale_q(1, 
av_inv_q(ist->avg_frame_rate), ist->time_base); + } else { + // TODO use better heuristics for this; look at how ffmpeg does it + //fprintf(stderr, "Could not determine next pts; filter might drop\n"); + } + ictx->next_pts_v = dframe->pts + dur; + } } else if (AVMEDIA_TYPE_AUDIO == ist->codecpar->codec_type) { - if (has_frame) ictx.next_pts_a = dframe->pts + dframe->pkt_duration; + if (has_frame) ictx->next_pts_a = dframe->pts + dframe->pkt_duration; } for (i = 0; i < nb_outputs; i++) { @@ -918,17 +1203,17 @@ int lpms_transcode(input_params *inp, output_params *params, AVCodecContext *encoder = NULL; ret = 0; // reset to avoid any carry-through - if (ist->index == ictx.vi) { + if (ist->index == ictx->vi) { if (octx->dv) continue; // drop video stream for this output ost = octx->oc->streams[0]; - if (ictx.vc) { + if (ictx->vc) { encoder = octx->vc; filter = &octx->vf; } - } else if (ist->index == ictx.ai) { + } else if (ist->index == ictx->ai) { if (octx->da) continue; // drop audio stream for this output ost = octx->oc->streams[!octx->dv]; // depends on whether video exists - if (ictx.ac) { + if (ictx->ac) { encoder = octx->ac; filter = &octx->af; } @@ -947,42 +1232,90 @@ int lpms_transcode(input_params *inp, output_params *params, ret = mux(pkt, ist->time_base, octx, ost); av_packet_free(&pkt); } else if (has_frame) { - ret = process_out(&ictx, octx, encoder, ost, filter, dframe); + ret = process_out(ictx, octx, encoder, ost, filter, dframe); } if (AVERROR(EAGAIN) == ret || AVERROR_EOF == ret) continue; else if (ret < 0) main_err("transcoder: Error encoding\n"); } - whileloop_end: av_packet_unref(&ipkt); } // flush outputs for (i = 0; i < nb_outputs; i++) { - struct output_ctx *octx = &outputs[i]; - // only issue w this flushing method is it's not necessarily sequential - // wrt all the outputs; might want to iterate on each output per frame? - ret = 0; - if (octx->vc) { // flush video - while (!ret || ret == AVERROR(EAGAIN)) { - ret = process_out(&ictx, octx, octx->vc, octx->oc->streams[octx->vi], &octx->vf, NULL); - } - } - ret = 0; - if (octx->ac) { // flush audio - while (!ret || ret == AVERROR(EAGAIN)) { - ret = process_out(&ictx, octx, octx->ac, octx->oc->streams[octx->ai], &octx->af, NULL); - } - } - av_interleaved_write_frame(octx->oc, NULL); // flush muxer - ret = av_write_trailer(octx->oc); - if (ret < 0) main_err("transcoder: Unable to write trailer"); + ret = flush_outputs(ictx, &outputs[i]); + if (ret < 0) main_err("transcoder: Unable to fully flush outputs") } transcode_cleanup: - free_input(&ictx); - for (i = 0; i < MAX_OUTPUT_SIZE; i++) free_output(&outputs[i]); + avio_closep(&ictx->ic->pb); if (dframe) av_frame_free(&dframe); + ictx->flushed = 0; + if (ictx->first_pkt) av_packet_free(&ictx->first_pkt); + if (ictx->ac) avcodec_free_context(&ictx->ac); + if (ictx->vc && AV_HWDEVICE_TYPE_NONE == ictx->hw_type) avcodec_free_context(&ictx->vc); + for (i = 0; i < nb_outputs; i++) free_output(&outputs[i]); return ret == AVERROR_EOF ? 
0 : ret; #undef main_err } + +int lpms_transcode(input_params *inp, output_params *params, + output_results *results, int nb_outputs, output_results *decoded_results) +{ + int ret = 0; + struct transcode_thread *h = inp->handle; + + if (!h->initialized) { + int i = 0; + int decode_a = 0, decode_v = 0; + if (nb_outputs > MAX_OUTPUT_SIZE) { + return lpms_ERR_OUTPUTS; + } + + // Check to see if we can skip decoding + for (i = 0; i < nb_outputs; i++) { + if (!needs_decoder(params[i].video.name)) h->ictx.dv = ++decode_v == nb_outputs; + if (!needs_decoder(params[i].audio.name)) h->ictx.da = ++decode_a == nb_outputs; + } + + h->nb_outputs = nb_outputs; + + // populate input context + ret = open_input(inp, &h->ictx); + if (ret < 0) { + return ret; + } + } + + if (h->nb_outputs != nb_outputs) { + return lpms_ERR_OUTPUTS; // Not the most accurate error... + } + + ret = transcode(h, inp, params, results, decoded_results); + h->initialized = 1; + + return ret; +} + +struct transcode_thread* lpms_transcode_new() { + struct transcode_thread *h = malloc(sizeof (struct transcode_thread)); + if (!h) return NULL; + memset(h, 0, sizeof *h); + return h; +} + +void lpms_transcode_stop(struct transcode_thread *handle) { + // not threadsafe as-is; calling function must ensure exclusivity! + + int i; + + if (!handle) return; + + free_input(&handle->ictx); + for (i = 0; i < MAX_OUTPUT_SIZE; i++) { + free_output(&handle->outputs[i]); + if (handle->outputs[i].vc) avcodec_free_context(&handle->outputs[i].vc); + } + + free(handle); +} diff --git a/ffmpeg/lpms_ffmpeg.h b/ffmpeg/lpms_ffmpeg.h index cc19bd3f7c..350eaeefee 100644 --- a/ffmpeg/lpms_ffmpeg.h +++ b/ffmpeg/lpms_ffmpeg.h @@ -7,6 +7,9 @@ // LPMS specific errors extern const int lpms_ERR_INPUT_PIXFMT; extern const int lpms_ERR_FILTERS; +extern const int lpms_ERR_OUTPUTS; + +struct transcode_thread; typedef struct { char *name; @@ -28,6 +31,12 @@ typedef struct { typedef struct { char *fname; + // Handle to a transcode thread. + // If null, a new transcode thread is allocated. + // The transcode thread is returned within `output_results`. + // Must be freed with lpms_transcode_stop. + struct transcode_thread *handle; + // Optional hardware acceleration enum AVHWDeviceType hw_type; char *device; @@ -41,5 +50,7 @@ typedef struct { void lpms_init(); int lpms_rtmp2hls(char *listen, char *outf, char *ts_tmpl, char *seg_time, char *seg_start); int lpms_transcode(input_params *inp, output_params *params, output_results *results, int nb_outputs, output_results *decoded_results); +struct transcode_thread* lpms_transcode_new(); +void lpms_transcode_stop(struct transcode_thread* handle); #endif // _LPMS_FFMPEG_H_ diff --git a/ffmpeg/nvidia_test.go b/ffmpeg/nvidia_test.go index 9e920b6daa..7e8181f330 100644 --- a/ffmpeg/nvidia_test.go +++ b/ffmpeg/nvidia_test.go @@ -466,3 +466,300 @@ func TestNvidia_DrainFilters(t *testing.T) { run(cmd) } + +func TestNvidia_CountFrames(t *testing.T) { + run, dir := setupTest(t) + defer os.RemoveAll(dir) + + cmd := ` + set -eux + cd "$0" + + # run segmenter and sanity check frame counts . Hardcode for now. 
+ ffmpeg -loglevel warning -i "$1"/../transcoder/test.ts -c:a copy -c:v copy -f hls test.m3u8 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test0.ts | grep nb_read_frames=120 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test1.ts | grep nb_read_frames=120 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test2.ts | grep nb_read_frames=120 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test3.ts | grep nb_read_frames=120 + ` + run(cmd) + + tc := NewTranscoder() + + // Test decoding + for i := 0; i < 4; i++ { + in := &TranscodeOptionsIn{ + Fname: fmt.Sprintf("%s/test%d.ts", dir, i), + Accel: Nvidia, + Device: "3", + } + res, err := tc.Transcode(in, nil) + if err != nil { + t.Error(err) + } + if res.Decoded.Frames != 120 { + t.Error(in.Fname, " Mismatched frame count: expected 120 got ", res.Decoded.Frames) + } + } + tc.StopTranscoder() +} + +func TestNvidia_CountEncodedFrames(t *testing.T) { + run, dir := setupTest(t) + defer os.RemoveAll(dir) + + cmd := ` + # run segmenter and sanity check frame counts . Hardcode for now. + ffmpeg -loglevel warning -i "$1"/../transcoder/test.ts -c:a copy -c:v copy -f hls test.m3u8 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test0.ts | grep nb_read_frames=120 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test1.ts | grep nb_read_frames=120 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test2.ts | grep nb_read_frames=120 + ffprobe -loglevel warning -select_streams v -count_frames -show_streams test3.ts | grep nb_read_frames=120 + ` + run(cmd) + + tc := NewTranscoder() + + // Test decoding + for i := 0; i < 4; i++ { + in := &TranscodeOptionsIn{ + Fname: fmt.Sprintf("%s/test%d.ts", dir, i), + Accel: Nvidia, + Device: "3", + } + p60fps := P144p30fps16x9 + p60fps.Framerate = 60 + p120fps := P144p30fps16x9 + p120fps.Framerate = 120 + out := []TranscodeOptions{TranscodeOptions{ + Oname: fmt.Sprintf("%s/out_30fps_%d.ts", dir, i), + Profile: P144p30fps16x9, + Accel: Nvidia, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/out_60fps_%d.ts", dir, i), + Profile: p60fps, + Accel: Nvidia, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/out_120fps_%d.ts", dir, i), + Profile: p120fps, + Accel: Nvidia, + }} + + res, err := tc.Transcode(in, out) + if err != nil { + t.Error(err) + } + if res.Encoded[0].Frames != 60 { + t.Error(in.Fname, " Mismatched frame count: expected 60 got ", res.Encoded[0].Frames) + } + if res.Encoded[1].Frames != 120 { + t.Error(in.Fname, " Mismatched frame count: expected 120 got ", res.Encoded[1].Frames) + } + if res.Encoded[2].Frames != 240 { + t.Error(in.Fname, " Mismatched frame count: expected 240 got ", res.Encoded[2].Frames) + } + } + tc.StopTranscoder() +} + +func TestNvidia_RepeatedSpecialOpts(t *testing.T) { + + _, dir := setupTest(t) + + err := RTMPToHLS("../transcoder/test.ts", dir+"/out.m3u8", dir+"/out_%d.ts", "2", 0) + if err != nil { + t.Error(err) + } + + // At some point we forgot to set the muxer type in reopened outputs + // This used to cause an error, so just check that it's resolved + in := &TranscodeOptionsIn{Accel: Nvidia} + out := []TranscodeOptions{TranscodeOptions{ + Oname: "-", + Profile: P144p30fps16x9, + VideoEncoder: ComponentOptions{Opts: map[string]string{"zerolatency": "1"}}, + Muxer: ComponentOptions{Name: "null"}, + Accel: Nvidia}} + tc := NewTranscoder() + for i := 0; i < 4; i++ { + in.Fname = fmt.Sprintf("%s/out_%d.ts", dir, i) 
+ _, err := tc.Transcode(in, out) + if err != nil { + t.Error(err) + } + } + tc.StopTranscoder() + + // ALso test when a repeated option fails ?? Special behaviour for this? +} + +func TestNvidia_API_MixedOutput(t *testing.T) { + run, dir := setupTest(t) + err := RTMPToHLS("../transcoder/test.ts", dir+"/out.m3u8", dir+"/out_%d.ts", "2", 0) + if err != nil { + t.Error(err) + } + + profile := P144p30fps16x9 + profile.Framerate = 123 + tc := NewTranscoder() + for i := 0; i < 4; i++ { + in := &TranscodeOptionsIn{Fname: fmt.Sprintf("%s/out_%d.ts", dir, i)} + out := []TranscodeOptions{TranscodeOptions{ + Oname: fmt.Sprintf("%s/%d.md5", dir, i), + AudioEncoder: ComponentOptions{Name: "drop"}, + VideoEncoder: ComponentOptions{Name: "copy"}, + Muxer: ComponentOptions{Name: "md5"}, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/nv_%d.ts", dir, i), + Profile: profile, + AudioEncoder: ComponentOptions{Name: "copy"}, + Accel: Nvidia, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/nv_audio_encode_%d.ts", dir, i), + Profile: profile, + Accel: Nvidia, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/sw_%d.ts", dir, i), + Profile: profile, + }} + res, err := tc.Transcode(in, out) + if err != nil { + t.Error(err) + } + if res.Decoded.Frames != 120 { + t.Error("Did not get decoded frames", res.Decoded.Frames) + } + if res.Encoded[1].Frames != res.Encoded[2].Frames { + t.Error("Mismatched frame count for hw/nv") + } + } + cmd := ` + function check { + + # Check md5sum for stream copy / drop + ffmpeg -loglevel warning -i out_$1.ts -an -c:v copy -f md5 ffmpeg_$1.md5 + diff -u $1.md5 ffmpeg_$1.md5 + + ffmpeg -loglevel warning -i out_$1.ts -c:a aac -ar 44100 -ac 2 \ + -vf hwupload_cuda,fps=123,scale_cuda=w=256:h=144 -c:v h264_nvenc \ + ffmpeg_nv_$1.ts + + # sanity check ffmpeg frame count against ours + ffprobe -count_frames -show_streams -select_streams v ffmpeg_nv_$1.ts | grep nb_read_frames=246 + ffprobe -count_frames -show_streams -select_streams v nv_$1.ts | grep nb_read_frames=246 + ffprobe -count_frames -show_streams -select_streams v sw_$1.ts | grep nb_read_frames=246 + ffprobe -count_frames -show_streams -select_streams v nv_audio_encode_$1.ts | grep nb_read_frames=246 + + # check image quality + ffmpeg -loglevel warning -i nv_$1.ts -i ffmpeg_nv_$1.ts \ + -lavfi "[0:v][1:v]ssim=nv_stats_$1.log" -f null - + grep -Po 'All:\K\d+.\d+' nv_stats_$1.log | \ + awk '{ if ($1 < 0.95) count=count+1 } END{ exit count > 5 }' + + ffmpeg -loglevel warning -i sw_$1.ts -i ffmpeg_nv_$1.ts \ + -lavfi "[0:v][1:v]ssim=sw_stats_$1.log" -f null - + grep -Po 'All:\K\d+.\d+' sw_stats_$1.log | \ + awk '{ if ($1 < 0.95) count=count+1 } END{ exit count > 5 }' + + # Really should check relevant audio as well... + + } + + + check 0 + check 1 + check 2 + check 3 + ` + run(cmd) + tc.StopTranscoder() +} + +func TestNvidia_API_AlternatingTimestamps(t *testing.T) { + // Really should refactor this test to increase commonality with other + // tests that also check things like SSIM, MD5 hashes, etc... 
+ // See TestNvidia_API_MixedOutput / TestTranscoder_EncoderOpts / TestTranscoder_StreamCopy + run, dir := setupTest(t) + err := RTMPToHLS("../transcoder/test.ts", dir+"/out.m3u8", dir+"/out_%d.ts", "2", 0) + if err != nil { + t.Error(err) + } + + profile := P144p30fps16x9 + profile.Framerate = 123 + tc := NewTranscoder() + idx := []int{1, 0, 3, 2} + for _, i := range idx { + in := &TranscodeOptionsIn{Fname: fmt.Sprintf("%s/out_%d.ts", dir, i)} + out := []TranscodeOptions{TranscodeOptions{ + Oname: fmt.Sprintf("%s/%d.md5", dir, i), + AudioEncoder: ComponentOptions{Name: "drop"}, + VideoEncoder: ComponentOptions{Name: "copy"}, + Muxer: ComponentOptions{Name: "md5"}, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/nv_%d.ts", dir, i), + Profile: profile, + AudioEncoder: ComponentOptions{Name: "copy"}, + Accel: Nvidia, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/nv_audio_encode_%d.ts", dir, i), + Profile: profile, + Accel: Nvidia, + }, TranscodeOptions{ + Oname: fmt.Sprintf("%s/sw_%d.ts", dir, i), + Profile: profile, + }} + res, err := tc.Transcode(in, out) + if err != nil { + t.Error(err) + } + if res.Decoded.Frames != 120 { + t.Error("Did not get decoded frames", res.Decoded.Frames) + } + if res.Encoded[1].Frames != res.Encoded[2].Frames { + t.Error("Mismatched frame count for hw/nv") + } + } + cmd := ` + function check { + + # Check md5sum for stream copy / drop + ffmpeg -loglevel warning -i out_$1.ts -an -c:v copy -f md5 ffmpeg_$1.md5 + diff -u $1.md5 ffmpeg_$1.md5 + + ffmpeg -loglevel warning -i out_$1.ts -c:a aac -ar 44100 -ac 2 \ + -vf hwupload_cuda,fps=123,scale_cuda=w=256:h=144 -c:v h264_nvenc \ + ffmpeg_nv_$1.ts + + # sanity check ffmpeg frame count against ours + ffprobe -count_frames -show_streams -select_streams v ffmpeg_nv_$1.ts | grep nb_read_frames=246 + ffprobe -count_frames -show_streams -select_streams v nv_$1.ts | grep nb_read_frames=246 + ffprobe -count_frames -show_streams -select_streams v sw_$1.ts | grep nb_read_frames=246 + ffprobe -count_frames -show_streams -select_streams v nv_audio_encode_$1.ts | grep nb_read_frames=246 + + # check image quality + ffmpeg -loglevel warning -i nv_$1.ts -i ffmpeg_nv_$1.ts \ + -lavfi "[0:v][1:v]ssim=nv_stats_$1.log" -f null - + grep -Po 'All:\K\d+.\d+' nv_stats_$1.log | \ + awk '{ if ($1 < 0.95) count=count+1 } END{ exit count > 5 }' + + ffmpeg -loglevel warning -i sw_$1.ts -i ffmpeg_nv_$1.ts \ + -lavfi "[0:v][1:v]ssim=sw_stats_$1.log" -f null - + grep -Po 'All:\K\d+.\d+' sw_stats_$1.log | \ + awk '{ if ($1 < 0.95) count=count+1 } END{ exit count > 5 }' + + # Really should check relevant audio as well... + } + + + check 0 + check 1 + check 2 + check 3 + ` + run(cmd) + tc.StopTranscoder() +} + +// XXX test bframes or delayed frames diff --git a/transcoder/ffmpeg_segment_transcoder_test.go b/transcoder/ffmpeg_segment_transcoder_test.go index ff918bc09b..80dccfca61 100644 --- a/transcoder/ffmpeg_segment_transcoder_test.go +++ b/transcoder/ffmpeg_segment_transcoder_test.go @@ -71,7 +71,7 @@ func TestInvalidProfiles(t *testing.T) { _, err := tr.Transcode("test.ts") if err == nil { t.Errorf("Expected an error transcoding too many segments") - } else if err.Error() != "Invalid argument" { + } else if err.Error() != "Too many outputs" { t.Errorf("Did not get the expected error while transcoding: %v", err) }
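The tests above exercise the reuse API that ffmpeg.go now exposes: NewTranscoder allocates a transcode_thread handle, Transcode can be called once per segment against that same handle so the decode and encode state (including the GPU session under Nvidia acceleration) is kept across segments, and StopTranscoder releases it. A condensed sketch of that calling pattern, assuming the usual lpms import path, hypothetical segment and rendition filenames, and GPU device "0":

package main

import (
	"fmt"
	"log"

	"github.com/livepeer/lpms/ffmpeg" // assumed import path
)

func main() {
	ffmpeg.InitFFmpeg()

	// One session per stream; the handle (and, with Nvidia acceleration,
	// the decoder/encoder state behind it) is reused for every segment.
	tc := ffmpeg.NewTranscoder()
	defer tc.StopTranscoder()

	for i := 0; i < 4; i++ {
		in := &ffmpeg.TranscodeOptionsIn{
			Fname:  fmt.Sprintf("seg_%d.ts", i), // hypothetical segment names
			Accel:  ffmpeg.Nvidia,
			Device: "0", // hypothetical GPU index
		}
		out := []ffmpeg.TranscodeOptions{{
			Oname:   fmt.Sprintf("out_%d.ts", i), // hypothetical rendition names
			Profile: ffmpeg.P144p30fps16x9,
			Accel:   ffmpeg.Nvidia,
		}}
		res, err := tc.Transcode(in, out)
		if err != nil {
			log.Fatal(err)
		}
		log.Printf("segment %d: decoded %d frames, encoded %d frames",
			i, res.Decoded.Frames, res.Encoded[0].Frames)
	}
}

Once StopTranscoder has run, any further Transcode call returns ErrTranscoderStp instead of touching the freed handle, which is what TestTranscoderAPI_Stopped checks.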
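The ffmpeg_segment_transcoder_test.go change above follows from the new lpms_ERR_OUTPUTS mapping: asking for more than MAX_OUTPUT_SIZE (10) outputs now fails with "Too many outputs", and because a reused handle keeps its output set, passing a different number of outputs to a later Transcode call on the same handle fails with the same error. A rough sketch of both cases, assuming the same import path as above, a working directory where ../transcoder/test.ts resolves, and a made-up makeOuts helper that mirrors api_test.go:

package main

import (
	"fmt"
	"log"

	"github.com/livepeer/lpms/ffmpeg" // assumed import path
)

// makeOuts builds n outputs that drop video and copy audio into a null
// muxer, the same shape api_test.go uses.
func makeOuts(n int) []ffmpeg.TranscodeOptions {
	outs := make([]ffmpeg.TranscodeOptions, n)
	for i := range outs {
		outs[i] = ffmpeg.TranscodeOptions{
			Oname:        "-",
			AudioEncoder: ffmpeg.ComponentOptions{Name: "copy"},
			VideoEncoder: ffmpeg.ComponentOptions{Name: "drop"},
			Muxer:        ffmpeg.ComponentOptions{Name: "null"},
		}
	}
	return outs
}

func main() {
	tc := ffmpeg.NewTranscoder()
	defer tc.StopTranscoder()
	in := &ffmpeg.TranscodeOptionsIn{Fname: "../transcoder/test.ts"}

	// More than MAX_OUTPUT_SIZE (10) outputs is rejected up front.
	if _, err := tc.Transcode(in, makeOuts(11)); err != nil {
		fmt.Println(err) // "Too many outputs"
	}

	// The first successful call fixes the session's output count ...
	if _, err := tc.Transcode(in, makeOuts(2)); err != nil {
		log.Fatal(err)
	}
	// ... so a different count on the same handle is rejected as well.
	if _, err := tc.Transcode(in, makeOuts(1)); err != nil {
		fmt.Println(err) // also "Too many outputs"
	}
}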