author     Seungha Yang <seungha@centricular.com>    2021-06-23 01:43:08 +0900
committer  GStreamer Marge Bot <gitlab-merge-bot@gstreamer-foundation.org>    2021-06-23 14:12:22 +0000
commit     ba26a5aea87f4fd3b913908d8865448d14e5d694 (patch)
tree       a18d5c953f4bda4bceb407f5359ac9c0a53804db /sys/mediafoundation
parent     127ade39cf175c7961a0eb75d689f6526920ab15 (diff)
download   gstreamer-plugins-bad-ba26a5aea87f4fd3b913908d8865448d14e5d694.tar.gz
mfvideoenc: Enhance B-frame timestamp handling
When B-frames are enabled, the encoder seems to shift the PTS of each
encoded sample by one frame duration.
For instance, one timestamp pattern observed with B-frames enabled on a
30fps stream is:
* Frame-1: MF pts 0:00.033333300 MF dts 0:00.000000000
* Frame-2: MF pts 0:00.133333300 MF dts 0:00.033333300
* Frame-3: MF pts 0:00.066666600 MF dts 0:00.066666600
* Frame-4: MF pts 0:00.099999900 MF dts 0:00.100000000
Note that the amount of the PTS shift equals the frame duration, and
Frame-4 even exhibits PTS < DTS.
To compensate for the shifted timestamps, we should calculate the
timestamp offset and re-calculate the DTS accordingly. Otherwise the
whole timeline of the output stream is shifted, which can cause
time-sync issues.
Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-bad/-/merge_requests/2354>
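In terms of numbers, the compensation is plain integer arithmetic in Media
Foundation's 100 ns timescale. Below is a minimal standalone sketch using only
the Frame-1/Frame-4 values quoted above; the variable names are illustrative
and not taken from the patch itself.

/* Sketch of the timestamp compensation described above, using the
 * Frame-1/Frame-4 numbers from the commit message. Not plugin code. */
#include <stdio.h>
#include <stdint.h>

int main (void)
{
  /* Media Foundation timestamps use a 100 ns timescale; at 30 fps the
   * frame duration is 333333 units (0:00.033333300) */
  int64_t frame1_pts = 333333;   /* 0:00.033333300 */
  int64_t frame1_dts = 0;        /* 0:00.000000000 */

  /* Offset = first PTS - first DTS, i.e. one frame duration */
  int64_t pts_offset = frame1_pts - frame1_dts;

  /* Frame-4 from the pattern above, where PTS < DTS before compensation */
  int64_t frame4_pts = 999999;   /* 0:00.099999900 */
  int64_t frame4_dts = 1000000;  /* 0:00.100000000 */

  /* Shift PTS back by the offset; shift DTS by one extra 100 ns tick to
   * absorb the MF (100 ns) vs. GStreamer (1 ns) timescale rounding */
  int64_t gst_pts = (frame4_pts - pts_offset) * 100;        /* -> ns */
  int64_t gst_dts = (frame4_dts - (pts_offset + 1)) * 100;  /* -> ns */

  /* Prints 66666600 ns for both, so PTS >= DTS holds again */
  printf ("pts = %lld ns, dts = %lld ns\n",
      (long long) gst_pts, (long long) gst_dts);
  return 0;
}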
Diffstat (limited to 'sys/mediafoundation')
-rw-r--r--   sys/mediafoundation/gstmfh264enc.cpp  |   4
-rw-r--r--   sys/mediafoundation/gstmfh265enc.cpp  |   4
-rw-r--r--   sys/mediafoundation/gstmfvideoenc.cpp | 160
-rw-r--r--   sys/mediafoundation/gstmfvideoenc.h   |   7
4 files changed, 144 insertions, 31 deletions
diff --git a/sys/mediafoundation/gstmfh264enc.cpp b/sys/mediafoundation/gstmfh264enc.cpp
index a1c4b5f7e..4a4b27df8 100644
--- a/sys/mediafoundation/gstmfh264enc.cpp
+++ b/sys/mediafoundation/gstmfh264enc.cpp
@@ -914,9 +914,13 @@ gst_mf_h264_enc_set_option (GstMFVideoEnc * mfenc, GstVideoCodecState * state,
     WARNING_HR (hr, CODECAPI_AVEncH264PPSID);
   }
 
+  mfenc->has_reorder_frame = FALSE;
   if (device_caps->bframes && selected_profile != eAVEncH264VProfile_Base) {
     hr = gst_mf_transform_set_codec_api_uint32 (transform,
         &CODECAPI_AVEncMPVDefaultBPictureCount, self->bframes);
+    if (SUCCEEDED (hr) && self->bframes > 0)
+      mfenc->has_reorder_frame = TRUE;
+
     WARNING_HR (hr, CODECAPI_AVEncMPVDefaultBPictureCount);
   }
 
diff --git a/sys/mediafoundation/gstmfh265enc.cpp b/sys/mediafoundation/gstmfh265enc.cpp
index 4972cbe16..75c1c533d 100644
--- a/sys/mediafoundation/gstmfh265enc.cpp
+++ b/sys/mediafoundation/gstmfh265enc.cpp
@@ -652,9 +652,13 @@ gst_mf_h265_enc_set_option (GstMFVideoEnc * mfenc, GstVideoCodecState * state,
     WARNING_HR (hr, CODECAPI_AVEncCommonQualityVsSpeed);
   }
 
+  mfenc->has_reorder_frame = FALSE;
   if (device_caps->bframes) {
     hr = gst_mf_transform_set_codec_api_uint32 (transform,
         &CODECAPI_AVEncMPVDefaultBPictureCount, self->bframes);
+    if (SUCCEEDED (hr) && self->bframes > 0)
+      mfenc->has_reorder_frame = TRUE;
+
     WARNING_HR (hr, CODECAPI_AVEncMPVDefaultBPictureCount);
   }
 
diff --git a/sys/mediafoundation/gstmfvideoenc.cpp b/sys/mediafoundation/gstmfvideoenc.cpp
index 5ff22e24c..da710a328 100644
--- a/sys/mediafoundation/gstmfvideoenc.cpp
+++ b/sys/mediafoundation/gstmfvideoenc.cpp
@@ -27,6 +27,7 @@
 #include <wrl.h>
 #include "gstmfvideobuffer.h"
 #include <string.h>
+#include <cmath>
 
 #if GST_MF_HAVE_D3D11
 #include <d3d10.h>
@@ -50,6 +51,7 @@ static void gst_mf_video_enc_set_context (GstElement * element,
     GstContext * context);
 static gboolean gst_mf_video_enc_open (GstVideoEncoder * enc);
 static gboolean gst_mf_video_enc_close (GstVideoEncoder * enc);
+static gboolean gst_mf_video_enc_start (GstVideoEncoder * enc);
 static gboolean gst_mf_video_enc_set_format (GstVideoEncoder * enc,
     GstVideoCodecState * state);
 static GstFlowReturn gst_mf_video_enc_handle_frame (GstVideoEncoder * enc,
@@ -79,6 +81,7 @@ gst_mf_video_enc_class_init (GstMFVideoEncClass * klass)
 
   videoenc_class->open = GST_DEBUG_FUNCPTR (gst_mf_video_enc_open);
   videoenc_class->close = GST_DEBUG_FUNCPTR (gst_mf_video_enc_close);
+  videoenc_class->start = GST_DEBUG_FUNCPTR (gst_mf_video_enc_start);
   videoenc_class->set_format = GST_DEBUG_FUNCPTR (gst_mf_video_enc_set_format);
   videoenc_class->handle_frame =
       GST_DEBUG_FUNCPTR (gst_mf_video_enc_handle_frame);
@@ -268,6 +271,16 @@ gst_mf_video_enc_close (GstVideoEncoder * enc)
 }
 
 static gboolean
+gst_mf_video_enc_start (GstVideoEncoder * enc)
+{
+  /* Media Foundation Transform will shift PTS in case that B-frame is enabled.
+   * We need to adjust DTS correspondingly */
+  gst_video_encoder_set_min_pts (enc, GST_SECOND * 60 * 60 * 1000);
+
+  return TRUE;
+}
+
+static gboolean
 gst_mf_video_enc_set_format (GstVideoEncoder * enc, GstVideoCodecState * state)
 {
   GstMFVideoEnc *self = GST_MF_VIDEO_ENC (enc);
@@ -284,6 +297,10 @@ gst_mf_video_enc_set_format (GstVideoEncoder * enc, GstVideoCodecState * state)
 
   gst_mf_video_enc_finish (enc);
 
+  self->mf_pts_offset = 0;
+  self->has_reorder_frame = FALSE;
+  self->last_ret = GST_FLOW_OK;
+
   if (self->input_state)
     gst_video_codec_state_unref (self->input_state);
   self->input_state = gst_video_codec_state_ref (state);
@@ -620,7 +637,7 @@ gst_mf_video_enc_frame_needs_copy (GstVideoFrame * vframe)
 
 typedef struct
 {
-  GstClockTime mf_pts;
+  LONGLONG mf_pts;
 } GstMFVideoEncFrameData;
 
 static gboolean
@@ -684,47 +701,46 @@ gst_mf_video_enc_process_input (GstMFVideoEnc * self,
 }
 
 static GstVideoCodecFrame *
-gst_mf_video_enc_find_output_frame (GstMFVideoEnc * self, UINT64 mf_dts,
-    UINT64 mf_pts)
+gst_mf_video_enc_find_output_frame (GstMFVideoEnc * self, LONGLONG mf_pts)
 {
   GList *l, *walk = gst_video_encoder_get_frames (GST_VIDEO_ENCODER (self));
   GstVideoCodecFrame *ret = NULL;
+  GstVideoCodecFrame *closest = NULL;
+  LONGLONG min_pts_abs_diff = 0;
 
   for (l = walk; l; l = l->next) {
     GstVideoCodecFrame *frame = (GstVideoCodecFrame *) l->data;
     GstMFVideoEncFrameData *data = (GstMFVideoEncFrameData *)
         gst_video_codec_frame_get_user_data (frame);
+    LONGLONG abs_diff;
 
     if (!data)
       continue;
 
-    if (mf_dts == data->mf_pts) {
+    if (mf_pts == data->mf_pts) {
       ret = frame;
       break;
     }
-  }
-
-  /* find target with pts */
-  if (!ret) {
-    for (l = walk; l; l = l->next) {
-      GstVideoCodecFrame *frame = (GstVideoCodecFrame *) l->data;
-      GstMFVideoEncFrameData *data = (GstMFVideoEncFrameData *)
-          gst_video_codec_frame_get_user_data (frame);
 
-      if (!data)
-        continue;
+    abs_diff = std::abs (mf_pts - data->mf_pts);
 
-      if (mf_pts == data->mf_pts) {
-        ret = frame;
-        break;
-      }
+    if (!closest || abs_diff < min_pts_abs_diff) {
+      closest = frame;
+      min_pts_abs_diff = abs_diff;
     }
   }
 
+  if (!ret && closest)
+    ret = closest;
+
   if (ret) {
     gst_video_codec_frame_ref (ret);
   } else {
-    /* just return the oldest one */
+    /* XXX: Shouldn't happen, but possible if no GstVideoCodecFrame holds
+     * user data for some reasons */
+    GST_WARNING_OBJECT (self,
+        "Failed to find closest GstVideoCodecFrame with MF pts %"
+        G_GINT64_FORMAT, mf_pts);
     ret = gst_video_encoder_get_oldest_frame (GST_VIDEO_ENCODER (self));
   }
 
@@ -745,9 +761,11 @@ gst_mf_video_enc_finish_sample (GstMFVideoEnc * self, IMFSample * sample)
   GstVideoCodecFrame *frame;
   LONGLONG sample_timestamp;
   LONGLONG sample_duration;
+  LONGLONG target_mf_pts;
+  UINT64 mf_dts;
   UINT32 keyframe = FALSE;
-  UINT64 mf_dts = GST_CLOCK_TIME_NONE;
   DWORD buffer_len;
+  GstClockTime pts, dts, duration;
 
   hr = sample->GetBufferByIndex (0, media_buffer.GetAddressOf ());
   if (!gst_mf_result (hr))
@@ -762,6 +780,7 @@ gst_mf_video_enc_finish_sample (GstMFVideoEnc * self, IMFSample * sample)
   media_buffer->Unlock ();
 
   sample->GetSampleTime (&sample_timestamp);
+  target_mf_pts = sample_timestamp;
   sample->GetSampleDuration (&sample_duration);
   sample->GetUINT32 (MFSampleExtension_CleanPoint, &keyframe);
 
@@ -771,29 +790,105 @@ gst_mf_video_enc_finish_sample (GstMFVideoEnc * self, IMFSample * sample)
     hr = S_OK;
   }
 
-  frame = gst_mf_video_enc_find_output_frame (self,
-      mf_dts, (UINT64) sample_timestamp);
+  pts = sample_timestamp * 100;
+  dts = mf_dts * 100;
+  duration = sample_duration * 100;
+
+  GST_LOG_OBJECT (self,
+      "Finish sample, MF pts %" GST_TIME_FORMAT " MF dts %"
+      GST_TIME_FORMAT ", MF duration %" GST_TIME_FORMAT,
+      GST_TIME_ARGS (pts), GST_TIME_ARGS (dts), GST_TIME_ARGS (duration));
+
+  /* NOTE: When B-frame is enabled, MFT shows following pattern
+   * (input timestamp starts from 1000:00:00.000000000, and 30fps)
+   *
+   * Frame-1: MF pts 0:00.033333300 MF dts 0:00.000000000
+   * Frame-2: MF pts 0:00.133333300 MF dts 0:00.033333300
+   * Frame-3: MF pts 0:00.066666600 MF dts 0:00.066666600
+   * Frame-4: MF pts 0:00.099999900 MF dts 0:00.100000000
+   *
+   * - Sounds MFT doesn't support negative timestamp, so PTS of each frame seems
+   *   to be shifthed
+   * - DTS is likely based on timestamp we've set to input sample,
+   *   but some frames has (especially Frame-4 case) unexpected PTS and
+   *   even PTS < DTS. That would be the result of PTS shifting
+   *
+   * To handle this case,
+   * - Calculate timestamp offset "Frame-1 PTS" - "Frame-1 DTS" (== duration),
+   *   and compensate PTS/DTS of each frame
+   * - Needs additional offset for DTS to compenstate GST/MF timescale difference
+   *   (MF uses 100ns timescale). So DTS offset should be "PTS offset + 100ns"
+   * - Find corresponding GstVideoCodecFrame by using compensated PTS.
+   *   Note that MFT doesn't support user-data for tracing input/output sample
+   *   pair. So, timestamp based lookup is the only way to map MF sample
+   *   and our GstVideoCodecFrame
+   */
+  if (self->has_reorder_frame) {
+    /* This would be the first frame */
+    if (self->mf_pts_offset == 0) {
+      LONGLONG mf_pts_offset = -1;
+      if (sample_timestamp > mf_dts) {
+        mf_pts_offset = sample_timestamp - mf_dts;
+        GST_DEBUG_OBJECT (self, "Calculates PTS offset using \"PTS - DTS\": %"
+            G_GINT64_FORMAT, mf_pts_offset);
+      } else if (sample_duration > 0) {
+        mf_pts_offset = sample_duration;
+        GST_DEBUG_OBJECT (self, "Calculates PTS offset using duration: %"
+            G_GINT64_FORMAT, mf_pts_offset);
+      } else {
+        GST_WARNING_OBJECT (self, "Cannot calculate PTS offset");
+      }
+
+      self->mf_pts_offset = mf_pts_offset;
+    }
+
+    if (self->mf_pts_offset > 0) {
+      target_mf_pts -= self->mf_pts_offset;
+
+      pts -= (self->mf_pts_offset * 100);
+      /* +1 to compensate timescale difference */
+      dts -= ((self->mf_pts_offset + 1) * 100);
+    }
+  }
+
+  frame = gst_mf_video_enc_find_output_frame (self, target_mf_pts);
 
   if (frame) {
     if (keyframe) {
       GST_DEBUG_OBJECT (self, "Keyframe pts %" GST_TIME_FORMAT,
           GST_TIME_ARGS (frame->pts));
       GST_VIDEO_CODEC_FRAME_SET_SYNC_POINT (frame);
-      GST_BUFFER_FLAG_UNSET (buffer, GST_BUFFER_FLAG_DELTA_UNIT);
-    } else {
-      GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_DELTA_UNIT);
     }
 
-    frame->pts = sample_timestamp * 100;
-    frame->dts = mf_dts * 100;
-    frame->duration = sample_duration * 100;
     frame->output_buffer = buffer;
 
+    /* Update DTS only if B-frame was enabled, but use input frame pts as-is.
+     * Otherwise we will lost at most 100ns precision */
+    if (self->has_reorder_frame) {
+      frame->dts = dts;
+    } else {
+      frame->dts = frame->pts;
+    }
+
+    /* make sure PTS > DTS */
+    if (GST_CLOCK_TIME_IS_VALID (frame->pts) &&
+        GST_CLOCK_TIME_IS_VALID (frame->dts) &&
+        frame->pts < frame->dts) {
+      GST_WARNING_OBJECT (self, "Calculated DTS %" GST_TIME_FORMAT
+          " is larger than PTS %" GST_TIME_FORMAT, GST_TIME_ARGS (frame->pts),
+          GST_TIME_ARGS (frame->dts));
+
+      /* XXX: just set clock-time-none? */
+      frame->dts = frame->pts;
+    }
+
+    GST_LOG_OBJECT (self, "Frame pts %" GST_TIME_FORMAT ", Frame DTS %"
+        GST_TIME_FORMAT, GST_TIME_ARGS (frame->pts), GST_TIME_ARGS (frame->dts));
+
     res = gst_video_encoder_finish_frame (GST_VIDEO_ENCODER (self), frame);
   } else {
-    GST_BUFFER_DTS (buffer) = mf_dts * 100;
-    GST_BUFFER_PTS (buffer) = sample_timestamp * 100;
-    GST_BUFFER_DURATION (buffer) = sample_duration * 100;
+    GST_BUFFER_PTS (buffer) = pts;
+    GST_BUFFER_DTS (buffer) = dts;
+    GST_BUFFER_DURATION (buffer) = duration;
 
     if (keyframe) {
       GST_DEBUG_OBJECT (self, "Keyframe pts %" GST_TIME_FORMAT,
@@ -803,6 +898,9 @@ gst_mf_video_enc_finish_sample (GstMFVideoEnc * self, IMFSample * sample)
       GST_BUFFER_FLAG_SET (buffer, GST_BUFFER_FLAG_DELTA_UNIT);
     }
 
+    GST_LOG_OBJECT (self, "Buffer pts %" GST_TIME_FORMAT ", Buffer DTS %"
+        GST_TIME_FORMAT, GST_TIME_ARGS (pts), GST_TIME_ARGS (dts));
+
     res = gst_pad_push (GST_VIDEO_ENCODER_SRC_PAD (self), buffer);
   }
 
diff --git a/sys/mediafoundation/gstmfvideoenc.h b/sys/mediafoundation/gstmfvideoenc.h
index 4f30ae22b..1248fb5f2 100644
--- a/sys/mediafoundation/gstmfvideoenc.h
+++ b/sys/mediafoundation/gstmfvideoenc.h
@@ -100,6 +100,13 @@ struct _GstMFVideoEnc
 
   GstVideoCodecState *input_state;
 
+  /* Set by subclass */
+  gboolean has_reorder_frame;
+
+  /* Calculated timestamp offset in MF timescale (100ns scale)
+   * when B-frame is enabled. */
+  LONGLONG mf_pts_offset;
+
 #if GST_MF_HAVE_D3D11
   /* For D3D11 interop. */
   GstD3D11Device *other_d3d11_device;
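As a quick way to observe the effect of this change in a live pipeline, a
buffer probe on the encoder's source pad can flag any remaining output with
PTS < DTS. This is an illustrative sketch only, not part of the commit; the
callback name and attachment point are arbitrary, and it uses only standard
GStreamer probe APIs.

/* Hypothetical sanity check: warn if an encoded buffer still has PTS < DTS. */
#include <gst/gst.h>

static GstPadProbeReturn
check_ts_probe (GstPad * pad, GstPadProbeInfo * info, gpointer user_data)
{
  GstBuffer *buf = GST_PAD_PROBE_INFO_BUFFER (info);

  if (GST_BUFFER_PTS_IS_VALID (buf) && GST_BUFFER_DTS_IS_VALID (buf) &&
      GST_BUFFER_PTS (buf) < GST_BUFFER_DTS (buf)) {
    g_warning ("PTS %" GST_TIME_FORMAT " < DTS %" GST_TIME_FORMAT,
        GST_TIME_ARGS (GST_BUFFER_PTS (buf)),
        GST_TIME_ARGS (GST_BUFFER_DTS (buf)));
  }

  return GST_PAD_PROBE_OK;
}

/* Attach to the encoder's source pad, e.g.:
 *   gst_pad_add_probe (srcpad, GST_PAD_PROBE_TYPE_BUFFER,
 *       check_ts_probe, NULL, NULL);
 */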