fftools/ffmpeg: rework audio-decode timestamp handling

Stop using InputStream.dts for generating missing timestamps for decoded frames, because it contains pre-decoding timestamps and there may be arbitrary amount of delay between input packets and output frames (e.g. dependent on the thread count when frame threading is used). It is also in AV_TIME_BASE (i.e. microseconds), which may introduce unnecessary rounding issues. New code maintains a timebase that is the inverse of the LCM of all the samplerates seen so far, and thus can accurately represent every audio sample. This timebase is used to generate missing timestamps after decoding. Changes the result of the following FATE tests * pcm_dvd-16-5.1-96000 * lavf-smjpeg * adpcm-ima-smjpeg In all of these the timestamps now better correspond to actual frame durations.
author: Anton Khirnov <anton@khirnov.net> 2023-04-24 12:28:13 +0200
committer: Anton Khirnov <anton@khirnov.net> 2023-05-02 10:59:24 +0200
commit: d85c6aba0cf27db2a6c4dfa3452cfb9c248d1b4a (patch)
tree: 2cc717199739690512fc066f32b6ad74ce71324f /fftools
parent: 6bbea932ca9a0f124b713bef361a9e4ef19d2583 (diff)
download: ffmpeg-d85c6aba0cf27db2a6c4dfa3452cfb9c248d1b4a.tar.gz
3 files changed, 89 insertions, 23 deletions
diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
index 8829a163e0..8dcc70e879 100644
--- a/fftools/ffmpeg.c
+++ b/fftools/ffmpeg.c
@@ -881,6 +881,85 @@ static int send_frame_to_filters(InputStream *ist, AVFrame *decoded_frame)
     return ret;
 }
 
+static AVRational audio_samplerate_update(InputStream *ist, const AVFrame *frame)
+{
+    const int prev = ist->last_frame_tb.den;
+    const int sr   = frame->sample_rate;
+
+    AVRational tb_new;
+    int64_t gcd;
+
+    if (frame->sample_rate == ist->last_frame_sample_rate)
+        goto finish;
+
+    gcd  = av_gcd(prev, sr);
+
+    if (prev / gcd >= INT_MAX / sr) {
+        av_log(ist, AV_LOG_WARNING,
+               "Audio timestamps cannot be represented exactly after "
+               "sample rate change: %d -> %d\n", prev, sr);
+
+        // LCM of 192000, 44100, allows to represent all common samplerates
+        tb_new = (AVRational){ 1, 28224000 };
+    } else
+        tb_new = (AVRational){ 1, prev / gcd * sr };
+
+    // keep the frame timebase if it is strictly better than
+    // the samplerate-defined one
+    if (frame->time_base.num == 1 && frame->time_base.den > tb_new.den &&
+        !(frame->time_base.den % tb_new.den))
+        tb_new = frame->time_base;
+
+    if (ist->last_frame_pts != AV_NOPTS_VALUE)
+        ist->last_frame_pts = av_rescale_q(ist->last_frame_pts,
+                                           ist->last_frame_tb, tb_new);
+    ist->last_frame_duration_est = av_rescale_q(ist->last_frame_duration_est,
+                                                ist->last_frame_tb, tb_new);
+
+    ist->last_frame_tb          = tb_new;
+    ist->last_frame_sample_rate = frame->sample_rate;
+
+finish:
+    return ist->last_frame_tb;
+}
+
+static void audio_ts_process(InputStream *ist, AVFrame *frame)
+{
+    AVRational tb_filter = (AVRational){1, frame->sample_rate};
+    AVRational tb;
+    int64_t pts_pred;
+
+    // on samplerate change, choose a new internal timebase for timestamp
+    // generation that can represent timestamps from all the samplerates
+    // seen so far
+    tb = audio_samplerate_update(ist, frame);
+    pts_pred = ist->last_frame_pts == AV_NOPTS_VALUE ? 0 :
+               ist->last_frame_pts + ist->last_frame_duration_est;
+
+    if (frame->pts == AV_NOPTS_VALUE) {
+        frame->pts = pts_pred;
+        frame->time_base = tb;
+    } else if (ist->last_frame_pts != AV_NOPTS_VALUE &&
+               frame->pts > av_rescale_q_rnd(pts_pred, tb, frame->time_base,
+                                             AV_ROUND_UP)) {
+        // there was a gap in timestamps, reset conversion state
+        ist->filter_in_rescale_delta_last = AV_NOPTS_VALUE;
+    }
+
+    frame->pts = av_rescale_delta(frame->time_base, frame->pts,
+                                  tb, frame->nb_samples,
+                                  &ist->filter_in_rescale_delta_last, tb);
+
+    ist->last_frame_pts          = frame->pts;
+    ist->last_frame_duration_est = av_rescale_q(frame->nb_samples,
+                                                tb_filter, tb);
+
+    // finally convert to filtering timebase
+    frame->pts       = av_rescale_q(frame->pts, tb, tb_filter);
+    frame->duration  = frame->nb_samples;
+    frame->time_base = tb_filter;
+}
+
 static int decode_audio(InputStream *ist, AVPacket *pkt, int *got_output,
                         int *decode_failed)
 {
@@ -910,23 +989,7 @@ static int decode_audio(InputStream *ist, AVPacket *pkt, int *got_output,
     ist->next_dts += ((int64_t)AV_TIME_BASE * decoded_frame->nb_samples) /
                      decoded_frame->sample_rate;
 
-    if (decoded_frame->pts == AV_NOPTS_VALUE) {
-        decoded_frame->pts = ist->dts;
-        decoded_frame->time_base = AV_TIME_BASE_Q;
-    }
-    if (pkt && pkt->duration && ist->prev_pkt_pts != AV_NOPTS_VALUE &&
-        pkt->pts != AV_NOPTS_VALUE && pkt->pts - ist->prev_pkt_pts > pkt->duration)
-        ist->filter_in_rescale_delta_last = AV_NOPTS_VALUE;
-    if (pkt)
-        ist->prev_pkt_pts = pkt->pts;
-    if (decoded_frame->pts != AV_NOPTS_VALUE) {
-        AVRational tb_filter = (AVRational){1, decoded_frame->sample_rate};
-        decoded_frame->pts = av_rescale_delta(decoded_frame->time_base, decoded_frame->pts,
-                                              tb_filter, decoded_frame->nb_samples,
-                                              &ist->filter_in_rescale_delta_last,
-                                              tb_filter);
-        decoded_frame->time_base = tb_filter;
-    }
+    audio_ts_process(ist, decoded_frame);
 
     ist->nb_samples = decoded_frame->nb_samples;
     err = send_frame_to_filters(ist, decoded_frame);
@@ -1076,6 +1139,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_
     // update timestamp history
     ist->last_frame_duration_est = video_duration_estimate(ist, decoded_frame);
     ist->last_frame_pts          = decoded_frame->pts;
+    ist->last_frame_tb           = decoded_frame->time_base;
 
     if (debug_ts) {
         av_log(ist, AV_LOG_INFO,
diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
index c07a1b86b6..c4abf89b58 100644
--- a/fftools/ffmpeg.h
+++ b/fftools/ffmpeg.h
@@ -359,7 +359,6 @@ typedef struct InputStream {
 
     AVRational framerate_guessed;
 
-    int64_t       prev_pkt_pts;
     int64_t       start;     /* time when read started */
     /* predicted dts of the next packet read for this stream or (when there are
      * several frames in a packet) of the next frame in current packet (in AV_TIME_BASE units) */
@@ -371,10 +370,13 @@ typedef struct InputStream {
     int64_t       next_pts;
     int64_t       pts;       ///< current pts of the decoded frame  (in AV_TIME_BASE units)
 
-    // pts/estimated duration of the last decoded video frame
-    // in decoder timebase
+    // pts/estimated duration of the last decoded frame
+    // * in decoder timebase for video,
+    // * in last_frame_tb (may change during decoding) for audio
     int64_t last_frame_pts;
     int64_t last_frame_duration_est;
+    AVRational    last_frame_tb;
+    int           last_frame_sample_rate;
 
     int           wrap_correction_done;
 
diff --git a/fftools/ffmpeg_demux.c b/fftools/ffmpeg_demux.c
index 5afb3ff2c8..f8d95d1de6 100644
--- a/fftools/ffmpeg_demux.c
+++ b/fftools/ffmpeg_demux.c
@@ -864,7 +864,9 @@ static void add_input_streams(const OptionsContext *o, Demuxer *d)
         }
 
         ist->filter_in_rescale_delta_last = AV_NOPTS_VALUE;
-        ist->prev_pkt_pts = AV_NOPTS_VALUE;
+
+        ist->last_frame_pts = AV_NOPTS_VALUE;
+        ist->last_frame_tb  = (AVRational){ 1, 1 };
 
         ist->dec_ctx = avcodec_alloc_context3(ist->dec);
         if (!ist->dec_ctx)
@@ -905,8 +907,6 @@ static void add_input_streams(const OptionsContext *o, Demuxer *d)
 
             ist->framerate_guessed = av_guess_frame_rate(ic, st, NULL);
 
-            ist->last_frame_pts = AV_NOPTS_VALUE;
-
             break;
         case AVMEDIA_TYPE_AUDIO: {
             int guess_layout_max = INT_MAX;
author	Anton Khirnov <anton@khirnov.net>	2023-04-24 12:28:13 +0200
committer	Anton Khirnov <anton@khirnov.net>	2023-05-02 10:59:24 +0200
commit	d85c6aba0cf27db2a6c4dfa3452cfb9c248d1b4a (patch)
tree	2cc717199739690512fc066f32b6ad74ce71324f /fftools
parent	6bbea932ca9a0f124b713bef361a9e4ef19d2583 (diff)
download	ffmpeg-d85c6aba0cf27db2a6c4dfa3452cfb9c248d1b4a.tar.gz