libavfilter/vf_normalize.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386

/*
 * Copyright (c) 2017 Richard Ling
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Normalize RGB video (aka histogram stretching, contrast stretching).
 * See: https://en.wikipedia.org/wiki/Normalization_(image_processing)
 *
 * For each channel of each frame, the filter computes the input range and maps
 * it linearly to the user-specified output range. The output range defaults
 * to the full dynamic range from pure black to pure white.
 *
 * Naively maximising the dynamic range of each frame of video in isolation
 * may cause flickering (rapid changes in brightness of static objects in the
 * scene) when small dark or bright objects enter or leave the scene. This
 * filter can apply temporal smoothing to the input range to reduce flickering.
 * Temporal smoothing is similar to the auto-exposure (automatic gain control)
 * on a video camera, which performs the same function; and, like a video
 * camera, it may cause a period of over- or under-exposure of the video.
 *
 * The filter can normalize the R,G,B channels independently, which may cause
 * color shifting, or link them together as a single channel, which prevents
 * color shifting. More precisely, linked normalization preserves hue (as it's
 * defined in HSV/HSL color spaces) while independent normalization does not.
 * Independent normalization can be used to remove color casts, such as the
 * blue cast from underwater video, restoring more natural colors. The filter
 * can also combine independent and linked normalization in any ratio.
 *
 * Finally the overall strength of the filter can be adjusted, from no effect
 * to full normalization.
 *
 * The 5 AVOptions are:
 *   blackpt,   Colors which define the output range. The minimum input value
 *   whitept    is mapped to the blackpt. The maximum input value is mapped to
 *              the whitept. The defaults are black and white respectively.
 *              Specifying white for blackpt and black for whitept will give
 *              color-inverted, normalized video. Shades of grey can be used
 *              to reduce the dynamic range (contrast). Specifying saturated
 *              colors here can create some interesting effects.
 *
 *   smoothing  The amount of temporal smoothing, expressed in frames (>=0).
 *              the minimum and maximum input values of each channel are
 *              smoothed using a rolling average over the current frame and
 *              that many previous frames of video.  Defaults to 0 (no temporal
 *              smoothing).
 *
 *   independence
 *              Controls the ratio of independent (color shifting) channel
 *              normalization to linked (color preserving) normalization. 0.0
 *              is fully linked, 1.0 is fully independent. Defaults to fully
 *              independent.
 *
 *   strength   Overall strength of the filter. 1.0 is full strength. 0.0 is
 *              a rather expensive no-op. Values in between can give a gentle
 *              boost to low-contrast video without creating an artificial
 *              over-processed look. The default is full strength.
 */

#include "libavutil/imgutils.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "avfilter.h"
#include "formats.h"
#include "internal.h"
#include "video.h"

typedef struct NormalizeContext {
    const AVClass *class;

    // Storage for the corresponding AVOptions
    uint8_t blackpt[4];
    uint8_t whitept[4];
    int smoothing;
    float independence;
    float strength;

    int co[4];          // Offsets to R,G,B,A bytes respectively in each pixel
    int num_components; // Number of components in the pixel format
    int history_len;    // Number of frames to average; based on smoothing factor
    int frame_num;      // Increments on each frame, starting from 0.

    // Per-extremum, per-channel history, for temporal smoothing.
    struct {
        uint8_t *history;       // History entries.
        uint32_t history_sum;   // Sum of history entries.
    } min[3], max[3];           // Min and max for each channel in {R,G,B}.
    uint8_t *history_mem;       // Single allocation for above history entries

} NormalizeContext;

#define OFFSET(x) offsetof(NormalizeContext, x)
#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM

static const AVOption normalize_options[] = {
    { "blackpt",  "output color to which darkest input color is mapped",  OFFSET(blackpt), AV_OPT_TYPE_COLOR, { .str = "black" }, CHAR_MIN, CHAR_MAX, FLAGS },
    { "whitept",  "output color to which brightest input color is mapped",  OFFSET(whitept), AV_OPT_TYPE_COLOR, { .str = "white" }, CHAR_MIN, CHAR_MAX, FLAGS },
    { "smoothing",  "amount of temporal smoothing of the input range, to reduce flicker", OFFSET(smoothing), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX/8, FLAGS },
    { "independence", "proportion of independent to linked channel normalization", OFFSET(independence), AV_OPT_TYPE_FLOAT, {.dbl=1.0}, 0.0, 1.0, FLAGS },
    { "strength", "strength of filter, from no effect to full normalization", OFFSET(strength), AV_OPT_TYPE_FLOAT, {.dbl=1.0}, 0.0, 1.0, FLAGS },
    { NULL }
};

AVFILTER_DEFINE_CLASS(normalize);

// This function is the main guts of the filter. Normalizes the input frame
// into the output frame. The frames are known to have the same dimensions
// and pixel format.
static void normalize(NormalizeContext *s, AVFrame *in, AVFrame *out)
{
    // Per-extremum, per-channel local variables.
    struct {
        uint8_t in;     // Original input byte value for this frame.
        float smoothed; // Smoothed input value [0,255].
        float out;      // Output value [0,255].
    } min[3], max[3];   // Min and max for each channel in {R,G,B}.

    float rgb_min_smoothed; // Min input range for linked normalization
    float rgb_max_smoothed; // Max input range for linked normalization
    uint8_t lut[3][256];    // Lookup table
    int x, y, c;

    // First, scan the input frame to find, for each channel, the minimum
    // (min.in) and maximum (max.in) values present in the channel.
    for (c = 0; c < 3; c++)
        min[c].in = max[c].in = in->data[0][s->co[c]];
    for (y = 0; y < in->height; y++) {
        uint8_t *inp = in->data[0] + y * in->linesize[0];
        uint8_t *outp = out->data[0] + y * out->linesize[0];
        for (x = 0; x < in->width; x++) {
            for (c = 0; c < 3; c++) {
                min[c].in = FFMIN(min[c].in, inp[s->co[c]]);
                max[c].in = FFMAX(max[c].in, inp[s->co[c]]);
            }
            inp += s->num_components;
            outp += s->num_components;
        }
    }

    // Next, for each channel, push min.in and max.in into their respective
    // histories, to determine the min.smoothed and max.smoothed for this frame.
    {
        int history_idx = s->frame_num % s->history_len;
        // Assume the history is not yet full; num_history_vals is the number
        // of frames received so far including the current frame.
        int num_history_vals = s->frame_num + 1;
        if (s->frame_num >= s->history_len) {
            //The history is full; drop oldest value and cap num_history_vals.
            for (c = 0; c < 3; c++) {
                s->min[c].history_sum -= s->min[c].history[history_idx];
                s->max[c].history_sum -= s->max[c].history[history_idx];
            }
            num_history_vals = s->history_len;
        }
        // For each extremum, update history_sum and calculate smoothed value
        // as the rolling average of the history entries.
        for (c = 0; c < 3; c++) {
            s->min[c].history_sum += (s->min[c].history[history_idx] = min[c].in);
            min[c].smoothed = s->min[c].history_sum / (float)num_history_vals;
            s->max[c].history_sum += (s->max[c].history[history_idx] = max[c].in);
            max[c].smoothed = s->max[c].history_sum / (float)num_history_vals;
        }
    }

    // Determine the input range for linked normalization. This is simply the
    // minimum of the per-channel minimums, and the maximum of the per-channel
    // maximums.
    rgb_min_smoothed = FFMIN3(min[0].smoothed, min[1].smoothed, min[2].smoothed);
    rgb_max_smoothed = FFMAX3(max[0].smoothed, max[1].smoothed, max[2].smoothed);

    // Now, process each channel to determine the input and output range and
    // build the lookup tables.
    for (c = 0; c < 3; c++) {
        int in_val;
        // Adjust the input range for this channel [min.smoothed,max.smoothed]
        // by mixing in the correct proportion of the linked normalization
        // input range [rgb_min_smoothed,rgb_max_smoothed].
        min[c].smoothed = (min[c].smoothed  *         s->independence)
                        + (rgb_min_smoothed * (1.0f - s->independence));
        max[c].smoothed = (max[c].smoothed  *         s->independence)
                        + (rgb_max_smoothed * (1.0f - s->independence));

        // Calculate the output range [min.out,max.out] as a ratio of the full-
        // strength output range [blackpt,whitept] and the original input range
        // [min.in,max.in], based on the user-specified filter strength.
        min[c].out = (s->blackpt[c] *         s->strength)
                   + (min[c].in     * (1.0f - s->strength));
        max[c].out = (s->whitept[c] *         s->strength)
                   + (max[c].in     * (1.0f - s->strength));

        // Now, build a lookup table which linearly maps the adjusted input range
        // [min.smoothed,max.smoothed] to the output range [min.out,max.out].
        // Perform the linear interpolation for each x:
        //     lut[x] = (int)(float(x - min.smoothed) * scale + max.out + 0.5)
        // where scale = (max.out - min.out) / (max.smoothed - min.smoothed)
        if (min[c].smoothed == max[c].smoothed) {
            // There is no dynamic range to expand. No mapping for this channel.
            for (in_val = min[c].in; in_val <= max[c].in; in_val++)
                lut[c][in_val] = min[c].out;
        } else {
            // We must set lookup values for all values in the original input
            // range [min.in,max.in]. Since the original input range may be
            // larger than [min.smoothed,max.smoothed], some output values may
            // fall outside the [0,255] dynamic range. We need to clamp them.
            float scale = (max[c].out - min[c].out) / (max[c].smoothed - min[c].smoothed);
            for (in_val = min[c].in; in_val <= max[c].in; in_val++) {
                int out_val = (in_val - min[c].smoothed) * scale + min[c].out + 0.5f;
                out_val = FFMAX(out_val, 0);
                out_val = FFMIN(out_val, 255);
                lut[c][in_val] = out_val;
            }
        }
    }

    // Finally, process the pixels of the input frame using the lookup tables.
    for (y = 0; y < in->height; y++) {
        uint8_t *inp = in->data[0] + y * in->linesize[0];
        uint8_t *outp = out->data[0] + y * out->linesize[0];
        for (x = 0; x < in->width; x++) {
            for (c = 0; c < 3; c++)
                outp[s->co[c]] = lut[c][inp[s->co[c]]];
            if (s->num_components == 4)
                // Copy alpha as-is.
                outp[s->co[3]] = inp[s->co[3]];
            inp += s->num_components;
            outp += s->num_components;
        }
    }

    s->frame_num++;
}

// Now we define all the functions accessible from the ff_vf_normalize class,
// which is ffmpeg's interface to our filter.  See doc/filter_design.txt and
// doc/writing_filters.txt for descriptions of what these interface functions
// are expected to do.

// Set the pixel formats that our filter supports. We should be able to process
// any 8-bit RGB formats. 16-bit support might be useful one day.
static int query_formats(AVFilterContext *ctx)
{
    static const enum AVPixelFormat pixel_fmts[] = {
        AV_PIX_FMT_RGB24,
        AV_PIX_FMT_BGR24,
        AV_PIX_FMT_ARGB,
        AV_PIX_FMT_RGBA,
        AV_PIX_FMT_ABGR,
        AV_PIX_FMT_BGRA,
        AV_PIX_FMT_0RGB,
        AV_PIX_FMT_RGB0,
        AV_PIX_FMT_0BGR,
        AV_PIX_FMT_BGR0,
        AV_PIX_FMT_NONE
    };
    // According to filter_design.txt, using ff_set_common_formats() this way
    // ensures the pixel formats of the input and output will be the same. That
    // saves a bit of effort possibly needing to handle format conversions.
    AVFilterFormats *formats = ff_make_format_list(pixel_fmts);
    if (!formats)
        return AVERROR(ENOMEM);
    return ff_set_common_formats(ctx, formats);
}

// At this point we know the pixel format used for both input and output.  We
// can also access the frame rate of the input video and allocate some memory
// appropriately
static int config_input(AVFilterLink *inlink)
{
    NormalizeContext *s = inlink->dst->priv;
    // Store offsets to R,G,B,A bytes respectively in each pixel
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
    int c;

    for (c = 0; c < 4; ++c)
        s->co[c] = desc->comp[c].offset;
    s->num_components = desc->nb_components;
    // Convert smoothing value to history_len (a count of frames to average,
    // must be at least 1).  Currently this is a direct assignment, but the
    // smoothing value was originally envisaged as a number of seconds.  In
    // future it would be nice to set history_len using a number of seconds,
    // but VFR video is currently an obstacle to doing so.
    s->history_len = s->smoothing + 1;
    // Allocate the history buffers -- there are 6 -- one for each extrema.
    // s->smoothing is limited to INT_MAX/8, so that (s->history_len * 6)
    // can't overflow on 32bit causing a too-small allocation.
    s->history_mem = av_malloc(s->history_len * 6);
    if (s->history_mem == NULL)
        return AVERROR(ENOMEM);

    for (c = 0; c < 3; c++) {
        s->min[c].history = s->history_mem + (c*2)   * s->history_len;
        s->max[c].history = s->history_mem + (c*2+1) * s->history_len;
    }
    return 0;
}

// Free any memory allocations here
static av_cold void uninit(AVFilterContext *ctx)
{
    NormalizeContext *s = ctx->priv;

    av_freep(&s->history_mem);
}

// This function is pretty much standard from doc/writing_filters.txt.  It
// tries to do in-place filtering where possible, only allocating a new output
// frame when absolutely necessary.
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    NormalizeContext *s = ctx->priv;
    AVFrame *out;
    // Set 'direct' if we can modify the input frame in-place.  Otherwise we
    // need to retrieve a new frame from the output link.
    int direct = av_frame_is_writable(in) && !ctx->is_disabled;

    if (direct) {
        out = in;
    } else {
        out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }
        av_frame_copy_props(out, in);
    }

    // Now we've got the input and output frames (which may be the same frame)
    // perform the filtering with our custom function.
    normalize(s, in, out);

    if (ctx->is_disabled) {
        av_frame_free(&out);
        return ff_filter_frame(outlink, in);
    }

    if (!direct)
        av_frame_free(&in);

    return ff_filter_frame(outlink, out);
}

static const AVFilterPad inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .filter_frame = filter_frame,
        .config_props = config_input,
    },
    { NULL }
};

static const AVFilterPad outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
    },
    { NULL }
};

AVFilter ff_vf_normalize = {
    .name          = "normalize",
    .description   = NULL_IF_CONFIG_SMALL("Normalize RGB video."),
    .priv_size     = sizeof(NormalizeContext),
    .priv_class    = &normalize_class,
    .uninit        = uninit,
    .query_formats = query_formats,
    .inputs        = inputs,
    .outputs       = outputs,
};