From 4d7a1a4619c9ed89c5f6dbf8a0ae85b9a8aa056b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Thu, 8 Sep 2022 03:45:12 +0200 Subject: swscale/input: Avoid calls to av_pix_fmt_desc_get() Up until now, libswscale/input.c used a macro to read an input pixel which involved a call to av_pix_fmt_desc_get() to find out whether the input pixel format is BE or LE despite this being known at compile-time (there are templates per pixfmt). Even worse, these calls are made in a loop, so that e.g. there are six calls to av_pix_fmt_desc_get() for every pair of UV pixel processed in rgb64ToUV_half_c_template(). This commit modifies these macros to ensure that isBE() is evaluated at compile-time. This saved 9743B of .text for me (GCC 11.2, -O3). For a simple RGB64LE->YUV420P transformation like ffmpeg -f lavfi -i haldclutsrc,format=rgba64le -pix_fmt yuv420p \ -threads 1 -t 1:00 -f null - the amount of decicycles spent in rgb64LEToUV_half_c (which is created via the template mentioned above) decreases from 19751 to 5341; for RGBA64BE the number went down from 11945 to 5393. For shared builds (where the call to av_pix_fmt_desc_get() is indirect) the old numbers are 15230 for RGBA64BE and 27502 for RGBA64LE, whereas the numbers with this patch are indistinguishable from the numbers from a static build. Also make the macros that are touched conform to the usual convention of using uppercase names while just at it. Reviewed-by: Anton Khirnov Reviewed-by: Paul B Mahol Reviewed-by: Michael Niedermayer Signed-off-by: Andreas Rheinhardt --- libswscale/input.c | 122 +++++++++++++++++++++++++++++------------------------ 1 file changed, 68 insertions(+), 54 deletions(-) (limited to 'libswscale') diff --git a/libswscale/input.c b/libswscale/input.c index 88e318e664..7ff7bfaa01 100644 --- a/libswscale/input.c +++ b/libswscale/input.c @@ -28,14 +28,21 @@ #include "config.h" #include "swscale_internal.h" -#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos)) +#define input_pixel(pos) (is_be ? AV_RB16(pos) : AV_RL16(pos)) + +#define IS_BE_LE 0 +#define IS_BE_BE 1 +#define IS_BE_ 0 +/* ENDIAN_IDENTIFIER needs to be "BE", "LE" or "". The latter is intended + * for single-byte cases where the concept of endianness does not apply. */ +#define IS_BE(ENDIAN_IDENTIFIER) IS_BE_ ## ENDIAN_IDENTIFIER #define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b) #define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r) static av_always_inline void rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width, - enum AVPixelFormat origin, int32_t *rgb2yuv) + enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be) { int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; int i; @@ -51,7 +58,7 @@ rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width, static av_always_inline void rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV, const uint16_t *src1, const uint16_t *src2, - int width, enum AVPixelFormat origin, int32_t *rgb2yuv) + int width, enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be) { int i; int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; @@ -70,7 +77,7 @@ rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV, static av_always_inline void rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV, const uint16_t *src1, const uint16_t *src2, - int width, enum AVPixelFormat origin, int32_t *rgb2yuv) + int width, enum AVPixelFormat origin, int32_t *rgb2yuv, int is_be) { int i; int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; @@ -86,13 +93,13 @@ rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV, } } -#define rgb64funcs(pattern, BE_LE, origin) \ +#define RGB64FUNCS_EXT(pattern, BE_LE, origin, is_be) \ static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\ int width, uint32_t *rgb2yuv, void *opq) \ { \ const uint16_t *src = (const uint16_t *) _src; \ uint16_t *dst = (uint16_t *) _dst; \ - rgb64ToY_c_template(dst, src, width, origin, rgb2yuv); \ + rgb64ToY_c_template(dst, src, width, origin, rgb2yuv, is_be); \ } \ \ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \ @@ -102,7 +109,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \ const uint16_t *src1 = (const uint16_t *) _src1, \ *src2 = (const uint16_t *) _src2; \ uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \ - rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ + rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \ } \ \ static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \ @@ -112,18 +119,20 @@ static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV const uint16_t *src1 = (const uint16_t *) _src1, \ *src2 = (const uint16_t *) _src2; \ uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \ - rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ + rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \ } +#define RGB64FUNCS(pattern, endianness, base_fmt) \ + RGB64FUNCS_EXT(pattern, endianness, base_fmt ## endianness, IS_BE(endianness)) -rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE) -rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE) -rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE) -rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE) +RGB64FUNCS(rgb, LE, AV_PIX_FMT_RGBA64) +RGB64FUNCS(rgb, BE, AV_PIX_FMT_RGBA64) +RGB64FUNCS(bgr, LE, AV_PIX_FMT_BGRA64) +RGB64FUNCS(bgr, BE, AV_PIX_FMT_BGRA64) static av_always_inline void rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width, enum AVPixelFormat origin, - int32_t *rgb2yuv) + int32_t *rgb2yuv, int is_be) { int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; int i; @@ -142,7 +151,7 @@ static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU, const uint16_t *src2, int width, enum AVPixelFormat origin, - int32_t *rgb2yuv) + int32_t *rgb2yuv, int is_be) { int i; int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; @@ -164,7 +173,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU, const uint16_t *src2, int width, enum AVPixelFormat origin, - int32_t *rgb2yuv) + int32_t *rgb2yuv, int is_be) { int i; int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; @@ -187,7 +196,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU, #undef b #undef input_pixel -#define rgb48funcs(pattern, BE_LE, origin) \ +#define RGB48FUNCS_EXT(pattern, BE_LE, origin, is_be) \ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, \ const uint8_t *_src, \ const uint8_t *unused0, const uint8_t *unused1,\ @@ -197,7 +206,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, \ { \ const uint16_t *src = (const uint16_t *)_src; \ uint16_t *dst = (uint16_t *)_dst; \ - rgb48ToY_c_template(dst, src, width, origin, rgb2yuv); \ + rgb48ToY_c_template(dst, src, width, origin, rgb2yuv, is_be); \ } \ \ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \ @@ -213,7 +222,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \ *src2 = (const uint16_t *)_src2; \ uint16_t *dstU = (uint16_t *)_dstU, \ *dstV = (uint16_t *)_dstV; \ - rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ + rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \ } \ \ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \ @@ -229,13 +238,15 @@ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \ *src2 = (const uint16_t *)_src2; \ uint16_t *dstU = (uint16_t *)_dstU, \ *dstV = (uint16_t *)_dstV; \ - rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ + rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv, is_be); \ } +#define RGB48FUNCS(pattern, endianness, base_fmt) \ + RGB48FUNCS_EXT(pattern, endianness, base_fmt ## endianness, IS_BE(endianness)) -rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE) -rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE) -rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE) -rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE) +RGB48FUNCS(rgb, LE, AV_PIX_FMT_RGB48) +RGB48FUNCS(rgb, BE, AV_PIX_FMT_RGB48) +RGB48FUNCS(bgr, LE, AV_PIX_FMT_BGR48) +RGB48FUNCS(bgr, BE, AV_PIX_FMT_BGR48) #define input_pixel(i) ((origin == AV_PIX_FMT_RGBA || \ origin == AV_PIX_FMT_BGRA || \ @@ -245,7 +256,7 @@ rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE) : ((origin == AV_PIX_FMT_X2RGB10LE || \ origin == AV_PIX_FMT_X2BGR10LE) \ ? AV_RL32(&src[(i) * 4]) \ - : (isBE(origin) ? AV_RB16(&src[(i) * 2]) \ + : (is_be ? AV_RB16(&src[(i) * 2]) \ : AV_RL16(&src[(i) * 2])))) static av_always_inline void rgb16_32ToY_c_template(int16_t *dst, @@ -257,7 +268,7 @@ static av_always_inline void rgb16_32ToY_c_template(int16_t *dst, int maskr, int maskg, int maskb, int rsh, int gsh, int bsh, int S, - int32_t *rgb2yuv) + int32_t *rgb2yuv, int is_be) { const int ry = rgb2yuv[RY_IDX]<