diff options
-rwxr-xr-x | configure | 4 | ||||
-rw-r--r-- | doc/ffmpeg_powerpc_performance_evaluation_howto.txt | 172 | ||||
-rw-r--r-- | ffmpeg.c | 5 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_altivec.c | 42 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_ppc.c | 79 | ||||
-rw-r--r-- | libavcodec/ppc/dsputil_ppc.h | 154 | ||||
-rw-r--r-- | libavcodec/ppc/fdct_altivec.c | 7 | ||||
-rw-r--r-- | libavcodec/ppc/fft_altivec.c | 6 | ||||
-rw-r--r-- | libavcodec/ppc/gmc_altivec.c | 8 | ||||
-rw-r--r-- | libavcodec/ppc/h264_altivec.c | 1 | ||||
-rw-r--r-- | libavcodec/ppc/h264_template_altivec.c | 16 | ||||
-rw-r--r-- | libavcodec/ppc/idct_altivec.c | 14 | ||||
-rw-r--r-- | libavcodec/ppc/mpegvideo_altivec.c | 5 |
13 files changed, 0 insertions, 513 deletions
@@ -211,8 +211,6 @@ Advanced options (experts only): --arch=ARCH select architecture [$arch] --cpu=CPU select the minimum required CPU (affects instruction selection, may crash on older CPUs) - --enable-powerpc-perf enable performance report on PPC - (requires enabling PMC) --disable-asm disable all assembler optimizations --disable-altivec disable AltiVec optimizations --disable-amd3dnow disable 3DNow! optimizations @@ -886,7 +884,6 @@ CONFIG_LIST=" nonfree pic postproc - powerpc_perf rdft runtime_cpudetect shared @@ -2772,7 +2769,6 @@ if enabled ppc; then echo "AltiVec enabled ${altivec-no}" echo "PPC 4xx optimizations ${ppc4xx-no}" echo "dcbzl available ${dcbzl-no}" - echo "performance report ${powerpc_perf-no}" fi if enabled sparc; then echo "VIS enabled ${vis-no}" diff --git a/doc/ffmpeg_powerpc_performance_evaluation_howto.txt b/doc/ffmpeg_powerpc_performance_evaluation_howto.txt deleted file mode 100644 index 2eb4ee71a5..0000000000 --- a/doc/ffmpeg_powerpc_performance_evaluation_howto.txt +++ /dev/null @@ -1,172 +0,0 @@ -FFmpeg & evaluating performance on the PowerPC Architecture HOWTO - -(c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - - - -I - Introduction - -The PowerPC architecture and its SIMD extension AltiVec offer some -interesting tools to evaluate performance and improve the code. -This document tries to explain how to use those tools with FFmpeg. - -The architecture itself offers two ways to evaluate the performance of -a given piece of code: - -1) The Time Base Registers (TBL) -2) The Performance Monitor Counter Registers (PMC) - -The first ones are always available, always active, but they're not very -accurate: the registers increment by one every four *bus* cycles. On -my 667 Mhz tiBook (ppc7450), this means once every twenty *processor* -cycles. So we won't use that. - -The PMC are much more useful: not only can they report cycle-accurate -timing, but they can also be used to monitor many other parameters, -such as the number of AltiVec stalls for every kind of instruction, -or instruction cache misses. The downside is that not all processors -support the PMC (all G3, all G4 and the 970 do support them), and -they're inactive by default - you need to activate them with a -dedicated tool. Also, the number of available PMC depends on the -procesor: the various 604 have 2, the various 75x (aka. G3) have 4, -and the various 74xx (aka G4) have 6. - -*WARNING*: The PowerPC 970 is not very well documented, and its PMC -registers are 64 bits wide. To properly notify the code, you *must* -tune for the 970 (using --tune=970), or the code will assume 32 bit -registers. - - -II - Enabling FFmpeg PowerPC performance support - -This needs to be done by hand. First, you need to configure FFmpeg as -usual, but add the "--powerpc-perf-enable" option. For instance: - -##### -./configure --prefix=/usr/local/ffmpeg-svn --cc=gcc-3.3 --tune=7450 --powerpc-perf-enable -##### - -This will configure FFmpeg to install inside /usr/local/ffmpeg-svn, -compiling with gcc-3.3 (you should try to use this one or a newer -gcc), and tuning for the PowerPC 7450 (i.e. the newer G4; as a rule of -thumb, those at 550Mhz and more). It will also enable the PMC. - -You may also edit the file "config.h" to enable the following line: - -##### -// #define ALTIVEC_USE_REFERENCE_C_CODE 1 -##### - -If you enable this line, then the code will not make use of AltiVec, -but will use the reference C code instead. This is useful to compare -performance between two versions of the code. - -Also, the number of enabled PMC is defined in "libavcodec/ppc/dsputil_ppc.h": - -##### -#define POWERPC_NUM_PMC_ENABLED 4 -##### - -If you have a G4 CPU, you can enable all 6 PMC. DO NOT enable more -PMC than available on your CPU! - -Then, simply compile FFmpeg as usual (make && make install). - - - -III - Using FFmpeg PowerPC performance support - -This FFmeg can be used exactly as usual. But before exiting, FFmpeg -will dump a per-function report that looks like this: - -##### -PowerPC performance report - Values are from the PMC registers, and represent whatever the - registers are set to record. - Function "gmc1_altivec" (pmc1): - min: 231 - max: 1339867 - avg: 558.25 (255302) - Function "gmc1_altivec" (pmc2): - min: 93 - max: 2164 - avg: 267.31 (255302) - Function "gmc1_altivec" (pmc3): - min: 72 - max: 1987 - avg: 276.20 (255302) -(...) -##### - -In this example, PMC1 was set to record CPU cycles, PMC2 was set to -record AltiVec Permute Stall Cycles, and PMC3 was set to record AltiVec -Issue Stalls. - -The function "gmc1_altivec" was monitored 255302 times, and the -minimum execution time was 231 processor cycles. The max and average -aren't much use, as it's very likely the OS interrupted execution for -reasons of its own :-( - -With the exact same settings and source file, but using the reference C -code we get: - -##### -PowerPC performance report - Values are from the PMC registers, and represent whatever the - registers are set to record. - Function "gmc1_altivec" (pmc1): - min: 592 - max: 2532235 - avg: 962.88 (255302) - Function "gmc1_altivec" (pmc2): - min: 0 - max: 33 - avg: 0.00 (255302) - Function "gmc1_altivec" (pmc3): - min: 0 - max: 350 - avg: 0.03 (255302) -(...) -##### - -592 cycles, so the fastest AltiVec execution is about 2.5x faster than -the fastest C execution in this example. It's not perfect but it's not -bad (well I wrote this function so I can't say otherwise :-). - -Once you have that kind of report, you can try to improve things by -finding what goes wrong and fixing it; in the example above, one -should try to diminish the number of AltiVec stalls, as this *may* -improve performance. - - - -IV) Enabling the PMC in Mac OS X - -This is easy. Use "Monster" and "monster". Those tools come from -Apple's CHUD package, and can be found hidden in the developer web -site & FTP site. "MONster" is the graphical application, use it to -generate a config file specifying what each register should -monitor. Then use the command-line application "monster" to use that -config file, and enjoy the results. - -Note that "MONster" can be used for many other things, but it's -documented by Apple, it's not my subject. - -If you are using CHUD 4.4.2 or later, you'll notice that MONster is -no longer available. It's been superseeded by Shark, where -configuration of PMCs is available as a plugin. - - - -V) Enabling the PMC on Linux - -On linux you may use oprofile from http://oprofile.sf.net, depending on the -version and the cpu you may need to apply a patch[1] to access a set of the -possibile counters from the userspace application. You can always define them -using the kernel interface /dev/oprofile/* . - -[1] http://dev.gentoo.org/~lu_zero/development/oprofile-g4-20060423.patch - --- -Romain Dolbeau <romain@dolbeau.org> -Luca Barbato <lu_zero@gentoo.org> @@ -618,11 +618,6 @@ static int av_exit(int ret) av_free(video_standard); -#if CONFIG_POWERPC_PERF - void powerpc_display_perf_report(void); - powerpc_display_perf_report(); -#endif /* CONFIG_POWERPC_PERF */ - for (i=0;i<AVMEDIA_TYPE_NB;i++) av_free(avcodec_opts[i]); av_free(avformat_opts); diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index 925b6c83b7..163323264c 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -25,7 +25,6 @@ #include <altivec.h> #endif #include "libavcodec/dsputil.h" -#include "dsputil_ppc.h" #include "util_altivec.h" #include "types_altivec.h" #include "dsputil_altivec.h" @@ -610,7 +609,6 @@ static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { /* next one assumes that ((line_size % 16) == 0) */ void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); register vector unsigned char pixelsv1, pixelsv2; register vector unsigned char pixelsv1B, pixelsv2B; register vector unsigned char pixelsv1C, pixelsv2C; @@ -622,7 +620,6 @@ POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); register int line_size_3 = line_size + line_size_2; register int line_size_4 = line_size << 2; -POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); // hand-unrolling the loop by 4 gains about 15% // mininum execution time goes from 74 to 60 cycles // it's faster than -funroll-loops, but using @@ -659,20 +656,16 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); block +=line_size_4; } #endif -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); } /* next one assumes that ((line_size % 16) == 0) */ #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; register vector unsigned char perm = vec_lvsl(0, pixels); int i; -POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); - for (i = 0; i < h; i++) { pixelsv1 = vec_ld( 0, pixels); pixelsv2 = vec_ld(16,pixels); @@ -683,19 +676,14 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); pixels+=line_size; block +=line_size; } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); } /* next one assumes that ((line_size % 8) == 0) */ static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; int i; -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); - for (i = 0; i < h; i++) { /* block is 8 bytes-aligned, so we're either in the left block (16 bytes-aligned) or in the right block (not) */ @@ -719,14 +707,11 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); pixels += line_size; block += line_size; } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); } /* next one assumes that ((line_size % 8) == 0) */ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); register int i; register vector unsigned char pixelsv1, pixelsv2, pixelsavg; register vector unsigned char blockv, temp1, temp2; @@ -748,7 +733,6 @@ POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vctwo); -POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); for (i = 0; i < h ; i++) { int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); @@ -782,14 +766,11 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); block += line_size; pixels += line_size; } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); } /* next one assumes that ((line_size % 8) == 0) */ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); register int i; register vector unsigned char pixelsv1, pixelsv2, pixelsavg; register vector unsigned char blockv, temp1, temp2; @@ -812,7 +793,6 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vcone); -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); for (i = 0; i < h ; i++) { int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); @@ -846,14 +826,11 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); block += line_size; pixels += line_size; } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); } /* next one assumes that ((line_size % 16) == 0) */ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); register int i; register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; register vector unsigned char blockv, temp1, temp2; @@ -862,8 +839,6 @@ POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); -POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); - temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); @@ -919,14 +894,11 @@ POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); block += line_size; pixels += line_size; } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); } /* next one assumes that ((line_size % 16) == 0) */ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); register int i; register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; register vector unsigned char blockv, temp1, temp2; @@ -936,8 +908,6 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); - temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); @@ -993,18 +963,14 @@ POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); block += line_size; pixels += line_size; } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); } static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ -POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); int sum; register const vector unsigned char vzero = (const vector unsigned char)vec_splat_u8(0); register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; -POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); { register const vector signed short vprod1 =(const vector signed short) { 1,-1, 1,-1, 1,-1, 1,-1 }; @@ -1100,7 +1066,6 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); vsum = vec_splat(vsum, 3); vec_ste(vsum, 0, &sum); } -POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); return sum; } @@ -1319,16 +1284,13 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, } static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ -POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); int score; -POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); if (h==16) { dst += 8*stride; src += 8*stride; score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); } -POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); return score; } @@ -1358,7 +1320,6 @@ static void vorbis_inverse_coupling_altivec(float *mag, float *ang, /* next one assumes that ((line_size % 8) == 0) */ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); register int i; register vector unsigned char pixelsv1, pixelsv2, pixelsavg; register vector unsigned char blockv, temp1, temp2, blocktemp; @@ -1383,7 +1344,6 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vctwo); -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); for (i = 0; i < h ; i++) { int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); @@ -1418,8 +1378,6 @@ POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); block += line_size; pixels += line_size; } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); } void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index cf0ac39de3..229ca09960 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -21,9 +21,6 @@ */ #include "libavcodec/dsputil.h" - -#include "dsputil_ppc.h" - #include "dsputil_altivec.h" int mm_flags = 0; @@ -39,63 +36,6 @@ int mm_support(void) return result; } -#if CONFIG_POWERPC_PERF -unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; -/* list below must match enum in dsputil_ppc.h */ -static unsigned char* perfname[] = { - "ff_fft_calc_altivec", - "gmc1_altivec", - "dct_unquantize_h263_altivec", - "fdct_altivec", - "idct_add_altivec", - "idct_put_altivec", - "put_pixels16_altivec", - "avg_pixels16_altivec", - "avg_pixels8_altivec", - "put_pixels8_xy2_altivec", - "put_no_rnd_pixels8_xy2_altivec", - "put_pixels16_xy2_altivec", - "put_no_rnd_pixels16_xy2_altivec", - "hadamard8_diff8x8_altivec", - "hadamard8_diff16_altivec", - "avg_pixels8_xy2_altivec", - "clear_blocks_dcbz32_ppc", - "clear_blocks_dcbz128_ppc", - "put_h264_chroma_mc8_altivec", - "avg_h264_chroma_mc8_altivec", - "put_h264_qpel16_h_lowpass_altivec", - "avg_h264_qpel16_h_lowpass_altivec", - "put_h264_qpel16_v_lowpass_altivec", - "avg_h264_qpel16_v_lowpass_altivec", - "put_h264_qpel16_hv_lowpass_altivec", - "avg_h264_qpel16_hv_lowpass_altivec", - "" -}; -#include <stdio.h> -#endif - -#if CONFIG_POWERPC_PERF -void powerpc_display_perf_report(void) -{ - int i, j; - av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); - for(i = 0 ; i < powerpc_perf_total ; i++) { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { - if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) - av_log(NULL, AV_LOG_INFO, - " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", - perfname[i], - j+1, - perfdata[j][i][powerpc_data_min], - perfdata[j][i][powerpc_data_max], - (double)perfdata[j][i][powerpc_data_sum] / - (double)perfdata[j][i][powerpc_data_num], - perfdata[j][i][powerpc_data_num]); - } - } -} -#endif /* CONFIG_POWERPC_PERF */ - /* ***** WARNING ***** WARNING ***** WARNING ***** */ /* clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a @@ -118,10 +58,8 @@ and <http://developer.apple.com/technotes/tn/tn2086.html> */ static void clear_blocks_dcbz32_ppc(DCTELEM *blocks) { -POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); register int misal = ((unsigned long)blocks & 0x00000010); register int i = 0; -POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); #if 1 if (misal) { ((unsigned long*)blocks)[0] = 0L; @@ -143,7 +81,6 @@ POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); #else memset(blocks, 0, sizeof(DCTELEM)*6*64); #endif -POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); } /* same as above, when dcbzl clear a whole 128B cache line @@ -151,10 +88,8 @@ POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); #if HAVE_DCBZL static void clear_blocks_dcbz128_ppc(DCTELEM *blocks) { -POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); register int misal = ((unsigned long)blocks & 0x0000007f); register int i = 0; -POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); #if 1 if (misal) { // we could probably also optimize this case, @@ -169,7 +104,6 @@ POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); #else memset(blocks, 0, sizeof(DCTELEM)*6*64); #endif -POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); } #else static void clear_blocks_dcbz128_ppc(DCTELEM *blocks) @@ -279,19 +213,6 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) } } -#if CONFIG_POWERPC_PERF - { - int i, j; - for (i = 0 ; i < powerpc_perf_total ; i++) { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) { - perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; - perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; - } - } - } -#endif /* CONFIG_POWERPC_PERF */ } #endif /* HAVE_ALTIVEC */ } diff --git a/libavcodec/ppc/dsputil_ppc.h b/libavcodec/ppc/dsputil_ppc.h deleted file mode 100644 index d028574f02..0000000000 --- a/libavcodec/ppc/dsputil_ppc.h +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_DSPUTIL_PPC_H -#define AVCODEC_PPC_DSPUTIL_PPC_H - -#include "config.h" - -#if CONFIG_POWERPC_PERF -void powerpc_display_perf_report(void); -/* the 604* have 2, the G3* have 4, the G4s have 6, - and the G5 are completely different (they MUST use - ARCH_PPC64, and let's hope all future 64 bis PPC - will use the same PMCs... */ -#define POWERPC_NUM_PMC_ENABLED 6 -/* if you add to the enum below, also add to the perfname array - in dsputil_ppc.c */ -enum powerpc_perf_index { - altivec_fft_num = 0, - altivec_gmc1_num, - altivec_dct_unquantize_h263_num, - altivec_fdct, - altivec_idct_add_num, - altivec_idct_put_num, - altivec_put_pixels16_num, - altivec_avg_pixels16_num, - altivec_avg_pixels8_num, - altivec_put_pixels8_xy2_num, - altivec_put_no_rnd_pixels8_xy2_num, - altivec_put_pixels16_xy2_num, - altivec_put_no_rnd_pixels16_xy2_num, - altivec_hadamard8_diff8x8_num, - altivec_hadamard8_diff16_num, - altivec_avg_pixels8_xy2_num, - powerpc_clear_blocks_dcbz32, - powerpc_clear_blocks_dcbz128, - altivec_put_h264_chroma_mc8_num, - altivec_avg_h264_chroma_mc8_num, - altivec_put_h264_qpel16_h_lowpass_num, - altivec_avg_h264_qpel16_h_lowpass_num, - altivec_put_h264_qpel16_v_lowpass_num, - altivec_avg_h264_qpel16_v_lowpass_num, - altivec_put_h264_qpel16_hv_lowpass_num, - altivec_avg_h264_qpel16_hv_lowpass_num, - powerpc_perf_total -}; -enum powerpc_data_index { - powerpc_data_min = 0, - powerpc_data_max, - powerpc_data_sum, - powerpc_data_num, - powerpc_data_total -}; -extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; - -#if !ARCH_PPC64 -#define POWERP_PMC_DATATYPE unsigned long -#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a)) -#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a)) -#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a)) -#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#else /* ARCH_PPC64 */ -#define POWERP_PMC_DATATYPE unsigned long long -#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a)) -#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a)) -#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a)) -#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#endif /* ARCH_PPC64 */ -#define POWERPC_PERF_DECLARE(a, cond) \ - POWERP_PMC_DATATYPE \ - pmc_start[POWERPC_NUM_PMC_ENABLED], \ - pmc_stop[POWERPC_NUM_PMC_ENABLED], \ - pmc_loop_index; -#define POWERPC_PERF_START_COUNT(a, cond) do { \ - POWERPC_GET_PMC6(pmc_start[5]); \ - POWERPC_GET_PMC5(pmc_start[4]); \ - POWERPC_GET_PMC4(pmc_start[3]); \ - POWERPC_GET_PMC3(pmc_start[2]); \ - POWERPC_GET_PMC2(pmc_start[1]); \ - POWERPC_GET_PMC1(pmc_start[0]); \ - } while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do { \ - POWERPC_GET_PMC1(pmc_stop[0]); \ - POWERPC_GET_PMC2(pmc_stop[1]); \ - POWERPC_GET_PMC3(pmc_stop[2]); \ - POWERPC_GET_PMC4(pmc_stop[3]); \ - POWERPC_GET_PMC5(pmc_stop[4]); \ - POWERPC_GET_PMC6(pmc_stop[5]); \ - if (cond) { \ - for(pmc_loop_index = 0; \ - pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ - pmc_loop_index++) { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \ - POWERP_PMC_DATATYPE diff = \ - pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ - if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ - perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ - if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ - perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ - perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ - perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ - } \ - } \ - } \ -} while (0) -#else /* CONFIG_POWERPC_PERF */ -// those are needed to avoid empty statements. -#define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused)) -#define POWERPC_PERF_START_COUNT(a, cond) do {} while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) -#endif /* CONFIG_POWERPC_PERF */ - -#endif /* AVCODEC_PPC_DSPUTIL_PPC_H */ diff --git a/libavcodec/ppc/fdct_altivec.c b/libavcodec/ppc/fdct_altivec.c index 8f3bc26947..6309a47f32 100644 --- a/libavcodec/ppc/fdct_altivec.c +++ b/libavcodec/ppc/fdct_altivec.c @@ -24,7 +24,6 @@ #endif #include "libavutil/common.h" #include "libavcodec/dsputil.h" -#include "dsputil_ppc.h" #include "dsputil_altivec.h" #define vs16(v) ((vector signed short)(v)) @@ -198,7 +197,6 @@ static vector float fdctconsts[3] = { void fdct_altivec(int16_t *block) { -POWERPC_PERF_DECLARE(altivec_fdct, 1); vector signed short *bp; vector float *cp; vector float b00, b10, b20, b30, b40, b50, b60, b70; @@ -206,9 +204,6 @@ POWERPC_PERF_DECLARE(altivec_fdct, 1); vector float mzero, cnst, cnsts0, cnsts1, cnsts2; vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; - POWERPC_PERF_START_COUNT(altivec_fdct, 1); - - /* setup constants {{{ */ /* mzero = -0.0 */ mzero = ((vector float)vec_splat_u32(-1)); @@ -487,8 +482,6 @@ POWERPC_PERF_DECLARE(altivec_fdct, 1); #undef CTS /* }}} */ - -POWERPC_PERF_STOP_COUNT(altivec_fdct, 1); } /* vim:set foldmethod=marker foldlevel=0: */ diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c index ce35ab602c..5ef3bd3559 100644 --- a/libavcodec/ppc/fft_altivec.c +++ b/libavcodec/ppc/fft_altivec.c @@ -21,7 +21,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavcodec/fft.h" -#include "dsputil_ppc.h" #include "util_altivec.h" #include "dsputil_altivec.h" @@ -38,7 +37,6 @@ */ static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z) { -POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); register const vector float vczero = (const vector float)vec_splat_u32(0.); int ln = s->nbits; @@ -48,8 +46,6 @@ POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); FFTComplex *cptr, *cptr1; int k; -POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); - np = 1 << ln; { @@ -132,8 +128,6 @@ POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); nblocks = nblocks >> 1; nloops = nloops << 1; } while (nblocks != 0); - -POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); } av_cold void ff_fft_init_altivec(FFTContext *s) diff --git a/libavcodec/ppc/gmc_altivec.c b/libavcodec/ppc/gmc_altivec.c index fa71047b72..0e93c337f7 100644 --- a/libavcodec/ppc/gmc_altivec.c +++ b/libavcodec/ppc/gmc_altivec.c @@ -21,7 +21,6 @@ */ #include "libavcodec/dsputil.h" -#include "dsputil_ppc.h" #include "util_altivec.h" #include "types_altivec.h" #include "dsputil_altivec.h" @@ -30,10 +29,8 @@ altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, to preserve proper dst alignment. */ -#define GMC1_PERF_COND (h==8) void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { -POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder; const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = { @@ -51,9 +48,6 @@ POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); unsigned long dst_odd = (unsigned long)dst & 0x0000000F; unsigned long src_really_odd = (unsigned long)src & 0x0000000F; - -POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); - tempA = vec_ld(0, (unsigned short*)ABCD); Av = vec_splat(tempA, 0); Bv = vec_splat(tempA, 1); @@ -133,6 +127,4 @@ POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); dst += stride; src += stride; } - -POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); } diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c index 47e416b82b..c9c29442ce 100644 --- a/libavcodec/ppc/h264_altivec.c +++ b/libavcodec/ppc/h264_altivec.c @@ -22,7 +22,6 @@ #include "libavcodec/h264data.h" #include "libavcodec/h264dsp.h" -#include "dsputil_ppc.h" #include "dsputil_altivec.h" #include "util_altivec.h" #include "types_altivec.h" diff --git a/libavcodec/ppc/h264_template_altivec.c b/libavcodec/ppc/h264_template_altivec.c index c0a4eb7a60..8cf39c8be7 100644 --- a/libavcodec/ppc/h264_template_altivec.c +++ b/libavcodec/ppc/h264_template_altivec.c @@ -77,7 +77,6 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); DECLARE_ALIGNED(16, signed int, ABCD)[4] = {((8 - x) * (8 - y)), (( x) * (8 - y)), @@ -103,8 +102,6 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, vec_s16 vsrc2ssH, vsrc3ssH, psum; vec_u8 vdst, ppsum, vfdst, fsum; - POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); - if (((unsigned long)dst) % 16 == 0) { fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, @@ -203,7 +200,6 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, } } } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); } /* this code assume that stride % 16 == 0 */ @@ -295,7 +291,6 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i /* this code assume stride % 16 == 0 */ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); register int i; LOAD_ZERO; @@ -323,8 +318,6 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i vec_u8 sum, vdst, fsum; - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); - for (i = 0 ; i < 16 ; i ++) { vec_u8 srcR1 = vec_ld(-2, src); vec_u8 srcR2 = vec_ld(14, src); @@ -433,13 +426,10 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i src += srcStride; dst += dstStride; } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); } /* this code assume stride % 16 == 0 */ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); - register int i; LOAD_ZERO; @@ -490,8 +480,6 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); - for (i = 0 ; i < 16 ; i++) { srcP3a = vec_ld(0, srcbis += srcStride); srcP3b = vec_ld(16, srcbis); @@ -544,12 +532,10 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i dst += dstStride; } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); } /* this code assume stride % 16 == 0 *and* tmp is properly aligned */ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); register int i; LOAD_ZERO; const vec_u8 permM2 = vec_lvsl(-2, src); @@ -589,7 +575,6 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, vec_u8 fsum, sumv, sum, vdst; vec_s16 ssume, ssumo; - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); src -= (2 * srcStride); for (i = 0 ; i < 21 ; i ++) { vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; @@ -779,5 +764,4 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, dst += dstStride; } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); } diff --git a/libavcodec/ppc/idct_altivec.c b/libavcodec/ppc/idct_altivec.c index 7c6b79e9c0..d65ba24f3a 100644 --- a/libavcodec/ppc/idct_altivec.c +++ b/libavcodec/ppc/idct_altivec.c @@ -43,7 +43,6 @@ #endif #include "libavcodec/dsputil.h" #include "types_altivec.h" -#include "dsputil_ppc.h" #include "dsputil_altivec.h" #define IDCT_HALF \ @@ -161,13 +160,9 @@ static const vec_s16 constants[5] = { void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) { -POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); vec_s16 *block = (vec_s16*)blk; vec_u8 tmp; -#if CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); -#endif IDCT #define COPY(dest,src) \ @@ -183,13 +178,10 @@ POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); COPY (dest, vx5) dest += stride; COPY (dest, vx6) dest += stride; COPY (dest, vx7) - -POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); } void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) { -POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); vec_s16 *block = (vec_s16*)blk; vec_u8 tmp; vec_s16 tmp2, tmp3; @@ -197,10 +189,6 @@ POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); vec_u8 perm1; vec_u8 p0, p1, p; -#if CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); -#endif - IDCT p0 = vec_lvsl (0, dest); @@ -226,7 +214,5 @@ POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); ADD (dest, vx5, perm1) dest += stride; ADD (dest, vx6, perm0) dest += stride; ADD (dest, vx7, perm1) - -POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); } diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c index 07e63be47a..0126b7f42e 100644 --- a/libavcodec/ppc/mpegvideo_altivec.c +++ b/libavcodec/ppc/mpegvideo_altivec.c @@ -26,7 +26,6 @@ #include "libavcodec/dsputil.h" #include "libavcodec/mpegvideo.h" -#include "dsputil_ppc.h" #include "util_altivec.h" #include "types_altivec.h" #include "dsputil_altivec.h" @@ -479,14 +478,11 @@ static int dct_quantize_altivec(MpegEncContext* s, static void dct_unquantize_h263_altivec(MpegEncContext *s, DCTELEM *block, int n, int qscale) { -POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1); int i, level, qmul, qadd; int nCoeffs; assert(s->block_last_index[n]>=0); -POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); - qadd = (qscale - 1) | 1; qmul = qscale << 1; @@ -569,7 +565,6 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); block[0] = backup_0; } } -POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); } |