From fc35039e97ae63e9542404f512fa126055ee4c67 Mon Sep 17 00:00:00 2001 From: paulberg Date: Sat, 25 Feb 2006 22:08:39 +0000 Subject: Initial commit of Vollbeding changes --- cdjpeg.h | 5 +- cjpeg.c | 38 +- jcapimin.c | 4 +- jccoefct.c | 8 +- jcdctmgr.c | 289 +++- jcinit.c | 6 +- jcmainct.c | 14 +- jcmarker.c | 13 +- jcmaster.c | 217 ++- jcparam.c | 68 +- jcprepct.c | 14 +- jcsample.c | 94 +- jctrans.c | 2 +- jdapistd.c | 2 +- jdcoefct.c | 14 +- jdct.h | 239 ++- jddctmgr.c | 125 +- jdhuff.c | 126 +- jdinput.c | 10 +- jdmainct.c | 42 +- jdmaster.c | 148 +- jdsample.c | 26 +- jdtrans.c | 6 +- jerror.h | 16 +- jfdctflt.c | 47 +- jfdctfst.c | 47 +- jfdctint.c | 4202 ++++++++++++++++++++++++++++++++++++++++++++++++++-- jidctint.c | 4674 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ jmorecfg.h | 7 +- jpegint.h | 2 + jpeglib.h | 38 +- jpegtran.c | 94 +- makefile.cfg | 16 +- rdswitch.c | 39 +- transupp.c | 1039 ++++++++++--- transupp.h | 126 +- 36 files changed, 11104 insertions(+), 753 deletions(-) diff --git a/cdjpeg.h b/cdjpeg.h index 2b387b6..ed024ac 100644 --- a/cdjpeg.h +++ b/cdjpeg.h @@ -104,6 +104,7 @@ typedef struct cdjpeg_progress_mgr * cd_progress_ptr; #define jinit_write_targa jIWrTarga #define read_quant_tables RdQTables #define read_scan_script RdScnScript +#define set_quality_ratings SetQRates #define set_quant_slots SetQSlots #define set_sample_factors SetSFacts #define read_color_map RdCMap @@ -131,8 +132,10 @@ EXTERN(djpeg_dest_ptr) jinit_write_targa JPP((j_decompress_ptr cinfo)); /* cjpeg support routines (in rdswitch.c) */ EXTERN(boolean) read_quant_tables JPP((j_compress_ptr cinfo, char * filename, - int scale_factor, boolean force_baseline)); + boolean force_baseline)); EXTERN(boolean) read_scan_script JPP((j_compress_ptr cinfo, char * filename)); +EXTERN(boolean) set_quality_ratings JPP((j_compress_ptr cinfo, char *arg, + boolean force_baseline)); EXTERN(boolean) set_quant_slots JPP((j_compress_ptr cinfo, char *arg)); EXTERN(boolean) set_sample_factors JPP((j_compress_ptr cinfo, char *arg)); diff --git a/cjpeg.c b/cjpeg.c index f2a929f..b55ad6f 100644 --- a/cjpeg.c +++ b/cjpeg.c @@ -149,7 +149,7 @@ usage (void) #endif fprintf(stderr, "Switches (names may be abbreviated):\n"); - fprintf(stderr, " -quality N Compression quality (0..100; 5-95 is useful range)\n"); + fprintf(stderr, " -quality N[,...] Compression quality (0..100; 5-95 is useful range)\n"); fprintf(stderr, " -grayscale Create monochrome JPEG file\n"); #ifdef ENTROPY_OPT_SUPPORTED fprintf(stderr, " -optimize Optimize Huffman table (smaller file, but slow compression)\n"); @@ -157,6 +157,9 @@ usage (void) #ifdef C_PROGRESSIVE_SUPPORTED fprintf(stderr, " -progressive Create progressive JPEG file\n"); #endif +#ifdef DCT_SCALING_SUPPORTED + fprintf(stderr, " -scale M/N Scale image by fraction M/N, eg, 1/2\n"); +#endif #ifdef TARGA_SUPPORTED fprintf(stderr, " -targa Input file is Targa format (usually not needed)\n"); #endif @@ -209,21 +212,16 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, { int argn; char * arg; - int quality; /* -quality parameter */ - int q_scale_factor; /* scaling percentage for -qtables */ boolean force_baseline; boolean simple_progressive; + char * qualityarg = NULL; /* saves -quality parm if any */ char * qtablefile = NULL; /* saves -qtables filename if any */ char * qslotsarg = NULL; /* saves -qslots parm if any */ char * samplearg = NULL; /* saves -sample parm if any */ char * scansarg = NULL; /* saves -scans parm if any */ /* Set up default JPEG parameters. */ - /* Note that default -quality level need not, and does not, - * match the default scaling for an explicit -qtables argument. - */ - quality = 75; /* default -quality value */ - q_scale_factor = 100; /* default to no scaling for -qtables */ + force_baseline = FALSE; /* by default, allow 16-bit quantizers */ simple_progressive = FALSE; is_targa = FALSE; @@ -328,13 +326,10 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, #endif } else if (keymatch(arg, "quality", 1)) { - /* Quality factor (quantization table scaling factor). */ + /* Quality ratings (quantization table scaling factors). */ if (++argn >= argc) /* advance to next argument */ usage(); - if (sscanf(argv[argn], "%d", &quality) != 1) - usage(); - /* Change scale factor in case -qtables is present. */ - q_scale_factor = jpeg_quality_scaling(quality); + qualityarg = argv[argn]; } else if (keymatch(arg, "qslots", 2)) { /* Quantization table slot numbers. */ @@ -382,7 +377,15 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, * default sampling factors. */ - } else if (keymatch(arg, "scans", 2)) { + } else if (keymatch(arg, "scale", 4)) { + /* Scale the image by a fraction M/N. */ + if (++argn >= argc) /* advance to next argument */ + usage(); + if (sscanf(argv[argn], "%d/%d", + &cinfo->scale_num, &cinfo->scale_denom) != 2) + usage(); + + } else if (keymatch(arg, "scans", 4)) { /* Set scan script. */ #ifdef C_MULTISCAN_FILES_SUPPORTED if (++argn >= argc) /* advance to next argument */ @@ -422,11 +425,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, /* Set quantization tables for selected quality. */ /* Some or all may be overridden if -qtables is present. */ - jpeg_set_quality(cinfo, quality, force_baseline); + if (qualityarg != NULL) /* process -quality if it was present */ + if (! set_quality_ratings(cinfo, qualityarg, force_baseline)) + usage(); if (qtablefile != NULL) /* process -qtables if it was present */ - if (! read_quant_tables(cinfo, qtablefile, - q_scale_factor, force_baseline)) + if (! read_quant_tables(cinfo, qtablefile, force_baseline)) usage(); if (qslotsarg != NULL) /* process -qslots if it was present */ diff --git a/jcapimin.c b/jcapimin.c index 54fb8c5..563ab42 100644 --- a/jcapimin.c +++ b/jcapimin.c @@ -63,8 +63,10 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize) cinfo->comp_info = NULL; - for (i = 0; i < NUM_QUANT_TBLS; i++) + for (i = 0; i < NUM_QUANT_TBLS; i++) { cinfo->quant_tbl_ptrs[i] = NULL; + cinfo->q_scale_factor[i] = 100; + } for (i = 0; i < NUM_HUFF_TBLS; i++) { cinfo->dc_huff_tbl_ptrs[i] = NULL; diff --git a/jccoefct.c b/jccoefct.c index 1963ddb..14ccaeb 100644 --- a/jccoefct.c +++ b/jccoefct.c @@ -170,7 +170,8 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width : compptr->last_col_width; xpos = MCU_col_num * compptr->MCU_sample_width; - ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */ + ypos = yoffset * compptr->DCT_v_scaled_size; + /* ypos == (yoffset+yindex) * DCTSIZE */ for (yindex = 0; yindex < compptr->MCU_height; yindex++) { if (coef->iMCU_row_num < last_iMCU_row || yoffset+yindex < compptr->last_row_height) { @@ -195,7 +196,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) } } blkn += compptr->MCU_width; - ypos += DCTSIZE; + ypos += compptr->DCT_v_scaled_size; } } /* Try to write the MCU. In event of a suspension failure, we will @@ -281,7 +282,8 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf) thisblockrow = buffer[block_row]; (*cinfo->fdct->forward_DCT) (cinfo, compptr, input_buf[ci], thisblockrow, - (JDIMENSION) (block_row * DCTSIZE), + (JDIMENSION) (block_row * + compptr->DCT_v_scaled_size), (JDIMENSION) 0, blocks_across); if (ndummy > 0) { /* Create dummy blocks at the right edge of the image. */ diff --git a/jcdctmgr.c b/jcdctmgr.c index 61fa79b..f8f5e76 100644 --- a/jcdctmgr.c +++ b/jcdctmgr.c @@ -23,7 +23,7 @@ typedef struct { struct jpeg_forward_dct pub; /* public fields */ /* Pointer to the DCT routine actually in use */ - forward_DCT_method_ptr do_dct; + forward_DCT_method_ptr do_dct[MAX_COMPONENTS]; /* The actual post-DCT divisors --- not identical to the quant table * entries, because of scaling (especially for an unnormalized DCT). @@ -33,7 +33,7 @@ typedef struct { #ifdef DCT_FLOAT_SUPPORTED /* Same as above for the floating-point case. */ - float_DCT_method_ptr do_float_dct; + float_DCT_method_ptr do_float_dct[MAX_COMPONENTS]; FAST_FLOAT * float_divisors[NUM_QUANT_TBLS]; #endif } my_fdct_controller; @@ -61,6 +61,223 @@ start_pass_fdctmgr (j_compress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { + /* Select the proper DCT routine for this component's scaling */ + switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) { +#ifdef DCT_SCALING_SUPPORTED + case ((1 << 8) + 1): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_1x1; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((2 << 8) + 2): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_2x2; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((3 << 8) + 3): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_3x3; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((4 << 8) + 4): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_4x4; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((5 << 8) + 5): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_5x5; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((6 << 8) + 6): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_6x6; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((7 << 8) + 7): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_7x7; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((9 << 8) + 9): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_9x9; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((10 << 8) + 10): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_10x10; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((11 << 8) + 11): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_11x11; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((12 << 8) + 12): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_12x12; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((13 << 8) + 13): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_13x13; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((14 << 8) + 14): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_14x14; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((15 << 8) + 15): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_15x15; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((16 << 8) + 16): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_16x16; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((16 << 8) + 8): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_16x8; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((14 << 8) + 7): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_14x7; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((12 << 8) + 6): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_12x6; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((10 << 8) + 5): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_10x5; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((8 << 8) + 4): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_8x4; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((6 << 8) + 3): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_6x3; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((4 << 8) + 2): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_4x2; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((2 << 8) + 1): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_2x1; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((8 << 8) + 16): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_8x16; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((7 << 8) + 14): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_7x14; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((6 << 8) + 12): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_6x12; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((5 << 8) + 10): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_5x10; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((4 << 8) + 8): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_4x8; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((3 << 8) + 6): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_3x6; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((2 << 8) + 4): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_2x4; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + case ((1 << 8) + 2): + if (cinfo->dct_method == JDCT_ISLOW) + fdct->do_dct[ci] = jpeg_fdct_1x2; + else + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; +#endif + case ((DCTSIZE << 8) + DCTSIZE): + switch (cinfo->dct_method) { +#ifdef DCT_ISLOW_SUPPORTED + case JDCT_ISLOW: + fdct->do_dct[ci] = jpeg_fdct_islow; + break; +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: + fdct->do_dct[ci] = jpeg_fdct_ifast; + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + fdct->do_float_dct[ci] = jpeg_fdct_float; + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + break; + default: + ERREXIT2(cinfo, JERR_BAD_DCTSIZE, + compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size); + break; + } qtblno = compptr->quant_tbl_no; /* Make sure specified quantization table is present */ if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS || @@ -185,43 +402,16 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr, { /* This routine is heavily used, so it's worth coding it tightly. */ my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; - forward_DCT_method_ptr do_dct = fdct->do_dct; + forward_DCT_method_ptr do_dct = fdct->do_dct[compptr->component_index]; DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no]; DCTELEM workspace[DCTSIZE2]; /* work area for FDCT subroutine */ JDIMENSION bi; sample_data += start_row; /* fold in the vertical offset once */ - for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { - /* Load data into workspace, applying unsigned->signed conversion */ - { register DCTELEM *workspaceptr; - register JSAMPROW elemptr; - register int elemr; - - workspaceptr = workspace; - for (elemr = 0; elemr < DCTSIZE; elemr++) { - elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; -#else - { register int elemc; - for (elemc = DCTSIZE; elemc > 0; elemc--) { - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - } - } -#endif - } - } - + for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) { /* Perform the DCT */ - (*do_dct) (workspace); + (*do_dct) (workspace, sample_data, start_col); /* Quantize/descale the coefficients, and store into coef_blocks[] */ { register DCTELEM temp, qval; @@ -275,7 +465,7 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, { /* This routine is heavily used, so it's worth coding it tightly. */ my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; - float_DCT_method_ptr do_dct = fdct->do_float_dct; + float_DCT_method_ptr do_dct = fdct->do_float_dct[compptr->component_index]; FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no]; FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */ JDIMENSION bi; @@ -283,36 +473,8 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, sample_data += start_row; /* fold in the vertical offset once */ for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { - /* Load data into workspace, applying unsigned->signed conversion */ - { register FAST_FLOAT *workspaceptr; - register JSAMPROW elemptr; - register int elemr; - - workspaceptr = workspace; - for (elemr = 0; elemr < DCTSIZE; elemr++) { - elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); -#else - { register int elemc; - for (elemc = DCTSIZE; elemc > 0; elemc--) { - *workspaceptr++ = (FAST_FLOAT) - (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - } - } -#endif - } - } - /* Perform the DCT */ - (*do_dct) (workspace); + (*do_dct) (workspace, sample_data, start_col); /* Quantize/descale the coefficients, and store into coef_blocks[] */ { register FAST_FLOAT temp; @@ -357,19 +519,16 @@ jinit_forward_dct (j_compress_ptr cinfo) #ifdef DCT_ISLOW_SUPPORTED case JDCT_ISLOW: fdct->pub.forward_DCT = forward_DCT; - fdct->do_dct = jpeg_fdct_islow; break; #endif #ifdef DCT_IFAST_SUPPORTED case JDCT_IFAST: fdct->pub.forward_DCT = forward_DCT; - fdct->do_dct = jpeg_fdct_ifast; break; #endif #ifdef DCT_FLOAT_SUPPORTED case JDCT_FLOAT: fdct->pub.forward_DCT = forward_DCT_float; - fdct->do_float_dct = jpeg_fdct_float; break; #endif default: diff --git a/jcinit.c b/jcinit.c index 5efffe3..a0f3e4f 100644 --- a/jcinit.c +++ b/jcinit.c @@ -41,9 +41,9 @@ jinit_compress_master (j_compress_ptr cinfo) /* Forward DCT */ jinit_forward_dct(cinfo); /* Entropy encoding: either Huffman or arithmetic coding. */ - if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); - } else { + if (cinfo->arith_code) + jinit_arith_encoder(cinfo); + else { if (cinfo->progressive_mode) { #ifdef C_PROGRESSIVE_SUPPORTED jinit_phuff_encoder(cinfo); diff --git a/jcmainct.c b/jcmainct.c index e0279a7..7de75d1 100644 --- a/jcmainct.c +++ b/jcmainct.c @@ -118,17 +118,17 @@ process_data_simple_main (j_compress_ptr cinfo, while (main->cur_iMCU_row < cinfo->total_iMCU_rows) { /* Read input data if we haven't filled the main buffer yet */ - if (main->rowgroup_ctr < DCTSIZE) + if (main->rowgroup_ctr < (JDIMENSION) cinfo->min_DCT_v_scaled_size) (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr, in_rows_avail, main->buffer, &main->rowgroup_ctr, - (JDIMENSION) DCTSIZE); + (JDIMENSION) cinfo->min_DCT_v_scaled_size); /* If we don't have a full iMCU row buffered, return to application for * more data. Note that preprocessor will always pad to fill the iMCU row * at the bottom of the image. */ - if (main->rowgroup_ctr != DCTSIZE) + if (main->rowgroup_ctr != (JDIMENSION) cinfo->min_DCT_v_scaled_size) return; /* Send the completed row to the compressor */ @@ -269,10 +269,10 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) ci++, compptr++) { main->whole_image[ci] = (*cinfo->mem->request_virt_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, - compptr->width_in_blocks * DCTSIZE, + compptr->width_in_blocks * compptr->DCT_h_scaled_size, (JDIMENSION) jround_up((long) compptr->height_in_blocks, (long) compptr->v_samp_factor) * DCTSIZE, - (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); + (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size)); } #else ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); @@ -286,8 +286,8 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) ci++, compptr++) { main->buffer[ci] = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - compptr->width_in_blocks * DCTSIZE, - (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); + compptr->width_in_blocks * compptr->DCT_h_scaled_size, + (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size)); } } } diff --git a/jcmarker.c b/jcmarker.c index 3d1e6c6..54d109a 100644 --- a/jcmarker.c +++ b/jcmarker.c @@ -285,13 +285,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code) emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */ /* Make sure image isn't bigger than SOF field can handle */ - if ((long) cinfo->image_height > 65535L || - (long) cinfo->image_width > 65535L) + if ((long) cinfo->jpeg_height > 65535L || + (long) cinfo->jpeg_width > 65535L) ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535); emit_byte(cinfo, cinfo->data_precision); - emit_2bytes(cinfo, (int) cinfo->image_height); - emit_2bytes(cinfo, (int) cinfo->image_width); + emit_2bytes(cinfo, (int) cinfo->jpeg_height); + emit_2bytes(cinfo, (int) cinfo->jpeg_width); emit_byte(cinfo, cinfo->num_components); @@ -529,7 +529,10 @@ write_frame_header (j_compress_ptr cinfo) /* Emit the proper SOF marker */ if (cinfo->arith_code) { - emit_sof(cinfo, M_SOF9); /* SOF code for arithmetic coding */ + if (cinfo->progressive_mode) + emit_sof(cinfo, M_SOF10); /* SOF code for progressive arithmetic */ + else + emit_sof(cinfo, M_SOF9); /* SOF code for sequential arithmetic */ } else { if (cinfo->progressive_mode) emit_sof(cinfo, M_SOF2); /* SOF code for progressive Huffman */ diff --git a/jcmaster.c b/jcmaster.c index aab4020..72b3769 100644 --- a/jcmaster.c +++ b/jcmaster.c @@ -42,23 +42,172 @@ typedef my_comp_master * my_master_ptr; * Support routines that do various essential calculations. */ +/* + * Compute JPEG image dimensions and related values. + * NOTE: this is exported for possible use by application. + * Hence it mustn't do anything that can't be done twice. + */ + +GLOBAL(void) +jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo) +/* Do computations that are needed before master selection phase */ +{ +#ifdef DCT_SCALING_SUPPORTED + + /* Compute actual JPEG image dimensions and DCT scaling choices. */ + if (cinfo->scale_num >= cinfo->scale_denom * 8) { + /* Provide 8/1 scaling */ + cinfo->jpeg_width = cinfo->image_width << 3; + cinfo->jpeg_height = cinfo->image_height << 3; + cinfo->min_DCT_h_scaled_size = 1; + cinfo->min_DCT_v_scaled_size = 1; + } else if (cinfo->scale_num >= cinfo->scale_denom * 4) { + /* Provide 4/1 scaling */ + cinfo->jpeg_width = cinfo->image_width << 2; + cinfo->jpeg_height = cinfo->image_height << 2; + cinfo->min_DCT_h_scaled_size = 2; + cinfo->min_DCT_v_scaled_size = 2; + } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 8) { + /* Provide 8/3 scaling */ + cinfo->jpeg_width = (cinfo->image_width << 1) + + jdiv_round_up((long) cinfo->image_width * 2, 3L); + cinfo->jpeg_height = (cinfo->image_height << 1) + + jdiv_round_up((long) cinfo->image_height * 2, 3L); + cinfo->min_DCT_h_scaled_size = 3; + cinfo->min_DCT_v_scaled_size = 3; + } else if (cinfo->scale_num >= cinfo->scale_denom * 2) { + /* Provide 2/1 scaling */ + cinfo->jpeg_width = cinfo->image_width << 1; + cinfo->jpeg_height = cinfo->image_height << 1; + cinfo->min_DCT_h_scaled_size = 4; + cinfo->min_DCT_v_scaled_size = 4; + } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 8) { + /* Provide 8/5 scaling */ + cinfo->jpeg_width = cinfo->image_width + + jdiv_round_up((long) cinfo->image_width * 3, 5L); + cinfo->jpeg_height = cinfo->image_height + + jdiv_round_up((long) cinfo->image_height * 3, 5L); + cinfo->min_DCT_h_scaled_size = 5; + cinfo->min_DCT_v_scaled_size = 5; + } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 4) { + /* Provide 4/3 scaling */ + cinfo->jpeg_width = cinfo->image_width + + jdiv_round_up((long) cinfo->image_width, 3L); + cinfo->jpeg_height = cinfo->image_height + + jdiv_round_up((long) cinfo->image_height, 3L); + cinfo->min_DCT_h_scaled_size = 6; + cinfo->min_DCT_v_scaled_size = 6; + } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 8) { + /* Provide 8/7 scaling */ + cinfo->jpeg_width = cinfo->image_width + + jdiv_round_up((long) cinfo->image_width, 7L); + cinfo->jpeg_height = cinfo->image_height + + jdiv_round_up((long) cinfo->image_height, 7L); + cinfo->min_DCT_h_scaled_size = 7; + cinfo->min_DCT_v_scaled_size = 7; + } else if (cinfo->scale_num >= cinfo->scale_denom) { + /* Provide 1/1 scaling */ + cinfo->jpeg_width = cinfo->image_width; + cinfo->jpeg_height = cinfo->image_height; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; + } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * 8) { + /* Provide 8/9 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 9L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 9L); + cinfo->min_DCT_h_scaled_size = 9; + cinfo->min_DCT_v_scaled_size = 9; + } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * 4) { + /* Provide 4/5 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 4, 5L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 4, 5L); + cinfo->min_DCT_h_scaled_size = 10; + cinfo->min_DCT_v_scaled_size = 10; + } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * 8) { + /* Provide 8/11 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 11L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 11L); + cinfo->min_DCT_h_scaled_size = 11; + cinfo->min_DCT_v_scaled_size = 11; + } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * 2) { + /* Provide 2/3 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 2, 3L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 2, 3L); + cinfo->min_DCT_h_scaled_size = 12; + cinfo->min_DCT_v_scaled_size = 12; + } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * 8) { + /* Provide 8/13 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 13L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 13L); + cinfo->min_DCT_h_scaled_size = 13; + cinfo->min_DCT_v_scaled_size = 13; + } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * 4) { + /* Provide 4/7 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 4, 7L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 4, 7L); + cinfo->min_DCT_h_scaled_size = 14; + cinfo->min_DCT_v_scaled_size = 14; + } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * 8) { + /* Provide 8/15 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 8, 15L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 8, 15L); + cinfo->min_DCT_h_scaled_size = 15; + cinfo->min_DCT_v_scaled_size = 15; + } else { + /* Provide 1/2 scaling */ + cinfo->jpeg_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 2L); + cinfo->jpeg_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 2L); + cinfo->min_DCT_h_scaled_size = 16; + cinfo->min_DCT_v_scaled_size = 16; + } + +#else /* !DCT_SCALING_SUPPORTED */ + + /* Hardwire it to "no scaling" */ + cinfo->jpeg_width = cinfo->image_width; + cinfo->jpeg_height = cinfo->image_height; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; + +#endif /* DCT_SCALING_SUPPORTED */ +} + + LOCAL(void) initial_setup (j_compress_ptr cinfo) /* Do computations that are needed before master selection phase */ { - int ci; + int ci, ssize; jpeg_component_info *compptr; long samplesperrow; JDIMENSION jd_samplesperrow; + jpeg_calc_jpeg_dimensions(cinfo); + /* Sanity check on image dimensions */ - if (cinfo->image_height <= 0 || cinfo->image_width <= 0 + if (cinfo->jpeg_height <= 0 || cinfo->jpeg_width <= 0 || cinfo->num_components <= 0 || cinfo->input_components <= 0) ERREXIT(cinfo, JERR_EMPTY_IMAGE); /* Make sure image isn't bigger than I can handle */ - if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION || - (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION) + if ((long) cinfo->jpeg_height > (long) JPEG_MAX_DIMENSION || + (long) cinfo->jpeg_width > (long) JPEG_MAX_DIMENSION) ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION); /* Width of an input scanline must be representable as JDIMENSION. */ @@ -95,22 +244,52 @@ initial_setup (j_compress_ptr cinfo) ci++, compptr++) { /* Fill in the correct component_index value; don't rely on application */ compptr->component_index = ci; - /* For compression, we never do DCT scaling. */ - compptr->DCT_scaled_size = DCTSIZE; + /* In selecting the actual DCT scaling for each component, we try to + * scale down the chroma components via DCT scaling rather than downsampling. + * This saves time if the downsampler gets to use 1:1 scaling. + * Note this code adapts subsampling ratios which are powers of 2. + */ + ssize = 1; +#ifdef DCT_SCALING_SUPPORTED + while (cinfo->dct_method == JDCT_ISLOW && + cinfo->min_DCT_h_scaled_size * ssize <= DCTSIZE && + (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) { + ssize = ssize * 2; + } +#endif + compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize; + ssize = 1; +#ifdef DCT_SCALING_SUPPORTED + while (cinfo->dct_method == JDCT_ISLOW && + cinfo->min_DCT_v_scaled_size * ssize <= DCTSIZE && + (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) { + ssize = ssize * 2; + } +#endif + compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize; + + /* We don't support DCT ratios larger than 2. */ + if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2) + compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2; + else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2) + compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2; + /* Size in DCT blocks */ compptr->width_in_blocks = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, + jdiv_round_up((long) cinfo->jpeg_width * (long) compptr->h_samp_factor, (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->height_in_blocks = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, + jdiv_round_up((long) cinfo->jpeg_height * (long) compptr->v_samp_factor, (long) (cinfo->max_v_samp_factor * DCTSIZE)); /* Size in samples */ compptr->downsampled_width = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, - (long) cinfo->max_h_samp_factor); + jdiv_round_up((long) cinfo->jpeg_width * + (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size), + (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->downsampled_height = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, - (long) cinfo->max_v_samp_factor); + jdiv_round_up((long) cinfo->jpeg_height * + (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size), + (long) (cinfo->max_v_samp_factor * DCTSIZE)); /* Mark component needed (this flag isn't actually used for compression) */ compptr->component_needed = TRUE; } @@ -119,7 +298,7 @@ initial_setup (j_compress_ptr cinfo) * main controller will call coefficient controller). */ cinfo->total_iMCU_rows = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height, + jdiv_round_up((long) cinfo->jpeg_height, (long) (cinfo->max_v_samp_factor*DCTSIZE)); } @@ -325,7 +504,7 @@ per_scan_setup (j_compress_ptr cinfo) compptr->MCU_width = 1; compptr->MCU_height = 1; compptr->MCU_blocks = 1; - compptr->MCU_sample_width = DCTSIZE; + compptr->MCU_sample_width = compptr->DCT_h_scaled_size; compptr->last_col_width = 1; /* For noninterleaved scans, it is convenient to define last_row_height * as the number of block rows present in the last iMCU row. @@ -347,10 +526,10 @@ per_scan_setup (j_compress_ptr cinfo) /* Overall image size in MCUs */ cinfo->MCUs_per_row = (JDIMENSION) - jdiv_round_up((long) cinfo->image_width, + jdiv_round_up((long) cinfo->jpeg_width, (long) (cinfo->max_h_samp_factor*DCTSIZE)); cinfo->MCU_rows_in_scan = (JDIMENSION) - jdiv_round_up((long) cinfo->image_height, + jdiv_round_up((long) cinfo->jpeg_height, (long) (cinfo->max_v_samp_factor*DCTSIZE)); cinfo->blocks_in_MCU = 0; @@ -361,7 +540,7 @@ per_scan_setup (j_compress_ptr cinfo) compptr->MCU_width = compptr->h_samp_factor; compptr->MCU_height = compptr->v_samp_factor; compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height; - compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE; + compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size; /* Figure number of non-dummy blocks in last MCU column & row */ tmp = (int) (compptr->width_in_blocks % compptr->MCU_width); if (tmp == 0) tmp = compptr->MCU_width; @@ -433,7 +612,7 @@ prepare_for_pass (j_compress_ptr cinfo) /* Do Huffman optimization for a scan after the first one. */ select_scan_parameters(cinfo); per_scan_setup(cinfo); - if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) { + if (cinfo->Ss != 0 || cinfo->Ah == 0) { (*cinfo->entropy->start_pass) (cinfo, TRUE); (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST); master->pub.call_pass_startup = FALSE; @@ -567,7 +746,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only) cinfo->num_scans = 1; } - if (cinfo->progressive_mode) /* TEMPORARY HACK ??? */ + if (cinfo->progressive_mode && cinfo->arith_code == 0) /* TEMPORARY HACK ??? */ cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */ /* Initialize my private state */ diff --git a/jcparam.c b/jcparam.c index 6fc48f5..6bef48d 100644 --- a/jcparam.c +++ b/jcparam.c @@ -60,6 +60,47 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, } +/* These are the sample quantization tables given in JPEG spec section K.1. + * The spec says that the values given produce "good" quality, and + * when divided by 2, "very good" quality. + */ +static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = { + 16, 11, 10, 16, 24, 40, 51, 61, + 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, + 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, + 24, 35, 55, 64, 81, 104, 113, 92, + 49, 64, 78, 87, 103, 121, 120, 101, + 72, 92, 95, 98, 112, 100, 103, 99 +}; +static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = { + 17, 18, 24, 47, 99, 99, 99, 99, + 18, 21, 26, 66, 99, 99, 99, 99, + 24, 26, 56, 99, 99, 99, 99, 99, + 47, 66, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99 +}; + + +GLOBAL(void) +jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline) +/* Set or change the 'quality' (quantization) setting, using default tables + * and straight percentage-scaling quality scales. + * This entry point allows different scalings for luminance and chrominance. + */ +{ + /* Set up two quantization tables using the specified scaling */ + jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, + cinfo->q_scale_factor[0], force_baseline); + jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, + cinfo->q_scale_factor[1], force_baseline); +} + + GLOBAL(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, boolean force_baseline) @@ -69,31 +110,6 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, * applications that insist on a linear percentage scaling. */ { - /* These are the sample quantization tables given in JPEG spec section K.1. - * The spec says that the values given produce "good" quality, and - * when divided by 2, "very good" quality. - */ - static const unsigned int std_luminance_quant_tbl[DCTSIZE2] = { - 16, 11, 10, 16, 24, 40, 51, 61, - 12, 12, 14, 19, 26, 58, 60, 55, - 14, 13, 16, 24, 40, 57, 69, 56, - 14, 17, 22, 29, 51, 87, 80, 62, - 18, 22, 37, 56, 68, 109, 103, 77, - 24, 35, 55, 64, 81, 104, 113, 92, - 49, 64, 78, 87, 103, 121, 120, 101, - 72, 92, 95, 98, 112, 100, 103, 99 - }; - static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = { - 17, 18, 24, 47, 99, 99, 99, 99, - 18, 21, 26, 66, 99, 99, 99, 99, - 24, 26, 56, 99, 99, 99, 99, 99, - 47, 66, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99 - }; - /* Set up two quantization tables using the specified scaling */ jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, scale_factor, force_baseline); @@ -284,6 +300,8 @@ jpeg_set_defaults (j_compress_ptr cinfo) /* Initialize everything not dependent on the color space */ + cinfo->scale_num = 1; /* 1:1 scaling */ + cinfo->scale_denom = 1; cinfo->data_precision = BITS_IN_JSAMPLE; /* Set up two quantization tables using default quality of 75 */ jpeg_set_quality(cinfo, 75, TRUE); diff --git a/jcprepct.c b/jcprepct.c index fa93333..be44cc4 100644 --- a/jcprepct.c +++ b/jcprepct.c @@ -173,10 +173,12 @@ pre_process_data (j_compress_ptr cinfo, *out_row_group_ctr < out_row_groups_avail) { for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { + numrows = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; expand_bottom_edge(output_buf[ci], - compptr->width_in_blocks * DCTSIZE, - (int) (*out_row_group_ctr * compptr->v_samp_factor), - (int) (out_row_groups_avail * compptr->v_samp_factor)); + compptr->width_in_blocks * compptr->DCT_h_scaled_size, + (int) (*out_row_group_ctr * numrows), + (int) (out_row_groups_avail * numrows)); } *out_row_group_ctr = out_row_groups_avail; break; /* can exit outer loop without test */ @@ -288,7 +290,8 @@ create_context_buffer (j_compress_ptr cinfo) */ true_buffer = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * + (JDIMENSION) (((long) compptr->width_in_blocks * + cinfo->min_DCT_h_scaled_size * cinfo->max_h_samp_factor) / compptr->h_samp_factor), (JDIMENSION) (3 * rgroup_height)); /* Copy true buffer row pointers into the middle of the fake row array */ @@ -346,7 +349,8 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer) ci++, compptr++) { prep->color_buf[ci] = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * + (JDIMENSION) (((long) compptr->width_in_blocks * + cinfo->min_DCT_h_scaled_size * cinfo->max_h_samp_factor) / compptr->h_samp_factor), (JDIMENSION) cinfo->max_v_samp_factor); } diff --git a/jcsample.c b/jcsample.c index 212ec87..4d36f85 100644 --- a/jcsample.c +++ b/jcsample.c @@ -62,6 +62,15 @@ typedef struct { /* Downsampling method pointers, one per component */ downsample1_ptr methods[MAX_COMPONENTS]; + + /* Height of an output row group for each component. */ + int rowgroup_height[MAX_COMPONENTS]; + + /* These arrays save pixel expansion factors so that int_downsample need not + * recompute them each time. They are unused for other downsampling methods. + */ + UINT8 h_expand[MAX_COMPONENTS]; + UINT8 v_expand[MAX_COMPONENTS]; } my_downsampler; typedef my_downsampler * my_downsample_ptr; @@ -123,7 +132,8 @@ sep_downsample (j_compress_ptr cinfo, for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { in_ptr = input_buf[ci] + in_row_index; - out_ptr = output_buf[ci] + (out_row_group_index * compptr->v_samp_factor); + out_ptr = output_buf[ci] + + (out_row_group_index * downsample->rowgroup_height[ci]); (*downsample->methods[ci]) (cinfo, compptr, in_ptr, out_ptr); } } @@ -140,14 +150,15 @@ METHODDEF(void) int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample; int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v; JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */ - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; JSAMPROW inptr, outptr; INT32 outvalue; - h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor; - v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor; + h_expand = downsample->h_expand[compptr->component_index]; + v_expand = downsample->v_expand[compptr->component_index]; numpix = h_expand * v_expand; numpix2 = numpix/2; @@ -158,8 +169,8 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width, output_cols * h_expand); - inrow = 0; - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + inrow = outrow = 0; + while (inrow < cinfo->max_v_samp_factor) { outptr = output_data[outrow]; for (outcol = 0, outcol_h = 0; outcol < output_cols; outcol++, outcol_h += h_expand) { @@ -173,6 +184,7 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix); } inrow += v_expand; + outrow++; } } @@ -191,8 +203,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor, cinfo->image_width); /* Edge-expand */ - expand_right_edge(output_data, cinfo->max_v_samp_factor, - cinfo->image_width, compptr->width_in_blocks * DCTSIZE); + expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width, + compptr->width_in_blocks * compptr->DCT_h_scaled_size); } @@ -212,9 +224,9 @@ METHODDEF(void) h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { - int outrow; + int inrow; JDIMENSION outcol; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr, outptr; register int bias; @@ -225,9 +237,9 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width, output_cols * 2); - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { - outptr = output_data[outrow]; - inptr = input_data[outrow]; + for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { + outptr = output_data[inrow]; + inptr = input_data[inrow]; bias = 0; /* bias = 0,1,0,1,... for successive samples */ for (outcol = 0; outcol < output_cols; outcol++) { *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) @@ -251,7 +263,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, { int inrow, outrow; JDIMENSION outcol; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr0, inptr1, outptr; register int bias; @@ -262,8 +274,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width, output_cols * 2); - inrow = 0; - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + inrow = outrow = 0; + while (inrow < cinfo->max_v_samp_factor) { outptr = output_data[outrow]; inptr0 = input_data[inrow]; inptr1 = input_data[inrow+1]; @@ -276,6 +288,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, inptr0 += 2; inptr1 += 2; } inrow += 2; + outrow++; } } @@ -294,7 +307,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, { int inrow, outrow; JDIMENSION colctr; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr; INT32 membersum, neighsum, memberscale, neighscale; @@ -321,8 +334,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */ neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */ - inrow = 0; - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { + inrow = outrow = 0; + while (inrow < cinfo->max_v_samp_factor) { outptr = output_data[outrow]; inptr0 = input_data[inrow]; inptr1 = input_data[inrow+1]; @@ -378,6 +391,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, *outptr = (JSAMPLE) ((membersum + 32768) >> 16); inrow += 2; + outrow++; } } @@ -392,9 +406,9 @@ METHODDEF(void) fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { - int outrow; + int inrow; JDIMENSION colctr; - JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; + JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size; register JSAMPROW inptr, above_ptr, below_ptr, outptr; INT32 membersum, neighsum, memberscale, neighscale; int colsum, lastcolsum, nextcolsum; @@ -415,11 +429,11 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */ neighscale = cinfo->smoothing_factor * 64; /* scaled SF */ - for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { - outptr = output_data[outrow]; - inptr = input_data[outrow]; - above_ptr = input_data[outrow-1]; - below_ptr = input_data[outrow+1]; + for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { + outptr = output_data[inrow]; + inptr = input_data[inrow]; + above_ptr = input_data[inrow-1]; + below_ptr = input_data[inrow+1]; /* Special case for first column */ colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) + @@ -467,6 +481,7 @@ jinit_downsampler (j_compress_ptr cinfo) int ci; jpeg_component_info * compptr; boolean smoothok = TRUE; + int h_in_group, v_in_group, h_out_group, v_out_group; downsample = (my_downsample_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, @@ -482,8 +497,17 @@ jinit_downsampler (j_compress_ptr cinfo) /* Verify we can handle the sampling factors, and set up method pointers */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - if (compptr->h_samp_factor == cinfo->max_h_samp_factor && - compptr->v_samp_factor == cinfo->max_v_samp_factor) { + /* Compute size of an "output group" for DCT scaling. This many samples + * are to be converted from max_h_samp_factor * max_v_samp_factor pixels. + */ + h_out_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) / + cinfo->min_DCT_h_scaled_size; + v_out_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; + h_in_group = cinfo->max_h_samp_factor; + v_in_group = cinfo->max_v_samp_factor; + downsample->rowgroup_height[ci] = v_out_group; /* save for use later */ + if (h_in_group == h_out_group && v_in_group == v_out_group) { #ifdef INPUT_SMOOTHING_SUPPORTED if (cinfo->smoothing_factor) { downsample->methods[ci] = fullsize_smooth_downsample; @@ -491,12 +515,12 @@ jinit_downsampler (j_compress_ptr cinfo) } else #endif downsample->methods[ci] = fullsize_downsample; - } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && - compptr->v_samp_factor == cinfo->max_v_samp_factor) { + } else if (h_in_group == h_out_group * 2 && + v_in_group == v_out_group) { smoothok = FALSE; downsample->methods[ci] = h2v1_downsample; - } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && - compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) { + } else if (h_in_group == h_out_group * 2 && + v_in_group == v_out_group * 2) { #ifdef INPUT_SMOOTHING_SUPPORTED if (cinfo->smoothing_factor) { downsample->methods[ci] = h2v2_smooth_downsample; @@ -504,10 +528,12 @@ jinit_downsampler (j_compress_ptr cinfo) } else #endif downsample->methods[ci] = h2v2_downsample; - } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 && - (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) { + } else if ((h_in_group % h_out_group) == 0 && + (v_in_group % v_out_group) == 0) { smoothok = FALSE; downsample->methods[ci] = int_downsample; + downsample->h_expand[ci] = (UINT8) (h_in_group / h_out_group); + downsample->v_expand[ci] = (UINT8) (v_in_group / v_out_group); } else ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL); } diff --git a/jctrans.c b/jctrans.c index 0e6d707..9753a43 100644 --- a/jctrans.c +++ b/jctrans.c @@ -167,7 +167,7 @@ transencode_master_selection (j_compress_ptr cinfo, /* Entropy encoding: either Huffman or arithmetic coding. */ if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); + jinit_arith_encoder(cinfo); } else { if (cinfo->progressive_mode) { #ifdef C_PROGRESSIVE_SUPPORTED diff --git a/jdapistd.c b/jdapistd.c index c8e3fa0..9d74537 100644 --- a/jdapistd.c +++ b/jdapistd.c @@ -202,7 +202,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data, } /* Verify that at least one iMCU row can be returned. */ - lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size; + lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_v_scaled_size; if (max_lines < lines_per_iMCU_row) ERREXIT(cinfo, JERR_BUFFER_SIZE); diff --git a/jdcoefct.c b/jdcoefct.c index 4938d20..462e92c 100644 --- a/jdcoefct.c +++ b/jdcoefct.c @@ -187,7 +187,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width : compptr->last_col_width; output_ptr = output_buf[compptr->component_index] + - yoffset * compptr->DCT_scaled_size; + yoffset * compptr->DCT_v_scaled_size; start_col = MCU_col_num * compptr->MCU_sample_width; for (yindex = 0; yindex < compptr->MCU_height; yindex++) { if (cinfo->input_iMCU_row < last_iMCU_row || @@ -197,11 +197,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) coef->MCU_buffer[blkn+xindex], output_ptr, output_col); - output_col += compptr->DCT_scaled_size; + output_col += compptr->DCT_h_scaled_size; } } blkn += compptr->MCU_width; - output_ptr += compptr->DCT_scaled_size; + output_ptr += compptr->DCT_v_scaled_size; } } } @@ -362,9 +362,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr, output_ptr, output_col); buffer_ptr++; - output_col += compptr->DCT_scaled_size; + output_col += compptr->DCT_h_scaled_size; } - output_ptr += compptr->DCT_scaled_size; + output_ptr += compptr->DCT_v_scaled_size; } } @@ -654,9 +654,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) DC4 = DC5; DC5 = DC6; DC7 = DC8; DC8 = DC9; buffer_ptr++, prev_block_row++, next_block_row++; - output_col += compptr->DCT_scaled_size; + output_col += compptr->DCT_h_scaled_size; } - output_ptr += compptr->DCT_scaled_size; + output_ptr += compptr->DCT_v_scaled_size; } } diff --git a/jdct.h b/jdct.h index 04192a2..360dec8 100644 --- a/jdct.h +++ b/jdct.h @@ -14,11 +14,16 @@ /* - * A forward DCT routine is given a pointer to a work area of type DCTELEM[]; - * the DCT is to be performed in-place in that buffer. Type DCTELEM is int - * for 8-bit samples, INT32 for 12-bit samples. (NOTE: Floating-point DCT - * implementations use an array of type FAST_FLOAT, instead.) - * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE). + * A forward DCT routine is given a pointer to an input sample array and + * a pointer to a work area of type DCTELEM[]; the DCT is to be performed + * in-place in that buffer. Type DCTELEM is int for 8-bit samples, INT32 + * for 12-bit samples. (NOTE: Floating-point DCT implementations use an + * array of type FAST_FLOAT, instead.) + * The input data is to be fetched from the sample array starting at a + * specified column. (Any row offset needed will be applied to the array + * pointer before it is passed to the FDCT code.) + * Note that the number of samples fetched by the FDCT routine is + * DCT_h_scaled_size * DCT_v_scaled_size. * The DCT outputs are returned scaled up by a factor of 8; they therefore * have a range of +-8K for 8-bit data, +-128K for 12-bit data. This * convention improves accuracy in integer implementations and saves some @@ -32,8 +37,12 @@ typedef int DCTELEM; /* 16 or 32 bits is fine */ typedef INT32 DCTELEM; /* must have 32 bits */ #endif -typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data)); -typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data)); +typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data, + JSAMPARRAY sample_data, + JDIMENSION start_col)); +typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data, + JSAMPARRAY sample_data, + JDIMENSION start_col)); /* @@ -44,7 +53,7 @@ typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data)); * sample array starting at a specified column. (Any row offset needed will * be applied to the array pointer before it is passed to the IDCT code.) * Note that the number of samples emitted by the IDCT routine is - * DCT_scaled_size * DCT_scaled_size. + * DCT_h_scaled_size * DCT_v_scaled_size. */ /* typedef inverse_DCT_method_ptr is declared in jpegint.h */ @@ -84,19 +93,143 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ #define jpeg_fdct_islow jFDislow #define jpeg_fdct_ifast jFDifast #define jpeg_fdct_float jFDfloat +#define jpeg_fdct_7x7 jFD7x7 +#define jpeg_fdct_6x6 jFD6x6 +#define jpeg_fdct_5x5 jFD5x5 +#define jpeg_fdct_4x4 jFD4x4 +#define jpeg_fdct_3x3 jFD3x3 +#define jpeg_fdct_2x2 jFD2x2 +#define jpeg_fdct_1x1 jFD1x1 +#define jpeg_fdct_9x9 jFD9x9 +#define jpeg_fdct_10x10 jFD10x10 +#define jpeg_fdct_11x11 jFD11x11 +#define jpeg_fdct_12x12 jFD12x12 +#define jpeg_fdct_13x13 jFD13x13 +#define jpeg_fdct_14x14 jFD14x14 +#define jpeg_fdct_15x15 jFD15x15 +#define jpeg_fdct_16x16 jFD16x16 +#define jpeg_fdct_16x8 jFD16x8 +#define jpeg_fdct_14x7 jFD14x7 +#define jpeg_fdct_12x6 jFD12x6 +#define jpeg_fdct_10x5 jFD10x5 +#define jpeg_fdct_8x4 jFD8x4 +#define jpeg_fdct_6x3 jFD6x3 +#define jpeg_fdct_4x2 jFD4x2 +#define jpeg_fdct_2x1 jFD2x1 +#define jpeg_fdct_8x16 jFD8x16 +#define jpeg_fdct_7x14 jFD7x14 +#define jpeg_fdct_6x12 jFD6x12 +#define jpeg_fdct_5x10 jFD5x10 +#define jpeg_fdct_4x8 jFD4x8 +#define jpeg_fdct_3x6 jFD3x6 +#define jpeg_fdct_2x4 jFD2x4 +#define jpeg_fdct_1x2 jFD1x2 #define jpeg_idct_islow jRDislow #define jpeg_idct_ifast jRDifast #define jpeg_idct_float jRDfloat +#define jpeg_idct_7x7 jRD7x7 +#define jpeg_idct_6x6 jRD6x6 +#define jpeg_idct_5x5 jRD5x5 #define jpeg_idct_4x4 jRD4x4 +#define jpeg_idct_3x3 jRD3x3 #define jpeg_idct_2x2 jRD2x2 #define jpeg_idct_1x1 jRD1x1 +#define jpeg_idct_9x9 jRD9x9 +#define jpeg_idct_10x10 jRD10x10 +#define jpeg_idct_11x11 jRD11x11 +#define jpeg_idct_12x12 jRD12x12 +#define jpeg_idct_13x13 jRD13x13 +#define jpeg_idct_14x14 jRD14x14 +#define jpeg_idct_15x15 jRD15x15 +#define jpeg_idct_16x16 jRD16x16 +#define jpeg_idct_16x8 jRD16x8 +#define jpeg_idct_14x7 jRD14x7 +#define jpeg_idct_12x6 jRD12x6 +#define jpeg_idct_10x5 jRD10x5 +#define jpeg_idct_8x4 jRD8x4 +#define jpeg_idct_6x3 jRD6x3 +#define jpeg_idct_4x2 jRD4x2 +#define jpeg_idct_2x1 jRD2x1 +#define jpeg_idct_8x16 jRD8x16 +#define jpeg_idct_7x14 jRD7x14 +#define jpeg_idct_6x12 jRD6x12 +#define jpeg_idct_5x10 jRD5x10 +#define jpeg_idct_4x8 jRD4x8 +#define jpeg_idct_3x6 jRD3x8 +#define jpeg_idct_2x4 jRD2x4 +#define jpeg_idct_1x2 jRD1x2 #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Extern declarations for the forward and inverse DCT routines. */ -EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data)); -EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data)); -EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data)); +EXTERN(void) jpeg_fdct_islow + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_ifast + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_float + JPP((FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_7x7 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_6x6 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_5x5 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_4x4 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_3x3 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_2x2 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_1x1 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_9x9 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_10x10 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_11x11 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_12x12 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_13x13 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_14x14 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_15x15 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_16x16 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_16x8 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_14x7 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_12x6 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_10x5 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_8x4 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_6x3 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_4x2 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_2x1 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_8x16 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_7x14 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_6x12 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_5x10 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_4x8 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_3x6 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_2x4 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); +EXTERN(void) jpeg_fdct_1x2 + JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)); EXTERN(void) jpeg_idct_islow JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, @@ -107,15 +240,99 @@ EXTERN(void) jpeg_idct_ifast EXTERN(void) jpeg_idct_float JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_7x7 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_6x6 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_5x5 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_4x4 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_3x3 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_2x2 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_1x1 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_9x9 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_10x10 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_11x11 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_12x12 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_13x13 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_14x14 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_15x15 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_16x16 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_16x8 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_14x7 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_12x6 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_10x5 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_8x4 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_6x3 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_4x2 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_2x1 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_8x16 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_7x14 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_6x12 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_5x10 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_4x8 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_3x6 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_2x4 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jpeg_idct_1x2 + JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); /* diff --git a/jddctmgr.c b/jddctmgr.c index bbf8d0e..c0a22cf 100644 --- a/jddctmgr.c +++ b/jddctmgr.c @@ -98,22 +98,134 @@ start_pass (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { /* Select the proper IDCT routine for this component's scaling */ - switch (compptr->DCT_scaled_size) { + switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) { #ifdef IDCT_SCALING_SUPPORTED - case 1: + case ((1 << 8) + 1): method_ptr = jpeg_idct_1x1; method = JDCT_ISLOW; /* jidctred uses islow-style table */ break; - case 2: + case ((2 << 8) + 2): method_ptr = jpeg_idct_2x2; method = JDCT_ISLOW; /* jidctred uses islow-style table */ break; - case 4: + case ((3 << 8) + 3): + method_ptr = jpeg_idct_3x3; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((4 << 8) + 4): method_ptr = jpeg_idct_4x4; method = JDCT_ISLOW; /* jidctred uses islow-style table */ break; + case ((5 << 8) + 5): + method_ptr = jpeg_idct_5x5; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((6 << 8) + 6): + method_ptr = jpeg_idct_6x6; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((7 << 8) + 7): + method_ptr = jpeg_idct_7x7; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((9 << 8) + 9): + method_ptr = jpeg_idct_9x9; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((10 << 8) + 10): + method_ptr = jpeg_idct_10x10; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((11 << 8) + 11): + method_ptr = jpeg_idct_11x11; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((12 << 8) + 12): + method_ptr = jpeg_idct_12x12; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((13 << 8) + 13): + method_ptr = jpeg_idct_13x13; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((14 << 8) + 14): + method_ptr = jpeg_idct_14x14; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((15 << 8) + 15): + method_ptr = jpeg_idct_15x15; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((16 << 8) + 16): + method_ptr = jpeg_idct_16x16; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((16 << 8) + 8): + method_ptr = jpeg_idct_16x8; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((14 << 8) + 7): + method_ptr = jpeg_idct_14x7; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((12 << 8) + 6): + method_ptr = jpeg_idct_12x6; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((10 << 8) + 5): + method_ptr = jpeg_idct_10x5; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((8 << 8) + 4): + method_ptr = jpeg_idct_8x4; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((6 << 8) + 3): + method_ptr = jpeg_idct_6x3; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((4 << 8) + 2): + method_ptr = jpeg_idct_4x2; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((2 << 8) + 1): + method_ptr = jpeg_idct_2x1; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((8 << 8) + 16): + method_ptr = jpeg_idct_8x16; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((7 << 8) + 14): + method_ptr = jpeg_idct_7x14; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((6 << 8) + 12): + method_ptr = jpeg_idct_6x12; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((5 << 8) + 10): + method_ptr = jpeg_idct_5x10; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((4 << 8) + 8): + method_ptr = jpeg_idct_4x8; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((3 << 8) + 6): + method_ptr = jpeg_idct_3x6; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((2 << 8) + 4): + method_ptr = jpeg_idct_2x4; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; + case ((1 << 8) + 2): + method_ptr = jpeg_idct_1x2; + method = JDCT_ISLOW; /* jidctred uses islow-style table */ + break; #endif - case DCTSIZE: + case ((DCTSIZE << 8) + DCTSIZE): switch (cinfo->dct_method) { #ifdef DCT_ISLOW_SUPPORTED case JDCT_ISLOW: @@ -139,7 +251,8 @@ start_pass (j_decompress_ptr cinfo) } break; default: - ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size); + ERREXIT2(cinfo, JERR_BAD_DCTSIZE, + compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size); break; } idct->pub.inverse_DCT[ci] = method_ptr; diff --git a/jdhuff.c b/jdhuff.c index b5ba39f..f721d48 100644 --- a/jdhuff.c +++ b/jdhuff.c @@ -71,13 +71,24 @@ typedef struct { d_derived_tbl * dc_cur_tbls[D_MAX_BLOCKS_IN_MCU]; d_derived_tbl * ac_cur_tbls[D_MAX_BLOCKS_IN_MCU]; /* Whether we care about the DC and AC coefficient values for each block */ - boolean dc_needed[D_MAX_BLOCKS_IN_MCU]; - boolean ac_needed[D_MAX_BLOCKS_IN_MCU]; + int coef_limit[D_MAX_BLOCKS_IN_MCU]; } huff_entropy_decoder; typedef huff_entropy_decoder * huff_entropy_ptr; +static const int jpeg_zigzag_order[DCTSIZE2] = { + 0, 1, 5, 6, 14, 15, 27, 28, + 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, + 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, + 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, + 35, 36, 48, 49, 57, 58, 62, 63 +}; + + /* * Initialize for a Huffman-compressed scan. */ @@ -86,7 +97,7 @@ METHODDEF(void) start_pass_huff_decoder (j_decompress_ptr cinfo) { huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy; - int ci, blkn, dctbl, actbl; + int ci, blkn, dctbl, actbl, i; jpeg_component_info * compptr; /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG. @@ -120,11 +131,13 @@ start_pass_huff_decoder (j_decompress_ptr cinfo) entropy->ac_cur_tbls[blkn] = entropy->ac_derived_tbls[compptr->ac_tbl_no]; /* Decide whether we really care about the coefficient values */ if (compptr->component_needed) { - entropy->dc_needed[blkn] = TRUE; - /* we don't need the ACs if producing a 1/8th-size image */ - entropy->ac_needed[blkn] = (compptr->DCT_scaled_size > 1); + ci = compptr->DCT_v_scaled_size; + if (ci <= 0 || ci > 8) ci = 8; + i = compptr->DCT_h_scaled_size; + if (i <= 0 || i > 8) i = 8; + entropy->coef_limit[blkn] = 1 + jpeg_zigzag_order[(ci - 1) * DCTSIZE + i - 1]; } else { - entropy->dc_needed[blkn] = entropy->ac_needed[blkn] = FALSE; + entropy->coef_limit[blkn] = 0; } } @@ -435,26 +448,20 @@ jpeg_huff_decode (bitread_working_state * state, /* * Figure F.12: extend sign bit. - * On some machines, a shift and add will be faster than a table lookup. + * On some machines, a shift and sub will be faster than a table lookup. */ #ifdef AVOID_TABLES -#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x)) +#define HUFF_EXTEND(x,s) ((x) < (1<<((s)-1)) ? (x) - ((1<<(s))-1) : (x)) #else -#define HUFF_EXTEND(x,s) ((x) < extend_test[s] ? (x) + extend_offset[s] : (x)) +#define HUFF_EXTEND(x,s) ((x) <= bmask[(s) - 1] ? (x) - bmask[s] : (x)) -static const int extend_test[16] = /* entry n is 2**(n-1) */ - { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, - 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 }; - -static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */ - { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, - ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, - ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, - ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 }; +static const int bmask[16] = /* bmask[n] is mask for n rightmost bits */ + { 0, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, + 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF }; #endif /* AVOID_TABLES */ @@ -541,39 +548,40 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) { JBLOCKROW block = MCU_data[blkn]; - d_derived_tbl * dctbl = entropy->dc_cur_tbls[blkn]; - d_derived_tbl * actbl = entropy->ac_cur_tbls[blkn]; + d_derived_tbl * htbl; register int s, k, r; + int coef_limit, ci; /* Decode a single block's worth of coefficients */ /* Section F.2.2.1: decode the DC coefficient difference */ - HUFF_DECODE(s, br_state, dctbl, return FALSE, label1); - if (s) { - CHECK_BIT_BUFFER(br_state, s, return FALSE); - r = GET_BITS(s); - s = HUFF_EXTEND(r, s); - } + htbl = entropy->dc_cur_tbls[blkn]; + HUFF_DECODE(s, br_state, htbl, return FALSE, label1); - if (entropy->dc_needed[blkn]) { + htbl = entropy->ac_cur_tbls[blkn]; + k = 1; + coef_limit = entropy->coef_limit[blkn]; + if (coef_limit) { /* Convert DC difference to actual value, update last_dc_val */ - int ci = cinfo->MCU_membership[blkn]; + if (s) { + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + } + ci = cinfo->MCU_membership[blkn]; s += state.last_dc_val[ci]; state.last_dc_val[ci] = s; - /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */ + /* Output the DC coefficient */ (*block)[0] = (JCOEF) s; - } - - if (entropy->ac_needed[blkn]) { /* Section F.2.2.2: decode the AC coefficients */ /* Since zeroes are skipped, output area must be cleared beforehand */ - for (k = 1; k < DCTSIZE2; k++) { - HUFF_DECODE(s, br_state, actbl, return FALSE, label2); - + for (; k < coef_limit; k++) { + HUFF_DECODE(s, br_state, htbl, return FALSE, label2); + r = s >> 4; s &= 15; - + if (s) { k += r; CHECK_BIT_BUFFER(br_state, s, return FALSE); @@ -586,33 +594,37 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) (*block)[jpeg_natural_order[k]] = (JCOEF) s; } else { if (r != 15) - break; + goto EndOfBlock; k += 15; } } - } else { - - /* Section F.2.2.2: decode the AC coefficients */ - /* In this path we just discard the values */ - for (k = 1; k < DCTSIZE2; k++) { - HUFF_DECODE(s, br_state, actbl, return FALSE, label3); - - r = s >> 4; - s &= 15; - - if (s) { - k += r; - CHECK_BIT_BUFFER(br_state, s, return FALSE); - DROP_BITS(s); - } else { - if (r != 15) - break; - k += 15; - } + if (s) { + CHECK_BIT_BUFFER(br_state, s, return FALSE); + DROP_BITS(s); } + } + + /* Section F.2.2.2: decode the AC coefficients */ + /* In this path we just discard the values */ + for (; k < DCTSIZE2; k++) { + HUFF_DECODE(s, br_state, htbl, return FALSE, label3); + + r = s >> 4; + s &= 15; + if (s) { + k += r; + CHECK_BIT_BUFFER(br_state, s, return FALSE); + DROP_BITS(s); + } else { + if (r != 15) + break; + k += 15; + } } + + EndOfBlock: ; } /* Completed MCU, so update state */ diff --git a/jdinput.c b/jdinput.c index 0c2ac8f..2e87693 100644 --- a/jdinput.c +++ b/jdinput.c @@ -74,12 +74,14 @@ initial_setup (j_decompress_ptr cinfo) * In the full decompressor, this will be overridden by jdmaster.c; * but in the transcoder, jdmaster.c is not used, so we must do it here. */ - cinfo->min_DCT_scaled_size = DCTSIZE; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; /* Compute dimensions of components */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - compptr->DCT_scaled_size = DCTSIZE; + compptr->DCT_h_scaled_size = DCTSIZE; + compptr->DCT_v_scaled_size = DCTSIZE; /* Size in DCT blocks */ compptr->width_in_blocks = (JDIMENSION) jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, @@ -138,7 +140,7 @@ per_scan_setup (j_decompress_ptr cinfo) compptr->MCU_width = 1; compptr->MCU_height = 1; compptr->MCU_blocks = 1; - compptr->MCU_sample_width = compptr->DCT_scaled_size; + compptr->MCU_sample_width = compptr->DCT_h_scaled_size; compptr->last_col_width = 1; /* For noninterleaved scans, it is convenient to define last_row_height * as the number of block rows present in the last iMCU row. @@ -174,7 +176,7 @@ per_scan_setup (j_decompress_ptr cinfo) compptr->MCU_width = compptr->h_samp_factor; compptr->MCU_height = compptr->v_samp_factor; compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height; - compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size; + compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size; /* Figure number of non-dummy blocks in last MCU column & row */ tmp = (int) (compptr->width_in_blocks % compptr->MCU_width); if (tmp == 0) tmp = compptr->MCU_width; diff --git a/jdmainct.c b/jdmainct.c index 13c956f..02723ca 100644 --- a/jdmainct.c +++ b/jdmainct.c @@ -161,7 +161,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo) { my_main_ptr main = (my_main_ptr) cinfo->main; int ci, rgroup; - int M = cinfo->min_DCT_scaled_size; + int M = cinfo->min_DCT_v_scaled_size; jpeg_component_info *compptr; JSAMPARRAY xbuf; @@ -175,8 +175,8 @@ alloc_funny_pointers (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ /* Get space for pointer lists --- M+4 row groups in each list. * We alloc both pointer lists with one call to save a few cycles. */ @@ -202,14 +202,14 @@ make_funny_pointers (j_decompress_ptr cinfo) { my_main_ptr main = (my_main_ptr) cinfo->main; int ci, i, rgroup; - int M = cinfo->min_DCT_scaled_size; + int M = cinfo->min_DCT_v_scaled_size; jpeg_component_info *compptr; JSAMPARRAY buf, xbuf0, xbuf1; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ xbuf0 = main->xbuffer[0][ci]; xbuf1 = main->xbuffer[1][ci]; /* First copy the workspace pointers as-is */ @@ -242,14 +242,14 @@ set_wraparound_pointers (j_decompress_ptr cinfo) { my_main_ptr main = (my_main_ptr) cinfo->main; int ci, i, rgroup; - int M = cinfo->min_DCT_scaled_size; + int M = cinfo->min_DCT_v_scaled_size; jpeg_component_info *compptr; JSAMPARRAY xbuf0, xbuf1; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ xbuf0 = main->xbuffer[0][ci]; xbuf1 = main->xbuffer[1][ci]; for (i = 0; i < rgroup; i++) { @@ -277,8 +277,8 @@ set_bottom_pointers (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { /* Count sample rows in one iMCU row and in one row group */ - iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size; - rgroup = iMCUheight / cinfo->min_DCT_scaled_size; + iMCUheight = compptr->v_samp_factor * compptr->DCT_v_scaled_size; + rgroup = iMCUheight / cinfo->min_DCT_v_scaled_size; /* Count nondummy sample rows remaining for this component */ rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight); if (rows_left == 0) rows_left = iMCUheight; @@ -357,7 +357,7 @@ process_data_simple_main (j_decompress_ptr cinfo, } /* There are always min_DCT_scaled_size row groups in an iMCU row. */ - rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size; + rowgroups_avail = (JDIMENSION) cinfo->min_DCT_v_scaled_size; /* Note: at the bottom of the image, we may pass extra garbage row groups * to the postprocessor. The postprocessor has to check for bottom * of image anyway (at row resolution), so no point in us doing it too. @@ -417,7 +417,7 @@ process_data_context_main (j_decompress_ptr cinfo, case CTX_PREPARE_FOR_IMCU: /* Prepare to process first M-1 row groups of this iMCU row */ main->rowgroup_ctr = 0; - main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1); + main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size - 1); /* Check for bottom of image: if so, tweak pointers to "duplicate" * the last sample row, and adjust rowgroups_avail to ignore padding rows. */ @@ -440,8 +440,8 @@ process_data_context_main (j_decompress_ptr cinfo, main->buffer_full = FALSE; /* Still need to process last row group of this iMCU row, */ /* which is saved at index M+1 of the other xbuffer */ - main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1); - main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2); + main->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 1); + main->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 2); main->context_state = CTX_POSTPONED_ROW; } } @@ -492,21 +492,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer) * ngroups is the number of row groups we need. */ if (cinfo->upsample->need_context_rows) { - if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */ + if (cinfo->min_DCT_v_scaled_size < 2) /* unsupported, see comments above */ ERREXIT(cinfo, JERR_NOTIMPL); alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */ - ngroups = cinfo->min_DCT_scaled_size + 2; + ngroups = cinfo->min_DCT_v_scaled_size + 2; } else { - ngroups = cinfo->min_DCT_scaled_size; + ngroups = cinfo->min_DCT_v_scaled_size; } for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; /* height of a row group of component */ + rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; /* height of a row group of component */ main->buffer[ci] = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, - compptr->width_in_blocks * compptr->DCT_scaled_size, + compptr->width_in_blocks * compptr->DCT_h_scaled_size, (JDIMENSION) (rgroup * ngroups)); } } diff --git a/jdmaster.c b/jdmaster.c index 2802c5b..0cdafbd 100644 --- a/jdmaster.c +++ b/jdmaster.c @@ -61,9 +61,12 @@ use_merged_upsample (j_decompress_ptr cinfo) cinfo->comp_info[2].v_samp_factor != 1) return FALSE; /* furthermore, it doesn't work if we've scaled the IDCTs differently */ - if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size || - cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size || - cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size) + if (cinfo->comp_info[0].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size || + cinfo->comp_info[1].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size || + cinfo->comp_info[2].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size || + cinfo->comp_info[0].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size || + cinfo->comp_info[1].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size || + cinfo->comp_info[2].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size) return FALSE; /* ??? also need to test for upsample-time rescaling, when & if supported */ return TRUE; /* by golly, it'll work... */ @@ -102,43 +105,150 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) jdiv_round_up((long) cinfo->image_width, 8L); cinfo->output_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, 8L); - cinfo->min_DCT_scaled_size = 1; + cinfo->min_DCT_h_scaled_size = 1; + cinfo->min_DCT_v_scaled_size = 1; } else if (cinfo->scale_num * 4 <= cinfo->scale_denom) { /* Provide 1/4 scaling */ cinfo->output_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width, 4L); cinfo->output_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, 4L); - cinfo->min_DCT_scaled_size = 2; + cinfo->min_DCT_h_scaled_size = 2; + cinfo->min_DCT_v_scaled_size = 2; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 3) { + /* Provide 3/8 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 8L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 8L); + cinfo->min_DCT_h_scaled_size = 3; + cinfo->min_DCT_v_scaled_size = 3; } else if (cinfo->scale_num * 2 <= cinfo->scale_denom) { /* Provide 1/2 scaling */ cinfo->output_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width, 2L); cinfo->output_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, 2L); - cinfo->min_DCT_scaled_size = 4; - } else { + cinfo->min_DCT_h_scaled_size = 4; + cinfo->min_DCT_v_scaled_size = 4; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 5) { + /* Provide 5/8 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 5L, 8L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 5L, 8L); + cinfo->min_DCT_h_scaled_size = 5; + cinfo->min_DCT_v_scaled_size = 5; + } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 3) { + /* Provide 3/4 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 4L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 4L); + cinfo->min_DCT_h_scaled_size = 6; + cinfo->min_DCT_v_scaled_size = 6; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 7) { + /* Provide 7/8 scaling */ + cinfo->output_width = (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 7L, 8L); + cinfo->output_height = (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 7L, 8L); + cinfo->min_DCT_h_scaled_size = 7; + cinfo->min_DCT_v_scaled_size = 7; + } else if (cinfo->scale_num <= cinfo->scale_denom) { /* Provide 1/1 scaling */ cinfo->output_width = cinfo->image_width; cinfo->output_height = cinfo->image_height; - cinfo->min_DCT_scaled_size = DCTSIZE; + cinfo->min_DCT_h_scaled_size = DCTSIZE; + cinfo->min_DCT_v_scaled_size = DCTSIZE; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 9) { + /* Provide 9/8 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 8L); + cinfo->min_DCT_h_scaled_size = 9; + cinfo->min_DCT_v_scaled_size = 9; + } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 5) { + /* Provide 5/4 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 4L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 4L); + cinfo->min_DCT_h_scaled_size = 10; + cinfo->min_DCT_v_scaled_size = 10; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 11) { + /* Provide 11/8 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 8L); + cinfo->min_DCT_h_scaled_size = 11; + cinfo->min_DCT_v_scaled_size = 11; + } else if (cinfo->scale_num * 2 <= cinfo->scale_denom * 3) { + /* Provide 3/2 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width, 2L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height, 2L); + cinfo->min_DCT_h_scaled_size = 12; + cinfo->min_DCT_v_scaled_size = 12; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 13) { + /* Provide 13/8 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 5L, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 5L, 8L); + cinfo->min_DCT_h_scaled_size = 13; + cinfo->min_DCT_v_scaled_size = 13; + } else if (cinfo->scale_num * 4 <= cinfo->scale_denom * 7) { + /* Provide 7/4 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 3L, 4L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 3L, 4L); + cinfo->min_DCT_h_scaled_size = 14; + cinfo->min_DCT_v_scaled_size = 14; + } else if (cinfo->scale_num * 8 <= cinfo->scale_denom * 15) { + /* Provide 15/8 scaling */ + cinfo->output_width = cinfo->image_width + (JDIMENSION) + jdiv_round_up((long) cinfo->image_width * 7L, 8L); + cinfo->output_height = cinfo->image_height + (JDIMENSION) + jdiv_round_up((long) cinfo->image_height * 7L, 8L); + cinfo->min_DCT_h_scaled_size = 15; + cinfo->min_DCT_v_scaled_size = 15; + } else { + /* Provide 2/1 scaling */ + cinfo->output_width = cinfo->image_width << 1; + cinfo->output_height = cinfo->image_height << 1; + cinfo->min_DCT_h_scaled_size = 16; + cinfo->min_DCT_v_scaled_size = 16; } /* In selecting the actual DCT scaling for each component, we try to * scale up the chroma components via IDCT scaling rather than upsampling. * This saves time if the upsampler gets to use 1:1 scaling. - * Note this code assumes that the supported DCT scalings are powers of 2. + * Note this code adapts subsampling ratios which are powers of 2. */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { - int ssize = cinfo->min_DCT_scaled_size; - while (ssize < DCTSIZE && - (compptr->h_samp_factor * ssize * 2 <= - cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) && - (compptr->v_samp_factor * ssize * 2 <= - cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) { + int ssize = 1; + while (cinfo->min_DCT_h_scaled_size * ssize <= DCTSIZE && + (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) { + ssize = ssize * 2; + } + compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize; + ssize = 1; + while (cinfo->min_DCT_v_scaled_size * ssize <= DCTSIZE && + (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) { ssize = ssize * 2; } - compptr->DCT_scaled_size = ssize; + compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize; + + /* We don't support IDCT ratios larger than 2. */ + if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2) + compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2; + else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2) + compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2; } /* Recompute downsampled dimensions of components; @@ -149,11 +259,11 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) /* Size in samples, after IDCT scaling */ compptr->downsampled_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width * - (long) (compptr->h_samp_factor * compptr->DCT_scaled_size), + (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size), (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->downsampled_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height * - (long) (compptr->v_samp_factor * compptr->DCT_scaled_size), + (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size), (long) (cinfo->max_v_samp_factor * DCTSIZE)); } @@ -373,7 +483,7 @@ master_selection (j_decompress_ptr cinfo) jinit_inverse_dct(cinfo); /* Entropy decoding: either Huffman or arithmetic coding. */ if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); + jinit_arith_decoder(cinfo); } else { if (cinfo->progressive_mode) { #ifdef D_PROGRESSIVE_SUPPORTED diff --git a/jdsample.c b/jdsample.c index 80ffefb..fdd8579 100644 --- a/jdsample.c +++ b/jdsample.c @@ -237,11 +237,11 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, register JSAMPROW inptr, outptr; register JSAMPLE invalue; JSAMPROW outend; - int inrow; + int outrow; - for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { - inptr = input_data[inrow]; - outptr = output_data[inrow]; + for (outrow = 0; outrow < cinfo->max_v_samp_factor; outrow++) { + inptr = input_data[outrow]; + outptr = output_data[outrow]; outend = outptr + cinfo->output_width; while (outptr < outend) { invalue = *inptr++; /* don't need GETJSAMPLE() here */ @@ -308,11 +308,11 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, register JSAMPROW inptr, outptr; register int invalue; register JDIMENSION colctr; - int inrow; + int outrow; - for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { - inptr = input_data[inrow]; - outptr = output_data[inrow]; + for (outrow = 0; outrow < cinfo->max_v_samp_factor; outrow++) { + inptr = input_data[outrow]; + outptr = output_data[outrow]; /* Special case for first column */ invalue = GETJSAMPLE(*inptr++); *outptr++ = (JSAMPLE) invalue; @@ -418,7 +418,7 @@ jinit_upsampler (j_decompress_ptr cinfo) /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1, * so don't ask for it. */ - do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1; + do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_v_scaled_size > 1; /* Verify we can handle the sampling factors, select per-component methods, * and create storage as needed. @@ -428,10 +428,10 @@ jinit_upsampler (j_decompress_ptr cinfo) /* Compute size of an "input group" after IDCT scaling. This many samples * are to be converted to max_h_samp_factor * max_v_samp_factor pixels. */ - h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; - v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) / - cinfo->min_DCT_scaled_size; + h_in_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) / + cinfo->min_DCT_h_scaled_size; + v_in_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) / + cinfo->min_DCT_v_scaled_size; h_out_group = cinfo->max_h_samp_factor; v_out_group = cinfo->max_v_samp_factor; upsample->rowgroup_height[ci] = v_in_group; /* save for use later */ diff --git a/jdtrans.c b/jdtrans.c index 6c0ab71..7a3b268 100644 --- a/jdtrans.c +++ b/jdtrans.c @@ -100,9 +100,9 @@ transdecode_master_selection (j_decompress_ptr cinfo) cinfo->buffered_image = TRUE; /* Entropy decoding: either Huffman or arithmetic coding. */ - if (cinfo->arith_code) { - ERREXIT(cinfo, JERR_ARITH_NOTIMPL); - } else { + if (cinfo->arith_code) + jinit_arith_decoder(cinfo); + else { if (cinfo->progressive_mode) { #ifdef D_PROGRESSIVE_SUPPORTED jinit_phuff_decoder(cinfo); diff --git a/jerror.h b/jerror.h index fc2fffe..715f898 100644 --- a/jerror.h +++ b/jerror.h @@ -45,8 +45,11 @@ JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix") JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix") JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode") JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS") +JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request") JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range") -JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported") +JMESSAGE(JERR_BAD_DCTSIZE, "DCT scaled block size %dx%d not supported") +JMESSAGE(JERR_BAD_DROP_SAMPLING, + "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c") JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition") JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace") JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace") @@ -93,6 +96,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data") JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change") JMESSAGE(JERR_NOTIMPL, "Not implemented yet") JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time") +JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined") JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported") JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined") JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image") @@ -170,6 +174,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS, JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u") JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u") JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d") +JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") JMESSAGE(JWRN_BOGUS_PROGRESSION, "Inconsistent progression sequence for component %d coefficient %d") JMESSAGE(JWRN_EXTRANEOUS_DATA, @@ -227,6 +232,15 @@ JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines") (cinfo)->err->msg_parm.i[2] = (p3), \ (cinfo)->err->msg_parm.i[3] = (p4), \ (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) +#define ERREXIT6(cinfo,code,p1,p2,p3,p4,p5,p6) \ + ((cinfo)->err->msg_code = (code), \ + (cinfo)->err->msg_parm.i[0] = (p1), \ + (cinfo)->err->msg_parm.i[1] = (p2), \ + (cinfo)->err->msg_parm.i[2] = (p3), \ + (cinfo)->err->msg_parm.i[3] = (p4), \ + (cinfo)->err->msg_parm.i[4] = (p5), \ + (cinfo)->err->msg_parm.i[5] = (p6), \ + (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) #define ERREXITS(cinfo,code,str) \ ((cinfo)->err->msg_code = (code), \ strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ diff --git a/jfdctflt.c b/jfdctflt.c index 79d7a00..275a528 100644 --- a/jfdctflt.c +++ b/jfdctflt.c @@ -56,41 +56,46 @@ */ GLOBAL(void) -jpeg_fdct_float (FAST_FLOAT * data) +jpeg_fdct_float (FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col) { FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; FAST_FLOAT tmp10, tmp11, tmp12, tmp13; FAST_FLOAT z1, z2, z3, z4, z5, z11, z13; FAST_FLOAT *dataptr; + JSAMPROW elemptr; int ctr; /* Pass 1: process rows. */ dataptr = data; - for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { - tmp0 = dataptr[0] + dataptr[7]; - tmp7 = dataptr[0] - dataptr[7]; - tmp1 = dataptr[1] + dataptr[6]; - tmp6 = dataptr[1] - dataptr[6]; - tmp2 = dataptr[2] + dataptr[5]; - tmp5 = dataptr[2] - dataptr[5]; - tmp3 = dataptr[3] + dataptr[4]; - tmp4 = dataptr[3] - dataptr[4]; - + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Load data into workspace, applying unsigned->signed conversion */ + tmp0 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7])); + tmp7 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7])); + tmp1 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6])); + tmp6 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6])); + tmp2 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5])); + tmp5 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5])); + tmp3 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4])); + tmp4 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4])); + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - - dataptr[0] = tmp10 + tmp11; /* phase 3 */ + + /* Apply unsigned->signed conversion */ + dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */ dataptr[4] = tmp10 - tmp11; - + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ dataptr[2] = tmp13 + z1; /* phase 5 */ dataptr[6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ @@ -126,21 +131,21 @@ jpeg_fdct_float (FAST_FLOAT * data) tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ dataptr[DCTSIZE*4] = tmp10 - tmp11; - + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ dataptr[DCTSIZE*6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ diff --git a/jfdctfst.c b/jfdctfst.c index ccb378a..350ce35 100644 --- a/jfdctfst.c +++ b/jfdctfst.c @@ -111,42 +111,47 @@ */ GLOBAL(void) -jpeg_fdct_ifast (DCTELEM * data) +jpeg_fdct_ifast (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) { DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; DCTELEM tmp10, tmp11, tmp12, tmp13; DCTELEM z1, z2, z3, z4, z5, z11, z13; DCTELEM *dataptr; + JSAMPROW elemptr; int ctr; SHIFT_TEMPS /* Pass 1: process rows. */ dataptr = data; - for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { - tmp0 = dataptr[0] + dataptr[7]; - tmp7 = dataptr[0] - dataptr[7]; - tmp1 = dataptr[1] + dataptr[6]; - tmp6 = dataptr[1] - dataptr[6]; - tmp2 = dataptr[2] + dataptr[5]; - tmp5 = dataptr[2] - dataptr[5]; - tmp3 = dataptr[3] + dataptr[4]; - tmp4 = dataptr[3] - dataptr[4]; - + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Load data into workspace */ + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp7 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp6 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp5 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + tmp4 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - - dataptr[0] = tmp10 + tmp11; /* phase 3 */ + + /* Apply unsigned->signed conversion */ + dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */ dataptr[4] = tmp10 - tmp11; - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ dataptr[2] = tmp13 + z1; /* phase 5 */ dataptr[6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ @@ -182,21 +187,21 @@ jpeg_fdct_ifast (DCTELEM * data) tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + /* Even part */ - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ dataptr[DCTSIZE*4] = tmp10 - tmp11; - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ dataptr[DCTSIZE*6] = tmp13 - z1; - + /* Odd part */ tmp10 = tmp4 + tmp5; /* phase 2 */ diff --git a/jfdctint.c b/jfdctint.c index 0a78b64..f58ee8c 100644 --- a/jfdctint.c +++ b/jfdctint.c @@ -2,6 +2,7 @@ * jfdctint.c * * Copyright (C) 1991-1996, Thomas G. Lane. + * Modification developed 2003-2004 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -21,6 +22,23 @@ * The advantage of this method is that no data path contains more than one * multiplication; this allows a very simple and accurate implementation in * scaled fixed-point arithmetic, with a minimal number of shifts. + * + * We also provide FDCT routines with various input sample block sizes for + * direct resolution reduction or enlargement and for direct resolving the + * common 2x1 and 1x2 subsampling cases without additional resampling: NxN + * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block. + * + * For N<8 we fill the remaining block coefficients with zero. + * For N>8 we apply a partial N-point FDCT on the input samples, computing + * just the lower 8 frequency coefficients and discarding the rest. + * + * We must scale the output coefficients of the N-point FDCT appropriately + * to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling + * is folded into the constant multipliers (pass 2) and/or final/initial + * shifting. + * + * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases + * since there would be too many additional constants to pre-calculate. */ #define JPEG_INTERNALS @@ -137,12 +155,13 @@ */ GLOBAL(void) -jpeg_fdct_islow (DCTELEM * data) +jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) { - INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp0, tmp1, tmp2, tmp3; INT32 tmp10, tmp11, tmp12, tmp13; - INT32 z1, z2, z3, z4, z5; + INT32 z1; DCTELEM *dataptr; + JSAMPROW elemptr; int ctr; SHIFT_TEMPS @@ -151,62 +170,66 @@ jpeg_fdct_islow (DCTELEM * data) /* furthermore, we scale the results by 2**PASS1_BITS. */ dataptr = data; - for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { - tmp0 = dataptr[0] + dataptr[7]; - tmp7 = dataptr[0] - dataptr[7]; - tmp1 = dataptr[1] + dataptr[6]; - tmp6 = dataptr[1] - dataptr[6]; - tmp2 = dataptr[2] + dataptr[5]; - tmp5 = dataptr[2] - dataptr[5]; - tmp3 = dataptr[3] + dataptr[4]; - tmp4 = dataptr[3] - dataptr[4]; - + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + /* Even part per LL&M figure 1 --- note that published figure is faulty; * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". */ - + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; + tmp12 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); + tmp13 = tmp1 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS); dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); - dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS-PASS1_BITS); - dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, - FIX_1_847759065), CONST_BITS-PASS1_BITS); - + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). - * cK represents cos(K*pi/16). - * i0..i3 in the paper are tmp4..tmp7 here. + * cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. */ - - z1 = tmp4 + tmp7; - z2 = tmp5 + tmp6; - z3 = tmp4 + tmp6; - z4 = tmp5 + tmp7; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); - dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); - dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); - dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); - + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS); + dataptr += DCTSIZE; /* advance pointer to next row */ } @@ -217,67 +240,4066 @@ jpeg_fdct_islow (DCTELEM * data) dataptr = data; for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; - tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; - tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; - tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; - tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - - /* Even part per LL&M figure 1 --- note that published figure is faulty; - * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". - */ - + tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; + tmp12 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - + tmp13 = tmp1 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); - dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, - FIX_1_847759065), CONST_BITS+PASS1_BITS); - + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). - * cK represents cos(K*pi/16). - * i0..i3 in the paper are tmp4..tmp7 here. + * cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. */ - - z1 = tmp4 + tmp7; - z2 = tmp5 + tmp6; - z3 = tmp4 + tmp6; - z4 = tmp5 + tmp7; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS); - dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS); - + + dataptr++; /* advance pointer to next column */ + } +} + +#ifdef DCT_SCALING_SUPPORTED + + +/* + * Perform the forward DCT on a 7x7 sample block. + */ + +GLOBAL(void) +jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* cK represents sqrt(2) * cos(K*pi/14). */ + + dataptr = data; + for (ctr = 0; ctr < 7; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]); + tmp3 = GETJSAMPLE(elemptr[3]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]); + + z1 = tmp0 + tmp2; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */ + dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */ + dataptr[4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/7)**2 = 64/49, which we fold + * into the constant multipliers: + * cK now represents sqrt(2) * cos(K*pi/14) * 64/49. + */ + + dataptr = data; + for (ctr = 0; ctr < 7; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4]; + tmp3 = dataptr[DCTSIZE*3]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5]; + tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4]; + + z1 = tmp0 + tmp2; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */ + CONST_BITS+PASS1_BITS); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */ + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 6x6 sample block. + */ + +GLOBAL(void) +jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11, tmp12; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* cK represents sqrt(2) * cos(K*pi/12). */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]); + tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */ + CONST_BITS-PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */ + CONST_BITS-PASS1_BITS); + + dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS)); + dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS); + dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS)); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)**2 = 16/9, which we fold + * into the constant multipliers: + * cK now represents sqrt(2) * cos(K*pi/12) * 16/9. + */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5]; + tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 5x5 sample block. + */ + +GLOBAL(void) +jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/10). */ + + dataptr = data; + for (ctr = 0; ctr < 5; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]); + tmp2 = GETJSAMPLE(elemptr[2]); + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1)); + tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */ + dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1); + dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */ + + dataptr[1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */ + CONST_BITS-PASS1_BITS-1); + dataptr[3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */ + CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/5)**2 = 64/25, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * cK now represents sqrt(2) * cos(K*pi/10) * 32/25. + */ + + dataptr = data; + for (ctr = 0; ctr < 5; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*2]; + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 4x4 sample block. + */ + +GLOBAL(void) +jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */ + /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2)); + dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2)); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + + dataptr[1] = (DCTELEM) + DESCALE(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS-2); + dataptr[3] = (DCTELEM) + DESCALE(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS-2); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp0 + tmp1, PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp0 - tmp1, PASS1_BITS); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS+PASS1_BITS); + dataptr++; /* advance pointer to next column */ } } + +/* + * Perform the forward DCT on a 3x3 sample block. + */ + +GLOBAL(void) +jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2**2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/6). */ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]); + tmp1 = GETJSAMPLE(elemptr[1]); + + tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2)); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */ + CONST_BITS-PASS1_BITS-2); + + /* Odd part */ + + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */ + CONST_BITS-PASS1_BITS-2); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/3)**2 = 64/9, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * cK now represents sqrt(2) * cos(K*pi/6) * 16/9. + */ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2]; + tmp1 = dataptr[DCTSIZE*1]; + + tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 2x2 sample block. + */ + +GLOBAL(void) +jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + JSAMPROW elemptr; + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + + /* Row 0 */ + elemptr = sample_data[0] + start_col; + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]); + tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]); + + /* Row 1 */ + elemptr = sample_data[1] + start_col; + + tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]); + tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]); + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/2)**2 = 2**4. + */ + + /* Column 0 */ + /* Apply unsigned->signed conversion */ + data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4); + data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4); + + /* Column 1 */ + data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4); + data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4); +} + + +/* + * Perform the forward DCT on a 1x1 sample block. + */ + +GLOBAL(void) +jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* We leave the result scaled up by an overall factor of 8. */ + /* We must also scale the output by (8/1)**2 = 2**6. */ + /* Apply unsigned->signed conversion */ + data[0] = (DCTELEM) + ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6); +} + + +/* + * Perform the forward DCT on a 9x9 sample block. + */ + +GLOBAL(void) +jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2; + DCTELEM workspace[8]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* we scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/18). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]); + tmp4 = GETJSAMPLE(elemptr[4]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]); + + z1 = tmp0 + tmp2 + tmp3; + z2 = tmp1 + tmp4; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1); + dataptr[6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */ + CONST_BITS-1); + z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */ + z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */ + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */ + + z1 + z2, CONST_BITS-1); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */ + + z1 - z2, CONST_BITS-1); + + /* Odd part */ + + dataptr[3] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */ + CONST_BITS-1); + + tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */ + tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */ + tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1); + + tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */ + + dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1); + dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 9) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/9)**2 = 64/81, which we partially + * fold into the constant multipliers and final/initial shifting: + * cK now represents sqrt(2) * cos(K*pi/18) * 128/81. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5]; + tmp4 = dataptr[DCTSIZE*4]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7]; + tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6]; + tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5]; + + z1 = tmp0 + tmp2 + tmp3; + z2 = tmp1 + tmp4; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */ + CONST_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */ + CONST_BITS+2); + z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */ + z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */ + + z1 + z2, CONST_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */ + + z1 - z2, CONST_BITS+2); + + /* Odd part */ + + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */ + CONST_BITS+2); + + tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */ + tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */ + tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2); + + tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */ + + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) + DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 10x10 sample block. + */ + +GLOBAL(void) +jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + DCTELEM workspace[8*2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* we scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/20). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]); + tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]); + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1); + tmp12 += tmp12; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */ + CONST_BITS-1); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */ + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */ + CONST_BITS-1); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */ + CONST_BITS-1); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1); + tmp2 <<= CONST_BITS; + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */ + CONST_BITS-1); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */ + MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */ + (tmp11 << (CONST_BITS - 1)) - tmp2; + dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1); + dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 10) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/10)**2 = 16/25, which we partially + * fold into the constant multipliers and final/initial shifting: + * cK now represents sqrt(2) * cos(K*pi/20) * 32/25. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0]; + tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */ + CONST_BITS+2); + tmp12 += tmp12; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */ + CONST_BITS+2); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */ + CONST_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */ + CONST_BITS+2); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+2); + tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */ + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */ + CONST_BITS+2); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */ + MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */ + MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */ + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on an 11x11 sample block. + */ + +GLOBAL(void) +jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 z1, z2, z3; + DCTELEM workspace[8*3]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* we scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* cK represents sqrt(2) * cos(K*pi/22). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]); + tmp5 = GETJSAMPLE(elemptr[5]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]); + tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1); + tmp5 += tmp5; + tmp0 -= tmp5; + tmp1 -= tmp5; + tmp2 -= tmp5; + tmp3 -= tmp5; + tmp4 -= tmp5; + z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */ + MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */ + z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */ + z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */ + dataptr[2] = (DCTELEM) + DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */ + - MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */ + CONST_BITS-1); + dataptr[4] = (DCTELEM) + DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */ + - MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */ + + MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */ + CONST_BITS-1); + dataptr[6] = (DCTELEM) + DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */ + - MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */ + CONST_BITS-1); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */ + tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */ + + MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */ + tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */ + tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */ + - MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */ + tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */ + tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */ + + MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */ + tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */ + - MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1); + dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 11) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/11)**2 = 64/121, which we partially + * fold into the constant multipliers and final/initial shifting: + * cK now represents sqrt(2) * cos(K*pi/22) * 128/121. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6]; + tmp5 = dataptr[DCTSIZE*5]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2]; + tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1]; + tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0]; + tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7]; + tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5, + FIX(1.057851240)), /* 128/121 */ + CONST_BITS+2); + tmp5 += tmp5; + tmp0 -= tmp5; + tmp1 -= tmp5; + tmp2 -= tmp5; + tmp3 -= tmp5; + tmp4 -= tmp5; + z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */ + MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */ + z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */ + z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */ + - MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */ + CONST_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */ + - MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */ + + MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */ + CONST_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */ + - MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */ + CONST_BITS+2); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */ + tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */ + + MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */ + tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */ + tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */ + - MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */ + tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */ + tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */ + + MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */ + tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */ + - MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 12x12 sample block. + */ + +GLOBAL(void) +jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + DCTELEM workspace[8*4]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/24). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]); + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE); + dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */ + CONST_BITS); + dataptr[2] = (DCTELEM) + DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */ + CONST_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 12) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/12)**2 = 4/9, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/24) * 8/9. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6]; + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */ + CONST_BITS+1); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */ + MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */ + CONST_BITS+1); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 13x13 sample block. + */ + +GLOBAL(void) +jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 z1, z2; + DCTELEM workspace[8*5]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/26). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]); + tmp6 = GETJSAMPLE(elemptr[6]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]); + tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]); + tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE); + tmp6 += tmp6; + tmp0 -= tmp6; + tmp1 -= tmp6; + tmp2 -= tmp6; + tmp3 -= tmp6; + tmp4 -= tmp6; + tmp5 -= tmp6; + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */ + MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */ + MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */ + MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */ + MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */ + MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */ + CONST_BITS); + z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */ + MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */ + MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */ + z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */ + MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */ + MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */ + + dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */ + MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */ + MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */ + tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */ + MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */ + tmp1 += tmp4 + tmp5 + + MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */ + MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */ + tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */ + tmp2 += tmp4 + tmp6 - + MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */ + MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */ + tmp3 += tmp5 + tmp6 + + MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */ + MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 13) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/13)**2 = 64/169, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/26) * 128/169. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0]; + tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7]; + tmp6 = dataptr[DCTSIZE*6]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4]; + tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3]; + tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2]; + tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1]; + tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0]; + tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6, + FIX(0.757396450)), /* 128/169 */ + CONST_BITS+1); + tmp6 += tmp6; + tmp0 -= tmp6; + tmp1 -= tmp6; + tmp2 -= tmp6; + tmp3 -= tmp6; + tmp4 -= tmp6; + tmp5 -= tmp6; + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */ + MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */ + MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */ + MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */ + MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */ + MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */ + CONST_BITS+1); + z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */ + MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */ + MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */ + z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */ + MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */ + MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */ + + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */ + tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */ + tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */ + MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */ + MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */ + tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */ + MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */ + tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */ + tmp1 += tmp4 + tmp5 + + MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */ + MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */ + tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */ + tmp2 += tmp4 + tmp6 - + MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */ + MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */ + tmp3 += tmp5 + tmp6 + + MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */ + MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 14x14 sample block. + */ + +GLOBAL(void) +jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + DCTELEM workspace[8*6]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/28). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]); + tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]); + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE); + tmp13 += tmp13; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */ + CONST_BITS); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */ + CONST_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */ + CONST_BITS); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6); + tmp3 <<= CONST_BITS; + tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */ + dataptr[5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */ + CONST_BITS); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */ + dataptr[3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */ + CONST_BITS); + dataptr[1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + tmp6 - + MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */ + CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 14) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/14)**2 = 16/49, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/28) * 32/49. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3]; + tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+1); + tmp13 += tmp13; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */ + CONST_BITS+1); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */ + CONST_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */ + CONST_BITS+1); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[DCTSIZE*7] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+1); + tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */ + tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */ + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */ + CONST_BITS+1); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */ + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */ + CONST_BITS+1); + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */ + - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */ + CONST_BITS+1); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 15x15 sample block. + */ + +GLOBAL(void) +jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 z1, z2, z3; + DCTELEM workspace[8*7]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* cK represents sqrt(2) * cos(K*pi/30). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]); + tmp7 = GETJSAMPLE(elemptr[7]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]); + tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]); + tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]); + tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]); + tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]); + + z1 = tmp0 + tmp4 + tmp5; + z2 = tmp1 + tmp3 + tmp6; + z3 = tmp2 + tmp7; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE); + z3 += z3; + dataptr[6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */ + MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */ + CONST_BITS); + tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7; + z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */ + MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */ + z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */ + MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */ + z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */ + MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */ + MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */ + + dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS); + dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS); + + /* Odd part */ + + tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16, + FIX(1.224744871)); /* c5 */ + tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */ + MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */ + tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */ + tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */ + MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */ + MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */ + tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */ + MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */ + MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */ + tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */ + MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */ + MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 15) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/15)**2 = 64/225, which we partially + * fold into the constant multipliers and final shifting: + * cK now represents sqrt(2) * cos(K*pi/30) * 256/225. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1]; + tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0]; + tmp7 = dataptr[DCTSIZE*7]; + + tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6]; + tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5]; + tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4]; + tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3]; + tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2]; + tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1]; + tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0]; + + z1 = tmp0 + tmp4 + tmp5; + z2 = tmp1 + tmp3 + tmp6; + z3 = tmp2 + tmp7; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */ + CONST_BITS+2); + z3 += z3; + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */ + MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */ + CONST_BITS+2); + tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7; + z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */ + MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */ + z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */ + MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */ + z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */ + MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */ + MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */ + + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2); + + /* Odd part */ + + tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16, + FIX(1.393487498)); /* c5 */ + tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */ + MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */ + tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */ + tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */ + MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */ + MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */ + tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */ + MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */ + MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */ + tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */ + MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */ + MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 16x16 sample block. + */ + +GLOBAL(void) +jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17; + DCTELEM workspace[DCTSIZE2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* cK represents sqrt(2) * cos(K*pi/32). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]); + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS-PASS1_BITS); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == DCTSIZE * 2) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/16)**2 = 1/2**2. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0]; + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS+PASS1_BITS+2); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */ + CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS+PASS1_BITS+2); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 16x8 sample block. + * + * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17; + INT32 z1; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */ + + dataptr = data; + ctr = 0; + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]); + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]); + tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS-PASS1_BITS); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by 8/16 = 1/2. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + + tmp10 = tmp0 + tmp3; + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, - FIX_1_847759065), + CONST_BITS+PASS1_BITS+1); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. + */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, + CONST_BITS+PASS1_BITS+1); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 14x7 sample block. + * + * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 z1, z2, z3; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero bottom row of output coefficient block. */ + MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */ + + dataptr = data; + for (ctr = 0; ctr < 7; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]); + tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]); + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]); + tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS); + tmp13 += tmp13; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */ + CONST_BITS-PASS1_BITS); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */ + + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS); + tmp3 <<= CONST_BITS; + tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */ + dataptr[5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */ + CONST_BITS-PASS1_BITS); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */ + dataptr[3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */ + CONST_BITS-PASS1_BITS); + dataptr[1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + tmp6 - + MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */ + CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/14)*(8/7) = 32/49, which we + * partially fold into the constant multipliers and final shifting: + * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4]; + tmp3 = dataptr[DCTSIZE*3]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5]; + tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4]; + + z1 = tmp0 + tmp2; + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */ + CONST_BITS+PASS1_BITS+1); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */ + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 12x6 sample block. + * + * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero 2 bottom rows of output coefficient block. */ + MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]); + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]); + tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */ + CONST_BITS-PASS1_BITS); + dataptr[2] = (DCTELEM) + DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/12)*(8/6) = 8/9, which we + * partially fold into the constant multipliers and final shifting: + * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5]; + tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */ + CONST_BITS+PASS1_BITS+1); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS+1); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 10x5 sample block. + * + * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero 3 bottom rows of output coefficient block. */ + MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */ + + dataptr = data; + for (ctr = 0; ctr < 5; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]); + tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]); + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]); + tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS); + tmp12 += tmp12; + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */ + CONST_BITS-PASS1_BITS); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */ + dataptr[2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS); + tmp2 <<= CONST_BITS; + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */ + CONST_BITS-PASS1_BITS); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */ + MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */ + (tmp11 << (CONST_BITS - 1)) - tmp2; + dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/10)*(8/5) = 32/25, which we + * fold into the constant multipliers: + * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*2]; + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */ + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on an 8x4 sample block. + * + * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Zero 4 bottom rows of output coefficient block. */ + MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by 8/4 = 2, which we add here. */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + + tmp10 = tmp0 + tmp3; + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1)); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS-PASS1_BITS-1); + dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, - FIX_1_847759065), + CONST_BITS-PASS1_BITS-1); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. + */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1); + dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1); + dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1); + dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp0 + tmp1, PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp0 - tmp1, PASS1_BITS); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 6x3 sample block. + * + * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11, tmp12; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]); + tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */ + CONST_BITS-PASS1_BITS-1); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */ + CONST_BITS-PASS1_BITS-1); + + /* Odd part */ + + tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */ + CONST_BITS-PASS1_BITS-1); + + dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1))); + dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1)); + dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1))); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9. + */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2]; + tmp1 = dataptr[DCTSIZE*1]; + + tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 4x2 sample block. + * + * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */ + /* 4-point FDCT kernel, */ + /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */ + + dataptr = data; + for (ctr = 0; ctr < 2; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3)); + dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3)); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + + dataptr[1] = (DCTELEM) + DESCALE(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS-3); + dataptr[3] = (DCTELEM) + DESCALE(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS-3); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0]; + tmp1 = dataptr[DCTSIZE*1]; + + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp0 + tmp1, PASS1_BITS); + + /* Odd part */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 - tmp1, PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 2x1 sample block. + * + * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + JSAMPROW elemptr; + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + elemptr = sample_data[0] + start_col; + + tmp0 = GETJSAMPLE(elemptr[0]); + tmp1 = GETJSAMPLE(elemptr[1]); + + /* We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/2)*(8/1) = 2**5. + */ + + /* Even part */ + /* Apply unsigned->signed conversion */ + data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5); + + /* Odd part */ + data[1] = (DCTELEM) ((tmp0 - tmp1) << 5); +} + + +/* + * Perform the forward DCT on an 8x16 sample block. + * + * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17; + INT32 z1; + DCTELEM workspace[DCTSIZE2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]); + + tmp10 = tmp0 + tmp3; + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]); + tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, - FIX_1_847759065), + CONST_BITS-PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. + */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == DCTSIZE * 2) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by 8/16 = 1/2. + * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). + */ + + dataptr = data; + wsptr = workspace; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0]; + + tmp10 = tmp0 + tmp7; + tmp14 = tmp0 - tmp7; + tmp11 = tmp1 + tmp6; + tmp15 = tmp1 - tmp6; + tmp12 = tmp2 + tmp5; + tmp16 = tmp2 - tmp5; + tmp13 = tmp3 + tmp4; + tmp17 = tmp3 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2]; + tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1]; + tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */ + MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */ + CONST_BITS+PASS1_BITS+1); + + tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */ + MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */ + + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */ + CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */ + - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */ + CONST_BITS+PASS1_BITS+1); + + /* Odd part */ + + tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */ + MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */ + MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */ + MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */ + tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */ + MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */ + tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */ + MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */ + tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */ + MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */ + MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */ + tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */ + - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */ + tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */ + + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */ + tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */ + + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 7x14 sample block. + * + * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 z1, z2, z3; + DCTELEM workspace[8*6]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]); + tmp3 = GETJSAMPLE(elemptr[3]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]); + tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]); + + z1 = tmp0 + tmp2; + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS); + tmp3 += tmp3; + z1 -= tmp3; + z1 -= tmp3; + z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */ + z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */ + dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS); + z1 -= z2; + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */ + dataptr[4] = (DCTELEM) + DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */ + CONST_BITS-PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */ + tmp0 += tmp3; + tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */ + + dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 14) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/7)*(8/14) = 32/49, which we + * fold into the constant multipliers: + * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3]; + tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; + + tmp10 = tmp0 + tmp6; + tmp14 = tmp0 - tmp6; + tmp11 = tmp1 + tmp5; + tmp15 = tmp1 - tmp5; + tmp12 = tmp2 + tmp4; + tmp16 = tmp2 - tmp4; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2]; + tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0]; + tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+PASS1_BITS); + tmp13 += tmp13; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */ + MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */ + MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */ + CONST_BITS+PASS1_BITS); + + tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */ + + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */ + + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */ + - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp1 + tmp2; + tmp11 = tmp5 - tmp4; + dataptr[DCTSIZE*7] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6, + FIX(0.653061224)), /* 32/49 */ + CONST_BITS+PASS1_BITS); + tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */ + tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */ + tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */ + tmp10 += tmp11 - tmp3; + tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */ + MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */ + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */ + + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */ + CONST_BITS+PASS1_BITS); + tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */ + MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */ + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */ + - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp11 + tmp12 + tmp3 + - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */ + - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 6x12 sample block. + * + * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + DCTELEM workspace[8*4]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]); + tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]); + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]); + tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */ + CONST_BITS-PASS1_BITS); + dataptr[4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */ + CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */ + CONST_BITS-PASS1_BITS); + + dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS)); + dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS); + dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS)); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 12) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)*(8/12) = 8/9, which we + * fold into the constant multipliers: + * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6]; + + tmp10 = tmp0 + tmp5; + tmp13 = tmp0 - tmp5; + tmp11 = tmp1 + tmp4; + tmp14 = tmp1 - tmp4; + tmp12 = tmp2 + tmp3; + tmp15 = tmp2 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2]; + tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1]; + tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7]; + tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */ + MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */ + tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */ + tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */ + tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */ + tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */ + tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */ + + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */ + tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */ + tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */ + + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */ + tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */ + - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */ + tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */ + - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */ + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 5x10 sample block. + * + * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + DCTELEM workspace[8*2]; + DCTELEM *dataptr; + DCTELEM *wsptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */ + + dataptr = data; + ctr = 0; + for (;;) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]); + tmp2 = GETJSAMPLE(elemptr[2]); + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]); + tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS); + tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */ + tmp10 -= tmp2 << 2; + tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */ + dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS); + dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */ + + dataptr[1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */ + CONST_BITS-PASS1_BITS); + dataptr[3] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */ + CONST_BITS-PASS1_BITS); + + ctr++; + + if (ctr != DCTSIZE) { + if (ctr == 10) + break; /* Done. */ + dataptr += DCTSIZE; /* advance pointer to next row */ + } else + dataptr = workspace; /* switch pointer to extended workspace */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/5)*(8/10) = 32/25, which we + * fold into the constant multipliers: + * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25. + */ + + dataptr = data; + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0]; + tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; + + tmp10 = tmp0 + tmp4; + tmp13 = tmp0 - tmp4; + tmp11 = tmp1 + tmp3; + tmp14 = tmp1 - tmp3; + + tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6]; + tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp12 += tmp12; + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */ + MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */ + CONST_BITS+PASS1_BITS); + tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */ + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) + DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = tmp0 + tmp4; + tmp11 = tmp1 - tmp3; + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */ + CONST_BITS+PASS1_BITS); + tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */ + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */ + MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */ + MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */ + MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */ + CONST_BITS+PASS1_BITS); + tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */ + MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */ + tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */ + MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */ + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + wsptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 4x8 sample block. + * + * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We must also scale the output by 8/4 = 2, which we add here. */ + /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */ + + dataptr = data; + for (ctr = 0; ctr < DCTSIZE; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]); + tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]); + + tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]); + tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1)); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + + dataptr[1] = (DCTELEM) + DESCALE(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS-1); + dataptr[3] = (DCTELEM) + DESCALE(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */ + CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + + tmp10 = tmp0 + tmp3; + tmp12 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp13 = tmp1 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, - FIX_1_847759065), + CONST_BITS+PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + * i0..i3 in the paper are tmp0..tmp3 here. + */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp0 + tmp2; + tmp13 = tmp1 + tmp3; + z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */ + tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */ + tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */ + tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */ + tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */ + tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */ + tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */ + tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */ + + tmp12 += z1; + tmp13 += z1; + + dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 3x6 sample block. + * + * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1, tmp2; + INT32 tmp10, tmp11, tmp12; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + /* We scale the results further by 2 as part of output adaption */ + /* scaling for different DCT size. */ + /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */ + + dataptr = data; + for (ctr = 0; ctr < 6; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]); + tmp1 = GETJSAMPLE(elemptr[1]); + + tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) + ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1)); + dataptr[2] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */ + CONST_BITS-PASS1_BITS-1); + + /* Odd part */ + + dataptr[1] = (DCTELEM) + DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */ + CONST_BITS-PASS1_BITS-1); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. + * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially + * fold into the constant multipliers (other part was done in pass 1): + * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9. + */ + + dataptr = data; + for (ctr = 0; ctr < 3; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5]; + tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + + tmp10 = tmp0 + tmp2; + tmp12 = tmp0 - tmp2; + + tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5]; + tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4]; + tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + + dataptr[DCTSIZE*0] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*2] = (DCTELEM) + DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*4] = (DCTELEM) + DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */ + CONST_BITS+PASS1_BITS); + + /* Odd part */ + + tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + dataptr[DCTSIZE*5] = (DCTELEM) + DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 2x4 sample block. + * + * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11; + DCTELEM *dataptr; + JSAMPROW elemptr; + int ctr; + SHIFT_TEMPS + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT. */ + /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */ + + dataptr = data; + for (ctr = 0; ctr < 4; ctr++) { + elemptr = sample_data[ctr] + start_col; + + /* Even part */ + + tmp0 = GETJSAMPLE(elemptr[0]); + tmp1 = GETJSAMPLE(elemptr[1]); + + /* Apply unsigned->signed conversion */ + dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3); + + /* Odd part */ + + dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. + * We leave the results scaled up by an overall factor of 8. + * 4-point FDCT kernel, + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. + */ + + dataptr = data; + for (ctr = 0; ctr < 2; ctr++) { + /* Even part */ + + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2]; + + tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3]; + tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2]; + + dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1); + dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1); + + /* Odd part */ + + tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */ + + dataptr[DCTSIZE*1] = (DCTELEM) + DESCALE(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), CONST_BITS); /* c2-c6 */ + dataptr[DCTSIZE*3] = (DCTELEM) + DESCALE(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), CONST_BITS); /* c2+c6 */ + + dataptr++; /* advance pointer to next column */ + } +} + + +/* + * Perform the forward DCT on a 1x2 sample block. + * + * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns). + */ + +GLOBAL(void) +jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col) +{ + INT32 tmp0, tmp1; + + /* Pre-zero output coefficient block. */ + MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2); + + tmp0 = GETJSAMPLE(sample_data[0][start_col]); + tmp1 = GETJSAMPLE(sample_data[1][start_col]); + + /* We leave the results scaled up by an overall factor of 8. + * We must also scale the output by (8/1)*(8/2) = 2**5. + */ + + /* Even part */ + /* Apply unsigned->signed conversion */ + data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5); + + /* Odd part */ + data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5); +} + +#endif /* DCT_SCALING_SUPPORTED */ #endif /* DCT_ISLOW_SUPPORTED */ diff --git a/jidctint.c b/jidctint.c index a72b320..5582fd4 100644 --- a/jidctint.c +++ b/jidctint.c @@ -2,6 +2,7 @@ * jidctint.c * * Copyright (C) 1991-1998, Thomas G. Lane. + * Modification developed 2002-2004 by Guido Vollbeding. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -23,6 +24,28 @@ * The advantage of this method is that no data path contains more than one * multiplication; this allows a very simple and accurate implementation in * scaled fixed-point arithmetic, with a minimal number of shifts. + * + * We also provide IDCT routines with various output sample block sizes for + * direct resolution reduction or enlargement and for direct resolving the + * common 2x1 and 1x2 subsampling cases without additional resampling: NxN + * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block. + * + * For N<8 we simply take the corresponding low-frequency coefficients of + * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block + * to yield the downscaled outputs. + * This can be seen as direct low-pass downsampling from the DCT domain + * point of view rather than the usual spatial domain point of view, + * yielding significant computational savings and results at least + * as good as common bilinear (averaging) spatial downsampling. + * + * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as + * lower frequencies and higher frequencies assumed to be zero. + * It turns out that the computational effort is similar to the 8x8 IDCT + * regarding the output size. + * Furthermore, the scaling and descaling is the same for all IDCT sizes. + * + * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases + * since there would be too many additional constants to pre-calculate. */ #define JPEG_INTERNALS @@ -386,4 +409,4655 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, } } +#ifdef IDCT_SCALING_SUPPORTED + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 7x7 output block. + * + * Optimized algorithm with 12 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/14). + */ + +GLOBAL(void) +jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[7*7]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp13 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp0 = z1 + z3; + z2 -= tmp0; + tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */ + tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + + tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp0 += z2; + tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + wsptr[7*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[7*6] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[7*1] = (int) DESCALE(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[7*5] = (int) DESCALE(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[7*2] = (int) DESCALE(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[7*4] = (int) DESCALE(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[7*3] = (int) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 7 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp13 = (INT32) wsptr[0]; + tmp13 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp0 = z1 + z3; + z2 -= tmp0; + tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */ + tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp13 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + + tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp0 = tmp1 - tmp2; + tmp1 += tmp2; + tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp1 += tmp2; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp0 += z2; + tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 7; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 6x6 output block. + * + * Optimized algorithm with 3 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/12). + */ + +GLOBAL(void) +jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[6*6]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = DESCALE(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS); + tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + ((z1 + z2) << CONST_BITS); + tmp2 = tmp1 + ((z3 - z2) << CONST_BITS); + tmp1 = (z1 - z2 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[6*0] = DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*5] = DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*1] = (int) (tmp11 + tmp1); + wsptr[6*4] = (int) (tmp11 - tmp1); + wsptr[6*2] = DESCALE(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[6*3] = DESCALE(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 6 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[4]; + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = tmp0 - tmp10 - tmp10; + tmp10 = (INT32) wsptr[2]; + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + z1 <<= CONST_BITS; + z2 <<= CONST_BITS; + z3 <<= CONST_BITS; + tmp0 = tmp1 + z1 + z2; + tmp2 = tmp1 - z2 + z3; + tmp1 = z1 - z2 - z3; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 6; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 5x5 output block. + * + * Optimized algorithm with 5 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/10). + */ + +GLOBAL(void) +jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[5*5]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp12 <<= CONST_BITS; + tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + wsptr[5*0] = DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[5*4] = DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[5*1] = DESCALE(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[5*3] = DESCALE(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[5*2] = DESCALE(tmp12, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 5 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp12 = (INT32) wsptr[0]; + tmp12 <<= CONST_BITS; + tmp0 = (INT32) wsptr[2]; + tmp1 = (INT32) wsptr[4]; + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 5; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 4x4 output block. + * + * Optimized algorithm with 3 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT]. + */ + +GLOBAL(void) +jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[4*4]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + + tmp10 = (tmp0 + tmp2) << PASS1_BITS; + tmp12 = (tmp0 - tmp2) << PASS1_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = DESCALE(z1 + MULTIPLY(z3, - FIX_1_847759065), /* -(c2+c6) */ + CONST_BITS-PASS1_BITS); + tmp2 = DESCALE(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS); + + /* Final output stage */ + + wsptr[4*0] = (int) (tmp10 + tmp2); + wsptr[4*3] = (int) (tmp10 - tmp2); + wsptr[4*1] = (int) (tmp12 + tmp0); + wsptr[4*2] = (int) (tmp12 - tmp0); + } + + /* Pass 2: process 4 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp2 = (INT32) wsptr[2]; + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z3, - FIX_1_847759065); /* -(c2+c6) */ + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 4; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 3x3 output block. + * + * Optimized algorithm with 2 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/6). + */ + +GLOBAL(void) +jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[3*3]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + wsptr[3*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*2] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*1] = (int) DESCALE(tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 3 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[2]; + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = (INT32) wsptr[1]; + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 3; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 2x2 output block. + * + * Multiplication-less algorithm. + */ + +GLOBAL(void) +jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + ISLOW_MULT_TYPE * quantptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* Pass 1: process columns from input. */ + + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + + /* Column 0 */ + tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]); + + tmp0 = tmp4 + tmp5; + tmp2 = tmp4 - tmp5; + + /* Column 1 */ + tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]); + tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]); + + tmp1 = tmp4 + tmp5; + tmp3 = tmp4 - tmp5; + + /* Pass 2: process 2 rows, store into output array. */ + + /* Row 0 */ + outptr = output_buf[0] + output_col; + + outptr[0] = range_limit[(int) DESCALE(tmp0 + tmp1, 3) & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp0 - tmp1, 3) & RANGE_MASK]; + + /* Row 1 */ + outptr = output_buf[1] + output_col; + + outptr[0] = range_limit[(int) DESCALE(tmp2 + tmp3, 3) & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp2 - tmp3, 3) & RANGE_MASK]; +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 1x1 output block. + * + * We hardly need an inverse DCT routine for this: just take the + * average pixel value, which is one-eighth of the DC coefficient. + */ + +GLOBAL(void) +jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + int dcval; + ISLOW_MULT_TYPE * quantptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* 1x1 is trivial: just take the DC coefficient divided by 8. */ + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + dcval = DEQUANTIZE(coef_block[0], quantptr[0]); + dcval = (int) DESCALE((INT32) dcval, 3); + + output_buf[0][output_col] = range_limit[dcval & RANGE_MASK]; +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 9x9 output block. + * + * Optimized algorithm with 10 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/18). + */ + +GLOBAL(void) +jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*9]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */ + tmp1 = tmp0 + tmp3; + tmp2 = tmp0 - tmp3 - tmp3; + + tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */ + tmp11 = tmp2 + tmp0; + tmp14 = tmp2 - tmp0 - tmp0; + + tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */ + tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */ + tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */ + + tmp10 = tmp1 + tmp0 - tmp3; + tmp12 = tmp1 - tmp0 + tmp2; + tmp13 = tmp1 - tmp2 + tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */ + + tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */ + tmp0 = tmp2 + tmp3 - z2; + tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */ + tmp2 += z2 - tmp1; + tmp3 += z2 + tmp1; + tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */ + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp13 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp13 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 9 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 9; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp3 = MULTIPLY(z3, FIX(0.707106781)); /* c6 */ + tmp1 = tmp0 + tmp3; + tmp2 = tmp0 - tmp3 - tmp3; + + tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */ + tmp11 = tmp2 + tmp0; + tmp14 = tmp2 - tmp0 - tmp0; + + tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */ + tmp2 = MULTIPLY(z1, FIX(1.083350441)); /* c4 */ + tmp3 = MULTIPLY(z2, FIX(0.245575608)); /* c8 */ + + tmp10 = tmp1 + tmp0 - tmp3; + tmp12 = tmp1 - tmp0 + tmp2; + tmp13 = tmp1 - tmp2 + tmp3; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + z2 = MULTIPLY(z2, - FIX(1.224744871)); /* -c3 */ + + tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525)); /* c7 */ + tmp0 = tmp2 + tmp3 - z2; + tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481)); /* c1 */ + tmp2 += z2 - tmp1; + tmp3 += z2 + tmp1; + tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp13 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 10x10 output block. + * + * Optimized algorithm with 12 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/20). + */ + +GLOBAL(void) +jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*10]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(tmp1, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(tmp1, FIX(0.437016024)); /* c8 */ + tmp10 = tmp0 + z1; + tmp11 = tmp0 - z2; + + tmp22 = DESCALE(tmp0 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp0; + tmp24 = tmp10 - tmp0; + tmp21 = tmp11 + tmp1; + tmp23 = tmp11 - tmp1; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp0 = z2 + z4; + tmp1 = z2 - z4; + + tmp12 = MULTIPLY(tmp1, FIX(0.309016994)); /* (c3-c7)/2 */ + tmp13 = z3 << CONST_BITS; + + z2 = MULTIPLY(tmp0, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = tmp13 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp0, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = tmp13 - tmp12 - (tmp1 << (CONST_BITS - 1)); + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + tmp12 = (z1 - tmp1 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) (tmp22 + tmp12); + wsptr[8*7] = (int) (tmp22 - tmp12); + wsptr[8*3] = (int) DESCALE(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 10 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 10; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + tmp1 = (INT32) wsptr[4]; + z1 = MULTIPLY(tmp1, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(tmp1, FIX(0.437016024)); /* c8 */ + tmp10 = tmp0 + z1; + tmp11 = tmp0 - z2; + + tmp22 = tmp0 - ((z1 - z2) << 1); /* c0 = (c4-c8)*2 */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp0; + tmp24 = tmp10 - tmp0; + tmp21 = tmp11 + tmp1; + tmp23 = tmp11 - tmp1; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z3 <<= CONST_BITS; + z4 = (INT32) wsptr[7]; + + tmp0 = z2 + z4; + tmp1 = z2 - z4; + + tmp12 = MULTIPLY(tmp1, FIX(0.309016994)); /* (c3-c7)/2 */ + + z2 = MULTIPLY(tmp0, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z3 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp0, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z3 - tmp12 - (tmp1 << (CONST_BITS - 1)); + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + tmp12 = ((z1 - tmp1) << CONST_BITS) - z3; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 11x11 output block. + * + * Optimized algorithm with 24 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/22). + */ + +GLOBAL(void) +jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*11]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp10 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */ + tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */ + z4 = z1 + z3; + tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */ + z4 -= z2; + tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ + tmp21 = tmp20 + tmp23 + tmp25 - + MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ + tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ + tmp24 += tmp25; + tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ + tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z2; + tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */ + tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ + tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ + tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ + tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ + z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */ + tmp11 += z1; + tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ + tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp25, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 11 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 11; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp10 = (INT32) wsptr[0]; + tmp10 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132)); /* c2+c4 */ + tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045)); /* c2-c6 */ + z4 = z1 + z3; + tmp24 = MULTIPLY(z4, - FIX(1.155664402)); /* -(c2-c10) */ + z4 -= z2; + tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ + tmp21 = tmp20 + tmp23 + tmp25 - + MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ + tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ + tmp24 += tmp25; + tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ + tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = z1 + z2; + tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.887983902)); /* c3-c9 */ + tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ + tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ + tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ + tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ + z1 = MULTIPLY(z2 + z4, - FIX(1.798248910)); /* -(c1+c9) */ + tmp11 += z1; + tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ + tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 12x12 output block. + * + * Optimized algorithm with 15 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/24). + */ + +GLOBAL(void) +jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*12]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp1 = MULTIPLY(z1, FIX(1.224744871)); /* c4 */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp1 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = tmp0 + tmp12; + tmp24 = tmp0 - tmp12; + + tmp12 = tmp1 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = tmp1 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp0 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp1 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 - z4; + z2 -= z3; + tmp12 = MULTIPLY(tmp10 + z2, FIX_0_541196100); /* c9 */ + tmp11 = tmp12 + MULTIPLY(tmp10, FIX_0_765366865); /* c3-c9 */ + tmp14 = tmp12 + MULTIPLY(z2, - FIX_1_847759065); /* c3+c9 */ + + z2 = z1 + z3; + tmp15 = MULTIPLY(z2 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(z2, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp0 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp1 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp0 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp1 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 12 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 12; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[4]; + tmp1 = MULTIPLY(z1, FIX(1.224744871)); /* c4 */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + z1 = (INT32) wsptr[2]; + tmp1 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = (INT32) wsptr[6]; + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = tmp0 + tmp12; + tmp24 = tmp0 - tmp12; + + tmp12 = tmp1 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = tmp1 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp0 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp1 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 - z4; + z2 -= z3; + tmp12 = MULTIPLY(tmp10 + z2, FIX_0_541196100); /* c9 */ + tmp11 = tmp12 + MULTIPLY(tmp10, FIX_0_765366865); /* c3-c9 */ + tmp14 = tmp12 + MULTIPLY(z2, - FIX_1_847759065); /* c3+c9 */ + + z2 = z1 + z3; + tmp15 = MULTIPLY(z2 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(z2, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp0 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp1 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp0 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp1 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 13x13 output block. + * + * Optimized algorithm with 29 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/26). + */ + +GLOBAL(void) +jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*13]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */ + + tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */ + tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */ + + tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */ + tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */ + + tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */ + tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */ + + tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */ + tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */ + tmp15 = z1 + z4; + tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ + tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ + tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ + tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */ + tmp11 += tmp14; + tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */ + tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */ + tmp12 += tmp14; + tmp13 += tmp14; + tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ + tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ + tmp14 += z1; + tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp26, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 13 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 13; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + z1 = (INT32) wsptr[0]; + z1 <<= CONST_BITS; + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[4]; + z4 = (INT32) wsptr[6]; + + tmp10 = z3 + z4; + tmp11 = z3 - z4; + + tmp12 = MULTIPLY(tmp10, FIX(1.155388986)); /* (c4+c6)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1; /* (c4-c6)/2 */ + + tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13; /* c2 */ + tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13; /* c10 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.316450131)); /* (c8-c12)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1; /* (c8+c12)/2 */ + + tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13; /* c6 */ + tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */ + + tmp12 = MULTIPLY(tmp10, FIX(0.435816023)); /* (c2-c10)/2 */ + tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1; /* (c2+c10)/2 */ + + tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */ + tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */ + + tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1; /* c0 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651)); /* c3 */ + tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945)); /* c5 */ + tmp15 = z1 + z4; + tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ + tmp10 = tmp11 + tmp12 + tmp13 - + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ + tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ + tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ + tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945)); /* -c5 */ + tmp11 += tmp14; + tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */ + tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813)); /* -c9 */ + tmp12 += tmp14; + tmp13 += tmp14; + tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ + tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ + tmp14 += z1; + tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp26, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 14x14 output block. + * + * Optimized algorithm with 20 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/28). + */ + +GLOBAL(void) +jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*14]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = DESCALE(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp13 = z4 << CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */ + tmp16 += tmp15; + z1 += z4; + z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */ + tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = (z1 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) (tmp23 + tmp13); + wsptr[8*10] = (int) (tmp23 - tmp13); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 14 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 14; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + z1 = (INT32) wsptr[0]; + z1 <<= CONST_BITS; + z4 = (INT32) wsptr[4]; + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = z1 - ((z2 + z3 - z4) << 1); /* c0 = (c4+c12-c8)*2 */ + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + z4 <<= CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */ + tmp16 += tmp15; + tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */ + tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = ((z1 - z3) << CONST_BITS) + z4; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 15x15 output block. + * + * Optimized algorithm with 22 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/30). + */ + +GLOBAL(void) +jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*15]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */ + tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */ + + tmp12 = z1 - tmp10; + tmp13 = z1 + tmp11; + z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */ + + z4 = z2 - z3; + z3 += z2; + tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */ + z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */ + + tmp20 = tmp13 + tmp10 + tmp11; + tmp23 = tmp12 - tmp10 + tmp11 + z2; + + tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */ + + tmp25 = tmp13 - tmp10 - tmp11; + tmp26 = tmp12 + tmp10 - tmp11 - z2; + + tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */ + + tmp21 = tmp12 + tmp10 + tmp11; + tmp24 = tmp13 - tmp10 + tmp11; + tmp11 += tmp11; + tmp22 = z1 + tmp11; /* c10 = c6-c12 */ + tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */ + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp13 = z2 - z4; + tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */ + tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */ + tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */ + + tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */ + tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */ + z2 = z1 - z4; + tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */ + + tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */ + tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */ + tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */ + z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */ + tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */ + tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */ + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) DESCALE(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp27, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 15 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 15; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + z1 = (INT32) wsptr[0]; + z1 <<= CONST_BITS; + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[4]; + z4 = (INT32) wsptr[6]; + + tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */ + tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */ + + tmp12 = z1 - tmp10; + tmp13 = z1 + tmp11; + z1 -= (tmp11 - tmp10) << 1; /* c0 = (c6-c12)*2 */ + + z4 = z2 - z3; + z3 += z2; + tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */ + z2 = MULTIPLY(z2, FIX(1.439773946)); /* c4+c14 */ + + tmp20 = tmp13 + tmp10 + tmp11; + tmp23 = tmp12 - tmp10 + tmp11 + z2; + + tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */ + + tmp25 = tmp13 - tmp10 - tmp11; + tmp26 = tmp12 + tmp10 - tmp11 - z2; + + tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */ + tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */ + + tmp21 = tmp12 + tmp10 + tmp11; + tmp24 = tmp13 - tmp10 + tmp11; + tmp11 += tmp11; + tmp22 = z1 + tmp11; /* c10 = c6-c12 */ + tmp27 = z1 - tmp11 - tmp11; /* c0 = (c6-c12)*2 */ + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z4 = (INT32) wsptr[5]; + z3 = MULTIPLY(z4, FIX(1.224744871)); /* c5 */ + z4 = (INT32) wsptr[7]; + + tmp13 = z2 - z4; + tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876)); /* c9 */ + tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148)); /* c3-c9 */ + tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899)); /* c3+c9 */ + + tmp13 = MULTIPLY(z2, - FIX(0.831253876)); /* -c9 */ + tmp15 = MULTIPLY(z2, - FIX(1.344997024)); /* -c3 */ + z2 = z1 - z4; + tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353)); /* c1 */ + + tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */ + tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */ + tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3; /* c5 */ + z2 = MULTIPLY(z1 + z4, FIX(0.575212477)); /* c11 */ + tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3; /* c7-c11 */ + tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3; /* c11+c13 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp27, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 16x16 output block. + * + * Optimized algorithm with 28 multiplications in the 1-D kernel. + * cK represents sqrt(2) * cos(K*pi/32). + */ + +GLOBAL(void) +jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*16]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*15] = (int) DESCALE(tmp20 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) DESCALE(tmp21 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) DESCALE(tmp22 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp23 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) DESCALE(tmp23 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) DESCALE(tmp24 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp25 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) DESCALE(tmp25 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp26 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp26 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp27 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp27 - tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 16 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 16; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[4]; + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[15] = range_limit[(int) DESCALE(tmp20 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) DESCALE(tmp21 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) DESCALE(tmp22 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) DESCALE(tmp23 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp24 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp25 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp26 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp26 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp27 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp27 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 16x8 output block. + * + * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*8]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = DCTSIZE; ctr > 0; ctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any column in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * column DCT calculations can be simplified this way. + */ + + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { + /* AC terms all zero */ + int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; + + wsptr[DCTSIZE*0] = dcval; + wsptr[DCTSIZE*1] = dcval; + wsptr[DCTSIZE*2] = dcval; + wsptr[DCTSIZE*3] = dcval; + wsptr[DCTSIZE*4] = dcval; + wsptr[DCTSIZE*5] = dcval; + wsptr[DCTSIZE*6] = dcval; + wsptr[DCTSIZE*7] = dcval; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + continue; + } + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + + tmp0 = (z2 + z3) << CONST_BITS; + tmp1 = (z2 - z3) << CONST_BITS; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + z1 = tmp0 + tmp3; + z2 = tmp1 + tmp2; + z3 = tmp0 + tmp2; + z4 = tmp1 + tmp3; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + } + + /* Pass 2: process 8 rows from work array, store into output array. + * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32). + */ + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[4]; + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[15] = range_limit[(int) DESCALE(tmp20 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[14] = range_limit[(int) DESCALE(tmp21 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) DESCALE(tmp22 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) DESCALE(tmp23 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp24 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp25 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp26 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp26 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp27 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp27 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 14x7 output block. + * + * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*7]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp23 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp10 = z1 + z3; + z2 -= tmp10; + tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */ + tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp23 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + + tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp10 = tmp11 - tmp12; + tmp11 += tmp12; + tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp11 += tmp12; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp10 += z2; + tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp23, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 7 rows from work array, store into output array. + * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28). + */ + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + z1 = (INT32) wsptr[0]; + z1 <<= CONST_BITS; + z4 = (INT32) wsptr[4]; + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = z1 - ((z2 + z3 - z4) << 1); /* c0 = (c4+c12-c8)*2 */ + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[6]; + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + z4 <<= CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4; /* c11 */ + tmp16 += tmp15; + tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4; /* -c13 */ + tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = ((z1 - z3) << CONST_BITS) + z4; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[13] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[12] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp26 + tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp26 - tmp16, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 12x6 output block. + * + * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*6]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp10 <<= CONST_BITS; + tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp20 = MULTIPLY(tmp12, FIX(0.707106781)); /* c4 */ + tmp11 = tmp10 + tmp20; + tmp21 = DESCALE(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS); + tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp10 = MULTIPLY(tmp20, FIX(1.224744871)); /* c2 */ + tmp20 = tmp11 + tmp10; + tmp22 = tmp11 - tmp10; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp10 = tmp11 + ((z1 + z2) << CONST_BITS); + tmp12 = tmp11 + ((z3 - z2) << CONST_BITS); + tmp11 = (z1 - z2 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[8*0] = DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*5] = DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) (tmp21 + tmp11); + wsptr[8*4] = (int) (tmp21 - tmp11); + wsptr[8*2] = DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*3] = DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 6 rows from work array, store into output array. + * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24). + */ + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + + z1 = (INT32) wsptr[4]; + tmp1 = MULTIPLY(z1, FIX(1.224744871)); /* c4 */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + z1 = (INT32) wsptr[2]; + tmp1 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = (INT32) wsptr[6]; + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = tmp0 + tmp12; + tmp24 = tmp0 - tmp12; + + tmp12 = tmp1 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = tmp1 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z4 = (INT32) wsptr[7]; + + tmp0 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp1 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 - z4; + z2 -= z3; + tmp12 = MULTIPLY(tmp10 + z2, FIX_0_541196100); /* c9 */ + tmp11 = tmp12 + MULTIPLY(tmp10, FIX_0_765366865); /* c3-c9 */ + tmp14 = tmp12 + MULTIPLY(z2, - FIX_1_847759065); /* c3+c9 */ + + z2 = z1 + z3; + tmp15 = MULTIPLY(z2 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(z2, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp0 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp1 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp0 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp1 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[11] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[10] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp25 + tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp25 - tmp15, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 10x5 output block. + * + * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*5]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp12 <<= CONST_BITS; + tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + wsptr[8*0] = DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*4] = DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = DESCALE(tmp11 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*3] = DESCALE(tmp11 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = DESCALE(tmp12, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 5 rows from work array, store into output array. + * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20). + */ + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + tmp1 = (INT32) wsptr[4]; + z1 = MULTIPLY(tmp1, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(tmp1, FIX(0.437016024)); /* c8 */ + tmp10 = tmp0 + z1; + tmp11 = tmp0 - z2; + + tmp22 = tmp0 - ((z1 - z2) << 1); /* c0 = (c4-c8)*2 */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp0; + tmp24 = tmp10 - tmp0; + tmp21 = tmp11 + tmp1; + tmp23 = tmp11 - tmp1; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + z3 <<= CONST_BITS; + z4 = (INT32) wsptr[7]; + + tmp0 = z2 + z4; + tmp1 = z2 - z4; + + tmp12 = MULTIPLY(tmp1, FIX(0.309016994)); /* (c3-c7)/2 */ + + z2 = MULTIPLY(tmp0, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = z3 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp0, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = z3 - tmp12 - (tmp1 << (CONST_BITS - 1)); + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + tmp12 = ((z1 - tmp1) << CONST_BITS) - z3; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[9] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[8] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23 + tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp23 - tmp13, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp24 + tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp24 - tmp14, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 8; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 8x4 output block. + * + * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*4]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + + tmp10 = (tmp0 + tmp2) << PASS1_BITS; + tmp12 = (tmp0 - tmp2) << PASS1_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = DESCALE(z1 + MULTIPLY(z3, - FIX_1_847759065), /* -(c2+c6) */ + CONST_BITS-PASS1_BITS); + tmp2 = DESCALE(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */ + CONST_BITS-PASS1_BITS); + + /* Final output stage */ + + wsptr[8*0] = (int) (tmp10 + tmp2); + wsptr[8*3] = (int) (tmp10 - tmp2); + wsptr[8*1] = (int) (tmp12 + tmp0); + wsptr[8*2] = (int) (tmp12 - tmp0); + } + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; + tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + tmp0 = (INT32) wsptr[7]; + tmp1 = (INT32) wsptr[5]; + tmp2 = (INT32) wsptr[3]; + tmp3 = (INT32) wsptr[1]; + + z1 = tmp0 + tmp3; + z2 = tmp1 + tmp2; + z3 = tmp0 + tmp2; + z4 = tmp1 + tmp3; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 6x3 output block. + * + * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[6*3]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + wsptr[6*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*2] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[6*1] = (int) DESCALE(tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 3 rows from work array, store into output array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[4]; + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = tmp0 - tmp10 - tmp10; + tmp10 = (INT32) wsptr[2]; + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + z1 <<= CONST_BITS; + z2 <<= CONST_BITS; + z3 <<= CONST_BITS; + tmp0 = tmp1 + z1 + z2; + tmp2 = tmp1 - z2 + z3; + tmp1 = z1 - z2 - z3; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp12 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 6; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 4x2 output block. + * + * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + INT32 * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + INT32 workspace[4*2]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + + /* Odd part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + /* Final output stage */ + + wsptr[4*0] = tmp10 + tmp0; + wsptr[4*1] = tmp10 - tmp0; + } + + /* Pass 2: process 2 rows from work array, store into output array. + * 4-point IDCT kernel, + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT]. + */ + wsptr = workspace; + for (ctr = 0; ctr < 2; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = wsptr[0]; + tmp2 = wsptr[2]; + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = wsptr[1]; + z3 = wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z3, - FIX_1_847759065); /* -(c2+c6) */ + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2, + CONST_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2, + CONST_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0, + CONST_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0, + CONST_BITS+3) + & RANGE_MASK]; + + wsptr += 4; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 2x1 output block. + * + * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp10; + ISLOW_MULT_TYPE * quantptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* Pass 1: empty. */ + + /* Pass 2: process 1 row from input, store into output array. */ + + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + outptr = output_buf[0] + output_col; + + /* Even part */ + + tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]); + + /* Odd part */ + + tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]); + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, 3) & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0, 3) & RANGE_MASK]; +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 8x16 output block. + * + * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; + INT32 z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[8*16]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */ + tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + tmp12 = tmp0 + tmp2; + tmp13 = tmp0 - tmp2; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z3 = z1 - z2; + z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */ + z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */ + + tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447); /* (c6+c2)[16] = (c3+c1)[8] */ + tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223); /* (c6-c14)[16] = (c3-c7)[8] */ + tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */ + tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */ + + tmp20 = tmp10 + tmp0; + tmp27 = tmp10 - tmp0; + tmp21 = tmp12 + tmp1; + tmp26 = tmp12 - tmp1; + tmp22 = tmp13 + tmp2; + tmp25 = tmp13 - tmp2; + tmp23 = tmp11 + tmp3; + tmp24 = tmp11 - tmp3; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp11 = z1 + z3; + + tmp1 = MULTIPLY(z1 + z2, FIX(1.353318001)); /* c3 */ + tmp2 = MULTIPLY(tmp11, FIX(1.247225013)); /* c5 */ + tmp3 = MULTIPLY(z1 + z4, FIX(1.093201867)); /* c7 */ + tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586)); /* c9 */ + tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ + tmp0 = tmp1 + tmp2 + tmp3 - + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + tmp13 = tmp10 + tmp11 + tmp12 - + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ + tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ + tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ + z1 = MULTIPLY(z3 - z2, FIX(1.407403738)); /* c1 */ + tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282)); /* c1+c11-c9-c13 */ + tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411)); /* c1+c5+c13-c7 */ + z2 += z4; + z1 = MULTIPLY(z2, - FIX(0.666655658)); /* -c11 */ + tmp1 += z1; + tmp3 += z1 + MULTIPLY(z4, FIX(1.065388962)); /* c3+c11+c15-c7 */ + z2 = MULTIPLY(z2, - FIX(1.247225013)); /* -c5 */ + tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809)); /* c1+c5+c9-c13 */ + tmp12 += z2; + z2 = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */ + tmp2 += z2; + tmp3 += z2; + z2 = MULTIPLY(z4 - z3, FIX(0.410524528)); /* c13 */ + tmp10 += z2; + tmp11 += z2; + + /* Final output stage */ + + wsptr[8*0] = (int) DESCALE(tmp20 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*15] = (int) DESCALE(tmp20 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[8*1] = (int) DESCALE(tmp21 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*14] = (int) DESCALE(tmp21 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[8*2] = (int) DESCALE(tmp22 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*13] = (int) DESCALE(tmp22 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[8*3] = (int) DESCALE(tmp23 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*12] = (int) DESCALE(tmp23 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[8*4] = (int) DESCALE(tmp24 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*11] = (int) DESCALE(tmp24 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[8*5] = (int) DESCALE(tmp25 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*10] = (int) DESCALE(tmp25 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[8*6] = (int) DESCALE(tmp26 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*9] = (int) DESCALE(tmp26 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[8*7] = (int) DESCALE(tmp27 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[8*8] = (int) DESCALE(tmp27 - tmp13, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process rows from work array, store into output array. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + wsptr = workspace; + for (ctr = 0; ctr < 16; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = (INT32) wsptr[2]; + z3 = (INT32) wsptr[6]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; + tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + tmp0 = (INT32) wsptr[7]; + tmp1 = (INT32) wsptr[5]; + tmp2 = (INT32) wsptr[3]; + tmp3 = (INT32) wsptr[1]; + + z1 = tmp0 + tmp3; + z2 = tmp1 + tmp2; + z3 = tmp0 + tmp2; + z4 = tmp1 + tmp3; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 7x14 output block. + * + * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[7*14]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z1 <<= CONST_BITS; + z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z2 = MULTIPLY(z4, FIX(1.274162392)); /* c4 */ + z3 = MULTIPLY(z4, FIX(0.314692123)); /* c12 */ + z4 = MULTIPLY(z4, FIX(0.881747734)); /* c8 */ + + tmp10 = z1 + z2; + tmp11 = z1 + z3; + tmp12 = z1 - z4; + + tmp23 = DESCALE(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z3 = MULTIPLY(z1 + z2, FIX(1.105676686)); /* c6 */ + + tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ + tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ + tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + + tmp20 = tmp10 + tmp13; + tmp26 = tmp10 - tmp13; + tmp21 = tmp11 + tmp14; + tmp25 = tmp11 - tmp14; + tmp22 = tmp12 + tmp15; + tmp24 = tmp12 - tmp15; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp13 = z4 << CONST_BITS; + + tmp14 = z1 + z3; + tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607)); /* c3 */ + tmp12 = MULTIPLY(tmp14, FIX(1.197448846)); /* c5 */ + tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */ + tmp14 = MULTIPLY(tmp14, FIX(0.752406978)); /* c9 */ + tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426)); /* c9+c11-c13 */ + z1 -= z2; + tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13; /* c11 */ + tmp16 += tmp15; + z1 += z4; + z4 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */ + tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948)); /* c3-c9-c13 */ + tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773)); /* c3+c5-c13 */ + z4 = MULTIPLY(z3 - z2, FIX(1.405321284)); /* c1 */ + tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */ + tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567)); /* c1+c11-c5 */ + + tmp13 = (z1 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[7*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[7*13] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[7*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[7*12] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[7*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[7*11] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[7*3] = (int) (tmp23 + tmp13); + wsptr[7*10] = (int) (tmp23 - tmp13); + wsptr[7*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[7*9] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[7*5] = (int) DESCALE(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[7*8] = (int) DESCALE(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + wsptr[7*6] = (int) DESCALE(tmp26 + tmp16, CONST_BITS-PASS1_BITS); + wsptr[7*7] = (int) DESCALE(tmp26 - tmp16, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 14 rows from work array, store into output array. + * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14). + */ + wsptr = workspace; + for (ctr = 0; ctr < 14; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp23 = (INT32) wsptr[0]; + tmp23 <<= CONST_BITS; + + z1 = (INT32) wsptr[2]; + z2 = (INT32) wsptr[4]; + z3 = (INT32) wsptr[6]; + + tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734)); /* c4 */ + tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123)); /* c6 */ + tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */ + tmp10 = z1 + z3; + z2 -= tmp10; + tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */ + tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536)); /* c2-c4-c6 */ + tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249)); /* c2+c4+c6 */ + tmp23 += MULTIPLY(z2, FIX(1.414213562)); /* c0 */ + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + + tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347)); /* (c3+c1-c5)/2 */ + tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339)); /* (c3+c5-c1)/2 */ + tmp10 = tmp11 - tmp12; + tmp11 += tmp12; + tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276)); /* -c1 */ + tmp11 += tmp12; + z2 = MULTIPLY(z1 + z3, FIX(0.613604268)); /* c5 */ + tmp10 += z2; + tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693)); /* c3+c1-c5 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[6] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp23, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 7; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 6x12 output block. + * + * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[6*12]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + + z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp1 = MULTIPLY(z1, FIX(1.224744871)); /* c4 */ + + tmp10 = tmp0 + tmp1; + tmp11 = tmp0 - tmp1; + + z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp1 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */ + z1 <<= CONST_BITS; + z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + z2 <<= CONST_BITS; + + tmp12 = z1 - z2; + + tmp21 = tmp0 + tmp12; + tmp24 = tmp0 - tmp12; + + tmp12 = tmp1 + z2; + + tmp20 = tmp10 + tmp12; + tmp25 = tmp10 - tmp12; + + tmp12 = tmp1 - z1 - z2; + + tmp22 = tmp11 + tmp12; + tmp23 = tmp11 - tmp12; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp0 = MULTIPLY(z2, FIX(1.306562965)); /* c3 */ + tmp1 = MULTIPLY(z2, - FIX_0_541196100); /* -c9 */ + + tmp10 = z1 - z4; + z2 -= z3; + tmp12 = MULTIPLY(tmp10 + z2, FIX_0_541196100); /* c9 */ + tmp11 = tmp12 + MULTIPLY(tmp10, FIX_0_765366865); /* c3-c9 */ + tmp14 = tmp12 + MULTIPLY(z2, - FIX_1_847759065); /* c3+c9 */ + + z2 = z1 + z3; + tmp15 = MULTIPLY(z2 + z4, FIX(0.860918669)); /* c7 */ + tmp12 = tmp15 + MULTIPLY(z2, FIX(0.261052384)); /* c5-c7 */ + tmp10 = tmp12 + tmp0 + MULTIPLY(z1, FIX(0.280143716)); /* c1-c5 */ + tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580)); /* -(c7+c11) */ + tmp12 += tmp13 + tmp1 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ + tmp13 += tmp15 - tmp0 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ + tmp15 += tmp1 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + + /* Final output stage */ + + wsptr[6*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[6*11] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[6*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[6*10] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[6*2] = (int) DESCALE(tmp22 + tmp12, CONST_BITS-PASS1_BITS); + wsptr[6*9] = (int) DESCALE(tmp22 - tmp12, CONST_BITS-PASS1_BITS); + wsptr[6*3] = (int) DESCALE(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[6*8] = (int) DESCALE(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[6*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[6*7] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + wsptr[6*5] = (int) DESCALE(tmp25 + tmp15, CONST_BITS-PASS1_BITS); + wsptr[6*6] = (int) DESCALE(tmp25 - tmp15, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 12 rows from work array, store into output array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + wsptr = workspace; + for (ctr = 0; ctr < 12; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp10 = (INT32) wsptr[0]; + tmp10 <<= CONST_BITS; + tmp12 = (INT32) wsptr[4]; + tmp20 = MULTIPLY(tmp12, FIX(0.707106781)); /* c4 */ + tmp11 = tmp10 + tmp20; + tmp21 = tmp10 - tmp20 - tmp20; + tmp20 = (INT32) wsptr[2]; + tmp10 = MULTIPLY(tmp20, FIX(1.224744871)); /* c2 */ + tmp20 = tmp11 + tmp10; + tmp22 = tmp11 - tmp10; + + /* Odd part */ + + z1 = (INT32) wsptr[1]; + z2 = (INT32) wsptr[3]; + z3 = (INT32) wsptr[5]; + tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + z1 <<= CONST_BITS; + z2 <<= CONST_BITS; + z3 <<= CONST_BITS; + tmp10 = tmp11 + z1 + z2; + tmp12 = tmp11 - z2 + z3; + tmp11 = z1 - z2 - z3; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp20 + tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[5] = range_limit[(int) DESCALE(tmp20 - tmp10, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp21 + tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp21 - tmp11, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp22 + tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp22 - tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 6; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 5x10 output block. + * + * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1; + INT32 tmp10, tmp11, tmp12, tmp13, tmp14; + INT32 tmp20, tmp21, tmp22, tmp23, tmp24; + INT32 z1, z2, z3, z4; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[5*10]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + z1 = MULTIPLY(tmp1, FIX(1.144122806)); /* c4 */ + z2 = MULTIPLY(tmp1, FIX(0.437016024)); /* c8 */ + tmp10 = tmp0 + z1; + tmp11 = tmp0 - z2; + + tmp22 = DESCALE(tmp0 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */ + CONST_BITS-PASS1_BITS); + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c6 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */ + + tmp20 = tmp10 + tmp0; + tmp24 = tmp10 - tmp0; + tmp21 = tmp11 + tmp1; + tmp23 = tmp11 - tmp1; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + tmp0 = z2 + z4; + tmp1 = z2 - z4; + + tmp12 = MULTIPLY(tmp1, FIX(0.309016994)); /* (c3-c7)/2 */ + tmp13 = z3 << CONST_BITS; + + z2 = MULTIPLY(tmp0, FIX(0.951056516)); /* (c3+c7)/2 */ + z4 = tmp13 + tmp12; + + tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */ + tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */ + + z2 = MULTIPLY(tmp0, FIX(0.587785252)); /* (c1-c9)/2 */ + z4 = tmp13 - tmp12 - (tmp1 << (CONST_BITS - 1)); + + tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */ + tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */ + + tmp12 = (z1 - tmp1 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[5*0] = (int) DESCALE(tmp20 + tmp10, CONST_BITS-PASS1_BITS); + wsptr[5*9] = (int) DESCALE(tmp20 - tmp10, CONST_BITS-PASS1_BITS); + wsptr[5*1] = (int) DESCALE(tmp21 + tmp11, CONST_BITS-PASS1_BITS); + wsptr[5*8] = (int) DESCALE(tmp21 - tmp11, CONST_BITS-PASS1_BITS); + wsptr[5*2] = (int) (tmp22 + tmp12); + wsptr[5*7] = (int) (tmp22 - tmp12); + wsptr[5*3] = (int) DESCALE(tmp23 + tmp13, CONST_BITS-PASS1_BITS); + wsptr[5*6] = (int) DESCALE(tmp23 - tmp13, CONST_BITS-PASS1_BITS); + wsptr[5*4] = (int) DESCALE(tmp24 + tmp14, CONST_BITS-PASS1_BITS); + wsptr[5*5] = (int) DESCALE(tmp24 - tmp14, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 10 rows from work array, store into output array. + * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10). + */ + wsptr = workspace; + for (ctr = 0; ctr < 10; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp12 = (INT32) wsptr[0]; + tmp12 <<= CONST_BITS; + tmp0 = (INT32) wsptr[2]; + tmp1 = (INT32) wsptr[4]; + z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */ + z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */ + z3 = tmp12 + z2; + tmp10 = z3 + z1; + tmp11 = z3 - z1; + tmp12 -= z2 << 2; + + /* Odd part */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX(0.831253876)); /* c3 */ + tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c1-c3 */ + tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c1+c3 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[4] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp11 - tmp1, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 5; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 4x8 output block. + * + * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp3; + INT32 tmp10, tmp11, tmp12, tmp13; + INT32 z1, z2, z3, z4, z5; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[4*8]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 4; ctr > 0; ctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any column in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * column DCT calculations can be simplified this way. + */ + + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { + /* AC terms all zero */ + int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; + + wsptr[4*0] = dcval; + wsptr[4*1] = dcval; + wsptr[4*2] = dcval; + wsptr[4*3] = dcval; + wsptr[4*4] = dcval; + wsptr[4*5] = dcval; + wsptr[4*6] = dcval; + wsptr[4*7] = dcval; + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + continue; + } + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); + tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + + tmp0 = (z2 + z3) << CONST_BITS; + tmp1 = (z2 - z3) << CONST_BITS; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. + */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + + z1 = tmp0 + tmp3; + z2 = tmp1 + tmp2; + z3 = tmp0 + tmp2; + z4 = tmp1 + tmp3; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + wsptr[4*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); + wsptr[4*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); + wsptr[4*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[4*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); + wsptr[4*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); + wsptr[4*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); + wsptr[4*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[4*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + + inptr++; /* advance pointers to next column */ + quantptr++; + wsptr++; + } + + /* Pass 2: process 8 rows from work array, store into output array. + * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16). + */ + wsptr = workspace; + for (ctr = 0; ctr < 8; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp2 = (INT32) wsptr[2]; + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = (INT32) wsptr[1]; + z3 = (INT32) wsptr[3]; + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z3, - FIX_1_847759065); /* -(c2+c6) */ + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 4; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a reduced-size 3x6 output block. + * + * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + int * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + int workspace[3*6]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12). + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp0 <<= CONST_BITS; + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp10 = MULTIPLY(tmp2, FIX(0.707106781)); /* c4 */ + tmp1 = tmp0 + tmp10; + tmp11 = DESCALE(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS); + tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp0 = MULTIPLY(tmp10, FIX(1.224744871)); /* c2 */ + tmp10 = tmp1 + tmp0; + tmp12 = tmp1 - tmp0; + + /* Odd part */ + + z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */ + tmp0 = tmp1 + ((z1 + z2) << CONST_BITS); + tmp2 = tmp1 + ((z3 - z2) << CONST_BITS); + tmp1 = (z1 - z2 - z3) << PASS1_BITS; + + /* Final output stage */ + + wsptr[3*0] = DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*5] = DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS); + wsptr[3*1] = (int) (tmp11 + tmp1); + wsptr[3*4] = (int) (tmp11 - tmp1); + wsptr[3*2] = DESCALE(tmp12 + tmp2, CONST_BITS-PASS1_BITS); + wsptr[3*3] = DESCALE(tmp12 - tmp2, CONST_BITS-PASS1_BITS); + } + + /* Pass 2: process 6 rows from work array, store into output array. + * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6). + */ + wsptr = workspace; + for (ctr = 0; ctr < 6; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp0 = (INT32) wsptr[0]; + tmp0 <<= CONST_BITS; + tmp2 = (INT32) wsptr[2]; + tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */ + tmp10 = tmp0 + tmp12; + tmp2 = tmp0 - tmp12 - tmp12; + + /* Odd part */ + + tmp12 = (INT32) wsptr[1]; + tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */ + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[2] = range_limit[(int) DESCALE(tmp10 - tmp0, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp2, + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += 3; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 2x4 output block. + * + * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp2, tmp10, tmp12; + INT32 z1, z2, z3; + JCOEFPTR inptr; + ISLOW_MULT_TYPE * quantptr; + INT32 * wsptr; + JSAMPROW outptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + int ctr; + INT32 workspace[2*4]; /* buffers data between passes */ + SHIFT_TEMPS + + /* Pass 1: process columns from input, store into work array. + * 4-point IDCT kernel, + * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT]. + */ + inptr = coef_block; + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + wsptr = workspace; + for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) { + /* Even part */ + + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + + tmp10 = (tmp0 + tmp2) << CONST_BITS; + tmp12 = (tmp0 - tmp2) << CONST_BITS; + + /* Odd part */ + /* Same rotation as in the even part of the 8x8 LL&M IDCT */ + + z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */ + tmp0 = z1 + MULTIPLY(z3, - FIX_1_847759065); /* -(c2+c6) */ + tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */ + + /* Final output stage */ + + wsptr[2*0] = tmp10 + tmp2; + wsptr[2*3] = tmp10 - tmp2; + wsptr[2*1] = tmp12 + tmp0; + wsptr[2*2] = tmp12 - tmp0; + } + + /* Pass 2: process 4 rows from work array, store into output array. */ + + wsptr = workspace; + for (ctr = 0; ctr < 4; ctr++) { + outptr = output_buf[ctr] + output_col; + + /* Even part */ + + tmp10 = wsptr[0]; + + /* Odd part */ + + tmp0 = wsptr[1]; + + /* Final output stage */ + + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, CONST_BITS+3) + & RANGE_MASK]; + outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0, CONST_BITS+3) + & RANGE_MASK]; + + wsptr += 2; /* advance pointer to next row */ + } +} + + +/* + * Perform dequantization and inverse DCT on one block of coefficients, + * producing a 1x2 output block. + * + * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows). + */ + +GLOBAL(void) +jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + INT32 tmp0, tmp10; + ISLOW_MULT_TYPE * quantptr; + JSAMPLE *range_limit = IDCT_range_limit(cinfo); + SHIFT_TEMPS + + /* Process 1 column from input, store into output array. */ + + quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; + + /* Even part */ + + tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]); + + /* Odd part */ + + tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]); + + /* Final output stage */ + + output_buf[0][output_col] = range_limit[(int) DESCALE(tmp10 + tmp0, 3) + & RANGE_MASK]; + output_buf[1][output_col] = range_limit[(int) DESCALE(tmp10 - tmp0, 3) + & RANGE_MASK]; +} + +#endif /* IDCT_SCALING_SUPPORTED */ #endif /* DCT_ISLOW_SUPPORTED */ diff --git a/jmorecfg.h b/jmorecfg.h index 54a7d1c..c88ab1f 100644 --- a/jmorecfg.h +++ b/jmorecfg.h @@ -266,9 +266,10 @@ typedef int boolean; /* Encoder capability options: */ -#undef C_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ +#define C_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ #define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define DCT_SCALING_SUPPORTED /* Input rescaling via DCT? (Requires DCT_ISLOW)*/ #define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */ /* Note: if you selected 12-bit data precision, it is dangerous to turn off * ENTROPY_OPT_SUPPORTED. The standard Huffman tables are only good for 8-bit @@ -282,12 +283,12 @@ typedef int boolean; /* Decoder capability options: */ -#undef D_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ +#define D_ARITH_CODING_SUPPORTED /* Arithmetic coding back end? */ #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ #define D_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */ #define SAVE_MARKERS_SUPPORTED /* jpeg_save_markers() needed? */ #define BLOCK_SMOOTHING_SUPPORTED /* Block smoothing? (Progressive only) */ -#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */ #undef UPSAMPLE_SCALING_SUPPORTED /* Output rescaling at upsample stage? */ #define UPSAMPLE_MERGING_SUPPORTED /* Fast path for sloppy upsampling? */ #define QUANT_1PASS_SUPPORTED /* 1-pass color quantization? */ diff --git a/jpegint.h b/jpegint.h index 95b00d4..3378133 100644 --- a/jpegint.h +++ b/jpegint.h @@ -345,6 +345,7 @@ EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo)); +EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo)); /* Decompression module initialization routines */ EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo)); @@ -358,6 +359,7 @@ EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo)); +EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo)); diff --git a/jpeglib.h b/jpeglib.h index d1be8dd..0b0684c 100644 --- a/jpeglib.h +++ b/jpeglib.h @@ -144,7 +144,8 @@ typedef struct { * Values of 1,2,4,8 are likely to be supported. Note that different * components may receive different IDCT scalings. */ - int DCT_scaled_size; + int DCT_h_scaled_size; + int DCT_v_scaled_size; /* The downsampled dimensions are the component's actual, unpadded number * of samples at the main buffer (preprocessing/compression interface), thus * downsampled_width = ceil(image_width * Hi/Hmax) @@ -291,6 +292,17 @@ struct jpeg_compress_struct { * helper routines to simplify changing parameters. */ + unsigned int scale_num, scale_denom; /* fraction by which to scale image */ + + JDIMENSION jpeg_width; /* scaled JPEG image width */ + JDIMENSION jpeg_height; /* scaled JPEG image height */ + /* Dimensions of actual JPEG image that will be written to file, + * derived from input dimensions by scaling factors above. + * These fields are computed by jpeg_start_compress(). + * You can also use jpeg_calc_jpeg_dimensions() to determine these values + * in advance of calling jpeg_start_compress(). + */ + int data_precision; /* bits of precision in image data */ int num_components; /* # of color components in JPEG image */ @@ -298,14 +310,17 @@ struct jpeg_compress_struct { jpeg_component_info * comp_info; /* comp_info[i] describes component that appears i'th in SOF */ - + JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS]; - /* ptrs to coefficient quantization tables, or NULL if not defined */ - + int q_scale_factor[NUM_QUANT_TBLS]; + /* ptrs to coefficient quantization tables, or NULL if not defined, + * and corresponding scale factors (percentage, initialized 100). + */ + JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS]; JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS]; /* ptrs to Huffman coding tables, or NULL if not defined */ - + UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */ UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */ @@ -364,6 +379,9 @@ struct jpeg_compress_struct { int max_h_samp_factor; /* largest h_samp_factor */ int max_v_samp_factor; /* largest v_samp_factor */ + int min_DCT_h_scaled_size; /* smallest DCT_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_scaled_size of any component */ + JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */ /* The coefficient controller receives data in units of MCU rows as defined * for fully interleaved scans (whether the JPEG file is interleaved or not). @@ -575,7 +593,8 @@ struct jpeg_decompress_struct { int max_h_samp_factor; /* largest h_samp_factor */ int max_v_samp_factor; /* largest v_samp_factor */ - int min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */ + int min_DCT_h_scaled_size; /* smallest DCT_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_scaled_size of any component */ JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */ /* The coefficient controller's input and output progress is measured in @@ -841,6 +860,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo)); #define jpeg_default_colorspace jDefColorspace #define jpeg_set_quality jSetQuality #define jpeg_set_linear_quality jSetLQuality +#define jpeg_default_qtables jDefQTables #define jpeg_add_quant_table jAddQuantTable #define jpeg_quality_scaling jQualityScaling #define jpeg_simple_progression jSimProgress @@ -850,6 +870,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo)); #define jpeg_start_compress jStrtCompress #define jpeg_write_scanlines jWrtScanlines #define jpeg_finish_compress jFinCompress +#define jpeg_calc_jpeg_dimensions jCjpegDimensions #define jpeg_write_raw_data jWrtRawData #define jpeg_write_marker jWrtMarker #define jpeg_write_m_header jWrtMHeader @@ -921,6 +942,8 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality, EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo, int scale_factor, boolean force_baseline)); +EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo, + boolean force_baseline)); EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl, const unsigned int *basic_table, int scale_factor, @@ -940,6 +963,9 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo, JDIMENSION num_lines)); EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo)); +/* Precalculate JPEG dimensions for current compression parameters. */ +EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo)); + /* Replaces jpeg_write_scanlines when writing raw downsampled data. */ EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo, JSAMPIMAGE data, diff --git a/jpegtran.c b/jpegtran.c index 20ef111..44c061a 100644 --- a/jpegtran.c +++ b/jpegtran.c @@ -1,7 +1,7 @@ /* * jpegtran.c * - * Copyright (C) 1995-1997, Thomas G. Lane. + * Copyright (C) 1995-2001, Thomas G. Lane. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -64,8 +64,10 @@ usage (void) #endif #if TRANSFORMS_SUPPORTED fprintf(stderr, "Switches for modifying the image:\n"); + fprintf(stderr, " -crop WxH+X+Y Crop to a rectangular subarea\n"); fprintf(stderr, " -grayscale Reduce to grayscale (omit color data)\n"); fprintf(stderr, " -flip [horizontal|vertical] Mirror image (left-right or top-bottom)\n"); + fprintf(stderr, " -perfect Fail if there is non-transformable edge blocks\n"); fprintf(stderr, " -rotate [90|180|270] Rotate image (degrees clockwise)\n"); fprintf(stderr, " -transpose Transpose image\n"); fprintf(stderr, " -transverse Transverse transpose image\n"); @@ -133,7 +135,9 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, copyoption = JCOPYOPT_DEFAULT; transformoption.transform = JXFORM_NONE; transformoption.trim = FALSE; + transformoption.perfect = FALSE; transformoption.force_grayscale = FALSE; + transformoption.crop = FALSE; cinfo->err->trace_level = 0; /* Scan command line options, adjust parameters */ @@ -160,7 +164,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, exit(EXIT_FAILURE); #endif - } else if (keymatch(arg, "copy", 1)) { + } else if (keymatch(arg, "copy", 2)) { /* Select which extra markers to copy. */ if (++argn >= argc) /* advance to next argument */ usage(); @@ -173,6 +177,20 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, } else usage(); + } else if (keymatch(arg, "crop", 2)) { + /* Perform lossless cropping. */ +#if TRANSFORMS_SUPPORTED + if (++argn >= argc) /* advance to next argument */ + usage(); + if (! jtransform_parse_crop_spec(&transformoption, argv[argn])) { + fprintf(stderr, "%s: bogus -crop argument '%s'\n", + progname, argv[argn]); + exit(EXIT_FAILURE); + } +#else + select_transform(JXFORM_NONE); /* force an error */ +#endif + } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) { /* Enable debug printouts. */ /* On first -d, print version identification */ @@ -233,7 +251,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, usage(); outfilename = argv[argn]; /* save it away for later use */ - } else if (keymatch(arg, "progressive", 1)) { + } else if (keymatch(arg, "perfect", 2)) { + /* Fail if there is any partial edge MCUs that the transform can't + * handle. */ + transformoption.perfect = TRUE; + + } else if (keymatch(arg, "progressive", 2)) { /* Select simple progressive mode. */ #ifdef C_PROGRESSIVE_SUPPORTED simple_progressive = TRUE; @@ -342,8 +365,10 @@ main (int argc, char **argv) jvirt_barray_ptr * src_coef_arrays; jvirt_barray_ptr * dst_coef_arrays; int file_index; - FILE * input_file; - FILE * output_file; + /* We assume all-in-memory processing and can therefore use only a + * single file pointer for sequential input and output operation. + */ + FILE * fp; /* On Mac, fetch a command line. */ #ifdef USE_CCOMMAND @@ -406,24 +431,13 @@ main (int argc, char **argv) /* Open the input file. */ if (file_index < argc) { - if ((input_file = fopen(argv[file_index], READ_BINARY)) == NULL) { - fprintf(stderr, "%s: can't open %s\n", progname, argv[file_index]); + if ((fp = fopen(argv[file_index], READ_BINARY)) == NULL) { + fprintf(stderr, "%s: can't open %s for reading\n", progname, argv[file_index]); exit(EXIT_FAILURE); } } else { /* default input file is stdin */ - input_file = read_stdin(); - } - - /* Open the output file. */ - if (outfilename != NULL) { - if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) { - fprintf(stderr, "%s: can't open %s\n", progname, outfilename); - exit(EXIT_FAILURE); - } - } else { - /* default output file is stdout */ - output_file = write_stdout(); + fp = read_stdin(); } #ifdef PROGRESS_REPORT @@ -431,7 +445,7 @@ main (int argc, char **argv) #endif /* Specify data source for decompression */ - jpeg_stdio_src(&srcinfo, input_file); + jpeg_stdio_src(&srcinfo, fp); /* Enable saving of extra markers that we want to copy */ jcopy_markers_setup(&srcinfo, copyoption); @@ -443,6 +457,15 @@ main (int argc, char **argv) * jpeg_read_coefficients so that memory allocation will be done right. */ #if TRANSFORMS_SUPPORTED + /* Fails right away if -perfect is given and transformation is not perfect. + */ + if (transformoption.perfect && + !jtransform_perfect_transform(srcinfo.image_width, srcinfo.image_height, + srcinfo.max_h_samp_factor * DCTSIZE, srcinfo.max_v_samp_factor * DCTSIZE, + transformoption.transform)) { + fprintf(stderr, "%s: transformation is not perfect\n", progname); + exit(EXIT_FAILURE); + } jtransform_request_workspace(&srcinfo, &transformoption); #endif @@ -463,11 +486,32 @@ main (int argc, char **argv) dst_coef_arrays = src_coef_arrays; #endif + /* Close input file, if we opened it. + * Note: we assume that jpeg_read_coefficients consumed all input + * until JPEG_REACHED_EOI, and that jpeg_finish_decompress will + * only consume more while (! cinfo->inputctl->eoi_reached). + * We cannot call jpeg_finish_decompress here since we still need the + * virtual arrays allocated from the source object for processing. + */ + if (fp != stdin) + fclose(fp); + + /* Open the output file. */ + if (outfilename != NULL) { + if ((fp = fopen(outfilename, WRITE_BINARY)) == NULL) { + fprintf(stderr, "%s: can't open %s for writing\n", progname, outfilename); + exit(EXIT_FAILURE); + } + } else { + /* default output file is stdout */ + fp = write_stdout(); + } + /* Adjust default compression parameters by re-parsing the options */ file_index = parse_switches(&dstinfo, argc, argv, 0, TRUE); /* Specify data destination for compression */ - jpeg_stdio_dest(&dstinfo, output_file); + jpeg_stdio_dest(&dstinfo, fp); /* Start compressor (note no image data is actually written here) */ jpeg_write_coefficients(&dstinfo, dst_coef_arrays); @@ -488,11 +532,9 @@ main (int argc, char **argv) (void) jpeg_finish_decompress(&srcinfo); jpeg_destroy_decompress(&srcinfo); - /* Close files, if we opened them */ - if (input_file != stdin) - fclose(input_file); - if (output_file != stdout) - fclose(output_file); + /* Close output file, if we opened it */ + if (fp != stdout) + fclose(fp); #ifdef PROGRESS_REPORT end_progress_monitor((j_common_ptr) &dstinfo); diff --git a/makefile.cfg b/makefile.cfg index f25e42e..9c7ac9d 100644 --- a/makefile.cfg +++ b/makefile.cfg @@ -79,8 +79,8 @@ LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \ jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \ jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \ jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \ - jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \ - jquant2.c jutils.c jmemmgr.c + jfdctint.c jidctflt.c jidctfst.c jidctint.c jquant1.c \ + jquant2.c jutils.c jmemmgr.c jaricom.c jcarith.c jdarith.c # memmgr back ends: compile only one of these into a working library SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c # source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom @@ -110,19 +110,19 @@ TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \ DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \ $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES) # library object files common to compression and decompression -COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) $(SYSDEPMEM) +COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) jaricom.$(O) $(SYSDEPMEM) # compression library object files CLIBOBJECTS= jcapimin.$(O) jcapistd.$(O) jctrans.$(O) jcparam.$(O) \ jdatadst.$(O) jcinit.$(O) jcmaster.$(O) jcmarker.$(O) jcmainct.$(O) \ jcprepct.$(O) jccoefct.$(O) jccolor.$(O) jcsample.$(O) jchuff.$(O) \ jcphuff.$(O) jcdctmgr.$(O) jfdctfst.$(O) jfdctflt.$(O) \ - jfdctint.$(O) + jfdctint.$(O) jcarith.$(O) # decompression library object files DLIBOBJECTS= jdapimin.$(O) jdapistd.$(O) jdtrans.$(O) jdatasrc.$(O) \ jdmaster.$(O) jdinput.$(O) jdmarker.$(O) jdhuff.$(O) jdphuff.$(O) \ jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) \ - jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \ - jdsample.$(O) jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O) + jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jdsample.$(O) \ + jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O) jdarith.$(O) # These objectfiles are included in libjpeg.a LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS) # object files for sample applications (excluding library files) @@ -288,7 +288,6 @@ jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h je jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h -jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h jquant1.$(O): jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jquant2.$(O): jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jutils.$(O): jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h @@ -317,3 +316,6 @@ rdbmp.$(O): rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h wrbmp.$(O): wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h rdrle.$(O): rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h wrrle.$(O): wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h +jcarith.$(O): jcarith.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h +jdarith.$(O): jdarith.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h +jaricom.$(O): jaricom.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h diff --git a/rdswitch.c b/rdswitch.c index 4f4bb4f..7a839af 100644 --- a/rdswitch.c +++ b/rdswitch.c @@ -9,6 +9,7 @@ * command-line switches. Switches processed here are: * -qtables file Read quantization tables from text file * -scans file Read scan script from text file + * -quality N[,N,...] Set quality ratings * -qslots N[,N,...] Set component quantization table selectors * -sample HxV[,HxV,...] Set component sampling factors */ @@ -70,8 +71,7 @@ read_text_integer (FILE * file, long * result, int * termchar) GLOBAL(boolean) -read_quant_tables (j_compress_ptr cinfo, char * filename, - int scale_factor, boolean force_baseline) +read_quant_tables (j_compress_ptr cinfo, char * filename, boolean force_baseline) /* Read a set of quantization tables from the specified file. * The file is plain ASCII text: decimal numbers with whitespace between. * Comments preceded by '#' may be included in the file. @@ -108,7 +108,8 @@ read_quant_tables (j_compress_ptr cinfo, char * filename, } table[i] = (unsigned int) val; } - jpeg_add_quant_table(cinfo, tblno, table, scale_factor, force_baseline); + jpeg_add_quant_table(cinfo, tblno, table, cinfo->q_scale_factor[tblno], + force_baseline); tblno++; } @@ -262,6 +263,38 @@ bogus: #endif /* C_MULTISCAN_FILES_SUPPORTED */ +GLOBAL(boolean) +set_quality_ratings (j_compress_ptr cinfo, char *arg, boolean force_baseline) +/* Process a quality-ratings parameter string, of the form + * N[,N,...] + * If there are more q-table slots than parameters, the last value is replicated. + */ +{ + int val = 75; /* default value */ + int tblno; + char ch; + + for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) { + if (*arg) { + ch = ','; /* if not set by sscanf, will be ',' */ + if (sscanf(arg, "%d%c", &val, &ch) < 1) + return FALSE; + if (ch != ',') /* syntax check */ + return FALSE; + /* Convert user 0-100 rating to percentage scaling */ + cinfo->q_scale_factor[tblno] = jpeg_quality_scaling(val); + while (*arg && *arg++ != ',') /* advance to next segment of arg string */ + ; + } else { + /* reached end of parameter, set remaining factors to last value */ + cinfo->q_scale_factor[tblno] = jpeg_quality_scaling(val); + } + } + jpeg_default_qtables(cinfo, force_baseline); + return TRUE; +} + + GLOBAL(boolean) set_quant_slots (j_compress_ptr cinfo, char *arg) /* Process a quantization-table-selectors parameter string, of the form diff --git a/transupp.c b/transupp.c index e5ec564..ff0f275 100644 --- a/transupp.c +++ b/transupp.c @@ -1,7 +1,7 @@ /* * transupp.c * - * Copyright (C) 1997, Thomas G. Lane. + * Copyright (C) 1997-2001, Thomas G. Lane. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -20,6 +20,7 @@ #include "jinclude.h" #include "jpeglib.h" #include "transupp.h" /* My own external interface */ +#include /* to declare isdigit() */ #if TRANSFORMS_SUPPORTED @@ -28,7 +29,8 @@ * Lossless image transformation routines. These routines work on DCT * coefficient arrays and thus do not require any lossy decompression * or recompression of the image. - * Thanks to Guido Vollbeding for the initial design and code of this feature. + * Thanks to Guido Vollbeding for the initial design and code of this feature, + * and to Ben Jackson for introducing the cropping feature. * * Horizontal flipping is done in-place, using a single top-to-bottom * pass through the virtual source array. It will thus be much the @@ -42,6 +44,13 @@ * arrays for most of the transforms. That could result in much thrashing * if the image is larger than main memory. * + * If cropping or trimming is involved, the destination arrays may be smaller + * than the source arrays. Note it is not possible to do horizontal flip + * in-place when a nonzero Y crop offset is specified, since we'd have to move + * data from one block row to another but the virtual array manager doesn't + * guarantee we can touch more than one row at a time. So in that case, + * we have to use a separate destination array. + * * Some notes about the operating environment of the individual transform * routines: * 1. Both the source and destination virtual arrays are allocated from the @@ -54,20 +63,65 @@ * and we may as well take that as the effective iMCU size. * 4. When "trim" is in effect, the destination's dimensions will be the * trimmed values but the source's will be untrimmed. - * 5. All the routines assume that the source and destination buffers are + * 5. When "crop" is in effect, the destination's dimensions will be the + * cropped values but the source's will be uncropped. Each transform + * routine is responsible for picking up source data starting at the + * correct X and Y offset for the crop region. (The X and Y offsets + * passed to the transform routines are measured in iMCU blocks of the + * destination.) + * 6. All the routines assume that the source and destination buffers are * padded out to a full iMCU boundary. This is true, although for the * source buffer it is an undocumented property of jdcoefct.c. - * Notes 2,3,4 boil down to this: generally we should use the destination's - * dimensions and ignore the source's. */ LOCAL(void) -do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - jvirt_barray_ptr *src_coef_arrays) -/* Horizontal flip; done in-place, so no separate dest array is required */ +do_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) +/* Crop. This is only used when no rotate/flip is requested with the crop. */ +{ + JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks; + int ci, offset_y; + JBLOCKARRAY src_buffer, dst_buffer; + jpeg_component_info *compptr; + + /* We simply have to copy the right amount of data (the destination's + * image size) starting at the given X and Y offsets in the source. + */ + for (ci = 0; ci < dstinfo->num_components; ci++) { + compptr = dstinfo->comp_info + ci; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; + for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; + dst_blk_y += compptr->v_samp_factor) { + dst_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, + (JDIMENSION) compptr->v_samp_factor, FALSE); + for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { + jcopy_block_row(src_buffer[offset_y] + x_crop_blocks, + dst_buffer[offset_y], + compptr->width_in_blocks); + } + } + } +} + + +LOCAL(void) +do_flip_h_no_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, + jvirt_barray_ptr *src_coef_arrays) +/* Horizontal flip; done in-place, so no separate dest array is required. + * NB: this only works when y_crop_offset is zero. + */ { - JDIMENSION MCU_cols, comp_width, blk_x, blk_y; + JDIMENSION MCU_cols, comp_width, blk_x, blk_y, x_crop_blocks; int ci, k, offset_y; JBLOCKARRAY buffer; JCOEFPTR ptr1, ptr2; @@ -79,17 +133,19 @@ do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, * mirroring by changing the signs of odd-numbered columns. * Partial iMCUs at the right edge are left untouched. */ - MCU_cols = dstinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE); + MCU_cols = srcinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE); for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; comp_width = MCU_cols * compptr->h_samp_factor; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; for (blk_y = 0; blk_y < compptr->height_in_blocks; blk_y += compptr->v_samp_factor) { buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr) srcinfo, src_coef_arrays[ci], blk_y, (JDIMENSION) compptr->v_samp_factor, TRUE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { + /* Do the mirroring */ for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) { ptr1 = buffer[offset_y][blk_x]; ptr2 = buffer[offset_y][comp_width - blk_x - 1]; @@ -105,6 +161,79 @@ do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, *ptr2++ = -temp1; } } + if (x_crop_blocks > 0) { + /* Now left-justify the portion of the data to be kept. + * We can't use a single jcopy_block_row() call because that routine + * depends on memcpy(), whose behavior is unspecified for overlapping + * source and destination areas. Sigh. + */ + for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) { + jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks, + buffer[offset_y] + blk_x, + (JDIMENSION) 1); + } + } + } + } + } +} + + +LOCAL(void) +do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) +/* Horizontal flip in general cropping case */ +{ + JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y; + JDIMENSION x_crop_blocks, y_crop_blocks; + int ci, k, offset_y; + JBLOCKARRAY src_buffer, dst_buffer; + JBLOCKROW src_row_ptr, dst_row_ptr; + JCOEFPTR src_ptr, dst_ptr; + jpeg_component_info *compptr; + + /* Here we must output into a separate array because we can't touch + * different rows of a single virtual array simultaneously. Otherwise, + * this is essentially the same as the routine above. + */ + MCU_cols = srcinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE); + + for (ci = 0; ci < dstinfo->num_components; ci++) { + compptr = dstinfo->comp_info + ci; + comp_width = MCU_cols * compptr->h_samp_factor; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; + for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; + dst_blk_y += compptr->v_samp_factor) { + dst_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, + (JDIMENSION) compptr->v_samp_factor, FALSE); + for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { + dst_row_ptr = dst_buffer[offset_y]; + src_row_ptr = src_buffer[offset_y]; + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Do the mirrorable blocks */ + dst_ptr = dst_row_ptr[dst_blk_x]; + src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; + /* this unrolled loop doesn't need to know which row it's on... */ + for (k = 0; k < DCTSIZE2; k += 2) { + *dst_ptr++ = *src_ptr++; /* copy even column */ + *dst_ptr++ = - *src_ptr++; /* copy odd column with sign change */ + } + } else { + /* Copy last partial block(s) verbatim */ + jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks, + dst_row_ptr + dst_blk_x, + (JDIMENSION) 1); + } + } } } } @@ -113,11 +242,13 @@ do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, jvirt_barray_ptr *src_coef_arrays, jvirt_barray_ptr *dst_coef_arrays) /* Vertical flip */ { JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y; + JDIMENSION x_crop_blocks, y_crop_blocks; int ci, i, j, offset_y; JBLOCKARRAY src_buffer, dst_buffer; JBLOCKROW src_row_ptr, dst_row_ptr; @@ -131,33 +262,38 @@ do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, * of odd-numbered rows. * Partial iMCUs at the bottom edge are copied verbatim. */ - MCU_rows = dstinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE); + MCU_rows = srcinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE); for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; comp_height = MCU_rows * compptr->v_samp_factor; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, (JDIMENSION) compptr->v_samp_factor, TRUE); - if (dst_blk_y < comp_height) { + if (y_crop_blocks + dst_blk_y < comp_height) { /* Row is within the mirrorable area. */ src_buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr) srcinfo, src_coef_arrays[ci], - comp_height - dst_blk_y - (JDIMENSION) compptr->v_samp_factor, + comp_height - y_crop_blocks - dst_blk_y - + (JDIMENSION) compptr->v_samp_factor, (JDIMENSION) compptr->v_samp_factor, FALSE); } else { /* Bottom-edge blocks will be copied verbatim. */ src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], dst_blk_y, + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, (JDIMENSION) compptr->v_samp_factor, FALSE); } for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - if (dst_blk_y < comp_height) { + if (y_crop_blocks + dst_blk_y < comp_height) { /* Row is within the mirrorable area. */ dst_row_ptr = dst_buffer[offset_y]; src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1]; + src_row_ptr += x_crop_blocks; for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { dst_ptr = dst_row_ptr[dst_blk_x]; @@ -173,7 +309,8 @@ do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, } } else { /* Just copy row verbatim. */ - jcopy_block_row(src_buffer[offset_y], dst_buffer[offset_y], + jcopy_block_row(src_buffer[offset_y] + x_crop_blocks, + dst_buffer[offset_y], compptr->width_in_blocks); } } @@ -184,11 +321,12 @@ do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, jvirt_barray_ptr *src_coef_arrays, jvirt_barray_ptr *dst_coef_arrays) /* Transpose source into destination */ { - JDIMENSION dst_blk_x, dst_blk_y; + JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks; int ci, i, j, offset_x, offset_y; JBLOCKARRAY src_buffer, dst_buffer; JCOEFPTR src_ptr, dst_ptr; @@ -201,6 +339,8 @@ do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, */ for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) @@ -210,11 +350,12 @@ do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x += compptr->h_samp_factor) { src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], dst_blk_x, + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, (JDIMENSION) compptr->h_samp_factor, FALSE); for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { - src_ptr = src_buffer[offset_x][dst_blk_y + offset_y]; dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + src_ptr = src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks]; for (i = 0; i < DCTSIZE; i++) for (j = 0; j < DCTSIZE; j++) dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -228,6 +369,7 @@ do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, jvirt_barray_ptr *src_coef_arrays, jvirt_barray_ptr *dst_coef_arrays) /* 90 degree rotation is equivalent to @@ -237,6 +379,7 @@ do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, */ { JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y; + JDIMENSION x_crop_blocks, y_crop_blocks; int ci, i, j, offset_x, offset_y; JBLOCKARRAY src_buffer, dst_buffer; JCOEFPTR src_ptr, dst_ptr; @@ -246,11 +389,13 @@ do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, * at the (output) right edge properly. They just get transposed and * not mirrored. */ - MCU_cols = dstinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE); + MCU_cols = srcinfo->image_height / (dstinfo->max_h_samp_factor * DCTSIZE); for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; comp_width = MCU_cols * compptr->h_samp_factor; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) @@ -259,15 +404,26 @@ do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x += compptr->h_samp_factor) { - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], dst_blk_x, - (JDIMENSION) compptr->h_samp_factor, FALSE); + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Block is within the mirrorable area. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + comp_width - x_crop_blocks - dst_blk_x - + (JDIMENSION) compptr->h_samp_factor, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } else { + /* Edge blocks are transposed but not mirrored. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { - src_ptr = src_buffer[offset_x][dst_blk_y + offset_y]; - if (dst_blk_x < comp_width) { + dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + if (x_crop_blocks + dst_blk_x < comp_width) { /* Block is within the mirrorable area. */ - dst_ptr = dst_buffer[offset_y] - [comp_width - dst_blk_x - offset_x - 1]; + src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] + [dst_blk_y + offset_y + y_crop_blocks]; for (i = 0; i < DCTSIZE; i++) { for (j = 0; j < DCTSIZE; j++) dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -277,7 +433,8 @@ do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, } } else { /* Edge blocks are transposed but not mirrored. */ - dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + src_ptr = src_buffer[offset_x] + [dst_blk_y + offset_y + y_crop_blocks]; for (i = 0; i < DCTSIZE; i++) for (j = 0; j < DCTSIZE; j++) dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -292,6 +449,7 @@ do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, jvirt_barray_ptr *src_coef_arrays, jvirt_barray_ptr *dst_coef_arrays) /* 270 degree rotation is equivalent to @@ -301,6 +459,7 @@ do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, */ { JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y; + JDIMENSION x_crop_blocks, y_crop_blocks; int ci, i, j, offset_x, offset_y; JBLOCKARRAY src_buffer, dst_buffer; JCOEFPTR src_ptr, dst_ptr; @@ -310,11 +469,13 @@ do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, * at the (output) bottom edge properly. They just get transposed and * not mirrored. */ - MCU_rows = dstinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE); + MCU_rows = srcinfo->image_width / (dstinfo->max_v_samp_factor * DCTSIZE); for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; comp_height = MCU_rows * compptr->v_samp_factor; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) @@ -324,14 +485,15 @@ do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x += compptr->h_samp_factor) { src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], dst_blk_x, + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, (JDIMENSION) compptr->h_samp_factor, FALSE); for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; - if (dst_blk_y < comp_height) { + if (y_crop_blocks + dst_blk_y < comp_height) { /* Block is within the mirrorable area. */ src_ptr = src_buffer[offset_x] - [comp_height - dst_blk_y - offset_y - 1]; + [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; for (i = 0; i < DCTSIZE; i++) { for (j = 0; j < DCTSIZE; j++) { dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -341,7 +503,8 @@ do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, } } else { /* Edge blocks are transposed but not mirrored. */ - src_ptr = src_buffer[offset_x][dst_blk_y + offset_y]; + src_ptr = src_buffer[offset_x] + [dst_blk_y + offset_y + y_crop_blocks]; for (i = 0; i < DCTSIZE; i++) for (j = 0; j < DCTSIZE; j++) dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -356,6 +519,7 @@ do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, jvirt_barray_ptr *src_coef_arrays, jvirt_barray_ptr *dst_coef_arrays) /* 180 degree rotation is equivalent to @@ -365,89 +529,93 @@ do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, */ { JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, dst_blk_x, dst_blk_y; + JDIMENSION x_crop_blocks, y_crop_blocks; int ci, i, j, offset_y; JBLOCKARRAY src_buffer, dst_buffer; JBLOCKROW src_row_ptr, dst_row_ptr; JCOEFPTR src_ptr, dst_ptr; jpeg_component_info *compptr; - MCU_cols = dstinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE); - MCU_rows = dstinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE); + MCU_cols = srcinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE); + MCU_rows = srcinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE); for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; comp_width = MCU_cols * compptr->h_samp_factor; comp_height = MCU_rows * compptr->v_samp_factor; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, (JDIMENSION) compptr->v_samp_factor, TRUE); - if (dst_blk_y < comp_height) { + if (y_crop_blocks + dst_blk_y < comp_height) { /* Row is within the vertically mirrorable area. */ src_buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr) srcinfo, src_coef_arrays[ci], - comp_height - dst_blk_y - (JDIMENSION) compptr->v_samp_factor, + comp_height - y_crop_blocks - dst_blk_y - + (JDIMENSION) compptr->v_samp_factor, (JDIMENSION) compptr->v_samp_factor, FALSE); } else { /* Bottom-edge rows are only mirrored horizontally. */ src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], dst_blk_y, + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, (JDIMENSION) compptr->v_samp_factor, FALSE); } for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - if (dst_blk_y < comp_height) { + dst_row_ptr = dst_buffer[offset_y]; + if (y_crop_blocks + dst_blk_y < comp_height) { /* Row is within the mirrorable area. */ - dst_row_ptr = dst_buffer[offset_y]; src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1]; - /* Process the blocks that can be mirrored both ways. */ - for (dst_blk_x = 0; dst_blk_x < comp_width; dst_blk_x++) { + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { dst_ptr = dst_row_ptr[dst_blk_x]; - src_ptr = src_row_ptr[comp_width - dst_blk_x - 1]; - for (i = 0; i < DCTSIZE; i += 2) { - /* For even row, negate every odd column. */ - for (j = 0; j < DCTSIZE; j += 2) { - *dst_ptr++ = *src_ptr++; - *dst_ptr++ = - *src_ptr++; + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Process the blocks that can be mirrored both ways. */ + src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; + for (i = 0; i < DCTSIZE; i += 2) { + /* For even row, negate every odd column. */ + for (j = 0; j < DCTSIZE; j += 2) { + *dst_ptr++ = *src_ptr++; + *dst_ptr++ = - *src_ptr++; + } + /* For odd row, negate every even column. */ + for (j = 0; j < DCTSIZE; j += 2) { + *dst_ptr++ = - *src_ptr++; + *dst_ptr++ = *src_ptr++; + } } - /* For odd row, negate every even column. */ - for (j = 0; j < DCTSIZE; j += 2) { - *dst_ptr++ = - *src_ptr++; - *dst_ptr++ = *src_ptr++; + } else { + /* Any remaining right-edge blocks are only mirrored vertically. */ + src_ptr = src_row_ptr[x_crop_blocks + dst_blk_x]; + for (i = 0; i < DCTSIZE; i += 2) { + for (j = 0; j < DCTSIZE; j++) + *dst_ptr++ = *src_ptr++; + for (j = 0; j < DCTSIZE; j++) + *dst_ptr++ = - *src_ptr++; } } } - /* Any remaining right-edge blocks are only mirrored vertically. */ - for (; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { - dst_ptr = dst_row_ptr[dst_blk_x]; - src_ptr = src_row_ptr[dst_blk_x]; - for (i = 0; i < DCTSIZE; i += 2) { - for (j = 0; j < DCTSIZE; j++) - *dst_ptr++ = *src_ptr++; - for (j = 0; j < DCTSIZE; j++) - *dst_ptr++ = - *src_ptr++; - } - } } else { /* Remaining rows are just mirrored horizontally. */ - dst_row_ptr = dst_buffer[offset_y]; src_row_ptr = src_buffer[offset_y]; - /* Process the blocks that can be mirrored. */ - for (dst_blk_x = 0; dst_blk_x < comp_width; dst_blk_x++) { - dst_ptr = dst_row_ptr[dst_blk_x]; - src_ptr = src_row_ptr[comp_width - dst_blk_x - 1]; - for (i = 0; i < DCTSIZE2; i += 2) { - *dst_ptr++ = *src_ptr++; - *dst_ptr++ = - *src_ptr++; + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Process the blocks that can be mirrored. */ + dst_ptr = dst_row_ptr[dst_blk_x]; + src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; + for (i = 0; i < DCTSIZE2; i += 2) { + *dst_ptr++ = *src_ptr++; + *dst_ptr++ = - *src_ptr++; + } + } else { + /* Any remaining right-edge blocks are only copied. */ + jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks, + dst_row_ptr + dst_blk_x, + (JDIMENSION) 1); } } - /* Any remaining right-edge blocks are only copied. */ - for (; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { - dst_ptr = dst_row_ptr[dst_blk_x]; - src_ptr = src_row_ptr[dst_blk_x]; - for (i = 0; i < DCTSIZE2; i++) - *dst_ptr++ = *src_ptr++; - } } } } @@ -457,6 +625,7 @@ do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, jvirt_barray_ptr *src_coef_arrays, jvirt_barray_ptr *dst_coef_arrays) /* Transverse transpose is equivalent to @@ -470,18 +639,21 @@ do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, */ { JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, dst_blk_x, dst_blk_y; + JDIMENSION x_crop_blocks, y_crop_blocks; int ci, i, j, offset_x, offset_y; JBLOCKARRAY src_buffer, dst_buffer; JCOEFPTR src_ptr, dst_ptr; jpeg_component_info *compptr; - MCU_cols = dstinfo->image_width / (dstinfo->max_h_samp_factor * DCTSIZE); - MCU_rows = dstinfo->image_height / (dstinfo->max_v_samp_factor * DCTSIZE); + MCU_cols = srcinfo->image_height / (dstinfo->max_h_samp_factor * DCTSIZE); + MCU_rows = srcinfo->image_width / (dstinfo->max_v_samp_factor * DCTSIZE); for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; comp_width = MCU_cols * compptr->h_samp_factor; comp_height = MCU_rows * compptr->v_samp_factor; + x_crop_blocks = x_crop_offset * compptr->h_samp_factor; + y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) @@ -490,17 +662,26 @@ do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x += compptr->h_samp_factor) { - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], dst_blk_x, - (JDIMENSION) compptr->h_samp_factor, FALSE); + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Block is within the mirrorable area. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + comp_width - x_crop_blocks - dst_blk_x - + (JDIMENSION) compptr->h_samp_factor, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } else { + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { - if (dst_blk_y < comp_height) { - src_ptr = src_buffer[offset_x] - [comp_height - dst_blk_y - offset_y - 1]; - if (dst_blk_x < comp_width) { + dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + if (y_crop_blocks + dst_blk_y < comp_height) { + if (x_crop_blocks + dst_blk_x < comp_width) { /* Block is within the mirrorable area. */ - dst_ptr = dst_buffer[offset_y] - [comp_width - dst_blk_x - offset_x - 1]; + src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] + [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; for (i = 0; i < DCTSIZE; i++) { for (j = 0; j < DCTSIZE; j++) { dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -516,7 +697,8 @@ do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, } } else { /* Right-edge blocks are mirrored in y only */ - dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + src_ptr = src_buffer[offset_x] + [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; for (i = 0; i < DCTSIZE; i++) { for (j = 0; j < DCTSIZE; j++) { dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -526,11 +708,10 @@ do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, } } } else { - src_ptr = src_buffer[offset_x][dst_blk_y + offset_y]; - if (dst_blk_x < comp_width) { + if (x_crop_blocks + dst_blk_x < comp_width) { /* Bottom-edge blocks are mirrored in x only */ - dst_ptr = dst_buffer[offset_y] - [comp_width - dst_blk_x - offset_x - 1]; + src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] + [dst_blk_y + offset_y + y_crop_blocks]; for (i = 0; i < DCTSIZE; i++) { for (j = 0; j < DCTSIZE; j++) dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -540,7 +721,8 @@ do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, } } else { /* At lower right corner, just transpose, no mirroring */ - dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + src_ptr = src_buffer[offset_x] + [dst_blk_y + offset_y + y_crop_blocks]; for (i = 0; i < DCTSIZE; i++) for (j = 0; j < DCTSIZE; j++) dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; @@ -554,7 +736,115 @@ do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, } +/* Parse an unsigned integer: subroutine for jtransform_parse_crop_spec. + * Returns TRUE if valid integer found, FALSE if not. + * *strptr is advanced over the digit string, and *result is set to its value. + */ + +LOCAL(boolean) +jt_read_integer (const char ** strptr, JDIMENSION * result) +{ + const char * ptr = *strptr; + JDIMENSION val = 0; + + for (; isdigit(*ptr); ptr++) { + val = val * 10 + (JDIMENSION) (*ptr - '0'); + } + *result = val; + if (ptr == *strptr) + return FALSE; /* oops, no digits */ + *strptr = ptr; + return TRUE; +} + + +/* Parse a crop specification (written in X11 geometry style). + * The routine returns TRUE if the spec string is valid, FALSE if not. + * + * The crop spec string should have the format + * x{+-}{+-} + * where width, height, xoffset, and yoffset are unsigned integers. + * Each of the elements can be omitted to indicate a default value. + * (A weakness of this style is that it is not possible to omit xoffset + * while specifying yoffset, since they look alike.) + * + * This code is loosely based on XParseGeometry from the X11 distribution. + */ + +GLOBAL(boolean) +jtransform_parse_crop_spec (jpeg_transform_info *info, const char *spec) +{ + info->crop = FALSE; + info->crop_width_set = JCROP_UNSET; + info->crop_height_set = JCROP_UNSET; + info->crop_xoffset_set = JCROP_UNSET; + info->crop_yoffset_set = JCROP_UNSET; + + if (isdigit(*spec)) { + /* fetch width */ + if (! jt_read_integer(&spec, &info->crop_width)) + return FALSE; + info->crop_width_set = JCROP_POS; + } + if (*spec == 'x' || *spec == 'X') { + /* fetch height */ + spec++; + if (! jt_read_integer(&spec, &info->crop_height)) + return FALSE; + info->crop_height_set = JCROP_POS; + } + if (*spec == '+' || *spec == '-') { + /* fetch xoffset */ + info->crop_xoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS; + spec++; + if (! jt_read_integer(&spec, &info->crop_xoffset)) + return FALSE; + } + if (*spec == '+' || *spec == '-') { + /* fetch yoffset */ + info->crop_yoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS; + spec++; + if (! jt_read_integer(&spec, &info->crop_yoffset)) + return FALSE; + } + /* We had better have gotten to the end of the string. */ + if (*spec != '\0') + return FALSE; + info->crop = TRUE; + return TRUE; +} + + +/* Trim off any partial iMCUs on the indicated destination edge */ + +LOCAL(void) +trim_right_edge (jpeg_transform_info *info, JDIMENSION full_width) +{ + JDIMENSION MCU_cols; + + MCU_cols = info->output_width / (info->max_h_samp_factor * DCTSIZE); + if (MCU_cols > 0 && info->x_crop_offset + MCU_cols == + full_width / (info->max_h_samp_factor * DCTSIZE)) + info->output_width = MCU_cols * (info->max_h_samp_factor * DCTSIZE); +} + +LOCAL(void) +trim_bottom_edge (jpeg_transform_info *info, JDIMENSION full_height) +{ + JDIMENSION MCU_rows; + + MCU_rows = info->output_height / (info->max_v_samp_factor * DCTSIZE); + if (MCU_rows > 0 && info->y_crop_offset + MCU_rows == + full_height / (info->max_v_samp_factor * DCTSIZE)) + info->output_height = MCU_rows * (info->max_v_samp_factor * DCTSIZE); +} + + /* Request any required workspace. + * + * This routine figures out the size that the output image will be + * (which implies that all the transform parameters must be set before + * it is called). * * We allocate the workspace virtual arrays from the source decompression * object, so that all the arrays (both the original data and the workspace) @@ -569,9 +859,13 @@ jtransform_request_workspace (j_decompress_ptr srcinfo, jpeg_transform_info *info) { jvirt_barray_ptr *coef_arrays = NULL; + boolean need_workspace, transpose_it; jpeg_component_info *compptr; - int ci; + JDIMENSION xoffset, yoffset, width_in_iMCUs, height_in_iMCUs; + JDIMENSION width_in_blocks, height_in_blocks; + int ci, h_samp_factor, v_samp_factor; + /* Determine number of components in output image */ if (info->force_grayscale && srcinfo->jpeg_color_space == JCS_YCbCr && srcinfo->num_components == 3) { @@ -581,55 +875,181 @@ jtransform_request_workspace (j_decompress_ptr srcinfo, /* Process all the components */ info->num_components = srcinfo->num_components; } + /* If there is only one output component, force the iMCU size to be 1; + * else use the source iMCU size. (This allows us to do the right thing + * when reducing color to grayscale, and also provides a handy way of + * cleaning up "funny" grayscale images whose sampling factors are not 1x1.) + */ + + switch (info->transform) { + case JXFORM_TRANSPOSE: + case JXFORM_TRANSVERSE: + case JXFORM_ROT_90: + case JXFORM_ROT_270: + info->output_width = srcinfo->image_height; + info->output_height = srcinfo->image_width; + if (info->num_components == 1) { + info->max_h_samp_factor = 1; + info->max_v_samp_factor = 1; + } else { + info->max_h_samp_factor = srcinfo->max_v_samp_factor; + info->max_v_samp_factor = srcinfo->max_h_samp_factor; + } + break; + default: + info->output_width = srcinfo->image_width; + info->output_height = srcinfo->image_height; + if (info->num_components == 1) { + info->max_h_samp_factor = 1; + info->max_v_samp_factor = 1; + } else { + info->max_h_samp_factor = srcinfo->max_h_samp_factor; + info->max_v_samp_factor = srcinfo->max_v_samp_factor; + } + break; + } + + /* If cropping has been requested, compute the crop area's position and + * dimensions, ensuring that its upper left corner falls at an iMCU boundary. + */ + if (info->crop) { + /* Insert default values for unset crop parameters */ + if (info->crop_xoffset_set == JCROP_UNSET) + info->crop_xoffset = 0; /* default to +0 */ + if (info->crop_yoffset_set == JCROP_UNSET) + info->crop_yoffset = 0; /* default to +0 */ + if (info->crop_xoffset >= info->output_width || + info->crop_yoffset >= info->output_height) + ERREXIT(srcinfo, JERR_BAD_CROP_SPEC); + if (info->crop_width_set == JCROP_UNSET) + info->crop_width = info->output_width - info->crop_xoffset; + if (info->crop_height_set == JCROP_UNSET) + info->crop_height = info->output_height - info->crop_yoffset; + /* Ensure parameters are valid */ + if (info->crop_width <= 0 || info->crop_width > info->output_width || + info->crop_height <= 0 || info->crop_height > info->output_height || + info->crop_xoffset > info->output_width - info->crop_width || + info->crop_yoffset > info->output_height - info->crop_height) + ERREXIT(srcinfo, JERR_BAD_CROP_SPEC); + /* Convert negative crop offsets into regular offsets */ + if (info->crop_xoffset_set == JCROP_NEG) + xoffset = info->output_width - info->crop_width - info->crop_xoffset; + else + xoffset = info->crop_xoffset; + if (info->crop_yoffset_set == JCROP_NEG) + yoffset = info->output_height - info->crop_height - info->crop_yoffset; + else + yoffset = info->crop_yoffset; + /* Now adjust so that upper left corner falls at an iMCU boundary */ + info->output_width = + info->crop_width + (xoffset % (info->max_h_samp_factor * DCTSIZE)); + info->output_height = + info->crop_height + (yoffset % (info->max_v_samp_factor * DCTSIZE)); + /* Save x/y offsets measured in iMCUs */ + info->x_crop_offset = xoffset / (info->max_h_samp_factor * DCTSIZE); + info->y_crop_offset = yoffset / (info->max_v_samp_factor * DCTSIZE); + } else { + info->x_crop_offset = 0; + info->y_crop_offset = 0; + } + /* Figure out whether we need workspace arrays, + * and if so whether they are transposed relative to the source. + */ + need_workspace = FALSE; + transpose_it = FALSE; switch (info->transform) { case JXFORM_NONE: + if (info->x_crop_offset != 0 || info->y_crop_offset != 0) + need_workspace = TRUE; + /* No workspace needed if neither cropping nor transforming */ + break; case JXFORM_FLIP_H: - /* Don't need a workspace array */ + if (info->trim) + trim_right_edge(info, srcinfo->image_width); + if (info->y_crop_offset != 0) + need_workspace = TRUE; + /* do_flip_h_no_crop doesn't need a workspace array */ break; case JXFORM_FLIP_V: - case JXFORM_ROT_180: - /* Need workspace arrays having same dimensions as source image. - * Note that we allocate arrays padded out to the next iMCU boundary, - * so that transform routines need not worry about missing edge blocks. - */ - coef_arrays = (jvirt_barray_ptr *) - (*srcinfo->mem->alloc_small) ((j_common_ptr) srcinfo, JPOOL_IMAGE, - SIZEOF(jvirt_barray_ptr) * info->num_components); - for (ci = 0; ci < info->num_components; ci++) { - compptr = srcinfo->comp_info + ci; - coef_arrays[ci] = (*srcinfo->mem->request_virt_barray) - ((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE, - (JDIMENSION) jround_up((long) compptr->width_in_blocks, - (long) compptr->h_samp_factor), - (JDIMENSION) jround_up((long) compptr->height_in_blocks, - (long) compptr->v_samp_factor), - (JDIMENSION) compptr->v_samp_factor); - } + if (info->trim) + trim_bottom_edge(info, srcinfo->image_height); + /* Need workspace arrays having same dimensions as source image. */ + need_workspace = TRUE; break; case JXFORM_TRANSPOSE: + /* transpose does NOT have to trim anything */ + /* Need workspace arrays having transposed dimensions. */ + need_workspace = TRUE; + transpose_it = TRUE; + break; case JXFORM_TRANSVERSE: + if (info->trim) { + trim_right_edge(info, srcinfo->image_height); + trim_bottom_edge(info, srcinfo->image_width); + } + /* Need workspace arrays having transposed dimensions. */ + need_workspace = TRUE; + transpose_it = TRUE; + break; case JXFORM_ROT_90: + if (info->trim) + trim_right_edge(info, srcinfo->image_height); + /* Need workspace arrays having transposed dimensions. */ + need_workspace = TRUE; + transpose_it = TRUE; + break; + case JXFORM_ROT_180: + if (info->trim) { + trim_right_edge(info, srcinfo->image_width); + trim_bottom_edge(info, srcinfo->image_height); + } + /* Need workspace arrays having same dimensions as source image. */ + need_workspace = TRUE; + break; case JXFORM_ROT_270: - /* Need workspace arrays having transposed dimensions. - * Note that we allocate arrays padded out to the next iMCU boundary, - * so that transform routines need not worry about missing edge blocks. - */ + if (info->trim) + trim_bottom_edge(info, srcinfo->image_width); + /* Need workspace arrays having transposed dimensions. */ + need_workspace = TRUE; + transpose_it = TRUE; + break; + } + + /* Allocate workspace if needed. + * Note that we allocate arrays padded out to the next iMCU boundary, + * so that transform routines need not worry about missing edge blocks. + */ + if (need_workspace) { coef_arrays = (jvirt_barray_ptr *) (*srcinfo->mem->alloc_small) ((j_common_ptr) srcinfo, JPOOL_IMAGE, - SIZEOF(jvirt_barray_ptr) * info->num_components); + SIZEOF(jvirt_barray_ptr) * info->num_components); + width_in_iMCUs = (JDIMENSION) + jdiv_round_up((long) info->output_width, + (long) (info->max_h_samp_factor * DCTSIZE)); + height_in_iMCUs = (JDIMENSION) + jdiv_round_up((long) info->output_height, + (long) (info->max_v_samp_factor * DCTSIZE)); for (ci = 0; ci < info->num_components; ci++) { compptr = srcinfo->comp_info + ci; + if (info->num_components == 1) { + /* we're going to force samp factors to 1x1 in this case */ + h_samp_factor = v_samp_factor = 1; + } else if (transpose_it) { + h_samp_factor = compptr->v_samp_factor; + v_samp_factor = compptr->h_samp_factor; + } else { + h_samp_factor = compptr->h_samp_factor; + v_samp_factor = compptr->v_samp_factor; + } + width_in_blocks = width_in_iMCUs * h_samp_factor; + height_in_blocks = height_in_iMCUs * v_samp_factor; coef_arrays[ci] = (*srcinfo->mem->request_virt_barray) ((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE, - (JDIMENSION) jround_up((long) compptr->height_in_blocks, - (long) compptr->v_samp_factor), - (JDIMENSION) jround_up((long) compptr->width_in_blocks, - (long) compptr->h_samp_factor), - (JDIMENSION) compptr->h_samp_factor); + width_in_blocks, height_in_blocks, (JDIMENSION) v_samp_factor); } - break; } + info->workspace_coef_arrays = coef_arrays; } @@ -642,14 +1062,8 @@ transpose_critical_parameters (j_compress_ptr dstinfo) int tblno, i, j, ci, itemp; jpeg_component_info *compptr; JQUANT_TBL *qtblptr; - JDIMENSION dtemp; UINT16 qtemp; - /* Transpose basic image dimensions */ - dtemp = dstinfo->image_width; - dstinfo->image_width = dstinfo->image_height; - dstinfo->image_height = dtemp; - /* Transpose sampling factors */ for (ci = 0; ci < dstinfo->num_components; ci++) { compptr = dstinfo->comp_info + ci; @@ -674,46 +1088,159 @@ transpose_critical_parameters (j_compress_ptr dstinfo) } -/* Trim off any partial iMCUs on the indicated destination edge */ +/* Adjust Exif image parameters. + * + * We try to adjust the Tags ExifImageWidth and ExifImageHeight if possible. + */ LOCAL(void) -trim_right_edge (j_compress_ptr dstinfo) +adjust_exif_parameters (JOCTET FAR * data, unsigned int length, + JDIMENSION new_width, JDIMENSION new_height) { - int ci, max_h_samp_factor; - JDIMENSION MCU_cols; + boolean is_motorola; /* Flag for byte order */ + unsigned int number_of_tags, tagnum; + unsigned int firstoffset, offset; + JDIMENSION new_value; + + if (length < 12) return; /* Length of an IFD entry */ + + /* Discover byte order */ + if (GETJOCTET(data[0]) == 0x49 && GETJOCTET(data[1]) == 0x49) + is_motorola = FALSE; + else if (GETJOCTET(data[0]) == 0x4D && GETJOCTET(data[1]) == 0x4D) + is_motorola = TRUE; + else + return; + + /* Check Tag Mark */ + if (is_motorola) { + if (GETJOCTET(data[2]) != 0) return; + if (GETJOCTET(data[3]) != 0x2A) return; + } else { + if (GETJOCTET(data[3]) != 0) return; + if (GETJOCTET(data[2]) != 0x2A) return; + } - /* We have to compute max_h_samp_factor ourselves, - * because it hasn't been set yet in the destination - * (and we don't want to use the source's value). - */ - max_h_samp_factor = 1; - for (ci = 0; ci < dstinfo->num_components; ci++) { - int h_samp_factor = dstinfo->comp_info[ci].h_samp_factor; - max_h_samp_factor = MAX(max_h_samp_factor, h_samp_factor); + /* Get first IFD offset (offset to IFD0) */ + if (is_motorola) { + if (GETJOCTET(data[4]) != 0) return; + if (GETJOCTET(data[5]) != 0) return; + firstoffset = GETJOCTET(data[6]); + firstoffset <<= 8; + firstoffset += GETJOCTET(data[7]); + } else { + if (GETJOCTET(data[7]) != 0) return; + if (GETJOCTET(data[6]) != 0) return; + firstoffset = GETJOCTET(data[5]); + firstoffset <<= 8; + firstoffset += GETJOCTET(data[4]); } - MCU_cols = dstinfo->image_width / (max_h_samp_factor * DCTSIZE); - if (MCU_cols > 0) /* can't trim to 0 pixels */ - dstinfo->image_width = MCU_cols * (max_h_samp_factor * DCTSIZE); -} + if (firstoffset > length - 2) return; /* check end of data segment */ -LOCAL(void) -trim_bottom_edge (j_compress_ptr dstinfo) -{ - int ci, max_v_samp_factor; - JDIMENSION MCU_rows; + /* Get the number of directory entries contained in this IFD */ + if (is_motorola) { + number_of_tags = GETJOCTET(data[firstoffset]); + number_of_tags <<= 8; + number_of_tags += GETJOCTET(data[firstoffset+1]); + } else { + number_of_tags = GETJOCTET(data[firstoffset+1]); + number_of_tags <<= 8; + number_of_tags += GETJOCTET(data[firstoffset]); + } + if (number_of_tags == 0) return; + firstoffset += 2; + + /* Search for ExifSubIFD offset Tag in IFD0 */ + for (;;) { + if (firstoffset > length - 12) return; /* check end of data segment */ + /* Get Tag number */ + if (is_motorola) { + tagnum = GETJOCTET(data[firstoffset]); + tagnum <<= 8; + tagnum += GETJOCTET(data[firstoffset+1]); + } else { + tagnum = GETJOCTET(data[firstoffset+1]); + tagnum <<= 8; + tagnum += GETJOCTET(data[firstoffset]); + } + if (tagnum == 0x8769) break; /* found ExifSubIFD offset Tag */ + if (--number_of_tags == 0) return; + firstoffset += 12; + } - /* We have to compute max_v_samp_factor ourselves, - * because it hasn't been set yet in the destination - * (and we don't want to use the source's value). - */ - max_v_samp_factor = 1; - for (ci = 0; ci < dstinfo->num_components; ci++) { - int v_samp_factor = dstinfo->comp_info[ci].v_samp_factor; - max_v_samp_factor = MAX(max_v_samp_factor, v_samp_factor); + /* Get the ExifSubIFD offset */ + if (is_motorola) { + if (GETJOCTET(data[firstoffset+8]) != 0) return; + if (GETJOCTET(data[firstoffset+9]) != 0) return; + offset = GETJOCTET(data[firstoffset+10]); + offset <<= 8; + offset += GETJOCTET(data[firstoffset+11]); + } else { + if (GETJOCTET(data[firstoffset+11]) != 0) return; + if (GETJOCTET(data[firstoffset+10]) != 0) return; + offset = GETJOCTET(data[firstoffset+9]); + offset <<= 8; + offset += GETJOCTET(data[firstoffset+8]); } - MCU_rows = dstinfo->image_height / (max_v_samp_factor * DCTSIZE); - if (MCU_rows > 0) /* can't trim to 0 pixels */ - dstinfo->image_height = MCU_rows * (max_v_samp_factor * DCTSIZE); + if (offset > length - 2) return; /* check end of data segment */ + + /* Get the number of directory entries contained in this SubIFD */ + if (is_motorola) { + number_of_tags = GETJOCTET(data[offset]); + number_of_tags <<= 8; + number_of_tags += GETJOCTET(data[offset+1]); + } else { + number_of_tags = GETJOCTET(data[offset+1]); + number_of_tags <<= 8; + number_of_tags += GETJOCTET(data[offset]); + } + if (number_of_tags < 2) return; + offset += 2; + + /* Search for ExifImageWidth and ExifImageHeight Tags in this SubIFD */ + do { + if (offset > length - 12) return; /* check end of data segment */ + /* Get Tag number */ + if (is_motorola) { + tagnum = GETJOCTET(data[offset]); + tagnum <<= 8; + tagnum += GETJOCTET(data[offset+1]); + } else { + tagnum = GETJOCTET(data[offset+1]); + tagnum <<= 8; + tagnum += GETJOCTET(data[offset]); + } + if (tagnum == 0xA002 || tagnum == 0xA003) { + if (tagnum == 0xA002) + new_value = new_width; /* ExifImageWidth Tag */ + else + new_value = new_height; /* ExifImageHeight Tag */ + if (is_motorola) { + data[offset+2] = 0; /* Format = unsigned long (4 octets) */ + data[offset+3] = 4; + data[offset+4] = 0; /* Number Of Components = 1 */ + data[offset+5] = 0; + data[offset+6] = 0; + data[offset+7] = 1; + data[offset+8] = 0; + data[offset+9] = 0; + data[offset+10] = (JOCTET)((new_value >> 8) & 0xFF); + data[offset+11] = (JOCTET)(new_value & 0xFF); + } else { + data[offset+2] = 4; /* Format = unsigned long (4 octets) */ + data[offset+3] = 0; + data[offset+4] = 1; /* Number Of Components = 1 */ + data[offset+5] = 0; + data[offset+6] = 0; + data[offset+7] = 0; + data[offset+8] = (JOCTET)(new_value & 0xFF); + data[offset+9] = (JOCTET)((new_value >> 8) & 0xFF); + data[offset+10] = 0; + data[offset+11] = 0; + } + } + offset += 12; + } while (--number_of_tags); } @@ -736,18 +1263,22 @@ jtransform_adjust_parameters (j_decompress_ptr srcinfo, { /* If force-to-grayscale is requested, adjust destination parameters */ if (info->force_grayscale) { - /* We use jpeg_set_colorspace to make sure subsidiary settings get fixed - * properly. Among other things, the target h_samp_factor & v_samp_factor - * will get set to 1, which typically won't match the source. - * In fact we do this even if the source is already grayscale; that - * provides an easy way of coercing a grayscale JPEG with funny sampling - * factors to the customary 1,1. (Some decoders fail on other factors.) + /* First, ensure we have YCbCr or grayscale data, and that the source's + * Y channel is full resolution. (No reasonable person would make Y + * be less than full resolution, so actually coping with that case + * isn't worth extra code space. But we check it to avoid crashing.) */ - if ((dstinfo->jpeg_color_space == JCS_YCbCr && - dstinfo->num_components == 3) || - (dstinfo->jpeg_color_space == JCS_GRAYSCALE && - dstinfo->num_components == 1)) { - /* We have to preserve the source's quantization table number. */ + if (((dstinfo->jpeg_color_space == JCS_YCbCr && + dstinfo->num_components == 3) || + (dstinfo->jpeg_color_space == JCS_GRAYSCALE && + dstinfo->num_components == 1)) && + srcinfo->comp_info[0].h_samp_factor == srcinfo->max_h_samp_factor && + srcinfo->comp_info[0].v_samp_factor == srcinfo->max_v_samp_factor) { + /* We use jpeg_set_colorspace to make sure subsidiary settings get fixed + * properly. Among other things, it sets the target h_samp_factor & + * v_samp_factor to 1, which typically won't match the source. + * We have to preserve the source's quantization table number, however. + */ int sv_quant_tbl_no = dstinfo->comp_info[0].quant_tbl_no; jpeg_set_colorspace(dstinfo, JCS_GRAYSCALE); dstinfo->comp_info[0].quant_tbl_no = sv_quant_tbl_no; @@ -755,50 +1286,52 @@ jtransform_adjust_parameters (j_decompress_ptr srcinfo, /* Sorry, can't do it */ ERREXIT(dstinfo, JERR_CONVERSION_NOTIMPL); } + } else if (info->num_components == 1) { + /* For a single-component source, we force the destination sampling factors + * to 1x1, with or without force_grayscale. This is useful because some + * decoders choke on grayscale images with other sampling factors. + */ + dstinfo->comp_info[0].h_samp_factor = 1; + dstinfo->comp_info[0].v_samp_factor = 1; } - /* Correct the destination's image dimensions etc if necessary */ + /* Correct the destination's image dimensions as necessary + * for crop and rotate/flip operations. + */ + dstinfo->image_width = info->output_width; + dstinfo->image_height = info->output_height; + + /* Transpose destination image parameters */ switch (info->transform) { - case JXFORM_NONE: - /* Nothing to do */ - break; - case JXFORM_FLIP_H: - if (info->trim) - trim_right_edge(dstinfo); - break; - case JXFORM_FLIP_V: - if (info->trim) - trim_bottom_edge(dstinfo); - break; case JXFORM_TRANSPOSE: - transpose_critical_parameters(dstinfo); - /* transpose does NOT have to trim anything */ - break; case JXFORM_TRANSVERSE: - transpose_critical_parameters(dstinfo); - if (info->trim) { - trim_right_edge(dstinfo); - trim_bottom_edge(dstinfo); - } - break; case JXFORM_ROT_90: - transpose_critical_parameters(dstinfo); - if (info->trim) - trim_right_edge(dstinfo); - break; - case JXFORM_ROT_180: - if (info->trim) { - trim_right_edge(dstinfo); - trim_bottom_edge(dstinfo); - } - break; case JXFORM_ROT_270: transpose_critical_parameters(dstinfo); - if (info->trim) - trim_bottom_edge(dstinfo); break; } + /* Adjust Exif properties */ + if (srcinfo->marker_list != NULL && + srcinfo->marker_list->marker == JPEG_APP0+1 && + srcinfo->marker_list->data_length >= 6 && + GETJOCTET(srcinfo->marker_list->data[0]) == 0x45 && + GETJOCTET(srcinfo->marker_list->data[1]) == 0x78 && + GETJOCTET(srcinfo->marker_list->data[2]) == 0x69 && + GETJOCTET(srcinfo->marker_list->data[3]) == 0x66 && + GETJOCTET(srcinfo->marker_list->data[4]) == 0 && + GETJOCTET(srcinfo->marker_list->data[5]) == 0) { + /* Suppress output of JFIF marker */ + dstinfo->write_JFIF_header = FALSE; + /* Adjust Exif image parameters */ + if (dstinfo->image_width != srcinfo->image_width || + dstinfo->image_height != srcinfo->image_height) + /* Align data segment to start of TIFF structure for parsing */ + adjust_exif_parameters(srcinfo->marker_list->data + 6, + srcinfo->marker_list->data_length - 6, + dstinfo->image_width, dstinfo->image_height); + } + /* Return the appropriate output data set */ if (info->workspace_coef_arrays != NULL) return info->workspace_coef_arrays; @@ -816,40 +1349,108 @@ jtransform_adjust_parameters (j_decompress_ptr srcinfo, */ GLOBAL(void) -jtransform_execute_transformation (j_decompress_ptr srcinfo, - j_compress_ptr dstinfo, - jvirt_barray_ptr *src_coef_arrays, - jpeg_transform_info *info) +jtransform_execute_transform (j_decompress_ptr srcinfo, + j_compress_ptr dstinfo, + jvirt_barray_ptr *src_coef_arrays, + jpeg_transform_info *info) { jvirt_barray_ptr *dst_coef_arrays = info->workspace_coef_arrays; + /* Note: conditions tested here should match those in switch statement + * in jtransform_request_workspace() + */ switch (info->transform) { case JXFORM_NONE: + if (info->x_crop_offset != 0 || info->y_crop_offset != 0) + do_crop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); break; case JXFORM_FLIP_H: - do_flip_h(srcinfo, dstinfo, src_coef_arrays); + if (info->y_crop_offset != 0) + do_flip_h(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); + else + do_flip_h_no_crop(srcinfo, dstinfo, info->x_crop_offset, + src_coef_arrays); break; case JXFORM_FLIP_V: - do_flip_v(srcinfo, dstinfo, src_coef_arrays, dst_coef_arrays); + do_flip_v(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); break; case JXFORM_TRANSPOSE: - do_transpose(srcinfo, dstinfo, src_coef_arrays, dst_coef_arrays); + do_transpose(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); break; case JXFORM_TRANSVERSE: - do_transverse(srcinfo, dstinfo, src_coef_arrays, dst_coef_arrays); + do_transverse(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); break; case JXFORM_ROT_90: - do_rot_90(srcinfo, dstinfo, src_coef_arrays, dst_coef_arrays); + do_rot_90(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); break; case JXFORM_ROT_180: - do_rot_180(srcinfo, dstinfo, src_coef_arrays, dst_coef_arrays); + do_rot_180(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); break; case JXFORM_ROT_270: - do_rot_270(srcinfo, dstinfo, src_coef_arrays, dst_coef_arrays); + do_rot_270(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, + src_coef_arrays, dst_coef_arrays); break; } } +/* jtransform_perfect_transform + * + * Determine whether lossless transformation is perfectly + * possible for a specified image and transformation. + * + * Inputs: + * image_width, image_height: source image dimensions. + * MCU_width, MCU_height: pixel dimensions of MCU. + * transform: transformation identifier. + * Parameter sources from initialized jpeg_struct + * (after reading source header): + * image_width = cinfo.image_width + * image_height = cinfo.image_height + * MCU_width = cinfo.max_h_samp_factor * DCTSIZE + * MCU_height = cinfo.max_v_samp_factor * DCTSIZE + * Result: + * TRUE = perfect transformation possible + * FALSE = perfect transformation not possible + * (may use custom action then) + */ + +GLOBAL(boolean) +jtransform_perfect_transform(JDIMENSION image_width, JDIMENSION image_height, + int MCU_width, int MCU_height, + JXFORM_CODE transform) +{ + boolean result = TRUE; /* initialize TRUE */ + + switch (transform) { + case JXFORM_FLIP_H: + case JXFORM_ROT_270: + if (image_width % (JDIMENSION) MCU_width) + result = FALSE; + break; + case JXFORM_FLIP_V: + case JXFORM_ROT_90: + if (image_height % (JDIMENSION) MCU_height) + result = FALSE; + break; + case JXFORM_TRANSVERSE: + case JXFORM_ROT_180: + if (image_width % (JDIMENSION) MCU_width) + result = FALSE; + if (image_height % (JDIMENSION) MCU_height) + result = FALSE; + break; + } + + return result; +} + #endif /* TRANSFORMS_SUPPORTED */ diff --git a/transupp.h b/transupp.h index 5c2d32a..981b1ce 100644 --- a/transupp.h +++ b/transupp.h @@ -1,7 +1,7 @@ /* * transupp.h * - * Copyright (C) 1997, Thomas G. Lane. + * Copyright (C) 1997-2001, Thomas G. Lane. * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -22,32 +22,6 @@ #define TRANSFORMS_SUPPORTED 1 /* 0 disables transform code */ #endif -/* Short forms of external names for systems with brain-damaged linkers. */ - -#ifdef NEED_SHORT_EXTERNAL_NAMES -#define jtransform_request_workspace jTrRequest -#define jtransform_adjust_parameters jTrAdjust -#define jtransform_execute_transformation jTrExec -#define jcopy_markers_setup jCMrkSetup -#define jcopy_markers_execute jCMrkExec -#endif /* NEED_SHORT_EXTERNAL_NAMES */ - - -/* - * Codes for supported types of image transformations. - */ - -typedef enum { - JXFORM_NONE, /* no transformation */ - JXFORM_FLIP_H, /* horizontal flip */ - JXFORM_FLIP_V, /* vertical flip */ - JXFORM_TRANSPOSE, /* transpose across UL-to-LR axis */ - JXFORM_TRANSVERSE, /* transpose across UR-to-LL axis */ - JXFORM_ROT_90, /* 90-degree clockwise rotation */ - JXFORM_ROT_180, /* 180-degree rotation */ - JXFORM_ROT_270 /* 270-degree clockwise (or 90 ccw) */ -} JXFORM_CODE; - /* * Although rotating and flipping data expressed as DCT coefficients is not * hard, there is an asymmetry in the JPEG format specification for images @@ -75,6 +49,19 @@ typedef enum { * (For example, -rot 270 -trim trims only the bottom edge, but -rot 90 -trim * followed by -rot 180 -trim trims both edges.) * + * We also offer a lossless-crop option, which discards data outside a given + * image region but losslessly preserves what is inside. Like the rotate and + * flip transforms, lossless crop is restricted by the JPEG format: the upper + * left corner of the selected region must fall on an iMCU boundary. If this + * does not hold for the given crop parameters, we silently move the upper left + * corner up and/or left to make it so, simultaneously increasing the region + * dimensions to keep the lower right crop corner unchanged. (Thus, the + * output image covers at least the requested region, but may cover more.) + * + * If both crop and a rotate/flip transform are requested, the crop is applied + * last --- that is, the crop region is specified in terms of the destination + * image. + * * We also offer a "force to grayscale" option, which simply discards the * chrominance channels of a YCbCr image. This is lossless in the sense that * the luminance channel is preserved exactly. It's not the same kind of @@ -83,20 +70,89 @@ typedef enum { * be aware of the option to know how many components to work on. */ + +/* Short forms of external names for systems with brain-damaged linkers. */ + +#ifdef NEED_SHORT_EXTERNAL_NAMES +#define jtransform_parse_crop_spec jTrParCrop +#define jtransform_request_workspace jTrRequest +#define jtransform_adjust_parameters jTrAdjust +#define jtransform_execute_transform jTrExec +#define jtransform_perfect_transform jTrPerfect +#define jcopy_markers_setup jCMrkSetup +#define jcopy_markers_execute jCMrkExec +#endif /* NEED_SHORT_EXTERNAL_NAMES */ + + +/* + * Codes for supported types of image transformations. + */ + +typedef enum { + JXFORM_NONE, /* no transformation */ + JXFORM_FLIP_H, /* horizontal flip */ + JXFORM_FLIP_V, /* vertical flip */ + JXFORM_TRANSPOSE, /* transpose across UL-to-LR axis */ + JXFORM_TRANSVERSE, /* transpose across UR-to-LL axis */ + JXFORM_ROT_90, /* 90-degree clockwise rotation */ + JXFORM_ROT_180, /* 180-degree rotation */ + JXFORM_ROT_270 /* 270-degree clockwise (or 90 ccw) */ +} JXFORM_CODE; + +/* + * Codes for crop parameters, which can individually be unspecified, + * positive, or negative. (Negative width or height makes no sense, though.) + */ + +typedef enum { + JCROP_UNSET, + JCROP_POS, + JCROP_NEG +} JCROP_CODE; + +/* + * Transform parameters struct. + * NB: application must not change any elements of this struct after + * calling jtransform_request_workspace. + */ + typedef struct { /* Options: set by caller */ JXFORM_CODE transform; /* image transform operator */ + boolean perfect; /* if TRUE, fail if partial MCUs are requested */ boolean trim; /* if TRUE, trim partial MCUs as needed */ boolean force_grayscale; /* if TRUE, convert color image to grayscale */ + boolean crop; /* if TRUE, crop source image */ + + /* Crop parameters: application need not set these unless crop is TRUE. + * These can be filled in by jtransform_parse_crop_spec(). + */ + JDIMENSION crop_width; /* Width of selected region */ + JCROP_CODE crop_width_set; + JDIMENSION crop_height; /* Height of selected region */ + JCROP_CODE crop_height_set; + JDIMENSION crop_xoffset; /* X offset of selected region */ + JCROP_CODE crop_xoffset_set; /* (negative measures from right edge) */ + JDIMENSION crop_yoffset; /* Y offset of selected region */ + JCROP_CODE crop_yoffset_set; /* (negative measures from bottom edge) */ /* Internal workspace: caller should not touch these */ int num_components; /* # of components in workspace */ jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */ + JDIMENSION output_width; /* cropped destination dimensions */ + JDIMENSION output_height; + JDIMENSION x_crop_offset; /* destination crop offsets measured in iMCUs */ + JDIMENSION y_crop_offset; + int max_h_samp_factor; /* destination iMCU size */ + int max_v_samp_factor; } jpeg_transform_info; #if TRANSFORMS_SUPPORTED +/* Parse a crop specification (written in X11 geometry style) */ +EXTERN(boolean) jtransform_parse_crop_spec + JPP((jpeg_transform_info *info, const char *spec)); /* Request any required workspace */ EXTERN(void) jtransform_request_workspace JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info)); @@ -106,10 +162,24 @@ EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info)); /* Execute the actual transformation, if any */ -EXTERN(void) jtransform_execute_transformation +EXTERN(void) jtransform_execute_transform JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo, jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info)); +/* Determine whether lossless transformation is perfectly + * possible for a specified image and transformation. + */ +EXTERN(boolean) jtransform_perfect_transform + JPP((JDIMENSION image_width, JDIMENSION image_height, + int MCU_width, int MCU_height, + JXFORM_CODE transform)); + +/* jtransform_execute_transform used to be called + * jtransform_execute_transformation, but some compilers complain about + * routine names that long. This macro is here to avoid breaking any + * old source code that uses the original name... + */ +#define jtransform_execute_transformation jtransform_execute_transform #endif /* TRANSFORMS_SUPPORTED */ -- cgit v1.2.1