1  /********************************************************************** <BR>


2  This file is part of Crack dot Com's free source code release of


3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for


4  information about compiling & licensing issues visit this URL</a>


5  <PRE> If that doesn't help, contact Jonathan Clark at


6  golgotha_source@usa.net (Subject should have "GOLG" in it)


7  ***********************************************************************/


8 


9  /*


10  * jidctint.c


11  *


12  * Copyright (C) 19911996, Thomas G. Lane.


13  * This file is part of the Independent JPEG Group's software.


14  * For conditions of distribution and use, see the accompanying README file.


15  *


16  * This file contains a slowbutaccurate integer implementation of the


17  * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine


18  * must also perform dequantization of the input coefficients.


19  *


20  * A 2D IDCT can be done by 1D IDCT on each column followed by 1D IDCT


21  * on each row (or vice versa, but it's more convenient to emit a row at


22  * a time). Direct algorithms are also available, but they are much more


23  * complex and seem not to be any faster when reduced to code.


24  *


25  * This implementation is based on an algorithm described in


26  * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1D DCT


27  * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,


28  * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988991.


29  * The primary algorithm described there uses 11 multiplies and 29 adds.


30  * We use their alternate method with 12 multiplies and 32 adds.


31  * The advantage of this method is that no data path contains more than one


32  * multiplication; this allows a very simple and accurate implementation in


33  * scaled fixedpoint arithmetic, with a minimal number of shifts.


34  */


35 


36  #define JPEG_INTERNALS


37  #include "loaders/jpg/jinclude.h"


38  #include "loaders/jpg/jpeglib.h"


39  #include "loaders/jpg/jdct.h" /* Private declarations for DCT subsystem */


40 


41  #ifdef DCT_ISLOW_SUPPORTED


42 


43 


44  /*


45  * This module is specialized to the case DCTSIZE = 8.


46  */


47 


48  #if DCTSIZE != 8


49  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */


50  #endif


51 


52 


53  /*


54  * The poop on this scaling stuff is as follows:


55  *


56  * Each 1D IDCT step produces outputs which are a factor of sqrt(N)


57  * larger than the true IDCT outputs. The final outputs are therefore


58  * a factor of N larger than desired; since N=8 this can be cured by


59  * a simple right shift at the end of the algorithm. The advantage of


60  * this arrangement is that we save two multiplications per 1D IDCT,


61  * because the y0 and y4 inputs need not be divided by sqrt(N).


62  *


63  * We have to do addition and subtraction of the integer inputs, which


64  * is no problem, and multiplication by fractional constants, which is


65  * a problem to do in integer arithmetic. We multiply all the constants


66  * by CONST_SCALE and convert them to integer constants (thus retaining


67  * CONST_BITS bits of precision in the constants). After doing a


68  * multiplication we have to divide the product by CONST_SCALE, with proper


69  * rounding, to produce the correct output. This division can be done


70  * cheaply as a right shift of CONST_BITS bits. We postpone shifting


71  * as long as possible so that partial sums can be added together with


72  * full fractional precision.


73  *


74  * The outputs of the first pass are scaled up by PASS1_BITS bits so that


75  * they are represented to betterthanintegral precision. These outputs


76  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16bit word


77  * with the recommended scaling. (To scale up 12bit sample data further, an


78  * intermediate INT32 array would be needed.)


79  *


80  * To avoid overflow of the 32bit intermediate results in pass 2, we must


81  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis


82  * shows that the values given below are the most effective.


83  */


84 


85  #if BITS_IN_JSAMPLE == 8


86  #define CONST_BITS 13


87  #define PASS1_BITS 2


88  #else


89  #define CONST_BITS 13


90  #define PASS1_BITS 1 /* lose a little precision to avoid overflow */


91  #endif


92 


93  /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus


94  * causing a lot of useless floatingpoint operations at run time.


95  * To get around this we use the following precalculated constants.


96  * If you change CONST_BITS you may want to add appropriate values.


97  * (With a reasonable C compiler, you can just rely on the FIX() macro...)


98  */


99 


100  #if CONST_BITS == 13


101  #define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */


102  #define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */


103  #define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */


104  #define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */


105  #define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */


106  #define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */


107  #define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */


108  #define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */


109  #define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */


110  #define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */


111  #define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */


112  #define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */


113  #else


114  #define FIX_0_298631336 FIX(0.298631336)


115  #define FIX_0_390180644 FIX(0.390180644)


116  #define FIX_0_541196100 FIX(0.541196100)


117  #define FIX_0_765366865 FIX(0.765366865)


118  #define FIX_0_899976223 FIX(0.899976223)


119  #define FIX_1_175875602 FIX(1.175875602)


120  #define FIX_1_501321110 FIX(1.501321110)


121  #define FIX_1_847759065 FIX(1.847759065)


122  #define FIX_1_961570560 FIX(1.961570560)


123  #define FIX_2_053119869 FIX(2.053119869)


124  #define FIX_2_562915447 FIX(2.562915447)


125  #define FIX_3_072711026 FIX(3.072711026)


126  #endif


127 


128 


129  /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.


130  * For 8bit samples with the recommended scaling, all the variable


131  * and constant values involved are no more than 16 bits wide, so a


132  * 16x16>32 bit multiply can be used instead of a full 32x32 multiply.


133  * For 12bit samples, a full 32bit multiplication will be needed.


134  */


135 


136  #if BITS_IN_JSAMPLE == 8


137  #define MULTIPLY(var,const) MULTIPLY16C16(var,const)


138  #else


139  #define MULTIPLY(var,const) ((var) * (const))


140  #endif


141 


142 


143  /* Dequantize a coefficient by multiplying it by the multipliertable


144  * entry; produce an int result. In this module, both inputs and result


145  * are 16 bits or less, so either int or short multiply will work.


146  */


147 


148  #define DEQUANTIZE(coef,quantval) (((ISLOW_MULT_TYPE) (coef)) * (quantval))


149 


150 


151  /*


152  * Perform dequantization and inverse DCT on one block of coefficients.


153  */


154 


155  GLOBAL(void)


156  jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,


157  JCOEFPTR coef_block,


158  JSAMPARRAY output_buf, JDIMENSION output_col)


159  {


160  INT32 tmp0, tmp1, tmp2, tmp3;


161  INT32 tmp10, tmp11, tmp12, tmp13;


162  INT32 z1, z2, z3, z4, z5;


163  JCOEFPTR inptr;


164  ISLOW_MULT_TYPE * quantptr;


165  int * wsptr;


166  JSAMPROW outptr;


167  JSAMPLE *range_limit = IDCT_range_limit(cinfo);


168  int ctr;


169  int workspace[DCTSIZE2]; /* buffers data between passes */


170  SHIFT_TEMPS


171 


172  /* Pass 1: process columns from input, store into work array. */


173  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */


174  /* furthermore, we scale the results by 2**PASS1_BITS. */


175 


176  inptr = coef_block;


177  quantptr = (ISLOW_MULT_TYPE *) compptr>dct_table;


178  wsptr = workspace;


179  for (ctr = DCTSIZE; ctr > 0; ctr) {


180  /* Due to quantization, we will usually find that many of the input


181  * coefficients are zero, especially the AC terms. We can exploit this


182  * by shortcircuiting the IDCT calculation for any column in which all


183  * the AC terms are zero. In that case each output is equal to the


184  * DC coefficient (with scale factor as needed).


185  * With typical images and quantization tables, half or more of the


186  * column DCT calculations can be simplified this way.


187  */


188 


189  if ((inptr[DCTSIZE*1]  inptr[DCTSIZE*2]  inptr[DCTSIZE*3] 


190  inptr[DCTSIZE*4]  inptr[DCTSIZE*5]  inptr[DCTSIZE*6] 


191  inptr[DCTSIZE*7]) == 0) {


192  /* AC terms all zero */


193  int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;


194 


195  wsptr[DCTSIZE*0] = dcval;


196  wsptr[DCTSIZE*1] = dcval;


197  wsptr[DCTSIZE*2] = dcval;


198  wsptr[DCTSIZE*3] = dcval;


199  wsptr[DCTSIZE*4] = dcval;


200  wsptr[DCTSIZE*5] = dcval;


201  wsptr[DCTSIZE*6] = dcval;


202  wsptr[DCTSIZE*7] = dcval;


203 


204  inptr++; /* advance pointers to next column */


205  quantptr++;


206  wsptr++;


207  continue;


208  }


209 


210  /* Even part: reverse the even part of the forward DCT. */


211  /* The rotator is sqrt(2)*c(6). */


212 


213  z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);


214  z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);


215 


216  z1 = MULTIPLY(z2 + z3, FIX_0_541196100);


217  tmp2 = z1 + MULTIPLY(z3,  FIX_1_847759065);


218  tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);


219 


220  z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);


221  z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);


222 


223  tmp0 = (z2 + z3) << CONST_BITS;


224  tmp1 = (z2  z3) << CONST_BITS;


225 


226  tmp10 = tmp0 + tmp3;


227  tmp13 = tmp0  tmp3;


228  tmp11 = tmp1 + tmp2;


229  tmp12 = tmp1  tmp2;


230 


231  /* Odd part per figure 8; the matrix is unitary and hence its


232  * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.


233  */


234 


235  tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);


236  tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);


237  tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);


238  tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);


239 


240  z1 = tmp0 + tmp3;


241  z2 = tmp1 + tmp2;


242  z3 = tmp0 + tmp2;


243  z4 = tmp1 + tmp3;


244  z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */


245 


246  tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (c1+c3+c5c7) */


247  tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3c5+c7) */


248  tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5c7) */


249  tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3c5c7) */


250  z1 = MULTIPLY(z1,  FIX_0_899976223); /* sqrt(2) * (c7c3) */


251  z2 = MULTIPLY(z2,  FIX_2_562915447); /* sqrt(2) * (c1c3) */


252  z3 = MULTIPLY(z3,  FIX_1_961570560); /* sqrt(2) * (c3c5) */


253  z4 = MULTIPLY(z4,  FIX_0_390180644); /* sqrt(2) * (c5c3) */


254 


255  z3 += z5;


256  z4 += z5;


257 


258  tmp0 += z1 + z3;


259  tmp1 += z2 + z4;


260  tmp2 += z2 + z3;


261  tmp3 += z1 + z4;


262 


263  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */


264 


265  wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITSPASS1_BITS);


266  wsptr[DCTSIZE*7] = (int) DESCALE(tmp10  tmp3, CONST_BITSPASS1_BITS);


267  wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITSPASS1_BITS);


268  wsptr[DCTSIZE*6] = (int) DESCALE(tmp11  tmp2, CONST_BITSPASS1_BITS);


269  wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITSPASS1_BITS);


270  wsptr[DCTSIZE*5] = (int) DESCALE(tmp12  tmp1, CONST_BITSPASS1_BITS);


271  wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITSPASS1_BITS);


272  wsptr[DCTSIZE*4] = (int) DESCALE(tmp13  tmp0, CONST_BITSPASS1_BITS);


273 


274  inptr++; /* advance pointers to next column */


275  quantptr++;


276  wsptr++;


277  }


278 


279  /* Pass 2: process rows from work array, store into output array. */


280  /* Note that we must descale the results by a factor of 8 == 2**3, */


281  /* and also undo the PASS1_BITS scaling. */


282 


283  wsptr = workspace;


284  for (ctr = 0; ctr < DCTSIZE; ctr++) {


285  outptr = output_buf[ctr] + output_col;


286  /* Rows of zeroes can be exploited in the same way as we did with columns.


287  * However, the column calculation has created many nonzero AC terms, so


288  * the simplification applies less often (typically 5% to 10% of the time).


289  * On machines with very fast multiplication, it's possible that the


290  * test takes more time than it's worth. In that case this section


291  * may be commented out.


292  */


293 


294  #ifndef NO_ZERO_ROW_TEST


295  if ((wsptr[1]  wsptr[2]  wsptr[3]  wsptr[4]  wsptr[5]  wsptr[6] 


296  wsptr[7]) == 0) {


297  /* AC terms all zero */


298  JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)


299  & RANGE_MASK];


300 


301  outptr[0] = dcval;


302  outptr[1] = dcval;


303  outptr[2] = dcval;


304  outptr[3] = dcval;


305  outptr[4] = dcval;


306  outptr[5] = dcval;


307  outptr[6] = dcval;


308  outptr[7] = dcval;


309 


310  wsptr += DCTSIZE; /* advance pointer to next row */


311  continue;


312  }


313  #endif


314 


315  /* Even part: reverse the even part of the forward DCT. */


316  /* The rotator is sqrt(2)*c(6). */


317 


318  z2 = (INT32) wsptr[2];


319  z3 = (INT32) wsptr[6];


320 


321  z1 = MULTIPLY(z2 + z3, FIX_0_541196100);


322  tmp2 = z1 + MULTIPLY(z3,  FIX_1_847759065);


323  tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);


324 


325  tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;


326  tmp1 = ((INT32) wsptr[0]  (INT32) wsptr[4]) << CONST_BITS;


327 


328  tmp10 = tmp0 + tmp3;


329  tmp13 = tmp0  tmp3;


330  tmp11 = tmp1 + tmp2;


331  tmp12 = tmp1  tmp2;


332 


333  /* Odd part per figure 8; the matrix is unitary and hence its


334  * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.


335  */


336 


337  tmp0 = (INT32) wsptr[7];


338  tmp1 = (INT32) wsptr[5];


339  tmp2 = (INT32) wsptr[3];


340  tmp3 = (INT32) wsptr[1];


341 


342  z1 = tmp0 + tmp3;


343  z2 = tmp1 + tmp2;


344  z3 = tmp0 + tmp2;


345  z4 = tmp1 + tmp3;


346  z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */


347 


348  tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (c1+c3+c5c7) */


349  tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3c5+c7) */


350  tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5c7) */


351  tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3c5c7) */


352  z1 = MULTIPLY(z1,  FIX_0_899976223); /* sqrt(2) * (c7c3) */


353  z2 = MULTIPLY(z2,  FIX_2_562915447); /* sqrt(2) * (c1c3) */


354  z3 = MULTIPLY(z3,  FIX_1_961570560); /* sqrt(2) * (c3c5) */


355  z4 = MULTIPLY(z4,  FIX_0_390180644); /* sqrt(2) * (c5c3) */


356 


357  z3 += z5;


358  z4 += z5;


359 


360  tmp0 += z1 + z3;


361  tmp1 += z2 + z4;


362  tmp2 += z2 + z3;


363  tmp3 += z1 + z4;


364 


365  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */


366 


367  outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,


368  CONST_BITS+PASS1_BITS+3)


369  & RANGE_MASK];


370  outptr[7] = range_limit[(int) DESCALE(tmp10  tmp3,


371  CONST_BITS+PASS1_BITS+3)


372  & RANGE_MASK];


373  outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,


374  CONST_BITS+PASS1_BITS+3)


375  & RANGE_MASK];


376  outptr[6] = range_limit[(int) DESCALE(tmp11  tmp2,


377  CONST_BITS+PASS1_BITS+3)


378  & RANGE_MASK];


379  outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,


380  CONST_BITS+PASS1_BITS+3)


381  & RANGE_MASK];


382  outptr[5] = range_limit[(int) DESCALE(tmp12  tmp1,


383  CONST_BITS+PASS1_BITS+3)


384  & RANGE_MASK];


385  outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,


386  CONST_BITS+PASS1_BITS+3)


387  & RANGE_MASK];


388  outptr[4] = range_limit[(int) DESCALE(tmp13  tmp0,


389  CONST_BITS+PASS1_BITS+3)


390  & RANGE_MASK];


391 


392  wsptr += DCTSIZE; /* advance pointer to next row */


393  }


394  }


395 


396  #endif /* DCT_ISLOW_SUPPORTED */

