1  /********************************************************************** <BR>


2  This file is part of Crack dot Com's free source code release of


3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for


4  information about compiling & licensing issues visit this URL</a>


5  <PRE> If that doesn't help, contact Jonathan Clark at


6  golgotha_source@usa.net (Subject should have "GOLG" in it)


7  ***********************************************************************/


8 


9  /*


10  * jfdctint.c


11  *


12  * Copyright (C) 19911996, Thomas G. Lane.


13  * This file is part of the Independent JPEG Group's software.


14  * For conditions of distribution and use, see the accompanying README file.


15  *


16  * This file contains a slowbutaccurate integer implementation of the


17  * forward DCT (Discrete Cosine Transform).


18  *


19  * A 2D DCT can be done by 1D DCT on each row followed by 1D DCT


20  * on each column. Direct algorithms are also available, but they are


21  * much more complex and seem not to be any faster when reduced to code.


22  *


23  * This implementation is based on an algorithm described in


24  * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1D DCT


25  * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,


26  * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988991.


27  * The primary algorithm described there uses 11 multiplies and 29 adds.


28  * We use their alternate method with 12 multiplies and 32 adds.


29  * The advantage of this method is that no data path contains more than one


30  * multiplication; this allows a very simple and accurate implementation in


31  * scaled fixedpoint arithmetic, with a minimal number of shifts.


32  */


33 


34  #define JPEG_INTERNALS


35  #include "loaders/jpg/jinclude.h"


36  #include "loaders/jpg/jpeglib.h"


37  #include "loaders/jpg/jdct.h" /* Private declarations for DCT subsystem */


38 


39  #ifdef DCT_ISLOW_SUPPORTED


40 


41 


42  /*


43  * This module is specialized to the case DCTSIZE = 8.


44  */


45 


46  #if DCTSIZE != 8


47  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */


48  #endif


49 


50 


51  /*


52  * The poop on this scaling stuff is as follows:


53  *


54  * Each 1D DCT step produces outputs which are a factor of sqrt(N)


55  * larger than the true DCT outputs. The final outputs are therefore


56  * a factor of N larger than desired; since N=8 this can be cured by


57  * a simple right shift at the end of the algorithm. The advantage of


58  * this arrangement is that we save two multiplications per 1D DCT,


59  * because the y0 and y4 outputs need not be divided by sqrt(N).


60  * In the IJG code, this factor of 8 is removed by the quantization step


61  * (in jcdctmgr.c), NOT in this module.


62  *


63  * We have to do addition and subtraction of the integer inputs, which


64  * is no problem, and multiplication by fractional constants, which is


65  * a problem to do in integer arithmetic. We multiply all the constants


66  * by CONST_SCALE and convert them to integer constants (thus retaining


67  * CONST_BITS bits of precision in the constants). After doing a


68  * multiplication we have to divide the product by CONST_SCALE, with proper


69  * rounding, to produce the correct output. This division can be done


70  * cheaply as a right shift of CONST_BITS bits. We postpone shifting


71  * as long as possible so that partial sums can be added together with


72  * full fractional precision.


73  *


74  * The outputs of the first pass are scaled up by PASS1_BITS bits so that


75  * they are represented to betterthanintegral precision. These outputs


76  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16bit word


77  * with the recommended scaling. (For 12bit sample data, the intermediate


78  * array is INT32 anyway.)


79  *


80  * To avoid overflow of the 32bit intermediate results in pass 2, we must


81  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis


82  * shows that the values given below are the most effective.


83  */


84 


85  #if BITS_IN_JSAMPLE == 8


86  #define CONST_BITS 13


87  #define PASS1_BITS 2


88  #else


89  #define CONST_BITS 13


90  #define PASS1_BITS 1 /* lose a little precision to avoid overflow */


91  #endif


92 


93  /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus


94  * causing a lot of useless floatingpoint operations at run time.


95  * To get around this we use the following precalculated constants.


96  * If you change CONST_BITS you may want to add appropriate values.


97  * (With a reasonable C compiler, you can just rely on the FIX() macro...)


98  */


99 


100  #if CONST_BITS == 13


101  #define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */


102  #define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */


103  #define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */


104  #define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */


105  #define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */


106  #define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */


107  #define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */


108  #define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */


109  #define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */


110  #define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */


111  #define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */


112  #define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */


113  #else


114  #define FIX_0_298631336 FIX(0.298631336)


115  #define FIX_0_390180644 FIX(0.390180644)


116  #define FIX_0_541196100 FIX(0.541196100)


117  #define FIX_0_765366865 FIX(0.765366865)


118  #define FIX_0_899976223 FIX(0.899976223)


119  #define FIX_1_175875602 FIX(1.175875602)


120  #define FIX_1_501321110 FIX(1.501321110)


121  #define FIX_1_847759065 FIX(1.847759065)


122  #define FIX_1_961570560 FIX(1.961570560)


123  #define FIX_2_053119869 FIX(2.053119869)


124  #define FIX_2_562915447 FIX(2.562915447)


125  #define FIX_3_072711026 FIX(3.072711026)


126  #endif


127 


128 


129  /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.


130  * For 8bit samples with the recommended scaling, all the variable


131  * and constant values involved are no more than 16 bits wide, so a


132  * 16x16>32 bit multiply can be used instead of a full 32x32 multiply.


133  * For 12bit samples, a full 32bit multiplication will be needed.


134  */


135 


136  #if BITS_IN_JSAMPLE == 8


137  #define MULTIPLY(var,const) MULTIPLY16C16(var,const)


138  #else


139  #define MULTIPLY(var,const) ((var) * (const))


140  #endif


141 


142 


143  /*


144  * Perform the forward DCT on one block of samples.


145  */


146 


147  GLOBAL(void)


148  jpeg_fdct_islow (DCTELEM * data)


149  {


150  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;


151  INT32 tmp10, tmp11, tmp12, tmp13;


152  INT32 z1, z2, z3, z4, z5;


153  DCTELEM *dataptr;


154  int ctr;


155  SHIFT_TEMPS


156 


157  /* Pass 1: process rows. */


158  /* Note results are scaled up by sqrt(8) compared to a true DCT; */


159  /* furthermore, we scale the results by 2**PASS1_BITS. */


160 


161  dataptr = data;


162  for (ctr = DCTSIZE1; ctr >= 0; ctr) {


163  tmp0 = dataptr[0] + dataptr[7];


164  tmp7 = dataptr[0]  dataptr[7];


165  tmp1 = dataptr[1] + dataptr[6];


166  tmp6 = dataptr[1]  dataptr[6];


167  tmp2 = dataptr[2] + dataptr[5];


168  tmp5 = dataptr[2]  dataptr[5];


169  tmp3 = dataptr[3] + dataptr[4];


170  tmp4 = dataptr[3]  dataptr[4];


171 


172  /* Even part per LL&M figure 1  note that published figure is faulty;


173  * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".


174  */


175 


176  tmp10 = tmp0 + tmp3;


177  tmp13 = tmp0  tmp3;


178  tmp11 = tmp1 + tmp2;


179  tmp12 = tmp1  tmp2;


180 


181  dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);


182  dataptr[4] = (DCTELEM) ((tmp10  tmp11) << PASS1_BITS);


183 


184  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);


185  dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),


186  CONST_BITSPASS1_BITS);


187  dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12,  FIX_1_847759065),


188  CONST_BITSPASS1_BITS);


189 


190  /* Odd part per figure 8  note paper omits factor of sqrt(2).


191  * cK represents cos(K*pi/16).


192  * i0..i3 in the paper are tmp4..tmp7 here.


193  */


194 


195  z1 = tmp4 + tmp7;


196  z2 = tmp5 + tmp6;


197  z3 = tmp4 + tmp6;


198  z4 = tmp5 + tmp7;


199  z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */


200 


201  tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (c1+c3+c5c7) */


202  tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3c5+c7) */


203  tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5c7) */


204  tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3c5c7) */


205  z1 = MULTIPLY(z1,  FIX_0_899976223); /* sqrt(2) * (c7c3) */


206  z2 = MULTIPLY(z2,  FIX_2_562915447); /* sqrt(2) * (c1c3) */


207  z3 = MULTIPLY(z3,  FIX_1_961570560); /* sqrt(2) * (c3c5) */


208  z4 = MULTIPLY(z4,  FIX_0_390180644); /* sqrt(2) * (c5c3) */


209 


210  z3 += z5;


211  z4 += z5;


212 


213  dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITSPASS1_BITS);


214  dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITSPASS1_BITS);


215  dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITSPASS1_BITS);


216  dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITSPASS1_BITS);


217 


218  dataptr += DCTSIZE; /* advance pointer to next row */


219  }


220 


221  /* Pass 2: process columns.


222  * We remove the PASS1_BITS scaling, but leave the results scaled up


223  * by an overall factor of 8.


224  */


225 


226  dataptr = data;


227  for (ctr = DCTSIZE1; ctr >= 0; ctr) {


228  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];


229  tmp7 = dataptr[DCTSIZE*0]  dataptr[DCTSIZE*7];


230  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];


231  tmp6 = dataptr[DCTSIZE*1]  dataptr[DCTSIZE*6];


232  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];


233  tmp5 = dataptr[DCTSIZE*2]  dataptr[DCTSIZE*5];


234  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];


235  tmp4 = dataptr[DCTSIZE*3]  dataptr[DCTSIZE*4];


236 


237  /* Even part per LL&M figure 1  note that published figure is faulty;


238  * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".


239  */


240 


241  tmp10 = tmp0 + tmp3;


242  tmp13 = tmp0  tmp3;


243  tmp11 = tmp1 + tmp2;


244  tmp12 = tmp1  tmp2;


245 


246  dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);


247  dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10  tmp11, PASS1_BITS);


248 


249  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);


250  dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),


251  CONST_BITS+PASS1_BITS);


252  dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12,  FIX_1_847759065),


253  CONST_BITS+PASS1_BITS);


254 


255  /* Odd part per figure 8  note paper omits factor of sqrt(2).


256  * cK represents cos(K*pi/16).


257  * i0..i3 in the paper are tmp4..tmp7 here.


258  */


259 


260  z1 = tmp4 + tmp7;


261  z2 = tmp5 + tmp6;


262  z3 = tmp4 + tmp6;


263  z4 = tmp5 + tmp7;


264  z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */


265 


266  tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (c1+c3+c5c7) */


267  tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3c5+c7) */


268  tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5c7) */


269  tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3c5c7) */


270  z1 = MULTIPLY(z1,  FIX_0_899976223); /* sqrt(2) * (c7c3) */


271  z2 = MULTIPLY(z2,  FIX_2_562915447); /* sqrt(2) * (c1c3) */


272  z3 = MULTIPLY(z3,  FIX_1_961570560); /* sqrt(2) * (c3c5) */


273  z4 = MULTIPLY(z4,  FIX_0_390180644); /* sqrt(2) * (c5c3) */


274 


275  z3 += z5;


276  z4 += z5;


277 


278  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,


279  CONST_BITS+PASS1_BITS);


280  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,


281  CONST_BITS+PASS1_BITS);


282  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,


283  CONST_BITS+PASS1_BITS);


284  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,


285  CONST_BITS+PASS1_BITS);


286 


287  dataptr++; /* advance pointer to next column */


288  }


289  }


290 


291  #endif /* DCT_ISLOW_SUPPORTED */

