source: golgotha/src/i4/loaders/jpg/jidctfst.cc

Last change on this file was 80, checked in by Sam Hocevar, 12 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 13.7 KB
Line 
1/********************************************************************** <BR>
2  This file is part of Crack dot Com's free source code release of
3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
4  information about compiling & licensing issues visit this URL</a>
5  <PRE> If that doesn't help, contact Jonathan Clark at
6  golgotha_source@usa.net (Subject should have "GOLG" in it)
7***********************************************************************/
8
9/*
10 * jidctfst.c
11 *
12 * Copyright (C) 1994-1996, Thomas G. Lane.
13 * This file is part of the Independent JPEG Group's software.
14 * For conditions of distribution and use, see the accompanying README file.
15 *
16 * This file contains a fast, not so accurate integer implementation of the
17 * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
18 * must also perform dequantization of the input coefficients.
19 *
20 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
21 * on each row (or vice versa, but it's more convenient to emit a row at
22 * a time).  Direct algorithms are also available, but they are much more
23 * complex and seem not to be any faster when reduced to code.
24 *
25 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
26 * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
27 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
28 * JPEG textbook (see REFERENCES section in file README).  The following code
29 * is based directly on figure 4-8 in P&M.
30 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
31 * possible to arrange the computation so that many of the multiplies are
32 * simple scalings of the final outputs.  These multiplies can then be
33 * folded into the multiplications or divisions by the JPEG quantization
34 * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
35 * to be done in the DCT itself.
36 * The primary disadvantage of this method is that with fixed-point math,
37 * accuracy is lost due to imprecise representation of the scaled
38 * quantization values.  The smaller the quantization table entry, the less
39 * precise the scaled value, so this implementation does worse with high-
40 * quality-setting files than with low-quality ones.
41 */
42
43#define JPEG_INTERNALS
44#include "loaders/jpg/jinclude.h"
45#include "loaders/jpg/jpeglib.h"
46#include "loaders/jpg/jdct.h"           /* Private declarations for DCT subsystem */
47
48#ifdef DCT_IFAST_SUPPORTED
49
50
51/*
52 * This module is specialized to the case DCTSIZE = 8.
53 */
54
55#if DCTSIZE != 8
56  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
57#endif
58
59
60/* Scaling decisions are generally the same as in the LL&M algorithm;
61 * see jidctint.c for more details.  However, we choose to descale
62 * (right shift) multiplication products as soon as they are formed,
63 * rather than carrying additional fractional bits into subsequent additions.
64 * This compromises accuracy slightly, but it lets us save a few shifts.
65 * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
66 * everywhere except in the multiplications proper; this saves a good deal
67 * of work on 16-bit-int machines.
68 *
69 * The dequantized coefficients are not integers because the AA&N scaling
70 * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
71 * so that the first and second IDCT rounds have the same input scaling.
72 * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
73 * avoid a descaling shift; this compromises accuracy rather drastically
74 * for small quantization table entries, but it saves a lot of shifts.
75 * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
76 * so we use a much larger scaling factor to preserve accuracy.
77 *
78 * A final compromise is to represent the multiplicative constants to only
79 * 8 fractional bits, rather than 13.  This saves some shifting work on some
80 * machines, and may also reduce the cost of multiplication (since there
81 * are fewer one-bits in the constants).
82 */
83
84#if BITS_IN_JSAMPLE == 8
85#define CONST_BITS  8
86#define PASS1_BITS  2
87#else
88#define CONST_BITS  8
89#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
90#endif
91
92/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
93 * causing a lot of useless floating-point operations at run time.
94 * To get around this we use the following pre-calculated constants.
95 * If you change CONST_BITS you may want to add appropriate values.
96 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
97 */
98
99#if CONST_BITS == 8
100#define FIX_1_082392200  ((INT32)  277)         /* FIX(1.082392200) */
101#define FIX_1_414213562  ((INT32)  362)         /* FIX(1.414213562) */
102#define FIX_1_847759065  ((INT32)  473)         /* FIX(1.847759065) */
103#define FIX_2_613125930  ((INT32)  669)         /* FIX(2.613125930) */
104#else
105#define FIX_1_082392200  FIX(1.082392200)
106#define FIX_1_414213562  FIX(1.414213562)
107#define FIX_1_847759065  FIX(1.847759065)
108#define FIX_2_613125930  FIX(2.613125930)
109#endif
110
111
112/* We can gain a little more speed, with a further compromise in accuracy,
113 * by omitting the addition in a descaling shift.  This yields an incorrectly
114 * rounded result half the time...
115 */
116
117#ifndef USE_ACCURATE_ROUNDING
118#undef DESCALE
119#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
120#endif
121
122
123/* Multiply a DCTELEM variable by an INT32 constant, and immediately
124 * descale to yield a DCTELEM result.
125 */
126
127#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
128
129
130/* Dequantize a coefficient by multiplying it by the multiplier-table
131 * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
132 * multiplication will do.  For 12-bit data, the multiplier table is
133 * declared INT32, so a 32-bit multiply will be used.
134 */
135
136#if BITS_IN_JSAMPLE == 8
137#define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
138#else
139#define DEQUANTIZE(coef,quantval)  \
140        DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
141#endif
142
143
144/* Like DESCALE, but applies to a DCTELEM and produces an int.
145 * We assume that int right shift is unsigned if INT32 right shift is.
146 */
147
148#ifdef RIGHT_SHIFT_IS_UNSIGNED
149#define ISHIFT_TEMPS    DCTELEM ishift_temp;
150#if BITS_IN_JSAMPLE == 8
151#define DCTELEMBITS  16         /* DCTELEM may be 16 or 32 bits */
152#else
153#define DCTELEMBITS  32         /* DCTELEM must be 32 bits */
154#endif
155#define IRIGHT_SHIFT(x,shft)  \
156    ((ishift_temp = (x)) < 0 ? \
157     (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
158     (ishift_temp >> (shft)))
159#else
160#define ISHIFT_TEMPS
161#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
162#endif
163
164#ifdef USE_ACCURATE_ROUNDING
165#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
166#else
167#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
168#endif
169
170
171/*
172 * Perform dequantization and inverse DCT on one block of coefficients.
173 */
174
175GLOBAL(void)
176jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
177                 JCOEFPTR coef_block,
178                 JSAMPARRAY output_buf, JDIMENSION output_col)
179{
180  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
181  DCTELEM tmp10, tmp11, tmp12, tmp13;
182  DCTELEM z5, z10, z11, z12, z13;
183  JCOEFPTR inptr;
184  IFAST_MULT_TYPE * quantptr;
185  int * wsptr;
186  JSAMPROW outptr;
187  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
188  int ctr;
189  int workspace[DCTSIZE2];      /* buffers data between passes */
190  SHIFT_TEMPS                   /* for DESCALE */
191  ISHIFT_TEMPS                  /* for IDESCALE */
192
193  /* Pass 1: process columns from input, store into work array. */
194
195  inptr = coef_block;
196  quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
197  wsptr = workspace;
198  for (ctr = DCTSIZE; ctr > 0; ctr--) {
199    /* Due to quantization, we will usually find that many of the input
200     * coefficients are zero, especially the AC terms.  We can exploit this
201     * by short-circuiting the IDCT calculation for any column in which all
202     * the AC terms are zero.  In that case each output is equal to the
203     * DC coefficient (with scale factor as needed).
204     * With typical images and quantization tables, half or more of the
205     * column DCT calculations can be simplified this way.
206     */
207   
208    if ((inptr[DCTSIZE*1] | inptr[DCTSIZE*2] | inptr[DCTSIZE*3] |
209         inptr[DCTSIZE*4] | inptr[DCTSIZE*5] | inptr[DCTSIZE*6] |
210         inptr[DCTSIZE*7]) == 0) {
211      /* AC terms all zero */
212      int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
213
214      wsptr[DCTSIZE*0] = dcval;
215      wsptr[DCTSIZE*1] = dcval;
216      wsptr[DCTSIZE*2] = dcval;
217      wsptr[DCTSIZE*3] = dcval;
218      wsptr[DCTSIZE*4] = dcval;
219      wsptr[DCTSIZE*5] = dcval;
220      wsptr[DCTSIZE*6] = dcval;
221      wsptr[DCTSIZE*7] = dcval;
222     
223      inptr++;                  /* advance pointers to next column */
224      quantptr++;
225      wsptr++;
226      continue;
227    }
228   
229    /* Even part */
230
231    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
232    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
233    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
234    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
235
236    tmp10 = tmp0 + tmp2;        /* phase 3 */
237    tmp11 = tmp0 - tmp2;
238
239    tmp13 = tmp1 + tmp3;        /* phases 5-3 */
240    tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
241
242    tmp0 = tmp10 + tmp13;       /* phase 2 */
243    tmp3 = tmp10 - tmp13;
244    tmp1 = tmp11 + tmp12;
245    tmp2 = tmp11 - tmp12;
246   
247    /* Odd part */
248
249    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
250    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
251    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
252    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
253
254    z13 = tmp6 + tmp5;          /* phase 6 */
255    z10 = tmp6 - tmp5;
256    z11 = tmp4 + tmp7;
257    z12 = tmp4 - tmp7;
258
259    tmp7 = z11 + z13;           /* phase 5 */
260    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
261
262    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
263    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
264    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
265
266    tmp6 = tmp12 - tmp7;        /* phase 2 */
267    tmp5 = tmp11 - tmp6;
268    tmp4 = tmp10 + tmp5;
269
270    wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
271    wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
272    wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
273    wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
274    wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
275    wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
276    wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
277    wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
278
279    inptr++;                    /* advance pointers to next column */
280    quantptr++;
281    wsptr++;
282  }
283 
284  /* Pass 2: process rows from work array, store into output array. */
285  /* Note that we must descale the results by a factor of 8 == 2**3, */
286  /* and also undo the PASS1_BITS scaling. */
287
288  wsptr = workspace;
289  for (ctr = 0; ctr < DCTSIZE; ctr++) {
290    outptr = output_buf[ctr] + output_col;
291    /* Rows of zeroes can be exploited in the same way as we did with columns.
292     * However, the column calculation has created many nonzero AC terms, so
293     * the simplification applies less often (typically 5% to 10% of the time).
294     * On machines with very fast multiplication, it's possible that the
295     * test takes more time than it's worth.  In that case this section
296     * may be commented out.
297     */
298   
299#ifndef NO_ZERO_ROW_TEST
300    if ((wsptr[1] | wsptr[2] | wsptr[3] | wsptr[4] | wsptr[5] | wsptr[6] |
301         wsptr[7]) == 0) {
302      /* AC terms all zero */
303      JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
304                                  & RANGE_MASK];
305     
306      outptr[0] = dcval;
307      outptr[1] = dcval;
308      outptr[2] = dcval;
309      outptr[3] = dcval;
310      outptr[4] = dcval;
311      outptr[5] = dcval;
312      outptr[6] = dcval;
313      outptr[7] = dcval;
314
315      wsptr += DCTSIZE;         /* advance pointer to next row */
316      continue;
317    }
318#endif
319   
320    /* Even part */
321
322    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
323    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
324
325    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
326    tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
327            - tmp13;
328
329    tmp0 = tmp10 + tmp13;
330    tmp3 = tmp10 - tmp13;
331    tmp1 = tmp11 + tmp12;
332    tmp2 = tmp11 - tmp12;
333
334    /* Odd part */
335
336    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
337    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
338    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
339    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
340
341    tmp7 = z11 + z13;           /* phase 5 */
342    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
343
344    z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
345    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
346    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
347
348    tmp6 = tmp12 - tmp7;        /* phase 2 */
349    tmp5 = tmp11 - tmp6;
350    tmp4 = tmp10 + tmp5;
351
352    /* Final output stage: scale down by a factor of 8 and range-limit */
353
354    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
355                            & RANGE_MASK];
356    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
357                            & RANGE_MASK];
358    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
359                            & RANGE_MASK];
360    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
361                            & RANGE_MASK];
362    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
363                            & RANGE_MASK];
364    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
365                            & RANGE_MASK];
366    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
367                            & RANGE_MASK];
368    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
369                            & RANGE_MASK];
370
371    wsptr += DCTSIZE;           /* advance pointer to next row */
372  }
373}
374
375#endif /* DCT_IFAST_SUPPORTED */
Note: See TracBrowser for help on using the repository browser.