4
* Copyright (C) 1991-1996, Thomas G. Lane.
5
* Modification developed 2003-2009 by Guido Vollbeding.
6
* This file is part of the Independent JPEG Group's software.
7
* For conditions of distribution and use, see the accompanying README file.
9
* This file contains a slow-but-accurate integer implementation of the
10
* forward DCT (Discrete Cosine Transform).
12
* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
13
* on each column. Direct algorithms are also available, but they are
14
* much more complex and seem not to be any faster when reduced to code.
16
* This implementation is based on an algorithm described in
17
* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
18
* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
19
* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
20
* The primary algorithm described there uses 11 multiplies and 29 adds.
21
* We use their alternate method with 12 multiplies and 32 adds.
22
* The advantage of this method is that no data path contains more than one
23
* multiplication; this allows a very simple and accurate implementation in
24
* scaled fixed-point arithmetic, with a minimal number of shifts.
26
* We also provide FDCT routines with various input sample block sizes for
27
* direct resolution reduction or enlargement and for directly resolving the
28
* common 2x1 and 1x2 subsampling cases without additional resampling: NxN
29
* (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
31
* For N<8 we fill the remaining block coefficients with zero.
32
* For N>8 we apply a partial N-point FDCT on the input samples, computing
33
* just the lower 8 frequency coefficients and discarding the rest.
35
* We must scale the output coefficients of the N-point FDCT appropriately
36
* to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
37
* is folded into the constant multipliers (pass 2) and/or final/initial shifting.
40
* CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
41
* since there would be too many additional constants to pre-calculate.
44
#define JPEG_INTERNALS
47
#include "jdct.h" /* Private declarations for DCT subsystem */
49
#ifdef DCT_ISLOW_SUPPORTED
53
* This module is specialized to the case DCTSIZE = 8.
57
Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
62
* The rationale behind this scaling is as follows:
64
* Each 1-D DCT step produces outputs which are a factor of sqrt(N)
65
* larger than the true DCT outputs. The final outputs are therefore
66
* a factor of N larger than desired; since N=8 this can be cured by
67
* a simple right shift at the end of the algorithm. The advantage of
68
* this arrangement is that we save two multiplications per 1-D DCT,
69
* because the y0 and y4 outputs need not be divided by sqrt(N).
70
* In the IJG code, this factor of 8 is removed by the quantization step
71
* (in jcdctmgr.c), NOT in this module.
73
* We have to do addition and subtraction of the integer inputs, which
74
* is no problem, and multiplication by fractional constants, which is
75
* a problem to do in integer arithmetic. We multiply all the constants
76
* by CONST_SCALE and convert them to integer constants (thus retaining
77
* CONST_BITS bits of precision in the constants). After doing a
78
* multiplication we have to divide the product by CONST_SCALE, with proper
79
* rounding, to produce the correct output. This division can be done
80
* cheaply as a right shift of CONST_BITS bits. We postpone shifting
81
* as long as possible so that partial sums can be added together with
82
* full fractional precision.
84
* The outputs of the first pass are scaled up by PASS1_BITS bits so that
85
* they are represented to better-than-integral precision. These outputs
86
* require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87
* with the recommended scaling. (For 12-bit sample data, the intermediate
88
* array is INT32 anyway.)
90
* To avoid overflow of the 32-bit intermediate results in pass 2, we must
91
* have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
92
* shows that the values given below are the most effective.
95
#if BITS_IN_JSAMPLE == 8
100
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
103
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
104
* causing a lot of useless floating-point operations at run time.
105
* To get around this we use the following pre-calculated constants.
106
* If you change CONST_BITS you may want to add appropriate values.
107
* (With a reasonable C compiler, you can just rely on the FIX() macro...)
111
#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
112
#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
113
#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
114
#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
115
#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
116
#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
117
#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
118
#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
119
#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
120
#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
121
#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
122
#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
124
#define FIX_0_298631336 FIX(0.298631336)
125
#define FIX_0_390180644 FIX(0.390180644)
126
#define FIX_0_541196100 FIX(0.541196100)
127
#define FIX_0_765366865 FIX(0.765366865)
128
#define FIX_0_899976223 FIX(0.899976223)
129
#define FIX_1_175875602 FIX(1.175875602)
130
#define FIX_1_501321110 FIX(1.501321110)
131
#define FIX_1_847759065 FIX(1.847759065)
132
#define FIX_1_961570560 FIX(1.961570560)
133
#define FIX_2_053119869 FIX(2.053119869)
134
#define FIX_2_562915447 FIX(2.562915447)
135
#define FIX_3_072711026 FIX(3.072711026)
139
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
140
* For 8-bit samples with the recommended scaling, all the variable
141
* and constant values involved are no more than 16 bits wide, so a
142
* 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
143
* For 12-bit samples, a full 32-bit multiplication will be needed.
146
#if BITS_IN_JSAMPLE == 8
147
#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
149
#define MULTIPLY(var,const) ((var) * (const))
154
* Perform the forward DCT on one block of samples.
158
jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
160
INT32 tmp0, tmp1, tmp2, tmp3;
161
INT32 tmp10, tmp11, tmp12, tmp13;
168
/* Pass 1: process rows. */
169
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
170
/* furthermore, we scale the results by 2**PASS1_BITS. */
173
for (ctr = 0; ctr < DCTSIZE; ctr++) {
174
elemptr = sample_data[ctr] + start_col;
176
/* Even part per LL&M figure 1 --- note that published figure is faulty;
177
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
180
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
181
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
182
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
183
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
190
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
191
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
192
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
193
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
195
/* Apply unsigned->signed conversion */
196
dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
197
dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
199
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
200
/* Add fudge factor here for final descale. */
201
z1 += ONE << (CONST_BITS-PASS1_BITS-1);
202
dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
203
CONST_BITS-PASS1_BITS);
204
dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
205
CONST_BITS-PASS1_BITS);
207
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
208
* cK represents sqrt(2) * cos(K*pi/16).
209
* i0..i3 in the paper are tmp0..tmp3 here.
216
z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
217
/* Add fudge factor here for final descale. */
218
z1 += ONE << (CONST_BITS-PASS1_BITS-1);
220
tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
221
tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
222
tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
223
tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
224
tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
225
tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
226
tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
227
tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
232
dataptr[1] = (DCTELEM)
233
RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
234
dataptr[3] = (DCTELEM)
235
RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
236
dataptr[5] = (DCTELEM)
237
RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
238
dataptr[7] = (DCTELEM)
239
RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
241
dataptr += DCTSIZE; /* advance pointer to next row */
244
/* Pass 2: process columns.
245
* We remove the PASS1_BITS scaling, but leave the results scaled up
246
* by an overall factor of 8.
250
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
251
/* Even part per LL&M figure 1 --- note that published figure is faulty;
252
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
255
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
256
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
257
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
258
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
260
/* Add fudge factor here for final descale. */
261
tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
266
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
267
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
268
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
269
tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
271
dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
272
dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
274
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
275
/* Add fudge factor here for final descale. */
276
z1 += ONE << (CONST_BITS+PASS1_BITS-1);
277
dataptr[DCTSIZE*2] = (DCTELEM)
278
RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
279
dataptr[DCTSIZE*6] = (DCTELEM)
280
RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
282
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
283
* cK represents sqrt(2) * cos(K*pi/16).
284
* i0..i3 in the paper are tmp0..tmp3 here.
291
z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
292
/* Add fudge factor here for final descale. */
293
z1 += ONE << (CONST_BITS+PASS1_BITS-1);
295
tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
296
tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
297
tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
298
tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
299
tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
300
tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
301
tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
302
tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
307
dataptr[DCTSIZE*1] = (DCTELEM)
308
RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
309
dataptr[DCTSIZE*3] = (DCTELEM)
310
RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
311
dataptr[DCTSIZE*5] = (DCTELEM)
312
RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
313
dataptr[DCTSIZE*7] = (DCTELEM)
314
RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
316
dataptr++; /* advance pointer to next column */
320
#ifdef DCT_SCALING_SUPPORTED
324
* Perform the forward DCT on a 7x7 sample block.
328
jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
330
INT32 tmp0, tmp1, tmp2, tmp3;
331
INT32 tmp10, tmp11, tmp12;
338
/* Pre-zero output coefficient block. */
339
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
341
/* Pass 1: process rows. */
342
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
343
/* furthermore, we scale the results by 2**PASS1_BITS. */
344
/* cK represents sqrt(2) * cos(K*pi/14). */
347
for (ctr = 0; ctr < 7; ctr++) {
348
elemptr = sample_data[ctr] + start_col;
352
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
353
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
354
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
355
tmp3 = GETJSAMPLE(elemptr[3]);
357
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
358
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
359
tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
362
/* Apply unsigned->signed conversion */
363
dataptr[0] = (DCTELEM)
364
((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
368
z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
369
z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
370
z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
371
dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
373
z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
374
dataptr[4] = (DCTELEM)
375
DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
376
CONST_BITS-PASS1_BITS);
377
dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
381
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
382
tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
385
tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
387
tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
389
tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
391
dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
392
dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
393
dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
395
dataptr += DCTSIZE; /* advance pointer to next row */
398
/* Pass 2: process columns.
399
* We remove the PASS1_BITS scaling, but leave the results scaled up
400
* by an overall factor of 8.
401
* We must also scale the output by (8/7)**2 = 64/49, which we fold
402
* into the constant multipliers:
403
* cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
407
for (ctr = 0; ctr < 7; ctr++) {
410
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
411
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
412
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
413
tmp3 = dataptr[DCTSIZE*3];
415
tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
416
tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
417
tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
420
dataptr[DCTSIZE*0] = (DCTELEM)
421
DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
422
CONST_BITS+PASS1_BITS);
426
z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
427
z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
428
z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
429
dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
431
z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
432
dataptr[DCTSIZE*4] = (DCTELEM)
433
DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
434
CONST_BITS+PASS1_BITS);
435
dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
439
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
440
tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
443
tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
445
tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
447
tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
449
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
450
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
451
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
453
dataptr++; /* advance pointer to next column */
459
* Perform the forward DCT on a 6x6 sample block.
463
jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
465
INT32 tmp0, tmp1, tmp2;
466
INT32 tmp10, tmp11, tmp12;
472
/* Pre-zero output coefficient block. */
473
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
475
/* Pass 1: process rows. */
476
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
477
/* furthermore, we scale the results by 2**PASS1_BITS. */
478
/* cK represents sqrt(2) * cos(K*pi/12). */
481
for (ctr = 0; ctr < 6; ctr++) {
482
elemptr = sample_data[ctr] + start_col;
486
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
487
tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
488
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
493
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
494
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
495
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
497
/* Apply unsigned->signed conversion */
498
dataptr[0] = (DCTELEM)
499
((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
500
dataptr[2] = (DCTELEM)
501
DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
502
CONST_BITS-PASS1_BITS);
503
dataptr[4] = (DCTELEM)
504
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
505
CONST_BITS-PASS1_BITS);
509
tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
510
CONST_BITS-PASS1_BITS);
512
dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
513
dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
514
dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
516
dataptr += DCTSIZE; /* advance pointer to next row */
519
/* Pass 2: process columns.
520
* We remove the PASS1_BITS scaling, but leave the results scaled up
521
* by an overall factor of 8.
522
* We must also scale the output by (8/6)**2 = 16/9, which we fold
523
* into the constant multipliers:
524
* cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
528
for (ctr = 0; ctr < 6; ctr++) {
531
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
532
tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
533
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
538
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
539
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
540
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
542
dataptr[DCTSIZE*0] = (DCTELEM)
543
DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
544
CONST_BITS+PASS1_BITS);
545
dataptr[DCTSIZE*2] = (DCTELEM)
546
DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
547
CONST_BITS+PASS1_BITS);
548
dataptr[DCTSIZE*4] = (DCTELEM)
549
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
550
CONST_BITS+PASS1_BITS);
554
tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
556
dataptr[DCTSIZE*1] = (DCTELEM)
557
DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
558
CONST_BITS+PASS1_BITS);
559
dataptr[DCTSIZE*3] = (DCTELEM)
560
DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
561
CONST_BITS+PASS1_BITS);
562
dataptr[DCTSIZE*5] = (DCTELEM)
563
DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
564
CONST_BITS+PASS1_BITS);
566
dataptr++; /* advance pointer to next column */
572
* Perform the forward DCT on a 5x5 sample block.
576
jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
578
INT32 tmp0, tmp1, tmp2;
585
/* Pre-zero output coefficient block. */
586
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
588
/* Pass 1: process rows. */
589
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
590
/* furthermore, we scale the results by 2**PASS1_BITS. */
591
/* We scale the results further by 2 as part of output adaptation */
592
/* scaling for different DCT size. */
593
/* cK represents sqrt(2) * cos(K*pi/10). */
596
for (ctr = 0; ctr < 5; ctr++) {
597
elemptr = sample_data[ctr] + start_col;
601
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
602
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
603
tmp2 = GETJSAMPLE(elemptr[2]);
608
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
609
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
611
/* Apply unsigned->signed conversion */
612
dataptr[0] = (DCTELEM)
613
((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
614
tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
616
tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
617
dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
618
dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
622
tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
624
dataptr[1] = (DCTELEM)
625
DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
626
CONST_BITS-PASS1_BITS-1);
627
dataptr[3] = (DCTELEM)
628
DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
629
CONST_BITS-PASS1_BITS-1);
631
dataptr += DCTSIZE; /* advance pointer to next row */
634
/* Pass 2: process columns.
635
* We remove the PASS1_BITS scaling, but leave the results scaled up
636
* by an overall factor of 8.
637
* We must also scale the output by (8/5)**2 = 64/25, which we partially
638
* fold into the constant multipliers (other part was done in pass 1):
639
* cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
643
for (ctr = 0; ctr < 5; ctr++) {
646
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
647
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
648
tmp2 = dataptr[DCTSIZE*2];
653
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
654
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
656
dataptr[DCTSIZE*0] = (DCTELEM)
657
DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
658
CONST_BITS+PASS1_BITS);
659
tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
661
tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
662
dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
663
dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
667
tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
669
dataptr[DCTSIZE*1] = (DCTELEM)
670
DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
671
CONST_BITS+PASS1_BITS);
672
dataptr[DCTSIZE*3] = (DCTELEM)
673
DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
674
CONST_BITS+PASS1_BITS);
676
dataptr++; /* advance pointer to next column */
682
* Perform the forward DCT on a 4x4 sample block.
686
jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
695
/* Pre-zero output coefficient block. */
696
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
698
/* Pass 1: process rows. */
699
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
700
/* furthermore, we scale the results by 2**PASS1_BITS. */
701
/* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
702
/* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
705
for (ctr = 0; ctr < 4; ctr++) {
706
elemptr = sample_data[ctr] + start_col;
710
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
711
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
713
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
714
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
716
/* Apply unsigned->signed conversion */
717
dataptr[0] = (DCTELEM)
718
((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
719
dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
723
tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
724
/* Add fudge factor here for final descale. */
725
tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
727
dataptr[1] = (DCTELEM)
728
RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
729
CONST_BITS-PASS1_BITS-2);
730
dataptr[3] = (DCTELEM)
731
RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
732
CONST_BITS-PASS1_BITS-2);
734
dataptr += DCTSIZE; /* advance pointer to next row */
737
/* Pass 2: process columns.
738
* We remove the PASS1_BITS scaling, but leave the results scaled up
739
* by an overall factor of 8.
743
for (ctr = 0; ctr < 4; ctr++) {
746
/* Add fudge factor here for final descale. */
747
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
748
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
750
tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
751
tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
753
dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
754
dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
758
tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
759
/* Add fudge factor here for final descale. */
760
tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
762
dataptr[DCTSIZE*1] = (DCTELEM)
763
RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
764
CONST_BITS+PASS1_BITS);
765
dataptr[DCTSIZE*3] = (DCTELEM)
766
RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
767
CONST_BITS+PASS1_BITS);
769
dataptr++; /* advance pointer to next column */
775
* Perform the forward DCT on a 3x3 sample block.
779
jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
781
INT32 tmp0, tmp1, tmp2;
787
/* Pre-zero output coefficient block. */
788
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
790
/* Pass 1: process rows. */
791
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
792
/* furthermore, we scale the results by 2**PASS1_BITS. */
793
/* We scale the results further by 2**2 as part of output adaptation */
794
/* scaling for different DCT size. */
795
/* cK represents sqrt(2) * cos(K*pi/6). */
798
for (ctr = 0; ctr < 3; ctr++) {
799
elemptr = sample_data[ctr] + start_col;
803
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
804
tmp1 = GETJSAMPLE(elemptr[1]);
806
tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
808
/* Apply unsigned->signed conversion */
809
dataptr[0] = (DCTELEM)
810
((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
811
dataptr[2] = (DCTELEM)
812
DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
813
CONST_BITS-PASS1_BITS-2);
817
dataptr[1] = (DCTELEM)
818
DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
819
CONST_BITS-PASS1_BITS-2);
821
dataptr += DCTSIZE; /* advance pointer to next row */
824
/* Pass 2: process columns.
825
* We remove the PASS1_BITS scaling, but leave the results scaled up
826
* by an overall factor of 8.
827
* We must also scale the output by (8/3)**2 = 64/9, which we partially
828
* fold into the constant multipliers (other part was done in pass 1):
829
* cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
833
for (ctr = 0; ctr < 3; ctr++) {
836
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
837
tmp1 = dataptr[DCTSIZE*1];
839
tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
841
dataptr[DCTSIZE*0] = (DCTELEM)
842
DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
843
CONST_BITS+PASS1_BITS);
844
dataptr[DCTSIZE*2] = (DCTELEM)
845
DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
846
CONST_BITS+PASS1_BITS);
850
dataptr[DCTSIZE*1] = (DCTELEM)
851
DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
852
CONST_BITS+PASS1_BITS);
854
dataptr++; /* advance pointer to next column */
860
* Perform the forward DCT on a 2x2 sample block.
864
jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
866
INT32 tmp0, tmp1, tmp2, tmp3;
869
/* Pre-zero output coefficient block. */
870
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
872
/* Pass 1: process rows. */
873
/* Note results are scaled up by sqrt(8) compared to a true DCT. */
876
elemptr = sample_data[0] + start_col;
878
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
879
tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
882
elemptr = sample_data[1] + start_col;
884
tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
885
tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
887
/* Pass 2: process columns.
888
* We leave the results scaled up by an overall factor of 8.
889
* We must also scale the output by (8/2)**2 = 2**4.
893
/* Apply unsigned->signed conversion */
894
data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
895
data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
898
data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
899
data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
904
* Perform the forward DCT on a 1x1 sample block.
908
jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
910
/* Pre-zero output coefficient block. */
911
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
913
/* We leave the result scaled up by an overall factor of 8. */
914
/* We must also scale the output by (8/1)**2 = 2**6. */
915
/* Apply unsigned->signed conversion */
917
((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
922
* Perform the forward DCT on a 9x9 sample block.
926
jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
928
INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
929
INT32 tmp10, tmp11, tmp12, tmp13;
931
DCTELEM workspace[8];
938
/* Pass 1: process rows. */
939
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
940
/* we scale the results further by 2 as part of output adaptation */
941
/* scaling for different DCT size. */
942
/* cK represents sqrt(2) * cos(K*pi/18). */
947
elemptr = sample_data[ctr] + start_col;
951
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
952
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
953
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
954
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
955
tmp4 = GETJSAMPLE(elemptr[4]);
957
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
958
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
959
tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
960
tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
962
z1 = tmp0 + tmp2 + tmp3;
964
/* Apply unsigned->signed conversion */
965
dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
966
dataptr[6] = (DCTELEM)
967
DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
969
z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
970
z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
971
dataptr[2] = (DCTELEM)
972
DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
973
+ z1 + z2, CONST_BITS-1);
974
dataptr[4] = (DCTELEM)
975
DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
976
+ z1 - z2, CONST_BITS-1);
980
dataptr[3] = (DCTELEM)
981
DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
984
tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
985
tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
986
tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
988
dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
990
tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
992
dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
993
dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
997
if (ctr != DCTSIZE) {
1000
dataptr += DCTSIZE; /* advance pointer to next row */
1002
dataptr = workspace; /* switch pointer to extended workspace */
1005
/* Pass 2: process columns.
1006
* We leave the results scaled up by an overall factor of 8.
1007
* We must also scale the output by (8/9)**2 = 64/81, which we partially
1008
* fold into the constant multipliers and final/initial shifting:
1009
* cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
1014
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1017
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
1018
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
1019
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
1020
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
1021
tmp4 = dataptr[DCTSIZE*4];
1023
tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
1024
tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
1025
tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
1026
tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
1028
z1 = tmp0 + tmp2 + tmp3;
1030
dataptr[DCTSIZE*0] = (DCTELEM)
1031
DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
1033
dataptr[DCTSIZE*6] = (DCTELEM)
1034
DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
1036
z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
1037
z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
1038
dataptr[DCTSIZE*2] = (DCTELEM)
1039
DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
1040
+ z1 + z2, CONST_BITS+2);
1041
dataptr[DCTSIZE*4] = (DCTELEM)
1042
DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
1043
+ z1 - z2, CONST_BITS+2);
1047
dataptr[DCTSIZE*3] = (DCTELEM)
1048
DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
1051
tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
1052
tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
1053
tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
1055
dataptr[DCTSIZE*1] = (DCTELEM)
1056
DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
1058
tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
1060
dataptr[DCTSIZE*5] = (DCTELEM)
1061
DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
1062
dataptr[DCTSIZE*7] = (DCTELEM)
1063
DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
1065
dataptr++; /* advance pointer to next column */
1066
wsptr++; /* advance pointer to next column */
1072
/* Perform the forward DCT on a 10x10 sample block. */
1076
/*
 * jpeg_fdct_10x10: forward DCT of a 10x10 sample block.
 *
 * data        - DCTELEM array; its rows are addressed DCTSIZE elements apart.
 * sample_data - sample rows; row ctr is read as sample_data[ctr].
 * start_col   - offset added to each row pointer before the ten samples
 *               elemptr[0..9] are read.
 *
 * Pass 1 turns every sample row into eight values dataptr[0..7]; pass 2
 * then works down the columns.  workspace[] (8*2 elements) is the
 * "extended workspace" that dataptr is switched to during pass 1; pass 2
 * reads the extra rows through wsptr.
 * NOTE(review): the initialisation of dataptr, wsptr and ctr is not visible
 * in this block; presumably dataptr starts at data and wsptr at workspace
 * -- confirm.
 */
jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1078
INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1079
INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1080
DCTELEM workspace[8*2];
1087
/* Pass 1: process rows. */
1088
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
1089
/* we scale the results further by 2 as part of output adaption */
1090
/* scaling for different DCT size. */
1091
/* cK represents sqrt(2) * cos(K*pi/20). */
1096
elemptr = sample_data[ctr] + start_col;
/* Even part: sums of the sample pairs mirrored about the row centre */
/* (0,9) (1,8) (2,7) (3,6) (4,5); they feed outputs 0, 4, 2 and 6. */
1100
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1101
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1102
tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1103
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1104
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1106
tmp10 = tmp0 + tmp4;
1107
tmp13 = tmp0 - tmp4;
1108
tmp11 = tmp1 + tmp3;
1109
tmp14 = tmp1 - tmp3;
/* Differences of the same mirrored pairs; tmp0..tmp4 are reused for them */
/* and feed the odd outputs 5, 1, 3 and 7 further down. */
1111
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1112
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1113
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1114
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1115
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
/* Output 0 is the plain sum of all ten samples: 10 * CENTERJSAMPLE is */
/* subtracted from it and the result is shifted left by one (the extra */
/* factor of 2 mentioned in the pass 1 notes). */
1117
/* Apply unsigned->signed conversion */
1118
dataptr[0] = (DCTELEM)
1119
((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
1121
dataptr[4] = (DCTELEM)
1122
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1123
MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
1125
tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
1126
dataptr[2] = (DCTELEM)
1127
DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
1129
dataptr[6] = (DCTELEM)
1130
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
/* Odd part: outputs 5, 1, 3 and 7 from the pair differences tmp0..tmp4. */
/* Output 5 needs no multiplication, only the shift by one. */
1135
tmp10 = tmp0 + tmp4;
1136
tmp11 = tmp1 - tmp3;
1137
dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
/* tmp2 is pre-shifted by CONST_BITS; below it is added to / subtracted */
/* from the MULTIPLY products without a multiplication of its own. */
1138
tmp2 <<= CONST_BITS;
1139
dataptr[1] = (DCTELEM)
1140
DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
1141
MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
1142
MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
1143
MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
1145
tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
1146
MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
1147
tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
1148
(tmp11 << (CONST_BITS - 1)) - tmp2;
1149
dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
1150
dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
/* Row bookkeeping: dataptr either steps to the next row or is switched */
/* to workspace[].  NOTE(review): the else branch and the loop exit are */
/* not visible here; presumably the switch happens when ctr == DCTSIZE. */
1154
if (ctr != DCTSIZE) {
1157
dataptr += DCTSIZE; /* advance pointer to next row */
1159
dataptr = workspace; /* switch pointer to extended workspace */
/* In the column pass rows 0..7 are read as dataptr[DCTSIZE*k] and the two */
/* extra rows through wsptr: wsptr[DCTSIZE*1] is paired with row 0 and */
/* wsptr[DCTSIZE*0] with row 1, mirroring the (0,9) (1,8) pairs of pass 1. */
1162
/* Pass 2: process columns.
1163
* We leave the results scaled up by an overall factor of 8.
1164
* We must also scale the output by (8/10)**2 = 16/25, which we partially
1165
* fold into the constant multipliers and final/initial shifting:
1166
* cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
1171
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1174
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
1175
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
1176
tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
1177
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
1178
tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
1180
tmp10 = tmp0 + tmp4;
1181
tmp13 = tmp0 - tmp4;
1182
tmp11 = tmp1 + tmp3;
1183
tmp14 = tmp1 - tmp3;
1185
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
1186
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
1187
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
1188
tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
1189
tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
1191
dataptr[DCTSIZE*0] = (DCTELEM)
1192
DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
1195
dataptr[DCTSIZE*4] = (DCTELEM)
1196
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
1197
MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
1199
tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
1200
dataptr[DCTSIZE*2] = (DCTELEM)
1201
DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
1203
dataptr[DCTSIZE*6] = (DCTELEM)
1204
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
/* Odd part of the column pass: outputs 5, 1, 3 and 7 from the row */
/* differences tmp0..tmp4. */
1209
tmp10 = tmp0 + tmp4;
1210
tmp11 = tmp1 - tmp3;
1211
dataptr[DCTSIZE*5] = (DCTELEM)
1212
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
/* Column-pass counterpart of "tmp2 <<= CONST_BITS" in pass 1: here tmp2 */
/* is multiplied by the 32/25 factor instead of only being shifted. */
1214
tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
1215
dataptr[DCTSIZE*1] = (DCTELEM)
1216
DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
1217
MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
1218
MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
1219
MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
1221
tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
1222
MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
1223
tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
1224
MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
1225
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
1226
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
1228
dataptr++; /* advance pointer to next column */
1229
wsptr++; /* advance pointer to next column */
1235
/* Perform the forward DCT on an 11x11 sample block. */
1239
/*
 * jpeg_fdct_11x11: forward DCT of an 11x11 sample block.
 *
 * data        - DCTELEM array; its rows are addressed DCTSIZE elements apart.
 * sample_data - sample rows; row ctr is read as sample_data[ctr].
 * start_col   - offset added to each row pointer before the eleven samples
 *               elemptr[0..10] are read.
 *
 * Pass 1 turns every sample row into eight values dataptr[0..7]; pass 2
 * then works down the columns.  workspace[] (8*3 elements) is the
 * "extended workspace" that dataptr is switched to during pass 1; pass 2
 * reads the extra rows through wsptr.
 * NOTE(review): the initialisation of dataptr, wsptr and ctr and the
 * declaration of z1..z3 are not visible in this block; presumably dataptr
 * starts at data and wsptr at workspace -- confirm.
 */
jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1241
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1242
INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1244
DCTELEM workspace[8*3];
1251
/* Pass 1: process rows. */
1252
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
1253
/* we scale the results further by 2 as part of output adaption */
1254
/* scaling for different DCT size. */
1255
/* cK represents sqrt(2) * cos(K*pi/22). */
1260
elemptr = sample_data[ctr] + start_col;
/* Even part: sums of the sample pairs mirrored about elemptr[5]; */
/* tmp5 is the unpaired centre sample. */
1264
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1265
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1266
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1267
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1268
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1269
tmp5 = GETJSAMPLE(elemptr[5]);
/* Differences of the same mirrored pairs, used for the odd outputs. */
1271
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1272
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1273
tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1274
tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1275
tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
/* Output 0 is the plain sum of all eleven samples: 11 * CENTERJSAMPLE is */
/* subtracted from it and the result is shifted left by one (the extra */
/* factor of 2 mentioned in the pass 1 notes). */
1277
/* Apply unsigned->signed conversion */
1278
dataptr[0] = (DCTELEM)
1279
((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
/* Even outputs 2, 4 and 6 share the partial products z1, z2 and z3: */
/* output 2 uses z1 + z2, output 4 uses z2 + z3, output 6 uses z1 + z3. */
1286
z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
1287
MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
1288
z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
1289
z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
1290
dataptr[2] = (DCTELEM)
1291
DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1292
- MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
1294
dataptr[4] = (DCTELEM)
1295
DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1296
- MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
1297
+ MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
1299
dataptr[6] = (DCTELEM)
1300
DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
1301
- MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
/* Odd part: outputs 1, 3, 5 and 7 are accumulated in tmp0, tmp1, tmp2 and */
/* tmp3 from the pair differences tmp10..tmp14; tmp4, tmp5 and tmp10 are */
/* reused for products shared between two of the outputs. */
1306
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
1307
tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
1308
tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
1309
tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
1310
+ MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
1311
tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
1312
tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
1313
tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
1314
- MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
1315
tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
1316
tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
1317
+ MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
1318
tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
1319
- MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
1321
dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
1322
dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
1323
dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
1324
dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
/* Row bookkeeping: dataptr either steps to the next row or is switched */
/* to workspace[].  NOTE(review): the else branch and the loop exit are */
/* not visible here; presumably the switch happens when ctr == DCTSIZE. */
1328
if (ctr != DCTSIZE) {
1331
dataptr += DCTSIZE; /* advance pointer to next row */
1333
dataptr = workspace; /* switch pointer to extended workspace */
/* In the column pass rows 0..7 are read as dataptr[DCTSIZE*k] and the */
/* three extra rows through wsptr: wsptr[DCTSIZE*2], [DCTSIZE*1] and */
/* [DCTSIZE*0] are paired with rows 0, 1 and 2; row 5 stays unpaired. */
1336
/* Pass 2: process columns.
1337
* We leave the results scaled up by an overall factor of 8.
1338
* We must also scale the output by (8/11)**2 = 64/121, which we partially
1339
* fold into the constant multipliers and final/initial shifting:
1340
* cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
1345
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1348
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
1349
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
1350
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
1351
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
1352
tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
1353
tmp5 = dataptr[DCTSIZE*5];
1355
tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
1356
tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
1357
tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
1358
tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
1359
tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
1361
dataptr[DCTSIZE*0] = (DCTELEM)
1362
DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1363
FIX(1.057851240)), /* 128/121 */
/* Even outputs 2, 4 and 6 of the column pass, again built from the */
/* shared partial products z1, z2 and z3. */
1371
z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
1372
MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
1373
z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
1374
z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
1375
dataptr[DCTSIZE*2] = (DCTELEM)
1376
DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
1377
- MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
1379
dataptr[DCTSIZE*4] = (DCTELEM)
1380
DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
1381
- MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
1382
+ MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
1384
dataptr[DCTSIZE*6] = (DCTELEM)
1385
DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
1386
- MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
/* Odd part of the column pass: same structure as in pass 1, with the */
/* rescaled constants and a final shift of CONST_BITS+2. */
1391
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
1392
tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
1393
tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
1394
tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
1395
+ MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
1396
tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
1397
tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
1398
tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
1399
- MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
1400
tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
1401
tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
1402
+ MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
1403
tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
1404
- MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
1406
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
1407
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
1408
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
1409
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
1411
dataptr++; /* advance pointer to next column */
1412
wsptr++; /* advance pointer to next column */
1418
/* Perform the forward DCT on a 12x12 sample block. */
1422
/*
 * jpeg_fdct_12x12: forward DCT of a 12x12 sample block.
 *
 * data        - DCTELEM array; its rows are addressed DCTSIZE elements apart.
 * sample_data - sample rows; row ctr is read as sample_data[ctr].
 * start_col   - offset added to each row pointer before the twelve samples
 *               elemptr[0..11] are read.
 *
 * Pass 1 turns every sample row into eight values dataptr[0..7]; pass 2
 * then works down the columns.  workspace[] (8*4 elements) is the
 * "extended workspace" that dataptr is switched to during pass 1; pass 2
 * reads the extra rows through wsptr.
 * NOTE(review): the initialisation of dataptr, wsptr and ctr is not visible
 * in this block; presumably dataptr starts at data and wsptr at workspace
 * -- confirm.
 */
jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1424
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1425
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1426
DCTELEM workspace[8*4];
1433
/* Pass 1: process rows. */
1434
/* Note results are scaled up by sqrt(8) compared to a true DCT. */
1435
/* cK represents sqrt(2) * cos(K*pi/24). */
1440
elemptr = sample_data[ctr] + start_col;
/* Even part: sums of the sample pairs mirrored about the row centre, */
/* (0,11) (1,10) (2,9) (3,8) (4,7) (5,6), then sums and differences of */
/* those sums in tmp10..tmp15. */
1444
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1445
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1446
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1447
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1448
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1449
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1451
tmp10 = tmp0 + tmp5;
1452
tmp13 = tmp0 - tmp5;
1453
tmp11 = tmp1 + tmp4;
1454
tmp14 = tmp1 - tmp4;
1455
tmp12 = tmp2 + tmp3;
1456
tmp15 = tmp2 - tmp3;
/* Differences of the same mirrored pairs; tmp0..tmp5 are reused for them */
/* and feed the odd outputs 1, 3, 5 and 7 further down. */
1458
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1459
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1460
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1461
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1462
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1463
tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
/* Outputs 0 and 6 need no multiplication.  Output 0 is the plain sum of */
/* all twelve samples with 12 * CENTERJSAMPLE subtracted; unlike the */
/* 10x10 and 11x11 routines there is no extra shift by one here. */
1465
/* Apply unsigned->signed conversion */
1466
dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1467
dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1468
dataptr[4] = (DCTELEM)
1469
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
/* NOTE(review): tmp14 - tmp15 enters the DESCALE argument below without a */
/* FIX() factor or a CONST_BITS shift, unlike the MULTIPLY product next to */
/* it and unlike the column pass, which uses */
/* MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) for the same term -- confirm */
/* the intended scaling. */
1471
dataptr[2] = (DCTELEM)
1472
DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
/* Odd part: outputs 1, 3, 5 and 7 are accumulated in tmp10, tmp11, tmp12 */
/* and tmp13 from the pair differences tmp0..tmp5.  The FIX_x_xxx names */
/* are constants defined outside this block. */
1477
tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
1478
tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
1479
tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
1480
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
1481
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
1482
tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1483
+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
1484
tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1485
tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
1486
+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
1487
tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
1488
- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
1489
tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
1490
- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
1492
dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1493
dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1494
dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1495
dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
/* Row bookkeeping: dataptr either steps to the next row or is switched */
/* to workspace[].  NOTE(review): the else branch and the loop exit are */
/* not visible here; presumably the switch happens when ctr == DCTSIZE. */
1499
if (ctr != DCTSIZE) {
1502
dataptr += DCTSIZE; /* advance pointer to next row */
1504
dataptr = workspace; /* switch pointer to extended workspace */
/* In the column pass rows 0..7 are read as dataptr[DCTSIZE*k] and the */
/* four extra rows through wsptr: wsptr[DCTSIZE*3], [2], [1] and [0] are */
/* paired with rows 0, 1, 2 and 3, mirroring the pairs used in pass 1. */
1507
/* Pass 2: process columns.
1508
* We leave the results scaled up by an overall factor of 8.
1509
* We must also scale the output by (8/12)**2 = 4/9, which we partially
1510
* fold into the constant multipliers and final shifting:
1511
* cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
1516
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1519
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
1520
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
1521
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
1522
tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
1523
tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
1524
tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
1526
tmp10 = tmp0 + tmp5;
1527
tmp13 = tmp0 - tmp5;
1528
tmp11 = tmp1 + tmp4;
1529
tmp14 = tmp1 - tmp4;
1530
tmp12 = tmp2 + tmp3;
1531
tmp15 = tmp2 - tmp3;
1533
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
1534
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
1535
tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
1536
tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
1537
tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
1538
tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
1540
dataptr[DCTSIZE*0] = (DCTELEM)
1541
DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
1543
dataptr[DCTSIZE*6] = (DCTELEM)
1544
DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
1546
dataptr[DCTSIZE*4] = (DCTELEM)
1547
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
1549
dataptr[DCTSIZE*2] = (DCTELEM)
1550
DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
1551
MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
/* Odd part of the column pass: same structure as in pass 1, but every */
/* constant is written with FIX() and the final shift is CONST_BITS+1. */
1556
tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
1557
tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
1558
tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
1559
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
1560
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
1561
tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
1562
+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
1563
tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
1564
tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
1565
+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
1566
tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
1567
- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
1568
tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
1569
- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
1571
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
1572
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
1573
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
1574
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
1576
dataptr++; /* advance pointer to next column */
1577
wsptr++; /* advance pointer to next column */
1583
/* Perform the forward DCT on a 13x13 sample block. */
1587
/*
 * jpeg_fdct_13x13: forward DCT of a 13x13 sample block.
 *
 * data        - DCTELEM array; its rows are addressed DCTSIZE elements apart.
 * sample_data - sample rows; row ctr is read as sample_data[ctr].
 * start_col   - offset added to each row pointer before the thirteen
 *               samples elemptr[0..12] are read.
 *
 * Pass 1 turns every sample row into eight values dataptr[0..7]; pass 2
 * then works down the columns.  workspace[] (8*5 elements) is the
 * "extended workspace" that dataptr is switched to during pass 1; pass 2
 * reads the extra rows through wsptr.
 * NOTE(review): the initialisation of dataptr, wsptr and ctr and the
 * declaration of z1 and z2 are not visible in this block; presumably
 * dataptr starts at data and wsptr at workspace -- confirm.
 */
jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1589
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1590
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1592
DCTELEM workspace[8*5];
1599
/* Pass 1: process rows. */
1600
/* Note results are scaled up by sqrt(8) compared to a true DCT. */
1601
/* cK represents sqrt(2) * cos(K*pi/26). */
1606
elemptr = sample_data[ctr] + start_col;
/* Even part: sums of the sample pairs mirrored about elemptr[6]; */
/* tmp6 is the unpaired centre sample. */
1610
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1611
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1612
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1613
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1614
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1615
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1616
tmp6 = GETJSAMPLE(elemptr[6]);
/* Differences of the same mirrored pairs, used for the odd outputs. */
1618
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1619
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1620
tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1621
tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1622
tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1623
tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
/* Output 0 is the plain sum of all thirteen samples with */
/* 13 * CENTERJSAMPLE subtracted; no extra shift is applied here. */
1625
/* Apply unsigned->signed conversion */
1626
dataptr[0] = (DCTELEM)
1627
(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
/* Output 2 is a direct six-term weighted sum of the pair sums. */
1635
dataptr[2] = (DCTELEM)
1636
DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
1637
MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
1638
MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
1639
MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
1640
MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
1641
MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
/* Outputs 4 and 6 are formed as z1 + z2 and z1 - z2. */
1643
z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1644
MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1645
MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
1646
z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
1647
MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
1648
MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
1650
dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1651
dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
/* Odd part: outputs 1, 3, 5 and 7 are accumulated in tmp0, tmp1, tmp2 and */
/* tmp3 from the pair differences tmp10..tmp15; tmp4, tmp5 and tmp6 are */
/* reused for products shared between two of the outputs. */
1655
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
1656
tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
1657
tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
1658
MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
1659
tmp0 = tmp1 + tmp2 + tmp3 -
1660
MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
1661
MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
1662
tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
1663
MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
1664
tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
1665
tmp1 += tmp4 + tmp5 +
1666
MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
1667
MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
1668
tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
1669
tmp2 += tmp4 + tmp6 -
1670
MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
1671
MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
1672
tmp3 += tmp5 + tmp6 +
1673
MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
1674
MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
1676
dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1677
dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1678
dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1679
dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
/* Row bookkeeping: dataptr either steps to the next row or is switched */
/* to workspace[].  NOTE(review): the else branch and the loop exit are */
/* not visible here; presumably the switch happens when ctr == DCTSIZE. */
1683
if (ctr != DCTSIZE) {
1686
dataptr += DCTSIZE; /* advance pointer to next row */
1688
dataptr = workspace; /* switch pointer to extended workspace */
/* In the column pass rows 0..7 are read as dataptr[DCTSIZE*k] and the */
/* five extra rows through wsptr: wsptr[DCTSIZE*4] .. wsptr[DCTSIZE*0] */
/* are paired with rows 0 .. 4, rows 5 and 7 are paired with each other, */
/* and row 6 stays unpaired. */
1691
/* Pass 2: process columns.
1692
* We leave the results scaled up by an overall factor of 8.
1693
* We must also scale the output by (8/13)**2 = 64/169, which we partially
1694
* fold into the constant multipliers and final shifting:
1695
* cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
1700
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1703
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
1704
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
1705
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
1706
tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
1707
tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
1708
tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
1709
tmp6 = dataptr[DCTSIZE*6];
1711
tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
1712
tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
1713
tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
1714
tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
1715
tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
1716
tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
1718
dataptr[DCTSIZE*0] = (DCTELEM)
1719
DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1720
FIX(0.757396450)), /* 128/169 */
/* Even outputs of the column pass: output 2 as a direct six-term sum, */
/* outputs 4 and 6 as z1 + z2 and z1 - z2. */
1729
dataptr[DCTSIZE*2] = (DCTELEM)
1730
DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
1731
MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
1732
MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
1733
MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
1734
MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
1735
MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
1737
z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
1738
MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
1739
MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
1740
z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
1741
MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
1742
MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
1744
dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
1745
dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
/* Odd part of the column pass: same structure as in pass 1, with the */
/* rescaled constants and a final shift of CONST_BITS+1. */
1749
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
1750
tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
1751
tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
1752
MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
1753
tmp0 = tmp1 + tmp2 + tmp3 -
1754
MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
1755
MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
1756
tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
1757
MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
1758
tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
1759
tmp1 += tmp4 + tmp5 +
1760
MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
1761
MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
1762
tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
1763
tmp2 += tmp4 + tmp6 -
1764
MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
1765
MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
1766
tmp3 += tmp5 + tmp6 +
1767
MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
1768
MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
1770
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
1771
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
1772
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
1773
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
1775
dataptr++; /* advance pointer to next column */
1776
wsptr++; /* advance pointer to next column */
1782
/* Perform the forward DCT on a 14x14 sample block. */
1786
/*
 * jpeg_fdct_14x14: forward DCT of a 14x14 sample block.
 *
 * data        - DCTELEM array; its rows are addressed DCTSIZE elements apart.
 * sample_data - sample rows; row ctr is read as sample_data[ctr].
 * start_col   - offset added to each row pointer before the fourteen
 *               samples elemptr[0..13] are read.
 *
 * Pass 1 turns every sample row into eight values dataptr[0..7]; pass 2
 * then works down the columns.  workspace[] (8*6 elements) is the
 * "extended workspace" that dataptr is switched to during pass 1; pass 2
 * reads the extra rows through wsptr.
 * NOTE(review): the initialisation of dataptr, wsptr and ctr is not visible
 * in this block; presumably dataptr starts at data and wsptr at workspace
 * -- confirm.
 */
jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1788
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1789
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1790
DCTELEM workspace[8*6];
1797
/* Pass 1: process rows. */
1798
/* Note results are scaled up by sqrt(8) compared to a true DCT. */
1799
/* cK represents sqrt(2) * cos(K*pi/28). */
1804
elemptr = sample_data[ctr] + start_col;
/* Even part: sums of the sample pairs mirrored about the row centre. */
/* The (3,10) pair sum is stored directly in tmp13; the other six sums */
/* are combined into tmp10..tmp12 and tmp14..tmp16. */
1808
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1809
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1810
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1811
tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
1812
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1813
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1814
tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1816
tmp10 = tmp0 + tmp6;
1817
tmp14 = tmp0 - tmp6;
1818
tmp11 = tmp1 + tmp5;
1819
tmp15 = tmp1 - tmp5;
1820
tmp12 = tmp2 + tmp4;
1821
tmp16 = tmp2 - tmp4;
/* Differences of the same mirrored pairs; tmp0..tmp6 are reused for them */
/* and feed the odd outputs 7, 5, 3 and 1 further down. */
1823
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1824
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1825
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1826
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1827
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1828
tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1829
tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
/* Output 0 is the plain sum of all fourteen samples with */
/* 14 * CENTERJSAMPLE subtracted; no extra shift is applied here. */
1831
/* Apply unsigned->signed conversion */
1832
dataptr[0] = (DCTELEM)
1833
(tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1835
dataptr[4] = (DCTELEM)
1836
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1837
MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1838
MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
1841
tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
1843
dataptr[2] = (DCTELEM)
1844
DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
1845
+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
1847
dataptr[6] = (DCTELEM)
1848
DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
1849
- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
/* Odd part: outputs 7, 5, 3 and 1 from the pair differences tmp0..tmp6. */
/* Output 7 needs no multiplication. */
1854
tmp10 = tmp1 + tmp2;
1855
tmp11 = tmp5 - tmp4;
1856
dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
/* tmp3 is pre-shifted by CONST_BITS; below it is added to / subtracted */
/* from the MULTIPLY products without a multiplication of its own. */
1857
tmp3 <<= CONST_BITS;
1858
tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
1859
tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
1860
tmp10 += tmp11 - tmp3;
1861
tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
1862
MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
1863
dataptr[5] = (DCTELEM)
1864
DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
1865
+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
1867
tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
1868
MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
1869
dataptr[3] = (DCTELEM)
1870
DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
1871
- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
/* NOTE(review): tmp3 was pre-shifted by CONST_BITS above, but tmp6 is */
/* added below as a bare, unshifted value next to the MULTIPLY products; */
/* the column pass uses an explicit MULTIPLY(tmp6, FIX(0.082925825)) term */
/* for output 1 instead -- confirm the intended scale of the tmp6 term. */
1873
dataptr[1] = (DCTELEM)
1874
DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
1875
MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
/* Row bookkeeping: dataptr either steps to the next row or is switched */
/* to workspace[].  NOTE(review): the else branch and the loop exit are */
/* not visible here; presumably the switch happens when ctr == DCTSIZE. */
1880
if (ctr != DCTSIZE) {
1883
dataptr += DCTSIZE; /* advance pointer to next row */
1885
dataptr = workspace; /* switch pointer to extended workspace */
/* In the column pass rows 0..7 are read as dataptr[DCTSIZE*k] and the */
/* six extra rows through wsptr: wsptr[DCTSIZE*5] .. wsptr[DCTSIZE*0] are */
/* paired with rows 0 .. 5, and rows 6 and 7 are paired with each other. */
1888
/* Pass 2: process columns.
1889
* We leave the results scaled up by an overall factor of 8.
1890
* We must also scale the output by (8/14)**2 = 16/49, which we partially
1891
* fold into the constant multipliers and final shifting:
1892
* cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
1897
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1900
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
1901
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
1902
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
1903
tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
1904
tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
1905
tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
1906
tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
1908
tmp10 = tmp0 + tmp6;
1909
tmp14 = tmp0 - tmp6;
1910
tmp11 = tmp1 + tmp5;
1911
tmp15 = tmp1 - tmp5;
1912
tmp12 = tmp2 + tmp4;
1913
tmp16 = tmp2 - tmp4;
1915
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
1916
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
1917
tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
1918
tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
1919
tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
1920
tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
1921
tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
1923
dataptr[DCTSIZE*0] = (DCTELEM)
1924
DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
1925
FIX(0.653061224)), /* 32/49 */
1928
dataptr[DCTSIZE*4] = (DCTELEM)
1929
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
1930
MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
1931
MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
1934
tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
1936
dataptr[DCTSIZE*2] = (DCTELEM)
1937
DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
1938
+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
1940
dataptr[DCTSIZE*6] = (DCTELEM)
1941
DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
1942
- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
/* Odd part of the column pass: outputs 7, 5, 3 and 1 from the row */
/* differences tmp0..tmp6.  Here output 7 is multiplied by 32/49. */
1947
tmp10 = tmp1 + tmp2;
1948
tmp11 = tmp5 - tmp4;
1949
dataptr[DCTSIZE*7] = (DCTELEM)
1950
DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
1951
FIX(0.653061224)), /* 32/49 */
/* Column-pass counterpart of "tmp3 <<= CONST_BITS" in pass 1: here tmp3 */
/* is multiplied by the 32/49 factor instead of only being shifted. */
1953
tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
1954
tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
1955
tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
1956
tmp10 += tmp11 - tmp3;
1957
tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
1958
MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
1959
dataptr[DCTSIZE*5] = (DCTELEM)
1960
DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
1961
+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
1963
tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
1964
MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
1965
dataptr[DCTSIZE*3] = (DCTELEM)
1966
DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
1967
- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
1969
dataptr[DCTSIZE*1] = (DCTELEM)
1970
DESCALE(tmp11 + tmp12 + tmp3
1971
- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
1972
- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
1975
dataptr++; /* advance pointer to next column */
1976
wsptr++; /* advance pointer to next column */
1982
* Perform the forward DCT on a 15x15 sample block.
1986
jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1988
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1989
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1991
DCTELEM workspace[8*7];
1998
/* Pass 1: process rows. */
1999
/* Note results are scaled up by sqrt(8) compared to a true DCT. */
2000
/* cK represents sqrt(2) * cos(K*pi/30). */
2005
elemptr = sample_data[ctr] + start_col;
2009
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2010
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2011
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2012
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2013
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2014
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2015
tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2016
tmp7 = GETJSAMPLE(elemptr[7]);
2018
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2019
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2020
tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2021
tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2022
tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2023
tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2024
tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2026
z1 = tmp0 + tmp4 + tmp5;
2027
z2 = tmp1 + tmp3 + tmp6;
2029
/* Apply unsigned->signed conversion */
2030
dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2032
dataptr[6] = (DCTELEM)
2033
DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2034
MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
2036
tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2037
z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
2038
MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
2039
z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
2040
MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
2041
z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
2042
MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
2043
MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
2045
dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2046
dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2050
tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2051
FIX(1.224744871)); /* c5 */
2052
tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
2053
MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
2054
tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
2055
tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
2056
MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
2057
MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
2058
tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
2059
MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
2060
MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
2061
tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
2062
MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
2063
MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
2065
dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2066
dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2067
dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2068
dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2072
if (ctr != DCTSIZE) {
2075
dataptr += DCTSIZE; /* advance pointer to next row */
2077
dataptr = workspace; /* switch pointer to extended workspace */
2080
/* Pass 2: process columns.
2081
* We leave the results scaled up by an overall factor of 8.
2082
* We must also scale the output by (8/15)**2 = 64/225, which we partially
2083
* fold into the constant multipliers and final shifting:
2084
* cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
2089
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2092
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
2093
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
2094
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
2095
tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
2096
tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
2097
tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
2098
tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
2099
tmp7 = dataptr[DCTSIZE*7];
2101
tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
2102
tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
2103
tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
2104
tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
2105
tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
2106
tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
2107
tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
2109
z1 = tmp0 + tmp4 + tmp5;
2110
z2 = tmp1 + tmp3 + tmp6;
2112
dataptr[DCTSIZE*0] = (DCTELEM)
2113
DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
2116
dataptr[DCTSIZE*6] = (DCTELEM)
2117
DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
2118
MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
2120
tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2121
z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
2122
MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
2123
z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
2124
MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
2125
z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
2126
MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
2127
MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
2129
dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
2130
dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
2134
tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2135
FIX(1.393487498)); /* c5 */
2136
tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
2137
MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
2138
tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
2139
tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
2140
MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
2141
MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
2142
tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
2143
MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
2144
MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
2145
tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
2146
MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
2147
MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
2149
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
2150
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
2151
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
2152
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
2154
dataptr++; /* advance pointer to next column */
2155
wsptr++; /* advance pointer to next column */
2161
* Perform the forward DCT on a 16x16 sample block.
2165
jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2167
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2168
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2169
DCTELEM workspace[DCTSIZE2];
2176
/* Pass 1: process rows. */
2177
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2178
/* furthermore, we scale the results by 2**PASS1_BITS. */
2179
/* cK represents sqrt(2) * cos(K*pi/32). */
2184
elemptr = sample_data[ctr] + start_col;
2188
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2189
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2190
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2191
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2192
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2193
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2194
tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2195
tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2197
tmp10 = tmp0 + tmp7;
2198
tmp14 = tmp0 - tmp7;
2199
tmp11 = tmp1 + tmp6;
2200
tmp15 = tmp1 - tmp6;
2201
tmp12 = tmp2 + tmp5;
2202
tmp16 = tmp2 - tmp5;
2203
tmp13 = tmp3 + tmp4;
2204
tmp17 = tmp3 - tmp4;
2206
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2207
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2208
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2209
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2210
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2211
tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2212
tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2213
tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2215
/* Apply unsigned->signed conversion */
2216
dataptr[0] = (DCTELEM)
2217
((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2218
dataptr[4] = (DCTELEM)
2219
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2220
MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2221
CONST_BITS-PASS1_BITS);
2223
tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2224
MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2226
dataptr[2] = (DCTELEM)
2227
DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2228
+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2229
CONST_BITS-PASS1_BITS);
2230
dataptr[6] = (DCTELEM)
2231
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2232
- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2233
CONST_BITS-PASS1_BITS);
2237
tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2238
MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2239
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2240
MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2241
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2242
MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2243
tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2244
MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2245
tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2246
MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2247
tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2248
MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2249
tmp10 = tmp11 + tmp12 + tmp13 -
2250
MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2251
MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2252
tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2253
- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2254
tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2255
+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2256
tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2257
+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2259
dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2260
dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2261
dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2262
dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2266
if (ctr != DCTSIZE) {
2267
if (ctr == DCTSIZE * 2)
2269
dataptr += DCTSIZE; /* advance pointer to next row */
2271
dataptr = workspace; /* switch pointer to extended workspace */
2274
/* Pass 2: process columns.
2275
* We remove the PASS1_BITS scaling, but leave the results scaled up
2276
* by an overall factor of 8.
2277
* We must also scale the output by (8/16)**2 = 1/2**2.
2282
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2285
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
2286
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2287
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2288
tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2289
tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2290
tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2291
tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2292
tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2294
tmp10 = tmp0 + tmp7;
2295
tmp14 = tmp0 - tmp7;
2296
tmp11 = tmp1 + tmp6;
2297
tmp15 = tmp1 - tmp6;
2298
tmp12 = tmp2 + tmp5;
2299
tmp16 = tmp2 - tmp5;
2300
tmp13 = tmp3 + tmp4;
2301
tmp17 = tmp3 - tmp4;
2303
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
2304
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
2305
tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
2306
tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
2307
tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
2308
tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
2309
tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
2310
tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
2312
dataptr[DCTSIZE*0] = (DCTELEM)
2313
DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
2314
dataptr[DCTSIZE*4] = (DCTELEM)
2315
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2316
MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2317
CONST_BITS+PASS1_BITS+2);
2319
tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2320
MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2322
dataptr[DCTSIZE*2] = (DCTELEM)
2323
DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2324
+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */
2325
CONST_BITS+PASS1_BITS+2);
2326
dataptr[DCTSIZE*6] = (DCTELEM)
2327
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2328
- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2329
CONST_BITS+PASS1_BITS+2);
2333
tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2334
MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2335
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2336
MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2337
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2338
MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2339
tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2340
MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2341
tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2342
MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2343
tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2344
MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2345
tmp10 = tmp11 + tmp12 + tmp13 -
2346
MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2347
MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2348
tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2349
- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2350
tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2351
+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2352
tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2353
+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2355
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
2356
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
2357
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
2358
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
2360
dataptr++; /* advance pointer to next column */
2361
wsptr++; /* advance pointer to next column */
2367
* Perform the forward DCT on a 16x8 sample block.
2369
* 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2373
jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2375
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2376
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2383
/* Pass 1: process rows. */
2384
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2385
/* furthermore, we scale the results by 2**PASS1_BITS. */
2386
/* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
2390
for (ctr = 0; ctr < DCTSIZE; ctr++) {
2391
elemptr = sample_data[ctr] + start_col;
2395
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2396
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2397
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2398
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2399
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2400
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2401
tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2402
tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2404
tmp10 = tmp0 + tmp7;
2405
tmp14 = tmp0 - tmp7;
2406
tmp11 = tmp1 + tmp6;
2407
tmp15 = tmp1 - tmp6;
2408
tmp12 = tmp2 + tmp5;
2409
tmp16 = tmp2 - tmp5;
2410
tmp13 = tmp3 + tmp4;
2411
tmp17 = tmp3 - tmp4;
2413
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2414
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2415
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2416
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2417
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2418
tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2419
tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2420
tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2422
/* Apply unsigned->signed conversion */
2423
dataptr[0] = (DCTELEM)
2424
((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2425
dataptr[4] = (DCTELEM)
2426
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2427
MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2428
CONST_BITS-PASS1_BITS);
2430
tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2431
MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2433
dataptr[2] = (DCTELEM)
2434
DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2435
+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2436
CONST_BITS-PASS1_BITS);
2437
dataptr[6] = (DCTELEM)
2438
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2439
- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2440
CONST_BITS-PASS1_BITS);
2444
tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2445
MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2446
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2447
MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2448
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2449
MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2450
tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2451
MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2452
tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2453
MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2454
tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2455
MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2456
tmp10 = tmp11 + tmp12 + tmp13 -
2457
MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2458
MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2459
tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2460
- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2461
tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2462
+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2463
tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2464
+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2466
dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2467
dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2468
dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2469
dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2471
dataptr += DCTSIZE; /* advance pointer to next row */
2474
/* Pass 2: process columns.
2475
* We remove the PASS1_BITS scaling, but leave the results scaled up
2476
* by an overall factor of 8.
2477
* We must also scale the output by 8/16 = 1/2.
2481
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2482
/* Even part per LL&M figure 1 --- note that published figure is faulty;
2483
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
2486
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
2487
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2488
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2489
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2491
tmp10 = tmp0 + tmp3;
2492
tmp12 = tmp0 - tmp3;
2493
tmp11 = tmp1 + tmp2;
2494
tmp13 = tmp1 - tmp2;
2496
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
2497
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2498
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2499
tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2501
dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
2502
dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
2504
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
2505
dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
2506
CONST_BITS+PASS1_BITS+1);
2507
dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
2508
CONST_BITS+PASS1_BITS+1);
2510
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2511
* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2512
* i0..i3 in the paper are tmp0..tmp3 here.
2515
tmp10 = tmp0 + tmp3;
2516
tmp11 = tmp1 + tmp2;
2517
tmp12 = tmp0 + tmp2;
2518
tmp13 = tmp1 + tmp3;
2519
z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
2521
tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
2522
tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
2523
tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
2524
tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
2525
tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
2526
tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
2527
tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
2528
tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
2533
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
2534
CONST_BITS+PASS1_BITS+1);
2535
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
2536
CONST_BITS+PASS1_BITS+1);
2537
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
2538
CONST_BITS+PASS1_BITS+1);
2539
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
2540
CONST_BITS+PASS1_BITS+1);
2542
dataptr++; /* advance pointer to next column */
2548
* Perform the forward DCT on a 14x7 sample block.
2550
* 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2554
jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2556
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2557
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2564
/* Zero bottom row of output coefficient block. */
2565
MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
2567
/* Pass 1: process rows. */
2568
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2569
/* furthermore, we scale the results by 2**PASS1_BITS. */
2570
/* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */
2573
for (ctr = 0; ctr < 7; ctr++) {
2574
elemptr = sample_data[ctr] + start_col;
2578
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2579
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2580
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2581
tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2582
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2583
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2584
tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2586
tmp10 = tmp0 + tmp6;
2587
tmp14 = tmp0 - tmp6;
2588
tmp11 = tmp1 + tmp5;
2589
tmp15 = tmp1 - tmp5;
2590
tmp12 = tmp2 + tmp4;
2591
tmp16 = tmp2 - tmp4;
2593
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2594
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2595
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2596
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2597
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2598
tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2599
tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2601
/* Apply unsigned->signed conversion */
2602
dataptr[0] = (DCTELEM)
2603
((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
2605
dataptr[4] = (DCTELEM)
2606
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2607
MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2608
MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
2609
CONST_BITS-PASS1_BITS);
2611
tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
2613
dataptr[2] = (DCTELEM)
2614
DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
2615
+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
2616
CONST_BITS-PASS1_BITS);
2617
dataptr[6] = (DCTELEM)
2618
DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
2619
- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
2620
CONST_BITS-PASS1_BITS);
2624
tmp10 = tmp1 + tmp2;
2625
tmp11 = tmp5 - tmp4;
2626
dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
2627
tmp3 <<= CONST_BITS;
2628
tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
2629
tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
2630
tmp10 += tmp11 - tmp3;
2631
tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
2632
MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
2633
dataptr[5] = (DCTELEM)
2634
DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
2635
+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
2636
CONST_BITS-PASS1_BITS);
2637
tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
2638
MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
2639
dataptr[3] = (DCTELEM)
2640
DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
2641
- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
2642
CONST_BITS-PASS1_BITS);
2643
dataptr[1] = (DCTELEM)
2644
DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2645
MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
2646
CONST_BITS-PASS1_BITS);
2648
dataptr += DCTSIZE; /* advance pointer to next row */
2651
/* Pass 2: process columns.
2652
* We remove the PASS1_BITS scaling, but leave the results scaled up
2653
* by an overall factor of 8.
2654
* We must also scale the output by (8/14)*(8/7) = 32/49, which we
2655
* partially fold into the constant multipliers and final shifting:
2656
* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
2660
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2663
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
2664
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
2665
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
2666
tmp3 = dataptr[DCTSIZE*3];
2668
tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
2669
tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
2670
tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
2673
dataptr[DCTSIZE*0] = (DCTELEM)
2674
DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
2675
CONST_BITS+PASS1_BITS+1);
2679
z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
2680
z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
2681
z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
2682
dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
2684
z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
2685
dataptr[DCTSIZE*4] = (DCTELEM)
2686
DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
2687
CONST_BITS+PASS1_BITS+1);
2688
dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
2692
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
2693
tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
2696
tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
2698
tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
2700
tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
2702
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2703
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2704
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2706
dataptr++; /* advance pointer to next column */
2712
* Perform the forward DCT on a 12x6 sample block.
2714
* 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2718
jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2720
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2721
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2727
/* Zero 2 bottom rows of output coefficient block. */
2728
MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
2730
/* Pass 1: process rows. */
2731
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2732
/* furthermore, we scale the results by 2**PASS1_BITS. */
2733
/* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */
2736
for (ctr = 0; ctr < 6; ctr++) {
2737
elemptr = sample_data[ctr] + start_col;
2741
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2742
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2743
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2744
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2745
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2746
tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2748
tmp10 = tmp0 + tmp5;
2749
tmp13 = tmp0 - tmp5;
2750
tmp11 = tmp1 + tmp4;
2751
tmp14 = tmp1 - tmp4;
2752
tmp12 = tmp2 + tmp3;
2753
tmp15 = tmp2 - tmp3;
2755
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2756
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2757
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2758
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2759
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2760
tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2762
/* Apply unsigned->signed conversion */
2763
dataptr[0] = (DCTELEM)
2764
((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
2765
dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2766
dataptr[4] = (DCTELEM)
2767
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2768
CONST_BITS-PASS1_BITS);
2769
dataptr[2] = (DCTELEM)
2770
DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2771
CONST_BITS-PASS1_BITS);
2775
tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
2776
tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
2777
tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
2778
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
2779
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
2780
tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2781
+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
2782
tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
2783
tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
2784
+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
2785
tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
2786
- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
2787
tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
2788
- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
2790
dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2791
dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2792
dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2793
dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2795
dataptr += DCTSIZE; /* advance pointer to next row */
2798
/* Pass 2: process columns.
2799
* We remove the PASS1_BITS scaling, but leave the results scaled up
2800
* by an overall factor of 8.
2801
* We must also scale the output by (8/12)*(8/6) = 8/9, which we
2802
* partially fold into the constant multipliers and final shifting:
2803
* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
2807
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2810
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
2811
tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
2812
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
2814
tmp10 = tmp0 + tmp2;
2815
tmp12 = tmp0 - tmp2;
2817
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
2818
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
2819
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
2821
dataptr[DCTSIZE*0] = (DCTELEM)
2822
DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
2823
CONST_BITS+PASS1_BITS+1);
2824
dataptr[DCTSIZE*2] = (DCTELEM)
2825
DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
2826
CONST_BITS+PASS1_BITS+1);
2827
dataptr[DCTSIZE*4] = (DCTELEM)
2828
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
2829
CONST_BITS+PASS1_BITS+1);
2833
tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
2835
dataptr[DCTSIZE*1] = (DCTELEM)
2836
DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
2837
CONST_BITS+PASS1_BITS+1);
2838
dataptr[DCTSIZE*3] = (DCTELEM)
2839
DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
2840
CONST_BITS+PASS1_BITS+1);
2841
dataptr[DCTSIZE*5] = (DCTELEM)
2842
DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
2843
CONST_BITS+PASS1_BITS+1);
2845
dataptr++; /* advance pointer to next column */
2851
* Perform the forward DCT on a 10x5 sample block.
2853
* 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2857
jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2859
INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2860
INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2866
/* Zero 3 bottom rows of output coefficient block. */
2867
MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
2869
/* Pass 1: process rows. */
2870
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2871
/* furthermore, we scale the results by 2**PASS1_BITS. */
2872
/* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */
2875
for (ctr = 0; ctr < 5; ctr++) {
2876
elemptr = sample_data[ctr] + start_col;
2880
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
2881
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
2882
tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
2883
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
2884
tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
2886
tmp10 = tmp0 + tmp4;
2887
tmp13 = tmp0 - tmp4;
2888
tmp11 = tmp1 + tmp3;
2889
tmp14 = tmp1 - tmp3;
2891
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
2892
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
2893
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
2894
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
2895
tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
2897
/* Apply unsigned->signed conversion */
2898
dataptr[0] = (DCTELEM)
2899
((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
2901
dataptr[4] = (DCTELEM)
2902
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
2903
MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
2904
CONST_BITS-PASS1_BITS);
2905
tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
2906
dataptr[2] = (DCTELEM)
2907
DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
2908
CONST_BITS-PASS1_BITS);
2909
dataptr[6] = (DCTELEM)
2910
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
2911
CONST_BITS-PASS1_BITS);
2915
tmp10 = tmp0 + tmp4;
2916
tmp11 = tmp1 - tmp3;
2917
dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
2918
tmp2 <<= CONST_BITS;
2919
dataptr[1] = (DCTELEM)
2920
DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
2921
MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
2922
MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
2923
MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
2924
CONST_BITS-PASS1_BITS);
2925
tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
2926
MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
2927
tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
2928
(tmp11 << (CONST_BITS - 1)) - tmp2;
2929
dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
2930
dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
2932
dataptr += DCTSIZE; /* advance pointer to next row */
2935
/* Pass 2: process columns.
2936
* We remove the PASS1_BITS scaling, but leave the results scaled up
2937
* by an overall factor of 8.
2938
* We must also scale the output by (8/10)*(8/5) = 32/25, which we
2939
* fold into the constant multipliers:
2940
* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
2944
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2947
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
2948
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
2949
tmp2 = dataptr[DCTSIZE*2];
2951
tmp10 = tmp0 + tmp1;
2952
tmp11 = tmp0 - tmp1;
2954
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
2955
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
2957
dataptr[DCTSIZE*0] = (DCTELEM)
2958
DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
2959
CONST_BITS+PASS1_BITS);
2960
tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
2962
tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
2963
dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
2964
dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
2968
tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
2970
dataptr[DCTSIZE*1] = (DCTELEM)
2971
DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
2972
CONST_BITS+PASS1_BITS);
2973
dataptr[DCTSIZE*3] = (DCTELEM)
2974
DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
2975
CONST_BITS+PASS1_BITS);
2977
dataptr++; /* advance pointer to next column */
2983
* Perform the forward DCT on an 8x4 sample block.
2985
* 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
2989
jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2991
INT32 tmp0, tmp1, tmp2, tmp3;
2992
INT32 tmp10, tmp11, tmp12, tmp13;
2999
/* Zero 4 bottom rows of output coefficient block. */
3000
MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
3002
/* Pass 1: process rows. */
3003
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3004
/* furthermore, we scale the results by 2**PASS1_BITS. */
3005
/* We must also scale the output by 8/4 = 2, which we add here. */
3008
for (ctr = 0; ctr < 4; ctr++) {
3009
elemptr = sample_data[ctr] + start_col;
3011
/* Even part per LL&M figure 1 --- note that published figure is faulty;
3012
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
3015
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3016
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3017
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3018
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3020
tmp10 = tmp0 + tmp3;
3021
tmp12 = tmp0 - tmp3;
3022
tmp11 = tmp1 + tmp2;
3023
tmp13 = tmp1 - tmp2;
3025
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3026
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3027
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3028
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3030
/* Apply unsigned->signed conversion */
3031
dataptr[0] = (DCTELEM)
3032
((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3033
dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3035
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3036
/* Add fudge factor here for final descale. */
3037
z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3038
dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
3039
CONST_BITS-PASS1_BITS-1);
3040
dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
3041
CONST_BITS-PASS1_BITS-1);
3043
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3044
* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3045
* i0..i3 in the paper are tmp0..tmp3 here.
3048
tmp10 = tmp0 + tmp3;
3049
tmp11 = tmp1 + tmp2;
3050
tmp12 = tmp0 + tmp2;
3051
tmp13 = tmp1 + tmp3;
3052
z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3053
/* Add fudge factor here for final descale. */
3054
z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3056
tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3057
tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3058
tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3059
tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3060
tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
3061
tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
3062
tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
3063
tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3068
dataptr[1] = (DCTELEM)
3069
RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
3070
dataptr[3] = (DCTELEM)
3071
RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
3072
dataptr[5] = (DCTELEM)
3073
RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
3074
dataptr[7] = (DCTELEM)
3075
RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
3077
dataptr += DCTSIZE; /* advance pointer to next row */
3080
/* Pass 2: process columns.
3081
* We remove the PASS1_BITS scaling, but leave the results scaled up
3082
* by an overall factor of 8.
3083
* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3087
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3090
/* Add fudge factor here for final descale. */
3091
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
3092
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3094
tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3095
tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3097
dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3098
dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3102
tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
3103
/* Add fudge factor here for final descale. */
3104
tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
3106
dataptr[DCTSIZE*1] = (DCTELEM)
3107
RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3108
CONST_BITS+PASS1_BITS);
3109
dataptr[DCTSIZE*3] = (DCTELEM)
3110
RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3111
CONST_BITS+PASS1_BITS);
3113
dataptr++; /* advance pointer to next column */
3119
* Perform the forward DCT on a 6x3 sample block.
3121
* 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3125
jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3127
INT32 tmp0, tmp1, tmp2;
3128
INT32 tmp10, tmp11, tmp12;
3134
/* Pre-zero output coefficient block. */
3135
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3137
/* Pass 1: process rows. */
3138
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3139
/* furthermore, we scale the results by 2**PASS1_BITS. */
3140
/* We scale the results further by 2 as part of output adaption */
3141
/* scaling for different DCT size. */
3142
/* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
3145
for (ctr = 0; ctr < 3; ctr++) {
3146
elemptr = sample_data[ctr] + start_col;
3150
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3151
tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3152
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3154
tmp10 = tmp0 + tmp2;
3155
tmp12 = tmp0 - tmp2;
3157
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3158
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3159
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3161
/* Apply unsigned->signed conversion */
3162
dataptr[0] = (DCTELEM)
3163
((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3164
dataptr[2] = (DCTELEM)
3165
DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3166
CONST_BITS-PASS1_BITS-1);
3167
dataptr[4] = (DCTELEM)
3168
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3169
CONST_BITS-PASS1_BITS-1);
3173
tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3174
CONST_BITS-PASS1_BITS-1);
3176
dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3177
dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3178
dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3180
dataptr += DCTSIZE; /* advance pointer to next row */
3183
/* Pass 2: process columns.
3184
* We remove the PASS1_BITS scaling, but leave the results scaled up
3185
* by an overall factor of 8.
3186
* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3187
* fold into the constant multipliers (other part was done in pass 1):
3188
* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
3192
for (ctr = 0; ctr < 6; ctr++) {
3195
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
3196
tmp1 = dataptr[DCTSIZE*1];
3198
tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
3200
dataptr[DCTSIZE*0] = (DCTELEM)
3201
DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
3202
CONST_BITS+PASS1_BITS);
3203
dataptr[DCTSIZE*2] = (DCTELEM)
3204
DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
3205
CONST_BITS+PASS1_BITS);
3209
dataptr[DCTSIZE*1] = (DCTELEM)
3210
DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
3211
CONST_BITS+PASS1_BITS);
3213
dataptr++; /* advance pointer to next column */
3219
* Perform the forward DCT on a 4x2 sample block.
3221
* 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3225
jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3234
/* Pre-zero output coefficient block. */
3235
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3237
/* Pass 1: process rows. */
3238
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3239
/* furthermore, we scale the results by 2**PASS1_BITS. */
3240
/* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
3241
/* 4-point FDCT kernel, */
3242
/* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
3245
for (ctr = 0; ctr < 2; ctr++) {
3246
elemptr = sample_data[ctr] + start_col;
3250
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3251
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3253
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3254
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3256
/* Apply unsigned->signed conversion */
3257
dataptr[0] = (DCTELEM)
3258
((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
3259
dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
3263
tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
3264
/* Add fudge factor here for final descale. */
3265
tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
3267
dataptr[1] = (DCTELEM)
3268
RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3269
CONST_BITS-PASS1_BITS-3);
3270
dataptr[3] = (DCTELEM)
3271
RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3272
CONST_BITS-PASS1_BITS-3);
3274
dataptr += DCTSIZE; /* advance pointer to next row */
3277
/* Pass 2: process columns.
3278
* We remove the PASS1_BITS scaling, but leave the results scaled up
3279
* by an overall factor of 8.
3283
for (ctr = 0; ctr < 4; ctr++) {
3286
/* Add fudge factor here for final descale. */
3287
tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
3288
tmp1 = dataptr[DCTSIZE*1];
3290
dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3294
dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3296
dataptr++; /* advance pointer to next column */
3302
* Perform the forward DCT on a 2x1 sample block.
3304
* 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3308
jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3313
/* Pre-zero output coefficient block. */
3314
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3316
elemptr = sample_data[0] + start_col;
3318
tmp0 = GETJSAMPLE(elemptr[0]);
3319
tmp1 = GETJSAMPLE(elemptr[1]);
3321
/* We leave the results scaled up by an overall factor of 8.
3322
* We must also scale the output by (8/2)*(8/1) = 2**5.
3326
/* Apply unsigned->signed conversion */
3327
data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
3330
data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
3335
* Perform the forward DCT on an 8x16 sample block.
3337
* 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3341
jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3343
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3344
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3346
DCTELEM workspace[DCTSIZE2];
3353
/* Pass 1: process rows. */
3354
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3355
/* furthermore, we scale the results by 2**PASS1_BITS. */
3360
elemptr = sample_data[ctr] + start_col;
3362
/* Even part per LL&M figure 1 --- note that published figure is faulty;
3363
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
3366
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3367
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3368
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3369
tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3371
tmp10 = tmp0 + tmp3;
3372
tmp12 = tmp0 - tmp3;
3373
tmp11 = tmp1 + tmp2;
3374
tmp13 = tmp1 - tmp2;
3376
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3377
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3378
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3379
tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3381
/* Apply unsigned->signed conversion */
3382
dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
3383
dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3385
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3386
dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
3387
CONST_BITS-PASS1_BITS);
3388
dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
3389
CONST_BITS-PASS1_BITS);
3391
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3392
* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3393
* i0..i3 in the paper are tmp0..tmp3 here.
3396
tmp10 = tmp0 + tmp3;
3397
tmp11 = tmp1 + tmp2;
3398
tmp12 = tmp0 + tmp2;
3399
tmp13 = tmp1 + tmp3;
3400
z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3402
tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3403
tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3404
tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3405
tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3406
tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
3407
tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
3408
tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
3409
tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3414
dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
3415
dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
3416
dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
3417
dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3421
if (ctr != DCTSIZE) {
3422
if (ctr == DCTSIZE * 2)
3424
dataptr += DCTSIZE; /* advance pointer to next row */
3426
dataptr = workspace; /* switch pointer to extended workspace */
3429
/* Pass 2: process columns.
3430
* We remove the PASS1_BITS scaling, but leave the results scaled up
3431
* by an overall factor of 8.
3432
* We must also scale the output by 8/16 = 1/2.
3433
* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3438
for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3441
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
3442
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
3443
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
3444
tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
3445
tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
3446
tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
3447
tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
3448
tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
3450
tmp10 = tmp0 + tmp7;
3451
tmp14 = tmp0 - tmp7;
3452
tmp11 = tmp1 + tmp6;
3453
tmp15 = tmp1 - tmp6;
3454
tmp12 = tmp2 + tmp5;
3455
tmp16 = tmp2 - tmp5;
3456
tmp13 = tmp3 + tmp4;
3457
tmp17 = tmp3 - tmp4;
3459
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
3460
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
3461
tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
3462
tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
3463
tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
3464
tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
3465
tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
3466
tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
3468
dataptr[DCTSIZE*0] = (DCTELEM)
3469
DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
3470
dataptr[DCTSIZE*4] = (DCTELEM)
3471
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
3472
MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
3473
CONST_BITS+PASS1_BITS+1);
3475
tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
3476
MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
3478
dataptr[DCTSIZE*2] = (DCTELEM)
3479
DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
3480
+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
3481
CONST_BITS+PASS1_BITS+1);
3482
dataptr[DCTSIZE*6] = (DCTELEM)
3483
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
3484
- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
3485
CONST_BITS+PASS1_BITS+1);
3489
tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
3490
MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
3491
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
3492
MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
3493
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
3494
MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
3495
tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
3496
MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
3497
tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
3498
MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
3499
tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
3500
MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
3501
tmp10 = tmp11 + tmp12 + tmp13 -
3502
MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
3503
MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
3504
tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
3505
- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
3506
tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
3507
+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
3508
tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
3509
+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
3511
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
3512
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
3513
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
3514
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
3516
dataptr++; /* advance pointer to next column */
3517
wsptr++; /* advance pointer to next column */
3523
* Perform the forward DCT on a 7x14 sample block.
3525
* 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3529
jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3531
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3532
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3534
DCTELEM workspace[8*6];
3541
/* Pre-zero output coefficient block. */
3542
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3544
/* Pass 1: process rows. */
3545
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3546
/* furthermore, we scale the results by 2**PASS1_BITS. */
3547
/* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */
3552
elemptr = sample_data[ctr] + start_col;
3556
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3557
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3558
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3559
tmp3 = GETJSAMPLE(elemptr[3]);
3561
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3562
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3563
tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3566
/* Apply unsigned->signed conversion */
3567
dataptr[0] = (DCTELEM)
3568
((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
3572
z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
3573
z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
3574
z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
3575
dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3577
z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
3578
dataptr[4] = (DCTELEM)
3579
DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3580
CONST_BITS-PASS1_BITS);
3581
dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3585
tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
3586
tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
3589
tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
3591
tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
3593
tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
3595
dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3596
dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3597
dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3601
if (ctr != DCTSIZE) {
3604
dataptr += DCTSIZE; /* advance pointer to next row */
3606
dataptr = workspace; /* switch pointer to extended workspace */
3609
/* Pass 2: process columns.
3610
* We remove the PASS1_BITS scaling, but leave the results scaled up
3611
* by an overall factor of 8.
3612
* We must also scale the output by (8/7)*(8/14) = 32/49, which we
3613
* fold into the constant multipliers:
3614
* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
3619
for (ctr = 0; ctr < 7; ctr++) {
3622
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
3623
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
3624
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
3625
tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
3626
tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
3627
tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
3628
tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
3630
tmp10 = tmp0 + tmp6;
3631
tmp14 = tmp0 - tmp6;
3632
tmp11 = tmp1 + tmp5;
3633
tmp15 = tmp1 - tmp5;
3634
tmp12 = tmp2 + tmp4;
3635
tmp16 = tmp2 - tmp4;
3637
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
3638
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
3639
tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
3640
tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
3641
tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
3642
tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
3643
tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
3645
dataptr[DCTSIZE*0] = (DCTELEM)
3646
DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3647
FIX(0.653061224)), /* 32/49 */
3648
CONST_BITS+PASS1_BITS);
3650
dataptr[DCTSIZE*4] = (DCTELEM)
3651
DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
3652
MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
3653
MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
3654
CONST_BITS+PASS1_BITS);
3656
tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
3658
dataptr[DCTSIZE*2] = (DCTELEM)
3659
DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
3660
+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
3661
CONST_BITS+PASS1_BITS);
3662
dataptr[DCTSIZE*6] = (DCTELEM)
3663
DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
3664
- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
3665
CONST_BITS+PASS1_BITS);
3669
tmp10 = tmp1 + tmp2;
3670
tmp11 = tmp5 - tmp4;
3671
dataptr[DCTSIZE*7] = (DCTELEM)
3672
DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3673
FIX(0.653061224)), /* 32/49 */
3674
CONST_BITS+PASS1_BITS);
3675
tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
3676
tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
3677
tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
3678
tmp10 += tmp11 - tmp3;
3679
tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
3680
MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
3681
dataptr[DCTSIZE*5] = (DCTELEM)
3682
DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
3683
+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
3684
CONST_BITS+PASS1_BITS);
3685
tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
3686
MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
3687
dataptr[DCTSIZE*3] = (DCTELEM)
3688
DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
3689
- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
3690
CONST_BITS+PASS1_BITS);
3691
dataptr[DCTSIZE*1] = (DCTELEM)
3692
DESCALE(tmp11 + tmp12 + tmp3
3693
- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
3694
- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
3695
CONST_BITS+PASS1_BITS);
3697
dataptr++; /* advance pointer to next column */
3698
wsptr++; /* advance pointer to next column */
3704
* Perform the forward DCT on a 6x12 sample block.
3706
* 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3710
jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3712
INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3713
INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3714
DCTELEM workspace[8*4];
3721
/* Pre-zero output coefficient block. */
3722
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3724
/* Pass 1: process rows. */
3725
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3726
/* furthermore, we scale the results by 2**PASS1_BITS. */
3727
/* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
3732
elemptr = sample_data[ctr] + start_col;
3736
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3737
tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3738
tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3740
tmp10 = tmp0 + tmp2;
3741
tmp12 = tmp0 - tmp2;
3743
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3744
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3745
tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3747
/* Apply unsigned->signed conversion */
3748
dataptr[0] = (DCTELEM)
3749
((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
3750
dataptr[2] = (DCTELEM)
3751
DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3752
CONST_BITS-PASS1_BITS);
3753
dataptr[4] = (DCTELEM)
3754
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3755
CONST_BITS-PASS1_BITS);
3759
tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3760
CONST_BITS-PASS1_BITS);
3762
dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3763
dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3764
dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3768
if (ctr != DCTSIZE) {
3771
dataptr += DCTSIZE; /* advance pointer to next row */
3773
dataptr = workspace; /* switch pointer to extended workspace */
3776
/* Pass 2: process columns.
3777
* We remove the PASS1_BITS scaling, but leave the results scaled up
3778
* by an overall factor of 8.
3779
* We must also scale the output by (8/6)*(8/12) = 8/9, which we
3780
* fold into the constant multipliers:
3781
* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
3786
for (ctr = 0; ctr < 6; ctr++) {
3789
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
3790
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
3791
tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
3792
tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
3793
tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
3794
tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
3796
tmp10 = tmp0 + tmp5;
3797
tmp13 = tmp0 - tmp5;
3798
tmp11 = tmp1 + tmp4;
3799
tmp14 = tmp1 - tmp4;
3800
tmp12 = tmp2 + tmp3;
3801
tmp15 = tmp2 - tmp3;
3803
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
3804
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
3805
tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
3806
tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
3807
tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
3808
tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
3810
dataptr[DCTSIZE*0] = (DCTELEM)
3811
DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
3812
CONST_BITS+PASS1_BITS);
3813
dataptr[DCTSIZE*6] = (DCTELEM)
3814
DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
3815
CONST_BITS+PASS1_BITS);
3816
dataptr[DCTSIZE*4] = (DCTELEM)
3817
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
3818
CONST_BITS+PASS1_BITS);
3819
dataptr[DCTSIZE*2] = (DCTELEM)
3820
DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
3821
MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
3822
CONST_BITS+PASS1_BITS);
3826
tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
3827
tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
3828
tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
3829
tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
3830
tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
3831
tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
3832
+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
3833
tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
3834
tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
3835
+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
3836
tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
3837
- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
3838
tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
3839
- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
3841
dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
3842
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
3843
dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
3844
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
3846
dataptr++; /* advance pointer to next column */
3847
wsptr++; /* advance pointer to next column */
3853
* Perform the forward DCT on a 5x10 sample block.
3855
* 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
3859
/*
 * jpeg_fdct_5x10 -- forward DCT for a sample block that is 5 columns wide
 * (5-point FDCT per row in pass 1, 10-point FDCT per column in pass 2).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; pass 1 then writes entries 0..4 of each row and
 *               pass 2 rewrites rows 0..7 of columns 0..4 in place.
 * sample_data - input sample rows; row ctr is read at sample_data[ctr].
 * start_col   - offset of the first sample to read within every row.
 *
 * NOTE(review): this listing looks incomplete.  Braces, the declarations
 * of dataptr, wsptr, elemptr and ctr, the pass-1 loop header and counter
 * update, and the closing delimiters of the multi-line comments are not
 * visible, and bare line-number lines are interleaved with the code.
 * Restore the missing lines from the upstream file before building.
 */
jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3861
INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3862
INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3863
DCTELEM workspace[8*2];
/* Spill area of 8*2 elements; presumably it receives the pass-1 results of
 * the rows beyond the first DCTSIZE, which pass 2 reads back through
 * wsptr -- TODO confirm. */
3870
/* Pre-zero output coefficient block. */
3871
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3873
/* Pass 1: process rows. */
3874
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3875
/* furthermore, we scale the results by 2**PASS1_BITS. */
3876
/* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */
3881
elemptr = sample_data[ctr] + start_col;
3885
/* Even part: sums of the samples mirrored around the centre sample. */
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
3886
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
3887
tmp2 = GETJSAMPLE(elemptr[2]);
3889
tmp10 = tmp0 + tmp1;
3890
tmp11 = tmp0 - tmp1;
3892
/* Odd part inputs: differences of the same mirrored pairs. */
tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
3893
tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
3895
/* Apply unsigned->signed conversion */
3896
dataptr[0] = (DCTELEM)
3897
((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
3898
tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
/* NOTE(review): the numbering jumps from 3898 to 3900, so one source line is
 * not shown here.  As listed, tmp10 is still tmp0 + tmp1 when it is scaled
 * below; presumably the missing line removes 4 * tmp2 from it first --
 * confirm against the upstream file. */
3900
tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
3901
dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3902
dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3906
tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
3908
dataptr[1] = (DCTELEM)
3909
DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
3910
CONST_BITS-PASS1_BITS);
3911
dataptr[3] = (DCTELEM)
3912
DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
3913
CONST_BITS-PASS1_BITS);
3917
/* NOTE(review): the row-loop bookkeeping is only partly visible.  The lines
 * shown advance dataptr by one row while ctr differs from DCTSIZE and point
 * it at workspace; presumably the latter is the else branch, and the
 * increment of ctr and the loop exit test are among the unlisted lines. */
if (ctr != DCTSIZE) {
3920
dataptr += DCTSIZE; /* advance pointer to next row */
3922
dataptr = workspace; /* switch pointer to extended workspace */
3925
/* Pass 2: process columns.
3926
* We remove the PASS1_BITS scaling, but leave the results scaled up
3927
* by an overall factor of 8.
3928
* We must also scale the output by (8/5)*(8/10) = 32/25, which we
3929
* fold into the constant multipliers:
3930
* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
3935
for (ctr = 0; ctr < 5; ctr++) {
3938
tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
3939
tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
3940
tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
3941
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
3942
tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
3944
tmp10 = tmp0 + tmp4;
3945
tmp13 = tmp0 - tmp4;
3946
tmp11 = tmp1 + tmp3;
3947
tmp14 = tmp1 - tmp3;
3949
tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
3950
tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
3951
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
3952
tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
3953
tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
3955
dataptr[DCTSIZE*0] = (DCTELEM)
3956
DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
3957
CONST_BITS+PASS1_BITS);
/* NOTE(review): line 3958 is not shown.  As listed, tmp12 is used unchanged
 * in the c4/c8 terms below; presumably it is doubled first -- confirm
 * against the upstream file. */
3959
dataptr[DCTSIZE*4] = (DCTELEM)
3960
DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
3961
MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
3962
CONST_BITS+PASS1_BITS);
3963
tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
3964
dataptr[DCTSIZE*2] = (DCTELEM)
3965
DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
3966
CONST_BITS+PASS1_BITS);
3967
dataptr[DCTSIZE*6] = (DCTELEM)
3968
DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
3969
CONST_BITS+PASS1_BITS);
3973
/* Odd part: combine the column differences tmp0..tmp4. */
tmp10 = tmp0 + tmp4;
3974
tmp11 = tmp1 - tmp3;
3975
dataptr[DCTSIZE*5] = (DCTELEM)
3976
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
3977
CONST_BITS+PASS1_BITS);
3978
/* Pre-scale tmp2 by 32/25 so it can be added and subtracted directly below. */
tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
3979
dataptr[DCTSIZE*1] = (DCTELEM)
3980
DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
3981
MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
3982
MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
3983
MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
3984
CONST_BITS+PASS1_BITS);
3985
tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
3986
MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
3987
tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
3988
MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
3989
dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
3990
dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
3992
dataptr++; /* advance pointer to next column */
3993
wsptr++; /* advance pointer to next column */
3999
* Perform the forward DCT on a 4x8 sample block.
4001
* 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4005
/*
 * jpeg_fdct_4x8 -- forward DCT for a sample block 4 columns wide and
 * DCTSIZE rows high (4-point FDCT per row, 8-point FDCT per column).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; pass 1 writes entries 0..3 of each row and pass 2
 *               transforms columns 0..3 in place.
 * sample_data - input sample rows; row ctr is read at sample_data[ctr].
 * start_col   - offset of the first sample to read within every row.
 *
 * NOTE(review): this listing looks incomplete.  Braces, the declarations
 * of z1, dataptr, elemptr and ctr, the resets of dataptr before each pass
 * and the closing delimiters of the multi-line comments are not visible,
 * and bare line-number lines are interleaved with the code.  Restore the
 * missing lines from the upstream file before building.
 */
jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4007
INT32 tmp0, tmp1, tmp2, tmp3;
4008
INT32 tmp10, tmp11, tmp12, tmp13;
4015
/* Pre-zero output coefficient block. */
4016
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4018
/* Pass 1: process rows. */
4019
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
4020
/* furthermore, we scale the results by 2**PASS1_BITS. */
4021
/* We must also scale the output by 8/4 = 2, which we add here. */
4022
/* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */
4025
for (ctr = 0; ctr < DCTSIZE; ctr++) {
4026
elemptr = sample_data[ctr] + start_col;
4030
/* Even part: sums of the mirrored sample pairs. */
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4031
tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4033
/* Odd part inputs: differences of the same pairs. */
tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4034
tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4036
/* Apply unsigned->signed conversion */
4037
/* The extra +1 in the shift counts applies the 8/4 = 2 scaling noted above. */
dataptr[0] = (DCTELEM)
4038
((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4039
dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4043
tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4044
/* Add fudge factor here for final descale. */
4045
tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
/* i.e. half of 2**(CONST_BITS-PASS1_BITS-1), the divisor applied by the
 * two right shifts below, so that they round instead of truncating. */
4047
dataptr[1] = (DCTELEM)
4048
RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4049
CONST_BITS-PASS1_BITS-1);
4050
dataptr[3] = (DCTELEM)
4051
RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4052
CONST_BITS-PASS1_BITS-1);
4054
dataptr += DCTSIZE; /* advance pointer to next row */
4057
/* Pass 2: process columns.
4058
* We remove the PASS1_BITS scaling, but leave the results scaled up
4059
* by an overall factor of 8.
4063
for (ctr = 0; ctr < 4; ctr++) {
4064
/* Even part per LL&M figure 1 --- note that published figure is faulty;
4065
* rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
4068
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4069
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4070
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4071
tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4073
/* Add fudge factor here for final descale. */
4074
tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
/* tmp10 carries the rounding term into both the row-0 and row-4 outputs. */
4075
tmp12 = tmp0 - tmp3;
4076
tmp11 = tmp1 + tmp2;
4077
tmp13 = tmp1 - tmp2;
4079
/* Differences of the mirrored rows; consumed by the odd part further down. */
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4080
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4081
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4082
tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4084
dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4085
dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4087
z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
4088
/* Add fudge factor here for final descale. */
4089
z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4090
dataptr[DCTSIZE*2] = (DCTELEM)
4091
RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
4092
dataptr[DCTSIZE*6] = (DCTELEM)
4093
RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
4095
/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4096
* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4097
* i0..i3 in the paper are tmp0..tmp3 here.
4100
tmp10 = tmp0 + tmp3;
4101
tmp11 = tmp1 + tmp2;
4102
tmp12 = tmp0 + tmp2;
4103
tmp13 = tmp1 + tmp3;
4104
z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
4105
/* Add fudge factor here for final descale. */
4106
z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4108
tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
4109
tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
4110
tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
4111
tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
4112
tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
4113
tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
4114
tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
4115
tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
/* NOTE(review): lines 4116-4119 are not shown.  z1, including its rounding
 * term, is computed above but feeds none of the four outputs below as
 * listed; presumably it is added to tmp12 and tmp13 here -- confirm against
 * the upstream file. */
4120
dataptr[DCTSIZE*1] = (DCTELEM)
4121
RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
4122
dataptr[DCTSIZE*3] = (DCTELEM)
4123
RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
4124
dataptr[DCTSIZE*5] = (DCTELEM)
4125
RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
4126
dataptr[DCTSIZE*7] = (DCTELEM)
4127
RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
4129
dataptr++; /* advance pointer to next column */
4135
* Perform the forward DCT on a 3x6 sample block.
4137
* 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4141
/*
 * jpeg_fdct_3x6 -- forward DCT for a sample block 3 columns wide and
 * 6 rows high (3-point FDCT per row, 6-point FDCT per column).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; pass 1 writes entries 0..2 of rows 0..5 and pass 2
 *               transforms columns 0..2 in place, rewriting rows 0..5.
 * sample_data - input sample rows; row ctr is read at sample_data[ctr].
 * start_col   - offset of the first sample to read within every row.
 *
 * NOTE(review): this listing looks incomplete.  Braces, the declarations
 * of dataptr, elemptr and ctr, the resets of dataptr before each pass and
 * the closing delimiter of the pass-2 comment are not visible, and bare
 * line-number lines are interleaved with the code.  Restore the missing
 * lines from the upstream file before building.
 */
jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4143
INT32 tmp0, tmp1, tmp2;
4144
INT32 tmp10, tmp11, tmp12;
4150
/* Pre-zero output coefficient block. */
4151
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4153
/* Pass 1: process rows. */
4154
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
4155
/* furthermore, we scale the results by 2**PASS1_BITS. */
4156
/* We scale the results further by 2 as part of output adaption */
4157
/* scaling for different DCT size. */
4158
/* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */
4161
for (ctr = 0; ctr < 6; ctr++) {
4162
elemptr = sample_data[ctr] + start_col;
4166
/* Even part: outer samples summed, centre sample taken as is. */
tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4167
tmp1 = GETJSAMPLE(elemptr[1]);
4169
/* Odd part input: difference of the outer samples. */
tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4171
/* Apply unsigned->signed conversion */
4172
/* The +1 in the left shift, and the -1 in the DESCALE counts below, carry
 * the extra factor of 2 mentioned in the pass-1 notes. */
dataptr[0] = (DCTELEM)
4173
((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4174
dataptr[2] = (DCTELEM)
4175
DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4176
CONST_BITS-PASS1_BITS-1);
4180
dataptr[1] = (DCTELEM)
4181
DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
4182
CONST_BITS-PASS1_BITS-1);
4184
dataptr += DCTSIZE; /* advance pointer to next row */
4187
/* Pass 2: process columns.
4188
* We remove the PASS1_BITS scaling, but leave the results scaled up
4189
* by an overall factor of 8.
4190
* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4191
* fold into the constant multipliers (other part was done in pass 1):
4192
* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
4196
for (ctr = 0; ctr < 3; ctr++) {
4199
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
4200
tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
4201
tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
4203
tmp10 = tmp0 + tmp2;
4204
tmp12 = tmp0 - tmp2;
4206
tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
4207
tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
4208
tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
4210
dataptr[DCTSIZE*0] = (DCTELEM)
4211
DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
4212
CONST_BITS+PASS1_BITS);
4213
dataptr[DCTSIZE*2] = (DCTELEM)
4214
DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
4215
CONST_BITS+PASS1_BITS);
4216
dataptr[DCTSIZE*4] = (DCTELEM)
4217
DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
4218
CONST_BITS+PASS1_BITS);
4222
/* Odd part: tmp10 is the c5 term shared by outputs 1 and 5. */
tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
4224
dataptr[DCTSIZE*1] = (DCTELEM)
4225
DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
4226
CONST_BITS+PASS1_BITS);
4227
dataptr[DCTSIZE*3] = (DCTELEM)
4228
DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
4229
CONST_BITS+PASS1_BITS);
4230
dataptr[DCTSIZE*5] = (DCTELEM)
4231
DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
4232
CONST_BITS+PASS1_BITS);
4234
dataptr++; /* advance pointer to next column */
4240
* Perform the forward DCT on a 2x4 sample block.
4242
* 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4246
/*
 * jpeg_fdct_2x4 -- forward DCT for a sample block 2 columns wide and
 * 4 rows high (2-point FDCT per row, 4-point FDCT per column).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; pass 1 writes entries 0..1 of rows 0..3 and pass 2
 *               transforms columns 0..1 in place, rewriting rows 0..3.
 * sample_data - input sample rows; row ctr is read at sample_data[ctr].
 * start_col   - offset of the first sample to read within every row.
 *
 * NOTE(review): this listing looks incomplete.  Braces, every local
 * declaration (tmp0, tmp1, tmp10, tmp11, dataptr, elemptr, ctr), the resets
 * of dataptr before each pass and the closing delimiter of the pass-2
 * comment are not visible, and bare line-number lines are interleaved with
 * the code.  Restore the missing lines from the upstream file before
 * building.
 */
jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4255
/* Pre-zero output coefficient block. */
4256
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4258
/* Pass 1: process rows. */
4259
/* Note results are scaled up by sqrt(8) compared to a true DCT. */
4260
/* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */
4263
for (ctr = 0; ctr < 4; ctr++) {
4264
elemptr = sample_data[ctr] + start_col;
4268
/* A 2-point FDCT reduces to the sum and difference of the two samples. */
tmp0 = GETJSAMPLE(elemptr[0]);
4269
tmp1 = GETJSAMPLE(elemptr[1]);
4271
/* Apply unsigned->signed conversion */
4272
/* The shift by 3 applies the 2**3 scaling described above. */
dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
4276
dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
4278
dataptr += DCTSIZE; /* advance pointer to next row */
4281
/* Pass 2: process columns.
4282
* We leave the results scaled up by an overall factor of 8.
4283
* 4-point FDCT kernel,
4284
* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4288
for (ctr = 0; ctr < 2; ctr++) {
4291
tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
4292
tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
4294
tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
4295
tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
4297
dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
4298
dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
4302
tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4303
/* Add fudge factor here for final descale. */
4304
tmp0 += ONE << (CONST_BITS-1);
4306
/* NOTE(review): both RIGHT_SHIFT calls below are cut off before their shift
 * count and closing parenthesis (lines 4308 and 4311 are not shown).  The
 * rounding term ONE << (CONST_BITS-1) added above suggests a shift by
 * CONST_BITS -- confirm against the upstream file. */
dataptr[DCTSIZE*1] = (DCTELEM)
4307
RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4309
dataptr[DCTSIZE*3] = (DCTELEM)
4310
RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4313
dataptr++; /* advance pointer to next column */
4319
* Perform the forward DCT on a 1x2 sample block.
4321
* 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
4325
/*
 * jpeg_fdct_1x2 -- forward DCT for a sample block 1 column wide and 2 rows
 * high (1-point FDCT per row, 2-point FDCT down the single column).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; only data[DCTSIZE*0] and data[DCTSIZE*1] are then
 *               written.
 * sample_data - input sample rows; rows 0 and 1 are read.
 * start_col   - index of the single sample read in each of the two rows.
 *
 * NOTE(review): this listing looks incomplete.  Braces, the declaration of
 * tmp0 and tmp1 and the closing delimiter of the scaling comment are not
 * visible, and bare line-number lines are interleaved with the code.
 * Restore the missing lines from the upstream file before building.
 */
jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4329
/* Pre-zero output coefficient block. */
4330
MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4332
tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4333
tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4335
/* We leave the results scaled up by an overall factor of 8.
4336
* We must also scale the output by (8/1)*(8/2) = 2**5.
4340
/* Apply unsigned->signed conversion */
4341
/* Sum of the two samples, re-centred by 2 * CENTERJSAMPLE, scaled by 2**5. */
data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
4344
/* Difference of the two samples, scaled by 2**5, stored one row down. */
data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
4347
#endif /* DCT_SCALING_SUPPORTED */
4348
#endif /* DCT_ISLOW_SUPPORTED */