4
* Copyright (C) 1994-1996, Thomas G. Lane.
5
* Copyright (C) 1999-2006, MIYASAKA Masaru.
6
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
7
* Copyright (C) 2011 D. R. Commander
8
* This file is part of the Independent JPEG Group's software.
9
* For conditions of distribution and use, see the accompanying README file.
11
* This file contains the forward-DCT management logic.
12
* This code selects a particular DCT implementation to be used,
13
* and it performs related housekeeping chores including coefficient
17
#define JPEG_INTERNALS
20
#include "jdct.h" /* Private declarations for DCT subsystem */
24
/* Private subobject for this module */
26
typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
27
typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
29
typedef JMETHOD(void, convsamp_method_ptr,
30
(JSAMPARRAY sample_data, JDIMENSION start_col,
31
DCTELEM * workspace));
32
typedef JMETHOD(void, float_convsamp_method_ptr,
33
(JSAMPARRAY sample_data, JDIMENSION start_col,
34
FAST_FLOAT *workspace));
36
typedef JMETHOD(void, quantize_method_ptr,
37
(JCOEFPTR coef_block, DCTELEM * divisors,
38
DCTELEM * workspace));
39
typedef JMETHOD(void, float_quantize_method_ptr,
40
(JCOEFPTR coef_block, FAST_FLOAT * divisors,
41
FAST_FLOAT * workspace));
43
METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
46
struct jpeg_forward_dct pub; /* public fields */
48
/* Pointer to the DCT routine actually in use */
49
forward_DCT_method_ptr dct;
50
convsamp_method_ptr convsamp;
51
quantize_method_ptr quantize;
53
/* The actual post-DCT divisors --- not identical to the quant table
54
* entries, because of scaling (especially for an unnormalized DCT).
55
* Each table is given in normal array order.
57
DCTELEM * divisors[NUM_QUANT_TBLS];
59
/* work area for FDCT subroutine */
62
#ifdef DCT_FLOAT_SUPPORTED
63
/* Same as above for the floating-point case. */
64
float_DCT_method_ptr float_dct;
65
float_convsamp_method_ptr float_convsamp;
66
float_quantize_method_ptr float_quantize;
67
FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
68
FAST_FLOAT * float_workspace;
72
typedef my_fdct_controller * my_fdct_ptr;
76
* Find the highest bit in an integer through binary search.
88
if (!(val & 0xff00)) {
92
if (!(val & 0xf000)) {
96
if (!(val & 0xc000)) {
100
if (!(val & 0x8000)) {
109
* Compute values to do a division using reciprocal.
111
* This implementation is based on an algorithm described in
112
* "How to optimize for the Pentium family of microprocessors"
113
* (http://www.agner.org/assem/).
114
* More information about the basic algorithm can be found in
115
* the paper "Integer Division Using Reciprocals" by Robert Alverson.
117
* The basic idea is to replace x/d by x * d^-1. In order to store
118
* d^-1 with enough precision we shift it left a few places. It turns
119
* out that this algoright gives just enough precision, and also fits
122
* b = (the number of significant bits in divisor) - 1
123
* r = (word size) + b
126
* f will not be an integer for most cases, so we need to compensate
127
* for the rounding error introduced:
129
* no fractional part:
131
* result = input >> r
133
* fractional part of f < 0.5:
135
* round f down to nearest integer
136
* result = ((input + 1) * f) >> r
138
* fractional part of f > 0.5:
140
* round f up to nearest integer
141
* result = (input * f) >> r
143
* This is the original algorithm that gives truncated results. But we
144
* want properly rounded results, so we replace "input" with
145
* "input + divisor/2".
147
* In order to allow SIMD implementations we also tweak the values to
148
* allow the same calculation to be made at all times:
150
* dctbl[0] = f rounded to nearest integer
151
* dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
152
* dctbl[2] = 1 << ((word size) * 2 - r)
153
* dctbl[3] = r - (word size)
155
* dctbl[2] is for stupid instruction sets where the shift operation
156
* isn't member wise (e.g. MMX).
158
* The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
159
* is that most SIMD implementations have a "multiply and store top
162
* Lastly, we store each of the values in their own table instead
163
* of in a consecutive manner, yet again in order to allow SIMD
167
compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
173
b = flss(divisor) - 1;
174
r = sizeof(DCTELEM) * 8 + b;
176
fq = ((UDCTELEM2)1 << r) / divisor;
177
fr = ((UDCTELEM2)1 << r) % divisor;
179
c = divisor / 2; /* for rounding */
181
if (fr == 0) { /* divisor is power of two */
182
/* fq will be one bit too large to fit in DCTELEM, so adjust */
185
} else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
187
} else { /* fractional part is > 0.5 */
191
dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
192
dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
193
dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
194
dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
196
if(r <= 16) return 0;
201
* Initialize for a processing pass.
202
* Verify that all referenced Q-tables are present, and set up
203
* the divisor table for each one.
204
* In the current implementation, DCT of all components is done during
205
* the first pass, even if only some components will be output in the
206
* first scan. Hence all components should be examined here.
210
start_pass_fdctmgr (j_compress_ptr cinfo)
212
my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
214
jpeg_component_info *compptr;
218
for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
220
qtblno = compptr->quant_tbl_no;
221
/* Make sure specified quantization table is present */
222
if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
223
cinfo->quant_tbl_ptrs[qtblno] == NULL)
224
ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
225
qtbl = cinfo->quant_tbl_ptrs[qtblno];
226
/* Compute divisors for this quant table */
227
/* We may do this more than once for same table, but it's not a big deal */
228
switch (cinfo->dct_method) {
229
#ifdef DCT_ISLOW_SUPPORTED
231
/* For LL&M IDCT method, divisors are equal to raw quantization
232
* coefficients multiplied by 8 (to counteract scaling).
234
if (fdct->divisors[qtblno] == NULL) {
235
fdct->divisors[qtblno] = (DCTELEM *)
236
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
237
(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
239
dtbl = fdct->divisors[qtblno];
240
for (i = 0; i < DCTSIZE2; i++) {
241
if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
242
&& fdct->quantize == jsimd_quantize)
243
fdct->quantize = quantize;
247
#ifdef DCT_IFAST_SUPPORTED
250
/* For AA&N IDCT method, divisors are equal to quantization
251
* coefficients scaled by scalefactor[row]*scalefactor[col], where
253
* scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
254
* We apply a further scale factor of 8.
256
#define CONST_BITS 14
257
static const INT16 aanscales[DCTSIZE2] = {
258
/* precomputed values scaled up by 14 bits */
259
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
260
22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
261
21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
262
19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
263
16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
264
12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
265
8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
266
4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
270
if (fdct->divisors[qtblno] == NULL) {
271
fdct->divisors[qtblno] = (DCTELEM *)
272
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
273
(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
275
dtbl = fdct->divisors[qtblno];
276
for (i = 0; i < DCTSIZE2; i++) {
277
if(!compute_reciprocal(
278
DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
279
(INT32) aanscales[i]),
280
CONST_BITS-3), &dtbl[i])
281
&& fdct->quantize == jsimd_quantize)
282
fdct->quantize = quantize;
287
#ifdef DCT_FLOAT_SUPPORTED
290
/* For float AA&N IDCT method, divisors are equal to quantization
291
* coefficients scaled by scalefactor[row]*scalefactor[col], where
293
* scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
294
* We apply a further scale factor of 8.
295
* What's actually stored is 1/divisor so that the inner loop can
296
* use a multiplication rather than a division.
300
static const double aanscalefactor[DCTSIZE] = {
301
1.0, 1.387039845, 1.306562965, 1.175875602,
302
1.0, 0.785694958, 0.541196100, 0.275899379
305
if (fdct->float_divisors[qtblno] == NULL) {
306
fdct->float_divisors[qtblno] = (FAST_FLOAT *)
307
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
308
DCTSIZE2 * SIZEOF(FAST_FLOAT));
310
fdtbl = fdct->float_divisors[qtblno];
312
for (row = 0; row < DCTSIZE; row++) {
313
for (col = 0; col < DCTSIZE; col++) {
314
fdtbl[i] = (FAST_FLOAT)
315
(1.0 / (((double) qtbl->quantval[i] *
316
aanscalefactor[row] * aanscalefactor[col] * 8.0)));
324
ERREXIT(cinfo, JERR_NOT_COMPILED);
332
* Load data into workspace, applying unsigned->signed conversion.
336
convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
338
register DCTELEM *workspaceptr;
339
register JSAMPROW elemptr;
342
workspaceptr = workspace;
343
for (elemr = 0; elemr < DCTSIZE; elemr++) {
344
elemptr = sample_data[elemr] + start_col;
346
#if DCTSIZE == 8 /* unroll the inner loop */
347
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
348
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
349
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
350
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
351
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
352
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
353
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
354
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
358
for (elemc = DCTSIZE; elemc > 0; elemc--)
359
*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
367
* Quantize/descale the coefficients, and store into coef_blocks[].
371
quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
375
UDCTELEM recip, corr, shift;
377
JCOEFPTR output_ptr = coef_block;
379
for (i = 0; i < DCTSIZE2; i++) {
381
recip = divisors[i + DCTSIZE2 * 0];
382
corr = divisors[i + DCTSIZE2 * 1];
383
shift = divisors[i + DCTSIZE2 * 3];
387
product = (UDCTELEM2)(temp + corr) * recip;
388
product >>= shift + sizeof(DCTELEM)*8;
392
product = (UDCTELEM2)(temp + corr) * recip;
393
product >>= shift + sizeof(DCTELEM)*8;
397
output_ptr[i] = (JCOEF) temp;
403
* Perform forward DCT on one or more blocks of a component.
405
* The input samples are taken from the sample_data[] array starting at
406
* position start_row/start_col, and moving to the right for any additional
407
* blocks. The quantized coefficients are returned in coef_blocks[].
411
forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
412
JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
413
JDIMENSION start_row, JDIMENSION start_col,
414
JDIMENSION num_blocks)
415
/* This version is used for integer DCT implementations. */
417
/* This routine is heavily used, so it's worth coding it tightly. */
418
my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
419
DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
423
/* Make sure the compiler doesn't look up these every pass */
424
forward_DCT_method_ptr do_dct = fdct->dct;
425
convsamp_method_ptr do_convsamp = fdct->convsamp;
426
quantize_method_ptr do_quantize = fdct->quantize;
427
workspace = fdct->workspace;
429
sample_data += start_row; /* fold in the vertical offset once */
431
for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
432
/* Load data into workspace, applying unsigned->signed conversion */
433
(*do_convsamp) (sample_data, start_col, workspace);
435
/* Perform the DCT */
436
(*do_dct) (workspace);
438
/* Quantize/descale the coefficients, and store into coef_blocks[] */
439
(*do_quantize) (coef_blocks[bi], divisors, workspace);
444
#ifdef DCT_FLOAT_SUPPORTED
448
convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
450
register FAST_FLOAT *workspaceptr;
451
register JSAMPROW elemptr;
454
workspaceptr = workspace;
455
for (elemr = 0; elemr < DCTSIZE; elemr++) {
456
elemptr = sample_data[elemr] + start_col;
457
#if DCTSIZE == 8 /* unroll the inner loop */
458
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
459
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
460
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
461
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
462
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
463
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
464
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
465
*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
469
for (elemc = DCTSIZE; elemc > 0; elemc--)
470
*workspaceptr++ = (FAST_FLOAT)
471
(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
479
quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
481
register FAST_FLOAT temp;
483
register JCOEFPTR output_ptr = coef_block;
485
for (i = 0; i < DCTSIZE2; i++) {
486
/* Apply the quantization and scaling factor */
487
temp = workspace[i] * divisors[i];
489
/* Round to nearest integer.
490
* Since C does not specify the direction of rounding for negative
491
* quotients, we have to force the dividend positive for portability.
492
* The maximum coefficient size is +-16K (for 12-bit data), so this
493
* code should work for either 16-bit or 32-bit ints.
495
output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
501
forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
502
JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
503
JDIMENSION start_row, JDIMENSION start_col,
504
JDIMENSION num_blocks)
505
/* This version is used for floating-point DCT implementations. */
507
/* This routine is heavily used, so it's worth coding it tightly. */
508
my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
509
FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
510
FAST_FLOAT * workspace;
514
/* Make sure the compiler doesn't look up these every pass */
515
float_DCT_method_ptr do_dct = fdct->float_dct;
516
float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
517
float_quantize_method_ptr do_quantize = fdct->float_quantize;
518
workspace = fdct->float_workspace;
520
sample_data += start_row; /* fold in the vertical offset once */
522
for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
523
/* Load data into workspace, applying unsigned->signed conversion */
524
(*do_convsamp) (sample_data, start_col, workspace);
526
/* Perform the DCT */
527
(*do_dct) (workspace);
529
/* Quantize/descale the coefficients, and store into coef_blocks[] */
530
(*do_quantize) (coef_blocks[bi], divisors, workspace);
534
#endif /* DCT_FLOAT_SUPPORTED */
538
* Initialize FDCT manager.
542
jinit_forward_dct (j_compress_ptr cinfo)
548
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
549
SIZEOF(my_fdct_controller));
550
cinfo->fdct = (struct jpeg_forward_dct *) fdct;
551
fdct->pub.start_pass = start_pass_fdctmgr;
553
/* First determine the DCT... */
554
switch (cinfo->dct_method) {
555
#ifdef DCT_ISLOW_SUPPORTED
557
fdct->pub.forward_DCT = forward_DCT;
558
if (jsimd_can_fdct_islow())
559
fdct->dct = jsimd_fdct_islow;
561
fdct->dct = jpeg_fdct_islow;
564
#ifdef DCT_IFAST_SUPPORTED
566
fdct->pub.forward_DCT = forward_DCT;
567
if (jsimd_can_fdct_ifast())
568
fdct->dct = jsimd_fdct_ifast;
570
fdct->dct = jpeg_fdct_ifast;
573
#ifdef DCT_FLOAT_SUPPORTED
575
fdct->pub.forward_DCT = forward_DCT_float;
576
if (jsimd_can_fdct_float())
577
fdct->float_dct = jsimd_fdct_float;
579
fdct->float_dct = jpeg_fdct_float;
583
ERREXIT(cinfo, JERR_NOT_COMPILED);
587
/* ...then the supporting stages. */
588
switch (cinfo->dct_method) {
589
#ifdef DCT_ISLOW_SUPPORTED
592
#ifdef DCT_IFAST_SUPPORTED
595
#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
596
if (jsimd_can_convsamp())
597
fdct->convsamp = jsimd_convsamp;
599
fdct->convsamp = convsamp;
600
if (jsimd_can_quantize())
601
fdct->quantize = jsimd_quantize;
603
fdct->quantize = quantize;
606
#ifdef DCT_FLOAT_SUPPORTED
608
if (jsimd_can_convsamp_float())
609
fdct->float_convsamp = jsimd_convsamp_float;
611
fdct->float_convsamp = convsamp_float;
612
if (jsimd_can_quantize_float())
613
fdct->float_quantize = jsimd_quantize_float;
615
fdct->float_quantize = quantize_float;
619
ERREXIT(cinfo, JERR_NOT_COMPILED);
623
/* Allocate workspace memory */
624
#ifdef DCT_FLOAT_SUPPORTED
625
if (cinfo->dct_method == JDCT_FLOAT)
626
fdct->float_workspace = (FAST_FLOAT *)
627
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
628
SIZEOF(FAST_FLOAT) * DCTSIZE2);
631
fdct->workspace = (DCTELEM *)
632
(*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
633
SIZEOF(DCTELEM) * DCTSIZE2);
635
/* Mark divisor tables unallocated */
636
for (i = 0; i < NUM_QUANT_TBLS; i++) {
637
fdct->divisors[i] = NULL;
638
#ifdef DCT_FLOAT_SUPPORTED
639
fdct->float_divisors[i] = NULL;