/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/**
 * @file imgresample.c
 * High quality image resampling with polyphase filters.
 */
29
#include "fastmemcpy.h"
32
#define NB_COMPONENTS 3

/* NOTE(review): PHASE_BITS, NB_TAPS and FILTER_BITS were dropped by the
   corrupted extraction; values restored from upstream libavcodec
   (NB_PHASES/LINE_BUF_HEIGHT and the MMX/AltiVec paths depend on them). */
#define PHASE_BITS 4
#define NB_PHASES  (1 << PHASE_BITS)
#define NB_TAPS    4
#define FCENTER 1 /* index of the center of the filter */
//#define TEST 1 /* Test it */

#define POS_FRAC_BITS 16
#define POS_FRAC (1 << POS_FRAC_BITS)
/* 6 bits precision is needed for MMX */
#define FILTER_BITS 8

#define LINE_BUF_HEIGHT (NB_TAPS * 4)
struct ImgReSampleContext {
48
int iwidth, iheight, owidth, oheight;
49
int topBand, bottomBand, leftBand, rightBand;
50
int padtop, padbottom, padleft, padright;
51
int pad_owidth, pad_oheight;
53
int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
54
int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
58
static inline int get_phase(int pos)
60
return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
63
/* This function must be optimized */
64
static void h_resample_fast(uint8_t *dst, int dst_width, const uint8_t *src,
65
int src_width, int src_start, int src_incr,
68
int src_pos, phase, sum, i;
73
for(i=0;i<dst_width;i++) {
76
if ((src_pos >> POS_FRAC_BITS) < 0 ||
77
(src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
80
s = src + (src_pos >> POS_FRAC_BITS);
81
phase = get_phase(src_pos);
82
filter = filters + phase * NB_TAPS;
84
sum = s[0] * filter[0] +
92
for(j=0;j<NB_TAPS;j++)
93
sum += s[j] * filter[j];
96
sum = sum >> FILTER_BITS;
107
/* This function must be optimized */
108
static void v_resample(uint8_t *dst, int dst_width, const uint8_t *src,
109
int wrap, int16_t *filter)
115
for(i=0;i<dst_width;i++) {
117
sum = s[0 * wrap] * filter[0] +
118
s[1 * wrap] * filter[1] +
119
s[2 * wrap] * filter[2] +
120
s[3 * wrap] * filter[3];
127
for(j=0;j<NB_TAPS;j++) {
128
sum += s1[0] * filter[j];
133
sum = sum >> FILTER_BITS;
146
#include "i386/mmx.h"
148
/* Compute one horizontally filtered pixel into the low 32 bits of MMX
   register 'reg': load 4 source bytes, widen to words, multiply-add by
   the phase filter, fold the two dwords and shift down by FILTER_BITS.
   Uses/updates the locals s, src_pos, phase, filter of the caller.
   NOTE(review): body restored from upstream; the extraction dropped the
   load and fold steps. */
#define FILTER4(reg) \
{\
        s = src + (src_pos >> POS_FRAC_BITS);\
        phase = get_phase(src_pos);\
        filter = filters + phase * NB_TAPS;\
        movq_m2r(*s, reg);\
        punpcklbw_r2r(mm7, reg);\
        movq_m2r(*filter, mm6);\
        pmaddwd_r2r(reg, mm6);\
        movq_r2r(mm6, reg);\
        psrlq_i2r(32, reg);\
        paddd_r2r(mm6, reg);\
        psrad_i2r(FILTER_BITS, reg);\
        src_pos += src_incr;\
}

#define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
/* XXX: do four pixels at a time */
167
static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
168
const uint8_t *src, int src_width,
169
int src_start, int src_incr, int16_t *filters)
179
while (dst_width >= 4) {
186
packuswb_r2r(mm7, mm0);
187
packuswb_r2r(mm7, mm1);
188
packuswb_r2r(mm7, mm3);
189
packuswb_r2r(mm7, mm2);
201
while (dst_width > 0) {
203
packuswb_r2r(mm7, mm0);
212
static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
213
int wrap, int16_t *filter)
230
while (dst_width >= 4) {
231
movq_m2r(s[0 * wrap], mm0);
232
punpcklbw_r2r(mm7, mm0);
233
movq_m2r(s[1 * wrap], mm1);
234
punpcklbw_r2r(mm7, mm1);
235
movq_m2r(s[2 * wrap], mm2);
236
punpcklbw_r2r(mm7, mm2);
237
movq_m2r(s[3 * wrap], mm3);
238
punpcklbw_r2r(mm7, mm3);
240
pmullw_m2r(coefs[0], mm0);
241
pmullw_m2r(coefs[1], mm1);
242
pmullw_m2r(coefs[2], mm2);
243
pmullw_m2r(coefs[3], mm3);
248
psraw_i2r(FILTER_BITS, mm0);
250
packuswb_r2r(mm7, mm0);
253
*(uint32_t *)dst = tmp.ud[0];
258
while (dst_width > 0) {
259
sum = s[0 * wrap] * filter[0] +
260
s[1 * wrap] * filter[1] +
261
s[2 * wrap] * filter[2] +
262
s[3 * wrap] * filter[3];
263
sum = sum >> FILTER_BITS;
279
vector unsigned char v;
284
vector signed short v;
288
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
289
int wrap, int16_t *filter)
293
vector unsigned char *tv, tmp, dstv, zero;
294
vec_ss_t srchv[4], srclv[4], fv[4];
295
vector signed short zeros, sumhv, sumlv;
301
The vec_madds later on does an implicit >>15 on the result.
302
Since FILTER_BITS is 8, and we have 15 bits of magnitude in
303
a signed short, we have just enough bits to pre-shift our
304
filter constants <<7 to compensate for vec_madds.
306
fv[i].s[0] = filter[i] << (15-FILTER_BITS);
307
fv[i].v = vec_splat(fv[i].v, 0);
310
zero = vec_splat_u8(0);
311
zeros = vec_splat_s16(0);
315
When we're resampling, we'd ideally like both our input buffers,
316
and output buffers to be 16-byte aligned, so we can do both aligned
317
reads and writes. Sadly we can't always have this at the moment, so
318
we opt for aligned writes, as unaligned writes have a huge overhead.
319
To do this, do enough scalar resamples to get dst 16-byte aligned.
321
i = (-(int)dst) & 0xf;
323
sum = s[0 * wrap] * filter[0] +
324
s[1 * wrap] * filter[1] +
325
s[2 * wrap] * filter[2] +
326
s[3 * wrap] * filter[3];
327
sum = sum >> FILTER_BITS;
328
if (sum<0) sum = 0; else if (sum>255) sum=255;
336
/* Do our altivec resampling on 16 pixels at once. */
337
while(dst_width>=16) {
339
Read 16 (potentially unaligned) bytes from each of
340
4 lines into 4 vectors, and split them into shorts.
341
Interleave the multipy/accumulate for the resample
342
filter with the loads to hide the 3 cycle latency
345
tv = (vector unsigned char *) &s[0 * wrap];
346
tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
347
srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
348
srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
349
sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
350
sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
352
tv = (vector unsigned char *) &s[1 * wrap];
353
tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
354
srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
355
srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
356
sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
357
sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
359
tv = (vector unsigned char *) &s[2 * wrap];
360
tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
361
srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
362
srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
363
sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
364
sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
366
tv = (vector unsigned char *) &s[3 * wrap];
367
tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
368
srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
369
srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
370
sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
371
sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
374
Pack the results into our destination vector,
375
and do an aligned write of that back to memory.
377
dstv = vec_packsu(sumhv, sumlv) ;
378
vec_st(dstv, 0, (vector unsigned char *) dst);
386
If there are any leftover pixels, resample them
387
with the slow scalar method.
390
sum = s[0 * wrap] * filter[0] +
391
s[1 * wrap] * filter[1] +
392
s[2 * wrap] * filter[2] +
393
s[3 * wrap] * filter[3];
394
sum = sum >> FILTER_BITS;
395
if (sum<0) sum = 0; else if (sum>255) sum=255;
404
/* slow version to handle limit cases. Does not need optimisation */
405
static void h_resample_slow(uint8_t *dst, int dst_width,
406
const uint8_t *src, int src_width,
407
int src_start, int src_incr, int16_t *filters)
409
int src_pos, phase, sum, j, v, i;
410
const uint8_t *s, *src_end;
413
src_end = src + src_width;
415
for(i=0;i<dst_width;i++) {
416
s = src + (src_pos >> POS_FRAC_BITS);
417
phase = get_phase(src_pos);
418
filter = filters + phase * NB_TAPS;
420
for(j=0;j<NB_TAPS;j++) {
423
else if (s >= src_end)
427
sum += v * filter[j];
430
sum = sum >> FILTER_BITS;
441
static void h_resample(uint8_t *dst, int dst_width, const uint8_t *src,
442
int src_width, int src_start, int src_incr,
448
n = (0 - src_start + src_incr - 1) / src_incr;
449
h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
452
src_start += n * src_incr;
454
src_end = src_start + dst_width * src_incr;
455
if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
456
n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
462
if ((mm_flags & MM_MMX) && NB_TAPS == 4)
463
h_resample_fast4_mmx(dst, n,
464
src, src_width, src_start, src_incr, filters);
467
h_resample_fast(dst, n,
468
src, src_width, src_start, src_incr, filters);
472
src_start += n * src_incr;
473
h_resample_slow(dst, dst_width,
474
src, src_width, src_start, src_incr, filters);
478
/* Resample one image plane: horizontally filter input lines into a ring
   buffer on demand, then vertically filter NB_TAPS buffered lines for
   each output line. Edge lines are replicated. */
static void component_resample(ImgReSampleContext *s,
                               uint8_t *output, int owrap, int owidth, int oheight,
                               uint8_t *input, int iwrap, int iwidth, int iheight)
{
    int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
    uint8_t *new_line, *src_line;

    last_src_y = - FCENTER - 1;
    /* position of the bottom of the filter in the source image */
    src_y = (last_src_y + NB_TAPS) * POS_FRAC;
    ring_y = NB_TAPS; /* position in ring buffer */
    for(y=0;y<oheight;y++) {
        /* apply horizontal filter on new lines from input if needed */
        src_y1 = src_y >> POS_FRAC_BITS;
        while (last_src_y < src_y1) {
            if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
                ring_y = NB_TAPS;
            last_src_y++;
            /* handle limit conditions : replicate line (slightly
               inefficient because we filter multiple times) */
            y1 = last_src_y;
            if (y1 < 0) {
                y1 = 0;
            } else if (y1 >= iheight) {
                y1 = iheight - 1;
            }
            src_line = input + y1 * iwrap;
            new_line = s->line_buf + ring_y * owidth;
            /* apply filter and handle limit cases correctly */
            h_resample(new_line, owidth,
                       src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
                       &s->h_filters[0][0]);
            /* handle ring buffer wraping */
            if (ring_y >= LINE_BUF_HEIGHT) {
                memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
                       new_line, owidth);
            }
        }
        /* apply vertical filter */
        phase_y = get_phase(src_y);
#ifdef HAVE_MMX
        /* desactivated MMX because loss of precision */
        if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
            v_resample4_mmx(output, owidth,
                            s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                            &s->v_filters[phase_y][0]);
        else
#endif
#ifdef HAVE_ALTIVEC
        if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS <= 6)
            v_resample16_altivec(output, owidth,
                                 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                                 &s->v_filters[phase_y][0]);
        else
#endif
            v_resample(output, owidth,
                       s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                       &s->v_filters[phase_y][0]);
        src_y += s->v_incr;
        output += owrap;
    }
}
/* XXX: the following filter is quite naive, but it seems to suffice
545
static void build_filter(int16_t *filter, float factor)
548
float x, y, tab[NB_TAPS], norm, mult;
550
/* if upsampling, only need to interpolate, no filter */
554
for(ph=0;ph<NB_PHASES;ph++) {
556
for(i=0;i<NB_TAPS;i++) {
558
x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
567
/* normalize so that an uniform color remains the same */
568
mult = (float)(1 << FILTER_BITS) / norm;
569
for(i=0;i<NB_TAPS;i++) {
570
v = (int)(tab[i] * mult);
571
filter[ph * NB_TAPS + i] = v;
576
/* Convenience constructor: full-frame resample with no cropping bands
   and no padding. Returns NULL on allocation failure. */
ImgReSampleContext *img_resample_init(int owidth, int oheight,
                                      int iwidth, int iheight)
{
    return img_resample_full_init(owidth, oheight, iwidth, iheight,
                                  0, 0, 0, 0, 0, 0, 0, 0);
}
ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
584
int iwidth, int iheight,
585
int topBand, int bottomBand,
586
int leftBand, int rightBand,
587
int padtop, int padbottom,
588
int padleft, int padright)
590
ImgReSampleContext *s;
592
s = av_mallocz(sizeof(ImgReSampleContext));
595
s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
600
s->oheight = oheight;
602
s->iheight = iheight;
604
s->topBand = topBand;
605
s->bottomBand = bottomBand;
606
s->leftBand = leftBand;
607
s->rightBand = rightBand;
610
s->padbottom = padbottom;
611
s->padleft = padleft;
612
s->padright = padright;
614
s->pad_owidth = owidth - (padleft + padright);
615
s->pad_oheight = oheight - (padtop + padbottom);
617
s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / s->pad_owidth;
618
s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / s->pad_oheight;
620
build_filter(&s->h_filters[0][0], (float) s->pad_owidth /
621
(float) (iwidth - leftBand - rightBand));
622
build_filter(&s->v_filters[0][0], (float) s->pad_oheight /
623
(float) (iheight - topBand - bottomBand));
631
/* Rescale all NB_COMPONENTS planes of a picture; plane 0 is full size,
   the others are subsampled by 2 in each direction (YUV 4:2:0 layout
   per the shift below). Crop bands and padding from the context are
   honoured. */
void img_resample(ImgReSampleContext *s,
                  AVPicture *output, const AVPicture *input)
{
    int i, shift;
    uint8_t *optr;

    for (i=0;i<NB_COMPONENTS;i++) {
        /* chroma planes are half resolution in both directions */
        shift = (i == 0) ? 0 : 1;

        /* skip the padding area at the top/left of the output plane */
        optr = output->data[i] + (((output->linesize[i] *
                        s->padtop) + s->padleft) >> shift);

        component_resample(s, optr, output->linesize[i],
                s->pad_owidth >> shift, s->pad_oheight >> shift,
                input->data[i] + (input->linesize[i] *
                    (s->topBand >> shift)) + (s->leftBand >> shift),
                input->linesize[i], ((s->iwidth - s->leftBand -
                        s->rightBand) >> shift),
                (s->iheight - s->topBand - s->bottomBand) >> shift);
    }
}
/* Free a resample context and its line buffer. Tolerates NULL. */
void img_resample_close(ImgReSampleContext *s)
{
    if (!s)
        return;
    av_free(s->line_buf);
    av_free(s);
}
/* Allocate a zero-initialized buffer of 'size' bytes (standalone
   replacement used by the TEST build). Returns NULL on failure or when
   size <= 0. Caller frees with av_free(). */
void *av_mallocz(int size)
{
    void *ptr;

    if (size <= 0)
        return NULL;
    ptr = malloc(size);
    if (!ptr)
        return NULL;
    memset(ptr, 0, size);
    return ptr;
}
/* Free a buffer obtained from av_mallocz(). Accepts NULL. */
void av_free(void *ptr)
{
    /* free(NULL) is a guaranteed no-op in ISO C, so the old
       "if (ptr)" guard (flagged XXX in the original) is unnecessary */
    free(ptr);
}
/* TEST build fixtures: source image and two destination buffers big
   enough for a 2x upscale. NOTE(review): XSIZE/YSIZE macros were lost
   in the corrupted extraction; values restored from upstream. */
#define XSIZE 256
#define YSIZE 256
uint8_t img[XSIZE * YSIZE];

#define XSIZE1 512
#define YSIZE1 512
uint8_t img1[XSIZE1 * YSIZE1];
uint8_t img2[XSIZE1 * YSIZE1];
/* Write a grayscale image as a binary PGM (P5) file for visual
   inspection of the test output. */
void save_pgm(const char *filename, uint8_t *img, int xsize, int ysize)
{
    FILE *f;

    /* "wb": the payload is binary; also check fopen instead of
       dereferencing a NULL stream as the original did */
    f = fopen(filename, "wb");
    if (!f) {
        fprintf(stderr, "could not open '%s' for writing\n", filename);
        return;
    }
    fprintf(f, "P5\n%d %d\n%d\n", xsize, ysize, 255);
    fwrite(img, 1, xsize * ysize, f);
    fclose(f);
}
static void dump_filter(int16_t *filter)
700
for(ph=0;ph<NB_PHASES;ph++) {
702
for(i=0;i<NB_TAPS;i++) {
703
printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
713
int main(int argc, char **argv)
715
int x, y, v, i, xsize, ysize;
716
ImgReSampleContext *s;
717
float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
720
/* build test image */
721
for(y=0;y<YSIZE;y++) {
722
for(x=0;x<XSIZE;x++) {
723
if (x < XSIZE/2 && y < YSIZE/2) {
724
if (x < XSIZE/4 && y < YSIZE/4) {
730
} else if (x < XSIZE/4) {
735
} else if (y < XSIZE/4) {
747
if (((x+3) % 4) <= 1 &&
754
} else if (x < XSIZE/2) {
755
v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
756
} else if (y < XSIZE/2) {
757
v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
759
v = ((x + y - XSIZE) * 255) / XSIZE;
761
img[(YSIZE - y) * XSIZE + (XSIZE - x)] = v;
764
save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
765
for(i=0;i<sizeof(factors)/sizeof(float);i++) {
767
xsize = (int)(XSIZE * fact);
768
ysize = (int)((YSIZE - 100) * fact);
769
s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE, 50 ,50, 0, 0);
770
printf("Factor=%0.2f\n", fact);
771
dump_filter(&s->h_filters[0][0]);
772
component_resample(s, img1, xsize, xsize, ysize,
773
img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
774
img_resample_close(s);
776
sprintf(buf, "/tmp/out%d.pgm", i);
777
save_pgm(buf, img1, xsize, ysize);
782
printf("MMX test\n");
784
xsize = (int)(XSIZE * fact);
785
ysize = (int)(YSIZE * fact);
787
s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
788
component_resample(s, img1, xsize, xsize, ysize,
789
img, XSIZE, XSIZE, YSIZE);
792
s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
793
component_resample(s, img2, xsize, xsize, ysize,
794
img, XSIZE, XSIZE, YSIZE);
795
if (memcmp(img1, img2, xsize * ysize) != 0) {
796
fprintf(stderr, "mmx error\n");