    vector unsigned char v;

    vector signed short v;
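/*
   These two 'v' fields are the vector members of the small helper unions
   used below: the signed-short one is the vec_ss_t behind srchv, srclv and
   fv, whose other member exposes the same 16 bytes as an array of shorts
   (which is how fv[i].s[0] is written before vec_splat broadcasts it).
*/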
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter)
{
    vector unsigned char *tv, tmp, dstv, zero;
    vec_ss_t srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;
    /*
       The vec_madds later on does an implicit >>15 on the result.
       Since FILTER_BITS is 8, and we have 15 bits of magnitude in
       a signed short, we have just enough bits to pre-shift our
       filter constants <<7 to compensate for vec_madds.
    */
    fv[i].s[0] = filter[i] << (15 - FILTER_BITS);
    fv[i].v = vec_splat(fv[i].v, 0);
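    /*
       For reference, the per-lane arithmetic this relies on: vec_madds
       computes saturate(((a * b) >> 15) + c) in each 16-bit lane.  With
       the coefficient pre-shifted by 15 - FILTER_BITS = 7, a pixel p and
       coefficient f give

           ((p * (f << 7)) >> 15) == (p * f) >> FILTER_BITS

       so the implicit >>15 reproduces the scalar renormalisation.  (The
       vector path truncates each product before accumulating, whereas the
       scalar code shifts the full sum once, so the low bits can differ.)
    */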
    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);
    /*
       When we're resampling, we'd ideally like both our input and output
       buffers to be 16-byte aligned, so we can do both aligned reads and
       writes.  Sadly we can't always have this at the moment, so we opt
       for aligned writes, as unaligned writes have a huge overhead.
       To do this, do enough scalar resamples to get dst 16-byte aligned.
    */
    i = (-(int)dst) & 0xf;
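    /*
       Example: if dst == 0x1009, (-(int)dst) & 0xf == 7, so the scalar
       code below produces seven pixels before dst reaches the next
       16-byte boundary (0x1010) and the vector loop can use aligned
       stores.  Only the low four address bits matter, so the narrowing
       cast to int is harmless here.
    */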
    sum = s[0 * wrap] * filter[0] +
          s[1 * wrap] * filter[1] +
          s[2 * wrap] * filter[2] +
          s[3 * wrap] * filter[3];
    sum = sum >> FILTER_BITS;
    if (sum < 0) sum = 0; else if (sum > 255) sum = 255;
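    /*
       This is the plain 4-tap vertical filter: each output pixel is a
       weighted sum of the same column in four source lines (wrap is the
       line stride), renormalised by FILTER_BITS and clamped to [0, 255].
       The same expression reappears after the vector loop to handle the
       leftover pixels.
    */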
    /* Do our altivec resampling on 16 pixels at once. */
    while (dst_width >= 16) {
        /*
           Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3-cycle latency
           of the vec_madds.
        */
        tv = (vector unsigned char *) &s[0 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
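        /*
           The load idiom above: tv[0] and tv[1] are the two aligned
           16-byte blocks straddling the (possibly unaligned) start of the
           line, and vec_perm with the vec_lvsl permute vector extracts
           the 16 wanted bytes.  vec_mergeh/vec_mergel against a zero
           vector then widen those unsigned bytes into the high and low
           groups of 8 shorts for vec_madds.  (i is expected to be 0 here
           after the alignment prologue, so the lvsl offset matches the
           &s[0 * wrap] used for tv.)
        */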
        tv = (vector unsigned char *) &s[1 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        tv = (vector unsigned char *) &s[2 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        tv = (vector unsigned char *) &s[3 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
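        /*
           After the fourth multiply/accumulate, sumhv holds the first 8
           filtered pixels and sumlv the last 8, as saturated signed
           shorts already scaled back by the implicit >>15.
        */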
        /*
           Pack the results into our destination vector,
           and do an aligned write of that back to memory.
        */
        dstv = vec_packsu(sumhv, sumlv);
        vec_st(dstv, 0, (vector unsigned char *) dst);
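        /*
           vec_packsu packs the two signed-short vectors down to 16
           unsigned chars with saturation, clamping to [0, 255] exactly as
           the scalar "if (sum < 0) ... else if (sum > 255)" path does,
           and vec_st relies on the 16-byte alignment of dst that the
           scalar prologue established.
        */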
    /*
       If there are any leftover pixels, resample them
       with the slow scalar method.
    */
    sum = s[0 * wrap] * filter[0] +
          s[1 * wrap] * filter[1] +
          s[2 * wrap] * filter[2] +
          s[3 * wrap] * filter[3];
    sum = sum >> FILTER_BITS;
    if (sum < 0) sum = 0; else if (sum > 255) sum = 255;
/* slow version to handle limit cases. Does not need optimisation */
static void h_resample_slow(uint8_t *dst, int dst_width,
                            const uint8_t *src, int src_width,
                            int src_start, int src_incr, int16_t *filters)