85
85
/* External declaration; no definition is visible in this section.
 * Takes an input byte buffer (inPtr), a row count and a column count, and
 * three short output pointers (outyPtr, outuPtr, outvPtr).
 * NOTE(review): presumably the MMX RGB -> YCbCr converter, judging by the
 * name alone -- confirm against the assembly source. */
extern void _dv_rgbtoycb_mmx(unsigned char* inPtr, int rows, int columns,
86
86
short* outyPtr, short* outuPtr, short* outvPtr);
88
/* External declaration with the same parameter list as _dv_rgbtoycb_mmx;
 * no definition is visible in this section.
 * NOTE(review): presumably the x86_64 build of the same routine -- confirm
 * against the assembly source. */
extern void _dv_rgbtoycb_mmx_x86_64(unsigned char* inPtr, int rows, int columns,
89
short* outyPtr, short* outuPtr, short* outvPtr);
88
91
void dv_enc_rgb_to_ycb(unsigned char* img_rgb, int height,
89
92
short* img_y, short* img_cr, short* img_cb)
94
#if (!ARCH_X86) && (!ARCH_X86_64)
264
/* External declaration; no definition is visible in this section.
 * Visible callers pass one block's coeffs array and add 1 to the result.
 * The value taken before the blocks are transposed is scaled by 65536 and
 * divided by the value taken afterwards; that ratio is compared with
 * DCT_248_THRESHOLD to choose between DV_DCT_248 and DV_DCT_88.
 * NOTE(review): the "+ 1" presumably guards the division against a zero
 * result -- confirm. */
extern int _dv_need_dct_248_mmx_x86_64_rows(dv_coeff_t * bl);
266
/* External declaration; no definition is visible in this section.
 * Visible callers invoke it once on each of a macroblock's six coeffs
 * arrays (bl[0] .. bl[5]).
 * NOTE(review): presumably transposes the block in place -- confirm. */
extern void _dv_transpose_mmx_x86_64(short * dst);
267
/* External declaration; no definition is visible in this section.
 * Visible callers pass a block's coeffs array as dst (bl[0] .. bl[3]) and,
 * as src, a pointer into the short-typed img_y buffer at
 * y * DV_WIDTH + x plus a per-block offset. */
extern void _dv_ppm_copy_y_block_mmx_x86_64(short * dst, short * src);
268
/* External declaration; no definition is visible in this section.
 * Visible callers use it in the isPAL branch, passing bl[4].coeffs or
 * bl[5].coeffs as dst and img_cr / img_cb offset by
 * y * DV_WIDTH/2 + x/2 as src. */
extern void _dv_ppm_copy_pal_c_block_mmx_x86_64(short * dst, short * src);
269
/* External declaration; no definition is visible in this section.
 * Visible callers pass bl[4].coeffs or bl[5].coeffs as dst and
 * img_cr / img_cb offset by y*DV_WIDTH/2 + x/2 as src, alongside luma
 * copies at start_y, +8, +16 and +24. */
extern void _dv_ppm_copy_ntsc_c_block_mmx_x86_64(short * dst, short * src);
271
static void finish_mb_mmx_x86_64(dv_macroblock_t* mb)
274
int need_dct_248_rows[6];
275
dv_block_t* bl = mb->b;
277
if (force_dct != -1) {
278
for (b = 0; b < 6; b++) {
279
bl[b].dct_mode = force_dct;
282
for (b = 0; b < 6; b++) {
284
= _dv_need_dct_248_mmx_x86_64_rows(bl[b].coeffs) + 1;
287
_dv_transpose_mmx_x86_64(bl[0].coeffs);
288
_dv_transpose_mmx_x86_64(bl[1].coeffs);
289
_dv_transpose_mmx_x86_64(bl[2].coeffs);
290
_dv_transpose_mmx_x86_64(bl[3].coeffs);
291
_dv_transpose_mmx_x86_64(bl[4].coeffs);
292
_dv_transpose_mmx_x86_64(bl[5].coeffs);
294
if (force_dct == -1) {
295
for (b = 0; b < 6; b++) {
297
((need_dct_248_rows[b] * 65536 /
298
(_dv_need_dct_248_mmx_x86_64_rows(bl[b].coeffs) + 1))
299
> DCT_248_THRESHOLD) ? DV_DCT_248 : DV_DCT_88;
255
304
#endif /* ARCH_X86 */
257
306
static int read_ppm_stream(FILE* f, int * isPAL, int * height_)
523
572
finish_mb_mmx(mb);
578
if (isPAL) { /* PAL */
579
short* start_y = img_y + y * DV_WIDTH + x;
580
_dv_ppm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
581
_dv_ppm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
582
_dv_ppm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
583
_dv_ppm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH + 8);
584
_dv_ppm_copy_pal_c_block_mmx_x86_64(bl[4].coeffs,
585
img_cr+y * DV_WIDTH/2+ x/2);
586
_dv_ppm_copy_pal_c_block_mmx_x86_64(bl[5].coeffs,
587
img_cb+y * DV_WIDTH/2+ x/2);
588
} else if (mb->x == DV_WIDTH- 16) { /* rightmost NTSC block */
589
short* start_y = img_y + y * DV_WIDTH + x;
592
_dv_ppm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
593
_dv_ppm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
594
_dv_ppm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
595
_dv_ppm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH + 8);
597
for (j = 0; j < 8; j++) {
598
for (i = 0; i < 4; i++) {
599
bl[4].coeffs[8 * j + i] =
600
(img_cr[(y + j) * DV_WIDTH/2
602
+ img_cr[(y + j) * DV_WIDTH/2
603
+ x / 2 + 1 + i*2]) >> 1;
604
bl[5].coeffs[8 * j + i] =
605
(img_cb[(y + j) * DV_WIDTH/2
607
+ img_cb[(y + j) * DV_WIDTH/2
608
+ x / 2 + 1 + i*2]) >> 1;
609
bl[4].coeffs[8 * j + i + 4] =
610
(img_cr[(y + j + 8) * DV_WIDTH/2
612
+ img_cr[(y + j + 8) * DV_WIDTH/2
613
+ x / 2 + 1 + i*2]) >> 1;
614
bl[5].coeffs[8 * j + i + 4] =
615
(img_cb[(y + j + 8) * DV_WIDTH/2
617
+ img_cb[(y + j + 8) * DV_WIDTH/2
618
+ x / 2 + 1 + i*2]) >> 1;
623
short* start_y = img_y + y * DV_WIDTH + x;
624
_dv_ppm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
625
_dv_ppm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
626
_dv_ppm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 16);
627
_dv_ppm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 24);
628
_dv_ppm_copy_ntsc_c_block_mmx_x86_64(bl[4].coeffs,
629
img_cr + y*DV_WIDTH/2 + x/2);
630
_dv_ppm_copy_ntsc_c_block_mmx_x86_64(bl[5].coeffs,
631
img_cb + y*DV_WIDTH/2 + x/2);
634
finish_mb_mmx_x86_64(mb);
666
777
/* External declaration: short destination, unsigned char source.
 * Neither a definition nor a call is visible in this section.
 * NOTE(review): presumably the 32-bit counterpart of
 * _dv_pgm_copy_y_block_mmx_x86_64 -- confirm. */
extern void _dv_pgm_copy_y_block_mmx(short * dst, unsigned char * src);
667
778
/* External declaration: short destination, unsigned char source.
 * Neither a definition nor a call is visible in this section.
 * NOTE(review): presumably the 32-bit counterpart of
 * _dv_pgm_copy_pal_c_block_mmx_x86_64 -- confirm. */
extern void _dv_pgm_copy_pal_c_block_mmx(short * dst, unsigned char * src);
668
779
/* External declaration: short destination, unsigned char source.
 * Neither a definition nor a call is visible in this section.
 * NOTE(review): presumably the 32-bit counterpart of
 * _dv_pgm_copy_ntsc_c_block_mmx_x86_64 -- confirm. */
extern void _dv_pgm_copy_ntsc_c_block_mmx(short * dst, unsigned char * src);
781
/* External declaration; no definition is visible in this section.
 * Visible callers pass a block's coeffs array as dst (bl[0] .. bl[3]) and,
 * as src, a byte pointer into real_readbuf at y * DV_WIDTH + x plus a
 * per-block offset. */
extern void _dv_pgm_copy_y_block_mmx_x86_64(short * dst, unsigned char * src);
782
/* External declaration; no definition is visible in this section.
 * Visible callers use it in the isPAL branch, passing bl[4].coeffs or
 * bl[5].coeffs as dst and a byte pointer (img_cr / img_cb, both derived
 * from real_readbuf) offset by y * DV_WIDTH / 2 + x / 2 as src. */
extern void _dv_pgm_copy_pal_c_block_mmx_x86_64(short * dst, unsigned char * src);
783
/* External declaration; no definition is visible in this section.
 * Visible callers pass bl[4].coeffs or bl[5].coeffs as dst and a byte
 * pointer (img_cr / img_cb, derived from real_readbuf past
 * DV_WIDTH * DV_NTSC_HEIGHT) offset by y * DV_WIDTH / 2 + x / 2 as src. */
extern void _dv_pgm_copy_ntsc_c_block_mmx_x86_64(short * dst, unsigned char * src);
671
786
static void pgm_fill_macroblock(dv_macroblock_t *mb, int isPAL)
811
926
finish_mb_mmx(mb);
932
if (isPAL) { /* PAL */
933
unsigned char* start_y = real_readbuf + y * DV_WIDTH + x;
934
unsigned char* img_cr = real_readbuf
935
+ DV_WIDTH * DV_PAL_HEIGHT + DV_WIDTH / 2;
936
unsigned char* img_cb = real_readbuf
937
+ DV_WIDTH * DV_PAL_HEIGHT;
939
_dv_pgm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
940
_dv_pgm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
941
_dv_pgm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
942
_dv_pgm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH + 8);
943
_dv_pgm_copy_pal_c_block_mmx_x86_64(bl[4].coeffs,
944
img_cr + y * DV_WIDTH / 2 + x / 2);
945
_dv_pgm_copy_pal_c_block_mmx_x86_64(bl[5].coeffs,
946
img_cb + y * DV_WIDTH / 2 + x / 2);
947
} else if (x == DV_WIDTH- 16) { /* rightmost NTSC block */
948
unsigned char* start_y = real_readbuf + y * DV_WIDTH + x;
950
unsigned char* img_cr = real_readbuf
951
+ (isPAL ? (DV_WIDTH * DV_PAL_HEIGHT)
952
: (DV_WIDTH * DV_NTSC_HEIGHT)) + DV_WIDTH / 2;
953
unsigned char* img_cb = real_readbuf
954
+ (isPAL ? (DV_WIDTH * DV_PAL_HEIGHT)
955
: (DV_WIDTH * DV_NTSC_HEIGHT));
959
_dv_pgm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
960
_dv_pgm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
961
_dv_pgm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
962
_dv_pgm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH + 8);
964
for (j = 0; j < 8; j++) {
965
for (i = 0; i < 4; i++) {
966
bl[4].coeffs[8*j + i*2] =
967
bl[4].coeffs[8*j + i*2 + 1] =
968
pgm_get_cr_ntsc(y/2 + j, x/2 + i * 2);
969
bl[5].coeffs[8*j + i*2] =
970
bl[5].coeffs[8*j + i*2 + 1] =
971
pgm_get_cb_ntsc(y/2 + j, x/2 + i * 2);
972
bl[4].coeffs[8*j + (i+4)*2] =
973
bl[4].coeffs[8*j + (i+4)*2 + 1] =
974
pgm_get_cr_ntsc(y/2 + j +8, x/2 + i * 2);
975
bl[5].coeffs[8*j + (i+4)*2] =
976
bl[5].coeffs[8*j + (i+4)*2 + 1] =
977
pgm_get_cb_ntsc(y/2 + j +8, x/2 + i * 2);
981
unsigned char* start_y = real_readbuf + y * DV_WIDTH + x;
982
unsigned char* img_cr = real_readbuf
983
+ DV_WIDTH * DV_NTSC_HEIGHT + DV_WIDTH / 2;
984
unsigned char* img_cb = real_readbuf
985
+ DV_WIDTH * DV_NTSC_HEIGHT;
986
_dv_pgm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
987
_dv_pgm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
988
_dv_pgm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 16);
989
_dv_pgm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 24);
990
_dv_pgm_copy_ntsc_c_block_mmx_x86_64(bl[4].coeffs,
991
img_cr + y * DV_WIDTH / 2 + x / 2);
992
_dv_pgm_copy_ntsc_c_block_mmx_x86_64(bl[5].coeffs,
993
img_cb + y * DV_WIDTH / 2 + x / 2);
996
finish_mb_mmx_x86_64(mb);
816
1001
register int i, j;
1105
1294
finish_mb_mmx(mb);
1298
if (isPAL) { /* PAL */
1299
unsigned char* start_y = real_readbuf + y * DV_WIDTH + x;
1300
unsigned char* img_cr = real_readbuf
1301
+ (isPAL ? DV_WIDTH * DV_PAL_HEIGHT * 3/2
1302
: DV_WIDTH * DV_NTSC_HEIGHT * 3/2);
1303
unsigned char* img_cb = real_readbuf
1304
+ (isPAL ? DV_WIDTH * DV_PAL_HEIGHT
1305
: DV_WIDTH * DV_NTSC_HEIGHT);
1307
_dv_video_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
1308
_dv_video_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
1309
_dv_video_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
1310
_dv_video_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH+8);
1311
_dv_video_copy_pal_c_block_mmx_x86_64(bl[4].coeffs,
1312
img_cr + y * DV_WIDTH / 2 + x / 2);
1313
_dv_video_copy_pal_c_block_mmx_x86_64(bl[5].coeffs,
1314
img_cb + y * DV_WIDTH / 2 + x / 2);
1315
} else if (x == DV_WIDTH- 16) { /* rightmost NTSC block */
1316
unsigned char* start_y = real_readbuf + y * DV_WIDTH + x;
1318
unsigned char* img_cr = real_readbuf
1319
+ (isPAL ? DV_WIDTH * DV_PAL_HEIGHT * 3/2
1320
: DV_WIDTH * DV_NTSC_HEIGHT * 3/2);
1321
unsigned char* img_cb = real_readbuf
1322
+ (isPAL ? DV_WIDTH * DV_PAL_HEIGHT
1323
: DV_WIDTH * DV_NTSC_HEIGHT);
1327
_dv_video_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
1328
_dv_video_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
1329
_dv_video_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
1330
_dv_video_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH+8);
1333
for (j = 0; j < 8; j++) {
1334
for (i = 0; i < 4; i++) {
1335
bl[4].coeffs[8*j + i] =
1336
video_get_cr_ntsc(y/2+j, x/2+i);
1337
bl[5].coeffs[8*j + i] =
1338
video_get_cb_ntsc(y/2+j, x/2+i);
1339
bl[4].coeffs[8*j + (i+4)] =
1340
video_get_cr_ntsc(y/2+j+8, x/2+i);
1341
bl[5].coeffs[8*j + (i+4)] =
1342
video_get_cb_ntsc(y/2+j+8, x/2+i);
1346
unsigned char* start_y = real_readbuf + y * DV_WIDTH + x;
1347
unsigned char* img_cr = real_readbuf
1348
+ DV_WIDTH * DV_NTSC_HEIGHT * 3 / 2;
1349
unsigned char* img_cb = real_readbuf
1350
+ DV_WIDTH * DV_NTSC_HEIGHT;
1351
_dv_video_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
1352
_dv_video_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
1353
_dv_video_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 16);
1354
_dv_video_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 24);
1355
_dv_video_copy_ntsc_c_block_mmx_x86_64(bl[4].coeffs,
1356
img_cr + y * DV_WIDTH / 2 + x / 2);
1357
_dv_video_copy_ntsc_c_block_mmx_x86_64(bl[5].coeffs,
1358
img_cb + y * DV_WIDTH / 2 + x / 2);
1361
finish_mb_mmx_x86_64(mb);
1635
int need_dct_248_rows[6];
1637
if (dv_enc->isPAL) { /* PAL */
1638
short* start_y = dv_enc->img_y + y * DV_WIDTH + x;
1639
_dv_ppm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
1640
_dv_ppm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
1641
_dv_ppm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
1642
_dv_ppm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH + 8);
1643
_dv_ppm_copy_pal_c_block_mmx_x86_64(bl[4].coeffs,
1644
dv_enc->img_cr+y * DV_WIDTH/2+ x/2);
1645
_dv_ppm_copy_pal_c_block_mmx_x86_64(bl[5].coeffs,
1646
dv_enc->img_cb+y * DV_WIDTH/2+ x/2);
1647
} else if (x == DV_WIDTH- 16) { /* rightmost NTSC block */
1648
short* start_y = dv_enc->img_y + y * DV_WIDTH + x;
1651
_dv_ppm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
1652
_dv_ppm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
1653
_dv_ppm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 8 * DV_WIDTH);
1654
_dv_ppm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 8 * DV_WIDTH + 8);
1656
for (j = 0; j < 8; j++) {
1657
for (i = 0; i < 4; i++) {
1658
bl[4].coeffs[8 * j + i] =
1659
(dv_enc->img_cr[(y + j) * DV_WIDTH/2
1661
+ dv_enc->img_cr[(y + j) * DV_WIDTH/2
1662
+ x / 2 + 1 + i*2]) >> 1;
1663
bl[5].coeffs[8 * j + i] =
1664
(dv_enc->img_cb[(y + j) * DV_WIDTH/2
1666
+ dv_enc->img_cb[(y + j) * DV_WIDTH/2
1667
+ x / 2 + 1 + i*2]) >> 1;
1668
bl[4].coeffs[8 * j + i + 4] =
1669
(dv_enc->img_cr[(y + j + 8) * DV_WIDTH/2
1671
+ dv_enc->img_cr[(y + j + 8) * DV_WIDTH/2
1672
+ x / 2 + 1 + i*2]) >> 1;
1673
bl[5].coeffs[8 * j + i + 4] =
1674
(dv_enc->img_cb[(y + j + 8) * DV_WIDTH/2
1676
+ dv_enc->img_cb[(y + j + 8) * DV_WIDTH/2
1677
+ x / 2 + 1 + i*2]) >> 1;
1681
short* start_y = dv_enc->img_y + y * DV_WIDTH + x;
1682
_dv_ppm_copy_y_block_mmx_x86_64(bl[0].coeffs, start_y);
1683
_dv_ppm_copy_y_block_mmx_x86_64(bl[1].coeffs, start_y + 8);
1684
_dv_ppm_copy_y_block_mmx_x86_64(bl[2].coeffs, start_y + 16);
1685
_dv_ppm_copy_y_block_mmx_x86_64(bl[3].coeffs, start_y + 24);
1686
_dv_ppm_copy_ntsc_c_block_mmx_x86_64(bl[4].coeffs,
1687
dv_enc->img_cr + y*DV_WIDTH/2 + x/2);
1688
_dv_ppm_copy_ntsc_c_block_mmx_x86_64(bl[5].coeffs,
1689
dv_enc->img_cb + y*DV_WIDTH/2 + x/2);
1693
/* from finish_mb_mmx() */
1694
if (dv_enc->force_dct != -1) {
1695
for (b = 0; b < 6; b++) {
1696
bl[b].dct_mode = dv_enc->force_dct;
1699
for (b = 0; b < 6; b++) {
1700
need_dct_248_rows[b]
1701
= _dv_need_dct_248_mmx_x86_64_rows(bl[b].coeffs) + 1;
1704
_dv_transpose_mmx_x86_64(bl[0].coeffs);
1705
_dv_transpose_mmx_x86_64(bl[1].coeffs);
1706
_dv_transpose_mmx_x86_64(bl[2].coeffs);
1707
_dv_transpose_mmx_x86_64(bl[3].coeffs);
1708
_dv_transpose_mmx_x86_64(bl[4].coeffs);
1709
_dv_transpose_mmx_x86_64(bl[5].coeffs);
1711
if (dv_enc->force_dct == -1) {
1712
for (b = 0; b < 6; b++) {
1714
((need_dct_248_rows[b] * 65536 /
1715
(_dv_need_dct_248_mmx_x86_64_rows(bl[b].coeffs) + 1))
1716
> DCT_248_THRESHOLD) ? DV_DCT_248 : DV_DCT_88;