1367
1395
//DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1368
DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1369
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1370
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1371
DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1372
DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1373
DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1374
DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1375
DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1396
DERING_CORE((%%REGa),(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1397
DERING_CORE((%%REGa, %1),(%%REGa, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1398
DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1399
DERING_CORE((%0, %1, 4),(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1400
DERING_CORE((%%REGd),(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1401
DERING_CORE((%%REGd, %1), (%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1402
DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1403
DERING_CORE((%0, %1, 8),(%%REGd, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1378
: : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
1379
: "%eax", "%edx", "%ecx"
1406
: : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
1407
: "%"REG_a, "%"REG_d, "%"REG_c
1818
1851
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1819
1852
src+= 4*stride;
1821
"leal (%0, %1), %%eax \n\t"
1822
"leal (%%eax, %1, 4), %%edx \n\t"
1854
"lea (%0, %1), %%"REG_a" \n\t"
1855
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1823
1856
// 0 1 2 3 4 5 6 7 8 9
1824
1857
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1826
1859
"movq (%2), %%mm0 \n\t" // L0
1827
"movq (%%eax), %%mm1 \n\t" // L2
1860
"movq (%%"REG_a"), %%mm1 \n\t" // L2
1828
1861
PAVGB(%%mm1, %%mm0) // L0+L2
1829
1862
"movq (%0), %%mm2 \n\t" // L1
1830
1863
PAVGB(%%mm2, %%mm0)
1831
1864
"movq %%mm0, (%0) \n\t"
1832
"movq (%%eax, %1), %%mm0 \n\t" // L3
1865
"movq (%%"REG_a", %1), %%mm0 \n\t" // L3
1833
1866
PAVGB(%%mm0, %%mm2) // L1+L3
1834
1867
PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1835
"movq %%mm2, (%%eax) \n\t"
1836
"movq (%%eax, %1, 2), %%mm2 \n\t" // L4
1868
"movq %%mm2, (%%"REG_a") \n\t"
1869
"movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4
1837
1870
PAVGB(%%mm2, %%mm1) // L2+L4
1838
1871
PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1839
"movq %%mm1, (%%eax, %1) \n\t"
1872
"movq %%mm1, (%%"REG_a", %1) \n\t"
1840
1873
"movq (%0, %1, 4), %%mm1 \n\t" // L5
1841
1874
PAVGB(%%mm1, %%mm0) // L3+L5
1842
1875
PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1843
"movq %%mm0, (%%eax, %1, 2) \n\t"
1844
"movq (%%edx), %%mm0 \n\t" // L6
1876
"movq %%mm0, (%%"REG_a", %1, 2) \n\t"
1877
"movq (%%"REG_d"), %%mm0 \n\t" // L6
1845
1878
PAVGB(%%mm0, %%mm2) // L4+L6
1846
1879
PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1847
1880
"movq %%mm2, (%0, %1, 4) \n\t"
1848
"movq (%%edx, %1), %%mm2 \n\t" // L7
1881
"movq (%%"REG_d", %1), %%mm2 \n\t" // L7
1849
1882
PAVGB(%%mm2, %%mm1) // L5+L7
1850
1883
PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1851
"movq %%mm1, (%%edx) \n\t"
1852
"movq (%%edx, %1, 2), %%mm1 \n\t" // L8
1884
"movq %%mm1, (%%"REG_d") \n\t"
1885
"movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8
1853
1886
PAVGB(%%mm1, %%mm0) // L6+L8
1854
1887
PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1855
"movq %%mm0, (%%edx, %1) \n\t"
1888
"movq %%mm0, (%%"REG_d", %1) \n\t"
1856
1889
"movq (%0, %1, 8), %%mm0 \n\t" // L9
1857
1890
PAVGB(%%mm0, %%mm2) // L7+L9
1858
1891
PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1859
"movq %%mm2, (%%edx, %1, 2) \n\t"
1892
"movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1860
1893
"movq %%mm1, (%2) \n\t"
1862
: : "r" (src), "r" (stride), "r" (tmp)
1895
: : "r" (src), "r" ((long)stride), "r" (tmp)
1896
: "%"REG_a, "%"REG_d
1866
1899
int a, b, c, x;
1920
1953
src+= 4*stride;
1921
1954
#ifdef HAVE_MMX2
1923
"leal (%0, %1), %%eax \n\t"
1924
"leal (%%eax, %1, 4), %%edx \n\t"
1956
"lea (%0, %1), %%"REG_a" \n\t"
1957
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1925
1958
// 0 1 2 3 4 5 6 7 8 9
1926
1959
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1928
1961
"movq (%0), %%mm0 \n\t" //
1929
"movq (%%eax, %1), %%mm2 \n\t" //
1930
"movq (%%eax), %%mm1 \n\t" //
1962
"movq (%%"REG_a", %1), %%mm2 \n\t" //
1963
"movq (%%"REG_a"), %%mm1 \n\t" //
1931
1964
"movq %%mm0, %%mm3 \n\t"
1932
1965
"pmaxub %%mm1, %%mm0 \n\t" //
1933
1966
"pminub %%mm3, %%mm1 \n\t" //
1934
1967
"pmaxub %%mm2, %%mm1 \n\t" //
1935
1968
"pminub %%mm1, %%mm0 \n\t"
1936
"movq %%mm0, (%%eax) \n\t"
1969
"movq %%mm0, (%%"REG_a") \n\t"
1938
1971
"movq (%0, %1, 4), %%mm0 \n\t" //
1939
"movq (%%eax, %1, 2), %%mm1 \n\t" //
1972
"movq (%%"REG_a", %1, 2), %%mm1 \n\t" //
1940
1973
"movq %%mm2, %%mm3 \n\t"
1941
1974
"pmaxub %%mm1, %%mm2 \n\t" //
1942
1975
"pminub %%mm3, %%mm1 \n\t" //
1943
1976
"pmaxub %%mm0, %%mm1 \n\t" //
1944
1977
"pminub %%mm1, %%mm2 \n\t"
1945
"movq %%mm2, (%%eax, %1, 2) \n\t"
1978
"movq %%mm2, (%%"REG_a", %1, 2) \n\t"
1947
"movq (%%edx), %%mm2 \n\t" //
1948
"movq (%%edx, %1), %%mm1 \n\t" //
1980
"movq (%%"REG_d"), %%mm2 \n\t" //
1981
"movq (%%"REG_d", %1), %%mm1 \n\t" //
1949
1982
"movq %%mm2, %%mm3 \n\t"
1950
1983
"pmaxub %%mm0, %%mm2 \n\t" //
1951
1984
"pminub %%mm3, %%mm0 \n\t" //
1952
1985
"pmaxub %%mm1, %%mm0 \n\t" //
1953
1986
"pminub %%mm0, %%mm2 \n\t"
1954
"movq %%mm2, (%%edx) \n\t"
1987
"movq %%mm2, (%%"REG_d") \n\t"
1956
"movq (%%edx, %1, 2), %%mm2 \n\t" //
1989
"movq (%%"REG_d", %1, 2), %%mm2 \n\t" //
1957
1990
"movq (%0, %1, 8), %%mm0 \n\t" //
1958
1991
"movq %%mm2, %%mm3 \n\t"
1959
1992
"pmaxub %%mm0, %%mm2 \n\t" //
1960
1993
"pminub %%mm3, %%mm0 \n\t" //
1961
1994
"pmaxub %%mm1, %%mm0 \n\t" //
1962
1995
"pminub %%mm0, %%mm2 \n\t"
1963
"movq %%mm2, (%%edx, %1, 2) \n\t"
1966
: : "r" (src), "r" (stride)
1996
"movq %%mm2, (%%"REG_d", %1, 2) \n\t"
1999
: : "r" (src), "r" ((long)stride)
2000
: "%"REG_a, "%"REG_d
1970
2003
#else // MMX without MMX2
1972
"leal (%0, %1), %%eax \n\t"
1973
"leal (%%eax, %1, 4), %%edx \n\t"
2005
"lea (%0, %1), %%"REG_a" \n\t"
2006
"lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
1974
2007
// 0 1 2 3 4 5 6 7 8 9
1975
2008
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1976
2009
"pxor %%mm7, %%mm7 \n\t"
1978
#define MEDIAN(a,b,c)\
2011
#define REAL_MEDIAN(a,b,c)\
1979
2012
"movq " #a ", %%mm0 \n\t"\
1980
2013
"movq " #b ", %%mm2 \n\t"\
1981
2014
"movq " #c ", %%mm1 \n\t"\
2305
2336
"paddd %%mm0, %%mm4 \n\t"
2306
2337
"movd %%mm4, %%ecx \n\t"
2307
2338
"shll $2, %%ecx \n\t"
2308
"movl %3, %%edx \n\t"
2309
"addl -4(%%edx), %%ecx \n\t"
2310
"addl 4(%%edx), %%ecx \n\t"
2311
"addl -1024(%%edx), %%ecx \n\t"
2339
"mov %3, %%"REG_d" \n\t"
2340
"addl -4(%%"REG_d"), %%ecx \n\t"
2341
"addl 4(%%"REG_d"), %%ecx \n\t"
2342
"addl -1024(%%"REG_d"), %%ecx \n\t"
2312
2343
"addl $4, %%ecx \n\t"
2313
"addl 1024(%%edx), %%ecx \n\t"
2344
"addl 1024(%%"REG_d"), %%ecx \n\t"
2314
2345
"shrl $3, %%ecx \n\t"
2315
"movl %%ecx, (%%edx) \n\t"
2346
"movl %%ecx, (%%"REG_d") \n\t"
2317
// "movl %3, %%ecx \n\t"
2318
// "movl %%ecx, test \n\t"
2348
// "mov %3, %%"REG_c" \n\t"
2349
// "mov %%"REG_c", test \n\t"
2319
2350
// "jmp 4f \n\t"
2320
"cmpl 512(%%edx), %%ecx \n\t"
2351
"cmpl 512(%%"REG_d"), %%ecx \n\t"
2322
"cmpl 516(%%edx), %%ecx \n\t"
2353
"cmpl 516(%%"REG_d"), %%ecx \n\t"
2325
"leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2326
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2356
"lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2357
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2327
2358
"movq (%0), %%mm0 \n\t" // L0
2328
2359
"movq (%0, %2), %%mm1 \n\t" // L1
2329
2360
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2330
"movq (%0, %%eax), %%mm3 \n\t" // L3
2361
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2331
2362
"movq (%0, %2, 4), %%mm4 \n\t" // L4
2332
"movq (%0, %%edx), %%mm5 \n\t" // L5
2333
"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2334
"movq (%0, %%ecx), %%mm7 \n\t" // L7
2363
"movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2364
"movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2365
"movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2335
2366
"movq %%mm0, (%1) \n\t" // L0
2336
2367
"movq %%mm1, (%1, %2) \n\t" // L1
2337
2368
"movq %%mm2, (%1, %2, 2) \n\t" // L2
2338
"movq %%mm3, (%1, %%eax) \n\t" // L3
2369
"movq %%mm3, (%1, %%"REG_a") \n\t" // L3
2339
2370
"movq %%mm4, (%1, %2, 4) \n\t" // L4
2340
"movq %%mm5, (%1, %%edx) \n\t" // L5
2341
"movq %%mm6, (%1, %%eax, 2) \n\t" // L6
2342
"movq %%mm7, (%1, %%ecx) \n\t" // L7
2371
"movq %%mm5, (%1, %%"REG_d") \n\t" // L5
2372
"movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6
2373
"movq %%mm7, (%1, %%"REG_c") \n\t" // L7
2346
"leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2347
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2377
"lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2378
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2348
2379
"movq (%0), %%mm0 \n\t" // L0
2349
2380
PAVGB((%1), %%mm0) // L0
2350
2381
"movq (%0, %2), %%mm1 \n\t" // L1
2351
2382
PAVGB((%1, %2), %%mm1) // L1
2352
2383
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2353
2384
PAVGB((%1, %2, 2), %%mm2) // L2
2354
"movq (%0, %%eax), %%mm3 \n\t" // L3
2355
PAVGB((%1, %%eax), %%mm3) // L3
2385
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2386
PAVGB((%1, %%REGa), %%mm3) // L3
2356
2387
"movq (%0, %2, 4), %%mm4 \n\t" // L4
2357
2388
PAVGB((%1, %2, 4), %%mm4) // L4
2358
"movq (%0, %%edx), %%mm5 \n\t" // L5
2359
PAVGB((%1, %%edx), %%mm5) // L5
2360
"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
2361
PAVGB((%1, %%eax, 2), %%mm6) // L6
2362
"movq (%0, %%ecx), %%mm7 \n\t" // L7
2363
PAVGB((%1, %%ecx), %%mm7) // L7
2389
"movq (%0, %%"REG_d"), %%mm5 \n\t" // L5
2390
PAVGB((%1, %%REGd), %%mm5) // L5
2391
"movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6
2392
PAVGB((%1, %%REGa, 2), %%mm6) // L6
2393
"movq (%0, %%"REG_c"), %%mm7 \n\t" // L7
2394
PAVGB((%1, %%REGc), %%mm7) // L7
2364
2395
"movq %%mm0, (%1) \n\t" // R0
2365
2396
"movq %%mm1, (%1, %2) \n\t" // R1
2366
2397
"movq %%mm2, (%1, %2, 2) \n\t" // R2
2367
"movq %%mm3, (%1, %%eax) \n\t" // R3
2398
"movq %%mm3, (%1, %%"REG_a") \n\t" // R3
2368
2399
"movq %%mm4, (%1, %2, 4) \n\t" // R4
2369
"movq %%mm5, (%1, %%edx) \n\t" // R5
2370
"movq %%mm6, (%1, %%eax, 2) \n\t" // R6
2371
"movq %%mm7, (%1, %%ecx) \n\t" // R7
2400
"movq %%mm5, (%1, %%"REG_d") \n\t" // R5
2401
"movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6
2402
"movq %%mm7, (%1, %%"REG_c") \n\t" // R7
2372
2403
"movq %%mm0, (%0) \n\t" // L0
2373
2404
"movq %%mm1, (%0, %2) \n\t" // L1
2374
2405
"movq %%mm2, (%0, %2, 2) \n\t" // L2
2375
"movq %%mm3, (%0, %%eax) \n\t" // L3
2406
"movq %%mm3, (%0, %%"REG_a") \n\t" // L3
2376
2407
"movq %%mm4, (%0, %2, 4) \n\t" // L4
2377
"movq %%mm5, (%0, %%edx) \n\t" // L5
2378
"movq %%mm6, (%0, %%eax, 2) \n\t" // L6
2379
"movq %%mm7, (%0, %%ecx) \n\t" // L7
2408
"movq %%mm5, (%0, %%"REG_d") \n\t" // L5
2409
"movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6
2410
"movq %%mm7, (%0, %%"REG_c") \n\t" // L7
2383
"cmpl 508(%%edx), %%ecx \n\t"
2414
"cmpl 508(%%"REG_d"), %%ecx \n\t"
2386
"leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
2387
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
2417
"lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride
2418
"lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride
2388
2419
"movq (%0), %%mm0 \n\t" // L0
2389
2420
"movq (%0, %2), %%mm1 \n\t" // L1
2390
2421
"movq (%0, %2, 2), %%mm2 \n\t" // L2
2391
"movq (%0, %%eax), %%mm3 \n\t" // L3
2422
"movq (%0, %%"REG_a"), %%mm3 \n\t" // L3
2392
2423
"movq (%1), %%mm4 \n\t" // R0
2393
2424
"movq (%1, %2), %%mm5 \n\t" // R1
2394
2425
"movq (%1, %2, 2), %%mm6 \n\t" // R2
2395
"movq (%1, %%eax), %%mm7 \n\t" // R3
2426
"movq (%1, %%"REG_a"), %%mm7 \n\t" // R3
2396
2427
PAVGB(%%mm4, %%mm0)
2397
2428
PAVGB(%%mm5, %%mm1)
2398
2429
PAVGB(%%mm6, %%mm2)
2642
#endif //HAVE_ALTIVEC
2646
* accurate deblock filter
2648
static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2649
int64_t dc_mask, eq_mask, both_masks;
2650
int64_t sums[10*8*2];
2651
src+= step*3; // src points to begin of the 8x8 Block
2654
"movq %0, %%mm7 \n\t"
2655
"movq %1, %%mm6 \n\t"
2656
: : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2660
"lea (%2, %3), %%"REG_a" \n\t"
2661
// 0 1 2 3 4 5 6 7 8 9
2662
// %2 REGa REGa+%3 REGa+2%3 %2+4%3 REGa' REGa'+%3 REGa'+2%3 %2+8%3 REGa'+4%3 (REGa' = REGa after it is advanced by 4*%3)
2664
"movq (%2), %%mm0 \n\t"
2665
"movq (%%"REG_a"), %%mm1 \n\t"
2666
"movq %%mm1, %%mm3 \n\t"
2667
"movq %%mm1, %%mm4 \n\t"
2668
"psubb %%mm1, %%mm0 \n\t" // mm0 = differnece
2669
"paddb %%mm7, %%mm0 \n\t"
2670
"pcmpgtb %%mm6, %%mm0 \n\t"
2672
"movq (%%"REG_a",%3), %%mm2 \n\t"
2673
PMAXUB(%%mm2, %%mm4)
2674
PMINUB(%%mm2, %%mm3, %%mm5)
2675
"psubb %%mm2, %%mm1 \n\t"
2676
"paddb %%mm7, %%mm1 \n\t"
2677
"pcmpgtb %%mm6, %%mm1 \n\t"
2678
"paddb %%mm1, %%mm0 \n\t"
2680
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2681
PMAXUB(%%mm1, %%mm4)
2682
PMINUB(%%mm1, %%mm3, %%mm5)
2683
"psubb %%mm1, %%mm2 \n\t"
2684
"paddb %%mm7, %%mm2 \n\t"
2685
"pcmpgtb %%mm6, %%mm2 \n\t"
2686
"paddb %%mm2, %%mm0 \n\t"
2688
"lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
2690
"movq (%2, %3, 4), %%mm2 \n\t"
2691
PMAXUB(%%mm2, %%mm4)
2692
PMINUB(%%mm2, %%mm3, %%mm5)
2693
"psubb %%mm2, %%mm1 \n\t"
2694
"paddb %%mm7, %%mm1 \n\t"
2695
"pcmpgtb %%mm6, %%mm1 \n\t"
2696
"paddb %%mm1, %%mm0 \n\t"
2698
"movq (%%"REG_a"), %%mm1 \n\t"
2699
PMAXUB(%%mm1, %%mm4)
2700
PMINUB(%%mm1, %%mm3, %%mm5)
2701
"psubb %%mm1, %%mm2 \n\t"
2702
"paddb %%mm7, %%mm2 \n\t"
2703
"pcmpgtb %%mm6, %%mm2 \n\t"
2704
"paddb %%mm2, %%mm0 \n\t"
2706
"movq (%%"REG_a", %3), %%mm2 \n\t"
2707
PMAXUB(%%mm2, %%mm4)
2708
PMINUB(%%mm2, %%mm3, %%mm5)
2709
"psubb %%mm2, %%mm1 \n\t"
2710
"paddb %%mm7, %%mm1 \n\t"
2711
"pcmpgtb %%mm6, %%mm1 \n\t"
2712
"paddb %%mm1, %%mm0 \n\t"
2714
"movq (%%"REG_a", %3, 2), %%mm1 \n\t"
2715
PMAXUB(%%mm1, %%mm4)
2716
PMINUB(%%mm1, %%mm3, %%mm5)
2717
"psubb %%mm1, %%mm2 \n\t"
2718
"paddb %%mm7, %%mm2 \n\t"
2719
"pcmpgtb %%mm6, %%mm2 \n\t"
2720
"paddb %%mm2, %%mm0 \n\t"
2722
"movq (%2, %3, 8), %%mm2 \n\t"
2723
PMAXUB(%%mm2, %%mm4)
2724
PMINUB(%%mm2, %%mm3, %%mm5)
2725
"psubb %%mm2, %%mm1 \n\t"
2726
"paddb %%mm7, %%mm1 \n\t"
2727
"pcmpgtb %%mm6, %%mm1 \n\t"
2728
"paddb %%mm1, %%mm0 \n\t"
2730
"movq (%%"REG_a", %3, 4), %%mm1 \n\t"
2731
"psubb %%mm1, %%mm2 \n\t"
2732
"paddb %%mm7, %%mm2 \n\t"
2733
"pcmpgtb %%mm6, %%mm2 \n\t"
2734
"paddb %%mm2, %%mm0 \n\t"
2735
"psubusb %%mm3, %%mm4 \n\t"
2737
"pxor %%mm6, %%mm6 \n\t"
2738
"movq %4, %%mm7 \n\t" // QP,..., QP
2739
"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2740
"psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2741
"pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2742
"pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2743
"movq %%mm7, %1 \n\t"
2745
"movq %5, %%mm7 \n\t"
2746
"punpcklbw %%mm7, %%mm7 \n\t"
2747
"punpcklbw %%mm7, %%mm7 \n\t"
2748
"punpcklbw %%mm7, %%mm7 \n\t"
2749
"psubb %%mm0, %%mm6 \n\t"
2750
"pcmpgtb %%mm7, %%mm6 \n\t"
2751
"movq %%mm6, %0 \n\t"
2753
: "=m" (eq_mask), "=m" (dc_mask)
2754
: "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2758
both_masks = dc_mask & eq_mask;
2761
long offset= -8*step;
2762
int64_t *temp_sums= sums;
2765
"movq %2, %%mm0 \n\t" // QP,..., QP
2766
"pxor %%mm4, %%mm4 \n\t"
2768
"movq (%0), %%mm6 \n\t"
2769
"movq (%0, %1), %%mm5 \n\t"
2770
"movq %%mm5, %%mm1 \n\t"
2771
"movq %%mm6, %%mm2 \n\t"
2772
"psubusb %%mm6, %%mm5 \n\t"
2773
"psubusb %%mm1, %%mm2 \n\t"
2774
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2775
"psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2776
"pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2778
"pxor %%mm6, %%mm1 \n\t"
2779
"pand %%mm0, %%mm1 \n\t"
2780
"pxor %%mm1, %%mm6 \n\t"
2783
"movq (%0, %1, 8), %%mm5 \n\t"
2784
"add %1, %0 \n\t" // %0 points to line 1 not 0
2785
"movq (%0, %1, 8), %%mm7 \n\t"
2786
"movq %%mm5, %%mm1 \n\t"
2787
"movq %%mm7, %%mm2 \n\t"
2788
"psubusb %%mm7, %%mm5 \n\t"
2789
"psubusb %%mm1, %%mm2 \n\t"
2790
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2791
"movq %2, %%mm0 \n\t" // QP,..., QP
2792
"psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2793
"pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2795
"pxor %%mm7, %%mm1 \n\t"
2796
"pand %%mm0, %%mm1 \n\t"
2797
"pxor %%mm1, %%mm7 \n\t"
2799
"movq %%mm6, %%mm5 \n\t"
2800
"punpckhbw %%mm4, %%mm6 \n\t"
2801
"punpcklbw %%mm4, %%mm5 \n\t"
2802
// 4:0 5/6:First 7:Last
2804
"movq %%mm5, %%mm0 \n\t"
2805
"movq %%mm6, %%mm1 \n\t"
2806
"psllw $2, %%mm0 \n\t"
2807
"psllw $2, %%mm1 \n\t"
2808
"paddw "MANGLE(w04)", %%mm0 \n\t"
2809
"paddw "MANGLE(w04)", %%mm1 \n\t"
2812
"movq (%0), %%mm2 \n\t"\
2813
"movq (%0), %%mm3 \n\t"\
2815
"punpcklbw %%mm4, %%mm2 \n\t"\
2816
"punpckhbw %%mm4, %%mm3 \n\t"\
2817
"paddw %%mm2, %%mm0 \n\t"\
2818
"paddw %%mm3, %%mm1 \n\t"
2821
"movq (%0), %%mm2 \n\t"\
2822
"movq (%0), %%mm3 \n\t"\
2824
"punpcklbw %%mm4, %%mm2 \n\t"\
2825
"punpckhbw %%mm4, %%mm3 \n\t"\
2826
"psubw %%mm2, %%mm0 \n\t"\
2827
"psubw %%mm3, %%mm1 \n\t"
2833
"movq %%mm0, (%3) \n\t"
2834
"movq %%mm1, 8(%3) \n\t"
2837
"psubw %%mm5, %%mm0 \n\t"
2838
"psubw %%mm6, %%mm1 \n\t"
2839
"movq %%mm0, 16(%3) \n\t"
2840
"movq %%mm1, 24(%3) \n\t"
2843
"psubw %%mm5, %%mm0 \n\t"
2844
"psubw %%mm6, %%mm1 \n\t"
2845
"movq %%mm0, 32(%3) \n\t"
2846
"movq %%mm1, 40(%3) \n\t"
2849
"psubw %%mm5, %%mm0 \n\t"
2850
"psubw %%mm6, %%mm1 \n\t"
2851
"movq %%mm0, 48(%3) \n\t"
2852
"movq %%mm1, 56(%3) \n\t"
2855
"psubw %%mm5, %%mm0 \n\t"
2856
"psubw %%mm6, %%mm1 \n\t"
2857
"movq %%mm0, 64(%3) \n\t"
2858
"movq %%mm1, 72(%3) \n\t"
2860
"movq %%mm7, %%mm6 \n\t"
2861
"punpckhbw %%mm4, %%mm7 \n\t"
2862
"punpcklbw %%mm4, %%mm6 \n\t"
2868
"movq %%mm0, 80(%3) \n\t"
2869
"movq %%mm1, 88(%3) \n\t"
2872
"paddw %%mm6, %%mm0 \n\t"
2873
"paddw %%mm7, %%mm1 \n\t"
2874
"movq %%mm0, 96(%3) \n\t"
2875
"movq %%mm1, 104(%3) \n\t"
2878
"paddw %%mm6, %%mm0 \n\t"
2879
"paddw %%mm7, %%mm1 \n\t"
2880
"movq %%mm0, 112(%3) \n\t"
2881
"movq %%mm1, 120(%3) \n\t"
2884
"paddw %%mm6, %%mm0 \n\t"
2885
"paddw %%mm7, %%mm1 \n\t"
2886
"movq %%mm0, 128(%3) \n\t"
2887
"movq %%mm1, 136(%3) \n\t"
2890
"paddw %%mm6, %%mm0 \n\t"
2891
"paddw %%mm7, %%mm1 \n\t"
2892
"movq %%mm0, 144(%3) \n\t"
2893
"movq %%mm1, 152(%3) \n\t"
2895
"mov %4, %0 \n\t" //FIXME
2898
: "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
2901
src+= step; // src points to begin of the 8x8 Block
2904
"movq %4, %%mm6 \n\t"
2905
"pcmpeqb %%mm5, %%mm5 \n\t"
2906
"pxor %%mm6, %%mm5 \n\t"
2907
"pxor %%mm7, %%mm7 \n\t"
2910
"movq (%1), %%mm0 \n\t"
2911
"movq 8(%1), %%mm1 \n\t"
2912
"paddw 32(%1), %%mm0 \n\t"
2913
"paddw 40(%1), %%mm1 \n\t"
2914
"movq (%0, %3), %%mm2 \n\t"
2915
"movq %%mm2, %%mm3 \n\t"
2916
"movq %%mm2, %%mm4 \n\t"
2917
"punpcklbw %%mm7, %%mm2 \n\t"
2918
"punpckhbw %%mm7, %%mm3 \n\t"
2919
"paddw %%mm2, %%mm0 \n\t"
2920
"paddw %%mm3, %%mm1 \n\t"
2921
"paddw %%mm2, %%mm0 \n\t"
2922
"paddw %%mm3, %%mm1 \n\t"
2923
"psrlw $4, %%mm0 \n\t"
2924
"psrlw $4, %%mm1 \n\t"
2925
"packuswb %%mm1, %%mm0 \n\t"
2926
"pand %%mm6, %%mm0 \n\t"
2927
"pand %%mm5, %%mm4 \n\t"
2928
"por %%mm4, %%mm0 \n\t"
2929
"movq %%mm0, (%0, %3) \n\t"
2934
: "+r"(offset), "+r"(temp_sums)
2935
: "r" ((long)step), "r"(src - offset), "m"(both_masks)
2938
src+= step; // src points to begin of the 8x8 Block
2940
if(eq_mask != -1LL){
2941
uint8_t *temp_src= src;
2943
"pxor %%mm7, %%mm7 \n\t"
2944
"lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars
2945
"and "ALIGN_MASK", %%"REG_c" \n\t" // align
2946
// 0 1 2 3 4 5 6 7 8 9
2947
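// lines used here: 0=(%0) 1=(%0,%1) 2=(REGa) 3=(REGa,%1) 4=(REGa,%1,2); then %0 is re-pointed to line 3: 5=(%0,%1,2) 6=(REGa,%1,4) 7=(%0,%1,4); REGc is the aligned stack scratch area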
2949
"movq (%0), %%mm0 \n\t"
2950
"movq %%mm0, %%mm1 \n\t"
2951
"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2952
"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2954
"movq (%0, %1), %%mm2 \n\t"
2955
"lea (%0, %1, 2), %%"REG_a" \n\t"
2956
"movq %%mm2, %%mm3 \n\t"
2957
"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2958
"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2960
"movq (%%"REG_a"), %%mm4 \n\t"
2961
"movq %%mm4, %%mm5 \n\t"
2962
"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2963
"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2965
"paddw %%mm0, %%mm0 \n\t" // 2L0
2966
"paddw %%mm1, %%mm1 \n\t" // 2H0
2967
"psubw %%mm4, %%mm2 \n\t" // L1 - L2
2968
"psubw %%mm5, %%mm3 \n\t" // H1 - H2
2969
"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2970
"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2972
"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2973
"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2974
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2975
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2977
"movq (%%"REG_a", %1), %%mm2 \n\t"
2978
"movq %%mm2, %%mm3 \n\t"
2979
"punpcklbw %%mm7, %%mm2 \n\t" // L3
2980
"punpckhbw %%mm7, %%mm3 \n\t" // H3
2982
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2983
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2984
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2985
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2986
"movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2987
"movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2989
"movq (%%"REG_a", %1, 2), %%mm0 \n\t"
2990
"movq %%mm0, %%mm1 \n\t"
2991
"punpcklbw %%mm7, %%mm0 \n\t" // L4
2992
"punpckhbw %%mm7, %%mm1 \n\t" // H4
2994
"psubw %%mm0, %%mm2 \n\t" // L3 - L4
2995
"psubw %%mm1, %%mm3 \n\t" // H3 - H4
2996
"movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4
2997
"movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4
2998
"paddw %%mm4, %%mm4 \n\t" // 2L2
2999
"paddw %%mm5, %%mm5 \n\t" // 2H2
3000
"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
3001
"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
3003
"lea (%%"REG_a", %1), %0 \n\t"
3004
"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
3005
"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
3006
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
3007
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
3009
"movq (%0, %1, 2), %%mm2 \n\t"
3010
"movq %%mm2, %%mm3 \n\t"
3011
"punpcklbw %%mm7, %%mm2 \n\t" // L5
3012
"punpckhbw %%mm7, %%mm3 \n\t" // H5
3013
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
3014
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
3015
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
3016
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
3018
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
3019
"punpcklbw %%mm7, %%mm6 \n\t" // L6
3020
"psubw %%mm6, %%mm2 \n\t" // L5 - L6
3021
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
3022
"punpckhbw %%mm7, %%mm6 \n\t" // H6
3023
"psubw %%mm6, %%mm3 \n\t" // H5 - H6
3025
"paddw %%mm0, %%mm0 \n\t" // 2L4
3026
"paddw %%mm1, %%mm1 \n\t" // 2H4
3027
"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
3028
"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
3030
"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
3031
"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
3032
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
3033
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
3035
"movq (%0, %1, 4), %%mm2 \n\t"
3036
"movq %%mm2, %%mm3 \n\t"
3037
"punpcklbw %%mm7, %%mm2 \n\t" // L7
3038
"punpckhbw %%mm7, %%mm3 \n\t" // H7
3040
"paddw %%mm2, %%mm2 \n\t" // 2L7
3041
"paddw %%mm3, %%mm3 \n\t" // 2H7
3042
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3043
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3045
"movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3046
"movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3049
"movq %%mm7, %%mm6 \n\t" // 0
3050
"psubw %%mm0, %%mm6 \n\t"
3051
"pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3052
"movq %%mm7, %%mm6 \n\t" // 0
3053
"psubw %%mm1, %%mm6 \n\t"
3054
"pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3055
"movq %%mm7, %%mm6 \n\t" // 0
3056
"psubw %%mm2, %%mm6 \n\t"
3057
"pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3058
"movq %%mm7, %%mm6 \n\t" // 0
3059
"psubw %%mm3, %%mm6 \n\t"
3060
"pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3062
"movq %%mm7, %%mm6 \n\t" // 0
3063
"pcmpgtw %%mm0, %%mm6 \n\t"
3064
"pxor %%mm6, %%mm0 \n\t"
3065
"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3066
"movq %%mm7, %%mm6 \n\t" // 0
3067
"pcmpgtw %%mm1, %%mm6 \n\t"
3068
"pxor %%mm6, %%mm1 \n\t"
3069
"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3070
"movq %%mm7, %%mm6 \n\t" // 0
3071
"pcmpgtw %%mm2, %%mm6 \n\t"
3072
"pxor %%mm6, %%mm2 \n\t"
3073
"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3074
"movq %%mm7, %%mm6 \n\t" // 0
3075
"pcmpgtw %%mm3, %%mm6 \n\t"
3076
"pxor %%mm6, %%mm3 \n\t"
3077
"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3081
"pminsw %%mm2, %%mm0 \n\t"
3082
"pminsw %%mm3, %%mm1 \n\t"
3084
"movq %%mm0, %%mm6 \n\t"
3085
"psubusw %%mm2, %%mm6 \n\t"
3086
"psubw %%mm6, %%mm0 \n\t"
3087
"movq %%mm1, %%mm6 \n\t"
3088
"psubusw %%mm3, %%mm6 \n\t"
3089
"psubw %%mm6, %%mm1 \n\t"
3092
"movd %2, %%mm2 \n\t" // QP
3093
"punpcklbw %%mm7, %%mm2 \n\t"
3095
"movq %%mm7, %%mm6 \n\t" // 0
3096
"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3097
"pxor %%mm6, %%mm4 \n\t"
3098
"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3099
"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3100
"pxor %%mm7, %%mm5 \n\t"
3101
"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3103
"psllw $3, %%mm2 \n\t" // 8QP
3104
"movq %%mm2, %%mm3 \n\t" // 8QP
3105
"pcmpgtw %%mm4, %%mm2 \n\t"
3106
"pcmpgtw %%mm5, %%mm3 \n\t"
3107
"pand %%mm2, %%mm4 \n\t"
3108
"pand %%mm3, %%mm5 \n\t"
3111
"psubusw %%mm0, %%mm4 \n\t" // hd
3112
"psubusw %%mm1, %%mm5 \n\t" // ld
3115
"movq "MANGLE(w05)", %%mm2 \n\t" // 5
3116
"pmullw %%mm2, %%mm4 \n\t"
3117
"pmullw %%mm2, %%mm5 \n\t"
3118
"movq "MANGLE(w20)", %%mm2 \n\t" // 32
3119
"paddw %%mm2, %%mm4 \n\t"
3120
"paddw %%mm2, %%mm5 \n\t"
3121
"psrlw $6, %%mm4 \n\t"
3122
"psrlw $6, %%mm5 \n\t"
3124
"movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
3125
"movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
3127
"pxor %%mm2, %%mm2 \n\t"
3128
"pxor %%mm3, %%mm3 \n\t"
3130
"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3131
"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3132
"pxor %%mm2, %%mm0 \n\t"
3133
"pxor %%mm3, %%mm1 \n\t"
3134
"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3135
"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3136
"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3137
"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3139
"pxor %%mm6, %%mm2 \n\t"
3140
"pxor %%mm7, %%mm3 \n\t"
3141
"pand %%mm2, %%mm4 \n\t"
3142
"pand %%mm3, %%mm5 \n\t"
3145
"pminsw %%mm0, %%mm4 \n\t"
3146
"pminsw %%mm1, %%mm5 \n\t"
3148
"movq %%mm4, %%mm2 \n\t"
3149
"psubusw %%mm0, %%mm2 \n\t"
3150
"psubw %%mm2, %%mm4 \n\t"
3151
"movq %%mm5, %%mm2 \n\t"
3152
"psubusw %%mm1, %%mm2 \n\t"
3153
"psubw %%mm2, %%mm5 \n\t"
3155
"pxor %%mm6, %%mm4 \n\t"
3156
"pxor %%mm7, %%mm5 \n\t"
3157
"psubw %%mm6, %%mm4 \n\t"
3158
"psubw %%mm7, %%mm5 \n\t"
3159
"packsswb %%mm5, %%mm4 \n\t"
3160
"movq %3, %%mm1 \n\t"
3161
"pandn %%mm4, %%mm1 \n\t"
3162
"movq (%0), %%mm0 \n\t"
3163
"paddb %%mm1, %%mm0 \n\t"
3164
"movq %%mm0, (%0) \n\t"
3165
"movq (%0, %1), %%mm0 \n\t"
3166
"psubb %%mm1, %%mm0 \n\t"
3167
"movq %%mm0, (%0, %1) \n\t"
3170
: "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
3171
: "%"REG_a, "%"REG_c
3175
STOP_TIMER("step16")
2612
3182
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2613
3183
QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);