/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id: 9c9a44bcbd413d042630c3c802e95df3a60d48e0 $
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"

/* Copy 64 bytes from srcp to dstp, loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,  0(%[dst])\n"   \
        store " %%xmm2, 16(%[dst])\n"   \
        store " %%xmm3, 32(%[dst])\n"   \
        store " %%xmm4, 48(%[dst])\n"   \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")

/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do {         \
    if (cpu & CPU_CAPABILITY_SSE2)      \
        asm volatile (op);              \
} while (0)
#else
# define ASM_SSE2(cpu, op)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 (movntdqa streaming loads)
 * is available.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Copy enough leading bytes to 16-byte align src: the SSE4.1 path
         * below loads with movntdqa, which requires an aligned source. */
        const unsigned unaligned = (-(intptr_t)src) & 0x0f;
        unsigned x;

        for (x = 0; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                /* src is now aligned for movntdqa; dst+x may be unaligned */
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                /* Non-temporal stores bypass the cache on the way out */
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,  0(%[dst1])\n" \
    "movq   %%xmm1,  8(%[dst1])\n" \
    "movhpd %%xmm0,  0(%[dst2])\n" \
    "movhpd %%xmm1,  8(%[dst2])\n" \
    "movq   %%xmm2, 16(%[dst1])\n" \
    "movq   %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            /* pshufb gathers the even bytes (U) into the low quadword and
             * the odd bytes (V) into the high quadword of each register */
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            /* Without pshufb: mask out the even bytes (U), shift down the
             * odd bytes (V), then pack words back to bytes. After packing,
             * V ends up in the low quadwords, hence dst1/dst2 are swapped
             * relative to the SSSE3 path. */
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16,
                     src, src_pitch,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, w2_16,
                width, hblock, cpu);

        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}

int CopyInitCache(copy_cache_t *cache, unsigned width)
{
    cache->size = __MAX((width + 0x0f) & ~0x0f, 4096);
    cache->base = malloc(16 + cache->size);
    if (cache->base == NULL) {
        cache->buffer = NULL;
        cache->size   = 0;
        return VLC_EGENERIC;
    }
    /* Align the usable buffer on a 16-byte boundary */
    cache->buffer = &cache->base[16 - ((intptr_t)cache->base & 0x0f)];
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
    free(cache->base);

    cache->base   = NULL;
    cache->buffer = NULL;
    cache->size   = 0;
}

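/* Illustrative pairing (a sketch; `i_width` stands for the caller's luma
 * width):
 *
 *     copy_cache_t cache;
 *     if (CopyInitCache(&cache, i_width) != VLC_SUCCESS)
 *         return VLC_EGENERIC;
 *     ...use CopyFromNv12()/CopyFromYv12()...
 *     CopyCleanCache(&cache);
 */
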
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* Copy the luma plane as-is, then split the interleaved NV12 chroma
     * plane into planar U (p[2]) and V (p[1]), per VLC's YV12 plane order */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
              width, height, cpu);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);

    ASM_SSE2(cpu, "emms");
}

void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* Y plane at full resolution, the two chroma planes at half */
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    }
    ASM_SSE2(cpu, "emms");
}