~ubuntu-branches/ubuntu/saucy/gst-libav1.0/saucy-proposed

Viewing changes to gst-libs/ext/libav/libavcodec/arm/vp8dsp_armv6.S

  • Committer: Package Import Robot
  • Author(s): Sebastian Dröge
  • Date: 2013-07-30 09:00:15 UTC
  • mfrom: (1.1.16) (7.1.7 experimental)
  • Revision ID: package-import@ubuntu.com-20130730090015-sc1ou2yssu7q5w4e
Tags: 1.1.3-1
* New upstream development snapshot:
  + debian/control:
    - Build depend on GStreamer and gst-plugins-base >= 1.1.3.

/*
 * VP8 ARMv6 optimisations
 *
 * Copyright (c) 2010 Google Inc.
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * This code was partially ported from libvpx, which uses this license:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *
 *   * Neither the name of Google nor the names of its contributors may
 *     be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "libavutil/arm/asm.S"

@ idct

@ void vp8_luma_dc_wht(DCTELEM block[4][4][16], DCTELEM dc[16])
function ff_vp8_luma_dc_wht_armv6, export=1
        push            {r4-r10, lr}

        ldm             r1,  {r2-r9}
        mov             r10, #0
        mov             lr,  #0
        uadd16          r12, r2,  r8            @ t0[0,1]
        usub16          r2,  r2,  r8            @ t3[0,1]
        stm             r1!, {r10, lr}
        uadd16          r8,  r4,  r6            @ t1[0,1]
        usub16          r4,  r4,  r6            @ t2[0,1]
        stm             r1!, {r10, lr}
        uadd16          r6,  r12, r8            @ dc0[0,1]
        usub16          r12, r12, r8            @ dc2[0,1]
        stm             r1!, {r10, lr}
        uadd16          r8,  r2,  r4            @ dc1[0,1]
        usub16          r2,  r2,  r4            @ dc3[0,1]
        stm             r1!, {r10, lr}

        uadd16          lr,  r3,  r9            @ t0[2,3]
        usub16          r3,  r3,  r9            @ t3[2,3]
        uadd16          r9,  r5,  r7            @ t1[2,3]
        usub16          r5,  r5,  r7            @ t2[2,3]

        uadd16          r7,  lr,  r9            @ dc0[2,3]
        usub16          lr,  lr,  r9            @ dc2[2,3]
        uadd16          r9,  r3,  r5            @ dc1[2,3]
        usub16          r3,  r3,  r5            @ dc3[2,3]

        mov             r1,  #3
        orr             r1,  r1,  #0x30000      @ 3 | 3 (round)

        pkhbt           r4,  r6,  r8,  lsl #16  @ dc{0,1}[0]
        pkhtb           r6,  r8,  r6,  asr #16  @ dc{0,1}[1]
        pkhbt           r5,  r12, r2,  lsl #16  @ dc{2,3}[0]
        pkhtb           r12, r2,  r12, asr #16  @ dc{2,3}[1]
        pkhbt           r8,  r7,  r9,  lsl #16  @ dc{0,1}[2]
        uadd16          r4,  r4,  r1
        uadd16          r5,  r5,  r1
        pkhtb           r7,  r9,  r7,  asr #16  @ dc{0,1}[3]
        pkhbt           r2,  lr,  r3,  lsl #16  @ dc{2,3}[2]
        pkhtb           lr,  r3,  lr,  asr #16  @ dc{2,3}[3]

        uadd16          r9,  r4,  r7            @ t0[0,1]
        uadd16          r3,  r5,  lr            @ t0[2,3]
        usub16          r4,  r4,  r7            @ t3[0,1]
        usub16          r5,  r5,  lr            @ t3[2,3]
        uadd16          r7,  r6,  r8            @ t1[0,1]
        uadd16          lr,  r12, r2            @ t1[2,3]
        usub16          r6,  r6,  r8            @ t2[0,1]
        usub16          r12, r12, r2            @ t2[2,3]

        uadd16          r8,  r9,  r7            @ block[0,1][0]
        uadd16          r2,  r3,  lr            @ block[2,3][0]
        usub16          r9,  r9,  r7            @ block[0,1][2]
        usub16          r3,  r3,  lr            @ block[2,3][2]
        uadd16          r7,  r4,  r6            @ block[0,1][1]
        uadd16          lr,  r5,  r12           @ block[2,3][1]
        usub16          r4,  r4,  r6            @ block[0,1][3]
        usub16          r5,  r5,  r12           @ block[2,3][3]

#if HAVE_ARMV6T2_EXTERNAL
        sbfx            r6,  r8,  #3,  #13
        sbfx            r12, r7,  #3,  #13
        sbfx            r1,  r9,  #3,  #13
        sbfx            r10, r4,  #3,  #13
#else
        sxth            r6,  r8
        sxth            r12, r7
        sxth            r1,  r9
        sxth            r10, r4
        asr             r6,  #3                 @ block[0][0]
        asr             r12, #3                 @ block[0][1]
        asr             r1,  #3                 @ block[0][2]
        asr             r10, #3                 @ block[0][3]
#endif

        strh            r6,  [r0], #32
        asr             r8,  r8,  #19           @ block[1][0]
        strh            r12, [r0], #32
        asr             r7,  r7,  #19           @ block[1][1]
        strh            r1,  [r0], #32
        asr             r9,  r9,  #19           @ block[1][2]
        strh            r10, [r0], #32
        asr             r4,  r4,  #19           @ block[1][3]
        strh            r8,  [r0], #32
        asr             r6,  r2,  #19           @ block[3][0]
        strh            r7,  [r0], #32
        asr             r12, lr,  #19           @ block[3][1]
        strh            r9,  [r0], #32
        asr             r1,  r3,  #19           @ block[3][2]
        strh            r4,  [r0], #32
        asr             r10, r5,  #19           @ block[3][3]

#if HAVE_ARMV6T2_EXTERNAL
        sbfx            r2,  r2,  #3,  #13
        sbfx            lr,  lr,  #3,  #13
        sbfx            r3,  r3,  #3,  #13
        sbfx            r5,  r5,  #3,  #13
#else
        sxth            r2,  r2
        sxth            lr,  lr
        sxth            r3,  r3
        sxth            r5,  r5
        asr             r2,  #3                 @ block[2][0]
        asr             lr,  #3                 @ block[2][1]
        asr             r3,  #3                 @ block[2][2]
        asr             r5,  #3                 @ block[2][3]
#endif

        strh            r2,  [r0], #32
        strh            lr,  [r0], #32
        strh            r3,  [r0], #32
        strh            r5,  [r0], #32
        strh            r6,  [r0], #32
        strh            r12, [r0], #32
        strh            r1,  [r0], #32
        strh            r10, [r0], #32

        pop             {r4-r10, pc}
endfunc

@ void vp8_luma_dc_wht_dc(DCTELEM block[4][4][16], DCTELEM dc[16])
function ff_vp8_luma_dc_wht_dc_armv6, export=1
        ldrsh           r2,  [r1]
        mov             r3,  #0
        add             r2,  r2,  #3
        strh            r3,  [r1]
        asr             r2,  r2,  #3
    .rept 16
        strh            r2,  [r0], #32
    .endr
        bx              lr
endfunc

@ void vp8_idct_add(uint8_t *dst, DCTELEM block[16], int stride)
function ff_vp8_idct_add_armv6, export=1
        push            {r4-r12, lr}
        sub             sp,  sp,  #32

        movw            r3,  #20091             @ cospi8sqrt2minus1
        movw            r4,  #35468             @ sinpi8sqrt2
        mov             r5,  sp
1:
        ldr             r6,  [r1, #8]       @  i5 | i4  = block1[1] | block1[0]
        ldr             lr,  [r1, #16]      @  i9 | i8  = block2[1] | block2[0]
        ldr             r12, [r1, #24]      @ i13 | i12 = block3[1] | block3[0]

        smulwt          r9,  r3,  r6            @ ip[5] * cospi8sqrt2minus1
        smulwb          r7,  r3,  r6            @ ip[4] * cospi8sqrt2minus1
        smulwt          r10, r4,  r6            @ ip[5] * sinpi8sqrt2
        smulwb          r8,  r4,  r6            @ ip[4] * sinpi8sqrt2
        pkhbt           r7,  r7,  r9,  lsl #16  @ 5c | 4c
        smulwt          r11, r3,  r12           @ ip[13] * cospi8sqrt2minus1
        pkhbt           r8,  r8,  r10, lsl #16  @ 5s   | 4s   = t2 first half
        uadd16          r6,  r6,  r7            @ 5c+5 | 4c+4 = t3 first half
        smulwb          r9,  r3,  r12           @ ip[12] * cospi8sqrt2minus1
        smulwt          r7,  r4,  r12           @ ip[13] * sinpi8sqrt2
        smulwb          r10, r4,  r12           @ ip[12] * sinpi8sqrt2

        pkhbt           r9,  r9,  r11, lsl #16  @ 13c | 12c
        ldr             r11, [r1]               @  i1 | i0
        pkhbt           r10, r10,  r7, lsl #16  @ 13s | 12s    = t3 second half
        uadd16          r7,  r12, r9            @ 13c+13  | 12c+12 = t2 2nd half
        uadd16          r6,  r6,  r10           @ d = t3
        uadd16          r10, r11, lr            @ a = t0
        usub16          r7,  r8,  r7            @ c = t2
        usub16          r8,  r11, lr            @ b = t1
        uadd16          r9,  r10, r6            @ a+d = tmp{0,1}[0]
        usub16          r10, r10, r6            @ a-d = tmp{0,1}[3]
        uadd16          r6,  r8,  r7            @ b+c = tmp{0,1}[1]
        usub16          r7,  r8,  r7            @ b-c = tmp{0,1}[2]
        mov             r8,  #0
        cmp             sp,  r5
        str             r6,  [r5, #8]           @  o5 | o4
        str             r7,  [r5, #16]          @  o9 | o8
        str             r10, [r5, #24]          @ o13 | o12
        str             r9,  [r5], #4           @  o1 | o0
        str             r8,  [r1, #8]
        str             r8,  [r1, #16]
        str             r8,  [r1, #24]
        str             r8,  [r1], #4
        beq             1b

        mov             r5,  #2
2:
        pop             {r1, r6, r12, lr}
        smulwt          r9,  r3,  r12           @ ip[5] * cospi8sqrt2minus1
        smulwt          r7,  r3,  r1            @ ip[1] * cospi8sqrt2minus1
        smulwt          r10, r4,  r12           @ ip[5] * sinpi8sqrt2
        smulwt          r8,  r4,  r1            @ ip[1] * sinpi8sqrt2
        pkhbt           r11, r1,  r12, lsl #16  @ i4 | i0 = t0/t1 first half
        pkhtb           r1,  r12, r1,  asr #16  @ i5 | i1
        pkhbt           r7,  r7,  r9,  lsl #16  @ 5c | 1c
        pkhbt           r8,  r8,  r10, lsl #16  @ 5s | 1s = t2 first half
        pkhbt           r9,  r6,  lr,  lsl #16  @ i6 | i2 = t0/t1 second half
        pkhtb           r12, lr,  r6,  asr #16  @ i7 | i3
        uadd16          r1,  r7,  r1            @ 5c+5 | 1c+1 = t3 first half
        uadd16          r10, r11, r9            @ a = t0
        usub16          r9,  r11, r9            @ b = t1
        smulwt          r7,  r3,  r12           @ ip[7] * cospi8sqrt2minus1
        smulwb          lr,  r3,  r12           @ ip[3] * cospi8sqrt2minus1
        smulwt          r11, r4,  r12           @ ip[7] * sinpi8sqrt2
        smulwb          r6,  r4,  r12           @ ip[3] * sinpi8sqrt2
        subs            r5,  r5,  #1
        pkhbt           r7,  lr,  r7,  lsl #16  @ 7c | 3c
        pkhbt           r11, r6,  r11, lsl #16  @ 7s | 3s = t3 second half
        mov             r6,  #0x4
        orr             r6,  r6,  #0x40000
        uadd16          r12, r7,  r12           @ 7c+7 | 3c+3 = t2 second half
        uadd16          r10, r10, r6            @ t0 + 4
        uadd16          r9,  r9,  r6            @ t1 + 4
        usub16          lr,  r8,  r12           @ c (o5 | o1) = t2
        uadd16          r12, r11, r1            @ d (o7 | o3) = t3
        usub16          r1,  r9,  lr            @ b-c = dst{0,1}[2]
        uadd16          r7,  r10, r12           @ a+d = dst{0,1}[0]
        usub16          r12, r10, r12           @ a-d = dst{0,1}[3]
        uadd16          r10, r9,  lr            @ b+c = dst{0,1}[1]

        asr             lr,  r1,  #3            @ o[1][2]
        asr             r9,  r12, #3            @ o[1][3]
        pkhtb           r8,  lr,  r7,  asr #19  @ o[1][0,2]
        pkhtb           r11, r9,  r10, asr #19  @ o[1][1,3]
        ldr             lr,  [r0]
        sxth            r12, r12
        ldr             r9,  [r0, r2]
        sxth            r1,  r1
#if HAVE_ARMV6T2_EXTERNAL
        sbfx            r7,  r7,  #3,  #13
        sbfx            r10, r10, #3,  #13
#else
        sxth            r7,  r7
        sxth            r10, r10
        asr             r7,  #3                 @ o[0][0]
        asr             r10, #3                 @ o[0][1]
#endif
        pkhbt           r7,  r7,  r1,  lsl #13  @ o[0][0,2]
        pkhbt           r10, r10, r12, lsl #13  @ o[0][1,3]

        uxtab16         r7,  r7,  lr
        uxtab16         r10, r10, lr,  ror #8
        uxtab16         r8,  r8,  r9
        uxtab16         r11, r11, r9,  ror #8
        usat16          r7,  #8,  r7
        usat16          r10, #8,  r10
        usat16          r8,  #8,  r8
        usat16          r11, #8,  r11
        orr             r7,  r7,  r10, lsl #8
        orr             r8,  r8,  r11, lsl #8
        str             r8,  [r0, r2]
        str_post        r7,  r0,  r2,  lsl #1

        bne             2b

        pop             {r4-r12, pc}
endfunc

@ void vp8_idct_dc_add(uint8_t *dst, DCTELEM block[16], int stride)
function ff_vp8_idct_dc_add_armv6, export=1
        push            {r4-r6, lr}
        add             r6,  r0,  r2,  lsl #1
        ldrsh           r3,  [r1]
        mov             r4,  #0
        add             r3,  r3,  #4
        strh            r4,  [r1], #32
        asr             r3,  #3
        ldr             r5,  [r0]
        ldr             r4,  [r0, r2]
        pkhbt           r3,  r3,  r3,  lsl #16
        uxtab16         lr,  r3,  r5            @ a1+2 | a1+0
        uxtab16         r5,  r3,  r5,  ror #8   @ a1+3 | a1+1
        uxtab16         r12, r3,  r4
        uxtab16         r4,  r3,  r4,  ror #8
        usat16          lr,  #8,  lr
        usat16          r5,  #8,  r5
        usat16          r12, #8,  r12
        usat16          r4,  #8,  r4
        orr             lr,  lr,  r5,  lsl #8
        ldr             r5,  [r6]
        orr             r12, r12, r4,  lsl #8
        ldr             r4,  [r6, r2]
        str             lr,  [r0]
        uxtab16         lr,  r3,  r5
        str             r12, [r0, r2]
        uxtab16         r5,  r3,  r5,  ror #8
        uxtab16         r12, r3,  r4
        uxtab16         r4,  r3,  r4,  ror #8
        usat16          lr,  #8,  lr
        usat16          r5,  #8,  r5
        usat16          r12, #8,  r12
        usat16          r4,  #8,  r4
        orr             lr,  lr,  r5,  lsl #8
        orr             r12, r12, r4,  lsl #8
        str             lr,  [r6]
        str             r12, [r6, r2]
        pop             {r4-r6, pc}
endfunc

@ void vp8_idct_dc_add4uv(uint8_t *dst, DCTELEM block[4][16], int stride)
function ff_vp8_idct_dc_add4uv_armv6, export=1
        push            {r4, lr}

        bl              ff_vp8_idct_dc_add_armv6
        add             r0,  r0,  #4
        bl              ff_vp8_idct_dc_add_armv6
        add             r0,  r0,  r2,  lsl #2
        sub             r0,  r0,  #4
        bl              ff_vp8_idct_dc_add_armv6
        add             r0,  r0,  #4
        bl              ff_vp8_idct_dc_add_armv6

        pop             {r4, pc}
endfunc

@ void vp8_idct_dc_add4y(uint8_t *dst, DCTELEM block[4][16], int stride)
function ff_vp8_idct_dc_add4y_armv6, export=1
        push            {r4, lr}

        bl              ff_vp8_idct_dc_add_armv6
        add             r0,  r0,  #4
        bl              ff_vp8_idct_dc_add_armv6
        add             r0,  r0,  #4
        bl              ff_vp8_idct_dc_add_armv6
        add             r0,  r0,  #4
        bl              ff_vp8_idct_dc_add_armv6

        pop             {r4, pc}
endfunc

@ loopfilter

.macro  transpose       o3,  o2,  o1,  o0,  i0,  i1,  i2,  i3
        uxtb16          \o1, \i1                @ xx 12 xx 10
        uxtb16          \o0, \i0                @ xx 02 xx 00
        uxtb16          \o3, \i3                @ xx 32 xx 30
        uxtb16          \o2, \i2                @ xx 22 xx 20
        orr             \o1, \o0, \o1, lsl #8   @ 12 02 10 00
        orr             \o3, \o2, \o3, lsl #8   @ 32 22 30 20

        uxtb16          \i1, \i1, ror #8        @ xx 13 xx 11
        uxtb16          \i3, \i3, ror #8        @ xx 33 xx 31
        uxtb16          \i0, \i0, ror #8        @ xx 03 xx 01
        uxtb16          \i2, \i2, ror #8        @ xx 23 xx 21
        orr             \i0, \i0, \i1, lsl #8   @ 13 03 11 01
        orr             \i2, \i2, \i3, lsl #8   @ 33 23 31 21

        pkhtb           \o2, \o3, \o1, asr #16  @ 32 22 12 02
        pkhbt           \o0, \o1, \o3, lsl #16  @ 30 20 10 00
        pkhtb           \o3, \i2, \i0, asr #16  @ 33 23 13 03
        pkhbt           \o1, \i0, \i2, lsl #16  @ 31 21 11 01
.endm

.macro  simple_filter
        uqsub8          r7,  r3,  r6            @ p1 - q1
        uqsub8          r8,  r6,  r3            @ q1 - p1
        uqsub8          r10, r4,  r5            @ p0 - q0
        uqsub8          r9,  r5,  r4            @ q0 - p0
        orr             r7,  r7,  r8            @ abs(p1 - q1)
        orr             r9,  r9,  r10           @ abs(p0 - q0)
        uhadd8          r7,  r7,  lr            @ abs(p1 - q1) >> 1
        uqadd8          r9,  r9,  r9            @ abs(p0 - q0) * 2
        uqadd8          r7,  r7,  r9            @ abs(p0 - q0)*2 + abs(p1-q1)/2
        mvn             r8,  #0
        usub8           r10, r12, r7            @ compare to flimit
        sel             r10, r8,  lr            @ filter mask: F or 0
        cmp             r10, #0
        beq             2f

        eor             r3,  r3,  r2            @ ps1
        eor             r6,  r6,  r2            @ qs1
        eor             r4,  r4,  r2            @ ps0
        eor             r5,  r5,  r2            @ qs0

        qsub8           r3,  r3,  r6            @ vp8_filter = p1 - q1
        qsub8           r6,  r5,  r4            @ q0 - p0
        qadd8           r3,  r3,  r6            @ += q0 - p0
        lsr             r7,  r2,  #5            @ 0x04040404
        qadd8           r3,  r3,  r6            @ += q0 - p0
        sub             r9,  r7,  r2,  lsr #7   @ 0x03030303
        qadd8           r3,  r3,  r6            @ vp8_filter = p1-q1 + 3*(q0-p0)
        and             r3,  r3,  r10           @ vp8_filter &= mask

        qadd8           r9,  r3,  r9            @ Filter2 = vp8_filter + 3
        qadd8           r3,  r3,  r7            @ Filter1 = vp8_filter + 4

        shadd8          r9,  r9,  lr
        shadd8          r3,  r3,  lr
        shadd8          r9,  r9,  lr
        shadd8          r3,  r3,  lr
        shadd8          r9,  r9,  lr            @ Filter2 >>= 3
        shadd8          r3,  r3,  lr            @ Filter1 >>= 3

        qadd8           r4,  r4,  r9            @ u = p0 + Filter2
        qsub8           r5,  r5,  r3            @ u = q0 - Filter1
        eor             r4,  r4,  r2            @ *op0 = u ^ 0x80
        eor             r5,  r5,  r2            @ *oq0 = u ^ 0x80
.endm

@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim)
function ff_vp8_v_loop_filter16_simple_armv6, export=1
        push            {r4-r11, lr}

        orr             r2,  r2,  r2,  lsl #16
        mov             r11, #4
        mov             lr,  #0
        orr             r12, r2,  r2,  lsl #8
        mov32           r2,  0x80808080
1:
        ldr_nreg        r3,  r0,  r1,  lsl #1   @ p1
        ldr_nreg        r4,  r0,  r1            @ p0
        ldr             r5,  [r0]               @ q0
        ldr             r6,  [r0, r1]           @ q1
        simple_filter
T       sub             r7,  r0,  r1
        str             r5,  [r0]               @ oq0
A       str             r4,  [r0, -r1]          @ op0
T       str             r4,  [r7]
2:
        subs            r11, r11, #1
        add             r0,  r0,  #4
        bne             1b

        pop             {r4-r11, pc}
endfunc

.macro  filter_mask_p
        uqsub8          r6,  r9,  r10           @ p3 - p2
        uqsub8          r7,  r10, r9            @ p2 - p3
        uqsub8          r8,  r10, r11           @ p2 - p1
        uqsub8          r10, r11, r10           @ p1 - p2
        orr             r6,  r6,  r7            @ abs(p3-p2)
        orr             r8,  r8,  r10           @ abs(p2-p1)
        uqsub8          lr,  r6,  r2            @ compare to limit
        uqsub8          r8,  r8,  r2            @ compare to limit
        uqsub8          r6,  r11, r12           @ p1 - p0
        orr             lr,  lr,  r8
        uqsub8          r7,  r12, r11           @ p0 - p1
        orr             r6,  r6,  r7            @ abs(p1-p0)
        uqsub8          r7,  r6,  r2            @ compare to limit
        uqsub8          r8,  r6,  r3            @ compare to thresh
        orr             lr,  lr,  r7
.endm

.macro filter_mask_pq
        uqsub8          r6,  r11, r10           @ p1 - q1
        uqsub8          r7,  r10, r11           @ q1 - p1
        uqsub8          r11, r12, r9            @ p0 - q0
        uqsub8          r12, r9,  r12           @ q0 - p0
        orr             r6,  r6,  r7            @ abs(p1-q1)
        orr             r12, r11, r12           @ abs(p0-q0)
        mov32           r7,  0x7f7f7f7f
        uqadd8          r12, r12, r12           @ abs(p0-q0) * 2
        and             r6,  r7,  r6,  lsr #1   @ abs(p1-q1) / 2
        uqadd8          r12, r12, r6            @ abs(p0-q0) * 2 + abs(p1-q1)/2
.endm

.macro  filter_mask_v
        filter_mask_p

        ldr             r10, [r0, r1]           @ q1
        ldr_post        r9,  r0,  r1,  lsl #1   @ q0

        filter_mask_pq

        ldr             r11, [r0]               @ q2

        uqsub8          r7,  r9,  r10           @ q0 - q1
        uqsub8          r6,  r10, r9            @ q1 - q0
        uqsub8          r12, r12, r4            @ compare to flimit
        uqsub8          r9,  r11, r10           @ q2 - q1
        uqsub8          r10, r10, r11           @ q1 - q2
        orr             lr,  lr,  r12
        ldr             r12, [r0, r1]           @ q3
        orr             r6,  r7,  r6            @ abs(q1-q0)
        orr             r10, r9,  r10           @ abs(q2-q1)
        uqsub8          r9,  r12, r11           @ q3 - q2
        uqsub8          r11, r11, r12           @ q2 - q3
        uqsub8          r7,  r6,  r2            @ compare to limit
        uqsub8          r10, r10, r2            @ compare to limit
        uqsub8          r6,  r6,  r3            @ compare to thresh
        orr             r9,  r9,  r11           @ abs(q3-q2)
        orr             lr,  lr,  r7
        orr             lr,  lr,  r10
        uqsub8          r9,  r9,  r2            @ compare to limit
        orr             lr,  lr,  r9

        mov             r12, #0
        usub8           lr,  r12, lr
        mvn             r11, #0
        sel             lr,  r11, r12           @ filter mask
        sub             r0,  r0,  r1,  lsl #1
.endm

.macro  filter_mask_h
        transpose       r12, r11, r10, r9,  r6,  r7,  r8,  lr

        filter_mask_p

        stm             sp,  {r8, r11, r12, lr}
        sub             r0,  r0,  r1,  lsl #2
        add             r0,  r0,  #4

        ldr             r7,  [r0, r1]
        ldr_post        r6,  r0,  r1,  lsl #1
        ldr             lr,  [r0, r1]
        ldr             r8,  [r0]

        transpose       r12, r11, r10, r9,  r6,  r7,  r8,  lr

        uqsub8          r8,  r12, r11           @ q3 - q2
        uqsub8          lr,  r11, r12           @ q2 - q3
        uqsub8          r7,  r9,  r10           @ q0 - q1
        uqsub8          r6,  r10, r9            @ q1 - q0
        uqsub8          r12, r11, r10           @ q2 - q1
        uqsub8          r11, r10, r11           @ q1 - q2
        orr             r8,  r8,  lr            @ abs(q3-q2)
        orr             r6,  r7,  r6            @ abs(q1-q0)
        orr             r11, r12, r11           @ abs(q2-q1)
        ldr             lr,  [sp, #12]          @ load back (f)limit accumulator
        uqsub8          r8,  r8,  r2            @ compare to limit
        uqsub8          r7,  r6,  r2            @ compare to limit
        uqsub8          r11, r11, r2            @ compare to limit
        orr             lr,  lr,  r8
        uqsub8          r8,  r6,  r3            @ compare to thresh
        orr             lr,  lr,  r7
        ldr             r12, [sp, #8]           @ p1
        orr             lr,  lr,  r11

        ldr             r11, [sp, #4]           @ p0

        filter_mask_pq

        mov             r10, #0
        uqsub8          r12, r12, r4            @ compare to flimit
        mvn             r11, #0
        orr             lr,  lr,  r12
        usub8           lr,  r10, lr
        sel             lr,  r11, r10           @ filter mask
.endm

.macro  filter          inner
        mov32           r12, 0x80808080
        eor             r11, r7,  r12           @ ps1
        eor             r8,  r8,  r12           @ ps0
        eor             r9,  r9,  r12           @ qs0
        eor             r10, r10, r12           @ qs1

        stm             sp,  {r8-r11}

        qsub8           r7,  r11, r10           @ vp8_signed_char_clamp(ps1-qs1)
        qsub8           r8,  r9,  r8            @ vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    .if \inner
        and             r7,  r7,  r6            @ vp8_filter &= hev
    .endif
        qadd8           r7,  r7,  r8
        lsr             r10, r12, #5            @ 0x04040404
        qadd8           r7,  r7,  r8
        sub             r9,  r10, r12, lsr #7   @ 0x03030303
        qadd8           r7,  r7,  r8

        and             r7,  r7,  lr            @ vp8_filter &= mask
    .if !\inner
        mov             r12, r7                 @ Filter2
        and             r7,  r7,  r6            @ Filter2 &= hev
    .endif
        qadd8           lr,  r7,  r9            @ Filter2 = vp8_signed_char_clamp(vp8_filter+3)
        qadd8           r7,  r7,  r10           @ Filter1 = vp8_signed_char_clamp(vp8_filter+4)

        mov             r9,  #0
        shadd8          lr,  lr,  r9            @ Filter2 >>= 3
        shadd8          r7,  r7,  r9            @ Filter1 >>= 3
        shadd8          lr,  lr,  r9
        shadd8          r7,  r7,  r9
        shadd8          lr,  lr,  r9            @ Filter2
        shadd8          r7,  r7,  r9            @ Filter1
.endm

.macro  filter_v        inner
        orr             r10, r6,  r8            @ calculate vp8_hevmask
        ldr_nreg        r7,  r0,  r1,  lsl #1   @ p1
        usub8           r10, r12, r10
        ldr_nreg        r8,  r0,  r1            @ p0
        sel             r6,  r12, r11           @ obtain vp8_hevmask
        ldr             r9,  [r0]               @ q0
        ldr             r10, [r0, r1]           @ q1
        filter          \inner
.endm

.macro  filter_h        inner
        orr             r9,  r6,  r8
        usub8           r9,  r12, r9
        sel             r6,  r12, r11           @ hev mask

        stm             sp,  {r6, lr}

        ldr_nreg        r12, r0,  r1,  lsl #1
        ldr_nreg        r11, r0,  r1
        ldr             r6,  [r0]
        ldr             lr,  [r0, r1]

        transpose       r10, r9,  r8,  r7,  r12, r11, r6,  lr

        ldm             sp,  {r6, lr}
        filter          \inner
.endm

.macro  filter_inner
        ldm             sp,  {r8, r9}
        lsr             r10, r10, #2            @ 0x01010101
        qadd8           r8,  r8,  lr            @ u = vp8_signed_char_clamp(ps0 + Filter2)
        mov             lr,  #0
        qsub8           r9,  r9,  r7            @ u = vp8_signed_char_clamp(qs0 - Filter1)
        sadd8           r7,  r7,  r10           @ vp8_filter += 1
        ldr             r10, [sp, #8]           @ qs1
        shadd8          r7,  r7,  lr            @ vp8_filter >>= 1
        eor             r8,  r8,  r12           @ *op0 = u ^ 0x80
        bic             r7,  r7,  r6            @ vp8_filter &= ~hev
        qadd8           r11, r11, r7            @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
        eor             r9,  r9,  r12           @ *oq0 = u ^ 0x80
        qsub8           r10, r10, r7            @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
        eor             r11, r11, r12           @ *op1 = u ^ 0x80
        eor             r10, r10, r12           @ *oq1 = u ^ 0x80
.endm

.macro  filter_x        c0
        mov             lr,  \c0
        mov             r7,  #63

        sxtb16          r6,  r12
        sxtb16          r10, r12, ror #8
        smlabb          r8,  r6,  lr,  r7
        smlatb          r6,  r6,  lr,  r7
        smlabb          r7,  r10, lr,  r7
        smultb          r10, r10, lr
        ssat            r8,  #8,  r8,  asr #7
        ssat            r6,  #8,  r6,  asr #7
        add             r10, r10, #63
        ssat            r7,  #8,  r7,  asr #7
        ssat            r10, #8,  r10, asr #7

        pkhbt           r6,  r8,  r6,  lsl #16
        pkhbt           r10, r7,  r10, lsl #16
        uxtb16          r6,  r6
        uxtb16          r10, r10

        mov32           lr,  0x80808080

        orr             r10, r6,  r10, lsl #8   @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
        qsub8           r8,  r9,  r10           @ s = vp8_signed_char_clamp(qs0 - u)
        qadd8           r10, r11, r10           @ s = vp8_signed_char_clamp(ps0 + u)
        eor             r8,  r8,  lr            @ *oq0 = s ^ 0x80
        eor             r10, r10, lr            @ *op0 = s ^ 0x80
.endm

.macro  filter_1
        ldm             sp,  {r8, r9}
        qadd8           r11, r8,  lr
        qsub8           r9,  r9,  r7
        bic             r12, r12, r6            @ vp8_filter &= ~hev
        filter_x        #27
.endm

.macro  filter_2
        ldr             r9,   [sp, #8]          @ qs1
        ldr             r11,  [sp, #12]         @ ps1
        filter_x        #18
.endm

.macro  filter_3
        eor             r9,  r9,  lr
        eor             r11, r11, lr
        filter_x        #9
.endm

function vp8_v_loop_filter_inner_armv6
        mov             r5,  #4
        sub             sp,  sp,  #16

        orr             r2,  r2,  r2,  lsl #16
        orr             r3,  r3,  r3,  lsl #16
        orr             r6,  r6,  r6,  lsl #16
        orr             r4,  r2,  r2,  lsl #8   @ flimE
        orr             r2,  r3,  r3,  lsl #8   @ flimI
        orr             r3,  r6,  r6,  lsl #8   @ thresh
1:
        sub             r0,  r0,  r1,  lsl #2
        ldr             r10, [r0, r1]           @ p2
        ldr_post        r9,  r0,  r1,  lsl #1   @ p3
        ldr             r12, [r0, r1]           @ p0
        ldr_post        r11, r0,  r1,  lsl #1   @ p1

        filter_mask_v
        cmp             lr,  #0
        beq             2f
        filter_v        inner=1
        filter_inner

A       str             r11, [r0, -r1, lsl #1]  @ op1
A       str             r8,  [r0, -r1]          @ op0
T       sub             r0,  r0,  r1,  lsl #1
T       str             r8,  [r0, r1]
T       str_post        r11, r0,  r1,  lsl #1
        str             r9,  [r0]               @ oq0
        str             r10, [r0, r1]           @ oq1
2:
        add             r0,  r0,  #4
        cmp             r5,  #3
        it              eq
        ldreq           r0,  [sp, #16]
        subs            r5,  r5,  #1
        bne             1b

        add             sp,  sp,  #16
        pop             {r0, r4-r11, pc}
endfunc

function ff_vp8_v_loop_filter16_inner_armv6, export=1
        push            {r4-r11, lr}
        add             r12, r0,  #8
        push            {r12}
        ldr             r6,  [sp, #40]
        orr             r2,  r2,  r2,  lsl #16
        b               vp8_v_loop_filter_inner_armv6
endfunc

function ff_vp8_v_loop_filter8uv_inner_armv6, export=1
        push            {r1, r4-r11, lr}
        mov             r1,  r2
        orr             r2,  r3,  r3,  lsl #16
        ldr             r3,  [sp, #40]
        ldr             r6,  [sp, #44]
        b               vp8_v_loop_filter_inner_armv6
endfunc

function vp8_v_loop_filter_armv6
        mov             r5,  #4
        sub             sp,  sp,  #16

        orr             r3,  r3,  r3,  lsl #16
        orr             r6,  r6,  r6,  lsl #16
        orr             r4,  r2,  r2,  lsl #8   @ flimE
        orr             r2,  r3,  r3,  lsl #8   @ flimI
        orr             r3,  r6,  r6,  lsl #8   @ thresh
1:
        sub             r0,  r0,  r1,  lsl #2
        ldr             r10, [r0, r1]           @ p2
        ldr_post        r9,  r0,  r1,  lsl #1   @ p3
        ldr             r12, [r0, r1]           @ p0
        ldr_post        r11, r0,  r1,  lsl #1   @ p1

        filter_mask_v
        cmp             lr,  #0
        beq             2f

        filter_v        inner=0
        filter_1

        str             r8,  [r0]               @ *oq0
A       str             r10, [r0, -r1]          @ *op0
T       sub             r0,  r0,  r1,  lsl #1
T       str             r10, [r0, r1]

        filter_2

A       str             r10, [r0, -r1, lsl #1]  @ *op1
T       str_post        r10, r0,  r1,  lsl #1
        str             r8,  [r0, r1]           @ *oq1

        ldr             r9,  [r0, r1,  lsl #1]  @ q2
        add             r0,  r0,  r1
A       ldr             r11, [r0, -r1, lsl #2]  @ p2
T       ldr_dpre        r11, r0,  r1,  lsl #2

        filter_3

A       str             r10, [r0, -r1, lsl #2]  @ *op2
T       str_post        r10, r0,  r1,  lsl #2
        str             r8,  [r0, r1]           @ *oq2
        sub             r0,  r0,  r1
2:
        add             r0,  r0,  #4
        cmp             r5,  #3
        it              eq
        ldreq           r0,  [sp, #16]
        subs            r5,  r5,  #1
        bne             1b

        add             sp,  sp,  #16
        pop             {r0, r4-r11, pc}
endfunc

function ff_vp8_v_loop_filter16_armv6, export=1
        push            {r4-r11, lr}
        add             r12, r0,  #8
        push            {r12}
        ldr             r6,  [sp, #40]
        orr             r2,  r2,  r2,  lsl #16
        b               vp8_v_loop_filter_armv6
endfunc

function ff_vp8_v_loop_filter8uv_armv6, export=1
        push            {r1, r4-r11, lr}
        mov             r1,  r2
        orr             r2,  r3,  r3,  lsl #16
        ldr             r3,  [sp, #40]
        ldr             r6,  [sp, #44]
        b               vp8_v_loop_filter_armv6
endfunc

@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim)
function ff_vp8_h_loop_filter16_simple_armv6, export=1
        push            {r4-r11, lr}
        orr             r12, r2,  r2,  lsl #16
        mov32           r2,  0x80808080
        orr             r12, r12, r12, lsl #8

        mov             lr,  #0
        mov             r11, #4
1:
        sub             r0,  r0,  #2
        ldr             r8,  [r0, r1]
        ldr_post        r7,  r0,  r1,  lsl #1
        ldr             r10, [r0, r1]
        ldr_post        r9,  r0,  r1,  lsl #1
        add             r0,  r0,  #2
        transpose       r6,  r5,  r4,  r3,  r7,  r8,  r9,  r10
        simple_filter
        sub             r0,  r0,  r1,  lsl #2
        sub             r0,  r0,  #1

        uxtb16          r6,  r4
        uxtb16          r8,  r5
        uxtb16          r7,  r4,  ror #8
        uxtb16          r9,  r5,  ror #8
        orr             r6,  r6,  r8,  lsl #8
        orr             r7,  r7,  r9,  lsl #8
        lsr             r4,  r6,  #16
        lsr             r5,  r7,  #16

        strh_post       r6,  r0,  r1
        strh_post       r7,  r0,  r1
        strh_post       r4,  r0,  r1
        strh_post       r5,  r0,  r1
        add             r0,  r0,  #1
2:
        subs            r11, r11, #1
        bne             1b

        pop             {r4-r11, pc}
endfunc

function vp8_h_loop_filter_inner_armv6
        mov             r5,  #4
        sub             sp,  sp,  #16

        orr             r3,  r3,  r3,  lsl #16
        orr             r9,  r9,  r9,  lsl #16
        orr             r4,  r2,  r2,  lsl #8   @ flimE
        orr             r2,  r3,  r3,  lsl #8   @ flimI
        orr             r3,  r9,  r9,  lsl #8   @ thresh
        sub             r0,  r0,  #4
1:
        ldr             r7,  [r0, r1]
        ldr_post        r6,  r0,  r1,  lsl #1
        ldr             lr,  [r0, r1]
        ldr_post        r8,  r0,  r1,  lsl #1

        filter_mask_h

        cmp             lr,  #0
        sub             r0,  r0,  #2
        beq             2f

        ldr             r6,  [sp]

        filter_h        inner=1
        filter_inner

        transpose       lr,  r12, r7,  r6,  r11, r8,  r9,  r10

A       str             r6,  [r0, -r1, lsl #1]
A       str             r7,  [r0, -r1]
T       sub             r0,  r0,  r1,  lsl #1
T       str             r7,  [r0, r1]
T       str_post        r6,  r0,  r1,  lsl #1
        str             r12, [r0]
        str             lr,  [r0, r1]
2:
        sub             r0,  r0,  #2
        add             r0,  r0,  r1,  lsl #1
        cmp             r5,  #3
        it              eq
        ldreq           r0,  [sp, #16]
        subs            r5,  r5,  #1
        bne             1b

        add             sp, sp, #16
        pop             {r0, r4-r11, pc}
endfunc

function ff_vp8_h_loop_filter16_inner_armv6, export=1
        push            {r4-r11, lr}
        add             r12, r0,  r1,  lsl #3
        sub             r12, r12, #4
        push            {r12}
        ldr             r9,  [sp, #40]
        orr             r2,  r2,  r2,  lsl #16
        b               vp8_h_loop_filter_inner_armv6
endfunc

function ff_vp8_h_loop_filter8uv_inner_armv6, export=1
        sub             r1,  r1,  #4
        push            {r1, r4-r11, lr}
        mov             r1,  r2
        orr             r2,  r3,  r3,  lsl #16
        ldr             r3,  [sp, #40]
        ldr             r9,  [sp, #44]
        b               vp8_h_loop_filter_inner_armv6
endfunc

function vp8_h_loop_filter_armv6
        mov             r5,  #4
        sub             sp,  sp,  #16

        orr             r3,  r3,  r3,  lsl #16
        orr             r9,  r9,  r9,  lsl #16
        orr             r4,  r2,  r2,  lsl #8   @ flimE
        orr             r2,  r3,  r3,  lsl #8   @ flimI
        orr             r3,  r9,  r9,  lsl #8   @ thresh
1:
        sub             r0,  r0,  #4
        ldr             r7,  [r0, r1]
        ldr_post        r6,  r0,  r1,  lsl #1
        ldr             lr,  [r0, r1]
        ldr_post        r8,  r0,  r1,  lsl #1

        filter_mask_h
        cmp             lr,  #0
        it              eq
        addeq           r0,  r0,  r1,  lsl #1
        beq             2f

        ldr             r6,  [sp]
        sub             r0,  r0,  #2

        filter_h        inner=0
        filter_1

        sub             r0,  r0,  r1,  lsl #1
        uxtb16          r6,  r10
        uxtb16          r7,  r8
        uxtb16          r10, r10, ror #8
        uxtb16          r8,  r8,  ror #8
        orr             r6,  r6,  r7,  lsl #8
        orr             r10, r10, r8,  lsl #8
        lsr             r7,  r6,  #16
        lsr             r8,  r10, #16

        add             r0,  r0,  #1
        strh_post       r6,  r0,  r1
        strh_post       r10, r0,  r1
        strh_post       r7,  r0,  r1
        strh_post       r8,  r0,  r1

        filter_2

        sub             r0,  r0,  r1,  lsl #2
        add             r0,  r0,  #3

        ldrb            r11, [r0, #-5]          @ p2 for 1/7th difference
        strb            r10, [r0, #-4]          @ op1
        strb            r8,  [r0, #-1]          @ oq1
        ldrb_post       r9,  r0,  r1            @ q2 for 1/7th difference

        lsr             r10, r10, #8
        lsr             r8,  r8,  #8

        ldrb            r6,  [r0, #-5]
        strb            r10, [r0, #-4]
        strb            r8,  [r0, #-1]
        ldrb_post       r7,  r0,  r1

        lsr             r10, r10, #8
        lsr             r8,  r8,  #8
        orr             r11, r11, r6,  lsl #8
        orr             r9,  r9,  r7,  lsl #8

        ldrb            r6,  [r0, #-5]
        strb            r10, [r0, #-4]
        strb            r8,  [r0, #-1]
        ldrb_post       r7,  r0,  r1

        lsr             r10, r10, #8
        lsr             r8,  r8,  #8
        orr             r11, r11, r6,  lsl #16
        orr             r9,  r9,  r7,  lsl #16

        ldrb            r6,  [r0, #-5]
        strb            r10, [r0, #-4]
        strb            r8,  [r0, #-1]
        ldrb_post       r7,  r0,  r1
        orr             r11, r11, r6,  lsl #24
        orr             r9,  r9,  r7,  lsl #24

        filter_3

        sub             r0,  r0,  r1,  lsl #2
        strb            r10, [r0, #-5]
        strb_post       r8,  r0,  r1
        lsr             r10, r10, #8
        lsr             r8,  r8,  #8
        strb            r10, [r0, #-5]
        strb_post       r8,  r0,  r1
        lsr             r10, r10, #8
        lsr             r8,  r8,  #8
        strb            r10, [r0, #-5]
        strb_post       r8,  r0,  r1
        lsr             r10, r10, #8
        lsr             r8,  r8,  #8
        strb            r10, [r0, #-5]
        strb_post       r8,  r0,  r1

        sub             r0,  r0,  #2
2:
        cmp             r5,  #3
        it              eq
        ldreq           r0,  [sp, #16]
        subs            r5,  r5,  #1
        bne             1b

        add             sp,  sp,  #16
        pop             {r0, r4-r11, pc}
endfunc

function ff_vp8_h_loop_filter16_armv6, export=1
        push            {r4-r11, lr}
        add             r12, r0,  r1,  lsl #3
        push            {r12}
        ldr             r9,  [sp, #40]
        orr             r2,  r2,  r2,  lsl #16
        b               vp8_h_loop_filter_armv6
endfunc

function ff_vp8_h_loop_filter8uv_armv6, export=1
        push            {r1, r4-r11, lr}
        mov             r1,  r2
        orr             r2,  r3,  r3,  lsl #16
        ldr             r3,  [sp, #40]
        ldr             r9,  [sp, #44]
        b               vp8_h_loop_filter_armv6
endfunc

.ltorg

@ MC

@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src,
@                       int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels16_armv6, export=1
        push            {r4-r11}
        ldr             r12, [sp, #32]          @ h
1:
        subs            r12, r12, #2
        ldr             r5,  [r2, #4]
        ldr             r6,  [r2, #8]
        ldr             r7,  [r2, #12]
        ldr_post        r4,  r2,  r3
        ldr             r9,  [r2, #4]
        ldr             r10, [r2, #8]
        ldr             r11, [r2, #12]
        ldr_post        r8,  r2,  r3
        strd            r6,  r7,  [r0, #8]
        strd_post       r4,  r5,  r0,  r1
        strd            r10, r11, [r0, #8]
        strd_post       r8,  r9,  r0,  r1
        bgt             1b
        pop             {r4-r11}
        bx              lr
endfunc

@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels8_armv6, export=1
        push            {r4-r11}
        ldr             r12, [sp, #32]          @ h
1:
        subs            r12, r12, #4
        ldr             r5,  [r2, #4]
        ldr_post        r4,  r2,  r3
        ldr             r7,  [r2, #4]
        ldr_post        r6,  r2,  r3
        ldr             r9,  [r2, #4]
        ldr_post        r8,  r2,  r3
        ldr             r11, [r2, #4]
        ldr_post        r10, r2,  r3
        strd_post       r4,  r5,  r0,  r1
        strd_post       r6,  r7,  r0,  r1
        strd_post       r8,  r9,  r0,  r1
        strd_post       r10, r11, r0,  r1
        bgt             1b
        pop             {r4-r11}
        bx              lr
endfunc

@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels4_armv6, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr_post        r4,  r2,  r3
        ldr_post        r5,  r2,  r3
        ldr_post        r6,  r2,  r3
        ldr_post        lr,  r2,  r3
        str_post        r4,  r0,  r1
        str_post        r5,  r0,  r1
        str_post        r6,  r0,  r1
        str_post        lr,  r0,  r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const   sixtap_filters_13245600, align=4
        .short     2, 108, -11,  36,  -8, 1, 0, 0
        .short     3,  77, -16,  77, -16, 3, 0, 0
        .short     1,  36,  -8, 108, -11, 2, 0, 0
endconst
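@ The "13245600" suffix describes the storage order used above: each row holds
@ its six taps as tap1, tap3, tap2, tap4, tap5, tap6 followed by two padding
@ zeroes, so the first row corresponds to the filter {2, -11, 108, 36, -8, 1}.
@ This interleaving matches the byte pairs unpacked with uxtb16/pkhtb in the
@ filter code below, letting each smlad apply two taps per instruction.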
 

const   fourtap_filters_1324, align=4
        .short     -6,  12, 123, -1
        .short     -9,  50,  93, -6
        .short     -6,  93,  50, -9
        .short     -1, 123,  12, -6
endconst
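@ Each row of both tables sums to 128, i.e. the taps are in Q7; the filter
@ code below therefore adds a #0x40 rounding bias and renormalises with a
@ right shift by 7 (see the asr #7 in the sat4 macro).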
 

.macro  vp8_mc_1        name, size, hv
function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1
        sub             r1,  r1,  #\size
        mov             r12, sp
        push            {r1, r4-r11, lr}
        ldm             r12, {r5-r7}
        mov             r4,  #\size
        stm             r12, {r4, r5}
        orr             r12, r6,  r7
        b               vp8_put_\name\()_\hv\()_armv6 + 4
endfunc
.endm

vp8_mc_1                epel,  16, h6
vp8_mc_1                epel,  16, v6
vp8_mc_1                epel,   8, h6
vp8_mc_1                epel,   8, v6
vp8_mc_1                epel,   8, h4
vp8_mc_1                epel,   8, v4
vp8_mc_1                epel,   4, h6
vp8_mc_1                epel,   4, v6
vp8_mc_1                epel,   4, h4
vp8_mc_1                epel,   4, v4

vp8_mc_1                bilin, 16, h
vp8_mc_1                bilin, 16, v
vp8_mc_1                bilin,  8, h
vp8_mc_1                bilin,  8, v
vp8_mc_1                bilin,  4, h
vp8_mc_1                bilin,  4, v

/* True relational expressions have the value -1 in the GNU assembler,
   +1 in Apple's. */
#ifdef __APPLE__
#   define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1)
#else
#   define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1)
#endif
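/* For example, with \size = 8 and \ytaps = 6 both variants agree:
   GNU:   8 * (8 - 8*(-1) + 6 - 1) = 8 * 21 = 168 bytes of scratch,
   Apple: 8 * (8 + 8*(+1) + 6 - 1) = 8 * 21 = 168 bytes,
   while for \size = 4 the (\size > 4) term is 0 in both assemblers. */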
 
1236
 
 
1237

.macro  vp8_mc_hv       name, size, h, v, ytaps
function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1
        push            {r0, r1, r4, lr}
        add             r0,  sp,  #16
        sub             sp,  sp,  #TMPSIZE+16
        ldm             r0,  {r0, r12}
        mov             r4,  #\size
        add             lr,  r0,  #\ytaps-1
    .if \ytaps > 2
        sub             r2,  r2,  r3,  lsl #\ytaps >> 1 & 1
    .endif
        stm             sp,  {r4, lr}
        add             r0,  sp,  #16
        mov             r1,  #0
        bl              vp8_put_\name\()_\h\()_armv6
        add             r0,  sp,  #TMPSIZE+16
        ldr             lr,  [sp, #TMPSIZE+16+16]
        ldm             r0,  {r0, r1}
        mov             r3,  #\size
        ldr             r12, [sp, #TMPSIZE+16+16+8]
        str             lr,  [sp, #4]
        add             r2,  sp,  #16 + \size * (\ytaps / 2 - 1)
        sub             r1,  r1,  #\size
        bl              vp8_put_\name\()_\v\()_armv6
        add             sp,  sp,  #TMPSIZE+16+8
        pop             {r4, pc}
endfunc
.endm
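
@ (Summary, for reference: each ff_put_vp8_*_h*v* function generated above
@  works in two passes.  The horizontal inner function filters
@  h + \ytaps - 1 rows, starting \ytaps/2 - 1 rows above the block, into a
@  TMPSIZE-byte buffer on the stack; the vertical inner function then filters
@  that buffer down into the real destination.)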

vp8_mc_hv               epel,  16, h6, v6, 6
vp8_mc_hv               epel,   8, h6, v6, 6
vp8_mc_hv               epel,   8, h4, v6, 6
vp8_mc_hv               epel,   8, h6, v4, 4
vp8_mc_hv               epel,   8, h4, v4, 4
vp8_mc_hv               epel,   4, h6, v6, 6
vp8_mc_hv               epel,   4, h4, v6, 6
vp8_mc_hv               epel,   4, h6, v4, 4
vp8_mc_hv               epel,   4, h4, v4, 4

vp8_mc_hv               bilin, 16, h,  v,  2
vp8_mc_hv               bilin,  8, h,  v,  2
vp8_mc_hv               bilin,  4, h,  v,  2
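
@ (For reference: sat4 takes four 32-bit filter sums that already include the
@  0x40 rounding bias, shifts each down by 7, clamps them to 0..255 with
@  usat16 and packs the four results into a single word, least significant
@  byte first.)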

.macro  sat4            r0,  r1,  r2,  r3
        asr             \r0, \r0, #7
        asr             \r1, \r1, #7
        pkhbt           \r0, \r0, \r2, lsl #9
        pkhbt           \r1, \r1, \r3, lsl #9
        usat16          \r0, #8,  \r0
        usat16          \r1, #8,  \r1
        orr             \r0, \r0, \r1, lsl #8
.endm

@ Calling convention for the inner MC functions:
@       r0      dst
@       r1      dst_stride - block_width
@       r2      src
@       r3      src_stride
@       r4      block_width
@       r12     filter_index
@       [sp]    block_width
@       [sp+4]  height
@       [sp+8]  scratch
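
@ (Reference model of the inner loops, illustrative only: each function below
@  emits one row four pixels at a time, decrementing r4 by 4 until the row is
@  done, then advances dst by the saved r1 (dst_stride - block_width), steps
@  src to the next input row and repeats until the height passed at [sp+4]
@  is used up.  For the 6-tap case each output pixel is essentially
@      dst[x] = clip8((filter[0]*src[x-2] + ... + filter[5]*src[x+3] + 64) >> 7)
@  the 4-tap case uses four taps with the same rounding, and the bilinear
@  case uses two taps with (+ 4) >> 3.)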

function vp8_put_epel_h6_armv6
        push            {r1, r4-r11, lr}
        sub             r2,  r2,  #2
        movrel          lr,  sixtap_filters_13245600 - 16
        add             lr,  lr,  r12, lsl #3
        sub             r3,  r3,  r4
        str             r3,  [sp, #48]
        ldm             lr,  {r1, r3, lr}
1:
        ldr             r7,  [r2, #5]           @ src[5-8]
        ldr             r6,  [r2, #2]           @ src[2-5]
        ldr             r5,  [r2], #4           @ src[0-3]

        pkhtb           r7,  r7,  r7,  asr #8   @ src[8,7,7,6]
        uxtb16          r9,  r6,  ror #8        @ src[5] | src[3]
        uxtb16          r6,  r6                 @ src[4] | src[2]
        uxtb16          r8,  r5,  ror #8        @ src[3] | src[1]
        uxtb16          r11, r7,  ror #8        @ src[8] | src[7]
        uxtb16          r7,  r7                 @ src[7] | src[6]
        uxtb16          r5,  r5                 @ src[2] | src[0]

        mov             r10, #0x40
        smlad           r5,  r5,  r1,  r10      @ filter[0][0]
        smlad           r11, r11, lr,  r10      @ filter[3][2]
        smlad           r12, r7,  lr,  r10      @ filter[2][2]
        smlad           r10, r8,  r1,  r10      @ filter[1][0]
        smlad           r5,  r8,  r3,  r5       @ filter[0][1]
        smlad           r11, r9,  r1,  r11      @ filter[3][0]
        smlad           r12, r9,  r3,  r12      @ filter[2][1]
        pkhtb           r9,  r9,  r6,  asr #16  @ src[5] | src[4]
        smlad           r10, r6,  r3,  r10      @ filter[1][1]
        pkhbt           r7,  r9,  r7,  lsl #16  @ src[6] | src[4]
        smlad           r5,  r9,  lr,  r5       @ filter[0][2]
        pkhtb           r8,  r7,  r9,  asr #16  @ src[6] | src[5]
        smlad           r11, r7,  r3,  r11      @ filter[3][1]
        smlad           r9,  r8,  lr,  r10      @ filter[1][2]
        smlad           r7,  r6,  r1,  r12      @ filter[2][0]

        subs            r4,  r4,  #4

        sat4            r5,  r9,  r7,  r11
        str             r5,  [r0], #4

        bne             1b

        add             r4,  sp,  #40
        ldm             r4,  {r4, r5, r12}
        ldr             r6,  [sp]
        subs            r5,  r5,  #1
        add             r2,  r2,  r12
        str             r5,  [sp, #44]
        add             r0,  r0,  r6

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

function vp8_put_epel_v6_armv6
        push            {r1, r4-r11, lr}
        movrel          lr,  sixtap_filters_13245600 - 16
        add             lr,  lr,  r12, lsl #3
        str             r3,  [sp, #48]
1:
        add             r1,  r3,  r3,  lsl #1   @ stride * 3
        ldr_nreg        r5,  r2,  r3            @ src[0,1,2,3 + stride * 1]
        ldr             r6,  [r2, r3]           @ src[0,1,2,3 + stride * 3]
        ldr             r7,  [r2, r3,  lsl #1]  @ src[0,1,2,3 + stride * 4]
        ldr             r8,  [r2, r1]           @ src[0,1,2,3 + stride * 5]

        uxtb16          r9,  r5,  ror #8        @ src[3 + s*1] | src[1 + s*1]
        uxtb16          r10, r6,  ror #8        @ src[3 + s*3] | src[1 + s*3]
        uxtb16          r11, r7,  ror #8        @ src[3 + s*4] | src[1 + s*4]
        uxtb16          r12, r8,  ror #8        @ src[3 + s*5] | src[1 + s*5]
        uxtb16          r5,  r5                 @ src[2 + s*1] | src[0 + s*1]
        uxtb16          r6,  r6                 @ src[2 + s*3] | src[0 + s*3]
        uxtb16          r7,  r7                 @ src[2 + s*4] | src[0 + s*4]
        uxtb16          r8,  r8                 @ src[2 + s*5] | src[0 + s*5]
        pkhbt           r1,  r9,  r10, lsl #16  @ src[1 + s*3] | src[1 + s*1]
        pkhtb           r9,  r10, r9,  asr #16  @ src[3 + s*3] | src[3 + s*1]
        pkhbt           r10, r11, r12, lsl #16  @ src[1 + s*5] | src[1 + s*4]
        pkhtb           r11, r12, r11, asr #16  @ src[3 + s*5] | src[3 + s*4]
        pkhbt           r12, r5,  r6,  lsl #16  @ src[0 + s*3] | src[0 + s*1]
        pkhtb           r5,  r6,  r5,  asr #16  @ src[2 + s*3] | src[2 + s*1]
        pkhbt           r6,  r7,  r8,  lsl #16  @ src[0 + s*5] | src[0 + s*4]
        pkhtb           r7,  r8,  r7,  asr #16  @ src[2 + s*5] | src[2 + s*4]

        ldr             r8,  [lr, #4]
        mov             r3,  #0x40
        smlad           r12, r12, r8,  r3       @ filter[0][1]
        smlad           r1,  r1,  r8,  r3       @ filter[1][1]
        smlad           r5,  r5,  r8,  r3       @ filter[2][1]
        smlad           r9,  r9,  r8,  r3       @ filter[3][1]
        ldr             r8,  [lr, #8]
        ldr             r3,  [sp, #48]
        smlad           r12, r6,  r8,  r12      @ filter[0][2]
        smlad           r1,  r10, r8,  r1       @ filter[1][2]
        ldr_nreg        r6,  r2,  r3,  lsl #1   @ src[0,1,2,3 + stride * 0]
        ldr             r10, [r2], #4           @ src[0,1,2,3 + stride * 2]
        smlad           r5,  r7,  r8,  r5       @ filter[2][2]
        smlad           r9,  r11, r8,  r9       @ filter[3][2]

        uxtb16          r7,  r6,  ror #8        @ src[3 + s*0] | src[1 + s*0]
        uxtb16          r11, r10, ror #8        @ src[3 + s*2] | src[1 + s*2]
        uxtb16          r6,  r6                 @ src[2 + s*0] | src[0 + s*0]
        uxtb16          r10, r10                @ src[2 + s*2] | src[0 + s*2]

        pkhbt           r8,  r7,  r11, lsl #16  @ src[1 + s*2] | src[1 + s*0]
        pkhtb           r7,  r11, r7,  asr #16  @ src[3 + s*2] | src[3 + s*0]
        pkhbt           r11, r6,  r10, lsl #16  @ src[0 + s*2] | src[0 + s*0]
        pkhtb           r6,  r10, r6,  asr #16  @ src[2 + s*2] | src[2 + s*0]

        ldr             r10, [lr]
        subs            r4,  r4,  #4
        smlad           r12, r11, r10, r12      @ filter[0][0]
        smlad           r1,  r8,  r10, r1       @ filter[1][0]
        smlad           r5,  r6,  r10, r5       @ filter[2][0]
        smlad           r9,  r7,  r10, r9       @ filter[3][0]

        sat4            r12, r1,  r5,  r9
        str             r12, [r0], #4

        bne             1b

        ldrd            r4,  r5,  [sp, #40]
        ldr             r6,  [sp]
        subs            r5,  r5,  #1
        sub             r2,  r2,  r4
        str             r5,  [sp, #44]
        add             r0,  r0,  r6
        add             r2,  r2,  r3

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

function vp8_put_epel_h4_armv6
        push            {r1, r4-r11, lr}
        subs            r2,  r2,  #1
        movrel          lr,  fourtap_filters_1324 - 4
        add             lr,  lr,  r12, lsl #2
        sub             r3,  r3,  r4
        ldm             lr,  {r5, r6}
        ldr             lr,  [sp, #44]
1:
        ldr             r9,  [r2, #3]
        ldr             r8,  [r2, #2]
        ldr             r7,  [r2], #4

        uxtb16          r9,  r9,  ror #8        @ src[6] | src[4]
        uxtb16          r10, r8,  ror #8        @ src[5] | src[3]
        uxtb16          r8,  r8                 @ src[4] | src[2]
        uxtb16          r11, r7,  ror #8        @ src[3] | src[1]
        uxtb16          r7,  r7                 @ src[2] | src[0]

        mov             r12, #0x40
        smlad           r9,  r9,  r6,  r12      @ filter[3][1]
        smlad           r7,  r7,  r5,  r12      @ filter[0][0]
        smlad           r9,  r10, r5,  r9       @ filter[3][0]
        smlad           r10, r10, r6,  r12      @ filter[2][1]
        smlad           r12, r11, r5,  r12      @ filter[1][0]
        smlad           r7,  r11, r6,  r7       @ filter[0][1]
        smlad           r10, r8,  r5,  r10      @ filter[2][0]
        smlad           r12, r8,  r6,  r12      @ filter[1][1]

        subs            r4,  r4,  #4

        sat4            r7,  r12, r10, r9
        str             r7,  [r0], #4

        bne             1b

        subs            lr,  lr,  #1
        ldr             r4,  [sp, #40]
        add             r2,  r2,  r3
        add             r0,  r0,  r1

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

function vp8_put_epel_v4_armv6
        push            {r1, r4-r11, lr}
        movrel          lr,  fourtap_filters_1324 - 4
        add             lr,  lr,  r12, lsl #2
        ldm             lr,  {r5, r6}
        str             r3,  [sp, #48]
1:
        ldr             lr,  [r2, r3, lsl #1]
        ldr             r12, [r2, r3]
        ldr_nreg        r7,  r2,  r3
        ldr             r11, [r2], #4

        uxtb16          r8,  lr,  ror #8        @ src[3 + s*3] | src[1 + s*3]
        uxtb16          r9,  r12, ror #8        @ src[3 + s*2] | src[1 + s*2]
        uxtb16          r3,  r7,  ror #8        @ src[3 + s*0] | src[1 + s*0]
        uxtb16          r1,  r11, ror #8        @ src[3 + s*1] | src[1 + s*1]
        uxtb16          lr,  lr                 @ src[2 + s*3] | src[0 + s*3]
        uxtb16          r12, r12                @ src[2 + s*2] | src[0 + s*2]
        uxtb16          r7,  r7                 @ src[2 + s*0] | src[0 + s*0]
        uxtb16          r11, r11                @ src[2 + s*1] | src[0 + s*1]
        pkhbt           r10, r1,  r8,  lsl #16  @ src[1 + s*3] | src[1 + s*1]
        pkhtb           r1,  r8,  r1,  asr #16  @ src[3 + s*3] | src[3 + s*1]
        pkhbt           r8,  r3,  r9,  lsl #16  @ src[1 + s*2] | src[1 + s*0]
        pkhtb           r3,  r9,  r3,  asr #16  @ src[3 + s*2] | src[3 + s*0]
        pkhbt           r9,  r11, lr,  lsl #16  @ src[0 + s*3] | src[0 + s*1]
        pkhtb           r11, lr,  r11, asr #16  @ src[2 + s*3] | src[2 + s*1]
        pkhbt           lr,  r7,  r12, lsl #16  @ src[0 + s*2] | src[0 + s*0]
        pkhtb           r7,  r12, r7,  asr #16  @ src[2 + s*2] | src[2 + s*0]

        mov             r12, #0x40
        smlad           r9,  r9,  r6,  r12      @ filter[0][1]
        smlad           r10, r10, r6,  r12      @ filter[1][1]
        smlad           r11, r11, r6,  r12      @ filter[2][1]
        smlad           r1,  r1,  r6,  r12      @ filter[3][1]
        smlad           r9,  lr,  r5,  r9       @ filter[0][0]
        smlad           r10, r8,  r5,  r10      @ filter[1][0]
        smlad           r11, r7,  r5,  r11      @ filter[2][0]
        smlad           r1,  r3,  r5,  r1       @ filter[3][0]

        subs            r4,  r4,  #4
        ldr             r3,  [sp, #48]

        sat4            r9,  r10, r11, r1
        str             r9,  [r0], #4

        bne             1b

        ldr             r4,  [sp, #40]
        ldr             r12, [sp, #44]
        add             r2,  r2,  r3
        ldr             r9,  [sp, #0]
        subs            r12, r12, #1
        sub             r2,  r2,  r4
        str             r12, [sp, #44]
        add             r0,  r0,  r9

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc
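
@ (For reference: the two bilinear functions below pack their coefficients as
@  halfwords in r5: rsb + add #8 turns the filter index mx (or my) into
@  (mx << 16) | (8 - mx), so one smlad computes
@      (8 - mx) * src[x] + mx * src[x + 1] + 4
@  per pixel, which is then shifted right by 3.)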

function vp8_put_bilin_h_armv6
        push            {r1, r4-r11, lr}
        rsb             r5,  r12, r12, lsl #16
        ldr             r12, [sp, #44]
        sub             r3,  r3,  r4
        add             r5,  r5,  #8
1:
        ldrb            r6,  [r2], #1
        ldrb            r7,  [r2], #1
        ldrb            r8,  [r2], #1
        ldrb            r9,  [r2], #1
        ldrb            lr,  [r2]

        pkhbt           r6,  r6,  r7,  lsl #16  @ src[1] | src[0]
        pkhbt           r7,  r7,  r8,  lsl #16  @ src[2] | src[1]
        pkhbt           r8,  r8,  r9,  lsl #16  @ src[3] | src[2]
        pkhbt           r9,  r9,  lr,  lsl #16  @ src[4] | src[3]

        mov             r10, #4
        smlad           r6,  r6,  r5,  r10
        smlad           r7,  r7,  r5,  r10
        smlad           r8,  r8,  r5,  r10
        smlad           r9,  r9,  r5,  r10

        subs            r4,  r4,  #4

        asr             r6,  #3
        asr             r7,  #3
        pkhbt           r6,  r6,  r8,  lsl #13
        pkhbt           r7,  r7,  r9,  lsl #13
        orr             r6,  r6,  r7,  lsl #8
        str             r6,  [r0], #4

        bne             1b

        ldr             r4,  [sp, #40]
        subs            r12, r12, #1
        add             r2,  r2,  r3
        add             r0,  r0,  r1

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

function vp8_put_bilin_v_armv6
        push            {r1, r4-r11, lr}
        rsb             r5,  r12, r12, lsl #16
        ldr             r12, [sp, #44]
        add             r5,  r5,  #8
1:
        ldrb            r10, [r2, r3]
        ldrb            r6,  [r2], #1
        ldrb            r11, [r2, r3]
        ldrb            r7,  [r2], #1
        ldrb            lr,  [r2, r3]
        ldrb            r8,  [r2], #1
        ldrb            r9,  [r2, r3]
        pkhbt           r6,  r6,  r10, lsl #16
        ldrb            r10, [r2], #1
        pkhbt           r7,  r7,  r11, lsl #16
        pkhbt           r8,  r8,  lr,  lsl #16
        pkhbt           r9,  r10, r9,  lsl #16

        mov             r10, #4
        smlad           r6,  r6,  r5,  r10
        smlad           r7,  r7,  r5,  r10
        smlad           r8,  r8,  r5,  r10
        smlad           r9,  r9,  r5,  r10

        subs            r4,  r4,  #4

        asr             r6,  #3
        asr             r7,  #3
        pkhbt           r6,  r6,  r8,  lsl #13
        pkhbt           r7,  r7,  r9,  lsl #13
        orr             r6,  r6,  r7,  lsl #8
        str             r6,  [r0], #4

        bne             1b

        ldr             r4,  [sp, #40]
        subs            r12, r12, #1
        add             r2,  r2,  r3
        add             r0,  r0,  r1
        sub             r2,  r2,  r4

        bne             1b
        pop             {r1, r4-r11, pc}
endfunc