~ubuntu-branches/ubuntu/saucy/gst-libav1.0/saucy-proposed

Viewing changes to gst-libs/ext/libav/libavcodec/arm/h264cmc_neon.S

  • Committer: Package Import Robot
  • Author(s): Sebastian Dröge
  • Date: 2013-07-30 09:00:15 UTC
  • mfrom: (1.1.16) (7.1.7 experimental)
  • Revision ID: package-import@ubuntu.com-20130730090015-sc1ou2yssu7q5w4e
Tags: 1.1.3-1
* New upstream development snapshot:
  + debian/control:
    - Build depend on GStreamer and gst-plugins-base >= 1.1.3.

 old  new   (- = removed, + = added; blank separators mark unchanged lines not shown)

  18   18    * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19   19    */
  20   20
  21       - #include "asm.S"
       21  + #include "libavutil/arm/asm.S"
  22   22
  23   23   /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
  24   24   .macro  h264_chroma_mc8 type, codec=h264
  25   25   function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  26   26           push            {r4-r7, lr}
  27       -         ldrd            r4,  [sp, #20]
       27  +         ldrd            r4,  r5,  [sp, #20]
  28   28     .ifc \type,avg
  29   29           mov             lr,  r0
  30   30     .endif

  51   51
  52   52           beq             2f
  53   53
  54       -         add             r5,  r1,  r2
  55       -
  56   54           vdup.8          d0,  r4
  57       -         lsl             r4,  r2,  #1
  58   55           vdup.8          d1,  r12
  59       -         vld1.8          {d4, d5}, [r1], r4
       56  +         vld1.8          {d4, d5}, [r1], r2
  60   57           vdup.8          d2,  r6
  61       -         vld1.8          {d6, d7}, [r5], r4
  62   58           vdup.8          d3,  r7
  63       -
  64   59           vext.8          d5,  d4,  d5,  #1
  65       -         vext.8          d7,  d6,  d7,  #1
  66   60
  67       - 1:      pld             [r5]
       61  + 1:      vld1.8          {d6, d7}, [r1], r2
  68   62           vmull.u8        q8,  d4,  d0
  69   63           vmlal.u8        q8,  d5,  d1
  70       -         vld1.8          {d4, d5}, [r1], r4
       64  +         vext.8          d7,  d6,  d7,  #1
       65  +         vld1.8          {d4, d5}, [r1], r2
  71   66           vmlal.u8        q8,  d6,  d2
       67  +         pld             [r1]
  72   68           vext.8          d5,  d4,  d5,  #1
  73   69           vmlal.u8        q8,  d7,  d3
  74   70           vmull.u8        q9,  d6,  d0

  76   72           vmlal.u8        q9,  d7,  d1
  77   73           vmlal.u8        q9,  d4,  d2
  78   74           vmlal.u8        q9,  d5,  d3
  79       -         vld1.8          {d6, d7}, [r5], r4
  80       -         pld             [r1]
       75  +         pld             [r1, r2]
  81   76     .ifc \codec,h264
  82   77           vrshrn.u16      d16, q8,  #6
  83   78           vrshrn.u16      d17, q9,  #6

  92   87           vld1.8          {d21}, [lr,:64], r2
  93   88           vrhadd.u8       q8,  q8,  q10
  94   89     .endif
  95       -         vext.8          d7,  d6,  d7,  #1
  96   90           vst1.8          {d16}, [r0,:64], r2
  97   91           vst1.8          {d17}, [r0,:64], r2
  98   92           bgt             1b

 106  100
 107  101           beq             4f
 108  102
 109       -         add             r5,  r1,  r2
 110       -         lsl             r4,  r2,  #1
 111       -         vld1.8          {d4}, [r1], r4
 112       -         vld1.8          {d6}, [r5], r4
      103  +         vld1.8          {d4}, [r1], r2
 113  104
 114       - 3:      pld             [r5]
      105  + 3:      vld1.8          {d6}, [r1], r2
 115  106           vmull.u8        q8,  d4,  d0
 116  107           vmlal.u8        q8,  d6,  d1
 117       -         vld1.8          {d4}, [r1], r4
      108  +         vld1.8          {d4}, [r1], r2
 118  109           vmull.u8        q9,  d6,  d0
 119  110           vmlal.u8        q9,  d4,  d1
 120       -         vld1.8          {d6}, [r5], r4
      111  +         pld             [r1]
 121  112     .ifc \codec,h264
 122  113           vrshrn.u16      d16, q8,  #6
 123  114           vrshrn.u16      d17, q9,  #6

 127  118           vshrn.u16       d16, q8,  #6
 128  119           vshrn.u16       d17, q9,  #6
 129  120     .endif
      121  +         pld             [r1, r2]
 130  122     .ifc \type,avg
 131  123           vld1.8          {d20}, [lr,:64], r2
 132  124           vld1.8          {d21}, [lr,:64], r2
 133  125           vrhadd.u8       q8,  q8,  q10
 134  126     .endif
 135  127           subs            r3,  r3,  #2
 136       -         pld             [r1]
 137  128           vst1.8          {d16}, [r0,:64], r2
 138  129           vst1.8          {d17}, [r0,:64], r2
 139  130           bgt             3b

 144  135           vld1.8          {d6, d7}, [r1], r2
 145  136           vext.8          d5,  d4,  d5,  #1
 146  137           vext.8          d7,  d6,  d7,  #1
 147       -
 148       - 5:      pld             [r1]
      138  +         pld             [r1]
 149  139           subs            r3,  r3,  #2
 150  140           vmull.u8        q8,  d4,  d0
 151  141           vmlal.u8        q8,  d5,  d1
 152       -         vld1.8          {d4, d5}, [r1], r2
 153  142           vmull.u8        q9,  d6,  d0
 154  143           vmlal.u8        q9,  d7,  d1
 155       -         pld             [r1]
 156       -         vext.8          d5,  d4,  d5,  #1
      144  +         pld             [r1, r2]
 157  145     .ifc \codec,h264
 158  146           vrshrn.u16      d16, q8,  #6
 159  147           vrshrn.u16      d17, q9,  #6

 168  156           vld1.8          {d21}, [lr,:64], r2
 169  157           vrhadd.u8       q8,  q8,  q10
 170  158     .endif
 171       -         vld1.8          {d6, d7}, [r1], r2
 172       -         vext.8          d7,  d6,  d7,  #1
 173  159           vst1.8          {d16}, [r0,:64], r2
 174  160           vst1.8          {d17}, [r0,:64], r2
 175       -         bgt             5b
      161  +         bgt             4b
 176  162
 177  163           pop             {r4-r7, pc}
 178  164   endfunc

 182  168   .macro  h264_chroma_mc4 type, codec=h264
 183  169   function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
 184  170           push            {r4-r7, lr}
 185       -         ldrd            r4,  [sp, #20]
      171  +         ldrd            r4,  r5,  [sp, #20]
 186  172     .ifc \type,avg
 187  173           mov             lr,  r0
 188  174     .endif

 209  195
 210  196           beq             2f
 211  197
 212       -         add             r5,  r1,  r2
 213       -
 214  198           vdup.8          d0,  r4
 215       -         lsl             r4,  r2,  #1
 216  199           vdup.8          d1,  r12
 217       -         vld1.8          {d4},     [r1], r4
      200  +         vld1.8          {d4},     [r1], r2
 218  201           vdup.8          d2,  r6
 219       -         vld1.8          {d6},     [r5], r4
 220  202           vdup.8          d3,  r7
 221  203
 222  204           vext.8          d5,  d4,  d5,  #1
 223       -         vext.8          d7,  d6,  d7,  #1
 224  205           vtrn.32         d4,  d5
 225       -         vtrn.32         d6,  d7
 226  206
 227  207           vtrn.32         d0,  d1
 228  208           vtrn.32         d2,  d3
 229  209
 230       - 1:      pld             [r5]
      210  + 1:      vld1.8          {d6},     [r1], r2
      211  +         vext.8          d7,  d6,  d7,  #1
      212  +         vtrn.32         d6,  d7
 231  213           vmull.u8        q8,  d4,  d0
 232  214           vmlal.u8        q8,  d6,  d2
 233       -         vld1.8          {d4},     [r1], r4
      215  +         vld1.8          {d4},     [r1], r2
 234  216           vext.8          d5,  d4,  d5,  #1
 235  217           vtrn.32         d4,  d5
      218  +         pld             [r1]
 236  219           vmull.u8        q9,  d6,  d0
 237  220           vmlal.u8        q9,  d4,  d2
 238       -         vld1.8          {d6},     [r5], r4
 239  221           vadd.i16        d16, d16, d17
 240  222           vadd.i16        d17, d18, d19
 241  223     .ifc \codec,h264

 245  227           vshrn.u16       d16, q8,  #6
 246  228     .endif
 247  229           subs            r3,  r3,  #2
 248       -         pld             [r1]
      230  +         pld             [r1, r2]
 249  231     .ifc \type,avg
 250  232           vld1.32         {d20[0]}, [lr,:32], r2
 251  233           vld1.32         {d20[1]}, [lr,:32], r2
 252  234           vrhadd.u8       d16, d16, d20
 253  235     .endif
 254       -         vext.8          d7,  d6,  d7,  #1
 255       -         vtrn.32         d6,  d7
 256  236           vst1.32         {d16[0]}, [r0,:32], r2
 257  237           vst1.32         {d16[1]}, [r0,:32], r2
 258  238           bgt             1b

 268  248           beq             4f
 269  249
 270  250           vext.32         d1,  d0,  d1,  #1
 271       -         add             r5,  r1,  r2
 272       -         lsl             r4,  r2,  #1
 273       -         vld1.32         {d4[0]},  [r1], r4
 274       -         vld1.32         {d4[1]},  [r5], r4
      251  +         vld1.32         {d4[0]},  [r1], r2
 275  252
 276       - 3:      pld             [r5]
      253  + 3:      vld1.32         {d4[1]},  [r1], r2
 277  254           vmull.u8        q8,  d4,  d0
 278       -         vld1.32         {d4[0]},  [r1], r4
      255  +         vld1.32         {d4[0]},  [r1], r2
 279  256           vmull.u8        q9,  d4,  d1
 280       -         vld1.32         {d4[1]},  [r5], r4
 281  257           vadd.i16        d16, d16, d17
 282  258           vadd.i16        d17, d18, d19
      259  +         pld             [r1]
 283  260     .ifc \codec,h264
 284  261           vrshrn.u16      d16, q8,  #6
 285  262     .else

 292  269           vrhadd.u8       d16, d16, d20
 293  270     .endif
 294  271           subs            r3,  r3,  #2
 295       -         pld             [r1]
      272  +         pld             [r1, r2]
 296  273           vst1.32         {d16[0]}, [r0,:32], r2
 297  274           vst1.32         {d16[1]}, [r0,:32], r2
 298  275           bgt             3b

 305  282           vext.8          d7,  d6,  d7,  #1
 306  283           vtrn.32         d4,  d5
 307  284           vtrn.32         d6,  d7
 308       -
 309       - 5:      vmull.u8        q8,  d4,  d0
      285  +         vmull.u8        q8,  d4,  d0
 310  286           vmull.u8        q9,  d6,  d0
 311  287           subs            r3,  r3,  #2
 312       -         vld1.8          {d4},     [r1], r2
 313       -         vext.8          d5,  d4,  d5,  #1
 314       -         vtrn.32         d4,  d5
 315  288           vadd.i16        d16, d16, d17
 316  289           vadd.i16        d17, d18, d19
 317  290           pld             [r1]

 326  299           vld1.32         {d20[1]}, [lr,:32], r2
 327  300           vrhadd.u8       d16, d16, d20
 328  301     .endif
 329       -         vld1.8          {d6},     [r1], r2
 330       -         vext.8          d7,  d6,  d7,  #1
 331       -         vtrn.32         d6,  d7
 332  302           pld             [r1]
 333  303           vst1.32         {d16[0]}, [r0,:32], r2
 334  304           vst1.32         {d16[1]}, [r0,:32], r2
 335       -         bgt             5b
      305  +         bgt             4b
 336  306
 337  307           pop             {r4-r7, pc}
 338  308   endfunc
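
The upstream change visible in these hunks drops the second source pointer: the old code walked two rows in parallel through r1 and r5 with a doubled stride in r4 (the removed "add r5, r1, r2" / "lsl r4, r2, #1" pairs), while the new code loads both rows through r1 with the plain r2 stride and interleaves the loads with the multiply-accumulates, moving the pld prefetch hints to [r1] and [r1, r2]. The include also switches to the shared libavutil/arm/asm.S. The arithmetic itself is untouched: it is the standard H.264 eighth-sample bilinear chroma interpolation, with the four weights broadcast into d0-d3 (the vdup.8 lines) and, on the h264 codec path, a rounding shift by 6 (vrshrn.u16 #6). As a reference for what the NEON loops compute, here is a minimal scalar sketch of the put mc8 case; the helper name and loop shape are illustrative, not the project's actual C fallback, and the NEON version additionally branches to cheaper loops when x or y is zero (the "beq 2f" / "beq 4f" paths):

    #include <stdint.h>

    /* x and y are the 1/8-pel fractional offsets (0..7).  The four
     * weights below are what the NEON code broadcasts into d0-d3. */
    static void put_h264_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                        int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);  /* d0 */
        const int B = x       * (8 - y);  /* d1 */
        const int C = (8 - x) * y;        /* d2 */
        const int D = x       * y;        /* d3 */

        for (int row = 0; row < h; row++) {
            for (int col = 0; col < 8; col++) {
                /* vmull/vmlal accumulate the four taps; the weights sum
                 * to 64, so vrshrn.u16 #6 is this rounding shift. */
                dst[col] = (A * src[col]          + B * src[col + 1] +
                            C * src[col + stride] + D * src[col + stride + 1] +
                            32) >> 6;
            }
            src += stride;
            dst += stride;
        }
    }

The avg variant (".ifc \type,avg") additionally averages the result with what is already in dst, rounding up, which is what the vrhadd.u8 against q10/d20 implements; the mc4 macro performs the same math on 4-pixel rows, packing two rows into a single d register with the vtrn.32 shuffles.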