 * dct_block_mmx_x86_64.S
 *
 * Copyright (C) Peter Schlaile - February 2001
 *
 * This file is part of libdv, a free DV (IEC 61834/SMPTE 314M) codec.
 *
 * libdv is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1, or (at your
 * option) any later version.
 *
 * libdv is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with libdv; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * The libdv homepage is http://libdv.sourceforge.net/.

# rdi - input and output data pointer
# the input data: each 16-bit element in the 8x8 matrix is left aligned
# the output data is transposed, and each 16-bit element in the 8x8 matrix is left aligned,
# e.g. in 11...1110000 format

# israelh. 11/11/97 removed emms. moved to stub

# MMX implementation. Using MMX transpose
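
# "left aligned" means the significant bits sit at the top of each 16-bit
# word, with the low bits zero. Illustrative sketch only: with 4 bits of
# alignment (an assumed width; the actual shift is chosen by libdv's C-side
# setup, which is not part of this file), the value 3 would be stored as
# 0000000000110000b and -3 as 1111111111010000b.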

#define YUV_PRECISION 1

#define WA4_SHIFT (NSHIFT-1)
#define WA5_SHIFT (NSHIFT+1)

WA1:	.word 23171,23171,23171,23171	/* 0.70711 * 32768 */
WA2:	.word 17734,17734,17734,17734	/* 0.54120 * 32768 */
WA3:	.word 23171,23171,23171,23171	/* 0.70711 * 32768 */
WA4:	.word 21407,21407,21407,21407	/* 1.30658 * 16384 */
WA5:	.word 25079,25079,25079,25079	/* 0.38268 * 65536 */
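
# Fixed-point note (explanatory comment, not from the original source):
# pmulhw keeps the high 16 bits of the signed 16x16 product, i.e. (a*b)>>16.
# With WA1 = 0.70711*2^15, "pmulhw WA1" therefore yields x*0.70711/2, and the
# "psllw $16-NSHIFT" that follows restores the scale (NSHIFT is defined in an
# elided line; $16-NSHIFT must match the literal "psllw $1" used below, so
# NSHIFT is presumably 15). WA4 is scaled by 2^14, hence WA4_SHIFT = NSHIFT-1
# and a 2-bit compensation; WA5 is scaled by 2^16, hence WA5_SHIFT = NSHIFT+1
# and no compensating shift at all.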

.section .note.GNU-stack, "", @progbits

.global _dv_dct_88_block_mmx_x86_64
.hidden _dv_dct_88_block_mmx_x86_64
.type _dv_dct_88_block_mmx_x86_64,@function
_dv_dct_88_block_mmx_x86_64:

/* void _dv_dct_88_block_mmx_x86_64(int16_t* block); */

/* argument is block=rdi */
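
/* Illustrative call sequence (a sketch; the real driver and postscale-table
   setup live in libdv's C code, and "postscale_table" is an assumed name):

       int16_t block[64];                       // 8x8 samples, row-major
       _dv_dct_88_block_mmx_x86_64(block);      // forward DCT, in place
       _dv_dct_block_mmx_x86_64_postscale_88(block, postscale_table);

   Per the note above ("removed emms. moved to stub"), the caller-side stub
   is responsible for executing emms after these MMX routines. */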

	mov %rdi, %r11		# block

	movq 16*0(%r11), %mm0	# v0
	movq 16*7(%r11), %mm1	# v7
	movq %mm0, %mm2		# duplicate v0
	paddw %mm1, %mm0	# v00: v0+v7
	psubw %mm1, %mm2	# v07: v0-v7

	movq 16*1(%r11), %mm1	# v1
	movq 16*6(%r11), %mm3	# v6
	movq %mm1, %mm4		# duplicate v1
	paddw %mm3, %mm1	# v01: v1+v6
	psubw %mm3, %mm4	# v06: v1-v6

	movq 16*2(%r11), %mm3	# v2
	movq 16*5(%r11), %mm5	# v5
	movq %mm3, %mm6		# duplicate v2
	paddw %mm5, %mm3	# v02: v2+v5
	psubw %mm5, %mm6	# v05: v2-v5

	movq 16*3(%r11), %mm5	# v3
	movq 16*4(%r11), %mm7	# v4
	movq %mm7, scratch1(%rip)	# scratch1: v4
	movq %mm5, %mm7		# duplicate v3
	paddw scratch1(%rip), %mm5	# v03: v3+v4
	psubw scratch1(%rip), %mm7	# v04: v3-v4
	movq %mm5, scratch2(%rip)	# scratch2: v03
	movq %mm0, %mm5		# mm5: v00

	paddw scratch2(%rip), %mm0	# v10: v00+v03
	psubw scratch2(%rip), %mm5	# v13: v00-v03
	movq %mm3, scratch3(%rip)	# scratch3: v02
	movq %mm1, %mm3		# duplicate v01

	paddw scratch3(%rip), %mm1	# v11: v01+v02
	psubw scratch3(%rip), %mm3	# v12: v01-v02

	movq %mm6, scratch4(%rip)	# scratch4: v05
	movq %mm0, %mm6		# duplicate v10

	paddw %mm1, %mm0	# v10+v11
	psubw %mm1, %mm6	# v10-v11

	movq %mm0, (%r11)	# out0: v10+v11
	movq %mm6, 16*4(%r11)	# out4: v10-v11

	movq %mm4, %mm0		# mm0: v06
	paddw scratch4(%rip), %mm4	# v15: v05+v06
	paddw %mm2, %mm0	# v16: v07+v06

	pmulhw WA3(%rip), %mm4	# v35~: WA3*v15
	psllw $1, %mm4		# v35: compensate the coefficient scale

	movq %mm4, %mm6		# duplicate v35
	paddw %mm2, %mm4	# v45: v07+v35
	psubw %mm6, %mm2	# v47: v07-v35

	paddw %mm5, %mm3	# v22: v12+v13

	pmulhw WA1(%rip), %mm3	# v32~: WA1*v22
	psllw $16-NSHIFT, %mm3	# v32: compensate the coefficient scale
	movq %mm5, %mm6		# duplicate v13

	paddw %mm3, %mm5	# v13+v32
	psubw %mm3, %mm6	# v13-v32

	movq %mm5, 16*2(%r11)	# out2: v13+v32
	movq %mm6, 16*6(%r11)	# out6: v13-v32

	paddw scratch4(%rip), %mm7	# v14n: v04+v05
	movq %mm0, %mm5		# duplicate v16

	psubw %mm7, %mm0	# va1: v16-v14n
	pmulhw WA5(%rip), %mm0	# va0~: va1*WA5
	pmulhw WA4(%rip), %mm5	# v36~~: v16*WA4
	pmulhw WA2(%rip), %mm7	# v34~~: v14n*WA2
	psllw $16-WA4_SHIFT, %mm5	# v36: compensate the coefficient scale
	psllw $16-NSHIFT, %mm7	# v34: compensate the coefficient scale

	psubw %mm0, %mm5	# v36~: v36~~-va0~
	psubw %mm0, %mm7	# v34~: v34~~-va0~

	movq %mm4, %mm0		# duplicate v45
	paddw %mm5, %mm4	# v45+v36
	psubw %mm5, %mm0	# v45-v36

	movq %mm4, 16*1(%r11)	# out1: v45+v36
	movq %mm0, 16*7(%r11)	# out7: v45-v36

	movq %mm2, %mm5		# duplicate v47
	paddw %mm7, %mm2	# v47+v34
	psubw %mm7, %mm5	# v47-v34

	movq %mm2, 16*5(%r11)	# out5: v47+v34
	movq %mm5, 16*3(%r11)	# out3: v47-v34
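
# Summary of the butterfly above (explanatory comment, restated from the
# per-instruction annotations; W1=W3=0.70711, W2=0.54120, W4=1.30658,
# W5=0.38268):
#   out0 = (v0+v7)+(v3+v4) + (v1+v6)+(v2+v5)
#   out4 = (v0+v7)+(v3+v4) - (v1+v6)-(v2+v5)
#   out2 = v13 + W1*(v12+v13),  out6 = v13 - W1*(v12+v13)
#   out1 = v45 + v36,           out7 = v45 - v36
#   out5 = v47 + v34,           out3 = v47 - v34
# where v13 = (v0+v7)-(v3+v4), v12 = (v1+v6)-(v2+v5),
# v45/v47 = v07 +/- W3*v15, v36 = W4*v16 - W5*va1, v34 = W2*v14n - W5*va1.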

	add $8, %r11		# point to the next 4 columns;
				# this could be done by adding 8 to the
				# immediates below, but this is nicer

	movq (%r11), %mm0	# v0
	movq 16*7(%r11), %mm1	# v7
	movq %mm0, %mm2		# duplicate v0
	paddw %mm1, %mm0	# v00: v0+v7
	psubw %mm1, %mm2	# v07: v0-v7

	movq 16(%r11), %mm1	# v1
	movq 16*6(%r11), %mm3	# v6
	movq %mm1, %mm4		# duplicate v1
	paddw %mm3, %mm1	# v01: v1+v6
	psubw %mm3, %mm4	# v06: v1-v6

	movq 16*2(%r11), %mm3	# v2
	movq 16*5(%r11), %mm5	# v5
	movq %mm3, %mm6		# duplicate v2
	paddw %mm5, %mm3	# v02: v2+v5
	psubw %mm5, %mm6	# v05: v2-v5

	movq 16*3(%r11), %mm5	# v3
	movq 16*4(%r11), %mm7	# v4
	movq %mm7, scratch1(%rip)	# scratch1: v4
	movq %mm5, %mm7		# duplicate v3
	paddw scratch1(%rip), %mm5	# v03: v3+v4
	psubw scratch1(%rip), %mm7	# v04: v3-v4
	movq %mm5, scratch2(%rip)	# scratch2: v03
	movq %mm0, %mm5		# mm5: v00

	paddw scratch2(%rip), %mm0	# v10: v00+v03
	psubw scratch2(%rip), %mm5	# v13: v00-v03
	movq %mm3, scratch3(%rip)	# scratch3: v02
	movq %mm1, %mm3		# duplicate v01

	paddw scratch3(%rip), %mm1	# v11: v01+v02
	psubw scratch3(%rip), %mm3	# v12: v01-v02

	movq %mm6, scratch4(%rip)	# scratch4: v05
	movq %mm0, %mm6		# duplicate v10

	paddw %mm1, %mm0	# v10+v11
	psubw %mm1, %mm6	# v10-v11

	movq %mm0, (%r11)	# out0: v10+v11
	movq %mm6, 16*4(%r11)	# out4: v10-v11

	movq %mm4, %mm0		# mm0: v06
	paddw scratch4(%rip), %mm4	# v15: v05+v06
	paddw %mm2, %mm0	# v16: v07+v06

	pmulhw WA3(%rip), %mm4	# v35~: WA3*v15
	psllw $16-NSHIFT, %mm4	# v35: compensate the coefficient scale

	movq %mm4, %mm6		# duplicate v35
	paddw %mm2, %mm4	# v45: v07+v35
	psubw %mm6, %mm2	# v47: v07-v35

	paddw %mm5, %mm3	# v22: v12+v13

	pmulhw WA1(%rip), %mm3	# v32~: WA1*v22
	psllw $16-NSHIFT, %mm3	# v32: compensate the coefficient scale
	movq %mm5, %mm6		# duplicate v13

	paddw %mm3, %mm5	# v13+v32
	psubw %mm3, %mm6	# v13-v32

	movq %mm5, 16*2(%r11)	# out2: v13+v32
	movq %mm6, 16*6(%r11)	# out6: v13-v32

	paddw scratch4(%rip), %mm7	# v14n: v04+v05
	movq %mm0, %mm5		# duplicate v16

	psubw %mm7, %mm0	# va1: v16-v14n
	pmulhw WA2(%rip), %mm7	# v34~~: v14n*WA2
	pmulhw WA5(%rip), %mm0	# va0~: va1*WA5
	pmulhw WA4(%rip), %mm5	# v36~~: v16*WA4
	psllw $16-NSHIFT, %mm7	# v34: compensate the coefficient scale
	psllw $16-WA4_SHIFT, %mm5	# v36: compensate the coefficient scale;
				# note that WA4 is shifted 1 bit less than the others

	psubw %mm0, %mm5	# v36~: v36~~-va0~
	psubw %mm0, %mm7	# v34~: v34~~-va0~

	movq %mm4, %mm0		# duplicate v45
	paddw %mm5, %mm4	# v45+v36
	psubw %mm5, %mm0	# v45-v36

	movq %mm4, 16*1(%r11)	# out1: v45+v36
	movq %mm0, 16*7(%r11)	# out7: v45-v36

	movq %mm2, %mm5		# duplicate v47
	paddw %mm7, %mm2	# v47+v34
	psubw %mm7, %mm5	# v47-v34

	movq %mm2, 16*5(%r11)	# out5: v47+v34
	movq %mm5, 16*3(%r11)	# out3: v47-v34

/* do mmx postscaling and reordering... */

.global _dv_dct_block_mmx_x86_64_postscale_88
.hidden _dv_dct_block_mmx_x86_64_postscale_88
.type _dv_dct_block_mmx_x86_64_postscale_88,@function
_dv_dct_block_mmx_x86_64_postscale_88:

/* void _dv_dct_block_mmx_x86_64_postscale_88(int16_t* block, int16_t* postscale_matrix); */

/* arguments are block=rdi, postscale=rsi */
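
/* Elementwise, each quadword group below computes (a sketch of the visible
   pmulhw + psraw pattern; large stretches of this routine are elided):

       block[i] = (int16_t)(((int32_t)block[i] * postscale[i]) >> 16)
                  >> YUV_PRECISION;

   followed by the reordering pass that round-trips the block through the
   stack space reserved just below. */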

	mov %rdi,%r11		# block matrix
	mov %rsi,%r12		# postscale matrix

	sub $128+2, %rsp	# reserve some stack space in the redzone, plus 2 bytes

	pmulhw 0*8(%r12), %mm0
	pmulhw 1*8(%r12), %mm1
	pmulhw 2*8(%r12), %mm2
	pmulhw 3*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	pmulhw 4*8(%r12), %mm0
	pmulhw 5*8(%r12), %mm1
	pmulhw 6*8(%r12), %mm2
	pmulhw 7*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	movq 10*8(%r11), %mm2
	movq 11*8(%r11), %mm3

	pmulhw 8*8(%r12), %mm0
	pmulhw 9*8(%r12), %mm1
	pmulhw 10*8(%r12), %mm2
	pmulhw 11*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	movq %mm2, 10*8(%r11)
	movq %mm3, 11*8(%r11)

	movq 12*8(%r11), %mm0
	movq 13*8(%r11), %mm1
	movq 14*8(%r11), %mm2
	movq 15*8(%r11), %mm3

	pmulhw 12*8(%r12), %mm0
	pmulhw 13*8(%r12), %mm1
	pmulhw 14*8(%r12), %mm2
	pmulhw 15*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	movq %mm0, 12*8(%r11)
	movq %mm1, 13*8(%r11)
	movq %mm2, 14*8(%r11)
	movq %mm3, 15*8(%r11)

	movl 62*2(%r11), %ebx
	movl %ebx, 63*2(%rsp)

	add $2, %rsp		/* give back 2 bytes of stack space */

	movq 32+8(%rsp), %mm1
	movq 32+16(%rsp), %mm2
	movq 32+24(%rsp), %mm3

	movq %mm1, 32+8(%r11)
	movq %mm2, 32+16(%r11)
	movq %mm3, 32+24(%r11)

	movq 64+8(%rsp), %mm1
	movq 64+16(%rsp), %mm2
	movq 64+24(%rsp), %mm3

	movq %mm1, 64+8(%r11)
	movq %mm2, 64+16(%r11)
	movq %mm3, 64+24(%r11)

	movq 96+8(%rsp), %mm1
	movq 96+16(%rsp), %mm2
	movq 96+24(%rsp), %mm3

	add $128, %rsp		/* restore the stack pointer */

.global _dv_dct_248_block_mmx_x86_64
.hidden _dv_dct_248_block_mmx_x86_64
.type _dv_dct_248_block_mmx_x86_64,@function
_dv_dct_248_block_mmx_x86_64:

/* void _dv_dct_248_block_mmx_x86_64(int16_t* block); */

/* argument is block=rdi */

	mov %rdi, %r11		# source
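
# Each of the four passes below applies the same 4-point butterfly to one
# half (rows 0/2/4/6 relative to the current base) of the interlaced block.
# Restated from the per-instruction annotations, with W1 = 0.70711:
#   out row 0: v20 = (v00+v03) + (v01+v02)
#   out row 4: v21 = (v00+v03) - (v01+v02)
#   out row 2: v42 = v13 + W1*(v12+v13)
#   out row 6: v43 = v13 - W1*(v12+v13)
# where v12 = v01-v02 and v13 = v00-v03.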

	movq 16*0(%r11), %mm0	# v00
	movq 16*2(%r11), %mm1	# v01
	movq 16*4(%r11), %mm2	# v02
	movq 16*6(%r11), %mm3	# v03

	movq %mm0, %mm4		# v00
	movq %mm1, %mm5		# v01

	paddw %mm3, %mm0	# v10: v00+v03
	psubw %mm3, %mm4	# v13: v00-v03

	paddw %mm2, %mm1	# v11: v01+v02
	psubw %mm2, %mm5	# v12: v01-v02

	movq %mm0, %mm3		# v10

	paddw %mm4, %mm5	# v22: v12+v13
	paddw %mm1, %mm0	# v20: v10+v11
	psubw %mm1, %mm3	# v21: v10-v11

	pmulhw WA1(%rip), %mm5	# v32~: WA1*v22

	psllw $16-NSHIFT, %mm5	# v32: compensate the coefficient scale

	movq %mm0, 16*0(%r11)

	psubw %mm5, %mm2	# v43: v13-v32
	paddw %mm4, %mm5	# v42: v32+v13

	movq %mm2, 16*6(%r11)
	movq %mm3, 16*4(%r11)
	movq %mm5, 16*2(%r11)

	add $8, %r11		# point to the next 4 columns;
				# this could be done by adding 8 to the immediates

	movq 16*0(%r11), %mm0	# v00
	movq 16*2(%r11), %mm1	# v01
	movq 16*4(%r11), %mm2	# v02
	movq 16*6(%r11), %mm3	# v03

	paddw %mm3, %mm0	# v10: v00+v03
	psubw %mm3, %mm4	# v13: v00-v03

	paddw %mm2, %mm1	# v11: v01+v02
	psubw %mm2, %mm5	# v12: v01-v02

	paddw %mm4, %mm5	# v22: v12+v13
	paddw %mm1, %mm0	# v20: v10+v11
	psubw %mm1, %mm3	# v21: v10-v11

	pmulhw WA1(%rip), %mm5	# v32~: WA1*v22

	psllw $16-NSHIFT, %mm5	# v32: compensate the coefficient scale

	movq %mm0, 16*0(%r11)

	psubw %mm5, %mm2	# v43: v13-v32
	paddw %mm4, %mm5	# v42: v32+v13

	movq %mm2, 16*6(%r11)
	movq %mm3, 16*4(%r11)
	movq %mm5, 16*2(%r11)

	# column 0 ... second line

	movq 16*0(%r11), %mm0	# v00
	movq 16*2(%r11), %mm1	# v01
	movq 16*4(%r11), %mm2	# v02
	movq 16*6(%r11), %mm3	# v03

	paddw %mm3, %mm0	# v10: v00+v03
	psubw %mm3, %mm4	# v13: v00-v03

	paddw %mm2, %mm1	# v11: v01+v02
	psubw %mm2, %mm5	# v12: v01-v02

	paddw %mm4, %mm5	# v22: v12+v13
	paddw %mm1, %mm0	# v20: v10+v11
	psubw %mm1, %mm3	# v21: v10-v11

	pmulhw WA1(%rip), %mm5	# v32~: WA1*v22

	psllw $16-NSHIFT, %mm5	# v32: compensate the coefficient scale

	movq %mm0, 16*0(%r11)

	psubw %mm5, %mm2	# v43: v13-v32
	paddw %mm4, %mm5	# v42: v32+v13

	movq %mm2, 16*6(%r11)
	movq %mm3, 16*4(%r11)
	movq %mm5, 16*2(%r11)

	# column 1 ... second line

	movq 16*0(%r11), %mm0	# v00
	movq 16*2(%r11), %mm1	# v01
	movq 16*4(%r11), %mm2	# v02
	movq 16*6(%r11), %mm3	# v03

	paddw %mm3, %mm0	# v10: v00+v03
	psubw %mm3, %mm4	# v13: v00-v03

	paddw %mm2, %mm1	# v11: v01+v02
	psubw %mm2, %mm5	# v12: v01-v02

	paddw %mm4, %mm5	# v22: v12+v13
	paddw %mm1, %mm0	# v20: v10+v11
	psubw %mm1, %mm3	# v21: v10-v11

	pmulhw WA1(%rip), %mm5	# v32~: WA1*v22

	psllw $16-NSHIFT, %mm5	# v32: compensate the coefficient scale

	movq %mm0, 16*0(%r11)

	psubw %mm5, %mm2	# v43: v13-v32
	paddw %mm4, %mm5	# v42: v32+v13

	movq %mm2, 16*6(%r11)
	movq %mm3, 16*4(%r11)
	movq %mm5, 16*2(%r11)

.global _dv_dct_248_block_mmx_x86_64_post_sum
.hidden _dv_dct_248_block_mmx_x86_64_post_sum
.type _dv_dct_248_block_mmx_x86_64_post_sum,@function
_dv_dct_248_block_mmx_x86_64_post_sum:

/* void _dv_dct_248_block_mmx_x86_64_post_sum(int16_t* out_block); */

/* argument is block=rdi */
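
# Explanatory note (several steps of this routine are elided below): the
# 2-4-8 DCT leaves the two field transforms interleaved in the block; each
# half-pass here loads eight rows, forms sums and differences of paired
# rows (e.g. row0+row1 and row6-row7 in the visible instructions), and
# writes them back in the output row order.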

	mov %rdi, %r11		# source

	movq 16*0(%r11), %mm0
	movq 16*1(%r11), %mm1
	movq 16*2(%r11), %mm2
	movq 16*3(%r11), %mm3
	movq 16*4(%r11), %mm4
	movq 16*5(%r11), %mm5
	movq 16*6(%r11), %mm6
	movq 16*7(%r11), %mm7

	paddw 16*0(%r11), %mm1
	movq %mm0, 16*4(%r11)
	movq %mm1, 16*0(%r11)

	movq %mm0, 16*1(%r11)
	movq %mm2, 16*5(%r11)
	movq %mm1, 16*2(%r11)
	movq %mm4, 16*6(%r11)

	psubw 16*7(%r11), %mm6

	movq %mm7, 16*3(%r11)
	movq %mm6, 16*7(%r11)

	movq 16*0(%r11), %mm0
	movq 16*1(%r11), %mm1
	movq 16*2(%r11), %mm2
	movq 16*3(%r11), %mm3
	movq 16*4(%r11), %mm4
	movq 16*5(%r11), %mm5
	movq 16*6(%r11), %mm6
	movq 16*7(%r11), %mm7

	paddw 16*0(%r11), %mm1
	movq %mm0, 16*4(%r11)
	movq %mm1, 16*0(%r11)

	movq %mm0, 16*1(%r11)
	movq %mm2, 16*5(%r11)
	movq %mm1, 16*2(%r11)
	movq %mm4, 16*6(%r11)

	psubw 16*7(%r11), %mm6

	movq %mm7, 16*3(%r11)
	movq %mm6, 16*7(%r11)

.global _dv_dct_block_mmx_x86_64_postscale_248
.hidden _dv_dct_block_mmx_x86_64_postscale_248
.type _dv_dct_block_mmx_x86_64_postscale_248,@function
_dv_dct_block_mmx_x86_64_postscale_248:

/* void _dv_dct_block_mmx_x86_64_postscale_248(int16_t* block, int16_t* postscale_matrix); */

/* arguments are block=rdi, postscale=rsi */
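
/* Same per-element scaling as the 8x8 postscale above,

       block[i] = (int16_t)(((int32_t)block[i] * postscale[i]) >> 16)
                  >> YUV_PRECISION;

   but with no reordering pass: the 64 coefficients are processed in 16
   load/multiply/shift/store groups of one quadword each. */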

	mov %rdi,%r11		# block matrix
	mov %rsi,%r12		# postscale matrix

	movq 0*8(%r11), %mm0
	movq 1*8(%r11), %mm1
	movq 2*8(%r11), %mm2
	movq 3*8(%r11), %mm3

	pmulhw 0*8(%r12), %mm0
	pmulhw 1*8(%r12), %mm1
	pmulhw 2*8(%r12), %mm2
	pmulhw 3*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	movq %mm0, 0*8(%r11)
	movq %mm1, 1*8(%r11)
	movq %mm2, 2*8(%r11)
	movq %mm3, 3*8(%r11)

	movq 4*8(%r11), %mm0
	movq 5*8(%r11), %mm1
	movq 6*8(%r11), %mm2
	movq 7*8(%r11), %mm3

	pmulhw 4*8(%r12), %mm0
	pmulhw 5*8(%r12), %mm1
	pmulhw 6*8(%r12), %mm2
	pmulhw 7*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	movq %mm0, 4*8(%r11)
	movq %mm1, 5*8(%r11)
	movq %mm2, 6*8(%r11)
	movq %mm3, 7*8(%r11)

	movq 8*8(%r11), %mm0
	movq 9*8(%r11), %mm1
	movq 10*8(%r11), %mm2
	movq 11*8(%r11), %mm3

	pmulhw 8*8(%r12), %mm0
	pmulhw 9*8(%r12), %mm1
	pmulhw 10*8(%r12), %mm2
	pmulhw 11*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	movq %mm0, 8*8(%r11)
	movq %mm1, 9*8(%r11)
	movq %mm2, 10*8(%r11)
	movq %mm3, 11*8(%r11)

	movq 12*8(%r11), %mm0
	movq 13*8(%r11), %mm1
	movq 14*8(%r11), %mm2
	movq 15*8(%r11), %mm3

	pmulhw 12*8(%r12), %mm0
	pmulhw 13*8(%r12), %mm1
	pmulhw 14*8(%r12), %mm2
	pmulhw 15*8(%r12), %mm3

	psraw $YUV_PRECISION, %mm0
	psraw $YUV_PRECISION, %mm1
	psraw $YUV_PRECISION, %mm2
	psraw $YUV_PRECISION, %mm3

	movq %mm0, 12*8(%r11)
	movq %mm1, 13*8(%r11)
	movq %mm2, 14*8(%r11)
	movq %mm3, 15*8(%r11)