1
.section .note.GNU-stack, "", @progbits
5
.global _dv_transpose_mmx_x86_64
6
.hidden _dv_transpose_mmx_x86_64
7
.type _dv_transpose_mmx_x86_64,@function
8
_dv_transpose_mmx_x86_64:
10
/* extern void _dv_transpose_mmx_x86_64(short * dst); */
12
/* argument dst=rdi */
21
mov $8, %rbx # rbx is x_size
23
mov %r11, %r12 # pointer to the matrix
28
sub $4, %rax # rax is the inner loop variable
30
add %rbx, %rcx # rcx is 6*row size
31
mov %rax, %rdx # rdx is the outer loop variable
33
do_4x4_block_where_x_equals_y:
35
movq (%r11), %mm0 # m03:m02|m01:m00 - first line
37
movq (%r11,%rbx,4), %mm2 # m23:m22|m21:m20 - third line
38
movq %mm0, %mm6 # copy first line
40
punpcklwd (%r11,%rbx,2), %mm0
41
# m11:m01|m10:m00 - interleave first and second lines
42
movq %mm2, %mm7 # copy third line
44
punpcklwd (%r11,%rcx,), %mm2
45
# m31:m21|m30:m20 - interleave third and fourth lines
46
movq %mm0, %mm4 # copy first intermediate result
48
movq (%r11,%rbx,2), %mm1 # m13:m12|m11:m10 - second line
50
# m30:m20|m10:m00 - interleave to produce result 1
52
movq (%r11,%rcx,), %mm3 # m33:m32|m31:m30 - fourth line
54
# m31:m21|m11:m01 - interleave to produce result 2
56
movq %mm0, (%r11) # write result 1
58
# m13:m03|m12:m02 - interleave first and second lines
60
movq %mm4, (%r11,%rbx,2) # write result 2
62
# m33:m23|m32:m22 - interleave third and fourth lines
64
movq %mm6, %mm5 # copy first intermediate result
66
# m32:m22|m12:m02 - interleave to produce result 3
68
lea (%r12,%rbx,8), %r12
69
# reload r12 to point to a 4x4 set 4 rows down
71
# m33:m23|m13:m03 - interleave to produce result 4
73
movq %mm6, (%r11,%rbx,4) # write result 3
75
movq %mm5, (%r11,%rcx,) # write result 4
78
# check to see if the number of rows left is zero
79
je all_done_ready_to_exit
80
# last time through, you are done and ready to exit
82
do_4x4_blocks_x_and_y_not_equal:
84
# transpose the two mirror image 4x4 sets so that the writes
85
# can be done without overwriting unused data
87
movq 8(%r11), %mm0 # m03:m02|m01:m00 - first line
89
movq 8(%r11,%rbx,4), %mm2 # m23:m22|m21:m20 - third line
90
movq %mm0, %mm6 # copy first line
92
punpcklwd 8(%r11,%rbx,2), %mm0
93
# m11:m01|m10:m00 - interleave first and second lines
94
movq %mm2, %mm7 # copy third line
96
punpcklwd 8(%r11,%rcx,), %mm2
97
# m31:m21|m30:m20 - interleave third and fourth lines
98
movq %mm0, %mm4 # copy first intermediate result
99
# all references for second 4 x 4 block are referred by "n" instead of "m"
100
movq (%r12), %mm1 # n03:n02|n01:n00 - first line
102
# m30:m20|m10:m00 - interleave to produce first result
104
movq (%r12,%rbx,4), %mm3 # n23:n22|n21:n20 - third line
106
# m31:m21|m11:m01 - interleave to produce second result
108
punpckhwd 8(%r11,%rbx,2), %mm6
109
# m13:m03|m12:m02 - interleave first and second lines
110
movq %mm1, %mm2 # copy first line
112
punpckhwd 8(%r11,%rcx,), %mm7
113
# m33:m23|m32:m22 - interleave third and fourth lines
114
movq %mm6, %mm5 # copy first intermediate result
116
movq %mm0, (%r12) # write result 1
118
# m33:m23|m13:m03 - produce third result
120
punpcklwd (%r12,%rbx,2), %mm1
121
# n11:n01|n10:n00 - interleave first and second lines
122
movq %mm3, %mm0 # copy third line
124
punpckhwd (%r12,%rbx,2), %mm2
125
# n13:n03|n12:n02 - interleave first and second lines
127
movq %mm4, (%r12,%rbx,2) # write result 2 out
129
# m32:m22|m12:m02 - produce fourth result
131
punpcklwd (%r12,%rcx,), %mm3
132
# n31:n21|n30:n20 - interleave third and fourth lines
133
movq %mm1, %mm4 # copy first intermediate result
135
movq %mm6, (%r12,%rbx,4) # write result 3 out
137
# n30:n20|n10:n00 - produce first result
139
punpckhwd (%r12,%rcx,), %mm0
140
# n33:n23|n32:n22 - interleave third and fourth lines
141
movq %mm2, %mm6 # copy second intermediate result
143
movq %mm5, (%r12,%rcx,) # write result 4 out
145
# n31:n21|n11:n01 - produce second result
148
# write result 5 out - (first result for other 4 x 4 block)
150
# n32:n22|n12:n02 - produce third result
152
movq %mm4, 8(%r11,%rbx,2) # write result 6 out
154
# n33:n23|n13:n03 - produce fourth result
156
movq %mm2, 8(%r11,%rbx,4) # write result 7 out
158
movq %mm6, 8(%r11,%rcx,) # write result 8 out
161
# increment r11 to point to next 4 x 4 block in same row
162
lea (%r12,%rbx,8), %r12
163
# increment r12 to point to next 4 x 4 block below current one
165
sub $4, %rax # decrement inner loop variable
166
jnz do_4x4_blocks_x_and_y_not_equal
167
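# inner loop finished: rax (the inner loop variable) has reached zero here
# NOTE(review): the original comment read "rax points to start of the second
# row in block we just finished", which does not match rax's use as a loop
# counter - confirm which register was meant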
170
lea 8(%r11,%rbx,8), %r11 # reload r11 to point four rows down
173
# subtract the number of bytes in last row
174
# now we point to spot where row = col
175
sub $8, %rdx # sub 4 from row number
181
# reset x_size to outer loop variable to start new row
183
jmp do_4x4_block_where_x_equals_y
185
all_done_ready_to_exit: