/*
 * Copyright (C) James Bowman - May 2000
 * Copyright (C) Peter Schlaile - Jan 2001
 *
 * This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
 * codec.
 *
 * libdv is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1, or
 * (at your option) any later version.
 *
 * libdv is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libdv; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * The libdv homepage is http://libdv.sourceforge.net/.
 */
.section .note.GNU-stack, "", @progbits
/* The pattern for dv_88_areas looks like this:

    0 0 0 1 1 1 2 2
    0 0 1 1 1 2 2 2
    0 1 1 1 2 2 2 3
    1 1 1 2 2 2 3 3
    1 1 2 2 2 3 3 3
    1 2 2 2 3 3 3 3
    2 2 2 3 3 3 3 3
    2 2 3 3 3 3 3 3

   Note:
     [1] matrix element [0][0] is untouched.
     [2] all values on the same (anti-)diagonal are equal.

   This implementation works by loading the four shift values in
   turn, and shifting all the appropriate array elements. */
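/* For reference, a C sketch of the routine implemented below. This is
   a hedged reconstruction from the comments and code in this file, not
   a verbatim copy of the portable version in quant.c; it assumes
   dv_88_areas[] maps a raster index to its area number as drawn above:

   void quant_88_inverse_sketch(dv_coeff_t *block, int qno, int class)
   {
       const uint8_t *pq = dv_quant_shifts[qno + dv_quant_offset[class]];
       int extra = (class == 3);              // 0 or 1
       int i;

       block[0] <<= DV_WEIGHT_BIAS;           // [0][0]: bias shift only
       for (i = 1; i < 64; i++)
           block[i] <<= pq[dv_88_areas[i]] + extra + DV_WEIGHT_BIAS;
   }
*/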
/* C prototype:
   void _dv_quant_88_inverse(dv_coeff_t *block, int qno, int class);
*/
.globl _dv_quant_88_inverse_x86_64
.hidden _dv_quant_88_inverse_x86_64
.type _dv_quant_88_inverse_x86_64,@function
_dv_quant_88_inverse_x86_64:
/* Arguments: block=%rdi, qno=%rsi, class=%rdx */
/* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */
mov %rsi,%rax /* qno */
mov %rdx,%r12 /* class */
mov dv_quant_offset@GOTPCREL(%rip),%rcx
movzbq (%rcx,%r12,1),%rcx /* dv_quant_offset[class] */
add %rcx,%rax /* qno + dv_quant_offset[class] */
mov dv_quant_shifts@GOTPCREL(%rip),%r11
lea (%r11,%rax,4),%r11 /* r11 is pq = dv_quant_shifts[...] */
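/* Note: the movzbq loads assume both tables hold one-byte entries,
   and the scale of 4 in the lea assumes each dv_quant_shifts row is
   four such bytes (one shift per area). */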
/* extra = (class == 3); */
sub $3,%r12 /* -3 -2 -1 0 */
sar $31,%r12 /* -1 -1 -1 0 */
inc %r12 /* 0 0 0 1 */
add $DV_WEIGHT_BIAS,%r12
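/* The sub/sar/inc triple above is a branch-free (class == 3) test:
   class - 3 is negative for classes 0-2, so the arithmetic shift
   yields -1 for them and 0 for class 3, and the increment maps that
   to 0 or 1. After the add, %r12 holds extra + DV_WEIGHT_BIAS. */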
mov %rdi,%r14 /* r14 is block */
/* Pick up each of the factors into %rcx, then shift the
   appropriate coefficients. The pattern here is taken from
   dv_88_areas; it's arranged by diagonals for clarity. */
#define ADDR(row,col) (2*(8*row+col))(%r14)
#define MSHIFT(row,col) \
shlw %cl,ADDR(row,col)
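/* Coefficients are 16-bit words, so ADDR(row,col) is the byte offset
   of block[row][col] from %r14, and MSHIFT multiplies one element by
   2^%cl in place. */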
mov $DV_WEIGHT_BIAS,%rcx /* bias-only shift; cf. note [1] above */
movzbq 1(%r11,1),%rcx /* pq[1] */
movzbq 2(%r11,1),%rcx /* pq[2] */
movzbq 3(%r11,1),%rcx /* pq[3] */
.globl _dv_quant_x86_64
.hidden _dv_quant_x86_64
.type _dv_quant_x86_64,@function
_dv_quant_x86_64:
/* Arguments: block=%rdi, qno=%rsi, class=%rdx */
/* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */
mov %rsi,%rax /* qno */
mov %rdx,%r12 /* class */
mov dv_quant_offset@GOTPCREL(%rip),%rcx
movzbq (%rcx,%r12,1),%rcx /* dv_quant_offset[class] */
add %rcx,%rax /* qno + dv_quant_offset[class] */
mov dv_quant_shifts@GOTPCREL(%rip),%r11
lea (%r11,%rax,4),%r11 /* r11 is pq */
/* extra = (class == 3); same branch-free trick as above */
sub $3,%r12 /* -3 -2 -1 0 */
sar $31,%r12 /* -1 -1 -1 0 */
inc %r12 /* 0 0 0 1 */
mov %rdi,%r14 /* r14 is block */
/* Since we already reordered the coefficients, it's easy:
   shift between OFS0 and OFS1 by the first pq value,
   between OFS1 and OFS2 by the second pq value, and so on.
   Since we really want to divide, we have to compensate for
   negative values: a plain arithmetic shift rounds them toward
   minus infinity rather than toward zero.

   The remaining work is pipeline optimization,
   which results in obfuscated MMX code... */
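/* A C sketch of that compensation: to divide by 2^shift with
   round-toward-zero semantics, fold the sign out, shift, and fold it
   back. This branch-free sign-mask form is one standard way to do it
   and is only an assumption about the MMX arithmetic used here, not a
   literal transcription of it:

   static int16_t shift_div(int16_t x, int shift)
   {
       int16_t sign = x >> 15;        // 0 if x >= 0, -1 if x < 0
       x = (x ^ sign) - sign;         // absolute value
       x >>= shift;                   // truncating divide by 2^shift
       return (x ^ sign) - sign;      // restore the sign
   }

   In MMX the same dance maps onto psraw/pxor/psubw, four coefficients
   at a time. */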
# sarw %cl,ADDR(row,col)
#define OFS2 (1+2+3+4+5+6)
#define OFS3 (1+2+3+4+5+6+7+8+7)
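/* Each OFSn is the zigzag index of the first coefficient of area n;
   the summed terms are the lengths of successive anti-diagonals
   (1, 2, ..., 8, 7, ...), so OFS2 = 21 and OFS3 = 43. The areas are
   not multiples of four words long, so each pass overshoots into the
   next area; this is harmless because each area's first load is
   issued before the previous area's overlapping store, and the next
   pass rewrites the boundary words from their original values. The
   interleaving of loads and stores across %mm0-%mm5 also hides
   latency, which is the pipeline obfuscation mentioned above. */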
movq OFS0*2(%r14), %mm2 /* first quad of area 0 */
movq (OFS0+4)*2(%r14), %mm4 /* second quad of area 0 */
movq (OFS1*2)(%r14), %mm2 /* load area 1 before area-0 results land */
movq %mm3, OFS0*2(%r14)
movq %mm5, (OFS0+4)*2(%r14)
movzbq 1(%r11,1),%rcx /* pq[1] */
movq (OFS1+4)*2(%r14), %mm4
movq %mm3, OFS1*2(%r14)
movq (OFS1+8)*2(%r14), %mm2
movq %mm5, (OFS1+4)*2(%r14)
movq (OFS1+12)*2(%r14), %mm4
movq OFS2*2(%r14), %mm0 /* load area 2 before the last area-1 store */
movq %mm3, (OFS1+8)*2(%r14)
movq %mm5, (OFS1+12)*2(%r14)
movzbq 2(%r11,1),%rcx /* pq[2] */
movq (OFS2+4)*2(%r14), %mm2
movq (OFS2+8)*2(%r14), %mm4
movq %mm1, OFS2*2(%r14)
movq %mm3, (OFS2+4)*2(%r14)
movq %mm5, (OFS2+8)*2(%r14)
movq (OFS2+12)*2(%r14), %mm0
movq (OFS2+16)*2(%r14), %mm2
movq (OFS2+20)*2(%r14), %mm4
movq OFS3*2(%r14), %mm0 /* load area 3 before the last area-2 stores */
movq %mm1, (OFS2+12)*2(%r14)
movq %mm3, (OFS2+16)*2(%r14)
movq %mm5, (OFS2+20)*2(%r14)
movzbq 3(%r11,1),%rcx /* pq[3] */
movq (OFS3+4)*2(%r14), %mm2
movq (OFS3+8)*2(%r14), %mm4
movq %mm1, OFS3*2(%r14)
movq %mm3, (OFS3+4)*2(%r14)
movq %mm5, (OFS3+8)*2(%r14)
movq (OFS3+12)*2(%r14), %mm2
movq (OFS3+16)*2(%r14), %mm4
movq %mm3, (OFS3+12)*2(%r14)
movq %mm5, (OFS3+16)*2(%r14)
movw (OFS3+20)*2(%r14), %ax /* final coefficient, index 63 */
movw %ax, (OFS3+20)*2(%r14) /* store it after the scalar shift */
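/* The scalar tail exists because area 3 holds 21 coefficients
   (indices 43-63): five movq quads cover indices 43-62, leaving the
   single word at index 63 = OFS3+20 to plain integer code. */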