2
; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
4
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5
; Copyright 2009 D. R. Commander
8
; x86 SIMD extension for IJG JPEG library
9
; Copyright (C) 1999-2006, MIYASAKA Masaru.
10
; For conditions of distribution and use, see copyright notice in jsimdext.inc
12
; This file should be assembled with NASM (Netwide Assembler),
13
; and can *not* be assembled with Microsoft's MASM or any compatible
14
; assembler (including Borland's Turbo Assembler).
15
; NASM is available from http://nasm.sourceforge.net/ or
16
; http://sourceforge.net/project/showfiles.php?group_id=6208
20
%include "jsimdext.inc"
23
; --------------------------------------------------------------------------
27
; Load data into workspace, applying unsigned->signed conversion
30
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
31
; FAST_FLOAT * workspace);
34
; r10 = JSAMPARRAY sample_data
35
; r11 = JDIMENSION start_col
36
; r12 = FAST_FLOAT * workspace
39
; Exported entry point: converts 8-bit samples from two sample rows at a time
; into packed single-precision floats and stores them into the workspace.
;
; NOTE(review): this excerpt does not show the prologue, the setup of
; rsi / rax / rdi / xmm7, any loop label or counter, or the epilogue/ret.
; The remarks below describe only the instructions that are visible here.
; Register roles as used by the visible code:
;   rsi = pointer into an array of row pointers (two consecutive rows read)
;   rax = sample offset within each row (presumably start_col - confirm)
;   rdi = destination pointer for the converted floats
;   rbx, rdx = the two row pointers loaded from [rsi]
; NOTE(review): rbx is callee-saved in both the SysV and Microsoft x64 ABIs;
; its save/restore is not visible in this excerpt - confirm it exists.
global EXTN(jsimd_convsamp_float_sse2)
41
EXTN(jsimd_convsamp_float_sse2):
50
; Final visible step of building the per-byte bias constant; the instructions
; that produce the words being packed here are not shown in this excerpt.
packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
57
; Fetch the pointers to the next two sample rows.
mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
58
mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
60
; Load 64 bits of samples from each row, starting at offset rax
; (8 samples per row, assuming SIZEOF_JSAMPLE is 1 - defined elsewhere).
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
61
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
63
; Byte-wise subtract of the bias in xmm7: with 0x80 in every byte this maps
; unsigned samples onto signed bytes (the "unsigned->signed conversion").
psubb xmm0,xmm7 ; xmm0=(01234567)
64
psubb xmm1,xmm7 ; xmm1=(89ABCDEF)
66
; Widen bytes to words by interleaving each register with itself, so every
; sample ends up in the high byte of its word ('*' marks a don't-care byte).
punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)
67
punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)
69
; Widen words to dwords so that each sample sits in the top byte of a dword.
; punpcklwd takes the low word of each dword from the old contents of
; xmm2 / xmm3; those bits are don't-care because the arithmetic shift below
; discards them.
punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)
70
punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)
71
punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)
72
punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)
74
; Arithmetic right shift moves the sample from the top byte down to the low
; byte with sign extension, yielding signed 32-bit integers.
; DWORD_BIT / BYTE_BIT are defined outside this file (presumably 32 and 8,
; i.e. a shift by 24 - confirm). cvtdq2ps then converts int32 -> float.
; The first row (xmm2/xmm0) and second row (xmm3/xmm1) are handled in turn.
psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)
75
psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)
76
cvtdq2ps xmm2,xmm2 ; xmm2=(0123)
77
cvtdq2ps xmm0,xmm0 ; xmm0=(4567)
78
psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)
79
psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)
80
cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)
81
cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)
83
; Store the four float vectors to the destination at rdi. movaps faults on
; addresses that are not 16-byte aligned, so the destination must be aligned.
; XMMBLOCK is a macro defined elsewhere; presumably (row, xmm-column)
; addressing within the 8x8 block - confirm against jsimdext.inc.
movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
84
movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
85
movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
86
movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
88
; Advance past the two row pointers just consumed and past the two rows of
; DCTSIZE floats just written.
add rsi, byte 2*SIZEOF_JSAMPROW
89
add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
99
; --------------------------------------------------------------------------
101
; Quantize/descale the coefficients, and store into coef_block
104
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
105
; FAST_FLOAT * workspace);
108
; r10 = JCOEFPTR coef_block
109
; r11 = FAST_FLOAT * divisors
110
; r12 = FAST_FLOAT * workspace
113
; Exported entry point: scales the float values read through rsi by the
; table read through rdx and stores results through rdi.
;
; NOTE(review): this excerpt does not show the prologue, the setup of
; rsi / rdx / rdi, any loop label or counter, or the epilogue/ret.
; Register roles as used by the visible code:
;   rsi = source of packed floats (the multiplicand)
;   rdx = table of packed float multipliers
;   rdi = destination, addressed in units of SIZEOF_JCOEF
global EXTN(jsimd_quantize_float_sse2)
115
EXTN(jsimd_quantize_float_sse2):
126
; Load two rows' worth of floats (four xmm vectors) and multiply each
; element-wise by the matching entry of the table at rdx. The function
; header calls that table "divisors"; since a multiply is used, its entries
; are presumably stored as reciprocals - confirm against the code that
; builds the table. movaps requires 16-byte aligned addresses.
movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
127
movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
128
mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
129
mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
130
movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
131
movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
132
mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
133
mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
143
; NOTE(review): in the lines visible here xmm0 / xmm2 still hold the packed
; float products and xmm1 / xmm3 are never consumed. The embedded original
; line numbers jump from 133 to 143, so the float->integer conversion and
; the packing of xmm1 / xmm3 into xmm0 / xmm2 are presumably missing from
; this excerpt rather than from the real code - verify against the full file.
; Store one 16-byte vector per row to the destination; movdqa requires a
; 16-byte aligned address.
movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
144
movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
146
; Advance all three pointers by the 16 elements handled above.
add rsi, byte 16*SIZEOF_FAST_FLOAT
147
add rdx, byte 16*SIZEOF_FAST_FLOAT
148
add rdi, byte 16*SIZEOF_JCOEF
156
; For some reason, the OS X linker does not honor the request to align the
157
; segment unless we do this.