~ubuntu-branches/ubuntu/natty/x264/natty

« back to all changes in this revision

Viewing changes to common/amd64/mc-a2.asm

Committer: Bazaar Package Importer
Author(s): Reinhard Tartler
Date: 2006-02-14 12:51:13 UTC
Revision ID: james.westby@ubuntu.com-20060214125113-t2vdkiqgcctz9ndd

Tags: upstream-0.cvs20060210

Import upstream version 0.cvs20060210

files added:

AUTHORS

COPYING

Makefile

TODO

build

build/win32

build/win32/libx264.dsp

build/win32/libx264.vcproj

build/win32/x264.dsp

build/win32/x264.dsw

build/win32/x264.vcproj

common

common/amd64

common/amd64/amd64inc.asm

common/amd64/cpu-a.asm

common/amd64/dct-a.asm

common/amd64/deblock-a.asm

common/amd64/mc-a.asm

common/amd64/mc-a2.asm

common/amd64/pixel-a.asm

common/amd64/pixel-sse2.asm

common/amd64/predict-a.asm

common/amd64/predict.c

common/amd64/quant-a.asm

common/bs.h

common/cabac.c

common/cabac.h

common/clip1.h

common/common.c

common/common.h

common/cpu.c

common/cpu.h

common/csp.c

common/csp.h

common/dct.c

common/dct.h

common/display-x11.c

common/display.h

common/frame.c

common/frame.h

common/i386

common/i386/cpu-a.asm

common/i386/dct-a.asm

common/i386/dct-c.c

common/i386/dct.h

common/i386/deblock-a.asm

common/i386/i386inc.asm

common/i386/mc-a.asm

common/i386/mc-a2.asm

common/i386/mc-c.c

common/i386/mc.h

common/i386/pixel-a.asm

common/i386/pixel-sse2.asm

common/i386/pixel.h

common/i386/predict-a.asm

common/i386/predict.c

common/i386/predict.h

common/i386/quant-a.asm

common/i386/quant.h

common/macroblock.c

common/macroblock.h

common/mc.c

common/mc.h

common/mdate.c

common/pixel.c

common/pixel.h

common/ppc

common/ppc/dct.c

common/ppc/dct.h

common/ppc/mc.c

common/ppc/mc.h

common/ppc/pixel.c

common/ppc/pixel.h

common/ppc/ppccommon.h

common/predict.c

common/predict.h

common/quant.c

common/quant.h

common/set.c

common/set.h

common/sparc

common/sparc/pixel.asm

common/sparc/pixel.h

common/visualize.c

common/visualize.h

common/vlc.h

configure

decoder

decoder/decoder.c

decoder/macroblock.c

decoder/macroblock.h

decoder/set.c

decoder/set.h

decoder/vlc.c

decoder/vlc.h

doc/ratecontrol.txt

doc/vui.txt

encoder

encoder/analyse.c

encoder/analyse.h

encoder/cabac.c

encoder/cavlc.c

encoder/encoder.c

encoder/eval.c

encoder/macroblock.c

encoder/macroblock.h

encoder/me.c

encoder/me.h

encoder/ratecontrol.c

encoder/ratecontrol.h

encoder/rdo.c

encoder/set.c

encoder/set.h

encoder/slicetype_decision.c

extras

extras/getopt.c

extras/getopt.h

extras/stdint.h

matroska.c

matroska.h

tools

tools/Jamfile

tools/avc2avi.c

tools/checkasm.c

tools/countquant_x264.pl

tools/q_matrix_jvt.cfg

tools/x264-rd.sh

tools/xyuv.c

version.sh

vfw/build

vfw/build/cygwin

vfw/build/cygwin/Makefile

vfw/build/win32

vfw/build/win32/bin

vfw/build/win32/bin/x264vfw.inf

vfw/build/win32/x264vfw.dsp

vfw/build/win32/x264vfw.dsw

vfw/codec.c

vfw/config.c

vfw/driverproc.c

vfw/driverproc.def

vfw/installer

vfw/installer/win.bmp

vfw/installer/x264-conf.nsi

vfw/installer/x264vfw.ico

vfw/resource.h

vfw/resource.rc

vfw/w32api

vfw/w32api/vfw.h

vfw/x264.bmp

vfw/x264vfw.h

x264.c

x264.h

Show diffs side-by-side

added added

removed removed

common/amd64/mc-a2.asm

;*****************************************************************************

;* mc-a2.asm: h264 encoder library

;*****************************************************************************

;* This program is free software; you can redistribute it and/or modify

;* it under the terms of the GNU General Public License as published by

;* the Free Software Foundation; either version 2 of the License, or

;* (at your option) any later version.

;* This program is distributed in the hope that it will be useful,

;* but WITHOUT ANY WARRANTY; without even the implied warranty of

;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

;* GNU General Public License for more details.

;* You should have received a copy of the GNU General Public License

;* along with this program; if not, write to the Free Software

;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.

;*****************************************************************************

BITS 64

;=============================================================================

; Macros and other preprocessor constants

;=============================================================================

%include "amd64inc.asm"

;=============================================================================

; Read only data

;=============================================================================

SECTION .rodata

ALIGN 16

; Four 16-bit words of 16. Despite the label this is not the value 1: the
; filters in this file add it as the rounding term in front of `psraw ..., 5`.
mmx_dw_one:
    dw 16, 16, 16, 16

; Two 32-bit dwords of 512: the rounding term added in front of
; `psrad ..., 10` in the second pass of x264_center_filter_mmxext.
mmx_dd_one:
    dd 512, 512

; 6-tap coefficients for the pmaddwd based pass of x264_center_filter_mmxext.
mmx_dw_20:
    dw 20, 20, 20, 20

; Note the sign: the stored coefficient is -5.
mmx_dw_5:
    dw -5, -5, -5, -5

; Byte offset from rsp of the intermediate row buffer that
; x264_center_filter_mmxext allocates on its stack.
%assign tbuffer 0

;=============================================================================

; Macros

;=============================================================================

; LOAD_4 dst1, dst2, dst3, dst4, src1, src2, src3, src4, fill
;
; movd-loads 32 bits from each of %5..%8 into %1..%4, then interleaves the
; low four bytes of every destination with the low four bytes of %9.
; Both users in this file pass mm0 as %9 after clearing it with pxor, so each
; destination ends up holding four bytes zero-extended to 16-bit words.
; %9 must not be one of %1..%4: it is read after all four loads are done.
%macro LOAD_4 9
movd %1, %5
movd %2, %6
movd %3, %7
movd %4, %8
punpcklbw %1, %9
punpcklbw %2, %9
punpcklbw %3, %9
punpcklbw %4, %9
%endmacro

; FILT_2 acc, x
;
; Per 16-bit lane: acc = acc - 5 * x, evaluated as (acc - x) - 4 * x.
; Side effect: %2 is overwritten with 4 * x.
%macro FILT_2 2
psubw %1, %2
; %2 = 4 * x
psllw %2, 2
psubw %1, %2
%endmacro

; FILT_4 acc, a, b
;
; Per 16-bit lane: acc = acc + 20 * (a + b), evaluated as
; acc + 4 * (a + b) + 16 * (a + b).
; Side effect: %2 is overwritten with 16 * (a + b); %3 is only read.
%macro FILT_4 3
; %2 = a + b
paddw %2, %3
; %2 = 4 * (a + b)
psllw %2, 2
paddw %1, %2
; %2 = 16 * (a + b)
psllw %2, 2
paddw %1, %2
%endmacro

; FILT_6 acc, x, y, round
;
; Final step of a 6-tap sum, per 16-bit lane:
;     acc = (acc - 5 * x + y + round) >> 5        (arithmetic shift)
; The "- 5 * x" part is exactly FILT_2, so it is expanded from that macro
; (FILT_2 must therefore be defined before FILT_6 is used).
; Side effect: %2 is overwritten with 4 * x; %3 and %4 are only read.
%macro FILT_6 4
    FILT_2      %1, %2                  ; acc -= 5 * x
    paddw       %1, %3                  ; acc += y
    paddw       %1, %4                  ; acc += rounding term
    psraw       %1, 5                   ; scale back down
%endmacro

; FILT_ALL ptr
;
; Vertical 6-tap filter over four adjacent byte columns. With r0..r5 being the
; 4-byte groups at ptr + k * rcx for k = 0..5, it leaves in mm1, per column,
; the unrounded and unshifted 16-bit sum
;     r0 - 5 * r1 + 20 * r2 + 20 * r3 - 5 * r4 + r5
;
; Expects (as set up by x264_center_filter_mmxext):
;     rcx = row stride, rbx = 3 * stride, rdx = 5 * stride, mm0 = 0
; Clobbers mm2..mm6.
%macro FILT_ALL 1
    ; rows 0..3, widened from bytes to words
    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
    FILT_2      mm1, mm2                ; mm1  = r0 - 5 * r1
    movd        mm5, [%1 + 4 * rcx]     ; row 4 (still bytes)
    movd        mm6, [%1 + rdx]         ; row 5 (still bytes)
    FILT_4      mm1, mm3, mm4           ; mm1 += 20 * (r2 + r3)
    punpcklbw   mm5, mm0                ; widen row 4
    punpcklbw   mm6, mm0                ; widen row 5
    FILT_2      mm1, mm5                ; mm1 -= 5 * r4
    paddw       mm1, mm6                ; mm1 += r5
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

; Export the two filter entry points. cglobal is not defined in this file;
; presumably it comes from the included amd64inc.asm.
cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext


;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
;                                 uint8_t *dst2, int i_dst2_stride,
;                                 uint8_t *src,  int i_src_stride,
;                                 int i_width,   int i_height );
;
; Two passes per output row, both using the taps (1, -5, 20, 20, -5, 1):
;
;   pass 1  Vertical filter over the six src rows y-2 .. y+3 (FILT_ALL).
;           The unrounded 16-bit sum v[x] is stored in a buffer on the stack
;           and (v[x] + 16) >> 5, saturated to a byte, is written to dst1.
;   pass 2  The same taps applied horizontally to v[x-2] .. v[x+3];
;           (sum + 512) >> 10, saturated to a byte, is written to dst2.
;
; Stack buffer layout (16-bit words, base = rsp + tbuffer):
;     bytes 0 .. 7                     four copies of v[0]         (left pad)
;     byte  8 + 2 * x                  v[x], 0 <= x < width
;     bytes 2*width+8 .. 2*width+15    four copies of v[width-1]   (right pad)
;
; Register roles after the prologue:
;     r8  / r9   dst1 / dst1_stride       r10 / r11  dst2 / dst2_stride
;     r12        src - 2 * src_stride     r13        src_stride
;     r14        width                    r15        rows remaining
;     rcx, rbx, rdx   1x, 3x, 5x src_stride (consumed by FILT_ALL)
;     rsi        current src column       rdi        dst1 - 4
;     rax        column counter           mm0        zero
;
; Constraints that follow from the loops below: columns are processed four at
; a time and every loop exits on an equality test, so width must be a
; multiple of 4 and at least 12, and height must be greater than zero.
; The routine does not execute emms.
;
; pushreg / setframe / endprolog and parmNq / parmNd are macros that are not
; defined in this file (presumably supplied by amd64inc.asm).
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_center_filter_mmxext:

    ; ---- prologue: 6 pushes (8 on WIN64), then rbp = rsp --------------------
    push        r15
    pushreg     r15
%ifdef WIN64
    push        rdi
    pushreg     rdi
    push        rsi
    pushreg     rsi
%endif

    push        rbp
    pushreg     rbp
    push        rbx
    pushreg     rbx
    push        r12
    pushreg     r12
    push        r13
    pushreg     r13
    push        r14
    pushreg     r14
    lea         rbp, [rsp]              ; remembered so the epilogue can drop the buffer
    setframe    rbp, 0
    endprolog

%ifdef WIN64
    ; 64 = the eight pushes above; the stack arguments follow the return
    ; address and the 32-byte shadow area.
    movsxd      r13, dword [rsp+64+48]  ; src_stride
    mov         r12, [rsp+64+40]        ; src
%else
    movsxd      r13, r9d                ; src_stride
    mov         r12, r8                 ; src
%endif
    sub         r12, r13
    sub         r12, r13                ; tsrc = src - 2 * src_stride

    ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
    lea         rax, [r13 + r13 + 24 + tbuffer]
    sub         rsp, rax                ; reserve the intermediate row buffer

    ; dst2 / dst2_stride are fetched before r8 / r9 are overwritten.
    mov         r10, parm3q             ; dst2
    movsxd      r11, parm4d             ; dst2_stride
    mov         r8, parm1q              ; dst1
    movsxd      r9, parm2d              ; dst1_stride
%ifdef WIN64
    movsxd      r14, dword [rbp + 64 + 56]  ; width
    movsxd      r15, dword [rbp + 64 + 64]  ; height
%else
    ; 48 bytes of pushes + 8 bytes of return address = 56
    movsxd      r14, dword [rbp + 56]   ; width
    movsxd      r15, dword [rbp + 64]   ; height
%endif

    mov         rcx, r13                ; src_stride
    lea         rbx, [r13 + r13 * 2]    ; 3 * src_stride
    lea         rdx, [r13 + r13 * 4]    ; 5 * src_stride

    pxor        mm0, mm0                ; 0 ---> mm0

.loopcy:

    ; ======================= pass 1: vertical ================================
    xor         eax, eax
    mov         rsi, r12                ; tsrc

    ; columns 0..3, plus the left padding
    FILT_ALL    rsi

    pshufw      mm2, mm1, 0             ; replicate v[0]
    movq        [rsp + tbuffer], mm2
    movq        [rsp + tbuffer + 8], mm1
    paddw       mm1, [mmx_dw_one GLOBAL]
    psraw       mm1, 5

    packuswb    mm1, mm1
    movd        [r8], mm1               ; dst1[0] = mm1

    add         rax, 8
    add         rsi, 4
    lea         rdi, [r8 - 4]           ; rdi = dst1 - 4

    ; columns 4 .. width-5; rax runs four ahead of the column being written
.loopcx1:

    FILT_ALL    rsi

    movq        [rsp + tbuffer + 2 * rax], mm1
    paddw       mm1, [mmx_dw_one GLOBAL]
    psraw       mm1, 5
    packuswb    mm1, mm1
    movd        [rdi + rax], mm1        ; dst1[rax - 4] = mm1

    add         rsi, 4
    add         rax, 4
    cmp         rax, r14                ; cmp rax, width
    jnz         .loopcx1

    ; last four columns, plus the right padding
    FILT_ALL    rsi

    ; Pass 2 reads three padding words beyond the last column
    ; (v[width], v[width+1], v[width+2]), so every padding word has to be a
    ; copy of v[width-1] (word 3 of mm1). Immediate 0xFF selects word 3 for all
    ; four lanes, mirroring the immediate 0 used for the left edge. The
    ; previous immediate 7 selected words 3, 1, 0, 0 and therefore fed
    ; v[width-3] and v[width-4] into the last two dst2 columns.
    pshufw      mm2, mm1, 0xFF
    movq        [rsp + tbuffer + 2 * rax], mm1
    movq        [rsp + tbuffer + 2 * rax + 8], mm2
    paddw       mm1, [mmx_dw_one GLOBAL]
    psraw       mm1, 5
    packuswb    mm1, mm1
    movd        [rdi + rax], mm1        ; dst1[rax - 4] = mm1

    add         r12, r13                ; tsrc = tsrc + src_stride

    add         r8, r9                  ; dst1 = dst1 + dst1_stride

    ; ======================= pass 2: horizontal ==============================
    xor         eax, eax

.loopcx2:

    ; v[x + k] lives at byte offset 8 + 2 * (x + k); x = rax here
    movq        mm2, [rsp + 2 * rax + 2  + 4 + tbuffer]     ; v[x-1]
    movq        mm3, [rsp + 2 * rax + 4  + 4 + tbuffer]     ; v[x]
    movq        mm4, [rsp + 2 * rax + 6  + 4 + tbuffer]     ; v[x+1]
    movq        mm5, [rsp + 2 * rax + 8  + 4 + tbuffer]     ; v[x+2]
    movq        mm1, [rsp + 2 * rax      + 4 + tbuffer]     ; v[x-2]
    movq        mm6, [rsp + 2 * rax + 10 + 4 + tbuffer]     ; v[x+3]
    paddw       mm2, mm5                ; s5  = v[x-1] + v[x+2]   (weight -5)
    paddw       mm3, mm4                ; s20 = v[x]   + v[x+1]   (weight 20)
    paddw       mm1, mm6                ; s1  = v[x-2] + v[x+3]   (weight  1)

    movq        mm5, [mmx_dw_20 GLOBAL]
    movq        mm4, [mmx_dw_5 GLOBAL]
    movq        mm6, mm1
    pxor        mm7, mm7

    ; Pair every sum with its coefficient so that pmaddwd yields the 32-bit
    ; value -5 * s5 + 20 * s20 for each column.
    punpckhwd   mm5, mm2                ; ( 20, s5[2],  20, s5[3])
    punpcklwd   mm4, mm3                ; ( -5, s20[0], -5, s20[1])
    punpcklwd   mm2, [mmx_dw_20 GLOBAL] ; (s5[0],  20, s5[1],  20)
    punpckhwd   mm3, [mmx_dw_5 GLOBAL]  ; (s20[2], -5, s20[3], -5)

    pcmpgtw     mm7, mm1                ; sign mask of s1 (0 > s1)

    pmaddwd     mm2, mm4                ; columns 0, 1
    pmaddwd     mm3, mm5                ; columns 2, 3

    punpcklwd   mm1, mm7                ; sign-extend s1 to 32 bits, columns 0, 1
    punpckhwd   mm6, mm7                ; sign-extend s1 to 32 bits, columns 2, 3

    paddd       mm2, mm1
    paddd       mm3, mm6

    paddd       mm2, [mmx_dd_one GLOBAL]    ; + 512 for rounding
    paddd       mm3, [mmx_dd_one GLOBAL]

    psrad       mm2, 10
    psrad       mm3, 10

    packssdw    mm2, mm3
    packuswb    mm2, mm0

    movd        [r10 + rax], mm2        ; dst2[rax] = mm2

    add         rax, 4
    cmp         rax, r14                ; cmp rax, width
    jnz         .loopcx2

    add         r10, r11                ; dst2 += dst2_stride

    dec         r15                     ; height; dec sets ZF for the jnz
    jnz         .loopcy

    ; ---- epilogue: drop the buffer, restore in reverse push order -----------
    lea         rsp, [rbp]

    pop         r14
    pop         r13
    pop         r12
    pop         rbx
    pop         rbp
%ifdef WIN64
    pop         rsi
    pop         rdi
%endif
    pop         r15

    ret


;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
;                                     uint8_t *src, int i_src_stride,
;                                     int i_width, int i_height );
;
; Horizontal 6-tap filter, eight output pixels per inner iteration:
;
;     dst[x] = sat_u8( ( src[x-2] - 5 * src[x-1] + 20 * src[x] + 20 * src[x+1]
;                        - 5 * src[x+2] + src[x+3] + 16 ) >> 5 )
;
; Each row reads src[-2] .. src[width + 2] (the prefetch hint touches further
; ahead).
;
; Register roles inside the loops:
;     rdx  src - 2 for the current row      r9   dst for the current row
;     r11  src_stride                       r10  dst_stride
;     r8   width                            rcx  rows remaining
;     rax  column counter                   mm0  zero, mm7 = rounding term (16)
;
; Constraints that follow from the loops below: both loops exit on an
; equality test, so width must be a positive multiple of 8 and height must be
; greater than zero. Only rax, rcx, rdx, r8-r11 and MMX registers are written,
; and no emms is executed.
;
; parmNq / parmNd are macros that are not defined in this file (presumably
; supplied by amd64inc.asm).
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_horizontal_filter_mmxext:
    movsxd      r10, parm2d             ; dst_stride
    movsxd      r11, parm4d             ; src_stride
%ifdef WIN64
    mov         rdx, r8                 ; src
    mov         r9, rcx                 ; dst  (copied before rcx is reused)
    movsxd      rcx, parm6d             ; height
%else
    movsxd      rcx, parm6d             ; height  (fetched before r9 is reused)
    mov         r9, rdi                 ; dst
%endif

    movsxd      r8, parm5d              ; width

    pxor        mm0, mm0
    movq        mm7, [mmx_dw_one GLOBAL]

    sub         rdx, 2                  ; first tap sits two pixels to the left

.loophy:

    xor         eax, eax

.loophx:

    prefetchnta [rdx + rax + 48]

    ; ---- output pixels x .. x+3; p[k] below means byte rdx + rax + k --------
    LOAD_4      mm1, mm2, mm3, mm4, [rdx + rax], [rdx + rax + 1], [rdx + rax + 2], [rdx + rax + 3], mm0
    FILT_2      mm1, mm2                ; p[0] - 5 * p[1]
    movd        mm5, [rdx + rax + 4]
    movd        mm6, [rdx + rax + 5]
    FILT_4      mm1, mm3, mm4           ; + 20 * (p[2] + p[3])
    movd        mm2, [rdx + rax + 4]    ; loaded again: FILT_6 overwrites mm5
    movd        mm3, [rdx + rax + 6]
    punpcklbw   mm5, mm0
    punpcklbw   mm6, mm0
    FILT_6      mm1, mm5, mm6, mm7      ; - 5 * p[4] + p[5], round, >> 5

    ; ---- output pixels x+4 .. x+7; mm6 still holds the widened p[5] ---------
    movd        mm4, [rdx + rax + 7]
    movd        mm5, [rdx + rax + 8]
    punpcklbw   mm2, mm0
    punpcklbw   mm3, mm0                ; mm2(1), mm3(20), mm6(-5) ready
    FILT_2      mm2, mm6
    movd        mm6, [rdx + rax + 9]
    punpcklbw   mm4, mm0
    punpcklbw   mm5, mm0                ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
    FILT_4      mm2, mm3, mm4
    punpcklbw   mm6, mm0
    FILT_6      mm2, mm5, mm6, mm7

    packuswb    mm1, mm2                ; saturate both halves to bytes
    movq        [r9 + rax], mm1

    add         rax, 8
    cmp         rax, r8                 ; cmp rax, width
    jnz         .loophx

    add         rdx, r11                ; src_pitch
    add         r9, r10                 ; dst_pitch

    dec         rcx                     ; one row done; dec sets ZF for the jnz
    jnz         .loophy

    ret

Older »