~ubuntu-branches/debian/squeeze/pixman/squeeze

« back to all changes in this revision

Viewing changes to pixman/pixman-mmx.c

Committer: Bazaar Package Importer
Author(s): David Nusinow
Date: 2007-08-09 22:15:45 UTC
Revision ID: james.westby@ubuntu.com-20070809221545-b3rj83wnluotrybv

Tags: upstream-0.9.4

Import upstream version 0.9.4

files added:

AUTHORS

COPYING

ChangeLog

INSTALL

Makefile.am

Makefile.in

NEWS

README

TODO

aclocal.m4

config.guess

config.h.in

config.sub

configure

configure.ac

depcomp

install-sh

ltmain.sh

missing

pixman

pixman-1.pc.in

pixman/Makefile.am

pixman/Makefile.in

pixman/pixman-compose.c

pixman/pixman-compute-region.c

pixman/pixman-edge-imp.h

pixman/pixman-edge.c

pixman/pixman-image.c

pixman/pixman-mmx.c

pixman/pixman-mmx.h

pixman/pixman-pict.c

pixman/pixman-private.h

pixman/pixman-region.c

pixman/pixman-timer.c

pixman/pixman-trap.c

pixman/pixman-utils.c

pixman/pixman.h

test

test/Makefile.am

test/Makefile.in

test/composite-test.c

test/gradient-test.c

Show diffs side-by-side

added added

removed removed

pixman/pixman-mmx.c

/*
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#include <config.h>

#ifdef USE_MMX

#if defined(__amd64__) || defined(__x86_64__)

#define USE_SSE

#endif

#include <mmintrin.h>

#ifdef USE_SSE

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */

#endif

#include "pixman-mmx.h"

#undef READ
#undef WRITE

/* Direct memory accessors: this file always touches pixel memory with plain
 * loads and stores, so any earlier definitions are dropped first.
 *
 * READ is fully parenthesised so that it composes with postfix operators
 * (e.g. READ(pp)->field or READ(pp)[i]) the way a function call would.
 */
#define READ(x) (*(x))

/* NOTE(review): the trailing ';' is kept on purpose -- call sites outside
 * this view may have been written without their own semicolon.  It means
 * WRITE() cannot be used as the body of an unbraced if/else.
 */
#define WRITE(ptr,v) (*(ptr) = (v));

/* Rename to VERBOSE to get a trace line at every CHECKPOINT(). */
#define noVERBOSE

#ifdef VERBOSE
#define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 *   ie. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* ullong:       unsigned 64-bit integer type.
 * mmxdatafield: storage type of one entry in the MMX constant table.
 *
 * Each compiler gets exactly one definition of ullong; previously the MSVC
 * branch re-declared a typedef that had already been introduced
 * unconditionally, which is not valid before C11.
 */
#ifdef _MSC_VER
typedef unsigned __int64 ullong;
typedef __m64 mmxdatafield;
#else
typedef unsigned long long ullong;
#ifdef __GNUC__
typedef ullong mmxdatafield;
#endif
#endif

/* Table layout for the 64-bit constants used by the MMX helpers in this
 * file.  Every member name starts with "mmx_" because the MC() accessor
 * pastes that prefix onto its argument.  The member order must match the
 * positional initialiser used for MSVC.
 */
typedef struct
{
    mmxdatafield mmx_4x00ff;                /* XOR mask used by negate() */
    mmxdatafield mmx_4x0080;                /* rounding bias for the multiplies */
    mmxdatafield mmx_565_rgb;               /* channel mask used by expand565() */
    mmxdatafield mmx_565_unpack_multiplier; /* per-lane scale used by expand565() */
    mmxdatafield mmx_565_r;                 /* red bits kept by pack565() */
    mmxdatafield mmx_565_g;                 /* green bits kept by pack565() */
    mmxdatafield mmx_565_b;                 /* blue bits kept by pack565() */
    mmxdatafield mmx_mask_0;                /* clears 16-bit lane 0 */
    mmxdatafield mmx_mask_1;                /* clears 16-bit lane 1 */
    mmxdatafield mmx_mask_2;                /* clears 16-bit lane 2 */
    mmxdatafield mmx_mask_3;                /* clears 16-bit lane 3 */
    mmxdatafield mmx_full_alpha;            /* 0x00ff in lane 3 only */
    mmxdatafield mmx_ffff0000ffff0000;      /* selects lanes 3 and 1 */
    mmxdatafield mmx_0000ffff00000000;      /* selects lane 2 */
    mmxdatafield mmx_000000000000ffff;      /* selects lane 0 */
} MMXData;

static const MMXData c =

110

{

111

#ifdef __GNUC__

112

.mmx_4x00ff = 0x00ff00ff00ff00ffULL,

113

.mmx_4x0080 = 0x0080008000800080ULL,

114

.mmx_565_rgb = 0x000001f0003f001fULL,

115

.mmx_565_unpack_multiplier = 0x0000008404100840ULL,

116

.mmx_565_r = 0x000000f800000000ULL,

117

.mmx_565_g = 0x0000000000fc0000ULL,

118

.mmx_565_b = 0x00000000000000f8ULL,

119

.mmx_mask_0 = 0xffffffffffff0000ULL,

120

.mmx_mask_1 = 0xffffffff0000ffffULL,

121

.mmx_mask_2 = 0xffff0000ffffffffULL,

122

.mmx_mask_3 = 0x0000ffffffffffffULL,

123

.mmx_full_alpha = 0x00ff000000000000ULL,

124

.mmx_ffff0000ffff0000 = 0xffff0000ffff0000ULL,

125

.mmx_0000ffff00000000 = 0x0000ffff00000000ULL,

126

.mmx_000000000000ffff = 0x000000000000ffffULL,

127

#endif

128

#ifdef _MSC_VER

129

{ 0x00ff00ff00ff00ffUI64 },

130

{ 0x0080008000800080UI64 },

131

{ 0x000001f0003f001fUI64 },

132

{ 0x0000008404100840UI64 },

133

{ 0x000000f800000000UI64 },

134

{ 0x0000000000fc0000UI64 },

135

{ 0x00000000000000f8UI64 },

136

{ 0xffffffffffff0000UI64 },

137

{ 0xffffffff0000ffffUI64 },

138

{ 0xffff0000ffffffffUI64 },

139

{ 0x0000ffffffffffffUI64 },

140

{ 0x00ff000000000000UI64 },

141

{ 0xffff0000ffff0000UI64 },

142

{ 0x0000ffff00000000UI64 },

143

{ 0x000000000000ffffUI64 },

144

#endif

145

};

146

147

/* Under MSVC every "inline" helper in this file is force-inlined. */
#ifdef _MSC_VER
#undef inline
#define inline __forceinline
#endif

/* MC(name) reads entry mmx_<name> of the constant table as an __m64.
 * With GCC the table stores integers, so the value is cast; with MSVC the
 * table already stores __m64 values.
 */
#ifdef __GNUC__
#define MC(x) ((__m64) c.mmx_##x)
#endif
#ifdef _MSC_VER
#define MC(x) c.mmx_##x
#endif

/* Shift the whole 64-bit register as one quantity.
 *
 * s > 0 : logical shift left by s bits
 * s < 0 : logical shift right by -s bits
 * s == 0: value returned unchanged
 */
static inline __m64
shift (__m64 v, int s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

/* XOR each 16-bit lane with 0x00ff.  For a lane holding an 8-bit value x
 * (high byte zero) the result is 255 - x.
 */
static inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC(4x00ff));
}

/* Per-lane multiply of two expanded pixels (one 8-bit channel per 16-bit
 * lane), scaled back to 8 bits.
 *
 * For each lane: t = a * b + 0x80; result = (t + (t >> 8)) >> 8, which is
 * a * b / 255 rounded to nearest when a and b are in 0..255.
 */
static inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC(4x0080));
    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
    res = _mm_srli_pi16 (res, 8);

    return res;
}

/* Byte-wise unsigned saturating add of two registers: each of the eight
 * bytes is clamped at 0xff instead of wrapping.
 */
static inline __m64
pix_add (__m64 a, __m64 b)
{
    return  _mm_adds_pu8 (a, b);
}

#ifdef USE_SSE

196

197

static inline __m64

198

expand_alpha (__m64 pixel)

199

{

200

return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));

201

}

202

203

static inline __m64

204

expand_alpha_rev (__m64 pixel)

205

{

206

return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));

207

}

208

209

static inline __m64

210

invert_colors (__m64 pixel)

211

{

212

return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));

213

}

214

215

#else

216

217

static inline __m64

218

expand_alpha (__m64 pixel)

219

{

220

__m64 t1, t2;

221

222

t1 = shift (pixel, -48);

223

t2 = shift (t1, 16);

224

t1 = _mm_or_si64 (t1, t2);

225

t2 = shift (t1, 32);

226

t1 = _mm_or_si64 (t1, t2);

227

228

return t1;

229

}

230

231

static inline __m64

232

expand_alpha_rev (__m64 pixel)

233

{

234

__m64 t1, t2;

235

236

/* move alpha to low 16 bits and zero the rest */

237

t1 = shift (pixel, 48);

238

t1 = shift (t1, -48);

239

240

t2 = shift (t1, 16);

241

t1 = _mm_or_si64 (t1, t2);

242

t2 = shift (t1, 32);

243

t1 = _mm_or_si64 (t1, t2);

244

245

return t1;

246

}

247

248

static inline __m64

249

invert_colors (__m64 pixel)

250

{

251

__m64 x, y, z;

252

253

x = y = z = pixel;

254

255

x = _mm_and_si64 (x, MC(ffff0000ffff0000));

256

y = _mm_and_si64 (y, MC(000000000000ffff));

257

z = _mm_and_si64 (z, MC(0000ffff00000000));

258

259

y = shift (y, 32);

260

z = shift (z, -32);

261

262

x = _mm_or_si64 (x, y);

263

x = _mm_or_si64 (x, z);

264

265

return x;

266

}

267

268

#endif

269

270

static inline __m64

271

over (__m64 src, __m64 srca, __m64 dest)

272

{

273

return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));

274

}

275

276

static inline __m64

277

over_rev_non_pre (__m64 src, __m64 dest)

278

{

279

__m64 srca = expand_alpha (src);

280

__m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));

281

282

return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);

283

}

284

285

static inline __m64

286

in (__m64 src,

287

__m64 mask)

288

{

289

return pix_multiply (src, mask);

290

}

291

292

static inline __m64

293

in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)

294

{

295

src = _mm_or_si64 (src, MC(full_alpha));

296

297

return over(in (src, mask), mask, dest);

298

}

299

300

#ifndef _MSC_VER

301

static inline __m64

302

in_over (__m64 src,

303

__m64 srca,

304

__m64 mask,

305

__m64 dest)

306

{

307

return over(in(src, mask), pix_multiply(srca, mask), dest);

308

}

309

#else

310

#define in_over(src, srca, mask, dest) over(in(src, mask), pix_multiply(srca, mask), dest)

311

#endif

312

313

/* Expand a packed 32-bit pixel into four 16-bit lanes, one byte per lane
 * (lowest byte in lane 0, highest byte in lane 3), high bytes zero.
 */
static inline __m64
load8888 (uint32_t v)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
}

/* Pack two expanded pixels back into two 32-bit pixels: `lo' ends up in
 * the low 32 bits, `hi' in the high 32 bits.  Each 16-bit lane is
 * saturated to the 0..255 range while being narrowed to a byte.
 */
static inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static inline uint32_t

326

store8888 (__m64 v)

327

{

328

return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));

329

}

330

331

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into

332

333

* 00RR00GG00BB

334

335

* --- Expanding 565 in the low word ---

336

337

* m = (m << (32 - 3)) | (m << (16 - 5)) | m;

338

* m = m & (01f0003f001f);

339

* m = m * (008404100840);

340

* m = m >> 8;

341

342

* Note the trick here - the top word is shifted by another nibble to

343

* avoid it bumping into the middle word

344

345

static inline __m64

346

expand565 (__m64 pixel, int pos)

347

{

348

__m64 p = pixel;

349

__m64 t1, t2;

350

351

/* move pixel to low 16 bit and zero the rest */

352

p = shift (shift (p, (3 - pos) * 16), -48);

353

354

t1 = shift (p, 36 - 11);

355

t2 = shift (p, 16 - 5);

356

357

p = _mm_or_si64 (t1, p);

358

p = _mm_or_si64 (t2, p);

359

p = _mm_and_si64 (p, MC(565_rgb));

360

361

pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));

362

return _mm_srli_pi16 (pixel, 8);

363

}

364

365

/* Expand one of the two packed 32-bit pixels held in `in' to four 16-bit
 * lanes: pos 0 selects the low pixel, any other pos selects the high one.
 */
static inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
}

static inline __m64

375

pack565 (__m64 pixel, __m64 target, int pos)

376

{

377

__m64 p = pixel;

378

__m64 t = target;

379

__m64 r, g, b;

380

381

r = _mm_and_si64 (p, MC(565_r));

382

g = _mm_and_si64 (p, MC(565_g));

383

b = _mm_and_si64 (p, MC(565_b));

384

385

r = shift (r, - (32 - 8) + pos * 16);

386

g = shift (g, - (16 - 3) + pos * 16);

387

b = shift (b, - (0 + 3) + pos * 16);

388

389

if (pos == 0)

390

t = _mm_and_si64 (t, MC(mask_0));

391

else if (pos == 1)

392

t = _mm_and_si64 (t, MC(mask_1));

393

else if (pos == 2)

394

t = _mm_and_si64 (t, MC(mask_2));

395

else if (pos == 3)

396

t = _mm_and_si64 (t, MC(mask_3));

397

398

p = _mm_or_si64 (r, t);

399

p = _mm_or_si64 (g, p);

400

401

return _mm_or_si64 (b, p);

402

}

403

404

/* (x * a + y * b) / 255 per lane, rounded like pix_multiply() and using
 * saturating 16-bit adds for the intermediate sum.
 *
 * The MSVC variant is a macro that assigns to its `x' and `y' arguments,
 * so both must be modifiable lvalues there.
 */
#ifndef _MSC_VER
static inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = _mm_mullo_pi16 (x, a);
    y = _mm_mullo_pi16 (y, b);
    x = _mm_adds_pu16 (x, MC(4x0080));
    x = _mm_adds_pu16 (x, y);
    x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8));
    x = _mm_srli_pi16 (x, 8);

    return x;
}
#else
#define pix_add_mul(x, a, y, b) \
( x = _mm_mullo_pi16 (x, a), \
  y = _mm_mullo_pi16 (y, b), \
  x = _mm_adds_pu16 (x, MC(4x0080)), \
  x = _mm_adds_pu16 (x, y), \
  x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)), \
  _mm_srli_pi16 (x, 8) )
#endif

/* --------------- MMX code paths for fbcompose.c --------------------- */

static FASTCALL void

430

mmxCombineMaskU (uint32_t *src, const uint32_t *mask, int width)

431

{

432

const uint32_t *end = mask + width;

433

while (mask < end) {

434

uint32_t mmask = *mask;

435

uint32_t maska = mmask >> 24;

436

if (maska == 0) {

437

*src = 0;

438

} else if (maska != 0xff) {

439

__m64 a = load8888(mmask);

440

__m64 s = load8888(*src);

441

a = expand_alpha(a);

442

s = pix_multiply(s, a);

443

*src = store8888(s);

444

}

445

++src;

446

++mask;

447

}

448

_mm_empty();

449

}

450

451

452

static FASTCALL void

453

mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)

454

{

455

const uint32_t *end = dest + width;

456

457

while (dest < end) {

458

uint32_t ssrc = *src;

459

uint32_t a = ssrc >> 24;

460

if (a == 0xff) {

461

*dest = ssrc;

462

} else if (a) {

463

__m64 s, sa;

464

s = load8888(ssrc);

465

sa = expand_alpha(s);

466

*dest = store8888(over(s, sa, load8888(*dest)));

467

}

468

++dest;

469

++src;

470

}

471

_mm_empty();

472

}

473

474

static FASTCALL void

475

mmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width)

476

{

477

const uint32_t *end = dest + width;

478

479

while (dest < end) {

480

__m64 d, da;

481

d = load8888(*dest);

482

da = expand_alpha(d);

483

*dest = store8888(over (d, da, load8888(*src)));

484

++dest;

485

++src;

486

}

487

_mm_empty();

488

}

489

490

static FASTCALL void

491

mmxCombineInU (uint32_t *dest, const uint32_t *src, int width)

492

{

493

const uint32_t *end = dest + width;

494

495

while (dest < end) {

496

__m64 x, a;

497

x = load8888(*src);

498

a = load8888(*dest);

499

a = expand_alpha(a);

500

x = pix_multiply(x, a);

501

*dest = store8888(x);

502

++dest;

503

++src;

504

}

505

_mm_empty();

506

}

507

508

static FASTCALL void

509

mmxCombineInReverseU (uint32_t *dest, const uint32_t *src, int width)

510

{

511

const uint32_t *end = dest + width;

512

513

while (dest < end) {

514

__m64 x, a;

515

x = load8888(*dest);

516

a = load8888(*src);

517

a = expand_alpha(a);

518

x = pix_multiply(x, a);

519

*dest = store8888(x);

520

++dest;

521

++src;

522

}

523

_mm_empty();

524

}

525

526

static FASTCALL void

527

mmxCombineOutU (uint32_t *dest, const uint32_t *src, int width)

528

{

529

const uint32_t *end = dest + width;

530

531

while (dest < end) {

532

__m64 x, a;

533

x = load8888(*src);

534

a = load8888(*dest);

535

a = expand_alpha(a);

536

a = negate(a);

537

x = pix_multiply(x, a);

538

*dest = store8888(x);

539

++dest;

540

++src;

541

}

542

_mm_empty();

543

}

544

545

static FASTCALL void

546

mmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width)

547

{

548

const uint32_t *end = dest + width;

549

550

while (dest < end) {

551

__m64 x, a;

552

x = load8888(*dest);

553

a = load8888(*src);

554

a = expand_alpha(a);

555

a = negate(a);

556

x = pix_multiply(x, a);

557

*dest = store8888(x);

558

++dest;

559

++src;

560

}

561

_mm_empty();

562

}

563

564

static FASTCALL void

565

mmxCombineAtopU (uint32_t *dest, const uint32_t *src, int width)

566

{

567

const uint32_t *end = dest + width;

568

569

while (dest < end) {

570

__m64 s, da, d, sia;

571

s = load8888(*src);

572

d = load8888(*dest);

573

sia = expand_alpha(s);

574

sia = negate(sia);

575

da = expand_alpha(d);

576

s = pix_add_mul (s, da, d, sia);

577

*dest = store8888(s);

578

++dest;

579

++src;

580

}

581

_mm_empty();

582

}

583

584

static FASTCALL void

585

mmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width)

586

{

587

const uint32_t *end;

588

589

end = dest + width;

590

591

while (dest < end) {

592

__m64 s, dia, d, sa;

593

s = load8888(*src);

594

d = load8888(*dest);

595

sa = expand_alpha(s);

596

dia = expand_alpha(d);

597

dia = negate(dia);

598

s = pix_add_mul (s, dia, d, sa);

599

*dest = store8888(s);

600

++dest;

601

++src;

602

}

603

_mm_empty();

604

}

605

606

static FASTCALL void

607

mmxCombineXorU (uint32_t *dest, const uint32_t *src, int width)

608

{

609

const uint32_t *end = dest + width;

610

611

while (dest < end) {

612

__m64 s, dia, d, sia;

613

s = load8888(*src);

614

d = load8888(*dest);

615

sia = expand_alpha(s);

616

dia = expand_alpha(d);

617

sia = negate(sia);

618

dia = negate(dia);

619

s = pix_add_mul (s, dia, d, sia);

620

*dest = store8888(s);

621

++dest;

622

++src;

623

}

624

_mm_empty();

625

}

626

627

static FASTCALL void

628

mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)

629

{

630

const uint32_t *end = dest + width;

631

while (dest < end) {

632

__m64 s, d;

633

s = load8888(*src);

634

d = load8888(*dest);

635

s = pix_add(s, d);

636

*dest = store8888(s);

637

++dest;

638

++src;

639

}

640

_mm_empty();

641

}

642

643

static FASTCALL void

644

mmxCombineSaturateU (uint32_t *dest, const uint32_t *src, int width)

645

{

646

const uint32_t *end = dest + width;

647

while (dest < end) {

648

uint32_t s = *src;

649

uint32_t d = *dest;

650

__m64 ms = load8888(s);

651

__m64 md = load8888(d);

652

uint32_t sa = s >> 24;

653

uint32_t da = ~d >> 24;

654

655

if (sa > da) {

656

__m64 msa = load8888(FbIntDiv(da, sa) << 24);

657

msa = expand_alpha(msa);

658

ms = pix_multiply(ms, msa);

659

}

660

md = pix_add(md, ms);

661

*dest = store8888(md);

662

++src;

663

++dest;

664

}

665

_mm_empty();

666

}

667

668

669

static FASTCALL void

670

mmxCombineSrcC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

671

{

672

const uint32_t *end = src + width;

673

while (src < end) {

674

__m64 a = load8888(*mask);

675

__m64 s = load8888(*src);

676

s = pix_multiply(s, a);

677

*dest = store8888(s);

678

++src;

679

++mask;

680

++dest;

681

}

682

_mm_empty();

683

}

684

685

static FASTCALL void

686

mmxCombineOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

687

{

688

const uint32_t *end = src + width;

689

while (src < end) {

690

__m64 a = load8888(*mask);

691

__m64 s = load8888(*src);

692

__m64 d = load8888(*dest);

693

__m64 sa = expand_alpha(s);

694

695

*dest = store8888(in_over (s, sa, a, d));

696

697

++src;

698

++dest;

699

++mask;

700

}

701

_mm_empty();

702

}

703

704

static FASTCALL void

705

mmxCombineOverReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

706

{

707

const uint32_t *end = src + width;

708

while (src < end) {

709

__m64 a = load8888(*mask);

710

__m64 s = load8888(*src);

711

__m64 d = load8888(*dest);

712

__m64 da = expand_alpha(d);

713

714

*dest = store8888(over (d, da, in (s, a)));

715

716

++src;

717

++dest;

718

++mask;

719

}

720

_mm_empty();

721

}

722

723

724

static FASTCALL void

725

mmxCombineInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

726

{

727

const uint32_t *end = src + width;

728

while (src < end) {

729

__m64 a = load8888(*mask);

730

__m64 s = load8888(*src);

731

__m64 d = load8888(*dest);

732

__m64 da = expand_alpha(d);

733

s = pix_multiply(s, a);

734

s = pix_multiply(s, da);

735

*dest = store8888(s);

736

++src;

737

++dest;

738

++mask;

739

}

740

_mm_empty();

741

}

742

743

static FASTCALL void

744

mmxCombineInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

745

{

746

const uint32_t *end = src + width;

747

while (src < end) {

748

__m64 a = load8888(*mask);

749

__m64 s = load8888(*src);

750

__m64 d = load8888(*dest);

751

__m64 sa = expand_alpha(s);

752

a = pix_multiply(a, sa);

753

d = pix_multiply(d, a);

754

*dest = store8888(d);

755

++src;

756

++dest;

757

++mask;

758

}

759

_mm_empty();

760

}

761

762

static FASTCALL void

763

mmxCombineOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

764

{

765

const uint32_t *end = src + width;

766

while (src < end) {

767

__m64 a = load8888(*mask);

768

__m64 s = load8888(*src);

769

__m64 d = load8888(*dest);

770

__m64 da = expand_alpha(d);

771

da = negate(da);

772

s = pix_multiply(s, a);

773

s = pix_multiply(s, da);

774

*dest = store8888(s);

775

++src;

776

++dest;

777

++mask;

778

}

779

_mm_empty();

780

}

781

782

static FASTCALL void

783

mmxCombineOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

784

{

785

const uint32_t *end = src + width;

786

while (src < end) {

787

__m64 a = load8888(*mask);

788

__m64 s = load8888(*src);

789

__m64 d = load8888(*dest);

790

__m64 sa = expand_alpha(s);

791

a = pix_multiply(a, sa);

792

a = negate(a);

793

d = pix_multiply(d, a);

794

*dest = store8888(d);

795

++src;

796

++dest;

797

++mask;

798

}

799

_mm_empty();

800

}

801

802

static FASTCALL void

803

mmxCombineAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

804

{

805

const uint32_t *end = src + width;

806

while (src < end) {

807

__m64 a = load8888(*mask);

808

__m64 s = load8888(*src);

809

__m64 d = load8888(*dest);

810

__m64 da = expand_alpha(d);

811

__m64 sa = expand_alpha(s);

812

s = pix_multiply(s, a);

813

a = pix_multiply(a, sa);

814

a = negate(a);

815

d = pix_add_mul (d, a, s, da);

816

*dest = store8888(d);

817

++src;

818

++dest;

819

++mask;

820

}

821

_mm_empty();

822

}

823

824

static FASTCALL void

825

mmxCombineAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

826

{

827

const uint32_t *end = src + width;

828

while (src < end) {

829

__m64 a = load8888(*mask);

830

__m64 s = load8888(*src);

831

__m64 d = load8888(*dest);

832

__m64 da = expand_alpha(d);

833

__m64 sa = expand_alpha(s);

834

s = pix_multiply(s, a);

835

a = pix_multiply(a, sa);

836

da = negate(da);

837

d = pix_add_mul (d, a, s, da);

838

*dest = store8888(d);

839

++src;

840

++dest;

841

++mask;

842

}

843

_mm_empty();

844

}

845

846

static FASTCALL void

847

mmxCombineXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

848

{

849

const uint32_t *end = src + width;

850

while (src < end) {

851

__m64 a = load8888(*mask);

852

__m64 s = load8888(*src);

853

__m64 d = load8888(*dest);

854

__m64 da = expand_alpha(d);

855

__m64 sa = expand_alpha(s);

856

s = pix_multiply(s, a);

857

a = pix_multiply(a, sa);

858

da = negate(da);

859

a = negate(a);

860

d = pix_add_mul (d, a, s, da);

861

*dest = store8888(d);

862

++src;

863

++dest;

864

++mask;

865

}

866

_mm_empty();

867

}

868

869

static FASTCALL void

870

mmxCombineAddC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)

871

{

872

const uint32_t *end = src + width;

873

while (src < end) {

874

__m64 a = load8888(*mask);

875

__m64 s = load8888(*src);

876

__m64 d = load8888(*dest);

877

s = pix_multiply(s, a);

878

d = pix_add(s, d);

879

*dest = store8888(d);

880

++src;

881

++dest;

882

++mask;

883

}

884

_mm_empty();

885

}

886

887

void fbComposeSetupMMX(void)

888

{

889

/* check if we have MMX support and initialize accordingly */

890

if (pixman_have_mmx())

891

{

892

pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = mmxCombineOverU;

893

pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseU;

894

pixman_composeFunctions.combineU[PIXMAN_OP_IN] = mmxCombineInU;

895

pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseU;

896

pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = mmxCombineOutU;

897

pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseU;

898

pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = mmxCombineAtopU;

899

pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseU;

900

pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = mmxCombineXorU;

901

pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = mmxCombineAddU;

902

pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = mmxCombineSaturateU;

903

904

pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = mmxCombineSrcC;

905

pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = mmxCombineOverC;

906

pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseC;

907

pixman_composeFunctions.combineC[PIXMAN_OP_IN] = mmxCombineInC;

908

pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseC;

909

pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = mmxCombineOutC;

910

pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseC;

911

pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = mmxCombineAtopC;

912

pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseC;

913

pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = mmxCombineXorC;

914

pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = mmxCombineAddC;

915

916

pixman_composeFunctions.combineMaskU = mmxCombineMaskU;

917

}

918

}

919

920

921

/* ------------------ MMX code paths called from fbpict.c ----------------------- */

void

924

fbCompositeSolid_nx8888mmx (pixman_op_t op,

925

pixman_image_t * pSrc,

926

pixman_image_t * pMask,

927

pixman_image_t * pDst,

928

int16_t xSrc,

929

int16_t ySrc,

930

int16_t xMask,

931

int16_t yMask,

932

int16_t xDst,

933

int16_t yDst,

934

uint16_t width,

935

uint16_t height)

936

{

937

uint32_t src;

938

uint32_t *dstLine, *dst;

939

uint16_t w;

940

int dstStride;

941

__m64 vsrc, vsrca;

942

943

CHECKPOINT();

944

945

fbComposeGetSolid(pSrc, src, pDst->bits.format);

946

947

if (src >> 24 == 0)

948

return;

949

950

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

951

952

vsrc = load8888 (src);

953

vsrca = expand_alpha (vsrc);

954

955

while (height--)

956

{

957

dst = dstLine;

958

dstLine += dstStride;

959

w = width;

960

961

CHECKPOINT();

962

963

while (w && (unsigned long)dst & 7)

964

{

965

*dst = store8888(over(vsrc, vsrca, load8888(*dst)));

966

967

w--;

968

dst++;

969

}

970

971

while (w >= 2)

972

{

973

__m64 vdest;

974

__m64 dest0, dest1;

975

976

vdest = *(__m64 *)dst;

977

978

dest0 = over(vsrc, vsrca, expand8888(vdest, 0));

979

dest1 = over(vsrc, vsrca, expand8888(vdest, 1));

980

981

*(__m64 *)dst = pack8888(dest0, dest1);

982

983

dst += 2;

984

w -= 2;

985

}

986

987

CHECKPOINT();

988

989

while (w)

990

{

991

*dst = store8888(over(vsrc, vsrca, load8888(*dst)));

992

993

w--;

994

dst++;

995

}

996

}

997

998

_mm_empty();

999

}

1000

1001

void

1002

fbCompositeSolid_nx0565mmx (pixman_op_t op,

1003

pixman_image_t * pSrc,

1004

pixman_image_t * pMask,

1005

pixman_image_t * pDst,

1006

int16_t xSrc,

1007

int16_t ySrc,

1008

int16_t xMask,

1009

int16_t yMask,

1010

int16_t xDst,

1011

int16_t yDst,

1012

uint16_t width,

1013

uint16_t height)

1014

{

1015

uint32_t src;

1016

uint16_t *dstLine, *dst;

1017

uint16_t w;

1018

int dstStride;

1019

__m64 vsrc, vsrca;

1020

1021

CHECKPOINT();

1022

1023

fbComposeGetSolid(pSrc, src, pDst->bits.format);

1024

1025

if (src >> 24 == 0)

1026

return;

1027

1028

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

1029

1030

vsrc = load8888 (src);

1031

vsrca = expand_alpha (vsrc);

1032

1033

while (height--)

1034

{

1035

dst = dstLine;

1036

dstLine += dstStride;

1037

w = width;

1038

1039

CHECKPOINT();

1040

1041

while (w && (unsigned long)dst & 7)

1042

{

1043

ullong d = *dst;

1044

__m64 vdest = expand565 ((__m64)d, 0);

1045

vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);

1046

*dst = (ullong)vdest;

1047

1048

w--;

1049

dst++;

1050

}

1051

1052

while (w >= 4)

1053

{

1054

__m64 vdest;

1055

1056

vdest = *(__m64 *)dst;

1057

1058

vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);

1059

vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);

1060

vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);

1061

vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);

1062

1063

*(__m64 *)dst = vdest;

1064

1065

dst += 4;

1066

w -= 4;

1067

}

1068

1069

CHECKPOINT();

1070

1071

while (w)

1072

{

1073

ullong d = *dst;

1074

__m64 vdest = expand565 ((__m64)d, 0);

1075

vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);

1076

*dst = (ullong)vdest;

1077

1078

w--;

1079

dst++;

1080

}

1081

}

1082

1083

_mm_empty();

1084

}

1085

1086

void

1087

fbCompositeSolidMask_nx8888x8888Cmmx (pixman_op_t op,

1088

pixman_image_t * pSrc,

1089

pixman_image_t * pMask,

1090

pixman_image_t * pDst,

1091

int16_t xSrc,

1092

int16_t ySrc,

1093

int16_t xMask,

1094

int16_t yMask,

1095

int16_t xDst,

1096

int16_t yDst,

1097

uint16_t width,

1098

uint16_t height)

1099

{

1100

uint32_t src, srca;

1101

uint32_t *dstLine;

1102

uint32_t *maskLine;

1103

int dstStride, maskStride;

1104

__m64 vsrc, vsrca;

1105

1106

CHECKPOINT();

1107

1108

fbComposeGetSolid(pSrc, src, pDst->bits.format);

1109

1110

srca = src >> 24;

1111

if (srca == 0)

1112

return;

1113

1114

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

1115

fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

1116

1117

vsrc = load8888(src);

1118

vsrca = expand_alpha(vsrc);

1119

1120

while (height--)

1121

{

1122

int twidth = width;

1123

uint32_t *p = (uint32_t *)maskLine;

1124

uint32_t *q = (uint32_t *)dstLine;

1125

1126

while (twidth && (unsigned long)q & 7)

1127

{

1128

uint32_t m = *(uint32_t *)p;

1129

1130

if (m)

1131

{

1132

__m64 vdest = load8888(*q);

1133

vdest = in_over(vsrc, vsrca, load8888(m), vdest);

1134

*q = store8888(vdest);

1135

}

1136

1137

twidth--;

1138

p++;

1139

q++;

1140

}

1141

1142

while (twidth >= 2)

1143

{

1144

uint32_t m0, m1;

1145

m0 = *p;

1146

m1 = *(p + 1);

1147

1148

if (m0 | m1)

1149

{

1150

__m64 dest0, dest1;

1151

__m64 vdest = *(__m64 *)q;

1152

1153

dest0 = in_over(vsrc, vsrca, load8888(m0),

1154

expand8888 (vdest, 0));

1155

dest1 = in_over(vsrc, vsrca, load8888(m1),

1156

expand8888 (vdest, 1));

1157

1158

*(__m64 *)q = pack8888(dest0, dest1);

1159

}

1160

1161

p += 2;

1162

q += 2;

1163

twidth -= 2;

1164

}

1165

1166

while (twidth)

1167

{

1168

uint32_t m = *(uint32_t *)p;

1169

1170

if (m)

1171

{

1172

__m64 vdest = load8888(*q);

1173

vdest = in_over(vsrc, vsrca, load8888(m), vdest);

1174

*q = store8888(vdest);

1175

}

1176

1177

twidth--;

1178

p++;

1179

q++;

1180

}

1181

1182

dstLine += dstStride;

1183

maskLine += maskStride;

1184

}

1185

1186

_mm_empty();

1187

}

1188

1189

void

1190

fbCompositeSrc_8888x8x8888mmx (pixman_op_t op,

1191

pixman_image_t * pSrc,

1192

pixman_image_t * pMask,

1193

pixman_image_t * pDst,

1194

int16_t xSrc,

1195

int16_t ySrc,

1196

int16_t xMask,

1197

int16_t yMask,

1198

int16_t xDst,

1199

int16_t yDst,

1200

uint16_t width,

1201

uint16_t height)

1202

{

1203

uint32_t *dstLine, *dst;

1204

uint32_t *srcLine, *src;

1205

uint32_t mask;

1206

__m64 vmask;

1207

int dstStride, srcStride;

1208

uint16_t w;

1209

__m64 srca;

1210

1211

CHECKPOINT();

1212

1213

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

1214

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

1215

1216

fbComposeGetSolid (pMask, mask, pDst->bits.format);

1217

mask = mask | mask >> 8 | mask >> 16 | mask >> 24;

1218

vmask = load8888 (mask);

1219

srca = MC(4x00ff);

1220

1221

while (height--)

1222

{

1223

dst = dstLine;

1224

dstLine += dstStride;

1225

src = srcLine;

1226

srcLine += srcStride;

1227

w = width;

1228

1229

while (w && (unsigned long)dst & 7)

1230

{

1231

__m64 s = load8888 (*src);

1232

__m64 d = load8888 (*dst);

1233

1234

*dst = store8888 (in_over (s, expand_alpha (s), vmask, d));

1235

1236

w--;

1237

dst++;

1238

src++;

1239

}

1240

1241

while (w >= 2)

1242

{

1243

__m64 vs = *(__m64 *)src;

1244

__m64 vd = *(__m64 *)dst;

1245

__m64 vsrc0 = expand8888 (vs, 0);

1246

__m64 vsrc1 = expand8888 (vs, 1);

1247

1248

*(__m64 *)dst = pack8888 (

1249

in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),

1250

in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

1251

1252

w -= 2;

1253

dst += 2;

1254

src += 2;

1255

}

1256

1257

while (w)

1258

{

1259

__m64 s = load8888 (*src);

1260

__m64 d = load8888 (*dst);

1261

1262

*dst = store8888 (in_over (s, expand_alpha (s), vmask, d));

1263

1264

w--;

1265

dst++;

1266

src++;

1267

}

1268

}

1269

1270

_mm_empty();

1271

}

1272

1273

void

1274

fbCompositeSrc_x888xnx8888mmx (pixman_op_t op,

1275

pixman_image_t * pSrc,

1276

pixman_image_t * pMask,

1277

pixman_image_t * pDst,

1278

int16_t xSrc,

1279

int16_t ySrc,

1280

int16_t xMask,

1281

int16_t yMask,

1282

int16_t xDst,

1283

int16_t yDst,

1284

uint16_t width,

1285

uint16_t height)

1286

{

1287

uint32_t *dstLine, *dst;

1288

uint32_t *srcLine, *src;

1289

uint32_t mask;

1290

__m64 vmask;

1291

int dstStride, srcStride;

1292

uint16_t w;

1293

__m64 srca;

1294

1295

CHECKPOINT();

1296

1297

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

1298

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

1299

fbComposeGetSolid (pMask, mask, pDst->bits.format);

1300

1301

mask = mask | mask >> 8 | mask >> 16 | mask >> 24;

1302

vmask = load8888 (mask);

1303

srca = MC(4x00ff);

1304

1305

while (height--)

1306

{

1307

dst = dstLine;

1308

dstLine += dstStride;

1309

src = srcLine;

1310

srcLine += srcStride;

1311

w = width;

1312

1313

while (w && (unsigned long)dst & 7)

1314

{

1315

__m64 s = load8888 (*src | 0xff000000);

1316

__m64 d = load8888 (*dst);

1317

1318

*dst = store8888 (in_over (s, srca, vmask, d));

1319

1320

w--;

1321

dst++;

1322

src++;

1323

}

1324

1325

while (w >= 16)

1326

{

1327

__m64 vd0 = *(__m64 *)(dst + 0);

1328

__m64 vd1 = *(__m64 *)(dst + 2);

1329

__m64 vd2 = *(__m64 *)(dst + 4);

1330

__m64 vd3 = *(__m64 *)(dst + 6);

1331

__m64 vd4 = *(__m64 *)(dst + 8);

1332

__m64 vd5 = *(__m64 *)(dst + 10);

1333

__m64 vd6 = *(__m64 *)(dst + 12);

1334

__m64 vd7 = *(__m64 *)(dst + 14);

1335

1336

__m64 vs0 = *(__m64 *)(src + 0);

1337

__m64 vs1 = *(__m64 *)(src + 2);

1338

__m64 vs2 = *(__m64 *)(src + 4);

1339

__m64 vs3 = *(__m64 *)(src + 6);

1340

__m64 vs4 = *(__m64 *)(src + 8);

1341

__m64 vs5 = *(__m64 *)(src + 10);

1342

__m64 vs6 = *(__m64 *)(src + 12);

1343

__m64 vs7 = *(__m64 *)(src + 14);

1344

1345

vd0 = pack8888 (

1346

in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),

1347

in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

1348

1349

vd1 = pack8888 (

1350

in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),

1351

in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

1352

1353

vd2 = pack8888 (

1354

in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),

1355

in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

1356

1357

vd3 = pack8888 (

1358

in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),

1359

in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

1360

1361

vd4 = pack8888 (

1362

in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),

1363

in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

1364

1365

vd5 = pack8888 (

1366

in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),

1367

in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

1368

1369

vd6 = pack8888 (

1370

in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),

1371

in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

1372

1373

vd7 = pack8888 (

1374

in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),

1375

in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

1376

1377

*(__m64 *)(dst + 0) = vd0;

1378

*(__m64 *)(dst + 2) = vd1;

1379

*(__m64 *)(dst + 4) = vd2;

1380

*(__m64 *)(dst + 6) = vd3;

1381

*(__m64 *)(dst + 8) = vd4;

1382

*(__m64 *)(dst + 10) = vd5;

1383

*(__m64 *)(dst + 12) = vd6;

1384

*(__m64 *)(dst + 14) = vd7;

1385

1386

w -= 16;

1387

dst += 16;

1388

src += 16;

1389

}

1390

1391

while (w)

1392

{

1393

__m64 s = load8888 (*src | 0xff000000);

1394

__m64 d = load8888 (*dst);

1395

1396

*dst = store8888 (in_over (s, srca, vmask, d));

1397

1398

w--;

1399

dst++;

1400

src++;

1401

}

1402

}

1403

1404

_mm_empty();

1405

}

1406

1407

void

1408

fbCompositeSrc_8888x8888mmx (pixman_op_t op,

1409

pixman_image_t * pSrc,

1410

pixman_image_t * pMask,

1411

pixman_image_t * pDst,

1412

int16_t xSrc,

1413

int16_t ySrc,

1414

int16_t xMask,

1415

int16_t yMask,

1416

int16_t xDst,

1417

int16_t yDst,

1418

uint16_t width,

1419

uint16_t height)

1420

{

1421

uint32_t *dstLine, *dst;

1422

uint32_t *srcLine, *src;

1423

uint32_t s;

1424

int dstStride, srcStride;

1425

uint8_t a;

1426

uint16_t w;

1427

1428

CHECKPOINT();

1429

1430

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

1431

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

1432

1433

while (height--)

1434

{

1435

dst = dstLine;

1436

dstLine += dstStride;

1437

src = srcLine;

1438

srcLine += srcStride;

1439

w = width;

1440

1441

while (w--)

1442

{

1443

s = *src++;

1444

a = s >> 24;

1445

if (a == 0xff)

1446

*dst = s;

1447

else if (a) {

1448

__m64 ms, sa;

1449

ms = load8888(s);

1450

sa = expand_alpha(ms);

1451

*dst = store8888(over(ms, sa, load8888(*dst)));

1452

}

1453

dst++;

1454

}

1455

}

1456

_mm_empty();

1457

}

1458

1459

void

1460

fbCompositeSrc_8888x0565mmx (pixman_op_t op,

1461

pixman_image_t * pSrc,

1462

pixman_image_t * pMask,

1463

pixman_image_t * pDst,

1464

int16_t xSrc,

1465

int16_t ySrc,

1466

int16_t xMask,

1467

int16_t yMask,

1468

int16_t xDst,

1469

int16_t yDst,

1470

uint16_t width,

1471

uint16_t height)

1472

{

1473

uint16_t *dstLine, *dst;

1474

uint32_t *srcLine, *src;

1475

int dstStride, srcStride;

1476

uint16_t w;

1477

1478

CHECKPOINT();

1479

1480

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

1481

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

1482

1483

#if 0

1484

/* FIXME */

1485

assert (pSrc->pDrawable == pMask->pDrawable);

1486

#endif

1487

1488

while (height--)

1489

{

1490

dst = dstLine;

1491

dstLine += dstStride;

1492

src = srcLine;

1493

srcLine += srcStride;

1494

w = width;

1495

1496

CHECKPOINT();

1497

1498

while (w && (unsigned long)dst & 7)

1499

{

1500

__m64 vsrc = load8888 (*src);

1501

ullong d = *dst;

1502

__m64 vdest = expand565 ((__m64)d, 0);

1503

1504

vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0);

1505

1506

*dst = (ullong)vdest;

1507

1508

w--;

1509

dst++;

1510

src++;

1511

}

1512

1513

CHECKPOINT();

1514

1515

while (w >= 4)

1516

{

1517

__m64 vsrc0, vsrc1, vsrc2, vsrc3;

1518

__m64 vdest;

1519

1520

vsrc0 = load8888(*(src + 0));

1521

vsrc1 = load8888(*(src + 1));

1522

vsrc2 = load8888(*(src + 2));

1523

vsrc3 = load8888(*(src + 3));

1524

1525

vdest = *(__m64 *)dst;

1526

1527

vdest = pack565(over(vsrc0, expand_alpha(vsrc0), expand565(vdest, 0)), vdest, 0);

1528

vdest = pack565(over(vsrc1, expand_alpha(vsrc1), expand565(vdest, 1)), vdest, 1);

1529

vdest = pack565(over(vsrc2, expand_alpha(vsrc2), expand565(vdest, 2)), vdest, 2);

1530

vdest = pack565(over(vsrc3, expand_alpha(vsrc3), expand565(vdest, 3)), vdest, 3);

1531

1532

*(__m64 *)dst = vdest;

1533

1534

w -= 4;

1535

dst += 4;

1536

src += 4;

1537

}

1538

1539

CHECKPOINT();

1540

1541

while (w)

1542

{

1543

__m64 vsrc = load8888 (*src);

1544

ullong d = *dst;

1545

__m64 vdest = expand565 ((__m64)d, 0);

1546

1547

vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0);

1548

1549

*dst = (ullong)vdest;

1550

1551

w--;

1552

dst++;

1553

src++;

1554

}

1555

}

1556

1557

_mm_empty();

1558

}

1559

1560

void

1561

fbCompositeSolidMask_nx8x8888mmx (pixman_op_t op,

1562

pixman_image_t * pSrc,

1563

pixman_image_t * pMask,

1564

pixman_image_t * pDst,

1565

int16_t xSrc,

1566

int16_t ySrc,

1567

int16_t xMask,

1568

int16_t yMask,

1569

int16_t xDst,

1570

int16_t yDst,

1571

uint16_t width,

1572

uint16_t height)

1573

{

1574

uint32_t src, srca;

1575

uint32_t *dstLine, *dst;

1576

uint8_t *maskLine, *mask;

1577

int dstStride, maskStride;

1578

uint16_t w;

1579

__m64 vsrc, vsrca;

1580

ullong srcsrc;

1581

1582

CHECKPOINT();

1583

1584

fbComposeGetSolid(pSrc, src, pDst->bits.format);

1585

1586

srca = src >> 24;

1587

if (srca == 0)

1588

return;

1589

1590

srcsrc = (unsigned long long)src << 32 | src;

1591

1592

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

1593

fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

1594

1595

vsrc = load8888 (src);

1596

vsrca = expand_alpha (vsrc);

1597

1598

while (height--)

1599

{

1600

dst = dstLine;

1601

dstLine += dstStride;

1602

mask = maskLine;

1603

maskLine += maskStride;

1604

w = width;

1605

1606

CHECKPOINT();

1607

1608

while (w && (unsigned long)dst & 7)

1609

{

1610

ullong m = *mask;

1611

1612

if (m)

1613

{

1614

__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));

1615

*dst = store8888(vdest);

1616

}

1617

1618

w--;

1619

mask++;

1620

dst++;

1621

}

1622

1623

CHECKPOINT();

1624

1625

while (w >= 2)

1626

{

1627

ullong m0, m1;

1628

m0 = *mask;

1629

m1 = *(mask + 1);

1630

1631

if (srca == 0xff && (m0 & m1) == 0xff)

1632

{

1633

*(unsigned long long *)dst = srcsrc;

1634

}

1635

else if (m0 | m1)

1636

{

1637

__m64 vdest;

1638

__m64 dest0, dest1;

1639

1640

vdest = *(__m64 *)dst;

1641

1642

dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));

1643

dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));

1644

1645

*(__m64 *)dst = pack8888(dest0, dest1);

1646

}

1647

1648

mask += 2;

1649

dst += 2;

1650

w -= 2;

1651

}

1652

1653

CHECKPOINT();

1654

1655

while (w)

1656

{

1657

ullong m = *mask;

1658

1659

if (m)

1660

{

1661

__m64 vdest = load8888(*dst);

1662

vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);

1663

*dst = store8888(vdest);

1664

}

1665

1666

w--;

1667

mask++;

1668

dst++;

1669

}

1670

}

1671

1672

_mm_empty();

1673

}

1674

1675

pixman_bool_t

1676

pixman_fill_mmx (uint32_t *bits,

1677

int stride,

1678

int bpp,

1679

int x,

1680

int y,

1681

int width,

1682

int height,

1683

uint32_t xor)

1684

{

1685

ullong fill;

1686

__m64 vfill;

1687

uint32_t byte_width;

1688

uint8_t *byte_line;

1689

#ifdef __GNUC__

1690

__m64 v1, v2, v3, v4, v5, v6, v7;

1691

#endif

1692

1693

if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))

1694

return FALSE;

1695

1696

if (bpp != 16 && bpp != 32)

1697

return FALSE;

1698

1699

if (bpp == 16)

1700

{

1701

stride = stride * sizeof (uint32_t) / 2;

1702

byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);

1703

byte_width = 2 * width;

1704

stride *= 2;

1705

}

1706

else

1707

{

1708

stride = stride * sizeof (uint32_t) / 4;

1709

byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);

1710

byte_width = 4 * width;

1711

stride *= 4;

1712

}

1713

1714

fill = ((ullong)xor << 32) | xor;

1715

vfill = (__m64)fill;

1716

1717

#ifdef __GNUC__

1718

__asm__ (

1719

"movq %7, %0\n"

1720

"movq %7, %1\n"

1721

"movq %7, %2\n"

1722

"movq %7, %3\n"

1723

"movq %7, %4\n"

1724

"movq %7, %5\n"

1725

"movq %7, %6\n"

1726

: "=y" (v1), "=y" (v2), "=y" (v3),

1727

"=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)

1728

: "y" (vfill));

1729

#endif

1730

1731

while (height--)

1732

{

1733

int w;

1734

uint8_t *d = byte_line;

1735

byte_line += stride;

1736

w = byte_width;

1737

1738

while (w >= 2 && ((unsigned long)d & 3))

1739

{

1740

*(uint16_t *)d = xor;

1741

w -= 2;

1742

d += 2;

1743

}

1744

1745

while (w >= 4 && ((unsigned long)d & 7))

1746

{

1747

*(uint32_t *)d = xor;

1748

1749

w -= 4;

1750

d += 4;

1751

}

1752

1753

while (w >= 64)

1754

{

1755

#ifdef __GNUC__

1756

__asm__ (

1757

"movq %1, (%0)\n"

1758

"movq %2, 8(%0)\n"

1759

"movq %3, 16(%0)\n"

1760

"movq %4, 24(%0)\n"

1761

"movq %5, 32(%0)\n"

1762

"movq %6, 40(%0)\n"

1763

"movq %7, 48(%0)\n"

1764

"movq %8, 56(%0)\n"

1765

1766

: "r" (d),

1767

"y" (vfill), "y" (v1), "y" (v2), "y" (v3),

1768

"y" (v4), "y" (v5), "y" (v6), "y" (v7)

1769

: "memory");

1770

#else

1771

*(__m64*) (d + 0) = vfill;

1772

*(__m64*) (d + 8) = vfill;

1773

*(__m64*) (d + 16) = vfill;

1774

*(__m64*) (d + 24) = vfill;

1775

*(__m64*) (d + 32) = vfill;

1776

*(__m64*) (d + 40) = vfill;

1777

*(__m64*) (d + 48) = vfill;

1778

*(__m64*) (d + 56) = vfill;

1779

#endif

1780

w -= 64;

1781

d += 64;

1782

}

1783

1784

while (w >= 4)

1785

{

1786

*(uint32_t *)d = xor;

1787

1788

w -= 4;

1789

d += 4;

1790

}

1791

if (w >= 2)

1792

{

1793

*(uint16_t *)d = xor;

1794

w -= 2;

1795

d += 2;

1796

}

1797

}

1798

1799

_mm_empty();

1800

return TRUE;

1801

}

1802

1803

void

1804

fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_op_t op,

1805

pixman_image_t * pSrc,

1806

pixman_image_t * pMask,

1807

pixman_image_t * pDst,

1808

int16_t xSrc,

1809

int16_t ySrc,

1810

int16_t xMask,

1811

int16_t yMask,

1812

int16_t xDst,

1813

int16_t yDst,

1814

uint16_t width,

1815

uint16_t height)

1816

{

1817

uint32_t src, srca;

1818

uint32_t *dstLine, *dst;

1819

uint8_t *maskLine, *mask;

1820

int dstStride, maskStride;

1821

uint16_t w;

1822

__m64 vsrc, vsrca;

1823

ullong srcsrc;

1824

1825

CHECKPOINT();

1826

1827

fbComposeGetSolid(pSrc, src, pDst->bits.format);

1828

1829

srca = src >> 24;

1830

if (srca == 0)

1831

{

1832

pixman_fill_mmx (pDst->bits.bits, pDst->bits.rowstride, PIXMAN_FORMAT_BPP (pDst->bits.format),

1833

xDst, yDst, width, height, 0);

1834

return;

1835

}

1836

1837

srcsrc = (ullong)src << 32 | src;

1838

1839

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

1840

fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

1841

1842

vsrc = load8888 (src);

1843

vsrca = expand_alpha (vsrc);

1844

1845

while (height--)

1846

{

1847

dst = dstLine;

1848

dstLine += dstStride;

1849

mask = maskLine;

1850

maskLine += maskStride;

1851

w = width;

1852

1853

CHECKPOINT();

1854

1855

while (w && (unsigned long)dst & 7)

1856

{

1857

ullong m = *mask;

1858

1859

if (m)

1860

{

1861

__m64 vdest = in(vsrc, expand_alpha_rev ((__m64)m));

1862

*dst = store8888(vdest);

1863

}

1864

else

1865

{

1866

*dst = 0;

1867

}

1868

1869

w--;

1870

mask++;

1871

dst++;

1872

}

1873

1874

CHECKPOINT();

1875

1876

while (w >= 2)

1877

{

1878

ullong m0, m1;

1879

m0 = *mask;

1880

m1 = *(mask + 1);

1881

1882

if (srca == 0xff && (m0 & m1) == 0xff)

1883

{

1884

*(ullong *)dst = srcsrc;

1885

}

1886

else if (m0 | m1)

1887

{

1888

__m64 vdest;

1889

__m64 dest0, dest1;

1890

1891

vdest = *(__m64 *)dst;

1892

1893

dest0 = in(vsrc, expand_alpha_rev ((__m64)m0));

1894

dest1 = in(vsrc, expand_alpha_rev ((__m64)m1));

1895

1896

*(__m64 *)dst = pack8888(dest0, dest1);

1897

}

1898

else

1899

{

1900

*(ullong *)dst = 0;

1901

}

1902

1903

mask += 2;

1904

dst += 2;

1905

w -= 2;

1906

}

1907

1908

CHECKPOINT();

1909

1910

while (w)

1911

{

1912

ullong m = *mask;

1913

1914

if (m)

1915

{

1916

__m64 vdest = load8888(*dst);

1917

vdest = in(vsrc, expand_alpha_rev ((__m64)m));

1918

*dst = store8888(vdest);

1919

}

1920

else

1921

{

1922

*dst = 0;

1923

}

1924

1925

w--;

1926

mask++;

1927

dst++;

1928

}

1929

}

1930

1931

_mm_empty();

1932

}

1933

1934

void

1935

fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op,

1936

pixman_image_t * pSrc,

1937

pixman_image_t * pMask,

1938

pixman_image_t * pDst,

1939

int16_t xSrc,

1940

int16_t ySrc,

1941

int16_t xMask,

1942

int16_t yMask,

1943

int16_t xDst,

1944

int16_t yDst,

1945

uint16_t width,

1946

uint16_t height)

1947

{

1948

uint32_t src, srca;

1949

uint16_t *dstLine, *dst;

1950

uint8_t *maskLine, *mask;

1951

int dstStride, maskStride;

1952

uint16_t w;

1953

__m64 vsrc, vsrca;

1954

unsigned long long srcsrcsrcsrc, src16;

1955

1956

CHECKPOINT();

1957

1958

fbComposeGetSolid(pSrc, src, pDst->bits.format);

1959

1960

srca = src >> 24;

1961

if (srca == 0)

1962

return;

1963

1964

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

1965

fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

1966

1967

vsrc = load8888 (src);

1968

vsrca = expand_alpha (vsrc);

1969

1970

src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);

1971

1972

srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |

1973

(ullong)src16 << 16 | (ullong)src16;

1974

1975

while (height--)

1976

{

1977

dst = dstLine;

1978

dstLine += dstStride;

1979

mask = maskLine;

1980

maskLine += maskStride;

1981

w = width;

1982

1983

CHECKPOINT();

1984

1985

while (w && (unsigned long)dst & 7)

1986

{

1987

ullong m = *mask;

1988

1989

if (m)

1990

{

1991

ullong d = *dst;

1992

__m64 vd = (__m64)d;

1993

__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));

1994

*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);

1995

}

1996

1997

w--;

1998

mask++;

1999

dst++;

2000

}

2001

2002

CHECKPOINT();

2003

2004

while (w >= 4)

2005

{

2006

ullong m0, m1, m2, m3;

2007

m0 = *mask;

2008

m1 = *(mask + 1);

2009

m2 = *(mask + 2);

2010

m3 = *(mask + 3);

2011

2012

if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)

2013

{

2014

*(unsigned long long *)dst = srcsrcsrcsrc;

2015

}

2016

else if (m0 | m1 | m2 | m3)

2017

{

2018

__m64 vdest;

2019

__m64 vm0, vm1, vm2, vm3;

2020

2021

vdest = *(__m64 *)dst;

2022

2023

vm0 = (__m64)m0;

2024

vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);

2025

vm1 = (__m64)m1;

2026

vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);

2027

vm2 = (__m64)m2;

2028

vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);

2029

vm3 = (__m64)m3;

2030

vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);

2031

2032

*(__m64 *)dst = vdest;

2033

}

2034

2035

w -= 4;

2036

mask += 4;

2037

dst += 4;

2038

}

2039

2040

CHECKPOINT();

2041

2042

while (w)

2043

{

2044

ullong m = *mask;

2045

2046

if (m)

2047

{

2048

ullong d = *dst;

2049

__m64 vd = (__m64)d;

2050

__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));

2051

*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);

2052

}

2053

2054

w--;

2055

mask++;

2056

dst++;

2057

}

2058

}

2059

2060

_mm_empty();

2061

}

2062

2063

void

2064

fbCompositeSrc_8888RevNPx0565mmx (pixman_op_t op,

2065

pixman_image_t * pSrc,

2066

pixman_image_t * pMask,

2067

pixman_image_t * pDst,

2068

int16_t xSrc,

2069

int16_t ySrc,

2070

int16_t xMask,

2071

int16_t yMask,

2072

int16_t xDst,

2073

int16_t yDst,

2074

uint16_t width,

2075

uint16_t height)

2076

{

2077

uint16_t *dstLine, *dst;

2078

uint32_t *srcLine, *src;

2079

int dstStride, srcStride;

2080

uint16_t w;

2081

2082

CHECKPOINT();

2083

2084

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

2085

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

2086

2087

#if 0

2088

/* FIXME */

2089

assert (pSrc->pDrawable == pMask->pDrawable);

2090

#endif

2091

2092

while (height--)

2093

{

2094

dst = dstLine;

2095

dstLine += dstStride;

2096

src = srcLine;

2097

srcLine += srcStride;

2098

w = width;

2099

2100

CHECKPOINT();

2101

2102

while (w && (unsigned long)dst & 7)

2103

{

2104

__m64 vsrc = load8888 (*src);

2105

ullong d = *dst;

2106

__m64 vdest = expand565 ((__m64)d, 0);

2107

2108

vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

2109

2110

*dst = (ullong)vdest;

2111

2112

w--;

2113

dst++;

2114

src++;

2115

}

2116

2117

CHECKPOINT();

2118

2119

while (w >= 4)

2120

{

2121

uint32_t s0, s1, s2, s3;

2122

unsigned char a0, a1, a2, a3;

2123

2124

s0 = *src;

2125

s1 = *(src + 1);

2126

s2 = *(src + 2);

2127

s3 = *(src + 3);

2128

2129

a0 = (s0 >> 24);

2130

a1 = (s1 >> 24);

2131

a2 = (s2 >> 24);

2132

a3 = (s3 >> 24);

2133

2134

if ((a0 & a1 & a2 & a3) == 0xFF)

2135

{

2136

__m64 vdest;

2137

vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);

2138

vdest = pack565(invert_colors(load8888(s1)), vdest, 1);

2139

vdest = pack565(invert_colors(load8888(s2)), vdest, 2);

2140

vdest = pack565(invert_colors(load8888(s3)), vdest, 3);

2141

2142

*(__m64 *)dst = vdest;

2143

}

2144

else if (a0 | a1 | a2 | a3)

2145

{

2146

__m64 vdest = *(__m64 *)dst;

2147

2148

vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);

2149

vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);

2150

vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);

2151

vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);

2152

2153

*(__m64 *)dst = vdest;

2154

}

2155

2156

w -= 4;

2157

dst += 4;

2158

src += 4;

2159

}

2160

2161

CHECKPOINT();

2162

2163

while (w)

2164

{

2165

__m64 vsrc = load8888 (*src);

2166

ullong d = *dst;

2167

__m64 vdest = expand565 ((__m64)d, 0);

2168

2169

vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

2170

2171

*dst = (ullong)vdest;

2172

2173

w--;

2174

dst++;

2175

src++;

2176

}

2177

}

2178

2179

_mm_empty();

2180

}

2181

2182

/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */

2183

2184

void

2185

fbCompositeSrc_8888RevNPx8888mmx (pixman_op_t op,

2186

pixman_image_t * pSrc,

2187

pixman_image_t * pMask,

2188

pixman_image_t * pDst,

2189

int16_t xSrc,

2190

int16_t ySrc,

2191

int16_t xMask,

2192

int16_t yMask,

2193

int16_t xDst,

2194

int16_t yDst,

2195

uint16_t width,

2196

uint16_t height)

2197

{

2198

uint32_t *dstLine, *dst;

2199

uint32_t *srcLine, *src;

2200

int dstStride, srcStride;

2201

uint16_t w;

2202

2203

CHECKPOINT();

2204

2205

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

2206

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

2207

2208

#if 0

2209

/* FIXME */

2210

assert (pSrc->pDrawable == pMask->pDrawable);

2211

#endif

2212

2213

while (height--)

2214

{

2215

dst = dstLine;

2216

dstLine += dstStride;

2217

src = srcLine;

2218

srcLine += srcStride;

2219

w = width;

2220

2221

while (w && (unsigned long)dst & 7)

2222

{

2223

__m64 s = load8888 (*src);

2224

__m64 d = load8888 (*dst);

2225

2226

*dst = store8888 (over_rev_non_pre (s, d));

2227

2228

w--;

2229

dst++;

2230

src++;

2231

}

2232

2233

while (w >= 2)

2234

{

2235

ullong s0, s1;

2236

unsigned char a0, a1;

2237

__m64 d0, d1;

2238

2239

s0 = *src;

2240

s1 = *(src + 1);

2241

2242

a0 = (s0 >> 24);

2243

a1 = (s1 >> 24);

2244

2245

if ((a0 & a1) == 0xFF)

2246

{

2247

d0 = invert_colors(load8888(s0));

2248

d1 = invert_colors(load8888(s1));

2249

2250

*(__m64 *)dst = pack8888 (d0, d1);

2251

}

2252

else if (a0 | a1)

2253

{

2254

__m64 vdest = *(__m64 *)dst;

2255

2256

d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));

2257

d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));

2258

2259

*(__m64 *)dst = pack8888 (d0, d1);

2260

}

2261

2262

w -= 2;

2263

dst += 2;

2264

src += 2;

2265

}

2266

2267

while (w)

2268

{

2269

__m64 s = load8888 (*src);

2270

__m64 d = load8888 (*dst);

2271

2272

*dst = store8888 (over_rev_non_pre (s, d));

2273

2274

w--;

2275

dst++;

2276

src++;

2277

}

2278

}

2279

2280

_mm_empty();

2281

}

2282

2283

void

2284

fbCompositeSolidMask_nx8888x0565Cmmx (pixman_op_t op,

2285

pixman_image_t * pSrc,

2286

pixman_image_t * pMask,

2287

pixman_image_t * pDst,

2288

int16_t xSrc,

2289

int16_t ySrc,

2290

int16_t xMask,

2291

int16_t yMask,

2292

int16_t xDst,

2293

int16_t yDst,

2294

uint16_t width,

2295

uint16_t height)

2296

{

2297

uint32_t src, srca;

2298

uint16_t *dstLine;

2299

uint32_t *maskLine;

2300

int dstStride, maskStride;

2301

__m64 vsrc, vsrca;

2302

2303

CHECKPOINT();

2304

2305

fbComposeGetSolid(pSrc, src, pDst->bits.format);

2306

2307

srca = src >> 24;

2308

if (srca == 0)

2309

return;

2310

2311

fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

2312

fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

2313

2314

vsrc = load8888 (src);

2315

vsrca = expand_alpha (vsrc);

2316

2317

while (height--)

2318

{

2319

int twidth = width;

2320

uint32_t *p = (uint32_t *)maskLine;

2321

uint16_t *q = (uint16_t *)dstLine;

2322

2323

while (twidth && ((unsigned long)q & 7))

2324

{

2325

uint32_t m = *(uint32_t *)p;

2326

2327

if (m)

2328

{

2329

ullong d = *q;

2330

__m64 vdest = expand565 ((__m64)d, 0);

2331

vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);

2332

*q = (ullong)vdest;

2333

}

2334

2335

twidth--;

2336

p++;

2337

q++;

2338

}

2339

2340

while (twidth >= 4)

2341

{

2342

uint32_t m0, m1, m2, m3;

2343

2344

m0 = *p;

2345

m1 = *(p + 1);

2346

m2 = *(p + 2);

2347

m3 = *(p + 3);

2348

2349

if ((m0 | m1 | m2 | m3))

2350

{

2351

__m64 vdest = *(__m64 *)q;

2352

2353

vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);

2354

vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);

2355

vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);

2356

vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);

2357

2358

*(__m64 *)q = vdest;

2359

}

2360

twidth -= 4;

2361

p += 4;

2362

q += 4;

2363

}

2364

2365

while (twidth)

2366

{

2367

uint32_t m;

2368

2369

m = *(uint32_t *)p;

2370

if (m)

2371

{

2372

ullong d = *q;

2373

__m64 vdest = expand565((__m64)d, 0);

2374

vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);

2375

*q = (ullong)vdest;

2376

}

2377

2378

twidth--;

2379

p++;

2380

q++;

2381

}

2382

2383

maskLine += maskStride;

2384

dstLine += dstStride;

2385

}

2386

2387

_mm_empty ();

2388

}

2389

2390

void

2391

fbCompositeIn_nx8x8mmx (pixman_op_t op,

2392

pixman_image_t * pSrc,

2393

pixman_image_t * pMask,

2394

pixman_image_t * pDst,

2395

int16_t xSrc,

2396

int16_t ySrc,

2397

int16_t xMask,

2398

int16_t yMask,

2399

int16_t xDst,

2400

int16_t yDst,

2401

uint16_t width,

2402

uint16_t height)

2403

{

2404

uint8_t *dstLine, *dst;

2405

uint8_t *maskLine, *mask;

2406

int dstStride, maskStride;

2407

uint16_t w;

2408

uint32_t src;

2409

uint8_t sa;

2410

__m64 vsrc, vsrca;

2411

2412

fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

2413

fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

2414

2415

fbComposeGetSolid(pSrc, src, pDst->bits.format);

2416

2417

sa = src >> 24;

2418

if (sa == 0)

2419

return;

2420

2421

vsrc = load8888(src);

2422

vsrca = expand_alpha(vsrc);

2423

2424

while (height--)

2425

{

2426

dst = dstLine;

2427

dstLine += dstStride;

2428

mask = maskLine;

2429

maskLine += maskStride;

2430

w = width;

2431

2432

if ((((unsigned long)pDst & 3) == 0) &&

2433

(((unsigned long)pSrc & 3) == 0))

2434

{

2435

while (w >= 4)

2436

{

2437

uint32_t m;

2438

__m64 vmask;

2439

__m64 vdest;

2440

2441

m = 0;

2442

2443

vmask = load8888 (*(uint32_t *)mask);

2444

vdest = load8888 (*(uint32_t *)dst);

2445

2446

*(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));

2447

2448

dst += 4;

2449

mask += 4;

2450

w -= 4;

2451

}

2452

}

2453

2454

while (w--)

2455

{

2456

uint16_t tmp;

2457

uint8_t a;

2458

uint32_t m, d;

2459

uint32_t r;

2460

2461

a = *mask++;

2462

d = *dst;

2463

2464

m = FbInU (sa, 0, a, tmp);

2465

r = FbInU (m, 0, d, tmp);

2466

2467

*dst++ = r;

2468

}

2469

}

2470

2471

_mm_empty();

2472

}

2473

2474

void

2475

fbCompositeIn_8x8mmx (pixman_op_t op,

2476

pixman_image_t * pSrc,

2477

pixman_image_t * pMask,

2478

pixman_image_t * pDst,

2479

int16_t xSrc,

2480

int16_t ySrc,

2481

int16_t xMask,

2482

int16_t yMask,

2483

int16_t xDst,

2484

int16_t yDst,

2485

uint16_t width,

2486

uint16_t height)

2487

{

2488

uint8_t *dstLine, *dst;

2489

uint8_t *srcLine, *src;

2490

int srcStride, dstStride;

2491

uint16_t w;

2492

2493

fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

2494

fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);

2495

2496

while (height--)

2497

{

2498

dst = dstLine;

2499

dstLine += dstStride;

2500

src = srcLine;

2501

srcLine += srcStride;

2502

w = width;

2503

2504

if ((((unsigned long)pDst & 3) == 0) &&

2505

(((unsigned long)pSrc & 3) == 0))

2506

{

2507

while (w >= 4)

2508

{

2509

uint32_t *s = (uint32_t *)src;

2510

uint32_t *d = (uint32_t *)dst;

2511

2512

*d = store8888 (in (load8888 (*s), load8888 (*d)));

2513

2514

w -= 4;

2515

dst += 4;

2516

src += 4;

2517

}

2518

}

2519

2520

while (w--)

2521

{

2522

uint8_t s, d;

2523

uint16_t tmp;

2524

2525

s = *src;

2526

d = *dst;

2527

2528

*dst = FbInU (s, 0, d, tmp);

2529

2530

src++;

2531

dst++;

2532

}

2533

}

2534

2535

_mm_empty ();

2536

}

2537

2538

void

2539

fbCompositeSrcAdd_8888x8x8mmx (pixman_op_t op,

2540

pixman_image_t * pSrc,

2541

pixman_image_t * pMask,

2542

pixman_image_t * pDst,

2543

int16_t xSrc,

2544

int16_t ySrc,

2545

int16_t xMask,

2546

int16_t yMask,

2547

int16_t xDst,

2548

int16_t yDst,

2549

uint16_t width,

2550

uint16_t height)

2551

{

2552

uint8_t *dstLine, *dst;

2553

uint8_t *maskLine, *mask;

2554

int dstStride, maskStride;

2555

uint16_t w;

2556

uint32_t src;

2557

uint8_t sa;

2558

__m64 vsrc, vsrca;

2559

2560

fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

2561

fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

2562

2563

fbComposeGetSolid(pSrc, src, pDst->bits.format);

2564

2565

sa = src >> 24;

2566

if (sa == 0)

2567

return;

2568

2569

vsrc = load8888(src);

2570

vsrca = expand_alpha(vsrc);

2571

2572

while (height--)

2573

{

2574

dst = dstLine;

2575

dstLine += dstStride;

2576

mask = maskLine;

2577

maskLine += maskStride;

2578

w = width;

2579

2580

if ((((unsigned long)pMask & 3) == 0) &&

2581

(((unsigned long)pDst & 3) == 0))

2582

{

2583

while (w >= 4)

2584

{

2585

__m64 vmask = load8888 (*(uint32_t *)mask);

2586

__m64 vdest = load8888 (*(uint32_t *)dst);

2587

2588

*(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));

2589

2590

w -= 4;

2591

dst += 4;

2592

mask += 4;

2593

}

2594

}

2595

2596

while (w--)

2597

{

2598

uint16_t tmp;

2599

uint16_t a;

2600

uint32_t m, d;

2601

uint32_t r;

2602

2603

a = *mask++;

2604

d = *dst;

2605

2606

m = FbInU (sa, 0, a, tmp);

2607

r = FbAdd (m, d, 0, tmp);

2608

2609

*dst++ = r;

2610

}

2611

}

2612

2613

_mm_empty();

2614

}

2615

2616

void

2617

fbCompositeSrcAdd_8000x8000mmx (pixman_op_t op,

2618

pixman_image_t * pSrc,

2619

pixman_image_t * pMask,

2620

pixman_image_t * pDst,

2621

int16_t xSrc,

2622

int16_t ySrc,

2623

int16_t xMask,

2624

int16_t yMask,

2625

int16_t xDst,

2626

int16_t yDst,

2627

uint16_t width,

2628

uint16_t height)

2629

{

2630

uint8_t *dstLine, *dst;

2631

uint8_t *srcLine, *src;

2632

int dstStride, srcStride;

2633

uint16_t w;

2634

uint8_t s, d;

2635

uint16_t t;

2636

2637

CHECKPOINT();

2638

2639

fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);

2640

fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

2641

2642

while (height--)

2643

{

2644

dst = dstLine;

2645

dstLine += dstStride;

2646

src = srcLine;

2647

srcLine += srcStride;

2648

w = width;

2649

2650

while (w && (unsigned long)dst & 7)

2651

{

2652

s = *src;

2653

d = *dst;

2654

t = d + s;

2655

s = t | (0 - (t >> 8));

2656

*dst = s;

2657

2658

dst++;

2659

src++;

2660

w--;

2661

}

2662

2663

while (w >= 8)

2664

{

2665

*(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);

2666

dst += 8;

2667

src += 8;

2668

w -= 8;

2669

}

2670

2671

while (w)

2672

{

2673

s = *src;

2674

d = *dst;

2675

t = d + s;

2676

s = t | (0 - (t >> 8));

2677

*dst = s;

2678

2679

dst++;

2680

src++;

2681

w--;

2682

}

2683

}

2684

2685

_mm_empty();

2686

}

2687

2688

void

2689

fbCompositeSrcAdd_8888x8888mmx (pixman_op_t op,

2690

pixman_image_t * pSrc,

2691

pixman_image_t * pMask,

2692

pixman_image_t * pDst,

2693

int16_t xSrc,

2694

int16_t ySrc,

2695

int16_t xMask,

2696

int16_t yMask,

2697

int16_t xDst,

2698

int16_t yDst,

2699

uint16_t width,

2700

uint16_t height)

2701

{

2702

uint32_t *dstLine, *dst;

2703

uint32_t *srcLine, *src;

2704

int dstStride, srcStride;

2705

uint16_t w;

2706

2707

CHECKPOINT();

2708

2709

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

2710

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

2711

2712

while (height--)

2713

{

2714

dst = dstLine;

2715

dstLine += dstStride;

2716

src = srcLine;

2717

srcLine += srcStride;

2718

w = width;

2719

2720

while (w && (unsigned long)dst & 7)

2721

{

2722

*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),

2723

_mm_cvtsi32_si64(*dst)));

2724

dst++;

2725

src++;

2726

w--;

2727

}

2728

2729

while (w >= 2)

2730

{

2731

*(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);

2732

dst += 2;

2733

src += 2;

2734

w -= 2;

2735

}

2736

2737

if (w)

2738

{

2739

*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),

2740

_mm_cvtsi32_si64(*dst)));

2741

2742

}

2743

}

2744

2745

_mm_empty();

2746

}

2747

2748

pixman_bool_t

2749

pixman_blt_mmx (uint32_t *src_bits,

2750

uint32_t *dst_bits,

2751

int src_stride,

2752

int dst_stride,

2753

int src_bpp,

2754

int dst_bpp,

2755

int src_x, int src_y,

2756

int dst_x, int dst_y,

2757

int width, int height)

2758

{

2759

uint8_t * src_bytes;

2760

uint8_t * dst_bytes;

2761

int byte_width;

2762

2763

if (src_bpp != dst_bpp)

2764

return FALSE;

2765

2766

if (src_bpp == 16)

2767

{

2768

src_stride = src_stride * sizeof (uint32_t) / 2;

2769

dst_stride = dst_stride * sizeof (uint32_t) / 2;

2770

src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));

2771

dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));

2772

byte_width = 2 * width;

2773

src_stride *= 2;

2774

dst_stride *= 2;

2775

} else if (src_bpp == 32) {

2776

src_stride = src_stride * sizeof (uint32_t) / 4;

2777

dst_stride = dst_stride * sizeof (uint32_t) / 4;

2778

src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));

2779

dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));

2780

byte_width = 4 * width;

2781

src_stride *= 4;

2782

dst_stride *= 4;

2783

} else {

2784

return FALSE;

2785

}

2786

2787

while (height--)

2788

{

2789

int w;

2790

uint8_t *s = src_bytes;

2791

uint8_t *d = dst_bytes;

2792

src_bytes += src_stride;

2793

dst_bytes += dst_stride;

2794

w = byte_width;

2795

2796

while (w >= 2 && ((unsigned long)d & 3))

2797

{

2798

*(uint16_t *)d = *(uint16_t *)s;

2799

w -= 2;

2800

s += 2;

2801

d += 2;

2802

}

2803

2804

while (w >= 4 && ((unsigned long)d & 7))

2805

{

2806

*(uint32_t *)d = *(uint32_t *)s;

2807

2808

w -= 4;

2809

s += 4;

2810

d += 4;

2811

}

2812

2813

while (w >= 64)

2814

{

2815

#ifdef __GNUC__

2816

__asm__ (

2817

"movq (%1), %%mm0\n"

2818

"movq 8(%1), %%mm1\n"

2819

"movq 16(%1), %%mm2\n"

2820

"movq 24(%1), %%mm3\n"

2821

"movq 32(%1), %%mm4\n"

2822

"movq 40(%1), %%mm5\n"

2823

"movq 48(%1), %%mm6\n"

2824

"movq 56(%1), %%mm7\n"

2825

2826

"movq %%mm0, (%0)\n"

2827

"movq %%mm1, 8(%0)\n"

2828

"movq %%mm2, 16(%0)\n"

2829

"movq %%mm3, 24(%0)\n"

2830

"movq %%mm4, 32(%0)\n"

2831

"movq %%mm5, 40(%0)\n"

2832

"movq %%mm6, 48(%0)\n"

2833

"movq %%mm7, 56(%0)\n"

2834

2835

: "r" (d), "r" (s)

2836

: "memory",

2837

"%mm0", "%mm1", "%mm2", "%mm3",

2838

"%mm4", "%mm5", "%mm6", "%mm7");

2839

#else

2840

__m64 v0 = *(__m64 *)(s + 0);

2841

__m64 v1 = *(__m64 *)(s + 8);

2842

__m64 v2 = *(__m64 *)(s + 16);

2843

__m64 v3 = *(__m64 *)(s + 24);

2844

__m64 v4 = *(__m64 *)(s + 32);

2845

__m64 v5 = *(__m64 *)(s + 40);

2846

__m64 v6 = *(__m64 *)(s + 48);

2847

__m64 v7 = *(__m64 *)(s + 56);

2848

*(__m64 *)(d + 0) = v0;

2849

*(__m64 *)(d + 8) = v1;

2850

*(__m64 *)(d + 16) = v2;

2851

*(__m64 *)(d + 24) = v3;

2852

*(__m64 *)(d + 32) = v4;

2853

*(__m64 *)(d + 40) = v5;

2854

*(__m64 *)(d + 48) = v6;

2855

*(__m64 *)(d + 56) = v7;

2856

#endif

2857

2858

w -= 64;

2859

s += 64;

2860

d += 64;

2861

}

2862

while (w >= 4)

2863

{

2864

*(uint32_t *)d = *(uint32_t *)s;

2865

2866

w -= 4;

2867

s += 4;

2868

d += 4;

2869

}

2870

if (w >= 2)

2871

{

2872

*(uint16_t *)d = *(uint16_t *)s;

2873

w -= 2;

2874

s += 2;

2875

d += 2;

2876

}

2877

}

2878

2879

_mm_empty();

2880

2881

return TRUE;

2882

}

2883

2884

void

2885

fbCompositeCopyAreammx (pixman_op_t op,

2886

pixman_image_t * pSrc,

2887

pixman_image_t * pMask,

2888

pixman_image_t * pDst,

2889

int16_t xSrc,

2890

int16_t ySrc,

2891

int16_t xMask,

2892

int16_t yMask,

2893

int16_t xDst,

2894

int16_t yDst,

2895

uint16_t width,

2896

uint16_t height)

2897

{

2898

pixman_blt_mmx (pSrc->bits.bits,

2899

pDst->bits.bits,

2900

pSrc->bits.rowstride,

2901

pDst->bits.rowstride,

2902

PIXMAN_FORMAT_BPP (pSrc->bits.format),

2903

PIXMAN_FORMAT_BPP (pDst->bits.format),

2904

xSrc, ySrc, xDst, yDst, width, height);

2905

}

2906

2907

void

2908

fbCompositeOver_x888x8x8888mmx (pixman_op_t op,

2909

pixman_image_t * pSrc,

2910

pixman_image_t * pMask,

2911

pixman_image_t * pDst,

2912

int16_t xSrc,

2913

int16_t ySrc,

2914

int16_t xMask,

2915

int16_t yMask,

2916

int16_t xDst,

2917

int16_t yDst,

2918

uint16_t width,

2919

uint16_t height)

2920

{

2921

uint32_t *src, *srcLine;

2922

uint32_t *dst, *dstLine;

2923

uint8_t *mask, *maskLine;

2924

int srcStride, maskStride, dstStride;

2925

__m64 m;

2926

uint32_t s, d;

2927

uint16_t w;

2928

2929

fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

2930

fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

2931

fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

2932

2933

while (height--)

2934

{

2935

src = srcLine;

2936

srcLine += srcStride;

2937

dst = dstLine;

2938

dstLine += dstStride;

2939

mask = maskLine;

2940

maskLine += maskStride;

2941

2942

w = width;

2943

2944

while (w--)

2945

{

2946

ullong m = *mask;

2947

2948

if (m)

2949

{

2950

__m64 s = load8888 (*src | 0xff000000);

2951

2952

if (m == 0xff)

2953

*dst = store8888 (s);

2954

else

2955

{

2956

__m64 sa = expand_alpha (s);

2957

__m64 vm = expand_alpha_rev ((__m64)m);

2958

__m64 vdest = in_over(s, sa, vm, load8888 (*dst));

2959

2960

*dst = store8888 (vdest);

2961

}

2962

}

2963

2964

mask++;

2965

dst++;

2966

src++;

2967

}

2968

}

2969

2970

_mm_empty();

2971

}

2972

2973

2974

2975

#endif /* USE_MMX */