/* Source: ~ubuntu-branches/ubuntu/quantal/valgrind/quantal : revision 49 */

/*---------------------------------------------------------------*/
/*--- begin guest_generic_x87.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

35

36

/* This file contains functions for doing some x87-specific
   operations.  Both the amd64 and x86 front ends (guests) indirectly
   call these functions via guest helper calls.  By putting them here,
   code duplication is avoided.  Some of these functions are tricky
   and hard to verify, so there is much to be said for only having one
   copy thereof.
*/

43

44

#include "libvex_basictypes.h"

45

46

#include "main_util.h"

47

#include "guest_generic_x87.h"

48

49

50

/* 80 and 64-bit floating point formats:

   80-bit:

    S  0       0-------0      zero
    S  0       0X------X      denormals
    S  1-7FFE  1X------X      normals (all normals have leading 1)
    S  7FFF    10------0      infinity
    S  7FFF    10X-----X      snan
    S  7FFF    11X-----X      qnan

   S is the sign bit.  For runs X----X, at least one of the Xs must be
   nonzero.  Exponent is 15 bits, fractional part is 63 bits, and
   there is an explicitly represented leading 1, and a sign bit,
   giving 80 in total.

   64-bit avoids the confusion of an explicitly represented leading 1
   and so is simpler:

    S  0      0------0   zero
    S  0      X------X   denormals
    S  1-7FE  any        normals
    S  7FF    0------0   infinity
    S  7FF    0X-----X   snan
    S  7FF    1X-----X   qnan

   Exponent is 11 bits, fractional part is 52 bits, and there is a
   sign bit, giving 64 in total.
*/

79

80

81

/* Fetch bit number n of the bit array held in arr, where bit 0 is the
   least significant bit of arr[0], bit 8 is the least significant bit
   of arr[1], and so on.  The result is 0 or 1. */
static inline UInt read_bit_array ( UChar* arr, UInt n )
{
   const UInt byteIx = n / 8;
   const UInt bitIx  = n % 8;
   return (arr[byteIx] >> bitIx) & 1;
}

87

88

/* Store the low bit of b as bit number n of the bit array held in
   arr, using the same numbering as read_bit_array: bit 0 is the least
   significant bit of arr[0].  All other bits of the array are left
   unchanged. */
static inline void write_bit_array ( UChar* arr, UInt n, UInt b )
{
   const UInt byteIx = n / 8;
   const UInt bitIx  = n % 8;
   /* Knock the target bit out first, then drop the new value in. */
   UChar cleared = toUChar( arr[byteIx] & ~(1 << bitIx) );
   arr[byteIx] = toUChar( cleared | ((b & 1) << bitIx) );
}

95

96

/* Convert an IEEE754 double (64-bit) into an x87 extended double

97

(80-bit), mimicing the hardware fairly closely. Both numbers are

98

stored little-endian. Limitations, all of which could be fixed,

99

given some level of hassle:

100

101

* Identity of NaNs is not preserved.

102

103

See comments in the code for more details.

104

*/

105

void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )

106

{

107

Bool mantissaIsZero;

108

Int bexp, i, j, shift;

109

UChar sign;

110

111

sign = toUChar( (f64[7] >> 7) & 1 );

112

bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);

113

bexp &= 0x7FF;

114

115

mantissaIsZero = False;

116

if (bexp == 0 || bexp == 0x7FF) {

117

/* We'll need to know whether or not the mantissa (bits 51:0) is

118

all zeroes in order to handle these cases. So figure it

119

out. */

120

mantissaIsZero

121

= toBool(

122

(f64[6] & 0x0F) == 0

123

&& f64[5] == 0 && f64[4] == 0 && f64[3] == 0

124

&& f64[2] == 0 && f64[1] == 0 && f64[0] == 0

125

);

126

}

127

128

/* If the exponent is zero, either we have a zero or a denormal.

129

Produce a zero. This is a hack in that it forces denormals to

130

zero. Could do better. */

131

if (bexp == 0) {

132

f80[9] = toUChar( sign << 7 );

133

f80[8] = f80[7] = f80[6] = f80[5] = f80[4]

134

= f80[3] = f80[2] = f80[1] = f80[0] = 0;

135

136

if (mantissaIsZero)

137

/* It really is zero, so that's all we can do. */

138

return;

139

140

/* There is at least one 1-bit in the mantissa. So it's a

141

potentially denormalised double -- but we can produce a

142

normalised long double. Count the leading zeroes in the

143

mantissa so as to decide how much to bump the exponent down

144

by. Note, this is SLOW. */

145

shift = 0;

146

for (i = 51; i >= 0; i--) {

147

if (read_bit_array(f64, i))

148

break;

149

shift++;

150

}

151

152

/* and copy into place as many bits as we can get our hands on. */

153

j = 63;

154

for (i = 51 - shift; i >= 0; i--) {

155

write_bit_array( f80, j,

156

read_bit_array( f64, i ) );

157

j--;

158

}

159

160

/* Set the exponent appropriately, and we're done. */

161

bexp -= shift;

162

bexp += (16383 - 1023);

163

f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );

164

f80[8] = toUChar( bexp & 0xFF );

165

return;

166

}

167

168

/* If the exponent is 7FF, this is either an Infinity, a SNaN or

169

QNaN, as determined by examining bits 51:0, thus:

170

0 ... 0 Inf

171

0X ... X SNaN

172

1X ... X QNaN

173

where at least one of the Xs is not zero.

174

*/

175

if (bexp == 0x7FF) {

176

if (mantissaIsZero) {

177

/* Produce an appropriately signed infinity:

178

S 1--1 (15) 1 0--0 (63)

179

*/

180

f80[9] = toUChar( (sign << 7) | 0x7F );

181

f80[8] = 0xFF;

182

f80[7] = 0x80;

183

f80[6] = f80[5] = f80[4] = f80[3]

184

= f80[2] = f80[1] = f80[0] = 0;

185

return;

186

}

187

/* So it's either a QNaN or SNaN. Distinguish by considering

188

bit 51. Note, this destroys all the trailing bits

189

(identity?) of the NaN. IEEE754 doesn't require preserving

190

these (it only requires that there be one QNaN value and one

191

SNaN value), but x87 does seem to have some ability to

192

preserve them. Anyway, here, the NaN's identity is

193

destroyed. Could be improved. */

194

if (f64[6] & 8) {

195

/* QNaN. Make a QNaN:

196

S 1--1 (15) 1 1--1 (63)

197

*/

198

f80[9] = toUChar( (sign << 7) | 0x7F );

199

f80[8] = 0xFF;

200

f80[7] = 0xFF;

201

f80[6] = f80[5] = f80[4] = f80[3]

202

= f80[2] = f80[1] = f80[0] = 0xFF;

203

} else {

204

/* SNaN. Make a SNaN:

205

S 1--1 (15) 0 1--1 (63)

206

*/

207

f80[9] = toUChar( (sign << 7) | 0x7F );

208

f80[8] = 0xFF;

209

f80[7] = 0x7F;

210

f80[6] = f80[5] = f80[4] = f80[3]

211

= f80[2] = f80[1] = f80[0] = 0xFF;

212

}

213

return;

214

}

215

216

/* It's not a zero, denormal, infinity or nan. So it must be a

217

normalised number. Rebias the exponent and build the new

218

number. */

219

bexp += (16383 - 1023);

220

221

f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );

222

f80[8] = toUChar( bexp & 0xFF );

223

f80[7] = toUChar( (1 << 7) | ((f64[6] << 3) & 0x78)

224

| ((f64[5] >> 5) & 7) );

225

f80[6] = toUChar( ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7) );

226

f80[5] = toUChar( ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7) );

227

f80[4] = toUChar( ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7) );

228

f80[3] = toUChar( ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7) );

229

f80[2] = toUChar( ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7) );

230

f80[1] = toUChar( ((f64[0] << 3) & 0xF8) );

231

f80[0] = toUChar( 0 );

232

}

233

234

235

/* Convert an x87 extended double (80-bit) into an IEEE 754 double

236

(64-bit), mimicking the hardware fairly closely. Both numbers are

237

stored little-endian. Limitations, both of which could be fixed,

238

given some level of hassle:

239

240

* Rounding following truncation could be a bit better.

241

242

* Identity of NaNs is not preserved.

243

244

See comments in the code for more details.

245

*/

246

void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )

247

{

248

Bool isInf;

249

Int bexp, i, j;

250

UChar sign;

251

252

sign = toUChar((f80[9] >> 7) & 1);

253

bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];

254

bexp &= 0x7FFF;

255

256

/* If the exponent is zero, either we have a zero or a denormal.

257

But an extended precision denormal becomes a double precision

258

zero, so in either case, just produce the appropriately signed

259

zero. */

260

if (bexp == 0) {

261

f64[7] = toUChar(sign << 7);

262

f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;

263

return;

264

}

265

266

/* If the exponent is 7FFF, this is either an Infinity, a SNaN or

267

QNaN, as determined by examining bits 62:0, thus:

268

0 ... 0 Inf

269

0X ... X SNaN

270

1X ... X QNaN

271

where at least one of the Xs is not zero.

272

*/

273

if (bexp == 0x7FFF) {

274

isInf = toBool(

275

(f80[7] & 0x7F) == 0

276

&& f80[6] == 0 && f80[5] == 0 && f80[4] == 0

277

&& f80[3] == 0 && f80[2] == 0 && f80[1] == 0

278

&& f80[0] == 0

279

);

280

if (isInf) {

281

if (0 == (f80[7] & 0x80))

282

goto wierd_NaN;

283

/* Produce an appropriately signed infinity:

284

S 1--1 (11) 0--0 (52)

285

*/

286

f64[7] = toUChar((sign << 7) | 0x7F);

287

f64[6] = 0xF0;

288

f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;

289

return;

290

}

291

/* So it's either a QNaN or SNaN. Distinguish by considering

292

bit 62. Note, this destroys all the trailing bits

293

(identity?) of the NaN. IEEE754 doesn't require preserving

294

these (it only requires that there be one QNaN value and one

295

SNaN value), but x87 does seem to have some ability to

296

preserve them. Anyway, here, the NaN's identity is

297

destroyed. Could be improved. */

298

if (f80[8] & 0x40) {

299

/* QNaN. Make a QNaN:

300

S 1--1 (11) 1 1--1 (51)

301

*/

302

f64[7] = toUChar((sign << 7) | 0x7F);

303

f64[6] = 0xFF;

304

f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;

305

} else {

306

/* SNaN. Make a SNaN:

307

S 1--1 (11) 0 1--1 (51)

308

*/

309

f64[7] = toUChar((sign << 7) | 0x7F);

310

f64[6] = 0xF7;

311

f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;

312

}

313

return;

314

}

315

316

/* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is

317

zero, the x87 FPU appears to consider the number denormalised

318

and converts it to a QNaN. */

319

if (0 == (f80[7] & 0x80)) {

320

wierd_NaN:

321

/* Strange hardware QNaN:

322

S 1--1 (11) 1 0--0 (51)

323

*/

324

/* On a PIII, these QNaNs always appear with sign==1. I have

325

no idea why. */

326

f64[7] = (1 /*sign*/ << 7) | 0x7F;

327

f64[6] = 0xF8;

328

f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;

329

return;

330

}

331

332

/* It's not a zero, denormal, infinity or nan. So it must be a

333

normalised number. Rebias the exponent and consider. */

334

bexp -= (16383 - 1023);

335

if (bexp >= 0x7FF) {

336

/* It's too big for a double. Construct an infinity. */

337

f64[7] = toUChar((sign << 7) | 0x7F);

338

f64[6] = 0xF0;

339

f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;

340

return;

341

}

342

343

if (bexp <= 0) {

344

/* It's too small for a normalised double. First construct a

345

zero and then see if it can be improved into a denormal. */

346

f64[7] = toUChar(sign << 7);

347

f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;

348

349

if (bexp < -52)

350

/* Too small even for a denormal. */

351

return;

352

353

/* Ok, let's make a denormal. Note, this is SLOW. */

354

/* Copy bits 63, 62, 61, etc of the src mantissa into the dst,

355

indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */

356

/* bexp is in range -52 .. 0 inclusive */

357

for (i = 63; i >= 0; i--) {

358

j = i - 12 + bexp;

359

if (j < 0) break;

360

/* We shouldn't really call vassert from generated code. */

361

vassert(j >= 0 && j < 52);

362

write_bit_array ( f64,

363

j,

364

read_bit_array ( f80, i ) );

365

}

366

/* and now we might have to round ... */

367

if (read_bit_array(f80, 10+1 - bexp) == 1)

368

goto do_rounding;

369

370

return;

371

}

372

373

/* Ok, it's a normalised number which is representable as a double.

374

Copy the exponent and mantissa into place. */

375

/*

376

for (i = 0; i < 52; i++)

377

write_bit_array ( f64,

378

i,

379

read_bit_array ( f80, i+11 ) );

380

*/

381

f64[0] = toUChar( (f80[1] >> 3) | (f80[2] << 5) );

382

f64[1] = toUChar( (f80[2] >> 3) | (f80[3] << 5) );

383

f64[2] = toUChar( (f80[3] >> 3) | (f80[4] << 5) );

384

f64[3] = toUChar( (f80[4] >> 3) | (f80[5] << 5) );

385

f64[4] = toUChar( (f80[5] >> 3) | (f80[6] << 5) );

386

f64[5] = toUChar( (f80[6] >> 3) | (f80[7] << 5) );

387

388

f64[6] = toUChar( ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F) );

389

390

f64[7] = toUChar( (sign << 7) | ((bexp >> 4) & 0x7F) );

391

392

/* Now consider any rounding that needs to happen as a result of

393

truncating the mantissa. */

394

if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ {

395

396

/* If the bottom bits of f80 are "100 0000 0000", then the

397

infinitely precise value is deemed to be mid-way between the

398

two closest representable values. Since we're doing

399

round-to-nearest (the default mode), in that case it is the

400

bit immediately above which indicates whether we should round

401

upwards or not -- if 0, we don't. All that is encapsulated

402

in the following simple test. */

403

if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)

404

return;

405

406

do_rounding:

407

/* Round upwards. This is a kludge. Once in every 2^24

408

roundings (statistically) the bottom three bytes are all 0xFF

409

and so we don't round at all. Could be improved. */

410

if (f64[0] != 0xFF) {

411

f64[0]++;

412

}

413

else

414

if (f64[0] == 0xFF && f64[1] != 0xFF) {

415

f64[0] = 0;

416

f64[1]++;

417

}

418

else

419

if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) {

420

f64[0] = 0;

421

f64[1] = 0;

422

f64[2]++;

423

}

424

/* else we don't round, but we should. */

425

}

426

}

427

428

429

/* CALLED FROM GENERATED CODE: CLEAN HELPER */

430

/* Extract the signed significand or exponent component as per

431

fxtract. Arg and result are doubles travelling under the guise of

432

ULongs. Returns significand when getExp is zero and exponent

433

otherwise. */

434

ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp )

435

{

436

ULong uSig, uExp;

437

/* Long sSig; */

438

Int sExp, i;

439

UInt sign, expExp;

440

441

/*

442

S 7FF 0------0 infinity

443

S 7FF 0X-----X snan

444

S 7FF 1X-----X qnan

445

*/

446

const ULong posInf = 0x7FF0000000000000ULL;

447

const ULong negInf = 0xFFF0000000000000ULL;

448

const ULong nanMask = 0x7FF0000000000000ULL;

449

const ULong qNan = 0x7FF8000000000000ULL;

450

const ULong posZero = 0x0000000000000000ULL;

451

const ULong negZero = 0x8000000000000000ULL;

452

const ULong bit51 = 1ULL << 51;

453

const ULong bit52 = 1ULL << 52;

454

const ULong sigMask = bit52 - 1;

455

456

/* Mimic Core i5 behaviour for special cases. */

457

if (arg == posInf)

458

return getExp ? posInf : posInf;

459

if (arg == negInf)

460

return getExp ? posInf : negInf;

461

if ((arg & nanMask) == nanMask)

462

return qNan | (arg & (1ULL << 63));

463

if (arg == posZero)

464

return getExp ? negInf : posZero;

465

if (arg == negZero)

466

return getExp ? negInf : negZero;

467

468

/* Split into sign, exponent and significand. */

469

sign = ((UInt)(arg >> 63)) & 1;

470

471

/* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */

472

uSig = arg & sigMask;

473

474

/* Get the exponent. */

475

sExp = ((Int)(arg >> 52)) & 0x7FF;

476

477

/* Deal with denormals: if the exponent is zero, then the

478

significand cannot possibly be zero (negZero/posZero are handled

479

above). Shift the significand left until bit 51 of it becomes

480

1, and decrease the exponent accordingly.

481

*/

482

if (sExp == 0) {

483

for (i = 0; i < 52; i++) {

484

if (uSig & bit51)

485

break;

486

uSig <<= 1;

487

sExp--;

488

}

489

uSig <<= 1;

490

} else {

491

/* Add the implied leading-1 in the significand. */

492

uSig |= bit52;

493

}

494

495

/* Roll in the sign. */

496

/* sSig = uSig; */

497

/* if (sign) sSig =- sSig; */

498

499

/* Convert sig into a double. This should be an exact conversion.

500

Then divide by 2^52, which should give a value in the range 1.0

501

to 2.0-epsilon, at least for normalised args. */

502

/* dSig = (Double)sSig; */

503

/* dSig /= 67108864.0; */ /* 2^26 */

504

/* dSig /= 67108864.0; */ /* 2^26 */

505

uSig &= sigMask;

506

uSig |= 0x3FF0000000000000ULL;

507

if (sign)

508

uSig ^= negZero;

509

510

/* Convert exp into a double. Also an exact conversion. */

511

/* dExp = (Double)(sExp - 1023); */

512

sExp -= 1023;

513

if (sExp == 0) {

514

uExp = 0;

515

} else {

516

uExp = sExp < 0 ? -sExp : sExp;

517

expExp = 0x3FF +52;

518

/* 1 <= uExp <= 1074 */

519

/* Skip first 42 iterations of normalisation loop as we know they

520

will always happen */

521

uExp <<= 42;

522

expExp -= 42;

523

for (i = 0; i < 52-42; i++) {

524

if (uExp & bit52)

525

break;

526

uExp <<= 1;

527

expExp--;

528

}

529

uExp &= sigMask;

530

uExp |= ((ULong)expExp) << 52;

531

if (sExp < 0) uExp ^= negZero;

532

}

533

534

return getExp ? uExp : uSig;

535

}

536

537

538

539

/*---------------------------------------------------------*/
/*--- SSE4.2 PCMP{E,I}STR{I,M} helpers                  ---*/
/*---------------------------------------------------------*/

542

543

/* Bit positions of the O, S, Z, A, C and P flags within
   eflags/rflags, and the corresponding single-bit masks.
   #including guest_{amd64,x86}_defs.h causes chaos, so the required
   values are simply copied here.  They are not going to change in
   the foreseeable future :-)
*/

#define SHIFT_C  0    /* carry */
#define SHIFT_P  2    /* parity */
#define SHIFT_A  4    /* auxiliary carry */
#define SHIFT_Z  6    /* zero */
#define SHIFT_S  7    /* sign */
#define SHIFT_O  11   /* overflow */

#define MASK_C   (1 << SHIFT_C)
#define MASK_P   (1 << SHIFT_P)
#define MASK_A   (1 << SHIFT_A)
#define MASK_Z   (1 << SHIFT_Z)
#define MASK_S   (1 << SHIFT_S)
#define MASK_O   (1 << SHIFT_O)

562

563

564

/* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's

565

Delight. */

566

static UInt clz32 ( UInt x )

567

{

568

Int y, m, n;

569

y = -(x >> 16);

570

m = (y >> 16) & 16;

571

n = 16 - m;

572

x = x >> m;

573

y = x - 0x100;

574

m = (y >> 16) & 8;

575

n = n + m;

576

x = x << m;

577

y = x - 0x1000;

578

m = (y >> 16) & 4;

579

n = n + m;

580

x = x << m;

581

y = x - 0x4000;

582

m = (y >> 16) & 2;

583

n = n + m;

584

x = x << m;

585

y = x >> 14;

586

m = y & ~(y >> 1);

587

return n + 2 - m;

588

}

589

590

/* Count the trailing zero bits of x.  An input of zero produces 32. */
static UInt ctz32 ( UInt x )
{
   /* A mask with 1s in exactly the positions below the lowest set
      bit of x (all 32 positions when x is zero).  The number of 1s
      in it is the answer. */
   UInt belowLowestOne = (~x) & (x-1);
   return 32 - clz32(belowLowestOne);
}

594

595

/* Convert a 4-bit value to a 32-bit value by cloning each bit 8

596

times. There's surely a better way to do this, but I don't know

597

what it is. */

598

static UInt bits4_to_bytes4 ( UInt bits4 )

599

{

600

UInt r = 0;

601

r |= (bits4 & 1) ? 0x000000FF : 0;

602

r |= (bits4 & 2) ? 0x0000FF00 : 0;

603

r |= (bits4 & 4) ? 0x00FF0000 : 0;

604

r |= (bits4 & 8) ? 0xFF000000 : 0;

605

return r;

606

}

607

608

609

/* Given partial results from a pcmpXstrX operation (intRes1,

610

basically), generate an I- or M-format output value, also the new

611

OSZACP flags. */

612

static

613

void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV,

614

/*OUT*/UInt* resOSZACP,

615

UInt intRes1,

616

UInt zmaskL, UInt zmaskR,

617

UInt validL,

618

UInt pol, UInt idx,

619

Bool isxSTRM )

620

{

621

vassert((pol >> 2) == 0);

622

vassert((idx >> 1) == 0);

623

624

UInt intRes2 = 0;

625

switch (pol) {

626

case 0: intRes2 = intRes1; break; // pol +

627

case 1: intRes2 = ~intRes1; break; // pol -

628

case 2: intRes2 = intRes1; break; // pol m+

629

case 3: intRes2 = intRes1 ^ validL; break; // pol m-

630

}

631

intRes2 &= 0xFFFF;

632

633

if (isxSTRM) {

634

635

// generate M-format output (a bit or byte mask in XMM0)

636

if (idx) {

637

resV->w32[0] = bits4_to_bytes4( (intRes2 >> 0) & 0xF );

638

resV->w32[1] = bits4_to_bytes4( (intRes2 >> 4) & 0xF );

639

resV->w32[2] = bits4_to_bytes4( (intRes2 >> 8) & 0xF );

640

resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF );

641

} else {

642

resV->w32[0] = intRes2 & 0xFFFF;

643

resV->w32[1] = 0;

644

resV->w32[2] = 0;

645

resV->w32[3] = 0;

646

}

647

648

} else {

649

650

// generate I-format output (an index in ECX)

651

// generate ecx value

652

UInt newECX = 0;

653

if (idx) {

654

// index of ms-1-bit

655

newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));

656

} else {

657

// index of ls-1-bit

658

newECX = intRes2 == 0 ? 16 : ctz32(intRes2);

659

}

660

661

resV->w32[0] = newECX;

662

resV->w32[1] = 0;

663

resV->w32[2] = 0;

664

resV->w32[3] = 0;

665

666

}

667

668

// generate new flags, common to all ISTRI and ISTRM cases

669

*resOSZACP // A, P are zero

670

= ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0

671

| ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0

672

| ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0

673

| ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]

674

}

675

676

677

/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}

678

variants.

679

680

For xSTRI variants, the new ECX value is placed in the 32 bits

681

pointed to by *resV, and the top 96 bits are zeroed. For xSTRM

682

variants, the result is a 128 bit value and is placed at *resV in

683

the obvious way.

684

685

For all variants, the new OSZACP value is placed at *resOSZACP.

686

687

argLV and argRV are the vector args. The caller must prepare a

688

16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this

689

must be 1 for each zero byte of of the respective arg. For ESTRx

690

variants this is derived from the explicit length indication, and

691

must be 0 in all places except at the bit index corresponding to

692

the valid length (0 .. 16). If the valid length is 16 then the

693

mask must be all zeroes. In all cases, bits 31:16 must be zero.

694

695

imm8 is the original immediate from the instruction. isSTRM

696

indicates whether this is a xSTRM or xSTRI variant, which controls

697

how much of *res is written.

698

699

If the given imm8 case can be handled, the return value is True.

700

If not, False is returned, and neither *res not *resOSZACP are

701

altered.

702

*/

703

704

Bool compute_PCMPxSTRx ( /*OUT*/V128* resV,

705

/*OUT*/UInt* resOSZACP,

706

V128* argLV, V128* argRV,

707

UInt zmaskL, UInt zmaskR,

708

UInt imm8, Bool isxSTRM )

709

{

710

vassert(imm8 < 0x80);

711

vassert((zmaskL >> 16) == 0);

712

vassert((zmaskR >> 16) == 0);

713

714

/* Explicitly reject any imm8 values that haven't been validated,

715

even if they would probably work. Life is too short to have

716

unvalidated cases in the code base. */

717

switch (imm8) {

718

case 0x00:

719

case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:

720

case 0x1A: case 0x38: case 0x3A: case 0x44: case 0x4A:

721

break;

722

default:

723

return False;

724

}

725

726

UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format

727

UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn

728

UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity

729

UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask

730

731

/*----------------------------------------*/

732

/*-- strcmp on byte data --*/

733

/*----------------------------------------*/

734

735

if (agg == 2/*equal each, aka strcmp*/

736

&& (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {

737

Int i;

738

UChar* argL = (UChar*)argLV;

739

UChar* argR = (UChar*)argRV;

740

UInt boolResII = 0;

741

for (i = 15; i >= 0; i--) {

742

UChar cL = argL[i];

743

UChar cR = argR[i];

744

boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);

745

}

746

UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))

747

UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))

748

749

// do invalidation, common to all equal-each cases

750

UInt intRes1

751

= (boolResII & validL & validR) // if both valid, use cmpres

752

| (~ (validL | validR)); // if both invalid, force 1

753

// else force 0

754

intRes1 &= 0xFFFF;

755

756

// generate I-format output

757

compute_PCMPxSTRx_gen_output(

758

resV, resOSZACP,

759

intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM

760

);

761

762

return True;

763

}

764

765

/*----------------------------------------*/

766

/*-- set membership on byte data --*/

767

/*----------------------------------------*/

768

769

if (agg == 0/*equal any, aka find chars in a set*/

770

&& (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {

771

/* argL: the string, argR: charset */

772

UInt si, ci;

773

UChar* argL = (UChar*)argLV;

774

UChar* argR = (UChar*)argRV;

775

UInt boolRes = 0;

776

UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))

777

UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))

778

779

for (si = 0; si < 16; si++) {

780

if ((validL & (1 << si)) == 0)

781

// run off the end of the string.

782

break;

783

UInt m = 0;

784

for (ci = 0; ci < 16; ci++) {

785

if ((validR & (1 << ci)) == 0) break;

786

if (argR[ci] == argL[si]) { m = 1; break; }

787

}

788

boolRes |= (m << si);

789

}

790

791

// boolRes is "pre-invalidated"

792

UInt intRes1 = boolRes & 0xFFFF;

793

794

// generate I-format output

795

compute_PCMPxSTRx_gen_output(

796

resV, resOSZACP,

797

intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM

798

);

799

800

return True;

801

}

802

803

/*----------------------------------------*/

804

/*-- substring search on byte data --*/

805

/*----------------------------------------*/

806

807

if (agg == 3/*equal ordered, aka substring search*/

808

&& (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {

809

810

/* argL: haystack, argR: needle */

811

UInt ni, hi;

812

UChar* argL = (UChar*)argLV;

813

UChar* argR = (UChar*)argRV;

814

UInt boolRes = 0;

815

UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))

816

UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))

817

for (hi = 0; hi < 16; hi++) {

818

if ((validL & (1 << hi)) == 0)

819

// run off the end of the haystack

820

break;

821

UInt m = 1;

822

for (ni = 0; ni < 16; ni++) {

823

if ((validR & (1 << ni)) == 0) break;

824

UInt i = ni + hi;

825

if (i >= 16) break;

826

if (argL[i] != argR[ni]) { m = 0; break; }

827

}

828

boolRes |= (m << hi);

829

}

830

831

// boolRes is "pre-invalidated"

832

UInt intRes1 = boolRes & 0xFFFF;

833

834

// generate I-format output

835

compute_PCMPxSTRx_gen_output(

836

resV, resOSZACP,

837

intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM

838

);

839

840

return True;

841

}

842

843

/*----------------------------------------*/

844

/*-- ranges, unsigned byte data --*/

845

/*----------------------------------------*/

846

847

if (agg == 1/*ranges*/

848

&& fmt == 0/*ub*/) {

849

850

/* argL: string, argR: range-pairs */

851

UInt ri, si;

852

UChar* argL = (UChar*)argLV;

853

UChar* argR = (UChar*)argRV;

854

UInt boolRes = 0;

855

UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))

856

UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))

857

for (si = 0; si < 16; si++) {

858

if ((validL & (1 << si)) == 0)

859

// run off the end of the string

860

break;

861

UInt m = 0;

862

for (ri = 0; ri < 16; ri += 2) {

863

if ((validR & (3 << ri)) != (3 << ri)) break;

864

if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {

865

m = 1; break;

866

}

867

}

868

boolRes |= (m << si);

869

}

870

871

// boolRes is "pre-invalidated"

872

UInt intRes1 = boolRes & 0xFFFF;

873

874

// generate I-format output

875

compute_PCMPxSTRx_gen_output(

876

resV, resOSZACP,

877

intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM

878

);

879

880

return True;

881

}

882

883

return False;

884

}

885

886

887

/*---------------------------------------------------------------*/
/*--- end guest_generic_x87.c ---*/
/*---------------------------------------------------------------*/