/******************************************************************************
 * arch/x86/mm/p2m-pod.c
 *
 * Populate-on-demand p2m entries.
 *
 * Copyright (c) 2009-2011 Citrix Systems, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <asm/domain.h>
#include <asm/page.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
#include <xen/iommu.h>
#include <asm/mem_event.h>
#include <public/mem_event.h>
#include <asm/mem_sharing.h>
#include <xen/event.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/svm/amd-iommu-proto.h>

#include "mm-locks.h"
/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
#undef mfn_valid
#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
#undef page_to_mfn
#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))

#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
/* Enforce lock ordering when grabbing the "external" page_alloc lock */
static inline void lock_page_alloc(struct p2m_domain *p2m)
{
    page_alloc_mm_pre_lock();
    spin_lock(&(p2m->domain->page_alloc_lock));
    page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level);
}

static inline void unlock_page_alloc(struct p2m_domain *p2m)
{
    page_alloc_mm_unlock(p2m->domain->arch.page_alloc_unlock_level);
    spin_unlock(&(p2m->domain->page_alloc_lock));
}
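
/*
 * The wrappers above are only ever used with the PoD lock already held;
 * a typical caller therefore nests the locks roughly as follows (a sketch,
 * assuming the pod_lock()/pod_unlock() helpers from "mm-locks.h"):
 *
 *     pod_lock(p2m);
 *     ...
 *     lock_page_alloc(p2m);
 *     ...                       // manipulate d->page_list
 *     unlock_page_alloc(p2m);
 *     ...
 *     pod_unlock(p2m);
 */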

/*
 * Populate-on-demand functionality
 */

static int
p2m_pod_cache_add(struct p2m_domain *p2m,
                  struct page_info *page,
                  unsigned int order)
{
    int i;
    struct page_info *p;
    struct domain *d = p2m->domain;

#ifndef NDEBUG
    mfn_t mfn;

    mfn = page_to_mfn(page);

    /* Check to make sure this is a contiguous region */
    if ( mfn_x(mfn) & ((1 << order) - 1) )
    {
        printk("%s: mfn %lx not aligned order %u! (mask %lx)\n",
               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
        return -1;
    }

    for ( i = 0; i < (1 << order); i++ )
    {
        struct domain *od;

        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
        od = page_get_owner(p);
        if ( od != d )
        {
            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
                   __func__, mfn_x(mfn), d->domain_id,
                   od ? od->domain_id : -1);
            return -1;
        }
    }
#endif

    ASSERT(pod_locked_by_me(p2m));

    /*
     * Pages from domain_alloc and returned by the balloon driver aren't
     * guaranteed to be zero; but by reclaiming zero pages, we implicitly
     * promise to provide zero pages. So we scrub pages before using.
     */
    for ( i = 0; i < (1 << order); i++ )
    {
        char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);

        clear_page(b);
        unmap_domain_page(b);
    }

    /* First, take all pages off the domain list */
    lock_page_alloc(p2m);
    for ( i = 0; i < (1 << order); i++ )
    {
        p = page + i;
        page_list_del(p, &d->page_list);
    }
    unlock_page_alloc(p2m);

    /* Then add the first one to the appropriate populate-on-demand list */
    if ( order == PAGE_ORDER_2M )
        page_list_add_tail(page, &p2m->pod.super);  /* lock: page_alloc */
    else
        page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
    p2m->pod.count += 1 << order;

    return 0;
}

/* Get a page of size order from the populate-on-demand cache.  Will break
 * down 2-meg pages into singleton pages automatically.  Returns NULL if
 * a superpage is requested and no superpages are available. */
static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
                                            unsigned int order)
{
    struct page_info *p = NULL;
    int i;

    ASSERT(pod_locked_by_me(p2m));

    if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
        return NULL;
    else if ( order == PAGE_ORDER_4K && page_list_empty(&p2m->pod.single) )
    {
        unsigned long mfn;
        struct page_info *q;

        BUG_ON( page_list_empty(&p2m->pod.super) );

        /* Break up a superpage to make single pages. NB count doesn't
         * need to be adjusted. */
        p = page_list_remove_head(&p2m->pod.super);
        mfn = mfn_x(page_to_mfn(p));

        for ( i = 0; i < SUPERPAGE_PAGES; i++ )
        {
            q = mfn_to_page(_mfn(mfn + i));
            page_list_add_tail(q, &p2m->pod.single);
        }
    }

    if ( order == PAGE_ORDER_2M )
    {
        BUG_ON( page_list_empty(&p2m->pod.super) );
        p = page_list_remove_head(&p2m->pod.super);
    }
    else
    {
        BUG_ON( page_list_empty(&p2m->pod.single) );
        p = page_list_remove_head(&p2m->pod.single);
    }
    p2m->pod.count -= 1 << order;

    /* Put the pages back on the domain page_list */
    lock_page_alloc(p2m);
    for ( i = 0; i < (1 << order); i++ )
    {
        BUG_ON(page_get_owner(p + i) != p2m->domain);
        page_list_add_tail(p + i, &p2m->domain->page_list);
    }
    unlock_page_alloc(p2m);

    return p;
}

/* Set the size of the cache, allocating or freeing as necessary. */
static int
p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
{
    struct domain *d = p2m->domain;
    int ret = 0;

    ASSERT(pod_locked_by_me(p2m));

    /* Increasing the target */
    while ( pod_target > p2m->pod.count )
    {
        struct page_info * page;
        int order;

        if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
            order = PAGE_ORDER_2M;
        else
            order = PAGE_ORDER_4K;
    retry:
        page = alloc_domheap_pages(d, order, PAGE_ORDER_4K);
        if ( unlikely(page == NULL) )
        {
            if ( order == PAGE_ORDER_2M )
            {
                /* If we can't allocate a superpage, try singleton pages */
                order = PAGE_ORDER_4K;
                goto retry;
            }

            printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n",
                   __func__, pod_target, p2m->pod.count);
            ret = -ENOMEM;
            goto out;
        }

        p2m_pod_cache_add(p2m, page, order);

        if ( hypercall_preempt_check() && preemptible )
        {
            ret = -EAGAIN;
            goto out;
        }
    }

    /* Decreasing the target */
    /* We hold the pod lock here, so we don't need to worry about
     * cache disappearing under our feet. */
    while ( pod_target < p2m->pod.count )
    {
        struct page_info * page;
        int order, i;

        if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
             && !page_list_empty(&p2m->pod.super) )
            order = PAGE_ORDER_2M;
        else
            order = PAGE_ORDER_4K;

        page = p2m_pod_cache_get(p2m, order);

        ASSERT(page != NULL);

        /* Then free them */
        for ( i = 0; i < (1 << order); i++ )
        {
            /* Copied from common/memory.c:guest_remove_page() */
            if ( unlikely(!get_page(page + i, d)) )
            {
                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
                ret = -EINVAL;
                goto out;
            }

            if ( test_and_clear_bit(_PGT_pinned, &(page + i)->u.inuse.type_info) )
                put_page_and_type(page + i);

            if ( test_and_clear_bit(_PGC_allocated, &(page + i)->count_info) )
                put_page(page + i);

            put_page(page + i);

            if ( hypercall_preempt_check() && preemptible )
            {
                ret = -EAGAIN;
                goto out;
            }
        }
    }

out:
    return ret;
}

/*
 * The "right behavior" here requires some careful thought.  First, some
 * definitions:
 * + M: static_max
 * + B: number of pages the balloon driver has ballooned down to.
 * + P: Number of populated pages.
 * + T: Old target
 * + T': New target
 *
 * The following equations should hold:
 *  0 <= P <= T <= B <= M
 *  d->arch.p2m->pod.entry_count == B - P
 *  d->tot_pages == P + d->arch.p2m->pod.count
 *
 * Now we have the following potential cases to cover:
 *     B <T': Set the PoD cache size equal to the number of outstanding PoD
 *   entries.  The balloon driver will deflate the balloon to give back
 *   the remainder of the ram to the guest OS.
 *  T <T'<B : Increase PoD cache size.
 *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache,
 *   get the memory right away.  However, that means every time we
 *   reduce the memory target we risk the guest attempting to populate the
 *   memory before the balloon driver has reached its new target.  Safer to
 *   never reduce the cache size here, but only when the balloon driver frees
 *   PoD ranges.
 *
 * If there are many zero pages, we could reach the target also by doing
 * zero sweeps and marking the ranges PoD; but the balloon driver will have
 * to free this memory eventually anyway, so we don't actually gain that
 * much by doing so.
 *
 * NB that the equation (B<T') may require adjustment to the cache
 * size as PoD pages are freed as well; i.e., freeing a PoD-backed
 * entry when pod.entry_count == pod.count requires us to reduce both
 * pod.entry_count and pod.count.
 */
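/*
 * Worked example (illustrative numbers only): with M = 1024MiB and the
 * balloon driver currently at B = 768MiB, of which P = 512MiB are
 * populated, pod.entry_count corresponds to B - P = 256MiB of outstanding
 * PoD entries and d->tot_pages to P plus the cache.  A new target
 * T' = 896MiB falls in the "B < T'" case: pod_target would be
 * T' - P = 384MiB, so it is clipped to entry_count (256MiB) and the
 * balloon driver deflates to supply the rest.  A new target between T and
 * B simply grows the cache, and a lower target is left to the balloon
 * driver to handle.
 */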
int
p2m_pod_set_mem_target(struct domain *d, unsigned long target)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    int ret = 0;
    unsigned long populated, pod_target;

    pod_lock(p2m);

    /* P == B: Nothing to do (unless the guest is being created). */
    populated = d->tot_pages - p2m->pod.count;
    if ( populated > 0 && p2m->pod.entry_count == 0 )
        goto out;

    /* Don't do anything if the domain is being torn down */
    if ( d->is_dying )
        goto out;

    /* T' < B: Don't reduce the cache size; let the balloon driver
     * take care of it. */
    if ( target < d->tot_pages )
        goto out;

    pod_target = target - populated;

    /* B < T': Set the cache size equal to # of outstanding entries,
     * let the balloon driver fill in the rest. */
    if ( populated > 0 && pod_target > p2m->pod.entry_count )
        pod_target = p2m->pod.entry_count;

    ASSERT( pod_target >= p2m->pod.count );

    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);

out:
    pod_unlock(p2m);

    return ret;
}

void
p2m_pod_empty_cache(struct domain *d)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    struct page_info *page;

    /* After this barrier no new PoD activities can happen. */
    BUG_ON(!d->is_dying);
    spin_barrier(&p2m->pod.lock.lock);

    lock_page_alloc(p2m);

    while ( (page = page_list_remove_head(&p2m->pod.super)) )
    {
        int i;

        for ( i = 0; i < SUPERPAGE_PAGES; i++ )
        {
            BUG_ON(page_get_owner(page + i) != d);
            page_list_add_tail(page + i, &d->page_list);
        }

        p2m->pod.count -= SUPERPAGE_PAGES;
    }

    while ( (page = page_list_remove_head(&p2m->pod.single)) )
    {
        BUG_ON(page_get_owner(page) != d);
        page_list_add_tail(page, &d->page_list);

        p2m->pod.count -= 1;
    }

    BUG_ON(p2m->pod.count != 0);

    unlock_page_alloc(p2m);
}

int
p2m_pod_offline_or_broken_hit(struct page_info *p)
{
    struct domain *d;
    struct p2m_domain *p2m;
    struct page_info *q, *tmp;
    unsigned long mfn, bmfn;

    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
        return 0;

    pod_lock(p2m);
    bmfn = mfn_x(page_to_mfn(p));
    page_list_for_each_safe(q, tmp, &p2m->pod.super)
    {
        mfn = mfn_x(page_to_mfn(q));
        if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
        {
            unsigned long i;

            page_list_del(q, &p2m->pod.super);
            for ( i = 0; i < SUPERPAGE_PAGES; i++ )
            {
                q = mfn_to_page(_mfn(mfn + i));
                page_list_add_tail(q, &p2m->pod.single);
            }
            page_list_del(p, &p2m->pod.single);
            p2m->pod.count--;
            goto pod_hit;
        }
    }

    page_list_for_each_safe(q, tmp, &p2m->pod.single)
    {
        mfn = mfn_x(page_to_mfn(q));
        if ( mfn == bmfn )
        {
            page_list_del(p, &p2m->pod.single);
            p2m->pod.count--;
            goto pod_hit;
        }
    }

    pod_unlock(p2m);
    return 0;

pod_hit:
    lock_page_alloc(p2m);
    page_list_add_tail(p, &d->arch.relmem_list);
    unlock_page_alloc(p2m);
    pod_unlock(p2m);
    return 1;
}

void
p2m_pod_offline_or_broken_replace(struct page_info *p)
{
    struct domain *d;
    struct p2m_domain *p2m;

    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
        return;

    free_domheap_page(p);

    p = alloc_domheap_page(d, PAGE_ORDER_4K);
    if ( unlikely(!p) )
        return;

    pod_lock(p2m);
    p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
    pod_unlock(p2m);
}

static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn);


/* This function is needed for two reasons:
 * + To properly handle clearing of PoD entries
 * + To "steal back" memory being freed for the PoD cache, rather than
 *   actually freeing it
 *
 * Once both of these have been handled, we can return and
 * allow decrease_reservation() to handle everything else.
 */
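/* (For context: for translated guests, common/memory.c:decrease_reservation()
 * gives this function first refusal on each extent of a
 * XENMEM_decrease_reservation request before doing its generic per-page
 * frees; a non-zero return tells it the extent has been fully handled.) */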
int
p2m_pod_decrease_reservation(struct domain *d,
                             xen_pfn_t gpfn,
                             unsigned int order)
{
    int ret = 0;
    int i;
    struct p2m_domain *p2m = p2m_get_hostp2m(d);

    int steal_for_cache;
    int pod, nonpod, ram;

    gfn_lock(p2m, gpfn, order);
    pod_lock(p2m);

    /* If we don't have any outstanding PoD entries, let things take their
     * course. */
    if ( p2m->pod.entry_count == 0 )
        goto out_unlock;

    if ( unlikely(d->is_dying) )
        goto out_unlock;

recount:
    pod = nonpod = ram = 0;

    /* Figure out if we need to steal some freed memory for our cache */
    steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );

    /* FIXME: Add contiguous; query for PSE entries? */
    for ( i = 0; i < (1 << order); i++ )
    {
        p2m_access_t a;
        p2m_type_t t;

        (void)p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL);

        if ( t == p2m_populate_on_demand )
            pod++;
        else
        {
            nonpod++;
            if ( p2m_is_ram(t) )
                ram++;
        }
    }

    /* No populate-on-demand?  Don't need to steal anything?  Then we're done! */
    if ( !pod && !steal_for_cache )
        goto out_unlock;

    if ( !nonpod )
    {
        /* All PoD: Mark the whole region invalid and tell caller
         * we're done. */
        set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
        p2m->pod.entry_count -= (1 << order);
        BUG_ON(p2m->pod.entry_count < 0);
        ret = 1;
        goto out_entry_check;
    }

    /* Try to grab entire superpages if possible.  Since the common case is for drivers
     * to pass back singleton pages, see if we can take the whole page back and mark the
     * rest PoD. */
    if ( steal_for_cache
         && p2m_pod_zero_check_superpage(p2m, gpfn & ~(SUPERPAGE_PAGES-1)) )
    {
        /* Since order may be arbitrary, we may have taken more or less
         * than we were actually asked to; so just re-count from scratch */
        goto recount;
    }

    /* Process as long as:
     * + There are PoD entries to handle, or
     * + There is ram left, and we want to steal it
     */
    for ( i = 0;
          i < (1 << order) && (pod > 0 || (steal_for_cache && ram > 0));
          i++ )
    {
        mfn_t mfn;
        p2m_type_t t;
        p2m_access_t a;

        mfn = p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL);
        if ( t == p2m_populate_on_demand )
        {
            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
            p2m->pod.entry_count--;
            BUG_ON(p2m->pod.entry_count < 0);
            pod--;
        }
        else if ( steal_for_cache && p2m_is_ram(t) )
        {
            struct page_info *page;

            ASSERT(mfn_valid(mfn));

            page = mfn_to_page(mfn);

            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
            set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);

            p2m_pod_cache_add(p2m, page, 0);

            steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );

            nonpod--;
            ram--;
        }
    }

    /* If there are no more non-PoD entries, tell decrease_reservation() that
     * there's nothing left to do. */
    if ( nonpod == 0 )
        ret = 1;

out_entry_check:
    /* If we've reduced our "liabilities" beyond our "assets", free some */
    if ( p2m->pod.entry_count < p2m->pod.count )
        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);

out_unlock:
    pod_unlock(p2m);
    gfn_unlock(p2m, gpfn, order);
    return ret;
}

void p2m_pod_dump_data(struct domain *d)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);

    printk(" PoD entries=%ld cachesize=%ld\n",
           p2m->pod.entry_count, p2m->pod.count);
}

/* Search for all-zero superpages to be reclaimed as superpages for the
 * PoD cache.  Must be called w/ pod lock held, must lock the superpage
 * in the p2m. */
static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
{
    mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
    p2m_type_t type, type0 = 0;
    unsigned long * map = NULL;
    int ret = 0, reset = 0;
    int i, j;
    int max_ref = 1;
    struct domain *d = p2m->domain;

    ASSERT(pod_locked_by_me(p2m));

    if ( !superpage_aligned(gfn) )
        goto out;

    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
    if ( paging_mode_shadow(d) )
        max_ref++;

    /* NOTE: this is why we don't enforce deadlock constraints between p2m
     * and pod locks */
    gfn_lock(p2m, gfn, SUPERPAGE_ORDER);

    /* Look up the mfns, checking to make sure they're the same mfn
     * and aligned, and mapping them. */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        p2m_access_t a;

        mfn = p2m->get_entry(p2m, gfn + i, &type, &a, 0, NULL);

        if ( i == 0 )
        {
            mfn0 = mfn;
            type0 = type;
        }

        /* Conditions that must be met for superpage-superpage:
         * + All gfns are ram types
         * + All gfns have the same type
         * + All of the mfns are allocated to a domain
         * + None of the mfns are used as pagetables, or allocated via xenheap
         * + The first mfn is 2-meg aligned
         * + All the other mfns are in sequence
         * Adding for good measure:
         * + None of the mfns are likely to be mapped elsewhere (refcount
         *   2 or less for shadow, 1 for hap)
         */
        if ( !p2m_is_ram(type)
             || type != type0
             || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
             || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
             || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap ) != 0 )
             || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
             || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
                   || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
            goto out;
    }

    /* Now, do a quick check to see if it may be zero before unmapping. */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        /* Quick zero-check */
        map = map_domain_page(mfn_x(mfn0) + i);

        for ( j = 0; j < 16; j++ )
            if ( *(map + j) != 0 )
                break;

        unmap_domain_page(map);

        if ( j < 16 )
            goto out;
    }

    /* Try to remove the page, restoring old mapping if it fails. */
    set_p2m_entry(p2m, gfn, _mfn(0), PAGE_ORDER_2M,
                  p2m_populate_on_demand, p2m->default_access);

    /* Make sure none of the MFNs are used elsewhere... for example, mapped
     * via the grant table interface, or by qemu.  Allow one refcount for
     * being allocated to the domain. */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        mfn = _mfn(mfn_x(mfn0) + i);
        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
        {
            reset = 1;
            goto out_reset;
        }
    }

    /* Finally, do a full zero-check */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        map = map_domain_page(mfn_x(mfn0) + i);

        for ( j = 0; j < PAGE_SIZE/sizeof(*map); j++ )
            if ( *(map + j) != 0 )
            {
                reset = 1;
                break;
            }

        unmap_domain_page(map);

        if ( reset )
            goto out_reset;
    }

    if ( tb_init_done )
    {
        struct {
            u64 gfn, mfn;
            int d:16,order:16;
        } t;

        t.gfn = gfn;
        t.mfn = mfn_x(mfn);
        t.d = d->domain_id;
        t.order = 9;

        __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
    }

    /* Finally!  We've passed all the checks, and can add the mfn superpage
     * back on the PoD cache, and account for the new p2m PoD entries */
    p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M);
    p2m->pod.entry_count += SUPERPAGE_PAGES;

    ret = SUPERPAGE_PAGES;

out_reset:
    if ( reset )
        set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);

out:
    gfn_unlock(p2m, gfn, SUPERPAGE_ORDER);
    return ret;
}

static void
p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
{
    mfn_t mfns[count];
    p2m_type_t types[count];
    unsigned long * map[count];
    struct domain *d = p2m->domain;

    int i, j;
    int max_ref = 1;

    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
    if ( paging_mode_shadow(d) )
        max_ref++;

    /* First, get the gfn list, translate to mfns, and map the pages. */
    for ( i = 0; i < count; i++ )
    {
        p2m_access_t a;

        mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a, 0, NULL);
        /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
           elsewhere, map it; otherwise, skip. */
        if ( p2m_is_ram(types[i])
             && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 )
             && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
             && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
            map[i] = map_domain_page(mfn_x(mfns[i]));
        else
            map[i] = NULL;
    }

    /* Then, go through and check for zeroed pages, removing write permission
     * for those with zeroes. */
    for ( i = 0; i < count; i++ )
    {
        if ( !map[i] )
            continue;

        /* Quick zero-check */
        for ( j = 0; j < 16; j++ )
            if ( *(map[i] + j) != 0 )
                break;

        if ( j < 16 )
        {
            unmap_domain_page(map[i]);
            map[i] = NULL;
            continue;
        }

        /* Try to remove the page, restoring old mapping if it fails. */
        set_p2m_entry(p2m, gfns[i], _mfn(0), PAGE_ORDER_4K,
                      p2m_populate_on_demand, p2m->default_access);

        /* See if the page was successfully unmapped.  (Allow one refcount
         * for being allocated to a domain.) */
        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
        {
            unmap_domain_page(map[i]);
            map[i] = NULL;

            set_p2m_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
                          types[i], p2m->default_access);

            continue;
        }
    }

    /* Now check each page for real */
    for ( i = 0; i < count; i++ )
    {
        if ( !map[i] )
            continue;

        for ( j = 0; j < PAGE_SIZE/sizeof(*map[i]); j++ )
            if ( *(map[i] + j) != 0 )
                break;

        unmap_domain_page(map[i]);

        /* See comment in p2m_pod_zero_check_superpage() re gnttab
         * check timing. */
        if ( j < PAGE_SIZE/sizeof(*map[i]) )
        {
            set_p2m_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
                          types[i], p2m->default_access);
        }
        else
        {
            if ( tb_init_done )
            {
                struct {
                    u64 gfn, mfn;
                    int d:16,order:16;
                } t;

                t.gfn = gfns[i];
                t.mfn = mfn_x(mfns[i]);
                t.d = d->domain_id;
                t.order = 0;

                __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
            }

            /* Add to cache, and account for the new p2m PoD entry */
            p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K);
            p2m->pod.entry_count++;
        }
    }
}

#define POD_SWEEP_LIMIT 1024

/* When populating a new superpage, look at recently populated superpages
 * hoping that they've been zeroed.  This will snap up zeroed pages as soon as
 * the guest OS is done with them. */
static void
p2m_pod_check_last_super(struct p2m_domain *p2m, unsigned long gfn_aligned)
{
    unsigned long check_gfn;

    ASSERT(p2m->pod.last_populated_index < POD_HISTORY_MAX);

    check_gfn = p2m->pod.last_populated[p2m->pod.last_populated_index];

    p2m->pod.last_populated[p2m->pod.last_populated_index] = gfn_aligned;

    p2m->pod.last_populated_index =
        ( p2m->pod.last_populated_index + 1 ) % POD_HISTORY_MAX;

    p2m_pod_zero_check_superpage(p2m, check_gfn);
}


#define POD_SWEEP_STRIDE  16
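/* The sweep below scans downwards from pod.reclaim_single, zero-checking
 * candidate gfns in batches of POD_SWEEP_STRIDE and giving up after
 * POD_SWEEP_LIMIT gfns once something has been reclaimed; the stopping
 * point is remembered so the next emergency sweep resumes further down
 * rather than rescanning the same range. */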
static void
p2m_pod_emergency_sweep(struct p2m_domain *p2m)
{
    unsigned long gfns[POD_SWEEP_STRIDE];
    unsigned long i, j = 0, start, limit;
    p2m_type_t t;

    if ( p2m->pod.reclaim_single == 0 )
        p2m->pod.reclaim_single = p2m->pod.max_guest;

    start = p2m->pod.reclaim_single;
    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;

    /* FIXME: Figure out how to avoid superpages */
    /* NOTE: Promote to globally locking the p2m. This will get complicated
     * in a fine-grained scenario. If we lock each gfn individually we must be
     * careful about spinlock recursion limits and POD_SWEEP_STRIDE. */
    p2m_lock(p2m);
    for ( i = p2m->pod.reclaim_single; i > 0; i-- )
    {
        p2m_access_t a;

        (void)p2m->get_entry(p2m, i, &t, &a, 0, NULL);
        if ( p2m_is_ram(t) )
        {
            gfns[j] = i;
            j++;
            BUG_ON(j > POD_SWEEP_STRIDE);
            if ( j == POD_SWEEP_STRIDE )
            {
                p2m_pod_zero_check(p2m, gfns, j);
                j = 0;
            }
        }
        /* Stop if we're past our limit and we have found *something*.
         *
         * NB that this is a zero-sum game; we're increasing our cache size
         * by re-increasing our 'debt'.  Since we hold the pod lock,
         * (entry_count - count) must remain the same. */
        if ( p2m->pod.count > 0 && i < limit )
            break;
    }

    if ( j )
        p2m_pod_zero_check(p2m, gfns, j);

    p2m_unlock(p2m);
    p2m->pod.reclaim_single = i ? i - 1 : i;
}

int
p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
                        unsigned int order,
                        p2m_query_t q)
{
    struct domain *d = p2m->domain;
    struct page_info *p = NULL; /* Compiler warnings */
    unsigned long gfn_aligned;
    mfn_t mfn;
    unsigned long i;

    ASSERT(gfn_locked_by_me(p2m, gfn));
    pod_lock(p2m);

    /* This check is done with the pod lock held.  This will make sure that
     * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
     * won't start until we're done. */
    if ( unlikely(d->is_dying) )
        goto out_fail;

    /* Because PoD does not keep a cache list for 1GB pages, it has to remap
     * the 1GB region in 2MB chunks and retry. */
    if ( order == PAGE_ORDER_1G )
    {
        pod_unlock(p2m);
        gfn_aligned = (gfn >> order) << order;
        /* Note that we are supposed to call set_p2m_entry() 512 times to
         * split 1GB into 512 2MB pages here, but we only do it once because
         * set_p2m_entry() will automatically shatter the 1GB page into
         * 512 2MB pages; the remaining 511 calls are unnecessary.
         *
         * NOTE: In a fine-grained p2m locking scenario this operation
         * may need to promote its locking from gfn->1g superpage.
         */
        set_p2m_entry(p2m, gfn_aligned, _mfn(0), PAGE_ORDER_2M,
                      p2m_populate_on_demand, p2m->default_access);
        return 0;
    }

    /* Only sweep if we're actually out of memory.  Doing anything else
     * wastes time and fragments superpages in the p2m. */
    if ( p2m->pod.count == 0 )
        p2m_pod_emergency_sweep(p2m);

    /* If the sweep failed, give up. */
    if ( p2m->pod.count == 0 )
        goto out_of_memory;

    /* Keep track of the highest gfn demand-populated by a guest fault */
    if ( gfn > p2m->pod.max_guest )
        p2m->pod.max_guest = gfn;

    /* Get a page from the cache.  A NULL return value indicates that the
     * 2-meg range should be marked singleton PoD, and retried. */
    if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
        goto remap_and_retry;

    mfn = page_to_mfn(p);

    BUG_ON((mfn_x(mfn) & ((1 << order) - 1)) != 0);

    gfn_aligned = (gfn >> order) << order;

    set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access);

    for ( i = 0; i < (1UL << order); i++ )
    {
        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
        paging_mark_dirty(d, mfn_x(mfn) + i);
    }

    p2m->pod.entry_count -= (1 << order);
    BUG_ON(p2m->pod.entry_count < 0);

    pod_unlock(p2m);

    if ( tb_init_done )
    {
        struct {
            u64 gfn, mfn;
            int d:16,order:16;
        } t;

        t.gfn = gfn;
        t.mfn = mfn_x(mfn);
        t.d = d->domain_id;
        t.order = order;

        __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
    }

    /* Check the last guest demand-populate */
    if ( p2m->pod.entry_count > p2m->pod.count
         && (order == PAGE_ORDER_2M)
         && (q & P2M_ALLOC) )
        p2m_pod_check_last_super(p2m, gfn_aligned);

    return 0;

out_of_memory:
    pod_unlock(p2m);

    printk("%s: Dom%d out of PoD memory! (tot=%"PRIu32" ents=%ld dom%d)\n",
           __func__, d->domain_id, d->tot_pages, p2m->pod.entry_count,
           current->domain->domain_id);
    domain_crash(d);
    return -1;

out_fail:
    pod_unlock(p2m);
    return -1;

remap_and_retry:
    BUG_ON(order != PAGE_ORDER_2M);
    pod_unlock(p2m);

    /* Remap this 2-meg region in singleton chunks */
    /* NOTE: In a p2m fine-grained lock scenario this might
     * need promoting the gfn lock from gfn->2M superpage */
    gfn_aligned = (gfn >> order) << order;
    for ( i = 0; i < (1 << order); i++ )
        set_p2m_entry(p2m, gfn_aligned + i, _mfn(0), PAGE_ORDER_4K,
                      p2m_populate_on_demand, p2m->default_access);
    if ( tb_init_done )
    {
        struct {
            u64 gfn;
            int d:16;
        } t;

        t.gfn = gfn;
        t.d = d->domain_id;

        __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
    }

    return 0;
}
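
/* (For context: the function below is the entry point used at domain build
 * time.  populate_physmap() in common/memory.c calls it, instead of
 * allocating real pages, when a XENMEM_populate_physmap request carries the
 * MEMF_populate_on_demand flag, which the domain builder uses to set up the
 * PoD portion of a new guest's physmap.) */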
int
guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
                                      unsigned int order)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    unsigned long i, pod_count = 0;
    p2m_type_t ot;
    mfn_t omfn;
    int rc = 0;

    if ( !paging_mode_translate(d) )
        return -EINVAL;

    rc = p2m_gfn_check_limit(d, gfn, order);
    if ( rc != 0 )
        return rc;

    gfn_lock(p2m, gfn, order);

    P2M_DEBUG("mark pod gfn=%#lx\n", gfn);

    /* Make sure all gpfns are unused */
    for ( i = 0; i < (1UL << order); i++ )
    {
        p2m_access_t a;

        omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
        if ( p2m_is_ram(ot) )
        {
            printk("%s: gfn_to_mfn returned type %d!\n",
                   __func__, ot);
            rc = -EBUSY;
            goto out;
        }
        else if ( ot == p2m_populate_on_demand )
        {
            /* Count how many PoD entries we'll be replacing if successful */
            pod_count++;
        }
    }

    /* Now, actually do the two-way mapping */
    if ( !set_p2m_entry(p2m, gfn, _mfn(0), order,
                        p2m_populate_on_demand, p2m->default_access) )
        rc = -EINVAL;
    else
    {
        pod_lock(p2m);
        p2m->pod.entry_count += 1 << order;
        p2m->pod.entry_count -= pod_count;
        BUG_ON(p2m->pod.entry_count < 0);
        pod_unlock(p2m);
    }

out:
    gfn_unlock(p2m, gfn, order);

    return rc;
}