1
diff -Naur linux-2002-03-28/drivers/evms/AIXlvm_vge.c evms-2002-03-28/drivers/evms/AIXlvm_vge.c
2
--- linux-2002-03-28/drivers/evms/AIXlvm_vge.c Wed Dec 31 18:00:00 1969
3
+++ evms-2002-03-28/drivers/evms/AIXlvm_vge.c Thu Mar 28 13:53:07 2002
10
+ * Copyright (c) International Business Machines Corp., 2000
12
+ * This program is free software; you can redistribute it and/or modify
13
+ * it under the terms of the GNU General Public License as published by
14
+ * the Free Software Foundation; either version 2 of the License, or
15
+ * (at your option) any later version.
17
+ * This program is distributed in the hope that it will be useful,
18
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
20
+ * the GNU General Public License for more details.
22
+ * You should have received a copy of the GNU General Public License
23
+ * along with this program; if not, write to the Free Software
24
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29
+ * linux/drivers/evms/AIXlvm_vge.c
31
+ * EVMS AIX LVM Volume Group Emulator
37
+#define EVMS_AIX_DEBUG 1
39
+#define AIX_COMMON_SERVICES_MAJOR 0 // Required common services levels for the AIX kernel plugin
40
+#define AIX_COMMON_SERVICES_MINOR 5 // These must be incremented if new function is added to common
41
+#define AIX_COMMON_SERVICES_PATCHLEVEL 0 // services and the AIX kernel plugin uses the new function.
42
+#define AIX_INCREMENT_REQUEST 1
43
+#define AIX_DECREMENT_REQUEST -1
46
+#include <linux/module.h>
47
+#include <linux/kernel.h>
48
+#include <linux/config.h>
50
+#include <linux/genhd.h>
51
+#include <linux/major.h>
52
+#include <linux/string.h>
53
+#include <linux/blk.h>
54
+#include <linux/init.h>
55
+#include <linux/slab.h>
57
+#include <linux/evms/evms_kernel.h>
58
+#include <linux/evms/evms_aix.h>
59
+#include <asm/system.h>
60
+#include <asm/uaccess.h>
62
+#include <linux/sched.h>
63
+#include <linux/smp_lock.h>
64
+#include <linux/locks.h>
65
+#include <linux/delay.h>
66
+#include <linux/reboot.h>
67
+#include <linux/completion.h>
68
+#include <linux/vmalloc.h>
70
+#ifdef EVMS_AIX_DEBUG
71
+static int AIX_volume_group_dump(void);
74
+static aix_volume_group_t * AIXVolumeGroupList=NULL;
75
+static evms_thread_t * AIX_mirror_thread;
76
+static evms_pool_mgmt_t * AIX_BH_list_pool = NULL;
77
+static aix_mirror_bh_t * AIX_retry_list = NULL;
78
+static aix_mirror_bh_t ** AIX_retry_tail = NULL;
79
+static spinlock_t AIX_retry_list_lock = SPIN_LOCK_UNLOCKED;
81
+// Plugin API prototypes
83
+static void AIXiod (void *data);
84
+static int discover_aix(evms_logical_node_t ** evms_logical_disk_head);
85
+static int discover_volume_groups( evms_logical_node_t ** );
86
+static int discover_logical_volumes( void );
87
+static int end_discover_aix(evms_logical_node_t ** evms_logical_disk_head);
88
+static void read_aix(evms_logical_node_t * node, eio_t * eio);
89
+static void write_aix(evms_logical_node_t * node, eio_t * eio);
90
+static int ioctl_aix( evms_logical_node_t * logical_node,
91
+ struct inode * inode,
95
+static int AIX_remap_sector(evms_logical_node_t * node,
96
+ evms_sector_t org_sector, // logical sector to remap
97
+ evms_sector_t size, // size (in sectors) of request to remap
98
+ evms_sector_t * new_sector, // remapped sector
99
+ evms_sector_t * new_size, // new size (in sectors)
100
+ partition_list_entry_t ** partition, // new node for which new_sector is relative
102
+ u_int32_t * offset_in_le);
104
+static int validate_build_volume_group_disk_info(evms_logical_node_t * logical_node,
105
+ AIXlvm_rec_t * AIXlvm);
107
+static int add_VG_data_to_VG_list ( evms_logical_node_t * logical_node,
108
+ aix_volume_group_t * new_group,
110
+static int add_PV_to_volume_group( aix_volume_group_t * group,
111
+ evms_logical_node_t * evms_partition,
113
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t * logical_node,
114
+ AIXlvm_rec_t * AIXlvm);
116
+static int AIX_update_volume_group(aix_volume_group_t * AIXVGLptr,
117
+ evms_logical_node_t * logical_node,
118
+ AIXlvm_rec_t * AIXlvm);
120
+static int AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node);
123
+static int export_volumes( evms_logical_node_t ** evms_logical_disk_head );
124
+static int lvm_cleanup( void );
125
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2);
126
+static int build_pe_maps( aix_volume_group_t * volume_group);
128
+static aix_logical_volume_t * new_logical_volume(lv_entries *AIXlvent,
129
+ aix_volume_group_t *group,
131
+ u_int32_t stripesize);
133
+static int check_log_volume_and_pe_maps( aix_volume_group_t * group );
134
+static int check_volume_groups(void);
135
+static int init_io_aix( evms_logical_node_t * node,
136
+ int io_flag, /* 0=read, 1=write*/
137
+ evms_sector_t sect_nr, /* disk LBA */
138
+ evms_sector_t num_sects, /* # of sectors */
139
+ void * buf_addr ); /* buffer address */
142
+static int delete_logical_volume( aix_logical_volume_t * volume );
143
+static int delete_aix_node( evms_logical_node_t * logical_node );
144
+static int deallocate_volume_group( aix_volume_group_t * group );
146
+static void AIX_handle_read_mirror_drives(struct buffer_head * bh,
149
+static void AIX_handle_write_mirror_drives(struct buffer_head * bh,
152
+static void aix_notify_cache_ctor(void * foo, kmem_cache_t * cachep, unsigned long flags);
154
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t * node,
156
+ uint32_t mirror_copies,
157
+ evms_sector_t org_sector,
160
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t * node,
161
+ evms_logical_node_t * node2,
162
+ evms_logical_node_t * node3,
164
+ uint32_t mirror_copies,
165
+ evms_sector_t new_sector2,
166
+ evms_sector_t new_sector3);
168
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2);
169
+//****************************************************************************************************
171
+ /* END of PROTOTYPES */
173
+#define GET_PHYSICAL_PART_SIZE(v1) (1 << v1)
175
+#define AIX_PVH_DATA_PSN(vgda_psn, pvNum) (vgda_psn + PSN_PPH_OFFSET + ((pvNum -1) * PSN_PVH_INCREMENT))
177
+#define COMPARE_TIMESTAMPS(t1, t2) ( (t1).tv_sec == (t2).tv_sec && \
178
+ (t1).tv_nsec == (t2).tv_nsec )
180
+#define COMPARE_UNIQUE_IDS(id1, id2) ( (id1).word1 == (id2).word1 && \
181
+ (id1).word2 == (id2).word2 && \
182
+ (id1).word3 == (id2).word3 && \
183
+ (id1).word4 == (id2).word4 )
185
+#define AIX_PV_STATE_VALID 0 // Both VGDAs are valid and match.
186
+#define AIX_PV_STATE_FIRST_VGDA 1 // Only the first VGDA is valid.
187
+#define AIX_PV_STATE_SECOND_VGDA 2 // Only the second VGDA is valid.
188
+#define AIX_PV_STATE_EITHER_VGDA -1 // Both VGDAs are valid, but do not match each other.
189
+#define AIX_PV_STATE_INVALID -2 // We're in an invalid state but there's more PVs in this group
192
+#ifndef EVMS_AIX_DEBUG
193
+ #define AIX_VOLUME_GROUP_DUMP()
195
+ #define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \
196
+ AIX_volume_group_dump()
199
+// Global LVM data structures
201
+static evms_plugin_function_table_t AIXlvm_function_table = {
202
+ discover: &discover_aix,
203
+ end_discover: &end_discover_aix,
204
+ delete : &delete_aix_node,
206
+ write : &write_aix,
207
+ init_io : &init_io_aix,
211
+static evms_plugin_header_t plugin_header = {
214
+ EVMS_REGION_MANAGER, // Region Manager class
215
+ 3 ), // Unique ID within VGEs
220
+ }, // Major, Minor, Patchlevel
221
+ required_common_services_version: {
222
+ major : AIX_COMMON_SERVICES_MAJOR,
223
+ minor : AIX_COMMON_SERVICES_MINOR,
224
+ patchlevel : AIX_COMMON_SERVICES_PATCHLEVEL
226
+ function_table : &AIXlvm_function_table // Function table for this plugin
233
+ * Function: remap sector
234
+ * Common function to remap volume lba to partition lba in appropriate PE
236
+static int AIX_remap_sector(evms_logical_node_t * node,
237
+ evms_sector_t org_sector, // logical sector to remap
238
+ evms_sector_t size, // size (in sectors) of request to remap
239
+ evms_sector_t * new_sector, // remapped sector
240
+ evms_sector_t * new_size, // new size (in sectors)
241
+ partition_list_entry_t ** partition, // new node for which new_sector is relative
243
+ u_int32_t * offset_in_le)
245
+ aix_logical_volume_t * volume;
247
+ u_int32_t sectors_per_stripe;
248
+ u_int32_t partition_to_use;
250
+ u_int32_t stripe_in_column;
252
+ u_int32_t org_sector32; // Until striping is 64-bit enabled.
254
+ volume = (aix_logical_volume_t *) node->instance_data;
257
+ LOG_DEBUG("-- %s volume:%p lv:%d size:%Ld Name:%s\n",__FUNCTION__, volume,volume->lv_number,size,volume->name);
258
+ LOG_DEBUG(" node %p node_name [%s] org_sector:%Ld\n",node, node->name, org_sector);
259
+ LOG_DEBUG(" mirror_copies:%d volume->lv_size:%Ld\n",volume->mirror_copies,volume->lv_size);
262
+ org_sector32 = org_sector;
264
+ *(new_size) = size;
266
+ // Check if volume is striped. Reset the size if the request
267
+ // crosses a stripe boundary.
268
+ if ( volume->stripes > 1 ) {
270
+ LOG_DEBUG(" *** STRIPED ***\n");
271
+ LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n",volume->stripe_size, org_sector32, volume->stripes);
274
+ *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
275
+ *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
278
+ LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n",*(le), *(offset_in_le));
281
+ sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE;
282
+ partition_to_use = (org_sector32 / sectors_per_stripe) % volume->stripes;
283
+ stripe_in_column = ((((org_sector32 / volume->stripe_size) / volume->stripes) * volume->stripe_size) + (org_sector32 % sectors_per_stripe));
284
+ column = ((org_sector32 / sectors_per_stripe) / volume->stripes) * sectors_per_stripe;
287
+ LOG_DEBUG("offset_in_le:%d org_sector:%Ld pe_shift:%d stripe_shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift,volume->stripe_size_shift);
289
+ LOG_DEBUG(" org_sector:%d sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n",org_sector32, sectors_per_stripe, partition_to_use,stripe_in_column,column);
290
+ LOG_DEBUG(" offset_in_le + size:%Ld volume->pe_size:%d volume->lv_size:%Ld\n",(*(offset_in_le)+size),volume->pe_size ,volume->lv_size);
293
+ if ( *(offset_in_le) + size > volume->pe_size ) {
294
+ *new_size = volume->pe_size - *(offset_in_le);
295
+ LOG_DEBUG(" new_size %Ld\n",*new_size);
299
+ // Non-striped volume. Just find LE and offset. Reset the size
300
+ // if the request crosses an LE boundary.
303
+ LOG_DEBUG(" *** NON-STRIPED ***\n");
306
+ *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
307
+ *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
312
+ LOG_DEBUG(" offset_in_le:%d org_sector:%Ld shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift);
314
+ if (*(le) >= volume->num_le) {
315
+ LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n",*(le),volume->num_le);
320
+ *(new_sector) = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le);
321
+ *(partition) = volume->le_to_pe_map[*(le)].owning_pv;
324
+ LOG_DEBUG(" new_sector:%Ld\n", *(new_sector));
325
+ LOG_DEBUG(" Owning Part %p\n",*(partition));
326
+ LOG_DEBUG(" End %s\n",__FUNCTION__);
334
+ * Function: read_aix
336
+static void read_aix(evms_logical_node_t * node,
339
+ partition_list_entry_t * partition;
340
+ evms_sector_t org_sector;
341
+ evms_sector_t new_sector;
342
+ evms_sector_t new_size;
343
+ aix_logical_volume_t * volume;
344
+ aix_mirror_bh_t * tmp_bh;
345
+ u_int32_t le, offset_in_le,count;
348
+ volume = (aix_logical_volume_t *) node->instance_data;
350
+ LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
355
+ LOG_DEBUG(" node->total_vsectors:%Lu\n",node->total_vsectors);
356
+ LOG_DEBUG(" rsector:%Lu rsize:%Lu node_flags:%u\n",eio->rsector,eio->rsize,node->flags);
359
+ // Check if I/O goes past end of logical volume.
360
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
361
+ LOG_CRITICAL(" read_aix ERROR %d\n",__LINE__);
362
+ EVMS_IO_ERROR(eio);
367
+ // Logical-to-physical remapping.
368
+ if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) ||
369
+ (!partition || !new_sector)) {
370
+ LOG_CRITICAL(" read_aix bh: ERROR %d\n",__LINE__);
371
+ EVMS_IO_ERROR(eio);
375
+ org_sector = eio->rsector;
376
+ eio->rsector = new_sector;
377
+ eio->rsize = new_size;
380
+ LOG_DEBUG(" read_aix Mirror_Copies:%d\n",volume->mirror_copies);
383
+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
386
+ tmp_bh = AIX_alloc_rbh(node, eio, 1, new_sector, AIX_LV_READ);
389
+ EVMS_IO_ERROR(eio);
393
+ if (volume->le_to_pe_map_mir1) {
394
+ tmp_bh->mir_node1 = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
395
+ tmp_bh->mir_sector1 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
398
+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
399
+ tmp_bh->mir_node2 = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
400
+ tmp_bh->mir_sector2 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
403
+ if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
404
+ EVMS_IO_ERROR(eio);
408
+ R_IO(partition->logical_node, &tmp_bh->eio);
411
+ R_IO(partition->logical_node, eio);
416
+ LOG_DEBUG(" ***** %s ***** returning\n",__FUNCTION__);
423
+ * Function: write_aix
425
+static void write_aix( evms_logical_node_t * node,
428
+ partition_list_entry_t * partition;
429
+ evms_sector_t new_sector, new_sector2 = 0, new_sector3 = 0;
430
+ evms_sector_t org_sector;
431
+ evms_sector_t new_size;
432
+ aix_logical_volume_t * volume;
433
+ aix_mirror_bh_t * tmp_bh;
434
+ evms_logical_node_t * node2 = NULL, *node3 = NULL;
435
+ u_int32_t le, offset_in_le, count;
437
+ volume = (aix_logical_volume_t *) node->instance_data;
440
+ LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
441
+ LOG_DEBUG(" write_aix rsector:%Lu rsize:%Lu\n",eio->rsector,eio->rsize);
442
+ LOG_DEBUG(" write_aix total_sectors:%Lu\n",node->total_vsectors);
445
+ if (volume->lv_access & EVMS_LV_INCOMPLETE) { //No writes allowed on incomplete volumes
446
+ LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n",__LINE__);
447
+ EVMS_IO_ERROR(eio);
452
+ // Check if I/O goes past end of logical volume.
453
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
454
+ LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
455
+ EVMS_IO_ERROR(eio);
459
+ // Logical-to-Physical remapping
460
+ if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) ||
461
+ (!new_sector || !partition)) {
462
+ LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
463
+ EVMS_IO_ERROR(eio);
467
+ org_sector = eio->rsector;
468
+ eio->rsector = new_sector;
469
+ eio->rsize = new_size;
472
+ LOG_DEBUG(" write_aix Mirror_Copies:%d\n", volume->mirror_copies);
476
+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
478
+ if (volume->le_to_pe_map_mir1) {
479
+ new_sector2 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
480
+ node2 = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
483
+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
485
+ new_sector3 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
486
+ node3 = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
489
+ tmp_bh = AIX_alloc_wbh(partition->logical_node, node2, node3, eio, volume->mirror_copies, new_sector2, new_sector3);
492
+ EVMS_IO_ERROR(eio);
495
+ tmp_bh->node = node;
497
+ tmp_bh = tmp_bh->mirror_bh_list;
499
+ if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
500
+ EVMS_IO_ERROR(eio);
501
+ // free memory here
505
+ W_IO(tmp_bh->node, &tmp_bh->eio);
507
+ tmp_bh = tmp_bh->next_r1;
510
+ W_IO(tmp_bh->node, &tmp_bh->eio);
511
+ tmp_bh = tmp_bh->next_r1;
515
+ W_IO(tmp_bh->node, &tmp_bh->eio);
520
+ W_IO(partition->logical_node, eio);
525
+ LOG_DEBUG(" ***** %s returning *****\n",__FUNCTION__);
532
+ * Function: ioctl_aix
535
+static int ioctl_aix( evms_logical_node_t * logical_node,
536
+ struct inode * inode,
537
+ struct file * file,
541
+ aix_logical_volume_t * volume = (aix_logical_volume_t*)(logical_node->instance_data);
544
+ LOG_EXTRA(" Ioctl %u\n",cmd);
551
+ // Fixed geometry for all LVM volumes
552
+ unsigned char heads = 64;
553
+ unsigned char sectors = 32;
555
+ struct hd_geometry *hd = (struct hd_geometry *)arg;
557
+ cylinders = logical_node->total_vsectors;
558
+ cylinders = (cylinders / heads) / sectors;
564
+ if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
565
+ copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
566
+ copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
567
+ copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
573
+ case EVMS_QUIESCE_VOLUME:
576
+ case EVMS_GET_DISK_LIST:
577
+ case EVMS_CHECK_MEDIA_CHANGE:
578
+ case EVMS_REVALIDATE_DISK:
579
+ case EVMS_OPEN_VOLUME:
580
+ case EVMS_CLOSE_VOLUME:
582
+ // These five ioctl all need to be broadcast to all PVs.
583
+ aix_volume_group_t * group = volume->group;
584
+ partition_list_entry_t * partition;
585
+ for ( partition = group->partition_list; partition; partition = partition->next ) {
586
+ rc |= IOCTL(partition->logical_node, inode, file, cmd, arg);
592
+ // Currently the VGE does not send any ioctl's down to the
593
+ // partitions. Which partition would they go to?
602
+ * Function: init_io_aix
605
+static int init_io_aix( evms_logical_node_t * node,
606
+ int io_flag, /* 0=read, 1=write*/
607
+ evms_sector_t sect_nr, /* disk LBA */
608
+ evms_sector_t num_sects, /* # of sectors */
609
+ void * buf_addr ) /* buffer address */
611
+ partition_list_entry_t * partition;
612
+ evms_sector_t new_sector = 0;
613
+ evms_sector_t new_size = 0;
615
+ u_int32_t le, offset;
617
+ LOG_DEBUG(" ************ init_io_aix() num_sects:%Ld node:%p sect_nr:%Ld\n",num_sects, node, sect_nr);
619
+ // Init IO needs to deal with the possibility that a request can come
620
+ // in that spans PEs or stripes. This is possible because there is no
621
+ // limit on num_sects. To fix this, we loop through AIX_remap_sector and
622
+ // INIT_IO until num_sects reaches zero.
625
+ while ( num_sects > 0 ) {
627
+ if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size, &partition, &le, &offset) ||
628
+ (!new_sector || !partition)) {
629
+ LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n",__LINE__);
633
+ LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:%Ld new_size:%Ld\n",__LINE__,partition->logical_node, io_flag, new_sector, new_size);
635
+ rc = INIT_IO(partition->logical_node, io_flag, new_sector, new_size, buf_addr);
636
+ num_sects -= new_size;
637
+ sect_nr += new_size;
638
+ buf_addr = (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
645
+ * Function: AIXlvm_vge_init
648
+int __init AIXlvm_vge_init(void)
650
+ const char * name = "evms_AIXiod";
652
+ LOG_DEBUG(" %s --------\n",__FUNCTION__);
654
+ AIX_mirror_thread = evms_cs_register_thread(AIXiod, NULL, name);
657
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
660
+module_init(AIXlvm_vge_init);
665
+/********** Required Plugin Functions **********/
669
+ * Function: discover_aix
671
+ * This is the entry point into the LVM discovery process.
673
+static int discover_aix(evms_logical_node_t ** evms_logical_disk_head)
675
+ int rc = 0, count = 0;
677
+ LOG_DEBUG("[%s] discover_volume_groups\n",__FUNCTION__);
679
+ rc = discover_volume_groups(evms_logical_disk_head);
682
+ LOG_ERROR("[%s] discover_volume_groups rc=%d\n",__FUNCTION__ ,rc);
685
+ if (AIXVolumeGroupList) {
687
+ LOG_DEBUG("[%s] discover_logical_volumes\n",__FUNCTION__);
689
+ rc = discover_logical_volumes();
692
+ LOG_ERROR("[%s] discover_logical_volumes rc=%d\n",__FUNCTION__ ,rc);
696
+ LOG_DEBUG("[%s] export_volumes\n",__FUNCTION__);
698
+ count = export_volumes(evms_logical_disk_head);
700
+ LOG_DEBUG("[%s] export_volumes count=%d\n",__FUNCTION__ ,count);
708
+static int discover_volume_groups(evms_logical_node_t ** evms_logical_disk_head)
710
+ evms_logical_node_t * logical_node;
711
+ evms_logical_node_t * next_node;
712
+ AIXIPL_REC * AIXpv;
713
+ AIXlvm_rec_t * AIXlvm; // Temp holder for the LVM on disk rec
716
+ LOG_DEBUG(" Begin %s\n", __FUNCTION__);
718
+ if (evms_cs_allocate_memory((void**)&AIXpv, AIX_SECTOR_SIZE)) {
722
+ // We'll create at least one volume entry, if we don't find any AIX volumes we'll clean it up later
724
+ if (evms_cs_allocate_memory((void**)&AIXlvm, sizeof(AIXlvm_rec_t))) {
725
+ evms_cs_deallocate_memory(AIXpv);
730
+ for ( logical_node = *evms_logical_disk_head; logical_node; logical_node = next_node ) {
732
+ // Grab the next list item in case we remove this partition from the global list.
733
+ next_node = logical_node->next;
735
+ // Read the first sector and see if it has a valid AIX PV signature.
737
+ if ( INIT_IO(logical_node, 0, 0, 1, AIXpv) ) {
738
+ // On an I/O error, continue on to the next
739
+ // partition. The group that this partition
740
+ // belongs to will be incomplete, but we still
741
+ // need to discover any other groups.
743
+ LOG_ERROR(" Error reading PV [%p]\n",logical_node);
748
+ if (AIXpv->IPL_record_id == IPLRECID) {
750
+ // This partition is definitely a PV,
751
+ // but is it part of a valid VG?
752
+ LOG_DEBUG(" DVG removing node from list logical_node %p\n", logical_node);
754
+ if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) {
755
+ LOG_ERROR(" Error reading PV [%p]\n",logical_node);
759
+ if (AIXlvm->lvm_id == AIX_LVM_LVMID) {
761
+ if (validate_build_volume_group_disk_info(logical_node, AIXlvm) ) {
762
+ // Again, continue on and we'll
767
+ evms_cs_remove_logical_node_from_list( evms_logical_disk_head, logical_node );
770
+ LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %ld)\n",AIXlvm->lvm_id);
774
+ LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n",logical_node);
778
+ AIX_VOLUME_GROUP_DUMP();
780
+ if (check_volume_groups()) {
784
+ evms_cs_deallocate_memory(AIXpv);
785
+ evms_cs_deallocate_memory(AIXlvm);
792
+ * Function: validate_build_volume_group_disk_info
794
+ * Creates and validates the volume groups found on the disk structures.
797
+static int validate_build_volume_group_disk_info(evms_logical_node_t * logical_node,
798
+ AIXlvm_rec_t * AIXlvm)
801
+ aix_volume_group_t * AIXVGLptr = AIXVolumeGroupList;
803
+ LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num);
805
+ while (AIXVGLptr) {
806
+ if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) {
809
+ AIXVGLptr = AIXVGLptr->next; // There is more than one so walk the list
813
+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
814
+ AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm);
815
+ AIXVGLptr->next = AIXVolumeGroupList;
816
+ AIXVolumeGroupList = AIXVGLptr;
818
+ LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
820
+ if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) {
821
+ LOG_DEBUG(" VBVGDI ERROR on Rediscover AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
827
+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
828
+ LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
829
+ LOG_CRITICAL("Unable to allocate volume group data struct Volume Group Corruption !!\n");
833
+ LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n", AIXVolumeGroupList,__LINE__);
834
+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
835
+ LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
837
+ if ( add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num) ) {
845
+ * Function: add_VG_data_to_VG_list
847
+ * Allocate space for a new LVM volume group and all of its sub-fields.
848
+ * Initialize the appropriate fields.
851
+static int add_VG_data_to_VG_list ( evms_logical_node_t * logical_node,
852
+ aix_volume_group_t * new_group,
859
+ // The array of pointer to the logical volumes.
860
+ // Leave this allocation at the max permitted, the lv numbering may not be sequential so you may have gaps
861
+ // in the array allocation i.e. 1,2,3,4,5,6,7,8,11,15,21,33 etc. even though you only have 12 LVs.
863
+ LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n",pvNum, new_group->vgda_psn);
865
+ pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum);
867
+ if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
871
+ LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos);
873
+ if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) {
877
+ LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum);
879
+ if (!new_group->volume_list) {
880
+ if ( evms_cs_allocate_memory((void**)&(new_group->volume_list), LVM_MAXLVS*sizeof(aix_logical_volume_t*)) ) {
881
+ evms_cs_deallocate_memory(AIXpvh);
886
+ new_group->vg_id.word1 = new_group->AIXvgh->vg_id.word1;
887
+ new_group->vg_id.word2 = new_group->AIXvgh->vg_id.word2;
888
+ new_group->vg_id.word3 = new_group->AIXvgh->vg_id.word3;
889
+ new_group->vg_id.word4 = new_group->AIXvgh->vg_id.word4;
890
+ new_group->numpvs = new_group->AIXvgh->numpvs;
891
+ new_group->numlvs = new_group->AIXvgh->numlvs;
892
+ new_group->lv_max = new_group->AIXvgh->maxlvs;
893
+ new_group->pe_size = (GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) / AIX_SECTOR_SIZE);
895
+ new_group->block_size = 0;
896
+ new_group->hard_sect_size = 0;
897
+ new_group->flags |= EVMS_VG_DIRTY;
899
+ evms_cs_deallocate_memory(AIXpvh);
902
+ LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2);
910
+ * Function: add_PV_to_volume_group
912
+ * Create a new partition_list_entry for the specified volume group.
913
+ * Initialize the new partition with the evms node and lvm pv information,
914
+ * and add the new partition to the group's list.
917
+static int add_PV_to_volume_group( aix_volume_group_t * group,
918
+ evms_logical_node_t * evms_partition,
921
+ partition_list_entry_t * new_partition;
923
+ LOG_DEBUG(" APVVG Entering pvNum:%d\n",pvNum);
925
+ group->flags |= EVMS_VG_DIRTY;
927
+ for (new_partition = group->partition_list; new_partition != NULL; new_partition=new_partition->next) {
928
+ if (new_partition->logical_node == evms_partition) {
933
+ if ( evms_cs_allocate_memory((void**)&new_partition, sizeof(partition_list_entry_t)) ) {
937
+ // Add this partition to this group's list.
938
+ new_partition->logical_node = evms_partition;
939
+ new_partition->pv_number = pvNum;
941
+ group->hard_sect_size = evms_partition->hardsector_size;
942
+ group->block_size = evms_partition->block_size;
944
+ // Add this partition to the beginning of its group's list.
945
+ new_partition->next = group->partition_list;
946
+ group->partition_list = new_partition;
947
+ group->partition_count++;
949
+ LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n",group->partition_count, pvNum);
953
+/****************************************************
957
+*****************************************************/
958
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t * logical_node,
959
+ AIXlvm_rec_t * AIXlvm)
961
+ vg_header * AIXvgh, *AIXvgh2;
962
+ vg_trailer * AIXvgt, *AIXvgt2;
963
+ aix_volume_group_t * AIXVGLptr;
967
+ if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
971
+ if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
972
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
976
+ if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
977
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
981
+ if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
982
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
986
+ // First time thru we want to read this in, we may only have one PV in this group, all others
987
+ // may be corrupt, etc. If the info is clean we shouldn't get here.
989
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
990
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
994
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
995
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
999
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
1000
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1004
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
1005
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1009
+ LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1010
+ LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1011
+ LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
1012
+ LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
1015
+ LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",sizeof(aix_volume_group_t));
1016
+ if (evms_cs_allocate_memory((void**)&AIXVGLptr, sizeof(aix_volume_group_t))) {
1017
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1022
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1023
+ AIXVGLptr->flags |= EVMS_VG_DIRTY;
1025
+ LOG_DEBUG("CVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
1027
+ if (evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
1028
+ evms_cs_deallocate_memory(AIXVGLptr);
1029
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1035
+ LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1037
+ if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1038
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
1039
+ if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
1040
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
1041
+ // All timestamps match. Yea!
1042
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1044
+ // Both VGDAs are good, but timestamps are
1045
+ // different. Can't tell yet which one is
1047
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
1050
+ // First VGDA is good, second is bad.
1051
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
1054
+ if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
1055
+ // First VGDA is bad, second is good.
1056
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
1057
+ } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1058
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1060
+ // This should never happen.
1061
+ LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
1062
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1067
+ LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1069
+ switch (AIXVGLptr->CleanVGInfo) {
1070
+ case AIX_PV_STATE_VALID:
1071
+ case AIX_PV_STATE_FIRST_VGDA:
1073
+ LOG_DEBUG("CVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1075
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1077
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1078
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1081
+ case AIX_PV_STATE_SECOND_VGDA:
1082
+ LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1084
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1086
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1087
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1090
+ case AIX_PV_STATE_EITHER_VGDA:
1091
+ LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1092
+ if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
1094
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1096
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1097
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1099
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1100
+ // Not sure where this PV belongs. It thinks it is
1101
+ // supposed to be in two different containers. We will
1102
+ // probably need to put this on a separate, temporary
1103
+ // list, and determine later which container is missing
1109
+ LOG_ERROR("Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
1110
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1116
+ add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1118
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1120
+ LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1124
+/****************************************************
1128
+*****************************************************/
1129
+static int AIX_update_volume_group(aix_volume_group_t * AIXVGLptr,
1130
+ evms_logical_node_t * logical_node,
1131
+ AIXlvm_rec_t * AIXlvm)
1133
+ vg_header * AIXvgh, *AIXvgh2;
1134
+ vg_trailer * AIXvgt, *AIXvgt2;
1138
+ if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
1142
+ if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
1143
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1147
+ if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
1148
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1152
+ if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
1153
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1157
+ // First time thru we want to read this in, we may only have one PV in this group, all others
1158
+ // may be corrupt, etc. If the info is clean we shouldn't get here.
1160
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1161
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1165
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1166
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1170
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
1171
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1175
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
1176
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1180
+ LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1181
+ LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1182
+ LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
1183
+ LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
1186
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1187
+ AIXVGLptr->flags |= EVMS_VG_DIRTY;
1189
+ LOG_DEBUG("UVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
1191
+ if (evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
1192
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1198
+ LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1200
+ if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1201
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
1202
+ if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
1203
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
1204
+ // All timestamps match. Yea!
1205
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1207
+ // Both VGDAs are good, but timestamps are
1208
+ // different. Can't tell yet which one is
1210
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
1213
+ // First VGDA is good, second is bad.
1214
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
1217
+ if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
1218
+ // First VGDA is bad, second is good.
1219
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
1220
+ } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1221
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1223
+ // This should never happen.
1224
+ LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
1225
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1230
+ LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1232
+ switch (AIXVGLptr->CleanVGInfo) {
1233
+ case AIX_PV_STATE_VALID:
1234
+ case AIX_PV_STATE_FIRST_VGDA:
1236
+ LOG_DEBUG("UVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1238
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1240
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1241
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1244
+ case AIX_PV_STATE_SECOND_VGDA:
1245
+ LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1247
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1249
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1250
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1253
+ case AIX_PV_STATE_EITHER_VGDA:
1254
+ LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1255
+ if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
1257
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1259
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1260
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1262
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1263
+ // Not sure where this PV belongs. It thinks it is
1264
+ // supposed to be in two different containers. We will
1265
+ // probably need to put this on a separate, temporary
1266
+ // list, and determine later which container is missing
1272
+ LOG_ERROR("UVG Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
1273
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1279
+ add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1281
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1283
+ LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1287
+/****************************************************
1288
+* Function: check_volume_groups
1290
+* We just want to make sure the volume groups have found
1291
+* all their drives.
1293
+* If not, we'll continue and build what we can
1294
+*****************************************************/
1295
+static int check_volume_groups(void)
1297
+ aix_volume_group_t * group;
1298
+ partition_list_entry_t * partitions;
1302
+ LOG_DEBUG("CHVG Checking volume groups:\n");
1304
+ group = AIXVolumeGroupList;
1307
+ partitions = group->partition_list;
1308
+ while (partitions) {
1310
+ partitions = partitions->next;
1313
+ if (NumPVS != group->numpvs) {
1314
+ group->flags |= AIX_VG_INCOMPLETE;
1315
+ LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n",group->flags);
1316
+ LOG_ERROR("CHVG Found %d PVs should have %d PVs\n",NumPVS, group->numpvs);
1319
+ group = group->next;
1323
+ LOG_DEBUG("CHVG Finished Checking volume groups:\n");
1328
+/************************************************************************
1329
+ * Function: discover_logical_volumes
1331
+ * After all PVs have been claimed and added to the appropriate VG list,
1332
+ * the volumes for each VG must be constructed.
1336
+static int discover_logical_volumes( void )
1339
+ aix_volume_group_t * AIXVGLPtr;
1340
+ aix_logical_volume_t * new_LV;
1341
+ partition_list_entry_t * partition;
1342
+ evms_logical_node_t * node;
1343
+ lv_entries * AIXlvent, *AIXlventHead;
1344
+ int j, lv_found, all_lvs_found, rc;
1345
+ namelist * AIXnamelist;
1346
+ char * NameBuffer;
1348
+ AIXVGLPtr = AIXVolumeGroupList;
1350
+ LOG_DEBUG("DLV Discover Logical volume AIXVGLPtr:%p\n",AIXVGLPtr);
1352
+ if ( evms_cs_allocate_memory((void**)&AIXlventHead, MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE) ) {
1356
+ if ( evms_cs_allocate_memory((void**)&NameBuffer, MAX_SECTORS_NAMELIST * EVMS_VSECTOR_SIZE) ) {
1357
+ evms_cs_deallocate_memory(AIXlventHead);
1361
+ while (AIXVGLPtr) {
1362
+ partition = AIXVGLPtr->partition_list;
1363
+ node = partition->logical_node;
1366
+ LOG_DEBUG("DLV INIT_IO AIXNameList position:%ld\n",((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST));
1368
+ if (INIT_IO(node, 0, ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST, NameBuffer)) {
1372
+ LOG_DEBUG("DLV INIT_IO AIXNameList\n");
1374
+ if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC, MAX_SECTORS_LV_ENTRIES, AIXlventHead)) {
1377
+ AIXlvent = AIXlventHead;
1378
+ AIXnamelist = (namelist *)NameBuffer;
1380
+ LOG_DEBUG("DLV INIT_IO AIXlvent\n");
1381
+ // Search through the LV structs for valid LV entries
1382
+ // We're just going to search until all valid LVs are found
1383
+ // The max. allowable LVs is 256 and we want don't want to
1384
+ // search for 255 if only 8 are defined 1-8 however, there
1385
+ // could be gaps in the LV numbering. i.e 1,2,3,4,5,6,7,8, 27,43, etc.
1387
+ for ( j = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) {
1389
+ LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n",AIXlvent->num_lps, AIXnamelist->name[j], j, AIXlvent->lvname);
1390
+ LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n", AIXlvent->striping_width, GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp), AIXlvent->lv_state);
1391
+ LOG_DEBUG(" DVIG Group:%x.Access:%x\n",(unsigned int)AIXVGLPtr->vg_id.word2,AIXlvent->permissions);
1392
+ LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n", AIXlvent->mirror, AIXlvent->mirror_policy, AIXlvent->mirwrt_consist);
1394
+ // This is the same check we used in "diskedit" and "readdisk"
1395
+ if ( AIXlvent->lv_state != 0 &&
1396
+ AIXlvent->permissions <= 0x10 ) {
1400
+ if (lv_found == AIXVGLPtr->numlvs) {
1401
+ all_lvs_found = TRUE;
1404
+ LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n", lv_found, all_lvs_found);
1406
+ // Create a new logical volume and place it in the appropriate
1407
+ // spot in this VG's volume list. For re-discovery, make sure
1408
+ // this volume does not already exist.
1409
+ if ( !AIXVGLPtr->volume_list[AIXlvent->lvname] ) {
1410
+ new_LV = new_logical_volume( AIXlvent, AIXVGLPtr, AIXnamelist->name[j],GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp));
1414
+ LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n",new_LV->lv_number, AIXVGLPtr->vg_id.word2);
1415
+ AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV;
1417
+ LOG_DEBUG("DVIG Updating Vol Exists\n");
1423
+ // Build the le_to_pe_map for each volume that was discovered above.
1424
+ // This has to be done after all volumes in the group are discovered
1425
+ if ( (rc = build_pe_maps(AIXVGLPtr)) ) {
1429
+ check_log_volume_and_pe_maps( AIXVGLPtr );
1431
+ AIXVGLPtr = AIXVGLPtr->next;
1434
+ evms_cs_deallocate_memory(NameBuffer);
1435
+ evms_cs_deallocate_memory(AIXlventHead);
1440
+ * Function: new_logical_volume
1442
+ * Allocate space for a new LVM logical volume, including space for the
1445
+static aix_logical_volume_t * new_logical_volume(lv_entries *AIXlvent,
1446
+ aix_volume_group_t *volume_group,
1448
+ u_int32_t stripesize)
1450
+ aix_logical_volume_t * new_volume;
1453
+ LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n", AIXlvent->lvname,
1454
+ AIXlvent->num_lps,
1455
+ AIXlvent->num_lps * volume_group->pe_size);
1457
+ // Allocate space for the new logical volume.
1458
+ if ( evms_cs_allocate_memory((void**)&new_volume, sizeof(aix_logical_volume_t)) ) {
1462
+ // Allocate space for the LE to PE mapping table
1463
+ // We add 1 for the allocated le to ease mapping later on, all AIX le are 1 based
1464
+ if ( evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1465
+ delete_logical_volume( new_volume );
1469
+ if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) {
1470
+ if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir1), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1471
+ delete_logical_volume( new_volume );
1476
+ if (AIXlvent->mirror == AIX_MAX_MIRRORS) {
1477
+ if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir2), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1478
+ delete_logical_volume( new_volume );
1484
+ // Initialize the rest of the new volume.
1485
+ new_volume->lv_number = AIXlvent->lvname;
1486
+ new_volume->lv_size = AIXlvent->num_lps * (volume_group->pe_size);
1487
+ new_volume->lv_access = AIXlvent->permissions | EVMS_LV_NEW; // All volumes start new.
1488
+ new_volume->lv_status = AIXlvent->lv_state;
1489
+ //new_volume->lv_minor = MINOR(1);
1490
+ new_volume->mirror_copies = AIXlvent->mirror;
1491
+ new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING;
1492
+ new_volume->stripes = AIXlvent->striping_width;
1493
+ new_volume->stripe_size = stripesize;
1494
+ new_volume->stripe_size_shift = evms_cs_log2(stripesize);
1495
+ new_volume->pe_size = volume_group->pe_size;
1496
+ new_volume->pe_size_shift = evms_cs_log2(volume_group->pe_size);
1497
+ new_volume->num_le = AIXlvent->num_lps;
1498
+ new_volume->new_volume = TRUE;
1499
+ new_volume->group = volume_group;
1501
+ sprintf(new_volume->name, "aix/%s", lv_name);
1503
+ if (!AIX_BH_list_pool && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1504
+ AIX_BH_list_pool = evms_cs_create_pool(sizeof(aix_mirror_bh_t), "EVMS_AIX_BH", aix_notify_cache_ctor, NULL);
1505
+ if (!AIX_BH_list_pool) {
1510
+ LOG_DEBUG("NLV lv_number:%d name:%s lv_size %Ld \n", new_volume->lv_number, new_volume->name, new_volume->lv_size);
1511
+ LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n", new_volume->stripe_size, new_volume->stripe_size_shift);
1513
+ return new_volume;
1516
+ * Function: aix_notify_cache_ctor
1517
+ * this function initializes the b_wait field in the buffer heads
1518
+ * in our private buffer head pool.
1521
+aix_notify_cache_ctor(
1523
+ kmem_cache_t * cachep,
1524
+ unsigned long flags)
1526
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
1527
+ SLAB_CTOR_CONSTRUCTOR) {
1528
+ aix_mirror_bh_t *rbh = (aix_mirror_bh_t *)foo;
1529
+ memset(rbh, 0, sizeof(aix_mirror_bh_t));
1530
+ init_waitqueue_head(&rbh->bh_req.b_wait);
1535
+ * Function: build_pe_maps
1537
+ * After all logical volumes have been discovered, the mappings from
1538
+ * logical extents to physical extents must be constructed. Each PV
1539
+ * contains a map on-disk of its PEs. Each PE map entry contains the
1540
+ * logical volume number and the logical extent number on that volume.
1541
+ * Our internal map is the reverse of this map for each volume, listing
1542
+ * the PV node and sector offset for every logical extent on the volume.
1544
+static int build_pe_maps( aix_volume_group_t * volume_group)
1546
+ partition_list_entry_t * partition;
1547
+ partition_list_entry_t * mirror_partition;
1548
+ pp_entries * AIXppent, *AIXppent_buff;
1549
+ pv_header * AIXpvh;
1551
+ u_int32_t le_number;
1552
+ u_int32_t j, pp_count,pvh_pos;
1553
+ u_int32_t MirrorFound;
1554
+#ifdef EVMS_DEBUG_MIRRORS
1555
+ u_int32_t lv_found, all_lvs_found;
1556
+ u_int32_t mirs = 0;
1559
+ LOG_DEBUG(" *** BPEM ***\n");
1560
+ // For every partition in this VG
1562
+ if (evms_cs_allocate_memory((void**)&AIXppent_buff, (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET))) {
1566
+ if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
1567
+ evms_cs_deallocate_memory(AIXppent_buff);
1571
+ LOG_DEBUG(" BPEM AIXppent_buff:%d \n", (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET));
1573
+ for ( partition = volume_group->partition_list; partition; partition = partition->next ) {
1575
+ LOG_DEBUG(" BPEM partition:%p next:%p\n", partition, partition->next);
1577
+ pvh_pos = AIX_PVH_DATA_PSN(volume_group->vgda_psn, partition->pv_number);
1579
+ LOG_DEBUG(" BPEM pvh_pos:%d\n", pvh_pos);
1581
+ if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) {
1582
+ evms_cs_deallocate_memory(AIXppent_buff);
1583
+ evms_cs_deallocate_memory(AIXpvh);
1587
+ // For every entry in the PE map, calculate the PE's sector offset
1588
+ // and update the correct LV's PE map. LV number of 0 marks an unused PE.
1589
+ // For re-discovery, only compute entries for new volumes.
1591
+ if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH, AIXppent_buff)) {
1592
+ evms_cs_deallocate_memory(AIXppent_buff);
1593
+ evms_cs_deallocate_memory(AIXpvh);
1597
+ AIXppent = AIXppent_buff;
1600
+ pp_count = AIXpvh->pp_count;
1602
+ LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n",
1603
+ volume_group->vg_id.word2,
1607
+ AIXppent->lv_index,
1610
+ for (j = 0; j < pp_count; j++) {
1611
+ if (AIXppent->lv_index && AIXppent->pp_state ) {
1613
+ LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%ld cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n",
1614
+ volume_group->vg_id.word2, j+1, AIXppent->pp_state, volume_group->volume_list[AIXppent->lv_index-1]->name,
1615
+ AIXppent->lv_index,
1616
+ AIXppent->lp_num, AIXppent->copy,
1617
+ AIXppent->fst_alt_vol, AIXppent->fst_alt_part,
1618
+ AIXppent->snd_alt_vol, AIXppent->snd_alt_part);
1620
+ le_number = AIXppent->lp_num -1; // AIX lp's start @ 1, we want a 0 index
1621
+ offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1);
1623
+ LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n",
1626
+ AIXppent->lv_index,
1627
+ volume_group->volume_list[AIXppent->lv_index-1]->name);
1629
+ if (volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map &&
1630
+ le_number <= volume_group->volume_list[AIXppent->lv_index-1]->num_le) {
1631
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].owning_pv = partition;
1632
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].pe_sector_offset = offset;
1636
+ if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies > AIX_DEFAULT_MIRRORING) {
1638
+ LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n", AIXppent->lv_index);
1640
+ for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
1642
+ if (mirror_partition->pv_number == AIXppent->fst_alt_vol) {
1644
+ offset = (((AIXppent->fst_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
1647
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].owning_pv = mirror_partition;
1648
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset;
1650
+ LOG_EXTRA(" PE Map: mirror_partition:%p \n", mirror_partition);
1651
+ LOG_EXTRA(" PE Map: mirror_sector_offet:%d\n", AIXppent->fst_alt_part);
1653
+ MirrorFound = TRUE;
1657
+ if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies == AIX_MAX_MIRRORS) {
1659
+ for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
1661
+ if (mirror_partition->pv_number == AIXppent->snd_alt_vol) {
1663
+ offset = (((AIXppent->snd_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
1665
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv = mirror_partition;
1666
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset;
1668
+ LOG_EXTRA(" PE Map: mirror_partition2:%p \n", mirror_partition);
1669
+ LOG_EXTRA(" PE Map: mirror_sector_offet2:%d\n", AIXppent->snd_alt_part);
1671
+ MirrorFound = TRUE;
1677
+ } // End of if mirroring is enabled
1686
+// LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs);
1688
+#ifdef EVMS_DEBUG_MIRRORS
1689
+ for (mirs = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && mirs < LVM_MAXLVS; mirs++) {
1691
+ if (volume_group->volume_list[mirs] != NULL) {
1692
+ if (volume_group->volume_list[mirs]->lv_status == LV_ACTIVE) {
1696
+ LOG_DEBUG(" PE Map: owning part lv %d -- %p\n", mirs, volume_group->volume_list[mirs]->le_to_pe_map[0].owning_pv);
1697
+ if (volume_group->volume_list[mirs]->mirror_copies > AIX_DEFAULT_MIRRORING) {
1698
+ LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir1[0].owning_pv);
1700
+ if (volume_group->volume_list[mirs]->mirror_copies == AIX_MAX_MIRRORS) {
1701
+ LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir2[0].owning_pv);
1704
+ if (lv_found == volume_group->numlvs) {
1705
+ all_lvs_found = TRUE;
1706
+ LOG_DEBUG(" PE Map: all_lvs_found\n" );
1712
+ evms_cs_deallocate_memory(AIXpvh);
1713
+ evms_cs_deallocate_memory(AIXppent_buff);
1718
+ * Function: check_log_volume_and_pe_maps
1720
+ * Make sure all volumes in this group have valid LE-to-PE maps.
1721
+ * Any volume that doesn't is deleted. This is safe for re-discovery
1722
+ * because only new volumes could have corrupted PE maps.
1724
+static int check_log_volume_and_pe_maps( aix_volume_group_t * group )
1726
+ aix_logical_volume_t * volume;
1727
+ int i, j, lv_found, all_lvs_found;
1729
+ LOG_DEBUG(" check_pe_map.\n");
1731
+ for ( i = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && i < LVM_MAXLVS; i++ ) {
1732
+ if ( ! group->volume_list[i] ) {
1733
+ LOG_DEBUG(" CPEM No Volume %d found \n",i);
1737
+ volume = group->volume_list[i];
1738
+ if ( ! volume->le_to_pe_map ) {
1739
+ LOG_DEBUG(" CPEM Volume %s has no PE map.\n",volume->name);
1740
+ delete_logical_volume(volume);
1744
+ LOG_DEBUG(" CPEM volume %s num_le: %d \n",volume->name, volume->num_le);
1748
+ if (lv_found == group->numlvs) {
1749
+ all_lvs_found = TRUE;
1754
+ for ( j = 0; j < volume->num_le; j++) {
1755
+ if ( ! volume->le_to_pe_map[j].owning_pv ||
1756
+ ! volume->le_to_pe_map[j].pe_sector_offset ) {
1757
+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n",volume->name, j);
1758
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
1761
+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1762
+ if ( ! volume->le_to_pe_map_mir1[j].owning_pv ||
1763
+ ! volume->le_to_pe_map_mir1[j].pe_sector_offset ) {
1764
+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n",volume->name, j);
1765
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
1768
+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
1769
+ if ( ! volume->le_to_pe_map_mir2[j].owning_pv ||
1770
+ ! volume->le_to_pe_map_mir2[j].pe_sector_offset ) {
1771
+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n",volume->name, j);
1772
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
1779
+ LOG_EXTRA(" Leaving check_pe_map.\n");
1783
+ * Function: export_volumes
1785
+ * The last thing this VGE must do is take each constructed volume and
1786
+ * place it back on the evms logical partition list.
1788
+static int export_volumes( evms_logical_node_t ** evms_partition_list )
1790
+ aix_volume_group_t * AIXVGLPtr;
1791
+ evms_logical_node_t * new_node;
1792
+ aix_logical_volume_t * volume;
1793
+ int j, lv_found, all_lvs_found;
1796
+ AIXVGLPtr = AIXVolumeGroupList;
1798
+ while (AIXVGLPtr) {
1800
+ if (AIXVGLPtr->flags & EVMS_VG_DIRTY) {
1802
+ LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n",AIXVGLPtr->numpvs,AIXVGLPtr->numlvs);
1804
+ // Export every valid volume in the group. For re-discovery,
1805
+ // make sure we are only exporting "new" volumes.
1807
+ for ( j = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && j < LVM_MAXLVS ; j++ ) {
1808
+ if (AIXVGLPtr->volume_list[j] != NULL ) {
1809
+ if (AIXVGLPtr->volume_list[j]->new_volume == TRUE) {
1811
+ LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n",j, AIXVGLPtr->volume_list[j]);
1812
+ volume = AIXVGLPtr->volume_list[j];
1815
+ if (lv_found == AIXVGLPtr->numlvs) {
1816
+ all_lvs_found = TRUE;
1819
+ // For new volumes, create a new EVMS node and
1820
+ // initialize the appropriate fields.
1821
+ if ( volume->lv_access & EVMS_LV_NEW ) {
1822
+ if ( evms_cs_allocate_logical_node( &new_node ) ) {
1823
+ LOG_DEBUG(" Export Vol Error allocating node !!\n");
1826
+ LOG_DEBUG(" EV Node allocated OK\n");
1829
+ volume->new_volume = 0;
1830
+ volume->volume_node = new_node;
1831
+ volume->lv_access &= (~EVMS_LV_NEW);
1832
+ new_node->hardsector_size = AIXVGLPtr->hard_sect_size;
1833
+ new_node->block_size = AIXVGLPtr->block_size;
1834
+ new_node->plugin = &plugin_header;
1835
+ new_node->instance_data = volume;
1836
+ new_node->total_vsectors = volume->lv_size;
1839
+ LOG_DEBUG(" EV volume->name:[%s]\n",volume->name);
1841
+ strncpy(new_node->name, volume->name, EVMS_VOLUME_NAME_SIZE+1);
1844
+ // Is the volume read-only?
1845
+ if ( !(volume->lv_access & AIX_LV_WRITE) ||
1846
+ volume->lv_access & EVMS_LV_INCOMPLETE ) {
1847
+ new_node->flags |= EVMS_VOLUME_SET_READ_ONLY;
1848
+ LOG_DEBUG(" EV Read Only volume->lv_access:%d\n",volume->lv_access);
1851
+ LOG_DEBUG(" EV Node [%s] allocated previously\n",volume->name);
1854
+ evms_cs_add_logical_node_to_list( evms_partition_list, new_node );
1857
+ LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n", volume, new_node,new_node->name);
1859
+ evms_cs_add_logical_node_to_list( evms_partition_list, AIXVGLPtr->volume_list[j]->volume_node);
1861
+ LOG_DEBUG(" ELV vol_list[%d]%p\n",j, AIXVGLPtr->volume_list[j]);
1864
+ LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j);
1866
+ } // end checking all lvs
1869
+ LOG_DEBUG(" ELV Existing volume -- %d\n",AIXVGLPtr->vg_id.word2);
1872
+ AIXVGLPtr->flags &= ~EVMS_VG_DIRTY;
1873
+ AIXVGLPtr = AIXVGLPtr->next;
1881
+ * Function: delete_logical_volume
1883
+ * This function deletes the in-memory representation of a single LVM
1884
+ * logical volume, including its PE map and any snapshot data. It does
1885
+ * not alter the parent volume group, except to remove this volume from
1886
+ * its volume list.
1888
+static int delete_logical_volume( aix_logical_volume_t * volume )
1890
+ aix_volume_group_t * group = volume->group;
1892
+ LOG_DEBUG(" Deleting volume %s\n",volume->name);
1894
+ // Now free up all the memory. This includes the LE-to-PE map, any
1895
+ // mirror PEs, etc.
1896
+ if ( volume->le_to_pe_map ) {
1897
+ evms_cs_deallocate_memory( volume->le_to_pe_map );
1898
+ volume->le_to_pe_map = NULL;
1901
+ if ( volume->le_to_pe_map_mir1 ) {
1902
+ evms_cs_deallocate_memory( volume->le_to_pe_map_mir1 );
1903
+ volume->le_to_pe_map_mir1 = NULL;
1906
+ if ( volume->le_to_pe_map_mir2 ) {
1907
+ evms_cs_deallocate_memory( volume->le_to_pe_map_mir2 );
1908
+ volume->le_to_pe_map_mir2 = NULL;
1911
+ // Remove this volume from the volume-group's list.
1912
+ if ( group && group->volume_list[volume->lv_number] == volume ) {
1913
+ group->volume_list[volume->lv_number] = NULL;
1917
+ evms_cs_deallocate_memory(volume);
1923
+/* Function: remove_group_from_list
1925
+ * Remove an LVM volume group from the global LVM list.
1927
+static int remove_group_from_list( aix_volume_group_t * group )
1929
+ aix_volume_group_t ** p_group;
1931
+ for ( p_group = &AIXVolumeGroupList; *p_group; p_group = &(*p_group)->next ) {
1932
+ if ( *p_group == group ) {
1933
+ *p_group = (*p_group)->next;
1934
+ group->next = NULL;
1943
+ * Function: delete_aix_node
1945
+ * This function deletes the in-memory representation of an LVM
1946
+ * logical volume. Right now it makes a lot of assumptions about
1947
+ * the data in the group not being corrupted. It would be possible
1948
+ * to put in a lot of consistency checks before deleting everything
1949
+ * to indicate if problems have occurred during the lifetime of the
1950
+ * volume and its volume group.
1952
+static int delete_aix_node( evms_logical_node_t * logical_node )
1954
+ aix_logical_volume_t * volume = (aix_logical_volume_t*)(logical_node->instance_data);
1955
+ aix_volume_group_t * group = volume->group;
1957
+ if ( delete_logical_volume(volume) ) {
1961
+ // If we just removed the last volume from this group, the entire group
1962
+ // can also be deleted.
1963
+ if ( group && group->numlvs == 0) {
1964
+ remove_group_from_list(group);
1965
+ deallocate_volume_group(group);
1968
+ // Free the logical node.
1969
+ evms_cs_deallocate_logical_node(logical_node);
1974
+/* Function: deallocate_volume_group
1976
+ * This function deletes the entire in-memory representation of an LVM
1977
+ * volume group, including all partitions and logical volumes. If this
1978
+ * group is on the VGE's volume group list, it is removed.
1980
+static int deallocate_volume_group( aix_volume_group_t * group )
1982
+ partition_list_entry_t * partition;
1983
+ partition_list_entry_t * next_part;
1986
+ LOG_DEBUG(" Deleting volume group %x\n",group->vg_id.word2);
1989
+ // Delete all partitions from the group's list.
1990
+ for ( partition = group->partition_list; partition; partition = next_part ) {
1992
+ next_part = partition->next;
1994
+ if ( partition->logical_node ) {
1995
+ // Send a delete command down to the partition manager.
1996
+ LOG_DEBUG(" Deleting PV %d from group %x\n",partition->pv_number,group->vg_id.word2);
1997
+ DELETE(partition->logical_node);
1999
+ evms_cs_deallocate_memory(partition);
2003
+ // Delete all logical volumes, and the array of pointers.
2004
+ for ( i = 0; i < LVM_MAXLVS; i++ ) {
2005
+ if ( group->volume_list[i] ) {
2006
+ delete_logical_volume(group->volume_list[i]);
2010
+ evms_cs_deallocate_memory(group);
2014
+/* Function: end_discover_aix
2016
+ * The discovery process at the region-manager level is now iterative,
2017
+ * much like the EVMS feature level. To accomplish this correctly, and
2018
+ * also to accomplish partial volume discovery, a second discover
2019
+ * entry point is needed, so EVMS can tell the region managers that
2020
+ * discovery is over, and to finish up any discovery that is not yet
2021
+ * complete. When this function is called, it should be assumed that
2022
+ * the node list has had nothing new added to it since the last call
2023
+ * of the regular discover function. Therefore, when this function is
2024
+ * called, we do not need to try to discovery any additional volume
2025
+ * groups. We will, however, look for logical volumes once more. This
2026
+ * gives us the ability to export (read-only) volumes that have
2027
+ * partially corrupted LE maps due to missing PVs in their VG.
2029
+static int end_discover_aix(evms_logical_node_t ** evms_logical_disk_head)
2034
+ LOG_DEBUG("Final Discovery:\n");
2037
+ if ( (rc = discover_logical_volumes()) ) {
2041
+ rc = export_volumes(evms_logical_disk_head);
2047
+/****************************************************
2048
+* Function: AIX_alloc_wbh
2050
+* Alloc any buffer heads from the pool and return a linked list
2053
+*****************************************************/
2054
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t * node,
2055
+ evms_logical_node_t * node2,
2056
+ evms_logical_node_t * node3,
2058
+ uint32_t mirror_copies,
2059
+ evms_sector_t new_sector2,
2060
+ evms_sector_t new_sector3)
2063
+ aix_mirror_bh_t * tmp_bh = NULL, *head_bh = NULL;
2066
+ head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2069
+ LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2073
+ head_bh->master_bh = eio->bh;
2074
+ head_bh->mirror_bh_list = NULL;
2075
+ atomic_set(&head_bh->remaining, 0);
2077
+ for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) {
2079
+ tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2081
+ LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2085
+ tmp_bh->next_r1 = head_bh->mirror_bh_list;
2086
+ head_bh->mirror_bh_list = tmp_bh;
2087
+ atomic_inc(&head_bh->remaining);
2089
+ memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
2090
+ init_waitqueue_head(&tmp_bh->bh_req.b_wait);
2091
+// tmp_bh->master_bh = eio->bh;
2092
+// tmp_bh->iteration = AIX_DEFAULT_MIRRORING + i;
2093
+ tmp_bh->eio.rsize = eio->rsize;
2094
+ tmp_bh->eio.bh = &tmp_bh->bh_req;
2098
+ case AIX_DEFAULT_MIRRORING:
2099
+ tmp_bh->node = node;
2100
+ tmp_bh->eio.rsector = eio->rsector;
2103
+ case AIX_FIRST_MIRROR:
2104
+ tmp_bh->node = node2;
2105
+ tmp_bh->eio.rsector = new_sector2;
2108
+ case AIX_MAX_MIRRORS:
2109
+ tmp_bh->node = node3;
2110
+ tmp_bh->eio.rsector = new_sector3;
2114
+ tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives; //setup callback routine
2115
+ tmp_bh->bh_req.b_private = (void*)head_bh;
2122
+/****************************************************
2123
+* Function: AIX_handle_write_mirror_drives
2125
+* Handles a write from a set of mirrored AIX LVs
2129
+*****************************************************/
2130
+static void AIX_handle_write_mirror_drives(struct buffer_head * bh,
2133
+ aix_logical_volume_t * volume;
2134
+ evms_logical_node_t * node;
2135
+ aix_mirror_bh_t * tmp_bh = NULL, * tmp_bh2 = NULL;
2136
+ kdev_t tmp_b_dev = bh->b_dev;
2139
+ tmp_bh = (aix_mirror_bh_t *)bh->b_private;
2140
+ node = tmp_bh->node;
2141
+ volume = (aix_logical_volume_t *) node->instance_data;
2143
+ LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
2147
+ AIX_evms_cs_notify_lv_io_error(node);
2150
+ if (atomic_dec_and_test(&tmp_bh->remaining)) {
2151
+ tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2152
+ tmp_bh2 = tmp_bh->mirror_bh_list;
2153
+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2156
+ tmp_bh = tmp_bh2->next_r1;
2157
+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2);
2161
+ evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
2167
+/****************************************************
2168
+* Function: AIX_alloc_rbh
2170
+* Alloc any buffer heads from the pool and return a linked list
2173
+*****************************************************/
2174
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t * node,
2176
+ uint32_t mirror_copies,
2177
+ evms_sector_t org_sector,
2180
+ aix_mirror_bh_t * tmp_bh = NULL;
2182
+ tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2185
+ LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2189
+ memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
2190
+ tmp_bh->node = node;
2191
+ tmp_bh->master_bh = eio->bh;
2192
+ tmp_bh->iteration = AIX_FIRST_MIRROR;
2193
+ tmp_bh->eio.rsector = eio->rsector;
2194
+ tmp_bh->eio.rsize = eio->rsize;
2195
+ tmp_bh->eio.bh = &tmp_bh->bh_req;
2198
+ tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives; //setup callback routine
2199
+ tmp_bh->bh_req.b_private = (void*)tmp_bh;
2201
+ tmp_bh->cmd = cmd;
2202
+ tmp_bh->next_r1 = NULL;
2203
+ tmp_bh->node = node;
2209
+static void AIX_reschedule_retry (aix_mirror_bh_t *aix_bh)
2211
+ unsigned long flags;
2213
+ spin_lock_irqsave(&AIX_retry_list_lock, flags);
2214
+ if (AIX_retry_list == NULL)
2215
+ AIX_retry_tail = &AIX_retry_list;
2216
+ *AIX_retry_tail = aix_bh;
2217
+ AIX_retry_tail = &aix_bh->next_r1;
2218
+ aix_bh->next_r1 = NULL;
2219
+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2220
+ evms_cs_wakeup_thread(AIX_mirror_thread);
2222
+/****************************************************
2223
+* Function: AIX_handle_read_mirror_drives
2225
+* Handles a read from a set of mirrored AIX LVs
2229
+*****************************************************/
2230
+static void AIX_handle_read_mirror_drives(struct buffer_head * bh,
2233
+ aix_logical_volume_t * volume;
2234
+ evms_logical_node_t * node;
2235
+ aix_mirror_bh_t * tmp_bh;
2236
+ kdev_t tmp_b_dev = bh->b_dev;
2239
+ tmp_bh = (aix_mirror_bh_t *)bh->b_private;
2240
+ volume = (aix_logical_volume_t *) tmp_bh->node->instance_data;
2241
+ node = tmp_bh->node;
2243
+ LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
2245
+ if (!uptodate && tmp_bh->iteration < volume->mirror_copies) {
2246
+ AIX_evms_cs_notify_lv_io_error(node);
2247
+ AIX_reschedule_retry(tmp_bh);
2249
+ tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2250
+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2251
+ evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
2259
+/****************************************************
2260
+* This is a temporary function until a common EVMS
2261
+* notification function can be created.
2263
+*****************************************************/
2264
+static int AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node)
2266
+ aix_logical_volume_t * volume;
2268
+ volume = (aix_logical_volume_t *)node->instance_data;
2270
+ LOG_CRITICAL("Notify_ERROR !! node:%p volume->lv_status:%d volume->name:[%s]\n", node, volume->lv_status,volume->name);
2275
+/* Function: lvm_cleanup
2277
+ * This function runs through the entire lvm data structure, removing
2278
+ * all items that are not needed at runtime. Currently, this is just the
2279
+ * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
2280
+ * groups that don't contain any volumes are deleted. All of the other
2281
+ * volume_group, logical_volume and evms_logical_node structures will be
2282
+ * kept around at run-time.
2284
+static int lvm_cleanup( void )
2286
+ aix_volume_group_t * group;
2288
+ group = AIXVolumeGroupList;
2292
+ if (group->AIXvgh) {
2293
+ evms_cs_deallocate_memory(group->AIXvgh);
2294
+ group->AIXvgh = NULL;
2297
+ group = group->next;
2303
+/****************************************************
2304
+* Function: AIX_copy_header_info
2306
+* Copy the disk header info into the volume struct
2307
+* so we can use it later.
2311
+*****************************************************/
2312
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2)
2315
+ LOG_DEBUG("CHI AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
2319
+ AIXvgh->vg_timestamp.tv_sec = AIXvgh2->vg_timestamp.tv_sec;
2320
+ AIXvgh->vg_timestamp.tv_nsec = AIXvgh2->vg_timestamp.tv_nsec;
2321
+ AIXvgh->vg_id.word1 = AIXvgh2->vg_id.word1;
2322
+ AIXvgh->vg_id.word2 = AIXvgh2->vg_id.word2;
2323
+ AIXvgh->vg_id.word3 = AIXvgh2->vg_id.word3;
2324
+ AIXvgh->vg_id.word4 = AIXvgh2->vg_id.word4;
2325
+ AIXvgh->numlvs = AIXvgh2->numlvs;
2326
+ AIXvgh->maxlvs = AIXvgh2->maxlvs;
2327
+ AIXvgh->pp_size = AIXvgh2->pp_size;
2328
+ AIXvgh->numpvs = AIXvgh2->numpvs;
2329
+ AIXvgh->total_vgdas = AIXvgh2->total_vgdas;
2330
+ AIXvgh->vgda_size = AIXvgh2->vgda_size;
2331
+ AIXvgh->bigvg = AIXvgh2->bigvg;
2332
+ AIXvgh->quorum = AIXvgh2->quorum;
2333
+ AIXvgh->auto_varyon = AIXvgh2->auto_varyon;
2334
+ AIXvgh->checksum = AIXvgh2->checksum;
2335
+ AIXvgh->bigda_size = AIXvgh2->bigda_size;
2341
+ LOG_DEBUG("Returning CHI AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
2345
+/****************************************************
2346
+* Function: AIX_free_header
2352
+*****************************************************/
2353
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2)
2357
+ evms_cs_deallocate_memory(AIXvgh);
2362
+ evms_cs_deallocate_memory(AIXvgh2);
2367
+ evms_cs_deallocate_memory(AIXvgt);
2372
+ evms_cs_deallocate_memory(AIXvgt2);
2378
+/****************************************************
2381
+* This is a kernel thread that handles read/write of mirrors
2382
+* This shouldn't ever run on a non-mirrored LV read/write
2385
+*****************************************************/
2386
+static void AIXiod (void *data)
2388
+ aix_mirror_bh_t * r1_bh;
2389
+ evms_logical_node_t * node;
2390
+ unsigned long flags;
2395
+ spin_lock_irqsave(&AIX_retry_list_lock, flags);
2396
+ if (AIX_retry_list == NULL){
2397
+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2400
+ r1_bh = AIX_retry_list;
2401
+ AIX_retry_list = r1_bh->next_r1;
2402
+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2403
+ r1_bh->next_r1 = NULL; // for mark
2405
+ switch (r1_bh->cmd) {
2408
+ r1_bh->iteration++;
2409
+ LOG_DEBUG("Report from thread AIXiod READ\n");
2411
+ if (r1_bh->iteration == AIX_FIRST_MIRROR) {
2412
+ node = r1_bh->mir_node1;
2413
+ r1_bh->eio.rsector = r1_bh->mir_sector1;
2415
+ node = r1_bh->mir_node2;
2416
+ r1_bh->eio.rsector = r1_bh->mir_sector2;
2420
+ R_IO(node, &r1_bh->eio);
2425
+ LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n", r1_bh->cmd);
2432
+/****************************************************
2433
+* Function: AIX_volume_group_dump
2435
+* This is for debug purposes and will walk the volume group list
2436
+* and the LVs within the volume groups
2438
+* It can be called at anytime however the output to the display is large
2440
+*****************************************************/
2441
+#ifdef EVMS_AIX_DEBUG
2442
+static int AIX_volume_group_dump(void)
2444
+ aix_volume_group_t * AIXVGLDebugPtr;
2445
+ partition_list_entry_t * DebugPartitionList;
2446
+ aix_logical_volume_t * DebugLVList;
2449
+ AIXVGLDebugPtr = AIXVolumeGroupList;
2451
+ if (!AIXVGLDebugPtr) {
2452
+ LOG_DEBUG("***********************************************\n");
2453
+ LOG_DEBUG("ERROR Nothing built in the list to check !!! \n");
2454
+ LOG_DEBUG("***********************************************\n");
2458
+ LOG_DEBUG("*********************************************** \n");
2459
+ LOG_DEBUG("Begin Volume Group Dump \n");
2460
+ LOG_DEBUG("*********************************************** \n");
2462
+ while (AIXVGLDebugPtr) {
2464
+ LOG_DEBUG("vg_number %x\n",AIXVGLDebugPtr->vg_id.word2 );
2465
+ LOG_DEBUG("numpvs %d\n",AIXVGLDebugPtr->numpvs );
2466
+ LOG_DEBUG("numlvs %d\n",AIXVGLDebugPtr->numlvs );
2467
+ LOG_DEBUG("hard_sect_size %d\n",AIXVGLDebugPtr->hard_sect_size);
2468
+ LOG_DEBUG("block_size %d\n",AIXVGLDebugPtr->block_size );
2469
+ LOG_DEBUG("flags %d\n",AIXVGLDebugPtr->flags );
2470
+ LOG_DEBUG("lv_max %d\n",AIXVGLDebugPtr->lv_max );
2471
+ LOG_DEBUG("pe_size %d\n",AIXVGLDebugPtr->pe_size );
2472
+ LOG_DEBUG("CleanVGInfo %d\n",AIXVGLDebugPtr->CleanVGInfo );
2474
+ DebugPartitionList = AIXVGLDebugPtr->partition_list;
2476
+ LOG_DEBUG("********* Begin Volume Partition Dump ********* \n");
2478
+ if (!DebugPartitionList) {
2479
+ LOG_DEBUG("No partitions to check !! \n");
2483
+ while (DebugPartitionList) {
2484
+ LOG_DEBUG("logical_node %p\n",DebugPartitionList->logical_node );
2485
+ LOG_DEBUG("pv_number %d\n",DebugPartitionList->pv_number );
2486
+ LOG_DEBUG("block_size %d\n",DebugPartitionList->block_size );
2487
+ LOG_DEBUG("hard_sect_size %d\n",DebugPartitionList->hard_sect_size );
2488
+ LOG_DEBUG("-------------------------------------------------------------\n");
2489
+ DebugPartitionList = DebugPartitionList->next;
2492
+ LOG_DEBUG("********* End Volume Partition Dump **********\n");
2494
+ LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n");
2496
+ DebugLVList = AIXVGLDebugPtr->volume_list[0];
2498
+ if (!DebugLVList) {
2499
+ LOG_DEBUG("No logical volumes to check !! \n");
2502
+ for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) {
2504
+ DebugLVList = AIXVGLDebugPtr->volume_list[i];
2506
+ if (DebugLVList) {
2507
+ LOG_DEBUG("volume_list # %d \n", i );
2508
+ LOG_DEBUG("lv_number %d \n", DebugLVList->lv_number );
2509
+ LOG_DEBUG("LV name %s \n", DebugLVList->name );
2510
+ LOG_DEBUG("lv_size %Ld \n", DebugLVList->lv_size );
2511
+ LOG_DEBUG("lv_access %d \n", DebugLVList->lv_access );
2512
+ LOG_DEBUG("lv_status %d \n", DebugLVList->lv_status );
2513
+ LOG_DEBUG("lv_minor %d \n", DebugLVList->lv_minor );
2514
+ LOG_DEBUG("mirror_copies %d \n", DebugLVList->mirror_copies );
2515
+ LOG_DEBUG("mirror_number %d \n", DebugLVList->mirror_number );
2516
+ LOG_DEBUG("stripes %d \n", DebugLVList->stripes );
2517
+ LOG_DEBUG("stripe_size %d \n", DebugLVList->stripe_size );
2518
+ LOG_DEBUG("stripe_size_shift%d \n", DebugLVList->stripe_size_shift);
2519
+ LOG_DEBUG("pe_size %d \n", DebugLVList->pe_size );
2520
+ LOG_DEBUG("pe_size_shift %d \n", DebugLVList->pe_size_shift );
2521
+ LOG_DEBUG("num_le %d \n", DebugLVList->num_le );
2522
+ LOG_DEBUG("new_volume %d \n", DebugLVList->new_volume );
2523
+ LOG_DEBUG("group %p \n", DebugLVList->group );
2529
+ AIXVGLDebugPtr = AIXVGLDebugPtr->next;
2531
+ LOG_DEBUG("********** End Logical Volume Partition Dump **********\n");
2536
+ LOG_DEBUG("***********************************************\n");
2537
+ LOG_DEBUG("End Volume Group Dump \n");
2538
+ LOG_DEBUG("***********************************************\n");
2545
diff -Naur linux-2002-03-28/drivers/evms/Config.in evms-2002-03-28/drivers/evms/Config.in
2546
--- linux-2002-03-28/drivers/evms/Config.in Wed Dec 31 18:00:00 1969
2547
+++ evms-2002-03-28/drivers/evms/Config.in Mon Mar 18 16:54:45 2002
2550
+# Copyright (c) International Business Machines Corp., 2000
2552
+# This program is free software; you can redistribute it and/or modify
2553
+# it under the terms of the GNU General Public License as published by
2554
+# the Free Software Foundation; either version 2 of the License, or
2555
+# (at your option) any later version.
2557
+# This program is distributed in the hope that it will be useful,
2558
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
2559
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
2560
+# the GNU General Public License for more details.
2562
+# You should have received a copy of the GNU General Public License
2563
+# along with this program; if not, write to the Free Software
2564
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2567
+# EVMS driver configuration
2570
+mainmenu_option next_comment
2571
+comment 'Enterprise Volume Management System'
2573
+tristate 'EVMS Kernel Runtime' CONFIG_EVMS
2574
+dep_tristate ' EVMS Local Device Manager Plugin' CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN $CONFIG_EVMS
2575
+dep_tristate ' EVMS DOS Partition Manager Plugin' CONFIG_EVMS_DOS_PARTITION_PLUGIN $CONFIG_EVMS
2576
+dep_tristate ' EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT_PLUGIN $CONFIG_EVMS
2577
+dep_tristate ' EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK_PLUGIN $CONFIG_EVMS
2578
+dep_tristate ' EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR_PLUGIN $CONFIG_EVMS
2579
+dep_tristate ' EVMS Linux LVM Package' CONFIG_EVMS_LVM_PLUGIN $CONFIG_EVMS
2580
+dep_tristate ' EVMS Linux MD Package' CONFIG_EVMS_MD_PLUGIN $CONFIG_EVMS
2581
+dep_tristate ' EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR_PERS $CONFIG_EVMS_MD_PLUGIN
2582
+dep_tristate ' EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0_PERS $CONFIG_EVMS_MD_PLUGIN
2583
+dep_tristate ' EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1_PERS $CONFIG_EVMS_MD_PLUGIN
2584
+dep_tristate ' EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5_PERS $CONFIG_EVMS_MD_PLUGIN
2585
+dep_tristate ' EVMS AIX LVM Package' CONFIG_EVMS_AIX_PLUGIN $CONFIG_EVMS
2586
+dep_tristate ' EVMS OS/2 LVM Package' CONFIG_EVMS_OS2_PLUGIN $CONFIG_EVMS
2587
+dep_tristate ' EVMS Clustering Package' CONFIG_EVMS_ECR_PLUGIN $CONFIG_EVMS
2589
+if [ "$CONFIG_ARCH_S390" = "y" ]; then
2590
+dep_tristate ' EVMS s390 Partition Manager Plugin' CONFIG_EVMS_S390_PART_PLUGIN $CONFIG_EVMS
2593
+if [ "$CONFIG_EVMS" != "n" ]; then
2594
+ choice ' EVMS Debug Level' \
2595
+ "Critical CONFIG_EVMS_INFO_CRITICAL \
2596
+ Serious CONFIG_EVMS_INFO_SERIOUS \
2597
+ Error CONFIG_EVMS_INFO_ERROR \
2598
+ Warning CONFIG_EVMS_INFO_WARNING \
2599
+ Default CONFIG_EVMS_INFO_DEFAULT \
2600
+ Details CONFIG_EVMS_INFO_DETAILS \
2601
+ Debug CONFIG_EVMS_INFO_DEBUG \
2602
+ Extra CONFIG_EVMS_INFO_EXTRA \
2603
+ Entry_Exit CONFIG_EVMS_INFO_ENTRY_EXIT \
2604
+ Everything CONFIG_EVMS_INFO_EVERYTHING" Default
2609
diff -Naur linux-2002-03-28/drivers/evms/Makefile evms-2002-03-28/drivers/evms/Makefile
2610
--- linux-2002-03-28/drivers/evms/Makefile Wed Dec 31 18:00:00 1969
2611
+++ evms-2002-03-28/drivers/evms/Makefile Thu Mar 28 15:13:34 2002
2614
+# Makefile for the kernel EVMS driver and modules.
2616
+# 08 March 2001, Mark Peloquin <peloquin@us.ibm.com>
2619
+O_TARGET := evmsdrvr.o
2621
+export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o md_raid1.o md_raid5.o md_xor.o s390_part.o
2623
+# Link order is important! Plugins must come first, then the EVMS core.
2625
+obj-$(CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN) += ldev_mgr.o
2626
+obj-$(CONFIG_EVMS_DOS_PARTITION_PLUGIN) += dos_part.o
2627
+obj-$(CONFIG_EVMS_MD_PLUGIN) += md_core.o
2628
+obj-$(CONFIG_EVMS_MD_LINEAR_PERS) += md_linear.o
2629
+obj-$(CONFIG_EVMS_MD_RAID0_PERS) += md_raid0.o
2630
+obj-$(CONFIG_EVMS_MD_RAID1_PERS) += md_raid1.o
2631
+obj-$(CONFIG_EVMS_MD_RAID5_PERS) += md_raid5.o md_xor.o
2632
+obj-$(CONFIG_EVMS_LVM_PLUGIN) += lvm_vge.o
2633
+obj-$(CONFIG_EVMS_AIX_PLUGIN) += AIXlvm_vge.o
2634
+obj-$(CONFIG_EVMS_OS2_PLUGIN) += os2lvm_vge.o
2635
+obj-$(CONFIG_EVMS_DRIVELINK_PLUGIN) += evms_drivelink.o
2636
+obj-$(CONFIG_EVMS_BBR_PLUGIN) += evms_bbr.o
2637
+obj-$(CONFIG_EVMS_SNAPSHOT_PLUGIN) += snapshot.o
2638
+obj-$(CONFIG_EVMS_ECR_PLUGIN) += evms_ecr.o
2639
+obj-$(CONFIG_EVMS_S390_PART_PLUGIN) += s390_part.o
2640
+obj-$(CONFIG_EVMS) += evms_passthru.o evms.o
2642
+EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT
2643
+ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y)
2644
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL
2646
+ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y)
2647
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS
2649
+ifeq ($(CONFIG_EVMS_INFO_ERROR),y)
2650
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR
2652
+ifeq ($(CONFIG_EVMS_INFO_WARNING),y)
2653
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING
2655
+ifeq ($(CONFIG_EVMS_INFO_DETAILS),y)
2656
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS
2658
+ifeq ($(CONFIG_EVMS_INFO_DEBUG),y)
2659
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG
2661
+ifeq ($(CONFIG_EVMS_INFO_EXTRA),y)
2662
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA
2664
+ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y)
2665
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT
2667
+ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y)
2668
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING
2671
+include $(TOPDIR)/Rules.make
2673
diff -Naur linux-2002-03-28/drivers/evms/dos_part.c evms-2002-03-28/drivers/evms/dos_part.c
2674
--- linux-2002-03-28/drivers/evms/dos_part.c Wed Dec 31 18:00:00 1969
2675
+++ evms-2002-03-28/drivers/evms/dos_part.c Wed Mar 27 21:24:20 2002
2677
+/* -*- linux-c -*- */
2681
+ * Copyright (c) International Business Machines Corp., 2000
2683
+ * This program is free software; you can redistribute it and/or modify
2684
+ * it under the terms of the GNU General Public License as published by
2685
+ * the Free Software Foundation; either version 2 of the License, or
2686
+ * (at your option) any later version.
2688
+ * This program is distributed in the hope that it will be useful,
2689
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2690
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
2691
+ * the GNU General Public License for more details.
2693
+ * You should have received a copy of the GNU General Public License
2694
+ * along with this program; if not, write to the Free Software
2695
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2700
+ * linux/drivers/evms/dos_part.c
2702
+ * EVMS DOS partition manager
2704
+ * Partial code extracted from
2706
+ * linux/fs/partitions/msdos.c
2710
+#include <linux/config.h>
2711
+#include <linux/module.h>
2712
+#include <linux/kernel.h>
2713
+#include <linux/config.h>
2714
+#include <linux/fs.h>
2715
+#include <linux/genhd.h>
2716
+#include <linux/major.h>
2717
+#include <linux/string.h>
2718
+#include <linux/blk.h>
2719
+#include <linux/init.h>
2720
+#include <linux/iobuf.h> /* for kiobuf stuffs */
2722
+#ifdef CONFIG_BLK_DEV_IDE
2723
+#include <linux/ide.h> /* IDE xlate */
2724
+#endif /* CONFIG_BLK_DEV_IDE */
2726
+#include <linux/evms/evms_kernel.h>
2727
+#include <linux/evms/evms_os2.h>
2729
+#include <asm/system.h>
2730
+#include <asm/uaccess.h>
2732
+/* prefix used in logging messages */
2733
+#define LOG_PREFIX "dos_part: "
2735
+/* #include "msdos.h" */
2736
+#define MSDOS_LABEL_MAGIC 0xAA55
2738
+/* Skeletal MBR/EBR structure useful for our purposes */
2739
+typedef struct mbr_ebr_s {
2740
+ u_int8_t unused1[0x1be];
2741
+ struct partition partitions[4];
2742
+ u_int16_t signature;
2745
+/* Private instance data structure for node we produced */
2746
+typedef struct local_instance_data_s {
2747
+ evms_logical_node_t * source_disk;
2748
+ evms_sector_t start_sect; /* starting LBA */
2749
+ evms_sector_t nr_sects; /* number of sectors */
2750
+ unsigned char type; /* partition type or filesystem format indicator, can be set to 0 */
2751
+} local_instance_data_t;
2753
+/* Structure used to track progress traversing an EBR chain */
2754
+typedef struct extended_part_s {
2755
+ int partition_number;
2756
+ struct partition *extended;
2757
+ u_int64_t start_sect;
2758
+ u_int64_t next_ebr_start;
2762
+/* Global variables */
2763
+static int cur_comp_part_num; /* used to track non-primary
2764
+ * partition numbers
2766
+static int exported_nodes; /* total # of exported segments
2767
+ * produced during this discovery.
2770
+/* External references */
2771
+#if CONFIG_BLK_DEV_MD && CONFIG_AUTODETECT_RAID
2772
+extern void md_autodetect_dev(kdev_t dev);
2776
+static int mbr_ebr_partition_discover(evms_logical_node_t **);
2777
+static int mbr_ebr_partition_delete(evms_logical_node_t *);
2778
+static void mbr_ebr_partition_read(evms_logical_node_t *,
2780
+static void mbr_ebr_partition_write(evms_logical_node_t *,
2782
+static int mbr_ebr_partition_ioctl(evms_logical_node_t *,
2787
+static int mbr_ebr_partition_init_io(evms_logical_node_t *,
2793
+static evms_plugin_function_table_t function_table = {
2794
+ discover: &mbr_ebr_partition_discover,
2795
+ delete : &mbr_ebr_partition_delete,
2796
+ read : &mbr_ebr_partition_read,
2797
+ write : &mbr_ebr_partition_write,
2798
+ init_io : &mbr_ebr_partition_init_io,
2799
+ ioctl : &mbr_ebr_partition_ioctl
2802
+#define EVMS_MSDOS_PARTITION_MANAGER_ID 1
2804
+static evms_plugin_header_t plugin_header = {
2807
+ EVMS_SEGMENT_MANAGER,
2808
+ EVMS_MSDOS_PARTITION_MANAGER_ID),
2814
+ required_common_services_version : {
2819
+ function_table : &function_table
2823
+ * Many architectures don't like unaligned accesses, which is
2824
+ * frequently the case with the nr_sects and start_sect partition
2827
+#include <asm/unaligned.h>
2829
+#define SYS_IND(p) (get_unaligned(&p->sys_ind))
2830
+#define NR_SECTS(p) (u_int64_t)({ __typeof__(p->nr_sects) __a = \
2831
+ get_unaligned(&p->nr_sects); \
2832
+ le32_to_cpu(__a); \
2835
+#define START_SECT(p) (u_int64_t)({ __typeof__(p->start_sect) __a = \
2836
+ get_unaligned(&p->start_sect); \
2837
+ le32_to_cpu(__a); \
2841
+/***************************************************/
2842
+/* List Support - Typedefs, Variables, & Functions */
2843
+/***************************************************/
2847
+typedef struct local_segment_list_node_s {
2848
+ evms_logical_node_t *segment;
2849
+ struct local_segment_list_node_s *next;
2850
+} local_segment_list_node_t;
2852
+typedef struct local_disk_list_node_s {
2853
+ evms_logical_node_t *disk;
2854
+ local_segment_list_node_t *segment_list;
2855
+ struct local_disk_list_node_s *next;
2856
+} local_disk_list_node_t;
2860
+static local_disk_list_node_t *my_disk_list;
2864
+static local_disk_list_node_t **
2866
+ evms_logical_node_t *disk)
2868
+ local_disk_list_node_t **ldln;
2870
+ ldln = &my_disk_list;
2872
+ if ((*ldln)->disk == disk)
2874
+ ldln = &(*ldln)->next;
2879
+static local_segment_list_node_t **
2881
+ local_disk_list_node_t *disk,
2882
+ evms_logical_node_t *segment)
2884
+ local_segment_list_node_t **lsln;
2886
+ lsln = &disk->segment_list;
2888
+ if ((*lsln)->segment == segment)
2890
+ lsln = &(*lsln)->next;
2895
+static evms_logical_node_t *
2896
+find_segment_on_disk(
2897
+ evms_logical_node_t *disk,
2898
+ u_int64_t start_sect,
2899
+ u_int64_t nr_sects)
2901
+ evms_logical_node_t *rc = NULL;
2902
+ local_disk_list_node_t **ldln;
2903
+ local_segment_list_node_t **lsln;
2904
+ local_instance_data_t *lid;
2906
+ ldln = lookup_disk(disk);
2908
+ /* disk found in list */
2909
+ /* attempt to find segment */
2911
+ lsln = &(*ldln)->segment_list;
2913
+ lid = (*lsln)->segment->instance_data;
2914
+ if (lid->start_sect == start_sect)
2915
+ if (lid->nr_sects == nr_sects)
2917
+ lsln = &(*lsln)->next;
2920
+ rc = (*lsln)->segment;
2925
+/* function description: add_segment_to_disk
2927
+ * this function attempts to add a segment to the segment
2928
+ * list of a disk. if the specified disk is not found, it
2929
+ * will be added to the global disk list. this function will
2930
+ * return a pointer to the matching segment in the disk's
2931
+ * segment list. the caller must compare the returned pointer
2932
+ * to the specified segment to see if the
2933
+ * specified segment was already present in the disk's segment
2934
+ * list. if the return pointer matches the specified segment,
2935
+ * then the specified segment was added to the list. if the
2936
+ * return segment pointer to does not match the specified
2937
+ * segment pointer, then the specified segment pointer was
2938
+ * a duplicate and can be thrown away.
2941
+add_segment_to_disk(
2942
+ evms_logical_node_t *disk,
2943
+ evms_logical_node_t *segment)
2946
+ local_disk_list_node_t **ldln, *new_disk;
2947
+ local_segment_list_node_t **lsln, *new_segment;
2949
+ ldln = lookup_disk(disk);
2950
+ if (*ldln == NULL) {
2951
+ /* disk not in list, add disk */
2952
+ rc = evms_cs_allocate_memory((void **)&new_disk,
2953
+ sizeof(*new_disk));
2955
+ new_disk->disk = disk;
2960
+ /* attempt to add segment */
2961
+ lsln = lookup_segment(*ldln, segment);
2962
+ if (*lsln == NULL) {
2963
+ /* segment not in list, add segment */
2964
+ rc = evms_cs_allocate_memory((void **)&new_segment,
2965
+ sizeof(*new_segment));
2967
+ new_segment->segment = segment;
2968
+ *lsln = new_segment;
2977
+remove_segment_from_disk(
2978
+ evms_logical_node_t *disk,
2979
+ evms_logical_node_t *segment,
2980
+ evms_logical_node_t **empty_disk)
2983
+ local_disk_list_node_t **ldln, *tmp_disk_node;
2984
+ local_segment_list_node_t **lsln, *tmp_segment_node;
2986
+ *empty_disk = NULL;
2987
+ ldln = lookup_disk(disk);
2988
+ if (*ldln == NULL) {
2991
+ /* disk found in list */
2992
+ /* attempt to add segment */
2993
+ lsln = lookup_segment(*ldln, segment);
2994
+ if (*lsln == NULL) {
2997
+ tmp_segment_node = *lsln;
2998
+ /* remove segment from list */
2999
+ *lsln = (*lsln)->next;
3000
+ /* free the segment list node */
3001
+ evms_cs_deallocate_memory(tmp_segment_node);
3003
+ if ((*ldln)->segment_list == NULL) {
3004
+ tmp_disk_node = *ldln;
3005
+ *empty_disk = tmp_disk_node->disk;
3006
+ /* remove disk from list */
3007
+ *ldln = (*ldln)->next;
3008
+ /* free the disk list node */
3009
+ evms_cs_deallocate_memory(tmp_disk_node);
3017
+is_extended_partition(struct partition *p)
3019
+ return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
3020
+ SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
3021
+ SYS_IND(p) == LINUX_EXTENDED_PARTITION);
3025
+part_start(struct partition *part, u64 ext_start, u64 ebr_start)
3027
+ u64 pstart = START_SECT(part);
3028
+ pstart += (is_extended_partition(part)) ? ext_start : ebr_start;
3034
+ evms_logical_node_t *node,
3035
+ mbr_ebr_t *mbr_ebr,
3036
+ u_int64_t ext_start,
3037
+ u_int64_t ebr_start)
3039
+ int valid_mbr_ebr, i, j, mbr_flag;
3040
+ struct partition *pi, *pj;
3041
+ u_int64_t pi_start, pi_end, pj_start, pj_end;
3043
+ /* assume an MBR */
3046
+ /* assume its valid */
3047
+ valid_mbr_ebr = TRUE;
3049
+ /* check for valid signature */
3050
+ if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) {
3051
+ LOG_DEBUG("%s: invalid signature on '%s'!\n",
3052
+ __FUNCTION__, node->name);
3053
+ valid_mbr_ebr = FALSE;
3056
+ /* check for an AIX IPL signature */
3057
+ #define IPLRECID 0xc9c2d4c1 /* Value is EBCIDIC 'IBMA' */
3058
+ if ( *(unsigned int *)mbr_ebr == IPLRECID ) {
3059
+ LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n",
3060
+ __FUNCTION__, node->name);
3061
+ valid_mbr_ebr = FALSE;
3065
+ /* check for boot sector fields */
3067
+#if 0 //Remove checking of the first byte
3069
+ /* attempt to make some initial assumptions about
3070
+ * what type of data structure this could be. we
3071
+ * start by checking the 1st byte. we can tell a
3072
+ * few things based on what is or isn't there.
3074
+ if (valid_mbr_ebr == TRUE)
3075
+ switch(*(u_char *)mbr_ebr) {
3076
+ /* check for JMP as 1st instruction
3077
+ * if found, assume (for now), that
3078
+ * this is a boot sector.
3080
+ /* Removed the JMP opcode check because it's not enough to determine
3081
+ * that this sector does not have a valid MBR.
3082
+ * Note: To avoid going thru validation process of partition table,
3083
+ * it's necessary to have a better boot sector check
3084
+ * (eg. JMP opcode && other conditions) */
3087
+ LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__);
3088
+ valid_mbr_ebr = FALSE;
3090
+ /* let this fall thru to pick up the
3091
+ * mbr_flag == FALSE.
3095
+ /* the MBR should contain boot strap
3096
+ * code, so we don't expect the 1st
3097
+ * byte to be a 0x0. If the 1st byte
3098
+ * IS 0x0, its assumed (for now) to
3105
+#endif //Remove checking of the first byte
3107
+ if (valid_mbr_ebr == TRUE) {
3108
+ /* dump the partition table entries in debug mode */
3109
+ LOG_DEBUG("%s: disk relative starts: ext_part(%Ld), ebr(%Ld).\n",
3110
+ __FUNCTION__, ext_start, ebr_start);
3111
+ for (i = 0; i < 4; i++) {
3112
+ pi = &mbr_ebr->partitions[i];
3113
+ LOG_DEBUG("%s: Partition: index(%d), start(%Ld), size(%Ld), sys(0x%x).\n",
3114
+ __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi), SYS_IND(pi));
3116
+ /* check for mbr/ebr partition table validity */
3117
+ for (i = 0; i < 4; i++) {
3118
+ pi = &mbr_ebr->partitions[i];
3119
+ if (NR_SECTS(pi)) {
3120
+ /* check for partition extending past end of node */
3121
+ pi_start = part_start(pi, ext_start, ebr_start);
3122
+ pi_end = pi_start + NR_SECTS(pi) - 1;
3123
+ if ( pi_end >= node->total_vsectors) {
3124
+ LOG_DEBUG("%s: partition(%d) ends(%Ld) beyond the end of the disk(%s,%Ld)!\n",
3125
+ __FUNCTION__, i, pi_end,
3126
+ node->name, node->total_vsectors);
3127
+ valid_mbr_ebr = FALSE;
3129
+ if (valid_mbr_ebr == FALSE) break;
3131
+ /* check for partition overlap */
3132
+ for (j = i + 1; j < 4; j++) {
3133
+ pj = &mbr_ebr->partitions[j];
3134
+ if (NR_SECTS(pj)) {
3135
+ pj_start = part_start(pj, ext_start, ebr_start);
3136
+ pj_end = pj_start + NR_SECTS(pj) - 1;
3137
+ if (pi_start == pj_start) {
3138
+ valid_mbr_ebr = FALSE;
3139
+ } else if (pi_start < pj_start) {
3140
+ if (pi_end >= pj_start)
3141
+ valid_mbr_ebr = FALSE;
3142
+ } else if (pi_start <= pj_end)
3143
+ valid_mbr_ebr = FALSE;
3145
+ if (valid_mbr_ebr == FALSE) {
3146
+ LOG_DEBUG("%s: overlapping partitions(%d,%d) detected on '%s'!\n",
3147
+ __FUNCTION__,i,j, node->name);
3152
+ if (valid_mbr_ebr == FALSE) break;
3156
+ if (valid_mbr_ebr == TRUE) {
3157
+ LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__,
3158
+ (mbr_flag == TRUE) ? 'M' : 'E', node->name);
3160
+ LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n",
3161
+ __FUNCTION__, node->name);
3163
+ return(valid_mbr_ebr);
3167
+ * Function: add_segment
3170
+mbr_ebr_process_segment(
3171
+ evms_logical_node_t **discover_list,
3172
+ evms_logical_node_t *node,
3173
+ u_int64_t start_sect,
3174
+ u_int64_t nr_sects,
3175
+ unsigned char type,
3177
+ char *partition_name)
3179
+ local_instance_data_t *InstData = NULL;
3180
+ evms_logical_node_t *segment;
3183
+ segment = find_segment_on_disk(node, start_sect, nr_sects);
3185
+ LOG_DETAILS("exporting segment '%s'.\n",
3188
+ rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
3190
+ InstData->source_disk = node;
3191
+ InstData->start_sect = start_sect;
3192
+ InstData->nr_sects = nr_sects;
3193
+ InstData->type = type;
3194
+ rc = evms_cs_allocate_logical_node(&segment);
3197
+ segment->plugin = &plugin_header;
3198
+ segment->system_id = (unsigned int)type;
3199
+ segment->total_vsectors = nr_sects;
3200
+ segment->block_size = node->block_size;
3201
+ segment->hardsector_size = node->hardsector_size;
3202
+ segment->instance_data = InstData;
3203
+ segment->flags = node->flags;
3204
+ if (partition_name)
3205
+ strcpy(segment->name, partition_name);
3207
+ strcpy(segment->name, node->name);
3208
+ sprintf(segment->name + strlen(segment->name), "%d", part_num);
3210
+ LOG_DETAILS("creating segment '%s'.\n",
3212
+ rc = add_segment_to_disk(node, segment);
3214
+ LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
3215
+ __FUNCTION__, rc, segment->name);
3218
+ MOD_INC_USE_COUNT;
3223
+ evms_cs_deallocate_memory(InstData);
3225
+ evms_cs_deallocate_logical_node(segment);
3229
+ evms_cs_add_logical_node_to_list(discover_list, segment);
3236
+print_partition_info( char *leading_comment, struct partition *p )
3238
+ LOG_EXTRA("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA(%Lu), sizeLBA(%Lu)\n",
3239
+ leading_comment,p->boot_ind,p->sys_ind,p->cyl,p->head,p->sector,
3240
+ p->end_cyl,p->end_head,p->end_sector,START_SECT(p),NR_SECTS(p));
3243
+#ifdef CONFIG_BSD_DISKLABEL
3244
+#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1
3246
+print_bsd_partition_info( char *leading_comment, struct bsd_partition *p )
3248
+ LOG_EXTRA("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n",
3249
+ leading_comment,p->p_size, p->p_offset, p->p_fsize, p->p_fstype, p->p_frag, p->p_cpg);
3253
+ * bsd_disklabel_partition
3256
+ * - 0 for 0 partition
3257
+ * - (positive) number for number of BSD partitions found
3258
+ * - (negative) error code
3261
+bsd_disklabel_partition(
3262
+ evms_logical_node_t **discover_list,
3263
+ evms_logical_node_t *node,
3264
+ struct partition *bsd)
3266
+ struct bsd_disklabel *l;
3267
+ struct bsd_partition *p;
3268
+ int max_partitions;
3273
+ rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3275
+ rc = INIT_IO(node,
3277
+ START_SECT(bsd) + BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET,
3282
+ l = (struct bsd_disklabel *) data;
3283
+ if (l->d_magic == BSD_DISKMAGIC) {
3285
+ max_partitions = ((SYS_IND(bsd) == OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS
3286
+ : BSD_MAXPARTITIONS);
3287
+ if (l->d_npartitions < max_partitions)
3288
+ max_partitions = l->d_npartitions;
3289
+ for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
3290
+ if (p->p_fstype != BSD_FS_UNUSED) {
3291
+ evmsTRACE2(EVMS_INFO_EXTRA,
3292
+ (print_bsd_partition_info(__FUNCTION__, p)));
3293
+ rc = mbr_ebr_process_segment(
3296
+ (u_int64_t)p->p_offset,
3297
+ (u_int64_t)p->p_size,
3299
+ cur_comp_part_num++,
3309
+ evms_cs_deallocate_memory(data);
3312
+ LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
3317
+#ifdef CONFIG_UNIXWARE_DISKLABEL
3318
+#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29
3321
+ * unixware_partition
3324
+ * - 0 for 0 partition
3325
+ * - (positive) number for number of UNIXWARE partitions found
3326
+ * - (negative) error code
3329
+unixware_partition(
3330
+ evms_logical_node_t **discover_list,
3331
+ evms_logical_node_t *node,
3332
+ struct partition *unixware_part)
3334
+ struct unixware_disklabel *l;
3335
+ struct unixware_slice *p;
3336
+ char *data = NULL;
3340
+ rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3342
+ rc = INIT_IO(node,
3344
+ START_SECT(unixware_part) + UNIXWARE_PART_TABLE_SECTOR_OFFSET,
3348
+ l = (struct unixware_disklabel *)data;
3349
+ if ( le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC &&
3350
+ le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) {
3351
+ p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */
3352
+ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
3353
+ if (p->s_label != UNIXWARE_FS_UNUSED) {
3354
+ rc = mbr_ebr_process_segment(
3359
+ UNIXWARE_PARTITION,
3360
+ cur_comp_part_num++,
3371
+ evms_cs_deallocate_memory(data);
3374
+ LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
3379
+#ifdef CONFIG_SOLARIS_X86_PARTITION
3380
+#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1
3382
+ * solaris_x86_partition
3385
+ * - 0 for 0 partition
3386
+ * - (positive) number for number of solaris partitions found
3387
+ * - (negative) error code
3390
+solaris_x86_partition(
3391
+ evms_logical_node_t **discover_list,
3392
+ evms_logical_node_t *node,
3393
+ struct partition *solaris_x86,
3394
+ int probe_only) /* if TRUE, do not add segments */
3396
+ long offset = START_SECT(solaris_x86);
3397
+ struct solaris_x86_vtoc *v;
3398
+ struct solaris_x86_slice *s;
3400
+ char *data = NULL;
3404
+ rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3406
+ rc = INIT_IO(node,
3408
+ START_SECT(solaris_x86) + SOLARIS_X86_PART_TABLE_SECTOR_OFFSET,
3413
+ v = (struct solaris_x86_vtoc *)data;
3415
+ if (v->v_sanity == SOLARIS_X86_VTOC_SANE) {
3416
+ if (v->v_version != 1) {
3417
+ LOG_WARNING("%s: cannot handle version %d vtoc>\n", __FUNCTION__, v->v_version);
3419
+ for (i=0; i<v->v_nparts; i++) {
3420
+ s = &v->v_slice[i];
3421
+ LOG_EXTRA("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n",
3422
+ i,s->s_tag, s->s_flag, s->s_start, s->s_size, s->s_start + s->s_size -1);
3424
+ if ((s->s_size == 0) || (s->s_tag == 0x05))
3426
+ if (!probe_only) {
3427
+ rc = mbr_ebr_process_segment(
3430
+ (u_int64_t)(s->s_start+offset),
3431
+ (u_int64_t)s->s_size,
3432
+ SOLARIS_X86_PARTITION,
3433
+ cur_comp_part_num++,
3444
+ evms_cs_deallocate_memory(data);
3447
+ LOG_DETAILS("%s: %s (%d) partitions\n",
3448
+ __FUNCTION__, probe_only ? " " : "exported", rc);
3454
+ * os2lvm_partition() looks for DLAT at last sector of the track containing MBR/EBR
3456
+ * Returns: 1 - os2 DLAT was found
3462
+ u_int64_t MBR_EBR_sect,
3463
+ evms_logical_node_t *node,
3464
+ DLA_Table_Sector *dlat)
3466
+ struct hd_geometry geometry;
3468
+ u_int32_t crc_hold;
3470
+ rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long)&geometry);
3472
+ LOG_SERIOUS("%s: ioctl failed(%u) on '%s'\n",
3473
+ __FUNCTION__, rc, node->name);
3474
+ } else if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat)) {
3475
+ if ( (dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1)) &&
3476
+ (dlat->DLA_Signature2 == cpu_to_le32(DLA_TABLE_SIGNATURE2)) ) {
3477
+ crc_hold = le32_to_cpu( dlat->DLA_CRC );
3478
+ dlat->DLA_CRC = 0;
3479
+ if ( evms_cs_calculate_crc( EVMS_INITIAL_CRC, (void *)dlat,
3480
+ node->hardsector_size ) == crc_hold )
3488
+mbr_ebr_process_logical_drive(
3489
+ evms_logical_node_t **discover_list,
3490
+ evms_logical_node_t *node,
3491
+ extended_part_t *ext_info,
3493
+ struct partition *p,
3495
+ DLA_Table_Sector *dlat)
3498
+ char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
3500
+ LOG_EXTRA("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
3501
+ __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
3503
+ if (NR_SECTS(p)) {
3504
+ if (is_extended_partition(p)) {
3505
+ ext_info->next_ebr_start =
3506
+ (u_int64_t)(START_SECT(p) + START_SECT(ext_info->extended));
3507
+ ext_info->done = FALSE; /* not done yet */
3509
+ partition_name = NULL;
3510
+ if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
3511
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == ( ext_info->start_sect + START_SECT(p) ) &&
3512
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
3513
+ dlat->DLA_Array[i].Drive_Letter != '\0' ) {
3514
+ sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
3515
+ partition_name = tmp_buf;
3517
+ evmsTRACE2(EVMS_INFO_EXTRA,
3518
+ (print_partition_info(__FUNCTION__, p)));
3520
+ rc = mbr_ebr_process_segment(
3523
+ ext_info->start_sect + START_SECT(p),
3526
+ cur_comp_part_num++,
3534
+mbr_ebr_process_ebr(
3535
+ evms_logical_node_t **discover_list,
3536
+ evms_logical_node_t *node,
3537
+ extended_part_t *ext_info,
3540
+ int rc = 0, i, os2lvm;
3541
+ struct partition *p;
3542
+ DLA_Table_Sector *dlat = NULL;
3544
+ /* allocate space for the OS2 DLAT info */
3545
+ rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
3547
+ /* read the dlat for this mbr */
3548
+ os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat);
3550
+ /* walk thru the partition table in the mbr
3551
+ * processing each partition record.
3553
+ for (i = 0; i < 4; i++) {
3554
+ p = &ebr->partitions[i];
3555
+ rc = mbr_ebr_process_logical_drive(
3566
+ /* free the space used for OS2 DLAT info */
3568
+ evms_cs_deallocate_memory(dlat);
3574
+mbr_ebr_probe_for_ebr(
3575
+ evms_logical_node_t **discover_list,
3576
+ evms_logical_node_t *node,
3577
+ extended_part_t *ext_info)
3580
+ u_char *sector_buffer = NULL;
3581
+ mbr_ebr_t *ebr = NULL;
3583
+ /* allocate a sector size buffer */
3584
+ rc = evms_cs_allocate_memory((void **)§or_buffer,
3585
+ node->hardsector_size);
3587
+ /* read the location of the mbr sector */
3588
+ rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer);
3591
+ ebr = (mbr_ebr_t *)sector_buffer;
3592
+ if (validate_mbr_ebr(node, ebr,
3593
+ START_SECT(ext_info->extended),
3594
+ ext_info->start_sect) == TRUE)
3595
+ rc = mbr_ebr_process_ebr(
3602
+ if (sector_buffer)
3603
+ evms_cs_deallocate_memory(sector_buffer);
3609
+mbr_ebr_process_extended_partition(
3610
+ evms_logical_node_t **discover_list,
3611
+ evms_logical_node_t *node,
3612
+ struct partition *p)
3615
+ extended_part_t ext_info;
3617
+ memset(&ext_info, 0, sizeof(ext_info));
3618
+ ext_info.done = FALSE;
3619
+ ext_info.extended = p;
3620
+ ext_info.next_ebr_start = START_SECT(p);
3621
+ while (ext_info.done == FALSE) {
3622
+ ext_info.done = TRUE; /* assume done, unless we find another EBR */
3623
+ ext_info.start_sect = ext_info.next_ebr_start;
3624
+ rc = mbr_ebr_probe_for_ebr(
3633
+ * is_non_dos_extended
3635
+ * This function returns TRUE if the partition entry represents a non-DOS
3636
+ * extended partition such as UnixWare, Solaris x86 and BSD
3639
+is_non_dos_extended(
3640
+ evms_logical_node_t **discover_list,
3641
+ evms_logical_node_t *node,
3642
+ struct partition *p)
3644
+ if (NR_SECTS(p)) {
3645
+ #ifdef CONFIG_BSD_DISKLABEL
3646
+ if (SYS_IND(p) == BSD_PARTITION ||
3647
+ SYS_IND(p) == NETBSD_PARTITION ||
3648
+ SYS_IND(p) == OPENBSD_PARTITION)
3652
+ #ifdef CONFIG_UNIXWARE_DISKLABEL
3653
+ if (SYS_IND(p) == UNIXWARE_PARTITION)
3657
+ #ifdef CONFIG_SOLARIS_X86_PARTITION
3658
+ if ( (SYS_IND(p) == SOLARIS_X86_PARTITION) &&
3659
+ (solaris_x86_partition(discover_list, node, p, TRUE) > 0) )
3667
+ * mbr_ebr_process_other_primary_partition
3668
+ * This function processes other (non-DOS) primary partitions such as
3669
+ * UnixWare, Solaris x86 and BSD
3672
+mbr_ebr_process_other_primary_partition(
3673
+ evms_logical_node_t **discover_list,
3674
+ evms_logical_node_t *node,
3675
+ struct partition *p)
3677
+ if (NR_SECTS(p)) {
3678
+ #ifdef CONFIG_BSD_DISKLABEL
3679
+ if (SYS_IND(p) == BSD_PARTITION ||
3680
+ SYS_IND(p) == NETBSD_PARTITION ||
3681
+ SYS_IND(p) == OPENBSD_PARTITION)
3682
+ return bsd_disklabel_partition(discover_list, node, p);
3685
+ #ifdef CONFIG_UNIXWARE_DISKLABEL
3686
+ if (SYS_IND(p) == UNIXWARE_PARTITION)
3687
+ return unixware_partition(discover_list, node, p);
3690
+ #ifdef CONFIG_SOLARIS_X86_PARTITION
3691
+ if (SYS_IND(p) == SOLARIS_X86_PARTITION)
3692
+ return solaris_x86_partition(discover_list, node, p, FALSE);
3699
+mbr_ebr_process_dos_primary_partition(
3700
+ evms_logical_node_t **discover_list,
3701
+ evms_logical_node_t *node,
3703
+ struct partition *p,
3705
+ DLA_Table_Sector *dlat)
3708
+ char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
3710
+ LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
3711
+ __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
3713
+ if (NR_SECTS(p)) {
3715
+ if (is_extended_partition(p))
3716
+ rc = mbr_ebr_process_extended_partition(
3717
+ discover_list,node,p);
3720
+ partition_name = NULL;
3721
+ if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
3722
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == START_SECT(p) &&
3723
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
3724
+ dlat->DLA_Array[i].Drive_Letter != '\0' ) {
3725
+ sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
3726
+ partition_name = tmp_buf;
3728
+ evmsTRACE2(EVMS_INFO_EXTRA,
3729
+ (print_partition_info(__FUNCTION__, p)));
3731
+ rc = mbr_ebr_process_segment(
3745
+mbr_ebr_process_mbr(
3746
+ evms_logical_node_t **discover_list,
3747
+ evms_logical_node_t *node,
3750
+ int rc = 0, i, os2lvm;
3751
+ struct partition *p;
3752
+ DLA_Table_Sector *dlat = NULL;
3754
+ cur_comp_part_num = 5; /* set this value for each disk */
3756
+ /* allocate space for the OS2 DLAT info */
3757
+ rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
3759
+ /* read the dlat for this mbr */
3760
+ os2lvm = os2lvm_partition(0, node, dlat);
3762
+ /* Pass 1: walk thru the partition table in the mbr
3763
+ * processing each partition record.
3765
+ for (i = 0; i < 4; i++) {
3766
+ p = &mbr->partitions[i];
3767
+ if (is_non_dos_extended(discover_list, node, p)) {
3768
+ LOG_DETAILS(" Found and skip a non-dos extended partition.\n");
3772
+ mbr_ebr_process_dos_primary_partition(
3781
+ /* Pass 2: walk thru the partition table in the mbr
3782
+ * processing each partition record for non-DOS extended partitions
3784
+ for (i = 0; i < 4; i++) {
3785
+ p = &mbr->partitions[i];
3786
+ mbr_ebr_process_other_primary_partition(
3794
+ /* free the space used for OS2 DLAT info */
3796
+ evms_cs_deallocate_memory(dlat);
3802
+mbr_ebr_probe_for_mbr(
3803
+ evms_logical_node_t **discover_list,
3804
+ evms_logical_node_t *node)
3807
+ u_char *sector_buffer = NULL;
3808
+ mbr_ebr_t *mbr = NULL;
3810
+ LOG_DEBUG("%s: probing (%s).\n",
3811
+ __FUNCTION__, node->name);
3813
+ /* allocate a sector size buffer */
3814
+ rc = evms_cs_allocate_memory((void **)§or_buffer,
3815
+ node->hardsector_size);
3817
+ /* read the location of the mbr sector */
3818
+ rc = INIT_IO(node, 0, 0, 1, sector_buffer);
3820
+ LOG_ERROR("%s: read error(%d) on '%s'.\n",
3821
+ __FUNCTION__, rc, node->name);
3823
+ mbr = (mbr_ebr_t *)sector_buffer;
3824
+ if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) {
3825
+ /* since it looks like this disk has a
3826
+ * valid MBR, remove the disk node from
3827
+ * the discover list. it may already be
3828
+ * on the global list, or it will be
3829
+ * added to it. in the case of an mbr
3830
+ * with no partitions, it is simply
3831
+ * removed and forgotten. when one or
3832
+ * more partitions are created, the
3833
+ * disk will be examined and handled
3834
+ * properly during the following
3835
+ * rediscover operation.
3837
+ evms_cs_remove_logical_node_from_list(
3838
+ discover_list, node);
3840
+ rc = mbr_ebr_process_mbr(discover_list,node,mbr);
3844
+ if (sector_buffer)
3845
+ evms_cs_deallocate_memory(sector_buffer);
3851
+ * Function: mbr_ebr_partition_discover
3855
+mbr_ebr_partition_discover(evms_logical_node_t **discover_list)
3858
+ evms_logical_node_t *node, *next_node;
3860
+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
3862
+ /* initialize global variable */
3863
+ exported_nodes = 0;
3865
+ /* examine each node on the discover list */
3866
+ next_node = *discover_list;
3867
+ while(next_node) {
3869
+ next_node = node->next;
3870
+ if (node->plugin->id == plugin_header.id)
3871
+ /* don't recurse into our own objects
3874
+ mbr_ebr_probe_for_mbr(discover_list,node);
3877
+ LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
3878
+ __FUNCTION__, exported_nodes, rc);
3879
+ if (exported_nodes)
3880
+ rc = exported_nodes;
3885
+ * Function: mbr_ebr_partition_delete
3889
+mbr_ebr_partition_delete(evms_logical_node_t *segment)
3892
+ local_instance_data_t *LID;
3893
+ evms_logical_node_t *empty_disk = NULL;
3895
+ LOG_DETAILS("deleting segment '%s'.\n",segment->name);
3900
+ LID = segment->instance_data;
3902
+ /* remove the segment from the
3903
+ * disk's segment list
3905
+ rc = remove_segment_from_disk(
3909
+ /* free the local instance data */
3910
+ evms_cs_deallocate_memory(LID);
3912
+ /* free the segment node */
3913
+ evms_cs_deallocate_logical_node(segment);
3914
+ MOD_DEC_USE_COUNT;
3915
+ /* if the last segment on the disk was
3916
+ * deleted, delete the disk node too
3919
+ DELETE(empty_disk);
3925
+ * function: mbr_ebr_partition_io_error
3927
+ * this function was primarily created because the function
3928
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
3929
+ * to be set on inline functions. Since this was an error path
3930
+ * and not mainline, I decided to add a trace statement to help
3931
+ * report on the failing condition.
3935
+mbr_ebr_partition_io_error(
3936
+ evms_logical_node_t *node,
3940
+ LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
3941
+ (io_flag) ? "WRITE" : "READ",
3942
+ node->total_vsectors - 1,
3946
+ EVMS_IO_ERROR(eio);
3950
+ * Function: mbr_ebr_partition_read
3954
+mbr_ebr_partition_read(
3955
+ evms_logical_node_t *partition,
3958
+ local_instance_data_t *LID = partition->instance_data;
3960
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
3961
+ eio->rsector += LID->start_sect;
3962
+ R_IO(LID->source_disk, eio);
3964
+ mbr_ebr_partition_io_error(partition, READ, eio);
3968
+ * Function: mbr_ebr_partition_write
3972
+mbr_ebr_partition_write(
3973
+ evms_logical_node_t *partition,
3976
+ local_instance_data_t *LID = partition->instance_data;
3978
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
3979
+ eio->rsector += LID->start_sect;
3980
+ W_IO(LID->source_disk, eio);
3982
+ mbr_ebr_partition_io_error(partition, WRITE, eio);
3986
+ * Function: mbr_ebr_partition_init_io
3990
+mbr_ebr_partition_init_io(
3991
+ evms_logical_node_t *partition,
3992
+ int io_flag, /* 0=read, 1=write*/
3993
+ evms_sector_t sect_nr, /* disk LBA */
3994
+ evms_sector_t num_sects, /* # of sectors */
3995
+ void *buf_addr) /* buffer address */
3998
+ local_instance_data_t *LID = partition->instance_data;
4000
+ if ((sect_nr + num_sects) <= partition->total_vsectors) {
4001
+ rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
4003
+ LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
4004
+ (io_flag) ? "WRITE" : "READ",
4006
+ (LID->nr_sects - 1),
4007
+ sect_nr, num_sects);
4015
+ * Function: mbr_ebr_partition_ioctl
4019
+mbr_ebr_partition_ioctl (
4020
+ evms_logical_node_t *partition,
4021
+ struct inode *inode,
4022
+ struct file *file,
4024
+ unsigned long arg)
4026
+ local_instance_data_t *LID;
4027
+ struct hd_geometry hd_geo;
4031
+ LID = partition->instance_data;
4037
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
4039
+ if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
4042
+ hd_geo.start = LID->start_sect;
4043
+ if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
4047
+ case EVMS_GET_BMAP:
4049
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
4050
+ bmap->rsector += LID->start_sect;
4051
+ /* intentionally fall thru to
4052
+ * default ioctl down to device
4057
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
4063
+ * Function: dos_part_init
4067
+dos_part_init(void)
4069
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
4073
+dos_part_exit(void)
4075
+ evms_cs_unregister_plugin(&plugin_header);
4078
+module_init(dos_part_init);
4079
+module_exit(dos_part_exit);
4080
+#ifdef MODULE_LICENSE
4081
+MODULE_LICENSE("GPL");
4084
diff -Naur linux-2002-03-28/drivers/evms/evms.c evms-2002-03-28/drivers/evms/evms.c
4085
--- linux-2002-03-28/drivers/evms/evms.c Wed Dec 31 18:00:00 1969
4086
+++ evms-2002-03-28/drivers/evms/evms.c Thu Mar 28 15:43:00 2002
4088
+/* -*- linux-c -*- */
4092
+ * Copyright (c) International Business Machines Corp., 2000
4094
+ * This program is free software; you can redistribute it and/or modify
4095
+ * it under the terms of the GNU General Public License as published by
4096
+ * the Free Software Foundation; either version 2 of the License, or
4097
+ * (at your option) any later version.
4099
+ * This program is distributed in the hope that it will be useful,
4100
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4101
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
4102
+ * the GNU General Public License for more details.
4104
+ * You should have received a copy of the GNU General Public License
4105
+ * along with this program; if not, write to the Free Software
4106
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
4112
+ * linux/drivers/evms/evms.c
4114
+ * EVMS Base and Common Services
4118
+#define DEVICE_NR(device) MINOR(device) /* evms has no partition bits */
4119
+#define DEVICE_NAME "evms" /* name for messaging */
4120
+#define DEVICE_NO_RANDOM /* no entropy to contribute */
4121
+#define DEVICE_OFF(d) /* do nothing */
4123
+#include <linux/config.h>
4124
+#include <linux/module.h>
4125
+#include <linux/errno.h>
4126
+#include <linux/kernel.h>
4127
+#include <linux/init.h>
4128
+#include <linux/fs.h>
4129
+#include <linux/major.h>
4130
+#include <linux/slab.h>
4131
+#include <asm/uaccess.h>
4132
+#include <linux/blk.h> /* must be included by all block drivers */
4133
+#include <linux/blkdev.h>
4134
+#include <linux/blkpg.h>
4135
+#include <linux/iobuf.h>
4136
+#include <linux/genhd.h>
4137
+#include <linux/major.h>
4138
+#include <linux/sched.h>
4139
+#include <linux/version.h>
4140
+#include <linux/swap.h>
4141
+#include <net/checksum.h>
4142
+#include <linux/sysctl.h>
4143
+#include <linux/smp_lock.h>
4144
+#include <linux/evms/evms_kernel.h>
4146
+//#define VFS_PATCH_PRESENT
4148
+/* prefix used in logging messages */
4151
+typedef struct evms_registered_plugin_s {
4152
+ evms_plugin_header_t * plugin;
4153
+ struct evms_registered_plugin_s * next;
4154
+} evms_registered_plugin_t;
4155
+static evms_registered_plugin_t * registered_plugin_head = NULL;
4157
+static evms_list_node_t *evms_global_device_list = NULL;
4158
+static evms_list_node_t *evms_global_feature_node_list = NULL;
4159
+static evms_list_node_t *evms_global_notify_list = NULL;
4161
+int evms_info_level = EVMS_INFO_LEVEL;
4162
+struct proc_dir_entry *evms_proc_dir = NULL;
4163
+EXPORT_SYMBOL(evms_info_level);
4164
+static evms_logical_volume_t * evms_logical_volumes;
4165
+static int evms_volumes = 0;
4166
+/* a few variables to aid in detecting memory leaks.
4167
+ * these variables are always in use, regardless of
4168
+ * the state of EVMS_MEM_DEBUG.
4170
+static atomic_t evms_allocs;
4171
+static atomic_t evms_logical_nodes;
4173
+char *evms_primary_string = "primary";
4174
+EXPORT_SYMBOL(evms_primary_string);
4175
+char *evms_secondary_string = "secondary";
4176
+EXPORT_SYMBOL(evms_secondary_string);
4178
+static evms_version_t evms_svc_version = {
4179
+ major : EVMS_COMMON_SERVICES_MAJOR,
4180
+ minor : EVMS_COMMON_SERVICES_MINOR,
4181
+ patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
4184
+static int evms_discover_volumes(evms_rediscover_t *);
4186
+/* Handles for "private" EVMS object pools */
4187
+static evms_pool_mgmt_t *evms_io_notify_pool;
4189
+/* Handles for "public" EVMS object pools */
4190
+evms_pool_mgmt_t *evms_bh_pool;
4191
+EXPORT_SYMBOL(evms_bh_pool);
4193
+/* Handle for the devfs directory entry */
4194
+devfs_handle_t evms_dir_devfs_handle;
4195
+devfs_handle_t evms_blk_devfs_handle;
4198
+/**********************************************************/
4199
+/* SYSCTL - EVMS folder */
4200
+/**********************************************************/
4202
+#ifdef CONFIG_PROC_FS
4203
+static struct ctl_table_header *evms_table_header;
4204
+static int evms_info_level_min = EVMS_INFO_CRITICAL;
4205
+static int evms_info_level_max = EVMS_INFO_EVERYTHING;
4207
+static ctl_table evms_table[] = {
4208
+ {DEV_EVMS_INFO_LEVEL, "evms_info_level",
4209
+ &evms_info_level, sizeof(int), 0644, NULL,
4210
+ &proc_dointvec_minmax, &sysctl_intvec,
4211
+ NULL, &evms_info_level_min, &evms_info_level_max},
4215
+static ctl_table evms_dir_table[] = {
4216
+ {DEV_EVMS, "evms", NULL, 0, 0555, evms_table},
4220
+static ctl_table dev_dir_table[] = {
4221
+ {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
4226
+/**********************************************************/
4227
+/* START -- exported functions/Common Services */
4228
+/**********************************************************/
4231
+ * Function: evms_cs_get_version
4232
+ * Description: This function returns the current EVMS version
4235
+evms_cs_get_version(int * major, int *minor)
4237
+ *major = EVMS_MAJOR_VERSION;
4238
+ *minor = EVMS_MINOR_VERSION;
4240
+EXPORT_SYMBOL(evms_cs_get_version);
4243
+evms_cs_check_version(
4244
+ evms_version_t *required,
4245
+ evms_version_t *actual)
4249
+ if (required->major != actual->major)
4251
+ else if (required->minor > actual->minor)
4253
+ else if (required->minor == actual->minor)
4254
+ if (required->patchlevel > actual->patchlevel)
4258
+EXPORT_SYMBOL(evms_cs_check_version);
4260
+#ifdef EVMS_MEM_DEBUG
4261
+#define EVMS_MEM_SSIGNATURE 0x4D444D63 //SMEM
4262
+typedef struct memobj_head_s {
4263
+ unsigned int ssignature;
4264
+ struct memobj_head_s *next;
4266
+ struct memobj_tail_s *tail;
4268
+#define EVMS_MEM_ESIGNATURE 0x4D444D44 //EMEM
4269
+typedef struct memobj_tail_s {
4270
+ unsigned int esignature;
4271
+ memobj_head_t *head;
4274
+static memobj_head_t *memobj_head = NULL;
4275
+static spinlock_t mem_debug_lock = SPIN_LOCK_UNLOCKED;
4278
+ * function description: evms_cs_verify_memory_integrity
4280
+ * the count of memory objects in the list
4281
+ * the starting signature (SSIGNATURE) hasn't been overwritten
4282
+ * the ending signature (ESIGNATURE) hasn't been overwritten
4284
+ * op_flag: controls the behaviour when a problem is found
4285
+ * 0 = stop immediately where a problem is found
4286
+ * !0 = don't stop, but report problem(s) exist, via return code
4289
+evms_cs_verify_memory_integrity(int op_flag)
4291
+ int rc = 0, objcount;
4292
+ memobj_head_t *mobj, **ppmobj;
4293
+ memobj_tail_t *mobjtail;
4295
+ /* verify each object in the linked list */
4297
+ spin_lock(&mem_debug_lock);
4298
+ ppmobj = &memobj_head;
4302
+ /* verify starting signature */
4303
+ if (mobj->ssignature != EVMS_MEM_SSIGNATURE) {
4309
+ /* verify ending signature */
4310
+ mobjtail = mobj->tail;
4311
+ if (mobjtail->esignature != EVMS_MEM_ESIGNATURE) {
4317
+ ppmobj = &(*ppmobj)->next;
4319
+ spin_unlock(&mem_debug_lock);
4320
+ /* verify object count */
4321
+ if (objcount != evms_allocs) {
4329
+EXPORT_SYMBOL(evms_cs_verify_memory_integrity);
4333
+ * function: evms_cs_allocate_memory
4335
+ * This function is a wrapper function for the kernel malloc
4336
+ * (kmalloc) function. It provides a consistent method of
4337
+ * allocating kernel memory for all evms code.
4340
+ * This function takes as arguments:
4342
+ * **pp: the address of the pointer which is to contain the
4343
+ * the address of the allocated memory object.
4344
+ * size: the size in bytes of the memory object to be
4348
+ * This function returns:
4350
+ * *pp = NULL, and return set to -ENOMEM when there is
4351
+ * insufficient memory to satisfy the request.
4355
+ * *pp = NULL, and return set to 0 when the specified
4356
+ * size is invalid.
4360
+ * *pp is set to the address of the allocated memory object
4361
+ * and return code is set to 0.
4364
+ * NOTE: Defining EVMS_MEM_DEBUG turns on memory integrity
4365
+ * checking. This wraps each memory object with a
4366
+ * header and trailer. The header and trailer contain
4367
+ * signatures and sizes that are used to verify that
4368
+ * existing memory objects have not been overwritten.
4369
+ * Refer to the evms_cs_verify_memory_integrity
4370
+ * function for more details.
4373
+evms_cs_allocate_memory(void **pp, int size)
4377
+#ifdef EVMS_MEM_DEBUG
4378
+ memobj_head_t *mobj, **ppmobj;
4379
+ memobj_tail_t *mobjtail;
4381
+ /* verify a valid size parameter was specified */
4383
+ /* return NULL on invalid size */
4386
+#ifdef EVMS_MEM_DEBUG
4387
+ size += sizeof(memobj_head_t) + sizeof(memobj_tail_t);
4389
+// *pp = kmalloc(size, GFP_KERNEL);
4390
+ *pp = kmalloc(size, GFP_NOIO);
4394
+#ifdef EVMS_MEM_DEBUG
4395
+ /* adjust variables to caller values */
4396
+ mobj = (memobj_head_t *)*pp;
4397
+ *pp += sizeof(memobj_head_t);
4398
+ size -= sizeof(memobj_head_t) + sizeof(memobj_tail_t);
4400
+ /* setup memobj head */
4401
+ mobj->ssignature = EVMS_MEM_SSIGNATURE;
4402
+ mobj->size = size;
4404
+ /* setup memobj tail */
4405
+ mobjtail = (memobj_tail_t *)(*pp + size);
4406
+ mobjtail->esignature = EVMS_MEM_ESIGNATURE;
4407
+ mobj->tail = mobjtail;
4408
+ mobjtail->head = mobj;
4410
+ /* add mobj to linked list */
4412
+ spin_lock(&mem_debug_lock);
4413
+ ppmobj = &memobj_head;
4414
+ while(*ppmobj > mobj)
4415
+ ppmobj = &(*ppmobj)->next;
4416
+ mobj->next = *ppmobj;
4418
+ spin_unlock(&mem_debug_lock);
4420
+ memset(*pp, 0, size);
4421
+ atomic_inc(&evms_allocs);
4425
+#ifdef EVMS_MEM_DEBUG
4426
+ evms_cs_verify_memory_integrity(0);
4430
+EXPORT_SYMBOL(evms_cs_allocate_memory);
4433
+evms_cs_deallocate_memory(void *p)
4435
+#ifdef EVMS_MEM_DEBUG
4436
+ memobj_head_t *mobj, **ppmobj;
4438
+ evms_cs_verify_memory_integrity(0);
4440
+ /* init ptr to memobj structure */
4441
+ mobj = (memobj_head_t *)(p - sizeof(memobj_head_t));
4443
+ /* find mobj in linked list */
4444
+ spin_lock(&mem_debug_lock);
4445
+ ppmobj = &memobj_head;
4446
+ while(*ppmobj != mobj)
4447
+ ppmobj = &(*ppmobj)->next;
4448
+ *ppmobj = mobj->next;
4449
+ spin_unlock(&mem_debug_lock);
4452
+ atomic_dec(&evms_allocs);
4455
+EXPORT_SYMBOL(evms_cs_deallocate_memory);
4458
+evms_cs_allocate_logical_node(evms_logical_node_t **pp)
4462
+ rc = evms_cs_allocate_memory((void **)pp, sizeof(evms_logical_node_t));
4464
+ atomic_inc(&evms_logical_nodes);
4467
+EXPORT_SYMBOL(evms_cs_allocate_logical_node);
4470
+evms_cs_deallocate_volume_info(evms_logical_node_t *p)
4472
+ if (p->iflags & EVMS_FEATURE_BOTTOM) {
4473
+ evms_cs_remove_item_from_list(
4474
+ &evms_global_feature_node_list, p);
4475
+ evms_cs_deallocate_memory(p->volume_info);
4476
+ p->volume_info = NULL;
4477
+ p->iflags &= ~EVMS_FEATURE_BOTTOM;
4480
+EXPORT_SYMBOL(evms_cs_deallocate_volume_info);
4483
+evms_cs_deallocate_logical_node(evms_logical_node_t *p)
4486
+ LOG_SERIOUS("Deallocating object whose NEXT ptr is not null!!\n");
4488
+ evms_cs_deallocate_volume_info(p);
4489
+ if (p->feature_header) {
4490
+ evms_cs_deallocate_memory(p->feature_header);
4491
+ p->feature_header = NULL;
4493
+ evms_cs_deallocate_memory(p);
4494
+ atomic_dec(&evms_logical_nodes);
4497
+EXPORT_SYMBOL(evms_cs_deallocate_logical_node);
4500
+ * Function: evms_cs_register_plugin
4501
+ * Description: This function is exported so that all plugins can register with EVMS
4504
+evms_cs_register_plugin(evms_plugin_header_t * plugin)
4507
+ evms_registered_plugin_t *reg_record, **pp;
4508
+ evms_version_t *ver;
4510
+ ver = &plugin->required_common_services_version;
4512
+ LOG_EXTRA("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
4513
+ GetPluginOEM(plugin->id),
4514
+ GetPluginType(plugin->id),
4515
+ GetPluginID(plugin->id),
4516
+ plugin->version.major,
4517
+ plugin->version.minor,
4518
+ plugin->version.patchlevel,
4523
+ /* check common services requirements */
4524
+ rc = evms_cs_check_version(ver, &evms_svc_version);
4526
+ LOG_SERIOUS("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n",
4527
+ EVMS_COMMON_SERVICES_MAJOR,
4528
+ EVMS_COMMON_SERVICES_MINOR,
4529
+ EVMS_COMMON_SERVICES_PATCHLEVEL);
4532
+ /* ensure a plugin with this feature id is
4533
+ * not already loaded.
4535
+ for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next) {
4536
+ if ((*pp)->plugin->id == plugin->id) {
4538
+ LOG_ERROR("error(%d) attempting to load another plugin with id(%x).\n",
4544
+ /* ensure the plugin has provided functions for
4545
+ * the mandatory entry points.
4547
+ if (!plugin->function_table->discover) {
4549
+ } else if (!plugin->function_table->init_io) {
4551
+ } else if (!plugin->function_table->ioctl) {
4553
+ } else if (!plugin->function_table->read) {
4555
+ } else if (!plugin->function_table->write) {
4557
+ } else if (!plugin->function_table->delete) {
4562
+ /* allocate a new plugin registration record */
4563
+ rc = evms_cs_allocate_memory((void **)®_record,
4564
+ sizeof(evms_registered_plugin_t));
4567
+ /* store ptr to plugin header in new registration record */
4568
+ reg_record->plugin = plugin;
4570
+ /* terminate the record */
4571
+ reg_record->next = NULL;
4573
+ /* find end of the plugin registration list */
4574
+ for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next);
4575
+ /* add registration record to list */
4578
+ /* increment the usage count */
4579
+ MOD_INC_USE_COUNT;
4584
+EXPORT_SYMBOL(evms_cs_register_plugin);
4587
+ * Function: evms_cs_unregister_plugin
4588
+ * Description: This function is exported so that all plugins can
4589
+ * unregister with EVMS
4592
+evms_cs_unregister_plugin(evms_plugin_header_t * plugin)
4594
+ int rc = 0, found = FALSE;
4595
+ evms_registered_plugin_t **pp;
4596
+ evms_version_t *ver;
4598
+ ver = &plugin->required_common_services_version;
4600
+ LOG_EXTRA("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
4601
+ GetPluginOEM(plugin->id),
4602
+ GetPluginType(plugin->id),
4603
+ GetPluginID(plugin->id),
4604
+ plugin->version.major,
4605
+ plugin->version.minor,
4606
+ plugin->version.patchlevel,
4610
+ /* ensure a plugin with this feature id is
4611
+ * currently loaded.
4613
+ for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next) {
4614
+ if ((*pp)->plugin->id == plugin->id) {
4621
+ LOG_ERROR("error(%d) attempt to unload a non-loaded plugin with id(%x).\n",
4624
+ /* actually unload the plugin now */
4626
+ evms_registered_plugin_t * tmp = *pp;
4628
+ /* remove the plugin record from our
4629
+ * internal plugin list
4631
+ *pp = (*pp)->next;
4632
+ /* deallocate the plugin registration record
4634
+ evms_cs_deallocate_memory(tmp);
4636
+ /* decrement the usage count */
4637
+ MOD_DEC_USE_COUNT;
4641
+EXPORT_SYMBOL(evms_cs_unregister_plugin);
4643
+/* function: evms_cs_add_logical_node_to_list
4645
+ * This functions adds a new logical node to the end of a
4648
+ * NOTE: This function is only expected to be called at
4649
+ * discovery time, which is singled threaded by nature,
4650
+ * and therefore doesn't need to be made SMP safe.
4653
+evms_cs_add_logical_node_to_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
4656
+ evms_logical_node_t **pp = NULL;
4658
+ /* check to make sure node is not already on a list */
4662
+ /* check to make sure node being added is not already in the list */
4663
+ for (pp = list_head; *pp; pp = &(*pp)->next)
4664
+ if (*pp == node) {
4669
+ /* add node to the end of the list */
4675
+EXPORT_SYMBOL(evms_cs_add_logical_node_to_list);
4677
+/* function: evms_cs_remove_logical_node_from_list
4679
+ * This functions removes a new logical node from a node list.
4681
+ * NOTE: This function is only expected to be called at
4682
+ * discovery time, which is singled threaded by nature,
4683
+ * and therefore doesn't need to be made SMP safe.
4686
+evms_cs_remove_logical_node_from_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
4688
+ /* remove this node from the head of the list */
4689
+ int rc = 1; /* assume failure until target node is found */
4690
+ evms_logical_node_t **pp;
4691
+ for (pp = list_head; *pp; pp = &(*pp)->next)
4692
+ if (*pp == node) {
4693
+ *pp = (*pp)->next;
4694
+ node->next = NULL;
4700
+EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list);
4703
+evms_cs_kernel_ioctl(evms_logical_node_t *node, unsigned int cmd, unsigned long arg)
4706
+ struct inode tmp_inode;
4712
+ rc = IOCTL(node, &tmp_inode, NULL, cmd, arg);
4719
+EXPORT_SYMBOL(evms_cs_kernel_ioctl);
4722
+ * function: evms_cs_size_in_vsectors
4724
+ * In EVMS a V(irtual)Sector is 512 bytes in size.
4725
+ * This function computes the number of VSECTORs an specified
4726
+ * item size would require.
4728
+ * NOTE: This function has been coded to work with 64 bit values.
4731
+evms_cs_size_in_vsectors(long long item_size)
4733
+ long long sectors;
4735
+ sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT;
4736
+ if (item_size & (EVMS_VSECTOR_SIZE - 1))
4741
+EXPORT_SYMBOL(evms_cs_size_in_vsectors);
4744
+ * function: evms_cs_log2
4746
+ * this function computes the power of the 2 of specified
4747
+ * value. If the value is 0, a -1 is returned. If the value
4748
+ * is NOT a power of 2, a -2 is return. Otherwise the power
4749
+ * of 2 is returned.
4751
+int evms_cs_log2(long long value)
4759
+ while(!(tmp & 1)) {
4769
+EXPORT_SYMBOL(evms_cs_log2);
4774
+ * build_crc_table()
4778
+ * Description: The functions in this module provide a means of calculating
4779
+ * the 32 bit CRC for a block of data. build_crc_table must
4780
+ * be called to initialize this module. calculate_crc must
4781
+ * NOT be used until after build_crc_table has been called.
4782
+ * Once build_crc_table has been called, calculate_crc can
4783
+ * be used to calculate the crc of the data residing in a
4784
+ * user specified buffer.
4788
+#define CRC_POLYNOMIAL 0xEDB88320L
4790
+static u_int32_t crc_table[256];
4791
+static u_int32_t crc_table_built = FALSE;
4793
+/*********************************************************************/
4795
+/* Function Name: build_crc_table */
4797
+/* Descriptive Name: This module implements the crc function using */
4798
+/* a table driven method. The required table */
4799
+/* must be setup before the calculate_crc */
4800
+/* function can be used. This table only needs */
4801
+/* to be set up once. This function sets up the */
4802
+/* crc table needed by calculate_crc. */
4808
+/* Error Handling: N/A */
4810
+/* Side Effects: The internal crc table is initialized. */
4814
+/*********************************************************************/
4816
+build_crc_table( void )
4818
+ u_int32_t i, j, crc;
4820
+ for (i = 0; i <= 255; i++) {
4822
+ for (j = 8; j > 0; j--) {
4824
+ crc = (crc >> 1) ^ CRC_POLYNOMIAL;
4828
+ crc_table[i] = crc;
4830
+ crc_table_built = TRUE;
4833
+/*********************************************************************/
4835
+/* Function Name: calculate_crc */
4837
+/* Descriptive Name: This function calculates the crc value for */
4838
+/* the data in the buffer specified by Buffer. */
4840
+/* Input: u_int32_t crc : This is the starting crc. If you are */
4841
+/* starting a new crc calculation, then */
4842
+/* this should be set to 0xFFFFFFFF. If */
4843
+/* you are continuing a crc calculation */
4844
+/* (i.e. all of the data did not fit in */
4845
+/* the buffer so you could not calculate */
4846
+/* the crc in a single operation), then */
4847
+/* this is the crc output by the last */
4848
+/* calculate_crc call. */
4850
+/* Output: The crc for the data in the buffer, based upon the value*/
4851
+/* of the input parameter crc. */
4853
+/* Error Handling: None. */
4855
+/* Side Effects: None. */
4859
+/*********************************************************************/
4861
+evms_cs_calculate_crc(u_int32_t crc, void * buffer, u_int32_t buffersize)
4863
+ unsigned char * current_byte;
4864
+ u_int32_t temp1, temp2, i;
4866
+ current_byte = (unsigned char *) buffer;
4867
+ /* Make sure the crc table is available */
4868
+ if (crc_table_built==FALSE) build_crc_table();
4869
+ /* Process each byte in the buffer. */
4870
+ for (i = 0; i < buffersize; i++) {
4871
+ temp1 = (crc >> 8) & 0x00FFFFFF;
4872
+ temp2 = crc_table[(crc ^ (u_int32_t)*current_byte) & (u_int32_t)0xff];
4874
+ crc = temp1 ^ temp2;
4878
+EXPORT_SYMBOL(evms_cs_calculate_crc);
4880
+#define EVMS_ORIGINAL_CALLBACK_FLAG 1<<0
4881
+typedef struct io_notify_s {
4882
+ unsigned int flags;
4884
+ struct buffer_head *bh;
4885
+ u_int64_t rsector;
4887
+ void (*callback_function)(evms_logical_node_t *node,
4888
+ struct buffer_head *bh,
4889
+ int uptodate, int *redrive);
4890
+ struct io_notify_s *next;
4894
+evms_cs_create_pool(
4897
+ void (*ctor)(void*, kmem_cache_t *, unsigned long),
4898
+ void (*dtor)(void*, kmem_cache_t *, unsigned long))
4900
+ evms_pool_mgmt_t *pool;
4902
+ /* create the pool management structure */
4903
+ if (evms_cs_allocate_memory((void **)&pool, sizeof(evms_pool_mgmt_t))) {
4904
+ panic("Cannot create %s fpool mgmt structure", pool_name);
4906
+ /* initialize various field in pool mgmt structure */
4907
+ pool->member_size = objsize;
4908
+ pool->name = pool_name;
4909
+ atomic_set(&pool->waiters, 0);
4910
+ init_waitqueue_head(&pool->wait_queue);
4911
+ /* go create the pool */
4912
+ pool->cachep = kmem_cache_create(
4914
+ pool->member_size,
4916
+ SLAB_HWCACHE_ALIGN,
4919
+ panic("Cannot create %s SLAB cache", pool->name);
4922
+EXPORT_SYMBOL(evms_cs_create_pool);
4925
+evms_cs_allocate_from_pool(evms_pool_mgmt_t *pool, int blockable)
4930
+ objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO);
4931
+ if (objp || !blockable) {
4934
+ /* block and wait for an object to
4935
+ * be returned to the pool
4937
+ atomic_inc(&pool->waiters);
4938
+ wait_event(pool->wait_queue,
4939
+ (!atomic_read(&pool->waiters)));
4944
+EXPORT_SYMBOL(evms_cs_allocate_from_pool);
4947
+evms_cs_deallocate_to_pool(evms_pool_mgmt_t *pool, void *objp)
4949
+ kmem_cache_free(pool->cachep, objp);
4950
+ atomic_set(&pool->waiters,0);
4951
+ if (waitqueue_active(&pool->wait_queue)) {
4952
+ wake_up(&pool->wait_queue);
4955
+EXPORT_SYMBOL(evms_cs_deallocate_to_pool);
4958
+evms_cs_destroy_pool(evms_pool_mgmt_t *pool)
4960
+ kmem_cache_destroy(pool->cachep);
4961
+ evms_cs_deallocate_memory(pool);
4963
+EXPORT_SYMBOL(evms_cs_destroy_pool);
4966
+ * function: evms_end_io
4968
+ * This is a support function for
4969
+ * evms_cs_register_for_end_io_notification.
4970
+ * This function is called during I/O completion on any buffer
4971
+ * head that was registered by a plugin. Control is passed here
4972
+ * and this routine will, thru the use of the I/O notify entry
4973
+ * stored in the b_private field of the buffer head, restore
4974
+ * the b_rsector value the buffer head had at the time of
4975
+ * registration and pass control to the registered callback
4976
+ * address, with pointers to the buffer head and an optional
4977
+ * plugin private data. Upon completion of the callback,
4978
+ * control is returned back here. The io notify list entry
4979
+ * is deleted. This process repeats until this routine
4980
+ * detects that all registered plugins have been called back
4981
+ * and the buffer head's original end_io function has been
4982
+ * called. At this point the DONE flag is set, and we terminate
4983
+ * callback loop and exit.
4985
+ * Plugins may desire to break or interrupt the callback
4986
+ * sequence or chain. This may be useful to redrive I/O or
4987
+ * to wait for other buffer heads to complete before
4988
+ * allowing the original buffer head callback to occur.
4989
+ * To interrupt the callback "chain", a registered
4990
+ * plugin's callback must return with the DONE flag set.
4992
+ * NOTE: If a plugin set the DONE flag, and wishes to redrive
4993
+ * a buffer head, the plugin MUST reregister the buffer head
4994
+ * to receive another callback on this buffer head. Also, the
4995
+ * plugin MUST ensure that the original buffer head end_io
4996
+ * function get called at some point, either by reregistering
4997
+ * this buffer head and receiving another callback, or by
4998
+ * means of buffer head aggregation triggered by the callbacks
4999
+ * of other buffer heads.
5003
+evms_end_io(struct buffer_head *bh, int uptodate)
5005
+ io_notify_t *entry;
5010
+ /* retrieve the io_notify_entry ptr from
5011
+ * the b_private field in the buffer head.
5013
+ entry = (io_notify_t *)bh->b_private;
5015
+ /* restore the b_private value to
5016
+ * the previous b_private value (which
5017
+ * should be a previous io_notify_entry
5018
+ * or the original b_private pointer).
5020
+ bh->b_private = entry->b_private;
5022
+ /* check for original callback for this bh */
5023
+ if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) {
5024
+ /* this is the original for bh */
5026
+ /* turn off flag marking this as the original */
5027
+ entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG;
5029
+ /* decrement volume's requests_in_progress var */
5030
+ atomic_dec(&evms_logical_volumes[MINOR(bh->b_dev)].requests_in_progress);
5032
+ /* restore b_end_io to original value */
5033
+ bh->b_end_io = (void *)entry->callback_function;
5034
+ if (bh->b_end_io) {
5035
+ /* invoke original callback function
5038
+ bh->b_end_io(bh, uptodate);
5042
+ /* this is a plugin callback */
5044
+ /* restore the rsector value to the
5045
+ * value at the time of callback
5048
+ bh->b_rsector = entry->rsector;
5049
+ /* invoke plugin callback function */
5050
+ entry->callback_function(entry->private, bh, uptodate, &done);
5052
+ /* free the io notify entry */
5053
+ evms_cs_deallocate_to_pool(evms_io_notify_pool, entry);
5058
+ * function: evms_cs_register_for_end_io_notification
5060
+ * This function is an evms common service.
5061
+ * This routine allows a (plugin) function to register to
5062
+ * participate in the io completion notification process.
5063
+ * This is useful for plugins which alter data after it
5064
+ * has been read from the disk (i.e. encryption or
5067
+ * This routine also records the rsector value at the time
5068
+ * of registration, so that it can be restored to that value
5069
+ * prior to the callback to a plugin, thus allowing that
5070
+ * plugin to work with the value it had seen during the
5071
+ * initiating I/O request.
5073
+ * This routine also records a private data pointer at the
5074
+ * time of registration, and is returned to the plugin
5075
+ * at callback time. This private data pointer was designed
5076
+ * to contain context/callback/buffer_head specific data, and
5077
+ * frees the plugin from having to store and find associated
5078
+ * data at the time of the callback. This field is not used
5079
+ * by this function and is optional (NULL if unused). It is
5080
+ * recorded and returned as a convenience for the plugins.
5082
+ * DANGER!!! - WILL ROBINSON - DANGER!!!
5083
+ * This routine uses the b_private field in the
5084
+ * buffer_head structure. If any lower level driver uses this
5085
+ * field and do NOT restore it, the I/O callback will fail!!
5087
+ * Any plugins writers requiring a field for private storage
5088
+ * should instead use the private field parameter in this
5089
+ * function to store their private data.
5094
+evms_cs_register_for_end_io_notification(
5096
+ struct buffer_head *bh,
5097
+ void *callback_function)
5100
+ io_notify_t *new_entry;
5104
+ /* allocate a notify entry */
5105
+ new_entry = evms_cs_allocate_from_pool(evms_io_notify_pool, EVMS_BLOCKABLE);
5111
+ /* initialize notify entry */
5112
+ new_entry->private = private;
5113
+ new_entry->bh = bh;
5114
+ new_entry->rsector = bh->b_rsector;
5115
+ new_entry->b_private = bh->b_private;
5116
+ new_entry->flags = 0;
5118
+ /* is this the first callback for this bh? */
5119
+ if (bh->b_end_io != evms_end_io) {
5120
+ /* yes, first callback */
5121
+ new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG;
5122
+ new_entry->callback_function = (void *)bh->b_end_io;
5124
+ /* increment volume's requests_in_progress var */
5125
+ atomic_inc(&evms_logical_volumes[MINOR(bh->b_dev)].requests_in_progress);
5127
+ /* set b_end_io so we get control */
5128
+ bh->b_end_io = evms_end_io;
5130
+ /* no, not first callback */
5131
+ new_entry->callback_function = callback_function;
5134
+ /* set b_private to aid in quick lookup */
5135
+ bh->b_private = new_entry;
5139
+EXPORT_SYMBOL(evms_cs_register_for_end_io_notification);
5141
+/* function description: evms_lookup_item_in_list
5143
+ * this function searches for the specified item in the
5144
+ * specified node list. it returns the address of the
5145
+ * evms_list_node containing the specified item.
5147
+static evms_list_node_t **
5148
+evms_lookup_item_in_list(
5149
+ evms_list_node_t **node_list,
5152
+ evms_list_node_t **list_node;
5154
+ list_node = node_list;
5155
+ while(*list_node) {
5156
+ if ((*list_node)->item == item)
5158
+ list_node = &(*list_node)->next;
5160
+ return(list_node);
5163
+/* function description: evms_add_item_to_list
5165
+ * this function adds an item to the list. the
5166
+ * node for the new item is added to the end
5167
+ * of the list. the list is traversed to find the end.
5168
+ * while the traversal occurs, the list is checked
5169
+ * for the presence of the specified item. if already
5170
+ * present in the list, and error code is returned.
5172
+/* function description: evms_cs_add_item_to_list
5174
+ * this function adds an item to an item list.
5176
+ * RC == 0 is returned for:
5177
+ * a successful add of a new item
5179
+ * RC == 1 is returned when:
5180
+ * the item is already on the list
5182
+ * RC < 0 is returned for an error attempting to add the item.
5185
+evms_cs_add_item_to_list(
5186
+ evms_list_node_t **list,
5190
+ evms_list_node_t **list_node, *new_node;
5192
+ list_node = evms_lookup_item_in_list(list, item);
5193
+ if (*list_node == NULL) {
5194
+ rc = evms_cs_allocate_memory(
5195
+ (void **)&new_node,
5196
+ sizeof(evms_list_node_t));
5198
+ new_node->item = item;
5199
+ *list_node = new_node;
5203
+ LOG_DEBUG("warning: attempt to add duplicate item(%p) to list(%p).\n",
5208
+EXPORT_SYMBOL(evms_cs_add_item_to_list);
5210
+/* function description: evms_remove_item_from_list
5212
+ * this function removes a specified item from the
5213
+ * specified list. if the specified item is not
5214
+ * found in the list, and error is returned.
5217
+evms_cs_remove_item_from_list(
5218
+ evms_list_node_t **list,
5222
+ evms_list_node_t **list_node;
5224
+ /* check to see if item is in the list */
5225
+ list_node = evms_lookup_item_in_list(list, item);
5227
+ /* was the node found in the list? */
5229
+ /* yes, it was found */
5230
+ evms_list_node_t *tmp_node;
5232
+ /* save ptr to node being removed*/
5233
+ tmp_node = *list_node;
5234
+ /* remove it from the global list */
5235
+ *list_node = tmp_node->next;
5236
+ /* delete removed node */
5237
+ evms_cs_deallocate_memory(tmp_node);
5239
+ /* no, it was not found */
5241
+ LOG_ERROR("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n",
5246
+EXPORT_SYMBOL(evms_cs_remove_item_from_list);
5248
+/* function description: evms_cs_register_device
5250
+ * this function adds a device to the EVMS global device list.
5252
+ * RC == 0 is returned for:
5253
+ * a successful add of a new device
5255
+ * RC == 1 is returned when:
5256
+ * the device is already on the list
5258
+ * RC < 0 is returned for an error attempting to add the device.
5261
+evms_cs_register_device(evms_logical_node_t *device)
5263
+ return(evms_cs_add_item_to_list(
5264
+ &evms_global_device_list,
5267
+EXPORT_SYMBOL(evms_cs_register_device);
5269
+/* function description: evms_cs_unregister_device
5271
+ * this function removes a device from the EVMS global device list.
5273
+ * RC == 0 is returned for:
5274
+ * a successful removal of the specified device
5276
+ * RC < 0 is returned for an error attempting to add the device.
5277
+ * -ENODATA is returned if specified device is not found.
5280
+evms_cs_unregister_device(evms_logical_node_t *device)
5282
+ return(evms_cs_remove_item_from_list(
5283
+ &evms_global_device_list,
5286
+EXPORT_SYMBOL(evms_cs_unregister_device);
5288
+static evms_list_node_t *find_first_next_list_node = NULL;
5290
+evms_cs_find_next_device(
5291
+ evms_logical_node_t *in_device,
5292
+ evms_logical_node_t **out_device)
5295
+ evms_list_node_t **list_node;
5297
+ if (in_device == NULL)
5298
+ find_first_next_list_node = evms_global_device_list;
5300
+ list_node = evms_lookup_item_in_list(
5301
+ &evms_global_device_list,
5303
+ find_first_next_list_node = *list_node;
5304
+ if (find_first_next_list_node == NULL)
5307
+ find_first_next_list_node =
5308
+ find_first_next_list_node->next;
5311
+ if (find_first_next_list_node == NULL)
5312
+ *out_device = NULL;
5314
+ *out_device = (evms_logical_node_t *)
5315
+ find_first_next_list_node->item;
5319
+EXPORT_SYMBOL(evms_cs_find_next_device);
5322
+evms_cs_signal_event(int eventid)
5325
+ evms_list_node_t **list_node;
5327
+ /* signal PID(s) of specified event */
5328
+ list_node = &evms_global_notify_list;
5329
+ while(*list_node) {
5330
+ evms_event_t *event;
5332
+ event = (*list_node)->item;
5333
+ if (event->eventid == eventid) {
5334
+ struct task_struct *tsk;
5336
+ tsk = find_task_by_pid(event->pid);
5338
+ struct siginfo siginfo;
5340
+ siginfo.si_signo = event->signo;
5341
+ siginfo.si_errno = 0;
5342
+ siginfo.si_code = 0;
5343
+ rc = send_sig_info(event->signo,
5348
+ * unregister this stale
5349
+ * notification record
5353
+ list_node = &(*list_node)->next;
5356
+EXPORT_SYMBOL(evms_cs_signal_event);
5359
+evms_flush_signals (void)
5361
+ spin_lock(¤t->sigmask_lock);
5362
+ flush_signals(current);
5363
+ spin_unlock(¤t->sigmask_lock);
5367
+evms_init_signals (void)
5369
+ current->exit_signal = SIGCHLD;
5370
+ siginitsetinv(¤t->blocked, sigmask(SIGKILL));
5374
+evms_thread(void * arg)
5376
+ evms_thread_t *thread = arg;
5385
+ sprintf(current->comm, thread->name);
5386
+ evms_init_signals();
5387
+ evms_flush_signals();
5388
+ thread->tsk = current;
5390
+ current->policy = SCHED_OTHER;
5391
+ current->nice = -20;
5394
+ complete(thread->event);
5395
+ while (thread->run) {
5396
+ void (*run)(void *data);
5397
+ DECLARE_WAITQUEUE(wait, current);
5399
+ add_wait_queue(&thread->wqueue, &wait);
5400
+ set_task_state(current, TASK_INTERRUPTIBLE);
5401
+ if (!test_bit(EVMS_THREAD_WAKEUP, &thread->flags)) {
5404
+ current->state = TASK_RUNNING;
5405
+ remove_wait_queue(&thread->wqueue, &wait);
5406
+ clear_bit(EVMS_THREAD_WAKEUP, &thread->flags);
5408
+ run = thread->run;
5410
+ run(thread->data);
5411
+ run_task_queue(&tq_disk);
5413
+ if (signal_pending(current)) {
5414
+ evms_flush_signals();
5417
+ complete(thread->event);
5422
+evms_cs_register_thread (
5423
+ void (*run) (void *),
5427
+ evms_thread_t *thread;
5429
+ struct completion event;
5431
+ if (evms_cs_allocate_memory((void**)&thread,sizeof(evms_thread_t)))
5434
+ memset(thread, 0, sizeof(evms_thread_t));
5435
+ init_waitqueue_head(&thread->wqueue);
5437
+ init_completion(&event);
5438
+ thread->event = &event;
5439
+ thread->run = run;
5440
+ thread->data = data;
5441
+ thread->name = name;
5442
+ ret = kernel_thread(evms_thread, thread, 0);
5444
+ evms_cs_deallocate_memory(thread);
5447
+ wait_for_completion(&event);
5450
+EXPORT_SYMBOL(evms_cs_register_thread);
5453
+evms_cs_unregister_thread (evms_thread_t *thread)
5455
+ struct completion event;
5457
+ init_completion(&event);
5459
+ thread->event = &event;
5460
+ thread->run = NULL;
5461
+ thread->name = NULL;
5462
+ evms_cs_interrupt_thread(thread);
5463
+ wait_for_completion(&event);
5464
+ evms_cs_deallocate_memory(thread);
5466
+EXPORT_SYMBOL(evms_cs_unregister_thread);
5469
+evms_cs_wakeup_thread(evms_thread_t *thread)
5471
+ set_bit(EVMS_THREAD_WAKEUP, &thread->flags);
5472
+ wake_up(&thread->wqueue);
5474
+EXPORT_SYMBOL(evms_cs_wakeup_thread);
5477
+evms_cs_interrupt_thread (evms_thread_t *thread)
5479
+ if (!thread->tsk) {
5480
+ LOG_ERROR("error: attempted to interrupt an invalid thread!\n");
5483
+ send_sig(SIGKILL, thread->tsk, 1);
5485
+EXPORT_SYMBOL(evms_cs_interrupt_thread);
5487
+struct proc_dir_entry *
5488
+evms_cs_get_evms_proc_dir(void)
5490
+#ifdef CONFIG_PROC_FS
5491
+ if (!evms_proc_dir) {
5492
+ evms_proc_dir = create_proc_entry("evms", S_IFDIR, &proc_root);
5495
+ return(evms_proc_dir);
5497
+EXPORT_SYMBOL(evms_cs_get_evms_proc_dir);
5500
+evms_cs_volume_request_in_progress(
5503
+ int *current_count)
5506
+ evms_logical_volume_t *volume;
5508
+ volume = &evms_logical_volumes[MINOR(dev)];
5509
+ if (volume->node) {
5510
+ if (operation > 0) {
5511
+ atomic_inc(&volume->requests_in_progress);
5512
+ } else if (operation < 0) {
5513
+ atomic_dec(&volume->requests_in_progress);
5515
+ if (current_count) {
5516
+ *current_count = atomic_read(&volume->requests_in_progress);
5523
+EXPORT_SYMBOL(evms_cs_volume_request_in_progress);
5525
+/**********************************************************/
5526
+/* END -- exported functions/Common Services */
5527
+/**********************************************************/
5529
+/**********************************************************/
5530
+/* START -- Proc FS Support functions */
5531
+/**********************************************************/
5533
+#ifdef CONFIG_PROC_FS
5535
+evms_info_read_proc(
5544
+ char *info_level_text = NULL;
5546
+ PROCPRINT("Enterprise Volume Management System: Info\n");
5547
+ switch(evms_info_level) {
5549
+ info_level_text = "critical";
5552
+ info_level_text = "serious";
5555
+ info_level_text = "error";
5558
+ info_level_text = "warning";
5561
+ info_level_text = "default";
5564
+ info_level_text = "details";
5567
+ info_level_text = "debug";
5570
+ info_level_text = "extra";
5573
+ info_level_text = "entry exit";
5576
+ info_level_text = "everything";
5579
+ info_level_text = "unknown";
5582
+ PROCPRINT("EVMS info level: %d (%s).\n",
5583
+ evms_info_level, info_level_text);
5585
+ PROCPRINT("EVMS kernel version: %d.%d.%d\n",
5586
+ EVMS_MAJOR_VERSION,
5587
+ EVMS_MINOR_VERSION,
5588
+ EVMS_PATCHLEVEL_VERSION);
5590
+ PROCPRINT("EVMS IOCTL interface version: %d.%d.%d\n",
5591
+ EVMS_IOCTL_INTERFACE_MAJOR,
5592
+ EVMS_IOCTL_INTERFACE_MINOR,
5593
+ EVMS_IOCTL_INTERFACE_PATCHLEVEL);
5595
+ PROCPRINT("EVMS Common Services version: %d.%d.%d\n",
5596
+ EVMS_COMMON_SERVICES_MAJOR,
5597
+ EVMS_COMMON_SERVICES_MINOR,
5598
+ EVMS_COMMON_SERVICES_PATCHLEVEL);
5604
+evms_plugins_read_proc(
5613
+ evms_registered_plugin_t *rp = NULL;
5615
+ PROCPRINT("Enterprise Volume Management System: Plugins\n");
5616
+ /* 0 1 1 2 2 3 3 4 4 5 5 6 6 7*/
5617
+ /* 1 5 0 5 0 5 0 5 0 5 0 5 0 5 0*/
5618
+ PROCPRINT(" ---------Plugin---------- required services\n");
5619
+ PROCPRINT(" ----id---- version version\n\n");
5620
+ for (rp = registered_plugin_head; rp; rp = rp->next) {
5621
+ PROCPRINT(" %x.%x.%x\t %d.%d.%d\t%d.%d.%d\n",
5622
+ GetPluginOEM(rp->plugin->id),
5623
+ GetPluginType(rp->plugin->id),
5624
+ GetPluginID(rp->plugin->id),
5625
+ rp->plugin->version.major,
5626
+ rp->plugin->version.minor,
5627
+ rp->plugin->version.patchlevel,
5628
+ rp->plugin->required_common_services_version.major,
5629
+ rp->plugin->required_common_services_version.minor,
5630
+ rp->plugin->required_common_services_version.patchlevel);
5637
+evms_volumes_read_proc(
5647
+ PROCPRINT("Enterprise Volume Management System: Volumes\n");
5648
+ PROCPRINT("major minor #blocks type flags name\n\n");
5649
+ for (j = 1; j < MAX_EVMS_VOLUMES; j++) {
5650
+ evms_logical_volume_t *volume;
5652
+ volume = &evms_logical_volumes[j];
5653
+ if (volume->node) {
5654
+ PROCPRINT("%5d %7d %16Ld %s %s %s %s%s\n",
5656
+ volume->node->total_vsectors >> 1,
5657
+ (volume->flags & EVMS_VOLUME_FLAG) ? "evms " : "compat",
5658
+ (volume->flags & EVMS_VOLUME_READ_ONLY) ? "ro" : "rw",
5659
+ (volume->flags & EVMS_VOLUME_PARTIAL) ? "p " : " ",
5660
+ EVMS_DEV_NODE_PATH,
5669
+/**********************************************************/
5670
+/* END -- Proc FS Support functions */
5671
+/**********************************************************/
5673
+/**********************************************************/
5674
+/* START -- FOPS functions definitions */
5675
+/**********************************************************/
5677
+/************************************************/
5678
+/* START -- IOCTL commands -- EVMS specific */
5679
+/************************************************/
5682
+evms_ioctl_cmd_get_ioctl_version (void * arg)
5685
+ evms_version_t ver;
5687
+ ver.major = EVMS_IOCTL_INTERFACE_MAJOR;
5688
+ ver.minor = EVMS_IOCTL_INTERFACE_MINOR;
5689
+ ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL;
5691
+ /* copy info to userspace */
5692
+ if (copy_to_user(arg, &ver, sizeof(ver)))
5699
+evms_ioctl_cmd_get_version (void * arg)
5702
+ evms_version_t ver;
5704
+ ver.major = EVMS_MAJOR_VERSION;
5705
+ ver.minor = EVMS_MINOR_VERSION;
5706
+ ver.patchlevel = EVMS_PATCHLEVEL_VERSION;
5708
+ /* copy info to userspace */
5709
+ if (copy_to_user(arg, &ver, sizeof(ver)))
5716
+evms_ioctl_cmd_get_info_level (void * arg)
5720
+ /* copy info to userspace */
5721
+ if (copy_to_user(arg, &evms_info_level, sizeof(evms_info_level)))
5728
+evms_ioctl_cmd_set_info_level (void * arg)
5732
+ /* copy info from userspace */
5733
+ if (copy_from_user(&evms_info_level, arg, sizeof(evms_info_level)))
5739
+/* function: evms_quiesce_volume
5741
+ * this function performs the actual quiesce operation on
5742
+ * a volume in kernel memory.
5744
+ * when quiescing, all new I/Os to a volume are stopped,
5745
+ * causing the calling thread to block. this thread then
5746
+ * waits until all I/Os in progress are completed, before
5747
+ * return control to the caller.
5749
+ * when unquiescing, all new I/Os are allowed to proceed
5750
+ * unencumbered, and all threads waiting (blocked) on this
5751
+ * volume, are woken up and allowed to proceed.
5755
+evms_quiesce_volume(
5756
+ evms_logical_volume_t *volume,
5757
+ struct inode *inode,
5758
+ struct file *file,
5759
+ evms_quiesce_volume_t *qv)
5763
+ LOG_DEBUG("%squiescing %s.\n",
5764
+ ((qv->command) ? "" : "un"), volume->name);
5766
+#ifdef VFS_PATCH_PRESENT
5768
+ /* VFS function call to sync and lock the filesystem */
5769
+ fsync_dev_lockfs(MKDEV(EVMS_MAJOR, qv->minor));
5770
+ volume->vfs_quiesced = TRUE;
5773
+ volume->quiesced = qv->command;
5775
+ /* Command specified was "quiesce". */
5776
+ if (qv->command) {
5777
+ /* After setting the volume to
5778
+ * a quiesced state, there could
5779
+ * be threads (on SMP systems)
5780
+ * that are executing in the
5781
+ * function, evms_handle_request,
5782
+ * between the "wait_event" and the
5783
+ * "atomic_inc" lines. We need to
5784
+ * provide a "delay" sufficient
5785
+ * to allow those threads to
5786
+ * to reach the atomic_inc's
5787
+ * before executing the while loop
5788
+ * below. The "schedule" call should
5792
+ /* wait for outstanding requests
5795
+ while(atomic_read(&volume->requests_in_progress)>0)
5798
+ /* send this command down the stack so lower */
5799
+ /* layers can know about this */
5800
+ rc = IOCTL(volume->node, inode, file,
5801
+ EVMS_QUIESCE_VOLUME, (unsigned long)&qv);
5803
+ /* Command specified was "unquiesce". */
5804
+ if (!qv->command) {
5805
+ /* "wakeup" any I/O requests waiting on
5808
+ if (waitqueue_active(&volume->wait_queue))
5809
+ wake_up(&volume->wait_queue);
5810
+#ifdef VFS_PATCH_PRESENT
5811
+ if (volume->vfs_quiesced) {
5812
+ /* VFS function call to unlock the filesystem */
5813
+ unlockfs(MKDEV(EVMS_MAJOR, qv->minor));
5814
+ volume->vfs_quiesced = FALSE;
5819
+ LOG_ERROR("error(%d) %squiescing %s.\n",
5821
+ ((qv->command) ? "" : "un"),
5827
+/* function: evms_delete_volume
5829
+ * this function performs the actual delete operation on
5830
+ * a volume to purge it from kernel memory. all structures
5831
+ * and memory consumed by this volume will be free as well
5832
+ * as clearing or unregistering any system services or
5833
+ * global data arrays.
5835
+ * NOTE: this function will return -EBUSY on attempts to
5836
+ * delete mounted volumes.
5840
+evms_delete_volume(
5841
+ evms_logical_volume_t *volume,
5842
+ evms_delete_volume_t *dv)
5846
+ /* if this is a "permament" delete */
5847
+ /* check to make sure volume is not mounted */
5848
+ if (dv->command) {
5849
+ if (is_mounted(MKDEV(EVMS_MAJOR, dv->minor))) {
5854
+ /* invoke the delete ioctl at the top of the feature stack */
5856
+ LOG_DETAILS("deleting '%s'.\n",volume->name);
5857
+ rc = DELETE(volume->node);
5860
+ /* the volume has been deleted, do any clean up work
5864
+ devfs_unregister(volume->devfs_handle);
5865
+ if (dv->command) {
5866
+ /* if "permanent" delete, free the name
5867
+ * and NULL the name field.
5869
+ evms_cs_deallocate_memory(volume->name);
5870
+ volume->name = NULL;
5871
+ volume->flags = 0;
5873
+ /* if "soft" delete, leave the name so
5874
+ * we can use it to reassign the same
5875
+ * minor to this volume after a
5878
+ volume->flags = EVMS_VOLUME_SOFT_DELETED;
5880
+ volume->node = NULL;
5881
+ set_device_ro(MKDEV(EVMS_MAJOR,dv->minor),0);
5882
+ blk_size[EVMS_MAJOR][dv->minor] = 0;
5883
+ blksize_size[EVMS_MAJOR][dv->minor] = 0;
5884
+ hardsect_size[EVMS_MAJOR][dv->minor] = 0;
5887
+ LOG_ERROR("error(%d) %s deleting %s.\n",
5889
+ ((dv->command) ? "hard" : "soft"),
5895
+/* function: evms_user_delete_volume
5897
+ * this function, depending on the parameters, performs
5898
+ * a "soft" or a "hard" delete. for a "soft" delete, a
5899
+ * quiesce & delete request is queued up, to be executed
5900
+ * at the beginning of the next rediscovery. for a
5901
+ * "hard" delete, the target volume is quiesced and then
5902
+ * deleted. if there is any errors attempting to delete
5903
+ * the target, then the target is unquiesced. if an
5904
+ * associative volume is specified it is quiesced before
5905
+ * the target volume is quiesced, and is unquiesced
5906
+ * after the attempt to delete the target volume.
5910
+evms_user_delete_volume(
5911
+ evms_logical_volume_t *lvt,
5912
+ struct inode *inode,
5913
+ struct file *file,
5914
+ evms_delete_volume_t *dv)
5918
+ if (!dv->command) {
5919
+ /* "soft delete" requested */
5920
+ lvt->flags |= (EVMS_REQUESTED_QUIESCE |
5921
+ EVMS_REQUESTED_DELETE);
5923
+ lvt->flags |= EVMS_REQUESTED_VFS_QUIESCE;
5926
+ /* "hard delete" requested */
5928
+ evms_quiesce_volume_t qv;
5929
+ evms_logical_volume_t *lva = NULL;
5931
+ if (dv->associative_minor) {
5932
+ /* associative volume specified
5936
+ lva = &evms_logical_volumes[dv->associative_minor];
5937
+ /* quiesce associative volume */
5938
+ qv.command = EVMS_QUIESCE;
5939
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5940
+ qv.minor = dv->associative_minor;
5941
+ rc = evms_quiesce_volume(lva, inode, file, &qv);
5942
+ qa = (rc) ? FALSE : TRUE;
5945
+ /* quiesce target volume */
5946
+ qv.command = EVMS_QUIESCE;
5947
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5948
+ qv.minor = dv->minor;
5949
+ rc = evms_quiesce_volume(lvt, inode, file, &qv);
5952
+ /* delete the target volume */
5953
+ rc = evms_delete_volume(lvt, dv);
5955
+ /* got an error undeleting...
5957
+ * unquiesce the target
5959
+ qv.command = EVMS_UNQUIESCE;
5960
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5961
+ qv.minor = dv->minor;
5962
+ evms_quiesce_volume(lvt, inode, file, &qv);
5965
+ if (dv->associative_minor) {
5966
+ /* associative volume specified
5971
+ /* only unquiesce associative
5972
+ * if we successfully quiesced
5975
+ qv.command = EVMS_UNQUIESCE;
5976
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5977
+ qv.minor = dv->associative_minor;
5978
+ evms_quiesce_volume(lva, inode, file, &qv);
5985
+/* function: evms_ioctl_cmd_delete_volume
5987
+ * this function copy user data to/from the kernel, and
5988
+ * validates user parameters. after validation, control
5989
+ * is passed to worker routine evms_user_delete_volume.
5993
+evms_ioctl_cmd_delete_volume(
5994
+ struct inode *inode,
5995
+ struct file *file,
5996
+ unsigned long arg)
5999
+ evms_delete_volume_t tmp, *user_parms;
6000
+ evms_logical_volume_t *volume = NULL;
6002
+ user_parms = (evms_delete_volume_t *)arg;
6003
+ /* copy user's parameters to kernel space */
6004
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6007
+ /* check to make sure associative minor is in use */
6009
+ if (tmp.associative_minor) {
6010
+ volume = &evms_logical_volumes[tmp.associative_minor];
6011
+ if (volume->node == NULL)
6015
+ /* check to make sure target minor is in use */
6017
+ volume = &evms_logical_volumes[tmp.minor];
6018
+ if (volume->node == NULL)
6021
+ rc = evms_user_delete_volume(
6022
+ volume,inode,file,&tmp);
6024
+ /* copy the status value back to the user */
6026
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6032
+/* function: evms_full_rediscover_prep
6034
+ * this function helps to prevent problems when evms is
6035
+ * configured with the base built in statically and some
6036
+ * plugins built as modules.
6038
+ * in these cases, when the initial discovery is done,
6039
+ * only the statically built modules are available for
6040
+ * volume construction. as a result, some volumes that
6041
+ * require the plugins built as modules (which haven't
6042
+ * been loaded), to be fully reconstructed, may come up
6043
+ * as compatibility volumes or partial volumes.
6045
+ * when parts of evms are built as modules, the
6046
+ * evms_rediscovery utility is used, to perform a secondary
6047
+ * rediscover, after all the plugins built as modules
6048
+ * have been loaded, to construct all the volumes
6049
+ * requiring these plugins.
6051
+ * however since some of the volumes, requiring the plugins
6052
+ * built as modules, may have been already exported as
6053
+ * compatibility or partial volumes, we need to purge these
6054
+ * volumes from kernel's memory, so that can be rediscovered
6055
+ * and claimed by the appropriate plugins, and reconstructed
6056
+ * into the correct volumes.
6058
+ * this function purges all compatibility volumes that are
6059
+ * not in use(mounted) and all partial volumes, prior to
6060
+ * doing the secondary rediscover, thus allowing volumes to
6061
+ * rediscovered correctly.
6063
+ * NOTE: again, this is only required in cases when a
6064
+ * combination of plugins are built statically and as
6069
+evms_full_rediscover_prep(struct inode *inode, struct file *file)
6073
+ LOG_DETAILS("%s: started.\n", __FUNCTION__);
6074
+ /* check for acceptable volumes to be deleted */
6075
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6076
+ evms_logical_volume_t *volume = NULL;
6077
+ evms_delete_volume_t dv;
6078
+ int volume_mounted, doit;
6081
+ volume = &evms_logical_volumes[i];
6082
+ if (!volume->node)
6084
+ devp = MKDEV(EVMS_MAJOR,i);
6085
+ volume_mounted = (is_mounted(devp)) ? 1 : 0;
6086
+ /* only proceed on volumes that are:
6089
+ * unmounted compatibility volumes
6092
+ if (volume->flags & EVMS_VOLUME_PARTIAL) {
6093
+ /* do all partial volumes
6096
+ } else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
6097
+ /* check all compatibility volumes
6099
+ if (!volume_mounted && !is_swap_partition(devp)) {
6100
+ /* only do unmounted volumes
6105
+ if (doit == FALSE) {
6108
+ /* delete the volume from memory.
6109
+ * do a 'soft' delete if volume
6110
+ * is mounted, and 'hard' delete
6113
+ * NOTE: the delete operation will
6114
+ * clear the bits in the flags field.
6116
+ dv.command = (volume_mounted) ?
6117
+ EVMS_SOFT_DELETE : EVMS_HARD_DELETE;
6119
+ dv.associative_minor = 0;
6121
+ rc = evms_user_delete_volume(volume,inode,file,&dv);
6123
+ LOG_DETAILS("%s: completed.\n", __FUNCTION__);
6127
+evms_ioctl_cmd_rediscover_volumes(
6128
+ struct inode *inode,
6129
+ struct file *file,
6131
+ unsigned long arg)
6134
+ evms_rediscover_t tmp, *user_parms;
6135
+ unsigned long *array_ptr = NULL, array_size = 0;
6136
+ evms_logical_volume_t *volume = NULL;
6138
+ rc = tmp.drive_count = 0;
6139
+ user_parms = (evms_rediscover_t *)arg;
6140
+ /* copy user's parameters to kernel space */
6141
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6144
+ if (tmp.drive_count == REDISCOVER_ALL_DEVICES) {
6145
+ evms_full_rediscover_prep(inode, file);
6147
+ /* quiesce all queued volumes */
6148
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6149
+ evms_quiesce_volume_t qv;
6151
+ volume = &evms_logical_volumes[i];
6152
+ if (!volume->node) {
6155
+ if (!(volume->flags & EVMS_REQUESTED_QUIESCE)) {
6158
+ qv.command = EVMS_QUIESCE;
6160
+ qv.do_vfs = (volume->flags & EVMS_REQUESTED_VFS_QUIESCE) ?
6161
+ EVMS_VFS_DO : EVMS_VFS_DO_NOTHING,
6163
+ rc = evms_quiesce_volume(volume,inode,file,&qv);
6165
+ /* "soft" delete all queued volumes */
6166
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6167
+ evms_delete_volume_t dv;
6169
+ volume = &evms_logical_volumes[i];
6170
+ if (!volume->node) {
6173
+ if (!(volume->flags & EVMS_REQUESTED_DELETE)) {
6176
+ dv.command = EVMS_SOFT_DELETE;
6178
+ dv.associative_minor = 0;
6180
+ rc = evms_delete_volume(volume, &dv);
6183
+ if (tmp.drive_count &&
6184
+ (tmp.drive_count != REDISCOVER_ALL_DEVICES)) {
6186
+ /* create space for userspace drive array */
6187
+ array_size = sizeof(*tmp.drive_array) * tmp.drive_count;
6188
+ array_ptr = tmp.drive_array;
6189
+ rc = evms_cs_allocate_memory((void **)&tmp.drive_array, array_size);
6192
+ /* copy rediscover drive array to kernel space */
6193
+ if (copy_from_user(tmp.drive_array, array_ptr, array_size))
6198
+ /* perform the rediscovery operation */
6199
+ rc = evms_discover_volumes(&tmp);
6202
+ /* clean up after operation */
6203
+ if (tmp.drive_count &&
6204
+ (tmp.drive_count != REDISCOVER_ALL_DEVICES))
6205
+ evms_cs_deallocate_memory(tmp.drive_array);
6207
+ /* set return code and copy info to userspace */
6209
+ if (copy_to_user(&user_parms->status, &tmp.status, sizeof(tmp.status)))
6215
+static evms_list_node_t *user_disk_ptr;
6217
+evms_ioctl_cmd_get_logical_disk(void * arg)
6220
+ evms_user_disk_t tmp, *user_parms;
6222
+ user_parms = (evms_user_disk_t *)arg;
6223
+ /* copy user's parameters to kernel space */
6224
+ if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6228
+ if (tmp.command == EVMS_FIRST_DISK)
6229
+ user_disk_ptr = evms_global_device_list;
6230
+ else /* tmp.command == EVMS_NEXT_DISK */
6231
+ user_disk_ptr = user_disk_ptr->next;
6233
+ if (user_disk_ptr == NULL)
6234
+ tmp.status = EVMS_DISK_INVALID;
6236
+ tmp.status = EVMS_DISK_VALID;
6237
+ tmp.disk_handle = (unsigned long)user_disk_ptr->item ^ EVMS_HANDLE_KEY;
6239
+ /* copy info to userspace */
6240
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6247
+evms_ioctl_cmd_get_logical_disk_info(void * arg)
6250
+ evms_user_disk_info_t tmp, *user_parms;
6251
+ evms_list_node_t *p;
6253
+ user_parms = (evms_user_disk_info_t *)arg;
6254
+ /* copy user's parameters to kernel space */
6255
+ if (copy_from_user(&tmp.disk_handle, &user_parms->disk_handle, sizeof(tmp.disk_handle)))
6258
+ /* check handle for validity */
6261
+ for (p = evms_global_device_list; p; p = p->next)
6262
+ if (p->item == (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY)) {
6264
+ user_disk_ptr = p;
6269
+ /* populate kernel copy of user's structure with appropriate info */
6271
+ evms_logical_node_t *node = (evms_logical_node_t *)user_disk_ptr->item;
6272
+ tmp.flags = node->flags;
6273
+ strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH);
6274
+ strcat(tmp.disk_name, node->name);
6275
+ tmp.total_sectors = node->total_vsectors;
6276
+ tmp.hardsect_size = node->hardsector_size;
6277
+ tmp.block_size = node->block_size;
6278
+ rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO,
6279
+ (unsigned long)&tmp.geometry);
6282
+ /* set return code and copy info to userspace */
6284
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6290
+#define MAX_IO_SIZE 128
6292
+evms_ioctl_cmd_sector_io(void * arg)
6295
+ evms_sector_t io_size = MAX_IO_SIZE;
6296
+ evms_sector_io_t tmp, *user_parms;
6297
+ evms_logical_node_t *disk_node = NULL;
6298
+ evms_list_node_t *list_node;
6299
+ unsigned char *io_buffer;
6305
+ user_parms = (evms_sector_io_t *)arg;
6306
+ /* copy user's parameters to kernel space */
6307
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6310
+ /* check handle for validity */
6313
+ disk_node = (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY);
6314
+ for (list_node = evms_global_device_list; list_node; list_node = list_node->next)
6315
+ if (list_node->item == disk_node) {
6321
+ /* allocate a io buffer upto 64Kbytes in size */
6322
+ if (tmp.sector_count < MAX_IO_SIZE)
6323
+ io_size = tmp.sector_count;
6325
+ /* allocate buffer large enough to hold a single sector */
6326
+ rc = evms_cs_allocate_memory(
6327
+ (void **)&io_buffer,
6328
+ io_size << EVMS_VSECTOR_SIZE_SHIFT);
6330
+ /* perform io with specified disk */
6332
+ evms_sector_t io_sector_offset, io_remaining;
6333
+ u_int64_t io_bytes;
6334
+ u_char *user_buffer_ptr;
6336
+ io_remaining = tmp.sector_count;
6337
+ io_sector_offset = 0;
6338
+ user_buffer_ptr = tmp.buffer_address;
6339
+ while(io_remaining) {
6340
+ /* compute the io_size for this pass */
6341
+ io_size = (io_remaining >= MAX_IO_SIZE) ?
6342
+ MAX_IO_SIZE : io_remaining;
6344
+ io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
6345
+ /* for writes, copy a sector from user to kernel */
6346
+ if (tmp.io_flag == EVMS_SECTOR_IO_WRITE) {
6347
+ /* copy sector from user data buffer */
6348
+ if (copy_from_user(io_buffer,
6355
+ /* perform IO one sector at a time */
6359
+ io_sector_offset + tmp.starting_sector,
6365
+ if (tmp.io_flag != EVMS_SECTOR_IO_WRITE) {
6366
+ /* copy sector to user data buffer */
6367
+ if (copy_to_user(user_buffer_ptr,
6374
+ user_buffer_ptr += io_bytes;
6375
+ tmp.buffer_address += io_bytes;
6376
+ io_sector_offset += io_size;
6377
+ io_remaining -= io_size;
6381
+ /* if the sector_buffer was allocated, free it */
6383
+ evms_cs_deallocate_memory(io_buffer);
6385
+ /* copy the status value back to the user */
6387
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6394
+static int user_minor;
6396
+evms_ioctl_cmd_get_minor(void * arg)
6399
+ evms_user_minor_t tmp, *user_parms;
6401
+ user_parms = (evms_user_minor_t *)arg;
6402
+ /* copy user's parameters to kernel space */
6403
+ if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6407
+ if (tmp.command == EVMS_FIRST_VOLUME)
6409
+ else /* tmp.command == EVMS_NEXT_VOLUME */
6412
+ tmp.status = EVMS_VOLUME_INVALID;
6413
+ for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) {
6414
+ evms_logical_volume_t *lv;
6416
+ lv = &evms_logical_volumes[user_minor];
6417
+ /* see if any corrupt volumes have been
6418
+ * unmounted. If so, clean up the
6419
+ * evms_logical_volumes array entry, and
6420
+ * don't report the volume to the user.
6422
+ if (lv->flags & EVMS_VOLUME_CORRUPT) {
6423
+ if (!get_super(MKDEV(EVMS_MAJOR,user_minor))) {
6424
+ /* clear logical volume structure
6425
+ * for this volume so it may be
6428
+ LOG_WARNING("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n",
6429
+ ((lv->flags & EVMS_VOLUME_SOFT_DELETED) ?
6430
+ "'soft deleted'" : ""),
6431
+ EVMS_MAJOR, user_minor,
6433
+ LOG_WARNING(" releasing minor(%d) used by volume(%s)!\n",
6434
+ user_minor, lv->name);
6435
+ evms_cs_deallocate_memory(lv->name);
6440
+ if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) {
6441
+ tmp.status = EVMS_VOLUME_VALID;
6442
+ tmp.minor = user_minor;
6447
+ /* copy info to userspace */
6448
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6455
+evms_ioctl_cmd_get_volume_data(void * arg)
6458
+ evms_volume_data_t tmp, *user_parms;
6459
+ evms_logical_volume_t *volume = NULL;
6460
+ evms_logical_node_t *node = NULL;
6462
+ user_parms = (evms_volume_data_t *)arg;
6463
+ /* copy user's parameters to kernel space */
6464
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6468
+ volume = &evms_logical_volumes[tmp.minor];
6469
+ node = volume->node;
6474
+ tmp.flags = volume->flags;
6475
+ strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH);
6476
+ strcat(tmp.volume_name, volume->name);
6479
+ /* copy return code and info to userspace */
6481
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6486
+static evms_registered_plugin_t *ioctl_reg_record;
6488
+evms_ioctl_cmd_get_plugin(void * arg)
6491
+ evms_kernel_plugin_t tmp, *user_parms;
6493
+ user_parms = (evms_kernel_plugin_t *)arg;
6494
+ /* copy user's parameters to kernel space */
6495
+ if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6499
+ /* if the command is not 0, then verify
6500
+ * that ioctl_reg_record is pointing to
6501
+ * current and valid plugin header.
6503
+ if (tmp.command) { /* tmp.command == EVMS_NEXT_PLUGIN */
6504
+ evms_registered_plugin_t *tmp_reg_record;
6505
+ tmp_reg_record = registered_plugin_head;
6506
+ /* search the current plugin list */
6507
+ while(tmp_reg_record) {
6508
+ if (tmp_reg_record == ioctl_reg_record)
6510
+ tmp_reg_record = tmp_reg_record->next;
6512
+ /* if the ioctl_reg_record is not in the
6513
+ * current list, then start at the beginning.
6515
+ if (!tmp_reg_record)
6516
+ tmp.command = EVMS_FIRST_PLUGIN;
6519
+ if (tmp.command == EVMS_FIRST_PLUGIN)
6520
+ /* start at beginning of plugin list */
6521
+ ioctl_reg_record = registered_plugin_head;
6522
+ else /* tmp.command == EVMS_NEXT_PLUGIN */
6523
+ /* continue from current position in list */
6524
+ ioctl_reg_record = ioctl_reg_record->next;
6526
+ tmp.status = EVMS_PLUGIN_INVALID;
6528
+ if (ioctl_reg_record) {
6529
+ tmp.id = ioctl_reg_record->plugin->id;
6530
+ tmp.version = ioctl_reg_record->plugin->version;
6531
+ tmp.status = EVMS_PLUGIN_VALID;
6534
+ /* copy info to userspace */
6535
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6542
+evms_ioctl_cmd_plugin_ioctl(
6543
+ struct inode *inode,
6544
+ struct file *file,
6546
+ unsigned long arg)
6548
+ int rc = 0, found = FALSE;
6549
+ evms_plugin_ioctl_t tmp, *user_parms;
6550
+ evms_registered_plugin_t * p;
6552
+ user_parms = (evms_plugin_ioctl_t *)arg;
6553
+ /* copy user's parameters to kernel space */
6554
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6558
+ /* search for the specified plugin */
6559
+ for (p = registered_plugin_head; p; p = p->next)
6560
+ /* check for the specified feature id */
6561
+ if (p->plugin->id == tmp.feature_id) {
6563
+ /* check that entry point is used */
6564
+ if (p->plugin->function_table->direct_ioctl)
6565
+ rc = DIRECT_IOCTL(p, inode, file, cmd, arg);
6570
+ /* was the specified plugin found? */
6571
+ if (found == FALSE)
6574
+ /* copy the status value back to the user */
6576
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6582
+#define MAX_BUFFER_SIZE 65536
6584
+evms_ioctl_cmd_kernel_partial_csum(void * arg)
6587
+ u_int64_t compute_size = MAX_BUFFER_SIZE;
6588
+ evms_compute_csum_t tmp, *user_parms;
6589
+ unsigned char *buffer = NULL;
6591
+ user_parms = (evms_compute_csum_t *)arg;
6592
+ /* copy user's parameters to kernel space */
6593
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6597
+ /* allocate a io buffer upto 64Kbytes in size */
6598
+ if (tmp.buffer_size < MAX_BUFFER_SIZE)
6599
+ compute_size = tmp.buffer_size;
6601
+ /* allocate buffer large enough to hold a single sector */
6602
+ rc = evms_cs_allocate_memory(
6603
+ (void **)&buffer, compute_size);
6605
+ /* perform io with specified disk */
6607
+ evms_sector_t remaining_bytes;
6608
+ u_char *user_buffer_ptr;
6609
+ unsigned int insum = tmp.insum;
6611
+ remaining_bytes = tmp.buffer_size;
6612
+ user_buffer_ptr = tmp.buffer_address;
6613
+ while(remaining_bytes) {
6614
+ /* compute the compute_size for this pass */
6615
+ compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ?
6616
+ MAX_BUFFER_SIZE : remaining_bytes;
6618
+ /* copy into kernel from user data buffer */
6619
+ if (copy_from_user(buffer, user_buffer_ptr,
6623
+ /* compute the checksum for this pass */
6624
+ tmp.outsum = csum_partial(buffer, tmp.buffer_size,
6626
+ /* set up for another possible pass */
6627
+ insum = tmp.outsum;
6628
+ /* update loop progress variables */
6629
+ user_buffer_ptr += compute_size;
6630
+ tmp.buffer_address += compute_size;
6631
+ remaining_bytes -= compute_size;
6635
+ /* if the sector_buffer was allocated, free it */
6637
+ evms_cs_deallocate_memory(buffer);
6639
+ /* copy the status value back to the user */
6641
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6646
+#undef MAX_BUFFER_SIZE
6649
+evms_ioctl_cmd_get_bmap(
6650
+ struct inode *inode,
6651
+ struct file *file,
6653
+ unsigned long arg)
6656
+ evms_get_bmap_t tmp, *user_parms;
6658
+ user_parms = (evms_get_bmap_t *)arg;
6659
+ /* copy user's parameters to kernel space */
6660
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6663
+ /* pass the ioctl down the volume stack */
6665
+ evms_logical_volume_t *volume;
6667
+ volume = &evms_logical_volumes[MINOR(inode->i_rdev)];
6668
+ rc = IOCTL(volume->node, inode, file, cmd, (unsigned long)&tmp);
6670
+ /* copy the status value back to the user */
6672
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6679
+evms_ioctl_cmd_process_notify_event(unsigned long arg)
6681
+ int rc = 0, found = FALSE;
6682
+ evms_notify_t tmp, *user_parms;
6683
+ evms_list_node_t **list_node = NULL;
6684
+ evms_event_t *event = NULL;
6686
+ user_parms = (evms_notify_t *)arg;
6687
+ /* copy user's parameters to kernel space */
6688
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6691
+ /* check to see if PID has already been registered
6695
+ list_node = &evms_global_notify_list;
6696
+ while(*list_node) {
6697
+ event = (*list_node)->item;
6698
+ if ((event->pid == tmp.eventry.pid) &&
6699
+ (event->eventid == tmp.eventry.eventid)) {
6703
+ list_node = &(*list_node)->next;
6706
+ if (tmp.command) { /* tmp.command == EVMS_REGISTER_EVENT */
6707
+ /* registration code */
6710
+ LOG_ERROR("error(%d) pid(%d) already register to receive signal(%d) on event(%d).\n",
6711
+ rc, tmp.eventry.pid, tmp.eventry.signo, tmp.eventry.eventid);
6713
+ /* register this pid/event type */
6714
+ rc = evms_cs_allocate_memory((void **)&event, sizeof(evms_event_t));
6716
+ LOG_ERROR("error(%d) allocating event structure.\n",
6719
+ event->pid = tmp.eventry.pid;
6720
+ event->eventid = tmp.eventry.eventid;
6721
+ event->signo = tmp.eventry.signo;
6722
+ rc = evms_cs_add_item_to_list(
6723
+ &evms_global_notify_list,
6727
+ } else { /* tmp.command == EVMS_UNREGISTER_EVENT */
6728
+ /* unregistration code */
6731
+ LOG_ERROR("error(%d) attempting to unregister a non-registered pid(%d) on event(%d).\n",
6732
+ rc, tmp.eventry.pid, tmp.eventry.eventid);
6734
+ event = (*list_node)->item;
6735
+ rc = evms_cs_remove_item_from_list(
6736
+ &evms_global_notify_list,
6739
+ evms_cs_deallocate_memory(event);
6743
+ /* copy the status value back to the user */
6745
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6750
+/************************************************/
6751
+/* END -- IOCTL commands -- EVMS specific */
6752
+/************************************************/
6754
+/************************************************/
6755
+/* START -- IOCTL commands -- Volume specific */
6756
+/************************************************/
6758
+/************************************************/
6759
+/* END -- IOCTL commands -- Volume specific */
6760
+/************************************************/
6762
+/************************************************/
6763
+/* START -- IOCTL main */
6764
+/************************************************/
6767
+ * Function: evms_ioctl
6769
+ * This function is the main ioctl entry point for all of evms.
6774
+ struct inode *inode,
6775
+ struct file *file,
6777
+ unsigned long arg)
6779
+ unsigned long minor = 0;
6781
+ evms_logical_node_t *node = NULL;
6783
+ /* check user access */
6784
+ if (!capable(CAP_SYS_ADMIN))
6791
+ /* get the minor */
6792
+ minor = MINOR(inode->i_rdev);
6793
+ LOG_EXTRA("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n",
6795
+ (cmd >> _IOC_DIRSHIFT) & _IOC_DIRMASK,
6796
+ (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
6797
+ (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK,
6798
+ (cmd >> _IOC_NRSHIFT) & _IOC_NRMASK);
6800
+ /* insure this minor points to a valid volume */
6802
+ node = evms_logical_volumes[minor].node;
6808
+ /* process the IOCTL commands */
6811
+ /* process all EVMS specific commands */
6813
+ case EVMS_GET_IOCTL_VERSION:
6814
+ rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
6816
+ case EVMS_GET_VERSION:
6817
+ rc = evms_ioctl_cmd_get_version((void *)arg);
6819
+ case EVMS_GET_INFO_LEVEL:
6820
+ rc = evms_ioctl_cmd_get_info_level((void *)arg);
6822
+ case EVMS_SET_INFO_LEVEL:
6823
+ rc = evms_ioctl_cmd_set_info_level((void *)arg);
6825
+ case EVMS_REDISCOVER_VOLUMES:
6826
+ rc = evms_ioctl_cmd_rediscover_volumes(inode, file, cmd, arg);
6828
+ case EVMS_GET_LOGICAL_DISK:
6829
+ rc = evms_ioctl_cmd_get_logical_disk((void *)arg);
6831
+ case EVMS_GET_LOGICAL_DISK_INFO:
6832
+ rc = evms_ioctl_cmd_get_logical_disk_info((void *)arg);
6834
+ case EVMS_SECTOR_IO:
6835
+ rc = evms_ioctl_cmd_sector_io((void *)arg);
6837
+ case EVMS_GET_MINOR:
6838
+ rc = evms_ioctl_cmd_get_minor((void *)arg);
6840
+ case EVMS_GET_VOLUME_DATA:
6841
+ rc = evms_ioctl_cmd_get_volume_data((void *)arg);
6843
+ case EVMS_DELETE_VOLUME:
6844
+ rc = evms_ioctl_cmd_delete_volume(inode, file, arg);
6846
+ case EVMS_GET_PLUGIN:
6847
+ rc = evms_ioctl_cmd_get_plugin((void *)arg);
6849
+ case EVMS_PLUGIN_IOCTL:
6850
+ rc = evms_ioctl_cmd_plugin_ioctl(inode, file, cmd, arg);
6852
+ case EVMS_COMPUTE_CSUM:
6853
+ rc = evms_ioctl_cmd_kernel_partial_csum((void *)arg);
6855
+ case EVMS_PROCESS_NOTIFY_EVENT:
6856
+ rc = evms_ioctl_cmd_process_notify_event(arg);
6863
+ /* process Volume specific commands */
6865
+ /* pick up standard blk ioctls */
6873
+ rc = blk_ioctl(inode->i_rdev, cmd, arg);
6877
+ /* casting size down to 32-bits until
6878
+ * kernel allows return of 64-bit size
6881
+ long size = node->total_vsectors;
6882
+ if (copy_to_user((long *)arg, &size, sizeof(long)))
6886
+ case BLKGETSIZE64:
6888
+ u64 size_in_bytes = node->total_vsectors << EVMS_VSECTOR_SIZE_SHIFT;
6889
+ if (copy_to_user((u64 *)arg, &size_in_bytes, sizeof(u64)))
6893
+ case EVMS_GET_IOCTL_VERSION:
6894
+ rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
6896
+ case EVMS_GET_BMAP:
6897
+ rc = evms_ioctl_cmd_get_bmap(inode, file, cmd, arg);
6900
+ rc = IOCTL(node, inode, file, cmd, arg);
6908
+/************************************************/
6909
+/* END -- IOCTL main */
6910
+/************************************************/
6912
+/************************************************/
6913
+/* START -- CHECK MEDIA CHANGE */
6914
+/************************************************/
6917
+evms_check_media_change(kdev_t dev)
6920
+ evms_logical_volume_t *volume = NULL;
6922
+ /* check user access */
6923
+ if (!capable(CAP_SYS_ADMIN))
6927
+ /* get the minor */
6928
+ minor = MINOR(dev);
6929
+ /* insure this minor points to a valid volume */
6930
+ volume = &evms_logical_volumes[minor];
6931
+ if (volume->node == NULL) {
6936
+ if (volume->flags & EVMS_DEVICE_REMOVABLE) {
6937
+ /* check for media change */
6938
+ rc = evms_cs_kernel_ioctl(
6940
+ EVMS_CHECK_MEDIA_CHANGE,
6941
+ (unsigned long)NULL);
6943
+ LOG_ERROR("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
6944
+ rc, volume->name);
6951
+/************************************************/
6952
+/* END -- CHECK MEDIA CHANGE */
6953
+/************************************************/
6956
+evms_discover_logical_disks(evms_logical_node_t **);
6959
+evms_check_for_device_changes(
6960
+ struct inode *inode,
6961
+ struct file *file)
6963
+ int rc = 0, something_changed = 0, i;
6964
+ evms_rediscover_t kernel_rd_pckt = {0,0,NULL};
6965
+ evms_list_node_t *disk_list = NULL, *lnode, *next_lnode;
6966
+ evms_logical_node_t *disk, *new_device_list = NULL;
6967
+ evms_logical_volume_t *volume = NULL;
6969
+ /* check for new devices
6971
+ * put all new devices on the disk list so they
6972
+ * will be included in the rediscovery process.
6974
+ evms_discover_logical_disks(&new_device_list);
6975
+ if (new_device_list) {
6976
+ LOG_DETAILS("%s: new devices detected.\n", __FUNCTION__);
6977
+ something_changed++;
6978
+ /* put these new nodes on the disk list */
6979
+ while(new_device_list) {
6980
+ disk = new_device_list;
6981
+ rc = evms_cs_remove_logical_node_from_list(
6982
+ &new_device_list,disk);
6984
+ LOG_ERROR("%s: error(%d) removing device(%s) from list.\n",
6985
+ __FUNCTION__, rc, disk->name);
6987
+ rc = evms_cs_add_item_to_list(
6990
+ LOG_ERROR("%s: error(%d) adding device(%s) from list.\n",
6991
+ __FUNCTION__, rc, disk->name);
6996
+ /* check all devices for changed removable media
6998
+ * scan the global device list and issue check
6999
+ * media change on each removable media device.
7000
+ * put all removable devices that indicate a
7001
+ * media change on the disk list.
7003
+ for (lnode = evms_global_device_list; lnode; lnode = lnode->next) {
7004
+ disk = (evms_logical_node_t *)lnode->item;
7005
+ /* only really check removable media devices */
7006
+ if (disk->flags & EVMS_DEVICE_REMOVABLE) {
7007
+ /* check for media change */
7008
+ rc = evms_cs_kernel_ioctl(
7010
+ EVMS_CHECK_MEDIA_CHANGE,
7011
+ (unsigned long)NULL);
7013
+ LOG_ERROR("%s: error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
7014
+ __FUNCTION__, rc, disk->name);
7015
+ } else if (rc == 1) {
7016
+ something_changed++;
7017
+ rc = evms_cs_add_item_to_list(
7018
+ &disk_list, disk);
7022
+ /* log a statement that we detected changed media.
7025
+ LOG_DETAILS("%s: media change detected.\n", __FUNCTION__);
7028
+ /* check for volumes with removed removable media.
7029
+ * mark the volumes that reside on changed media.
7031
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7032
+ volume = &evms_logical_volumes[i];
7033
+ if (!volume->node)
7035
+ if (!(volume->flags & EVMS_DEVICE_REMOVABLE))
7037
+ if (evms_check_media_change(MKDEV(EVMS_MAJOR,i)) <= 0)
7039
+ /* remember which volumes have changed media */
7040
+ volume->flags |= EVMS_MEDIA_CHANGED;
7041
+ something_changed++;
7044
+ /* check for removed hotplug devices */
7046
+ /* do we have some work to do? */
7047
+ if (something_changed) {
7048
+ /* check for volumes to be deleted */
7049
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7050
+ evms_quiesce_volume_t qv;
7052
+ volume = &evms_logical_volumes[i];
7053
+ if (!volume->node)
7055
+ /* only proceed on volumes with:
7057
+ * hot-unplugged devices,
7058
+ * & partial volumes
7060
+ if (!(volume->flags &
7061
+ (EVMS_MEDIA_CHANGED |
7062
+ EVMS_VOLUME_PARTIAL |
7063
+ EVMS_DEVICE_UNPLUGGED)))
7065
+ /* gather the disk's needing to be
7066
+ * rediscovered to rebuild this
7069
+ * this will locate other disks that
7070
+ * the volume resides on that don't
7071
+ * indicate media change.
7073
+ rc = evms_cs_kernel_ioctl(
7075
+ EVMS_GET_DISK_LIST,
7076
+ (unsigned long)&disk_list);
7078
+ LOG_ERROR("%s: error(%d) retrieving underlying disk list for '%s', skipping ...\n",
7079
+ __FUNCTION__, rc, volume->name);
7082
+ /* quiesce all the changed volumes
7083
+ * prior to being deleted.
7085
+ qv.command = 1; // quiesce
7087
+ qv.status = 0; // reset status
7089
+ rc = evms_quiesce_volume(volume, inode, file, &qv);
7091
+ LOG_ERROR("%s: error(%d) attempting to quiesce '%s%s'.\n",
7093
+ EVMS_DEV_NODE_PATH,
7098
+ /* we need to revalidate all the changed
7099
+ * media. this is accomplished by issuing
7100
+ * the revalidate disk ioctl to each device
7101
+ * with changed media. the device manager
7102
+ * remembers which devices indicated
7103
+ * media changed (set by check media
7104
+ * changed ioctl issued earlier), and will
7105
+ * only issue the revalidate disk ioctl to
7106
+ * those disks one time.
7109
+ * this needs to be done BEFORE deleting
7110
+ * the volumes because deleting the
7111
+ * last segment on disk will cause the
7112
+ * associated disk node to freed, and we
7113
+ * will not be able to issue the
7114
+ * revalidate disk ioctl after that.
7116
+ for (lnode = disk_list; lnode; lnode = lnode->next) {
7117
+ disk = (evms_logical_node_t *)lnode->item;
7118
+ /* only really do removable media devices */
7119
+ if (disk->flags & EVMS_MEDIA_CHANGED) {
7120
+ /* go revalidate the change media */
7121
+ rc = evms_cs_kernel_ioctl(
7123
+ EVMS_REVALIDATE_DISK,
7124
+ (unsigned long)NULL);
7128
+ /* delete all the affected volumes */
7129
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7130
+ evms_delete_volume_t dv;
7132
+ volume = &evms_logical_volumes[i];
7133
+ if (!volume->node)
7135
+ /* only proceed on volumes with:
7137
+ * hot-unplugged devices,
7138
+ * & partial volumes
7140
+ if (!(volume->flags &
7141
+ (EVMS_MEDIA_CHANGED |
7142
+ EVMS_VOLUME_PARTIAL |
7143
+ EVMS_DEVICE_UNPLUGGED)))
7145
+ /* only delete quiesced volumes */
7146
+ if (!volume->quiesced)
7148
+ /* delete the volume from memory.
7149
+ * do a 'soft' delete if volume
7150
+ * is mounted, and 'hard' delete
7153
+ * NOTE: the delete operation will
7154
+ * clear the bits in the flags field.
7156
+ dv.command = (is_mounted(MKDEV(EVMS_MAJOR,i))) ? 0 : 1;
7159
+ rc = evms_delete_volume(volume, &dv);
7162
+ /* at this point all devices indicating
7163
+ * media change that had volumes on them
7164
+ * should be gone. however, we could still
7165
+ * have devices indicating media change
7166
+ * that had no volumes on them in the disk
7167
+ * list. we need to delete these devices
7168
+ * from kernel memory and the global device
7171
+ for (lnode = evms_global_device_list; lnode; lnode = next_lnode) {
7172
+ next_lnode = lnode->next;
7174
+ disk = (evms_logical_node_t *)lnode->item;
7175
+ if (disk->flags & EVMS_MEDIA_CHANGED) {
7176
+ rc = DELETE(disk);
7180
+ /* all the devices that indicated media
7181
+ * change should be gone, both from kernel
7182
+ * memory and global device list. we now
7183
+ * need to remove any references to these
7184
+ * devices from the disk list.
7186
+ * when removable media is installed, it
7187
+ * will get detected in the device manager's
7188
+ * rediscovery as a new device and added to
7189
+ * the discover list.
7191
+ for (lnode = disk_list; lnode; lnode = next_lnode) {
7192
+ evms_list_node_t *glnode;
7193
+ int lnode_still_there;
7195
+ next_lnode = lnode->next;
7197
+ lnode_still_there = FALSE;
7198
+ for (glnode = evms_global_device_list;
7199
+ glnode; glnode = glnode->next) {
7200
+ if (glnode->item == lnode->item) {
7201
+ lnode_still_there = TRUE;
7205
+ if (lnode_still_there == FALSE) {
7206
+ rc = evms_cs_remove_item_from_list(
7210
+ LOG_ERROR("%s: error(%d) attempting to remove item(%p) from disk_list(%p).\n",
7211
+ __FUNCTION__, rc, lnode->item, &disk_list);
7216
+ /* build the in-kernel rediscover packet */
7218
+ /* allocate the space for the drive_array in
7219
+ * the evms_rediscover_t packet. to do this
7220
+ * we need to count the number of disk nodes,
7221
+ * then allocate the necessary space.
7223
+ /* count the disk nodes */
7224
+ for (lnode = disk_list; lnode; lnode = lnode->next)
7225
+ kernel_rd_pckt.drive_count++;
7226
+ /* allocate the space */
7227
+ if (kernel_rd_pckt.drive_count) {
7228
+ rc = evms_cs_allocate_memory(
7229
+ (void **)&kernel_rd_pckt.drive_array,
7230
+ kernel_rd_pckt.drive_count *
7231
+ sizeof(unsigned long));
7233
+ LOG_ERROR("%s: error(%d) allocating rediscover drive array.\n",
7234
+ __FUNCTION__, rc);
7237
+ /* populate the drive array
7239
+ * this also frees the disk_list which is useful
7240
+ * if we had an error allocating the drive array.
7242
+ for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) {
7243
+ next_lnode = lnode->next;
7245
+ /* remove this disk from the disk list */
7246
+ disk = (evms_logical_node_t *)lnode->item;
7247
+ rc = evms_cs_remove_item_from_list(&disk_list, disk);
7249
+ /* add this disk to rediscover
7252
+ kernel_rd_pckt.drive_array[i] =
7253
+ (unsigned long)disk ^ EVMS_HANDLE_KEY;
7256
+ /* perform the rediscovery operation */
7258
+ rc = evms_discover_volumes(&kernel_rd_pckt);
7259
+ if (kernel_rd_pckt.drive_count) {
7260
+ evms_cs_deallocate_memory(
7261
+ kernel_rd_pckt.drive_array);
7264
+ LOG_DETAILS("%s: rediscover completed.\n", __FUNCTION__);
7270
+/************************************************/
7271
+/* START -- REVALIDATE DISK */
7272
+/************************************************/
7275
+evms_revalidate_disk(kdev_t dev)
7278
+ evms_logical_volume_t *volume = NULL;
7280
+ /* check user access */
7281
+ if (!capable(CAP_SYS_ADMIN))
7285
+ /* get the minor */
7286
+ minor = MINOR(dev);
7287
+ /* insure this minor points to a valid volume */
7288
+ volume = &evms_logical_volumes[minor];
7289
+ if (volume->node == NULL) {
7294
+ /* go revalidate the change media */
7295
+ rc = evms_cs_kernel_ioctl(
7297
+ EVMS_REVALIDATE_DISK,
7298
+ (unsigned long)NULL);
7303
+/************************************************/
7304
+/* END -- REVALIDATE DISK */
7305
+/************************************************/
7307
+/************************************************/
7308
+/* START -- OPEN */
7309
+/************************************************/
7312
+evms_open(struct inode * inode, struct file * file)
7314
+ int rc = 0, minor = 0;
7315
+ evms_logical_volume_t *volume = NULL;
7317
+ /* check user access */
7318
+ if (!capable(CAP_SYS_ADMIN))
7324
+ rc = evms_check_for_device_changes(inode, file);
7326
+ /* get the minor */
7327
+ minor = MINOR(inode->i_rdev);
7329
+ /* insure this minor points to a valid volume */
7330
+ volume = &evms_logical_volumes[minor];
7331
+ if (volume->node == NULL) {
7336
+ /* go "open" the volume */
7337
+ if (!rc && minor) {
7338
+ rc = IOCTL(volume->node, inode, file,
7340
+ (unsigned long)NULL);
7342
+ LOG_ERROR("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n",
7343
+ rc, volume->name);
7349
+/************************************************/
7351
+/************************************************/
7353
+/************************************************/
7354
+/* START -- RELEASE */
7355
+/************************************************/
7358
+evms_release(struct inode * inode, struct file * file)
7360
+ int rc = 0, minor = 0;
7361
+ evms_logical_volume_t *volume = NULL;
7363
+ /* check user access */
7364
+ if (!capable(CAP_SYS_ADMIN))
7371
+ /* get the minor */
7372
+ minor = MINOR(inode->i_rdev);
7374
+ /* insure this minor points to a valid volume */
7375
+ volume = &evms_logical_volumes[minor];
7376
+ if (volume->node == NULL) {
7381
+ /* go "close" the volume */
7382
+ if (!rc && minor) {
7383
+ rc = IOCTL(volume->node, inode, file,
7384
+ EVMS_CLOSE_VOLUME,
7385
+ (unsigned long)NULL);
7387
+ LOG_ERROR("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n",
7388
+ rc, volume->name);
7394
+/************************************************/
7395
+/* END -- RELEASE */
7396
+/************************************************/
7398
+struct block_device_operations evms_fops = {
7399
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,14)
7400
+ owner: THIS_MODULE,
7403
+ release: evms_release,
7404
+ ioctl: evms_ioctl,
7405
+ check_media_change: evms_check_media_change,
7406
+ revalidate: evms_revalidate_disk
7409
+/**********************************************************/
7410
+/* END -- FOPS functions definitions */
7411
+/**********************************************************/
7413
+/**********************************************************/
7414
+/* START -- RUNTIME support functions */
7415
+/**********************************************************/
7418
+evms_do_request_fn(request_queue_t *q) {
7419
+ LOG_WARNING("This function should not be called.\n");
7423
+static request_queue_t *
7424
+evms_find_queue(kdev_t dev)
7426
+ request_queue_t *rq = NULL;
7427
+ evms_logical_volume_t *volume;
7429
+ volume = &evms_logical_volumes[MINOR(dev)];
7431
+ rq = &volume->request_queue;
7437
+ * Function: evms_make_request_fn
7441
+evms_make_request_fn(
7442
+ request_queue_t *q,
7444
+ struct buffer_head *bh)
7446
+ evms_logical_volume_t *volume;
7449
+ eio.rsector = bh->b_rsector;
7450
+ eio.rsize = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
7453
+ volume = &evms_logical_volumes[MINOR(bh->b_dev)];
7454
+ wait_event(volume->wait_queue, (!volume->quiesced));
7455
+ if (volume->node) {
7459
+ atomic_inc(&volume->requests_in_progress);
7460
+ R_IO(volume->node, &eio);
7461
+ atomic_dec(&volume->requests_in_progress);
7464
+ atomic_inc(&volume->requests_in_progress);
7465
+ W_IO(volume->node, &eio);
7466
+ atomic_dec(&volume->requests_in_progress);
7469
+ buffer_IO_error(bh);
7473
+ LOG_ERROR("request for unknown logical volume [minor(%d)].\n",
7475
+ buffer_IO_error(bh);
7480
+/**********************************************************/
7481
+/* END -- RUNTIME support functions */
7482
+/**********************************************************/
7484
+/**********************************************************/
7485
+/* START -- INIT/DISCOVERY support functions */
7486
+/**********************************************************/
7489
+ * Function: evms_discover_logical_disks
7490
+ * Description: Construct the logical disk list by calling all registered device managers.
7493
+evms_discover_logical_disks(evms_logical_node_t **disk_list)
7495
+ evms_registered_plugin_t * p;
7496
+ LOG_EXTRA("discovering logical disks...\n");
7497
+ for (p = registered_plugin_head; p; p = p->next) {
7498
+ if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) {
7499
+ DISCOVER(p, disk_list);
7505
+ * Function: evms_discover_logical_partitions
7506
+ * Description: Construct the logical partition list by calling all registered partition managers.
7509
+evms_discover_logical_partitions(evms_logical_node_t **discover_list)
7513
+ evms_registered_plugin_t * p;
7514
+ LOG_EXTRA("discovering logical partitions...\n");
7517
+ for (p = registered_plugin_head; p; p = p->next) {
7518
+ if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER) {
7519
+ rc = DISCOVER(p, discover_list);
7520
+ /* RC > 0 means the plugin
7521
+ * added something to the
7522
+ * discover list. This also
7523
+ * means we must loop thru
7524
+ * these plugins another time.
7525
+ * RC == 0 means nothing was
7526
+ * added to the discover list
7528
+ * RC < 0 means the plugin
7529
+ * encountered some error and
7530
+ * nothing was added to the list.
7531
+ * NOTE: If a plugin has both
7532
+ * added something new to the
7533
+ * discover list and encountered
7534
+ * an error, RC > 0 must be
7541
+ } while (done == FALSE);
7543
+ /* send the end of discovery signal to each
7544
+ * partition manager plugin.
7546
+ for (p = registered_plugin_head; p; p = p->next)
7547
+ if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER)
7548
+ if (p->plugin->function_table->end_discover)
7549
+ rc = END_DISCOVER(p, discover_list);
7553
+ * Function: evms_discover_volume_groups
7554
+ * Description: Find volume groups within the logical partitions list
7557
+evms_discover_volume_groups(evms_logical_node_t **discover_list)
7561
+ evms_registered_plugin_t * p;
7562
+ LOG_EXTRA("discovering logical volume groups...\n");
7565
+ for (p = registered_plugin_head; p; p = p->next) {
7566
+ if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) {
7567
+ rc = DISCOVER(p, discover_list);
7568
+ /* RC > 0 means the plugin
7569
+ * added something to the
7570
+ * discover list. This also
7571
+ * means we must loop thru
7572
+ * these plugins another time.
7573
+ * RC == 0 means nothing was
7574
+ * added to the discover list
7576
+ * RC < 0 means the plugin
7577
+ * encountered some error and
7578
+ * nothing was added to the list.
7579
+ * NOTE: If a plugin has both
7580
+ * added something new to the
7581
+ * discover list and encountered
7582
+ * an error, RC > 0 must be
7589
+ } while (done == FALSE);
7591
+ /* send the end of discovery signal to each volume
7594
+ for (p = registered_plugin_head; p; p = p->next)
7595
+ if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER)
7596
+ if (p->plugin->function_table->end_discover)
7597
+ rc = END_DISCOVER(p, discover_list);
7602
+ * convert all the feature header fields into cpu native format
7603
+ * from the on-disk Little Endian format. From this point forward
7604
+ * all plugins can deal with feature headers natively.
7607
+le_feature_header_to_cpu(evms_feature_header_t *fh)
7609
+ fh->signature = le32_to_cpu(fh->signature);
7610
+ fh->crc = le32_to_cpu(fh->crc);
7611
+ fh->version.major = le32_to_cpu(fh->version.major);
7612
+ fh->version.minor = le32_to_cpu(fh->version.minor);
7613
+ fh->version.patchlevel = le32_to_cpu(fh->version.patchlevel);
7614
+ fh->engine_version.major = le32_to_cpu(fh->engine_version.major);
7615
+ fh->engine_version.minor = le32_to_cpu(fh->engine_version.minor);
7616
+ fh->engine_version.patchlevel = le32_to_cpu(fh->engine_version.patchlevel);
7617
+ fh->flags = le32_to_cpu(fh->flags);
7618
+ fh->feature_id = le32_to_cpu(fh->feature_id);
7619
+ fh->sequence_number = le64_to_cpu(fh->sequence_number);
7620
+ fh->alignment_padding = le64_to_cpu(fh->alignment_padding);
7621
+ fh->feature_data1_start_lsn = le64_to_cpu(fh->feature_data1_start_lsn);
7622
+ fh->feature_data1_size = le64_to_cpu(fh->feature_data1_size);
7623
+ fh->feature_data2_start_lsn = le64_to_cpu(fh->feature_data2_start_lsn);
7624
+ fh->feature_data2_size = le64_to_cpu(fh->feature_data2_size);
7625
+ fh->volume_serial_number = le64_to_cpu(fh->volume_serial_number);
7626
+ fh->volume_system_id = le32_to_cpu(fh->volume_system_id);
7627
+ fh->object_depth = le32_to_cpu(fh->object_depth);
7631
+edef_load_feature_header(evms_logical_node_t *node)
7633
+ int i, rc = 0, rc_array[2] = {0,0};
7634
+ unsigned long size_in_bytes;
7635
+ u_int64_t size_in_sectors, starting_sector = 0;
7636
+ evms_feature_header_t *fh = NULL, *fh1 = NULL, *fh2 = NULL;
7637
+ char *location_name = NULL;
7638
+ evms_version_t version = {
7639
+ EVMS_FEATURE_HEADER_MAJOR,
7640
+ EVMS_FEATURE_HEADER_MINOR,
7641
+ EVMS_FEATURE_HEADER_PATCHLEVEL
7644
+ if (!node->feature_header) {
7645
+ size_in_sectors = evms_cs_size_in_vsectors(sizeof(*fh));
7646
+ size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT;
7647
+ rc = evms_cs_allocate_memory((void **)&fh1,size_in_bytes);
7649
+ rc = evms_cs_allocate_memory((void **)&fh2,size_in_bytes);
7651
+ evms_cs_deallocate_memory(fh1);
7653
+ for (i = 0; i < 2; i++) {
7656
+ node->total_vsectors -
7659
+ location_name = evms_primary_string;
7661
+ starting_sector--;
7663
+ location_name = evms_secondary_string;
7665
+ /* read header into buffer */
7673
+ LOG_ERROR("error(%d) probing for %s feature header(at %Ld) on '%s'.\n",
7681
+ /* validate header signature */
7682
+ if (cpu_to_le32(fh->signature) != EVMS_FEATURE_HEADER_SIGNATURE) {
7687
+ /* validate header CRC */
7688
+ if (fh->crc != EVMS_MAGIC_CRC) {
7689
+ u_int32_t org_crc, final_crc;
7690
+ org_crc = cpu_to_le32(fh->crc);
7692
+ final_crc = evms_cs_calculate_crc(
7695
+ if (final_crc != org_crc) {
7696
+ LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at %Ld) on '%s'.\n",
7697
+ org_crc, final_crc,
7706
+ LOG_WARNING("CRC disabled in %s feature header(at %Ld) on '%s'.\n",
7711
+ /* convert the feature header from the
7712
+ * on-disk format (Little Endian) to
7713
+ * native cpu format.
7715
+ le_feature_header_to_cpu(fh);
7716
+ /* verify the system data version */
7717
+ rc = evms_cs_check_version(
7721
+ LOG_ERROR("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n",
7722
+ fh->version.major,
7723
+ fh->version.minor,
7724
+ fh->version.patchlevel,
7731
+ /* getting same return code for both copies? */
7732
+ if (rc_array[0] == rc_array[1]) {
7734
+ /* if no errors on both copies,
7735
+ * check the sequence numbers.
7736
+ * use the highest sequence number.
7739
+ /* compare sequence numbers */
7740
+ if (fh1->sequence_number == fh2->sequence_number) {
7743
+ LOG_WARNING("%s feature header sequence number(%Ld) mismatches %s feature header sequence number(%Ld) on '%s'!\n",
7744
+ evms_primary_string,
7745
+ fh1->sequence_number,
7746
+ evms_secondary_string,
7747
+ fh2->sequence_number,
7749
+ if (fh1->sequence_number > fh2->sequence_number) {
7751
+ location_name = evms_primary_string;
7752
+ /* indicate bad sequence number of secondary */
7756
+ location_name = evms_secondary_string;
7757
+ /* indicate bad sequence number of primary */
7762
+ /* getting different return codes for each copy */
7764
+ /* either primary or secondary copy is
7765
+ * valid, so use the valid copy.
7767
+ if ((rc_array[0] == 0) ||
7768
+ (rc_array[1] == 0)) {
7769
+ char *warn_name = NULL;
7771
+ /* indicate success */
7773
+ /* set variables based on which copy is valid */
7774
+ if (rc_array[0] == 0) {
7775
+ /* use primary (rear) copy if its good */
7777
+ location_name = evms_primary_string;
7778
+ warn_name = evms_secondary_string;
7780
+ /* use secondary (front) copy if its good */
7782
+ location_name = evms_secondary_string;
7783
+ warn_name = evms_primary_string;
7785
+ /* warn the user about the invalid copy */
7786
+ LOG_WARNING("warning: error(%d) probing/verifying the %s feature header on '%s'.\n",
7787
+ rc_array[0] + rc_array[1],
7791
+ /* both copies had a different error,
7792
+ * and one was a fatal error, so
7793
+ * indicate fatal error.
7795
+ if ((rc_array[0] == -EINVAL) ||
7796
+ (rc_array[1] == -EINVAL)) {
7800
+ /* on error, set fh to NULL */
7801
+ if (rc) fh = NULL;
7803
+ /* deallocate metadata buffers appropriately */
7805
+ evms_cs_deallocate_memory(fh1);
7807
+ evms_cs_deallocate_memory(fh2);
7809
+ /* save validated feature header pointer */
7811
+ node->feature_header = fh;
7812
+ if (rc_array[0] != rc_array[1]) {
7813
+ LOG_DETAILS("using %s feature header on '%s'.\n",
7819
+ /* if no signature found, adjust return code */
7820
+ if (rc == -ENODATA) {
7822
+ LOG_DEBUG("no feature header found on '%s'.\n",
7830
+edef_find_first_features(evms_logical_node_t **discover_list)
7833
+ evms_logical_node_t *node, *tmp_list_head;
7835
+ tmp_list_head = *discover_list;
7836
+ *discover_list = NULL;
7838
+ while(tmp_list_head) {
7839
+ node = tmp_list_head;
7840
+ rc = evms_cs_remove_logical_node_from_list(
7844
+ /* load the feature header if present */
7845
+ rc = edef_load_feature_header(node);
7846
+ /* This node have a feature header ?
7847
+ * it won't be if there is no header to load
7849
+ * there was a fatal error attempting to read it.
7851
+ if (node->feature_header) {
7852
+ /* check for object flag */
7853
+ if (node->feature_header->flags &
7854
+ EVMS_VOLUME_DATA_OBJECT) {
7855
+ LOG_DEFAULT("object detected, deleting '%s'.\n",
7859
+ /* check for stop-data flag */
7860
+ if (node->feature_header->flags &
7861
+ EVMS_VOLUME_DATA_STOP) {
7862
+ LOG_DEFAULT("stop data detected, deleting '%s'.\n",
7866
+ /* register node on global list */
7867
+ evms_list_node_t **evms_node;
7869
+ /* check for duplicate pointers */
7870
+ /* search for node in global list */
7871
+ evms_node = evms_lookup_item_in_list(
7872
+ &evms_global_feature_node_list,
7874
+ /* already present? */
7876
+ /* yes, already present */
7877
+ rc = -ENODATA; /* dont process this node further */
7878
+ LOG_DEFAULT("deleting duplicate reference to '%s'.\n",
7880
+ /* forget this node */
7883
+ /* no, not present.
7884
+ * add it to the list.
7886
+ node->flags |= EVMS_VOLUME_FLAG;
7887
+ node->iflags |= EVMS_FEATURE_BOTTOM;
7888
+ rc = evms_cs_allocate_memory(
7889
+ (void **)&node->volume_info,
7890
+ sizeof(evms_volume_info_t));
7892
+ node->volume_info->volume_serial_number =
7893
+ node->feature_header->volume_serial_number;
7894
+ node->volume_info->volume_system_id =
7895
+ node->feature_header->volume_system_id;
7896
+ strcpy(node->volume_info->volume_name,
7897
+ node->feature_header->volume_name);
7898
+ rc = evms_cs_add_item_to_list(
7899
+ &evms_global_feature_node_list,
7905
+ /* if any errors, delete the node */
7910
+ /* on successful processing of this node
7911
+ * place it back on the discover list.
7913
+ evms_cs_add_logical_node_to_list(
7920
+/* These define describe the node types that can be isolated. */
7921
+#define ISOLATE_ASSOCIATIVE_FEATURES 0
7922
+#define ISOLATE_COMPATIBILITY_VOLUMES 1
7923
+#define ISOLATE_EVMS_VOLUMES 2
7924
+#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER 3
7925
+#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH 4
7927
+edef_isolate_nodes_by_type(
7928
+ unsigned int type,
7929
+ evms_logical_node_t **src_list,
7930
+ evms_logical_node_t **trg_list,
7931
+ u_int32_t compare32,
7932
+ u_int64_t compare64)
7934
+ evms_logical_node_t *node, *next_node;
7935
+ int rc = 0, found_node;
7936
+ evms_feature_header_t *fh = NULL;
7938
+ for (node = *src_list; node; node = next_node) {
7939
+ next_node = node->next;
7941
+ if (node->feature_header)
7942
+ fh = node->feature_header;
7943
+ found_node = FALSE;
7945
+ case ISOLATE_ASSOCIATIVE_FEATURES:
7947
+ if (GetPluginType(fh->feature_id) ==
7948
+ EVMS_ASSOCIATIVE_FEATURE)
7949
+ found_node = TRUE;
7952
+ case ISOLATE_COMPATIBILITY_VOLUMES:
7953
+ if (!(node->flags & EVMS_VOLUME_FLAG))
7954
+ found_node = TRUE;
7956
+ case ISOLATE_EVMS_VOLUMES:
7957
+ if (node->flags & EVMS_VOLUME_FLAG)
7958
+ found_node = TRUE;
7960
+ /* EVMS volumes with same serial # */
7961
+ case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER:
7962
+ if (node->volume_info->volume_serial_number == compare64)
7963
+ found_node = TRUE;
7965
+ case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH:
7967
+ if (fh->object_depth == compare64)
7968
+ if (fh->feature_id == compare32)
7969
+ found_node = TRUE;
7972
+ if (found_node == TRUE) {
7973
+ rc = evms_cs_remove_logical_node_from_list(src_list, node);
7975
+ rc = evms_cs_add_logical_node_to_list(trg_list, node);
7983
+edef_apply_feature(
7984
+ evms_logical_node_t *node,
7985
+ evms_logical_node_t **volume_node_list)
7987
+ evms_registered_plugin_t * p;
7990
+ for (p = registered_plugin_head; p; p = p->next) {
7991
+ if (p->plugin->id ==
7992
+ node->feature_header->feature_id) {
7993
+ rc = DISCOVER(p, volume_node_list);
8001
+edef_get_feature_plugin_header(
8003
+ evms_plugin_header_t **header)
8006
+ evms_registered_plugin_t *p;
8008
+ for (p = registered_plugin_head; p; p = p->next) {
8009
+ if (p->plugin->id == id) {
8010
+ *header = p->plugin;
8016
+ LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n", id);
8021
+typedef struct evms_volume_build_info_s {
8023
+ int feature_header_count;
8024
+ int feature_count;
8025
+ int associative_feature_count;
8026
+ u_int64_t max_depth;
8027
+ evms_plugin_header_t *plugin;
8028
+ evms_logical_node_t *feature_node_list;
8029
+} evms_volume_build_info_t;
8032
+ * edef_evaluate_volume_node_list:
8034
+ * 1) put all nodes from feature list back on volume list
8035
+ * 2) loads the node's feature headers
8036
+ * 3) counts the node list's entries
8037
+ * 4) builds the feature node list
8038
+ * 5) counts the feature headers for associative features
8039
+ * 6) sets feature count to >1 if >1 features to be processed
8042
+edef_evaluate_volume_node_list(
8043
+ evms_logical_node_t **volume_node_list,
8044
+ evms_volume_build_info_t *vbi,
8045
+ int volume_complete)
8048
+ evms_logical_node_t *node;
8051
+ vbi->feature_count =
8052
+ vbi->associative_feature_count =
8053
+ vbi->max_depth = 0;
8054
+ vbi->plugin = NULL;
8056
+ /* put all feature nodes back on the volume list */
8057
+ rc = edef_isolate_nodes_by_type(
8058
+ ISOLATE_EVMS_VOLUMES,
8059
+ &vbi->feature_node_list,
8062
+ if (rc) return(rc);
8064
+ /* load all the feature headers */
8065
+ if (!volume_complete) {
8066
+ for(node = *volume_node_list; node; node = node->next) {
8067
+ rc = edef_load_feature_header(node);
8068
+ if (rc) return(rc);
8072
+ /* find the 1st max depth object:
8073
+ * record the depth
8074
+ * record the plugin
8076
+ for(node = *volume_node_list; node; node = node->next) {
8077
+ evms_plugin_header_t *plugin;
8078
+ evms_feature_header_t *fh = node->feature_header;
8080
+ /* count the nodes */
8081
+ vbi->node_count++;
8083
+ /* no feature header found, continue to next node */
8084
+ if (!fh) continue;
8086
+ /* check the depth */
8087
+ if (fh->object_depth > vbi->max_depth) {
8088
+ /* record new max depth */
8089
+ vbi->max_depth = fh->object_depth;
8090
+ /* find the plugin header for this feature id */
8091
+ rc = edef_get_feature_plugin_header(
8094
+ if (rc) return(rc);
8095
+ /* check for >1 plugins */
8096
+ if (vbi->plugin != plugin) {
8097
+ vbi->feature_count++;
8098
+ vbi->plugin = plugin;
8101
+ /* check for "associative" feature indicator */
8102
+ if (GetPluginType(vbi->plugin->id) ==
8103
+ EVMS_ASSOCIATIVE_FEATURE)
8104
+ vbi->associative_feature_count++;
8106
+ /* build a list of max depth nodes for this feature */
8107
+ if (vbi->max_depth) {
8108
+ rc = edef_isolate_nodes_by_type(
8109
+ ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH,
8111
+ &vbi->feature_node_list,
8114
+ if (rc) return(rc);
8117
+ if (!vbi->feature_node_list)
8124
+/* function: edef_check_feature_conditions
8126
+ * This routine verifies the state of volume based on the features
8127
+ * headers and nodes in the current discovery list. All detected
8128
+ * errors are considered fatal.
8131
+edef_check_feature_conditions(evms_volume_build_info_t *vbi)
8135
+ if (vbi->associative_feature_count) {
8136
+ if (vbi->node_count > 1) {
8137
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8138
+ LOG_ERROR("associative ERROR: > 1 nodes(%d) remaining to be processed!\n",
8140
+ } else if (vbi->max_depth != 1) {
8141
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8142
+ LOG_ERROR("associative ERROR: associative feature found at node depth(%Ld) != 1!\n",
8145
+ rc = -EVMS_ASSOCIATIVE_FEATURE;
8148
+ if (!vbi->max_depth) {
8149
+ if (vbi->node_count > 1) {
8150
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8151
+ LOG_ERROR("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n",
8154
+ } else if (vbi->max_depth == 1) {
8155
+ if (vbi->feature_count > 1) {
8156
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8157
+ LOG_ERROR("max depth 1 ERROR: > 1 features remaining to be processed!\n");
8164
+/* function: edef_apply_features
8166
+ * This routine applies none, one, or more features to an EVMS
8167
+ * volume. The system data structure is first verified and then
8168
+ * features are applied and verified recursively until the
8169
+ * entire volume has been constructed. Fatal errors result in
8170
+ * all nodes in the volume discovery list being deleted.
8173
+edef_apply_features(evms_logical_node_t **volume_node_list)
8175
+ int rc = 1, done, top_feature_applying;
8176
+ evms_volume_build_info_t vbi;
8178
+ vbi.feature_node_list = NULL;
8179
+ rc = edef_evaluate_volume_node_list(
8183
+ /* this loop should ONLY get used when
8184
+ * there are features to process.
8186
+ done = (rc) ? TRUE : FALSE;
8188
+ rc = edef_check_feature_conditions(&vbi);
8190
+ top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE;
8191
+ rc = vbi.plugin->function_table->
8192
+ discover(&vbi.feature_node_list);
8194
+ rc = edef_evaluate_volume_node_list(
8196
+ &vbi, top_feature_applying);
8197
+ if (top_feature_applying == TRUE) {
8198
+ if (vbi.node_count > 1) {
8199
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8200
+ LOG_ERROR("ERROR: detected > 1 node at volume completion!\n");
8204
+ if (!vbi.plugin) {
8205
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8206
+ LOG_ERROR("ERROR: depth(%Ld): expected another feature!\n",
8211
+ } else { /* rc != 0 */
8212
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8217
+ /* put all feature nodes back on the volume list */
8218
+ if (edef_isolate_nodes_by_type(
8219
+ ISOLATE_EVMS_VOLUMES,
8220
+ &vbi.feature_node_list,
8229
+ evms_logical_node_t **node_list,
8230
+ evms_logical_node_t *node,
8236
+ rc = evms_cs_remove_logical_node_from_list(node_list, node);
8238
+ LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n",
8239
+ log_text, return_code,
8240
+ node->volume_info->volume_name,
8242
+ rc = DELETE(node);
8244
+ LOG_ERROR("error(%d) while deleting node(%s)\n",
8248
+ LOG_WARNING("%s error(%d): node gone, assumed deleted by plugin.\n",
8249
+ log_text, return_code);
8250
+ /* plugin must have cleaned up the node.
8251
+ * So just reset the return code and leave.
8260
+edef_process_evms_volumes(
8261
+ evms_logical_node_t **discover_list,
8262
+ evms_logical_node_t **associative_feature_list)
8265
+ evms_logical_node_t *node, *evms_volumes_list, *volume_node_list;
8266
+ u_int64_t volume_sn;
8268
+ /* put all EVMS volumes on their own list */
8269
+ evms_volumes_list = NULL;
8270
+ rc = edef_isolate_nodes_by_type(
8271
+ ISOLATE_EVMS_VOLUMES,
8273
+ &evms_volumes_list,
8276
+ /* apply features to each EVMS volume */
8277
+ /* one volume at a time on each pass */
8278
+ while (evms_volumes_list) {
8279
+ node = evms_volumes_list;
8280
+ /* put all nodes for one EVMS volume on separate list */
8281
+ volume_node_list = NULL;
8282
+ volume_sn = node->volume_info->volume_serial_number;
8283
+ rc = edef_isolate_nodes_by_type(
8284
+ ISOLATE_EVMS_VOLUME_SERIAL_NUMBER,
8285
+ &evms_volumes_list,
8286
+ &volume_node_list,
8289
+ /* go apply all the volume features now */
8290
+ rc = edef_apply_features(&volume_node_list);
8292
+ case 0: /* SUCCESS */
8293
+ /* remove volume just processed */
8294
+ node = volume_node_list;
8295
+ rc = evms_cs_remove_logical_node_from_list(&volume_node_list, node);
8297
+ /* put volume on global list */
8298
+ rc = evms_cs_add_logical_node_to_list(discover_list, node);
8300
+ case -EVMS_ASSOCIATIVE_FEATURE:
8301
+ /* put all "associative" features on their own list */
8302
+ rc = edef_isolate_nodes_by_type(
8303
+ ISOLATE_ASSOCIATIVE_FEATURES,
8304
+ &volume_node_list,
8305
+ associative_feature_list,
8308
+ default:/* FATAL ERROR */
8309
+ /* delete each node remaining in the list */
8310
+ if (volume_node_list) {
8311
+ LOG_ERROR("encountered fatal error building volume '%s'\n",
8312
+ volume_node_list->volume_info->volume_name);
8314
+ while(volume_node_list) {
8315
+ node = volume_node_list;
8317
+ &volume_node_list,
8330
+edef_process_associative_volumes(
8331
+ evms_logical_node_t **associative_feature_list,
8332
+ evms_logical_node_t **discover_list)
8335
+ evms_logical_node_t *node;
8337
+ while (*associative_feature_list) {
8338
+ node = *associative_feature_list;
8339
+ /* remove this node from associative feature list */
8340
+ rc = evms_cs_remove_logical_node_from_list(associative_feature_list, node);
8342
+ /* put volume on global list */
8343
+ rc = evms_cs_add_logical_node_to_list(discover_list, node);
8345
+ rc = edef_load_feature_header(node);
8347
+ rc = edef_apply_feature(node, discover_list);
8350
+ discover_list, node, rc,
8351
+ "Associative feature");
8357
+edef_check_for_incomplete_volumes(
8358
+ evms_logical_node_t **discover_list)
8361
+ evms_logical_node_t *next_node, *node;
8363
+ /* check to see if any incomplete volumes are left around */
8364
+ /* if so, delete them. */
8365
+ /* complete volumes should not have feature_headers */
8366
+ /* hanging off them, if we find any, we know the volume */
8367
+ /* is incomplete. */
8369
+ for (node = *discover_list; node; node = next_node) {
8370
+ next_node = node->next;
8372
+ if (node->feature_header) {
8374
+ discover_list, node, rc,
8375
+ "Unexpected feature header");
8382
+ * Function: evms_discover_evms_features
8383
+ * Description: Find features for nodes on the logical partitions list
8386
+evms_discover_evms_features(evms_logical_node_t **discover_list)
8388
+ evms_logical_node_t *associative_feature_list;
8391
+ LOG_EXTRA("discovering evms volume features...\n");
8393
+ /* initialize "associative" features list */
8394
+ associative_feature_list = NULL;
8396
+ /* find the bottom features */
8397
+ rc = edef_find_first_features(discover_list);
8399
+ /* process EVMS volumes here */
8400
+ rc = edef_process_evms_volumes(discover_list, &associative_feature_list);
8402
+ /* process "associative" features here */
8403
+ rc = edef_process_associative_volumes(
8404
+ &associative_feature_list, discover_list);
8406
+ /* check for incomplete volumes */
8407
+ rc = edef_check_for_incomplete_volumes(discover_list);
8413
+ * function: eelv_assign_volume_minor
8415
+ * This is a support function for evms_export_logical_volumes.
8416
+ * This routine assigns a specific minor number to a volume. It
8417
+ * also performs the remaining steps to make this volume visible
8418
+ * and usable to the kernel.
8422
+eelv_assign_volume_minor(evms_logical_node_t *node, int minor)
8424
+ evms_logical_volume_t *volume;
8427
+ /* initialize the logical_node entry in the volume array */
8428
+ volume = &evms_logical_volumes[minor];
8429
+ volume->node = node;
8430
+ rc = evms_cs_allocate_memory((void **)&volume->name,
8431
+ strlen(EVMS_GET_NODE_NAME(node)) + 1);
8433
+ strcpy(volume->name, EVMS_GET_NODE_NAME(node));
8435
+ /* copy flags from top level node into volume structure */
8436
+ volume->flags = node->flags;
8438
+ /* check for read-only volume */
8439
+ if ( volume->flags & EVMS_VOLUME_READ_ONLY ) {
8440
+ set_device_ro(MKDEV(EVMS_MAJOR, minor),1);
8443
+ /* initialize the global device arrays */
8444
+ blksize_size[EVMS_MAJOR][minor] = node->block_size;
8445
+ hardsect_size[EVMS_MAJOR][minor] = node->hardsector_size;
8446
+ blk_size[EVMS_MAJOR][minor] = (int)(node->total_vsectors >> 1);
8448
+ /* register this volume with devfs */
8449
+ volume->devfs_handle =
8450
+ devfs_register(evms_dir_devfs_handle,
8453
+ EVMS_MAJOR, minor,
8454
+ S_IFBLK | S_IRUGO | S_IWUGO,
8455
+ &evms_fops, NULL);
8459
+ LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n",
8460
+ EVMS_MAJOR, minor,
8461
+ EVMS_DEV_NODE_PATH, volume->name);
8465
+ * function: eelv_check_for_duplicity
8467
+ * This is a support function for evms_export_logical_volumes.
8468
+ * This routine compares the serial number in the top most node
8469
+ * in the volume to the list of currently exported volumes. If
8470
+ * this volume's serial number is found in the list then we know
8471
+ * this volume is a duplicate and it is then deleted.
8475
+eelv_check_for_duplicity(evms_logical_node_t **discover_list)
8477
+ evms_logical_node_t *next_node, *node;
8478
+ evms_logical_volume_t *lv;
8481
+ for (node = *discover_list; node; node = next_node) {
8482
+ next_node = node->next;
8485
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8486
+ lv = &evms_logical_volumes[i];
8487
+ /* only check exported volumes */
8489
+ char *type_ptr = NULL;
8491
+ /* check for duplicate pointer */
8492
+ if (node == lv->node) {
8494
+ type_ptr = "pointer";
8495
+ /* check for duplicate node */
8496
+ } else if (!strcmp(node->name,
8497
+ lv->node->name)) {
8499
+ type_ptr = "node";
8501
+ if (is_dup == TRUE) {
8502
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8503
+ LOG_DEFAULT("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n",
8506
+ EVMS_GET_NODE_NAME(node));
8507
+ /* forget duplicate */
8516
+ * function: eelv_reassign_soft_deleted_volume_minors
8518
+ * This is a support function for evms_export_logical_volumes.
8519
+ * This routine reassigns minor numbers to rediscovered "soft"
8520
+ * deleted volumes.
8524
+eelv_reassign_soft_deleted_volume_minors(evms_logical_node_t **discover_list)
8526
+ evms_logical_node_t *next_node, *node;
8527
+ evms_logical_volume_t *lv;
8528
+ int i, node_removed;
8530
+ for (node = *discover_list; node; node = next_node) {
8531
+ next_node = node->next;
8533
+ node_removed = FALSE;
8534
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8535
+ lv = &evms_logical_volumes[i];
8536
+ /* only check soft deleted volumes:
8537
+ * they have a non-NULL name.
8539
+ if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
8540
+ if (!strcmp(EVMS_GET_NODE_NAME(node),lv->name)) {
8541
+ /* reassign requested minor */
8542
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8543
+ node_removed = TRUE;
8544
+ LOG_DEFAULT("Re");
8545
+ /* free the previously used name */
8546
+ evms_cs_deallocate_memory(lv->name);
8548
+ /* clear the EVMS_VOLUME_SOFT_DELETED flag */
8550
+ eelv_assign_volume_minor(node, i);
8559
+ * function: eelv_assign_evms_volume_minors
8561
+ * This is a support function for evms_export_logical_volumes.
8562
+ * This routine assigns minor numbers to new evms volumes. If
8563
+ * the specified minor is already in use, the requested minor
8564
+ * is set to 0, and will be assigned next available along with
8565
+ * any remaining volumes at the end of evms_export_logical_volumes.
8569
+eelv_assign_evms_volume_minors(evms_logical_node_t **discover_list)
8571
+ evms_logical_node_t *next_node, *node, *lv_node;
8572
+ unsigned int requested_minor, node_removed;
8574
+ for (node = *discover_list; node; node = next_node) {
8575
+ next_node = node->next;
8577
+ node_removed = FALSE;
8578
+ /* only process evms volumes */
8579
+ if (node->flags & EVMS_VOLUME_FLAG) {
8580
+ requested_minor = node->volume_info->volume_system_id;
8581
+ /* is there a requested minor? */
8582
+ if (requested_minor) {
8585
+ /* check range of requested minor */
8586
+ if (requested_minor >= MAX_EVMS_VOLUMES)
8589
+ evms_logical_volume_t *lv;
8590
+ lv = &evms_logical_volumes[requested_minor];
8591
+ lv_node = lv->node;
8592
+ lv_flags = lv->flags;
8594
+ if ( (!lv_node) && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED)) ) {
8595
+ /* assign requested minor */
8596
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8597
+ node_removed = TRUE;
8598
+ eelv_assign_volume_minor(node, requested_minor);
8600
+ LOG_WARNING("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n",
8601
+ node->volume_info->volume_name,
8604
+ * requested minor is already
8605
+ * in use, defer assignment
8608
+ node->volume_info->volume_system_id = 0;
8616
+ * function: eelv_assign_remaining_evms_volume_minors
8618
+ * This is a support function for evms_export_logical_volumes.
8619
+ * This routine assigns minor numbers to new evms volumes that
8620
+ * have no/conflicting minor assignments. This function will
8621
+ * search from high(255) minor values down, for the first available
8622
+ * minor. Searching high to low minimizes the possibility of
8623
+ * conflicting evms volumes causing "compatibility" minor
8624
+ * assignments to shift from expected assignments.
8628
+eelv_assign_remaining_evms_volume_minors(
8629
+ evms_logical_node_t **discover_list)
8631
+ evms_logical_node_t *next_node, *node;
8632
+ int requested_minor, node_removed;
8634
+ for (node = *discover_list; node; node = next_node) {
8635
+ next_node = node->next;
8637
+ node_removed = FALSE;
8638
+ /* only process evms volumes */
8639
+ /* all remaining evms volumes should now
8640
+ * have a minor value of 0, meaning they
8641
+ * had no minor assignment, or their minor
8642
+ * assignment conflicted with an existing
8643
+ * minor assignment.
8645
+ if (node->flags & EVMS_VOLUME_FLAG) {
8646
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8647
+ node_removed = TRUE;
8648
+ /* find next available minor number */
8649
+ for (requested_minor = 255;
8650
+ (evms_logical_volumes[requested_minor].node ||
8651
+ evms_logical_volumes[requested_minor].name) &&
8653
+ requested_minor--);
8654
+ /* check range of assigned minor */
8655
+ if (!requested_minor) {
8656
+ LOG_CRITICAL("no more minor numbers available for evms volumes!!!!\n");
8659
+ /* assign requested minor */
8660
+ eelv_assign_volume_minor(node, requested_minor);
8666
+ * function: eelv_assign_remaining_volume_minors
8668
+ * This is a support function for evms_export_logical_volumes.
8669
+ * This routine assigns minor numbers to all remaining unassigned
8670
+ * volumes. Minor numbers are assigned on an availability
8671
+ * basis. The first free minor number is used in the assignment.
8675
+eelv_assign_remaining_volume_minors(
8676
+ evms_logical_node_t **discover_list)
8678
+ evms_logical_node_t *node;
8681
+ while(*discover_list) {
8682
+ node = *discover_list;
8683
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8685
+ /* find next available minor number */
8687
+ (evms_logical_volumes[minor].node ||
8688
+ evms_logical_volumes[minor].name) &&
8689
+ minor < MAX_EVMS_VOLUMES;
8692
+ if (minor >= MAX_EVMS_VOLUMES) {
8693
+ LOG_CRITICAL("no more minor numbers available for compatibility volumes!!!!\n");
8696
+ /* assign minor */
8697
+ eelv_assign_volume_minor(node, minor);
8702
+ * function: eelv_check_for_unreassign_soft_deleted_volume
8704
+ * This is a support function for evms_export_logical_volumes.
8705
+ * This routine reports any "soft deleted" volumes that were not
8706
+ * found after a rediscovery.
8709
+eelv_check_for_unreassign_soft_deleted_volume(void)
8711
+ evms_logical_volume_t *lv;
8714
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8715
+ lv = &evms_logical_volumes[i];
8716
+ /* only check soft deleted volumes:
8717
+ * they have a NULL node ptr &
8718
+ * they have a non-NULL name.
8720
+ if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
8721
+ if (get_super(MKDEV(EVMS_MAJOR, i)))
8722
+ lv->flags |= EVMS_VOLUME_CORRUPT;
8723
+ LOG_ERROR("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n",
8724
+ ((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"),
8727
+ if (lv->flags & EVMS_VOLUME_CORRUPT) {
8728
+ LOG_ERROR(" flagging volume(%u,%u,%s) as CORRUPT!\n",
8732
+ LOG_ERROR(" releasing minor(%d) used by volume(%s)!\n",
8734
+ /* clear logical volume structure
8735
+ * for this volume so it may be
8738
+ evms_cs_deallocate_memory(lv->name);
8747
+eelv_unquiesce_volumes(void)
8751
+ /* check each volume array entry */
8752
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8753
+ evms_logical_volume_t *volume;
8755
+ volume = &evms_logical_volumes[i];
8756
+ /* is this volume "quiesced" ? */
8757
+ if (volume->quiesced) {
8759
+ if (volume->node) {
8760
+ /* "unquiesce" it */
8761
+ struct inode inode;
8762
+ evms_quiesce_volume_t qv;
8764
+ qv.command = qv.status = 0;
8767
+ rc = evms_quiesce_volume(volume, &inode, NULL, &qv);
8769
+ /* Wake up any waiters */
8771
+ /* clear the flag */
8772
+ volume->quiesced = 0;
8773
+ /* wake up the waiters */
8774
+ if (waitqueue_active(&volume->wait_queue))
8775
+ wake_up(&volume->wait_queue);
8776
+#ifdef VFS_PATCH_PRESENT
8777
+ /* unquiesce VFS if quiesced */
8778
+ if (volume->vfs_quiesced) {
8779
+ /* VFS function call to unlock the filesystem */
8780
+ unlockfs(MKDEV(EVMS_MAJOR, i));
8781
+ volume->vfs_quiesced = FALSE;
8790
+ * Function: evms_export_logical_volumes
8792
+ * This function is called from evms_discover_volumes. It
8793
+ * checks for duplicate volumes, assigns minor values to evms
8794
+ * volumes, and assigns minor values to the remaining volumes.
8795
+ * In addition to assigning minor values to each volume this
8796
+ * function also completes the final steps necessary to allow
8797
+ * the volumes to be used by the operating system.
8800
+evms_export_logical_volumes(evms_logical_node_t **discover_list)
8802
+ LOG_EXTRA("exporting EVMS logical volumes...\n");
8804
+ eelv_check_for_duplicity(discover_list);
8806
+ eelv_reassign_soft_deleted_volume_minors(discover_list);
8808
+ eelv_assign_evms_volume_minors(discover_list);
8810
+ eelv_assign_remaining_evms_volume_minors(discover_list);
8812
+ eelv_assign_remaining_volume_minors(discover_list);
8814
+ eelv_check_for_unreassign_soft_deleted_volume();
8816
+ /* "unquiesce" any "quiesced" volumes */
8817
+ eelv_unquiesce_volumes();
8821
+edv_populate_discover_list(
8822
+ evms_list_node_t *src_list,
8823
+ evms_logical_node_t **trg_list,
8824
+ evms_rediscover_t *discover_parms)
8826
+ int rc = 0, i, move_node, use_all_disks = FALSE;
8827
+ evms_list_node_t *src_node;
8830
+ /* if no discover parameters are specified */
8831
+ /* copy ALL the disk nodes into the */
8832
+ /* discovery list. */
8833
+ if ((discover_parms == NULL) ||
8834
+ (discover_parms->drive_count == REDISCOVER_ALL_DEVICES))
8835
+ use_all_disks = TRUE;
8837
+ /* copy the disk nodes specified in the */
8838
+ /* discover_parms over to a discover list */
8839
+ src_node = src_list;
8841
+ move_node = use_all_disks;
8842
+ if (move_node == FALSE)
8843
+ /* check the rediscovery array */
8844
+ for (i = 0; i < discover_parms->drive_count; i++)
8845
+ if (discover_parms->drive_array[i] == ((unsigned long)src_node->item ^ EVMS_HANDLE_KEY)) {
8849
+ /* check to see if we want this node */
8850
+ if (move_node == TRUE)
8851
+ evms_cs_add_logical_node_to_list(
8853
+ (evms_logical_node_t *)src_node->item);
8854
+ /* advance to next evms_list_node_t */
8855
+ src_node = src_node->next;
8861
+evms_discover_volumes(evms_rediscover_t *discover_parms)
8864
+ evms_logical_node_t *discover_list = NULL;
8866
+ evms_discover_logical_disks(&discover_list);
8867
+ if (evms_global_device_list) {
8868
+ /* move the appropriate disk nodes, based on */
8869
+ /* on the discover parameters, onto the */
8870
+ /* discover list for the partition managers */
8872
+ edv_populate_discover_list(
8873
+ evms_global_device_list,
8874
+ &discover_list, discover_parms);
8876
+ if (discover_list) {
8877
+ evms_discover_logical_partitions(&discover_list);
8879
+ if (discover_list) {
8880
+ evms_discover_volume_groups(&discover_list);
8882
+ if (discover_list) {
8883
+ evms_discover_evms_features(&discover_list);
8885
+ if (discover_list) {
8886
+ evms_export_logical_volumes(&discover_list);
8887
+ evms_cs_signal_event(EVMS_EVENT_END_OF_DISCOVERY);
8893
+ * Function: find_root_fs_dev
8894
+ * If "root=/dev/evms/???" was specified on the kernel command line, and devfs
8895
+ * is not enabled, we need to determine the appropriate minor number for the
8896
+ * specified volume for the root fs.
8898
+static void find_root_fs_dev(void)
8903
+ if ( ! strncmp(root_device_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME)+1) ) {
8904
+ name = &root_device_name[strlen(EVMS_DIR_NAME)+1];
8906
+ for ( i = 1; i <= MAX_EVMS_VOLUMES; i++ ) {
8907
+ if ( evms_logical_volumes[i].name &&
8908
+ ! strncmp(name, evms_logical_volumes[i].name, strlen(evms_logical_volumes[i].name)) ) {
8909
+ ROOT_DEV = MKDEV(EVMS_MAJOR,i);
8917
+ * Function: io_notify_cache_ctor
8918
+ * this function zero-initializes the io_notify_t entries
8919
+ * in our private io_notify pool.
8922
+io_notify_cache_ctor(
8924
+ kmem_cache_t * cachep,
8925
+ unsigned long flags)
8927
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
8928
+ SLAB_CTOR_CONSTRUCTOR)
8930
+ io_notify_t *io_notify = (io_notify_t *)foo;
8931
+ memset(io_notify, 0, sizeof(*io_notify));
8936
+ * Function: bh_cache_ctor
8937
+ * this function initializes the b_wait field in the buffer heads
8938
+ * in our private buffer head pool.
8943
+ kmem_cache_t * cachep,
8944
+ unsigned long flags)
8946
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
8947
+ SLAB_CTOR_CONSTRUCTOR)
8949
+ struct buffer_head *bh = (struct buffer_head *)foo;
8950
+ memset(bh, 0, sizeof(*bh));
8951
+ init_waitqueue_head(&bh->b_wait);
8956
+ * Function: evms_init_module
8957
+ * This function runs once at system initialization.
8960
+evms_init_module (void)
8963
+ int *evms_blocksizes;
8965
+ LOG_DEFAULT("EVMS v%d.%d.%d initializing .... info level(%d).\n",
8966
+ EVMS_MAJOR_VERSION,
8967
+ EVMS_MINOR_VERSION,
8968
+ EVMS_PATCHLEVEL_VERSION,
8971
+ /* initialize memory management counters */
8972
+ atomic_set(&evms_allocs,0);
8973
+ atomic_set(&evms_logical_nodes,0);
8975
+ /* initialize the io_notify_entry pool */
8977
+ evms_io_notify_pool = evms_cs_create_pool(
8978
+ sizeof(io_notify_t),
8980
+ io_notify_cache_ctor,
8983
+ /* initialize the "public" buffer_head pool */
8985
+ evms_bh_pool = evms_cs_create_pool(
8986
+ sizeof(struct buffer_head),
8991
+ /* allocate the logical volume array */
8993
+ rc = evms_cs_allocate_memory(
8994
+ (void **)&evms_logical_volumes,
8995
+ sizeof(evms_logical_volume_t) * MAX_EVMS_VOLUMES);
8997
+ /* initialize the logical volume array entries */
8999
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
9000
+ evms_logical_volume_t *volume;
9002
+ volume = &evms_logical_volumes[i];
9003
+ init_waitqueue_head(&volume->wait_queue);
9005
+ blk_init_queue(&volume->request_queue,
9006
+ evms_do_request_fn);
9007
+ blk_queue_make_request(&volume->request_queue,
9008
+ evms_make_request_fn);
9012
+ /* allocate EVMS' blk_size array */
9014
+ rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9016
+ LOG_CRITICAL("can't allocate memory for EVMS blk_size\n");
9017
+ } else blk_size[EVMS_MAJOR] = evms_blocksizes;
9020
+ /* allocate EVMS' blksize_size array */
9022
+ rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9024
+ LOG_CRITICAL("can't allocate memory for EVMS blksize_size\n");
9025
+ } else blksize_size[EVMS_MAJOR] = evms_blocksizes;
9028
+ /* allocate EVMS' hardsect_size array */
9030
+ rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9032
+ LOG_CRITICAL("can't allocate memory for EVMS hardsect_size\n");
9033
+ } else hardsect_size[EVMS_MAJOR] = evms_blocksizes;
9036
+ /* Register the block device */
9038
+ rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME, &evms_fops);
9040
+ LOG_CRITICAL("error calling devfs_register_blkdev() err=%u\n", rc);
9045
+ /* Register with devfs */
9047
+ evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL);
9048
+ // A NULL return cannot be fatal.
9049
+ // Devfs just might not be running
9050
+ if ( ! evms_dir_devfs_handle ) {
9051
+ LOG_EXTRA("NULL return from devfs_mk_dir() for \"%s\"\n", EVMS_DIR_NAME);
9052
+ LOG_EXTRA("Is devfs enabled?\n");
9055
+ evms_blk_devfs_handle = devfs_register(evms_dir_devfs_handle,
9059
+ S_IFBLK | S_IRUGO | S_IWUGO,
9060
+ &evms_fops, NULL);
9061
+ if ( ! evms_blk_devfs_handle ) {
9062
+ LOG_DETAILS("NULL return from devfs_register() for \"%s\"\n", EVMS_DEV_NAME);
9068
+ read_ahead[EVMS_MAJOR] = 4096;
9070
+ blk_dev[EVMS_MAJOR].queue = evms_find_queue;
9072
+ blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_do_request_fn);
9073
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_make_request_fn);
9075
+#ifdef CONFIG_PROC_FS
9076
+ evms_cs_get_evms_proc_dir();
9077
+ if (evms_proc_dir) {
9078
+ create_proc_read_entry("info", 0, evms_proc_dir, evms_info_read_proc, NULL);
9079
+ create_proc_read_entry("plugins", 0, evms_proc_dir, evms_plugins_read_proc, NULL);
9080
+ create_proc_read_entry("volumes", 0, evms_proc_dir, evms_volumes_read_proc, NULL);
9082
+ evms_table_header = register_sysctl_table(dev_dir_table, 1);
9090
+ * Function: evms_exit_module
9091
+ * This function runs once at module unload.
9094
+evms_exit_module (void)
9098
+ LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n",
9099
+ EVMS_MAJOR_VERSION,
9100
+ EVMS_MINOR_VERSION,
9101
+ EVMS_PATCHLEVEL_VERSION);
9103
+ /* ensure no EVMS volumes exist
9105
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
9106
+ if (evms_logical_volumes[i].node) {
9107
+ LOG_ERROR("volume(%d,%d,%s) still exists.\n",
9109
+ evms_logical_volumes[i].name);
9114
+ LOG_ERROR("unable to unload until no volumes exist!\n");
9117
+ /* ensure no plugins are loaded.
9119
+ evms_registered_plugin_t *p;
9120
+ int found = FALSE;
9122
+ for (p = registered_plugin_head; p; p = p->next) {
9124
+ LOG_ERROR("plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d still loaded.\n",
9125
+ GetPluginOEM(p->plugin->id),
9126
+ GetPluginType(p->plugin->id),
9127
+ GetPluginID(p->plugin->id),
9128
+ p->plugin->version.major,
9129
+ p->plugin->version.minor,
9130
+ p->plugin->version.patchlevel);
9133
+ LOG_ERROR("unable to unload while plugins still loaded!\n");
9137
+ /* unregister with devfs
9139
+ devfs_unregister(evms_dir_devfs_handle);
9140
+ /* clean up the queue for the block device
9142
+ blk_cleanup_queue(blk_get_queue(MKDEV(EVMS_MAJOR,0)));
9143
+ /* unregister block device
9145
+ rc = devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME);
9148
+ /* deallocate device arrays
9150
+ evms_cs_deallocate_memory(blk_size[EVMS_MAJOR]);
9151
+ blk_size[EVMS_MAJOR] = NULL;
9152
+ evms_cs_deallocate_memory(blksize_size[EVMS_MAJOR]);
9153
+ blksize_size[EVMS_MAJOR] = NULL;
9154
+ evms_cs_deallocate_memory(hardsect_size[EVMS_MAJOR]);
9155
+ hardsect_size[EVMS_MAJOR] = NULL;
9156
+ read_ahead[EVMS_MAJOR] = 0;
9157
+ /* deallocate logical volumes array
9159
+ evms_cs_deallocate_memory(evms_logical_volumes);
9160
+ /* destroy buffer head pool
9162
+ evms_cs_destroy_pool(evms_bh_pool);
9163
+ /* destroy io notify pool
9165
+ evms_cs_destroy_pool(evms_io_notify_pool);
9166
+#ifdef CONFIG_PROC_FS
9167
+ if (evms_proc_dir) {
9168
+ remove_proc_entry("volumes", evms_proc_dir);
9169
+ remove_proc_entry("plugins", evms_proc_dir);
9170
+ remove_proc_entry("info", evms_proc_dir);
9171
+ remove_proc_entry("evms", NULL);
9173
+ unregister_sysctl_table(evms_table_header);
9179
+ * Function: evms_init_discover
9180
+ * If EVMS is statically built into the kernel, this function will be called
9181
+ * to perform an initial volume discovery.
9184
+evms_init_discover (void)
9186
+ /* go find volumes */
9187
+ evms_discover_volumes(NULL);
9189
+ /* Check if the root fs is on EVMS */
9190
+ if ( MAJOR(ROOT_DEV) == EVMS_MAJOR ) {
9191
+ find_root_fs_dev();
9199
+ * a placeholder for cluster enablement
9202
+evms_cluster_init(int nodeid, int clusterid)
9207
+EXPORT_SYMBOL(evms_cluster_init);
9210
+ * a placeholder for cluster enablement
9213
+evms_cluster_shutdown(void)
9218
+EXPORT_SYMBOL(evms_cluster_shutdown);
9221
+evms_boot_info_level(char *str)
9223
+ int evms_boot_info_level = (int) simple_strtoul(str, NULL, 10);
9224
+ if (evms_boot_info_level) {
9225
+ evms_info_level = evms_boot_info_level;
9230
+__setup("evms_info_level=", evms_boot_info_level);
9231
+module_init(evms_init_module);
9232
+module_exit(evms_exit_module);
9233
+__initcall(evms_init_discover);
9234
+#ifdef MODULE_LICENSE
9235
+MODULE_LICENSE("GPL");
9238
+/**********************************************************/
9239
+/* END -- INIT/DISCOVERY support functions */
9240
+/**********************************************************/
9241
diff -Naur linux-2002-03-28/drivers/evms/evms_bbr.c evms-2002-03-28/drivers/evms/evms_bbr.c
9242
--- linux-2002-03-28/drivers/evms/evms_bbr.c Wed Dec 31 18:00:00 1969
9243
+++ evms-2002-03-28/drivers/evms/evms_bbr.c Wed Mar 27 19:01:30 2002
9245
+/* -*- linux-c -*- */
9248
+ * Copyright (c) International Business Machines Corp., 2000
9250
+ * This program is free software; you can redistribute it and/or modify
9251
+ * it under the terms of the GNU General Public License as published by
9252
+ * the Free Software Foundation; either version 2 of the License, or
9253
+ * (at your option) any later version.
9255
+ * This program is distributed in the hope that it will be useful,
9256
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9257
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
9258
+ * the GNU General Public License for more details.
9260
+ * You should have received a copy of the GNU General Public License
9261
+ * along with this program; if not, write to the Free Software
9262
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
9265
+/* linux/driver/evms/evms_bbr.c
9267
+ * EVMS - Bad Block Relocation (BBR) Feature Plugin
9269
+ * BBR feature is designed to remap I/O write failures to another safe location on disk.
9270
+ * Note that most disk drives have BBR built into them, this means that our software BBR
9271
+ * will be only activated when all hardware BBR replacement sectors have been used.
9275
+/* #define EVMS_BBR_DEBUG 1 */
9277
+#include <linux/evms/evms_bbr_k.h>
9279
+#define LOG_PREFIX "bbr: "
9281
+static bbr_instance_data_t *bbr_instances = NULL;
9283
+static struct notifier_block bbr_notifier = {
9284
+ notifier_call: bbr_notify_reboot,
9286
+ priority: INT_MAX, /* before any real devices */
9289
+// Data pertaining to the I/O thread.
9290
+static evms_thread_t * bbr_io_thread = NULL;
9291
+static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
9292
+static bbr_bh_t * bbr_io_list = NULL, **bbr_io_list_tail;
9294
+/* plugin function table definition */
9295
+static evms_plugin_function_table_t function_table = {
9296
+ discover : bbr_discover,
9297
+ delete : bbr_delete,
9299
+ write : bbr_write,
9300
+ init_io : bbr_init_io,
9301
+ ioctl : bbr_ioctl,
9302
+ direct_ioctl : bbr_direct_ioctl
9305
+static evms_plugin_header_t plugin_header = {
9309
+ EVMS_BBR_FEATURE_ID),
9310
+ version : { 1,0,0 },
9311
+ required_common_services_version : {
9312
+ EVMS_BBR_COMMON_SERVICES_MAJOR,
9313
+ EVMS_BBR_COMMON_SERVICES_MINOR,
9314
+ EVMS_BBR_COMMON_SERVICES_PATCHLEVEL
9316
+ function_table : &function_table
9321
+ * Function: le_meta_data_to_cpu
9322
+ * convert bbr meta data from on-disk (LE) format to the native cpu endian format.
9324
+void le_meta_data_to_cpu(evms_bbr_metadata_t *md)
9326
+ md->signature = le32_to_cpu(md->signature);
9327
+ md->crc = le32_to_cpu(md->crc);
9328
+ md->block_size = le32_to_cpu(md->block_size);
9329
+ md->flags = le32_to_cpu(md->flags);
9330
+ md->sequence_number = le64_to_cpu(md->sequence_number);
9331
+ md->start_sect_bbr_table = le64_to_cpu(md->start_sect_bbr_table);
9332
+ md->nr_sects_bbr_table = le64_to_cpu(md->nr_sects_bbr_table);
9333
+ md->start_replacement_sect = le64_to_cpu(md->start_replacement_sect);
9334
+ md->nr_replacement_blks = le64_to_cpu(md->nr_replacement_blks);
9338
+ * Function: le_bbr_table_sector_to_cpu
9339
+ * convert bbr meta data from on-disk (LE) format to the native cpu endian format.
9341
+void le_bbr_table_sector_to_cpu(evms_bbr_table_t *p)
9344
+ p->signature = le32_to_cpu(p->signature);
9345
+ p->crc = le32_to_cpu(p->crc);
9346
+ p->sequence_number = le32_to_cpu(p->sequence_number);
9347
+ p->in_use_cnt = le32_to_cpu(p->in_use_cnt);
9348
+ for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9349
+ p->entries[i].bad_sect = le64_to_cpu(p->entries[i].bad_sect);
9350
+ p->entries[i].replacement_sect = le64_to_cpu(p->entries[i].replacement_sect);
9355
+ * Function: cpu_bbr_table_sector_to_le
9356
+ * convert bbr meta data from cpu endian format to on-disk (LE) format
9358
+void cpu_bbr_table_sector_to_le(evms_bbr_table_t *p, evms_bbr_table_t *le)
9361
+ le->signature = cpu_to_le32(p->signature);
9362
+ le->crc = cpu_to_le32(p->crc);
9363
+ le->sequence_number = cpu_to_le32(p->sequence_number);
9364
+ le->in_use_cnt = cpu_to_le32(p->in_use_cnt);
9365
+ for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9366
+ le->entries[i].bad_sect = cpu_to_le64(p->entries[i].bad_sect);
9367
+ le->entries[i].replacement_sect = cpu_to_le64(p->entries[i].replacement_sect);
9373
+static int validate_bbr_table_sector(evms_bbr_table_t *p)
9376
+ int org_crc, final_crc;
9378
+ if (le32_to_cpu(p->signature) != EVMS_BBR_TABLE_SIGNATURE) {
9379
+ LOG_ERROR("BBR_TABLE_SIGNATURE don't match! sector has (0x%08X) expected(0x%08X)\n",
9380
+ le32_to_cpu(p->signature), EVMS_BBR_TABLE_SIGNATURE);
9384
+ org_crc = le32_to_cpu(p->crc);
9386
+ final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p, sizeof(*p));
9387
+ if (final_crc != org_crc) {
9388
+ LOG_ERROR("CRC failed! sector has (0x%08X) calculated(0x%08X)\n",
9389
+ org_crc, final_crc);
9392
+ p->crc = cpu_to_le32(org_crc);
9394
+ LOG_ERROR("bbr table sector has no crc\n");
9399
+ BBR_DEBUG_PRINT_TABLE_SECTOR(p);
9400
+ le_bbr_table_sector_to_cpu(p);
9404
+void update_invalid_bbr_table_sector(
9405
+ evms_logical_node_t *node,
9406
+ evms_bbr_table_t *valid,
9407
+ evms_bbr_table_t *invalid,
9408
+ evms_sector_t LSN)
9411
+ evms_bbr_table_t *tmp_bbr_table;
9413
+ /* Correct the invalid bbr table sector */
9414
+ memcpy(invalid, valid, sizeof(evms_bbr_table_t));
9416
+ /* Allocate memory for I/O */
9417
+ rc = evms_cs_allocate_memory((void**)&tmp_bbr_table,sizeof(evms_bbr_table_t));
9419
+ cpu_bbr_table_sector_to_le(valid, tmp_bbr_table);
9420
+ LOG_WARNING("%s: updating LSN=%Lu\n", __FUNCTION__, LSN);
9421
+ rc = INIT_IO(node, 1, LSN, 1, tmp_bbr_table);
9423
+ LOG_ERROR("Could not update bbr table sector, INIT_IO(rc=%d)\n", rc);
9425
+ evms_cs_deallocate_memory(tmp_bbr_table);
9429
+static u_int32_t validate_bbr_table(
9430
+ evms_bbr_metadata_t *md,
9431
+ evms_bbr_table_t *p)
9433
+ u_int32_t i, nr_sects;
9435
+ nr_sects = md->nr_sects_bbr_table;
9437
+ for (i=0; i<nr_sects; i++, p++) {
9438
+ if (validate_bbr_table_sector(p))
9442
+ if (i != nr_sects) {
9443
+ LOG_SERIOUS("stop validation at sector[%d]\n",i);
9446
+ LOG_DEBUG("processed %d bbr table sectors\n", nr_sects);
9451
+static u_int32_t validate_bbr_tables(
9452
+ evms_logical_node_t *node,
9453
+ evms_bbr_metadata_t *MD1,
9454
+ evms_bbr_metadata_t *MD2,
9455
+ evms_bbr_table_t *p1,
9456
+ evms_bbr_table_t *p2)
9458
+ u_int32_t i, rc1, rc2, nr_sects;
9460
+ nr_sects = MD1->nr_sects_bbr_table;
9461
+ if (nr_sects != MD2->nr_sects_bbr_table) {
9462
+ nr_sects = (MD1->nr_sects_bbr_table < MD2->nr_sects_bbr_table) ?
9463
+ MD1->nr_sects_bbr_table : MD2->nr_sects_bbr_table;
9464
+ LOG_SERIOUS("number of bbr table sectors don't match, use %d",nr_sects);
9467
+ for (i=0; i<nr_sects; i++, p1++, p2++) {
9469
+ if ((rc1 = validate_bbr_table_sector(p1)))
9470
+ LOG_WARNING("%s: MD1 has invalid bbr table sector at (LSN=%Lu)\n",
9471
+ __FUNCTION__, MD1->start_sect_bbr_table + i);
9473
+ if ((rc2 = validate_bbr_table_sector(p2)))
9474
+ LOG_WARNING("%s: MD2 has invalid bbr table sector at (LSN=%Lu)\n",
9475
+ __FUNCTION__, MD2->start_sect_bbr_table + i);
9477
+ /* cannot continue */
9482
+ update_invalid_bbr_table_sector(node, p2, p1,
9483
+ MD1->start_sect_bbr_table + i);
9485
+ update_invalid_bbr_table_sector(node, p1, p2,
9486
+ MD2->start_sect_bbr_table + i);
9488
+ /* skip sequence number check, advance to next bbr table sector */
9493
+ if (p1->sequence_number != p2->sequence_number) {
9494
+ LOG_WARNING("at bbr table sector idx[%d] MD1 sequence_nr=%u <> MD2 sequence_nr_2=%u\n",
9495
+ i, p1->sequence_number, p2->sequence_number);
9496
+ if (p1->sequence_number < p2->sequence_number)
9497
+ update_invalid_bbr_table_sector(node, p2, p1,
9498
+ MD1->start_sect_bbr_table + i);
9500
+ update_invalid_bbr_table_sector(node, p1, p2,
9501
+ MD2->start_sect_bbr_table + i);
9504
+ if (i != nr_sects) {
9505
+ LOG_SERIOUS("stop validation at sector[%d]\n",i);
9508
+ LOG_DEBUG("%s processed %d bbr table sectors\n", __FUNCTION__, nr_sects);
9512
+#ifdef EVMS_BBR_DEBUG
9513
+static void print_meta_data(evms_bbr_metadata_t *md)
9515
+ LOG_DEBUG("META DATA SECTOR\n sig(0x%08X) crc(0x%08X) block_size=%d\n"
9516
+ " start_sect_bbr_table=%Lu, nr_sects_bbr_table=%Lu\n"
9517
+ " start_replacement_sect=%Lu, nr_replacement_blks=%Lu\n",
9521
+ md->start_sect_bbr_table,
9522
+ md->nr_sects_bbr_table,
9523
+ md->start_replacement_sect,
9524
+ md->nr_replacement_blks);
9527
+static void print_bbr_table_sector(evms_bbr_table_t *p)
9530
+ LOG_DEBUG("BBR TABLE SECTOR\n sig(0x%08X) crc(0x%08X) sequence=%d, in_use_cnt=%d\n ENTRIES:",
9531
+ p->signature, p->crc, p->sequence_number, p->in_use_cnt);
9532
+ for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9533
+ LOG_DEBUG(" [%d] bad_sect=%Lu, replacement_sect=%Lu\n",
9534
+ i, p->entries[i].bad_sect, p->entries[i].replacement_sect);
9540
+static int validate_meta_data(evms_bbr_metadata_t *md)
9542
+ int org_crc, final_crc;
9544
+ BBR_DEBUG_PRINT_META_DATA(md);
9546
+ if (le32_to_cpu(md->signature) != EVMS_BBR_SIGNATURE) {
9547
+ LOG_SERIOUS("EVMS_BBR_SIGNATURE don't match, got(0x%08X), expected(0x%08X)\n",
9548
+ le32_to_cpu(md->signature), EVMS_BBR_SIGNATURE);
9553
+ org_crc = le32_to_cpu(md->crc);
9555
+ final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md, sizeof(*md));
9556
+ if (final_crc != org_crc) {
9557
+ LOG_SERIOUS("metadata has crc(0x%08X), calculated(0x%08X)\n",
9558
+ org_crc, final_crc);
9561
+ md->crc = cpu_to_le32(org_crc);
9563
+ LOG_WARNING("metadata has no crc!!!\n");
9566
+ le_meta_data_to_cpu(md);
9571
+ * Function: bbr_load_meta_data
9572
+ * Load and validate bbr meta data
9574
+static int load_meta_data(
9575
+ evms_logical_node_t *node,
9576
+ evms_sector_t LSN,
9577
+ evms_bbr_metadata_t **md,
9578
+ evms_bbr_table_t **bbr_table)
9583
+ *bbr_table = NULL;
9587
+ LOG_WARNING("No meta data\n");
9591
+ rc = evms_cs_allocate_memory((void **)md, sizeof(evms_bbr_metadata_t));
9593
+ int metadata_hdr_size;
9594
+ metadata_hdr_size = evms_cs_size_in_vsectors(sizeof(evms_bbr_metadata_t));
9595
+ rc = INIT_IO(node, 0, LSN, metadata_hdr_size, *md);
9597
+ rc = validate_meta_data(*md);
9599
+ rc = evms_cs_allocate_memory((void**)bbr_table,
9600
+ (*md)->nr_sects_bbr_table * EVMS_VSECTOR_SIZE);
9602
+ /* load BBR table but do not validate here */
9603
+ rc = INIT_IO(node, 0,
9604
+ (*md)->start_sect_bbr_table,
9605
+ (*md)->nr_sects_bbr_table,
9613
+ LOG_ERROR("%s failed rc=%d. Free allocated memory!\n",__FUNCTION__,rc);
9615
+ evms_cs_deallocate_memory(*md);
9620
+ evms_cs_deallocate_memory(*bbr_table);
9621
+ *bbr_table = NULL;
9629
+ * Function: bbr_load_feature_data
9630
+ * Load 2 copies meta data
9633
+static int load_feature_data(
9634
+ evms_logical_node_t *node,
9635
+ bbr_instance_data_t **ID)
9639
+ evms_bbr_metadata_t *md1 = NULL;
9640
+ evms_bbr_metadata_t *md2 = NULL;
9641
+ evms_bbr_table_t *table1 = NULL;
9642
+ evms_bbr_table_t *table2 = NULL;
9643
+ u_int64_t lba_table1 = 0;
9644
+ u_int64_t lba_table2 = 0;
9645
+ u_int32_t nr_sects = 0;
9649
+ /* Loads metadata 1 */
9650
+ rc1 = load_meta_data(node,
9651
+ node->feature_header->feature_data1_start_lsn,
9654
+ /* Loads metadata 2 */
9655
+ rc2 = load_meta_data(node,
9656
+ node->feature_header->feature_data2_start_lsn,
9660
+ if (rc1 && rc2) { /* both copies are bad ?*/
9661
+ rc = -ENODATA; /* cannot continue */
9663
+ if (!rc1 && !rc2) {
9664
+ lba_table1 = md1->start_sect_bbr_table;
9665
+ lba_table2 = md2->start_sect_bbr_table;
9666
+ nr_sects = validate_bbr_tables(node, md1, md2, table1, table2);
9667
+ if (nr_sects == 0) {
9671
+ /* only 1 copy of meta data */
9673
+ lba_table2 = md2->start_sect_bbr_table;
9674
+ /* free meta data 1 */
9675
+ evms_cs_deallocate_memory(table1);
9678
+ evms_cs_deallocate_memory(md1);
9682
+ lba_table1 = md1->start_sect_bbr_table;
9684
+ nr_sects = validate_bbr_table(md1,table1);
9685
+ if (nr_sects == 0) {
9691
+ if (!rc && nr_sects) {
9692
+ rc = evms_cs_allocate_memory((void **)ID, sizeof(bbr_instance_data_t));
9694
+ /* memset(*ID, 0, sizeof(bbr_instance_data_t)); */ /* not needed */
9695
+ (*ID)->source = node;
9696
+ (*ID)->blksize_in_sects = md1->block_size >> EVMS_VSECTOR_SIZE_SHIFT;
9697
+ (*ID)->remap_root = NULL;
9698
+ (*ID)->lba_table1 = lba_table1;
9699
+ (*ID)->lba_table2 = lba_table2;
9700
+ (*ID)->bbr_table = table1; /* use only 1 copy of bbr table */
9701
+ (*ID)->nr_sects_bbr_table = nr_sects;
9702
+ if (nr_sects < md1->nr_sects_bbr_table) {
9703
+ LOG_WARNING(" making bbr node read-only\n");
9704
+ (*ID)->flag |= EVMS_VOLUME_READ_ONLY;
9706
+ (*ID)->nr_replacement_blks = nr_sects * EVMS_BBR_ENTRIES_PER_SECT;
9707
+ (*ID)->start_replacement_sect = md1->start_replacement_sect;
9708
+ atomic_set(&(*ID)->in_use_replacement_blks,0);
9709
+ (*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED;
9710
+ rc = bbr_create_pools(*ID);
9712
+ atomic_set(&(*ID)->in_use_replacement_blks,bbr_table_to_remap_list(*ID));
9717
+ if (!bbr_io_thread) {
9718
+ const char * name1 = "evms_bbr_io";
9719
+ bbr_io_thread = evms_cs_register_thread(bbr_io_handler, NULL, name1);
9720
+ if (!bbr_io_thread) {
9726
+ /* if error, free table1 */
9729
+ evms_cs_deallocate_memory(table1);
9731
+ (*ID)->bbr_table = NULL;
9732
+ bbr_free_instance_data(*ID);
9737
+ /* Will never use md1, md2 and table2 again */
9739
+ evms_cs_deallocate_memory(md1);
9741
+ evms_cs_deallocate_memory(md2);
9743
+ evms_cs_deallocate_memory(table2);
9748
+#ifdef EVMS_BBR_DEBUG
9751
+ * bbr_print_binary_tree
9752
+ * Traverse the tree and print out each node
9754
+void print_binary_tree(bbr_runtime_remap_t *node)
9756
+ if (node == NULL) {
9759
+ LOG_DEFAULT("[%Lu,%Lu]\n",node->remap.bad_sect, node->remap.replacement_sect);
9760
+ print_binary_tree(node->left);
9761
+ print_binary_tree(node->right);
9766
+static void print_remap_list(bbr_instance_data_t *BBRID)
9768
+ if (!BBRID->remap_root)
9770
+ LOG_DEFAULT("%s for %s\n", __FUNCTION__,
9771
+ BBRID->node ? BBRID->node->name : "?");
9772
+ print_binary_tree(BBRID->remap_root);
9777
+#ifdef BBR_USE_RECURSIVE_FUNCTIONS
9780
+ * Recursive function to insert a node into the binary tree
9782
+void bbr_binary_tree_insert(bbr_runtime_remap_t **node, bbr_runtime_remap_t *newnode)
9784
+ if (*node == NULL) {
9785
+ newnode->left = newnode->right = NULL;
9789
+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
9790
+ return bbr_binary_tree_insert(&((*node)->right),newnode);
9792
+ return bbr_binary_tree_insert(&((*node)->left),newnode);
9797
+ * Recursive function to search for a node that contains bad_sect = lsn
9799
+bbr_runtime_remap_t * bbr_binary_search(bbr_runtime_remap_t *node, evms_sector_t lsn)
9801
+ if ((node == NULL) || (node->remap.bad_sect == lsn)) {
9804
+ if (lsn > node->remap.bad_sect)
9805
+ return bbr_binary_search(node->right, lsn);
9807
+ return bbr_binary_search(node->left, lsn);
9812
+ * Recursive function to detroy the binary tree
9814
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *node, bbr_instance_data_t *BBRID)
9817
+ bbr_binary_tree_destroy(node->left, BBRID);
9818
+ bbr_binary_tree_destroy(node->right, BBRID);
9819
+ evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
9826
+ * Insert a node into the binary tree
9828
+void bbr_binary_tree_insert(bbr_runtime_remap_t **root, bbr_runtime_remap_t *newnode)
9830
+ bbr_runtime_remap_t **node = root;
9831
+ while (node && *node) {
9832
+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
9833
+ node = &((*node)->right);
9835
+ node = &((*node)->left);
9838
+ newnode->left = newnode->right = NULL;
9843
+ * Search for a node that contains bad_sect = lsn
9845
+bbr_runtime_remap_t * bbr_binary_search(
9846
+ bbr_runtime_remap_t *root,
9847
+ evms_sector_t lsn)
9849
+ bbr_runtime_remap_t *node = root;
9851
+ if (node->remap.bad_sect == lsn)
9853
+ if (lsn > node->remap.bad_sect)
9854
+ node = node->right;
9856
+ node = node->left;
9862
+ * Destroy the binary tree
9864
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *root, bbr_instance_data_t *BBRID)
9866
+ bbr_runtime_remap_t **link = NULL;
9867
+ bbr_runtime_remap_t *node = root;
9871
+ link = &(node->left);
9872
+ node = node->left;
9875
+ if (node->right) {
9876
+ link = &(node->right);
9877
+ node = node->right;
9881
+ evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
9883
+ if (node == root) /* if root is deleted, it's done. */
9885
+ node = root; /* back to root */
9892
+static void bbr_free_remap(bbr_instance_data_t *BBRID)
9894
+ unsigned long flags;
9895
+ spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
9896
+ bbr_binary_tree_destroy(BBRID->remap_root, BBRID);
9897
+ BBRID->remap_root = NULL;
9898
+ spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9902
+ * bbr_insert_remap_entry
9904
+static int bbr_insert_remap_entry(bbr_instance_data_t *BBRID,
9905
+ evms_bbr_table_entry_t *new_bbr_entry)
9907
+ bbr_runtime_remap_t *newnode = NULL;
9908
+ unsigned long flags;
9911
+ newnode = kmem_cache_alloc (BBRID->remap_pool->cachep, SLAB_ATOMIC);
9914
+ LOG_SERIOUS("could not allocate from remap pool! (rc=%d)\n", rc);
9917
+ newnode->remap.bad_sect = new_bbr_entry->bad_sect;
9918
+ newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
9919
+ spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
9920
+ bbr_binary_tree_insert(&BBRID->remap_root,newnode);
9921
+ spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9926
+ * bbr_table_to_remap_list
9928
+ * The on-disk bbr table is sorted by the replacement sector LBA
9929
+ * In order to improve run time performance, the in memory remap
9930
+ * list must be sorted by the bad sector LBA.
9931
+ * This function is called at discovery time to initialize the remap
9932
+ * list. This function assumes that at least one copy of meta data is valid.
9934
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID)
9936
+ u_int32_t in_use_blks = 0;
9938
+ evms_bbr_table_t *p;
9941
+ for (i=0, p=BBRID->bbr_table; i<BBRID->nr_sects_bbr_table; i++, p++) {
9942
+ if (!p->in_use_cnt)
9944
+ in_use_blks += p->in_use_cnt;
9945
+ for (j=0; j<p->in_use_cnt; j++) {
9946
+ bbr_insert_remap_entry(BBRID, &p->entries[j]);
9951
+ return in_use_blks;
9955
+ * bbr_search_remap_entry
9957
+ * Search remap entry for the specified sector.
9958
+ * If found, return pointer to evms_bbr_table_entry_t.
9959
+ * Otherwise, return NULL.
9961
+static evms_bbr_table_entry_t * bbr_search_remap_entry(bbr_instance_data_t *BBRID, evms_sector_t lsn)
9963
+ bbr_runtime_remap_t *p;
9964
+ unsigned long flags;
9966
+ spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
9967
+ p = bbr_binary_search(BBRID->remap_root, lsn);
9968
+ spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9970
+ return (&p->remap);
9977
+ * if *lsn is in the remap table, return TRUE and modify *lsn
9978
+ * else, return FALSE.
9980
+static inline int bbr_remap(bbr_instance_data_t *BBRID,
9981
+ evms_sector_t *lsn)
9983
+ evms_bbr_table_entry_t *e;
9985
+ if (atomic_read(&BBRID->in_use_replacement_blks) &&
9986
+ !(BBRID->flag & BBR_STOP_REMAP) ) {
9987
+ e = bbr_search_remap_entry(BBRID,*lsn);
9989
+ *lsn = e->replacement_sect;
9990
+ LOG_EXTRA("%s replacement sector(LSN=%Lu)\n", __FUNCTION__, *lsn);
9999
+ * if any of the sectors [lsn, lsn+nr_sects] in the remap table
10001
+ * else, return FALSE.
10003
+static inline int bbr_remap_probe(
10004
+ bbr_instance_data_t *BBRID,
10005
+ evms_sector_t lsn,
10006
+ evms_sector_t nr_sects)
10008
+ evms_sector_t tmp, cnt;
10010
+ if (atomic_read(&BBRID->in_use_replacement_blks) &&
10011
+ !(BBRID->flag & BBR_STOP_REMAP) ) {
10012
+ for (cnt = 0, tmp=lsn;
10014
+ cnt += BBRID->blksize_in_sects, tmp = lsn + cnt) {
10015
+ if (bbr_remap(BBRID,&tmp))
10022
+static int bbr_create_pools(bbr_instance_data_t *BBRID)
10024
+ /* create a memory pool for the remap list */
10025
+ sprintf(BBRID->remap_pool_name, "BBR_REMAP_%p", BBRID);
10026
+ sprintf(BBRID->bh_pool_name, "BBR_BH_%p", BBRID);
10027
+ BBRID->remap_pool = evms_cs_create_pool(
10028
+ sizeof (bbr_runtime_remap_t), BBRID->remap_pool_name, NULL, NULL);
10029
+ BBRID->bbr_bh_pool = evms_cs_create_pool(
10030
+ sizeof(bbr_bh_t), BBRID->bh_pool_name, NULL, NULL);
10032
+ if (!BBRID->remap_pool || !BBRID->bbr_bh_pool) {
10033
+ BBR_BUG(" Could not allocate pools!");
10034
+ bbr_destroy_pools(BBRID);
10040
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID)
10042
+ if (BBRID->bbr_bh_pool)
10043
+ evms_cs_destroy_pool(BBRID->bbr_bh_pool);
10044
+ if (BBRID->remap_pool)
10045
+ evms_cs_destroy_pool(BBRID->remap_pool);
10048
+static int bbr_discover(evms_logical_node_t **discover_list)
10051
+ evms_logical_node_t *node, *next_node;
10052
+ evms_logical_node_t *bbr_node = NULL;
10053
+ bbr_instance_data_t *BBRID;
10055
+ next_node = *discover_list;
10056
+ while(next_node) {
10058
+ node = next_node;
10059
+ next_node = node->next;
10061
+ if ((!node->feature_header) || (node->feature_header->feature_id != plugin_header.id))
10062
+ continue; // probably a node we just put on the list, skip and go to next.
10064
+ rc = load_feature_data(node, &BBRID);
10066
+ /* error loading feature data */
10067
+ /* This node belongs to us, but metadata is invalid,
10068
+ * remove it from the discovery list
10070
+ * clear error code then continue.
10071
+ * Will consider creating a read only BBR node in the future.
10073
+ LOG_SERIOUS(" Error in node (%s) with %Lu sectors.\n",
10074
+ node->name,node->total_vsectors);
10075
+ evms_cs_remove_logical_node_from_list(discover_list, node);
10081
+ rc = evms_cs_allocate_logical_node(&bbr_node);
10085
+ bbr_node->volume_info = node->volume_info;
10086
+ bbr_node->flags |= node->flags;
10087
+ bbr_node->plugin = &plugin_header;
10088
+ strcpy(bbr_node->name, node->feature_header->object_name);
10089
+ bbr_node->hardsector_size = node->hardsector_size;
10090
+ bbr_node->total_vsectors = node->total_vsectors;
10091
+ bbr_node->total_vsectors -= (u_int64_t)(evms_cs_size_in_vsectors(sizeof(evms_feature_header_t)) * 2);
10092
+ bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data1_size;
10093
+ bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data2_size;
10094
+ bbr_node->block_size = node->block_size;
10095
+ bbr_node->instance_data = BBRID;
10096
+ BBRID->total_vsectors = bbr_node->total_vsectors;
10097
+ BBRID->node = bbr_node;
10099
+ /* free the feature header */
10100
+ evms_cs_deallocate_memory(node->feature_header);
10101
+ node->feature_header = NULL;
10102
+ evms_cs_remove_logical_node_from_list(discover_list, node);
10104
+ /* If bad blocks exist, give warning */
10105
+ bad_blocks = atomic_read(&BBRID->in_use_replacement_blks);
10106
+ if (bad_blocks) {
10107
+ BBR_DEBUG_PRINT_REMAP_LIST(BBRID);
10108
+ LOG_WARNING("%s has %d bad blocks\n", BBRID->source->name, bad_blocks);
10109
+ LOG_WARNING("There are %Lu total replacement blocks.\n",
10110
+ BBRID->nr_replacement_blks);
10111
+ LOG_WARNING("There are %Lu remaining replacement blocks.\n",
10112
+ BBRID->nr_replacement_blks - bad_blocks);
10115
+ evms_cs_add_logical_node_to_list(discover_list, bbr_node);
10117
+ MOD_INC_USE_COUNT;
10118
+ bbr_list_add(BBRID);
10120
+ LOG_SERIOUS("could not allocate logical node! rc=%d\n",rc);
10121
+ bbr_free_instance_data(BBRID);
10123
+ } /* end while()*/
10127
+static inline void bbr_list_add(bbr_instance_data_t *BBRID)
10129
+ BBRID->next = bbr_instances;
10130
+ bbr_instances = BBRID;
10133
+static void bbr_list_remove(bbr_instance_data_t *BBRID)
10135
+ bbr_instance_data_t *p;
10140
+ if (BBRID == bbr_instances) {
10141
+ bbr_instances = NULL;
10145
+ p = bbr_instances;
10147
+ if (p->next == BBRID) {
10148
+ p->next = p->next->next;
10155
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name)
10157
+ bbr_instance_data_t *p = bbr_instances;
10160
+ if (!strcmp(p->node->name, object_name))
10167
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID)
10169
+ if (BBRID->remap_root)
10170
+ bbr_free_remap(BBRID);
10171
+ bbr_destroy_pools(BBRID);
10172
+ if (BBRID->bbr_table)
10173
+ evms_cs_deallocate_memory(BBRID->bbr_table);
10174
+ bbr_list_remove(BBRID);
10175
+ evms_cs_deallocate_memory(BBRID);
10178
+static int bbr_delete(evms_logical_node_t *bbr_node)
10180
+ bbr_instance_data_t *BBRID;
10183
+ BBRID = bbr_node->instance_data;
10185
+ rc = DELETE(BBRID->source);
10187
+ /* Now cleanup and go away */
10188
+ bbr_free_instance_data(BBRID);
10189
+ evms_cs_deallocate_logical_node(bbr_node);
10190
+ MOD_DEC_USE_COUNT;
10191
+ if (!bbr_instances) {
10192
+ if (bbr_io_thread) {
10193
+ evms_cs_unregister_thread(bbr_io_thread);
10194
+ bbr_io_thread = NULL;
10201
+static bbr_bh_t * allocate_bbr_bh(bbr_instance_data_t *BBRID, int rw)
10203
+ bbr_bh_t * bbr_bh;
10205
+ bbr_bh = evms_cs_allocate_from_pool(BBRID->bbr_bh_pool, TRUE);
10207
+ memset(bbr_bh, 0, sizeof(bbr_bh_t));
10208
+ bbr_bh->BBRID = BBRID;
10210
+ atomic_set(&bbr_bh->waiters, 0);
10213
+ LOG_WARNING("Could not allocate from BBR BH pool!\n");
10218
+static void free_bbr_bh(bbr_bh_t *bbr_bh)
10220
+ evms_cs_deallocate_to_pool(bbr_bh->BBRID->bbr_bh_pool, bbr_bh);
10224
+/* bbr_io_remap_error
10226
+ * For the requested range, try to write each sector individually. For each
10227
+ * sector that fails, find the next available remap location and write the
10228
+ * data to that new location. Then update the table and write both copies
10229
+ * of the table to disk. Finally, update the in-memory mapping and do any
10230
+ * other necessary bookkeeping.
10232
+static int bbr_io_remap_error( bbr_instance_data_t * BBRID,
10234
+ evms_sector_t starting_lsn,
10235
+ evms_sector_t count,
10238
+ evms_sector_t lsn, new_lsn;
10239
+ evms_bbr_table_t * bbr_table;
10240
+ unsigned long table_sector_index;
10241
+ unsigned long table_sector_offset;
10242
+ unsigned long index;
10245
+ if ( rw == READ ) {
10246
+ // Nothing can be done about read errors.
10250
+ // For each sector in the request.
10251
+ for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
10252
+ rc = INIT_IO(BBRID->source, rw, starting_lsn + lsn, 1, buffer);
10254
+ if ( BBRID->flag & BBR_STOP_REMAP ) {
10255
+ // Can't allow new remaps if the engine told us to stop.
10256
+ LOG_ERROR("object %s: Bad sector (%Lu), but remapping is turned off.\n",
10257
+ BBRID->node->name, starting_lsn + lsn);
10261
+ // Find the next available relocation sector.
10262
+ new_lsn = atomic_read(&BBRID->in_use_replacement_blks);
10263
+ if ( new_lsn >= BBRID->nr_replacement_blks ) {
10264
+ // No more replacement sectors available.
10267
+ new_lsn += BBRID->start_replacement_sect;
10269
+ // Write the data to its new location.
10270
+ LOG_WARNING("object %s: Trying to remap bad sector (%Lu) to sector (%Lu)\n",
10271
+ BBRID->node->name, starting_lsn + lsn, new_lsn);
10272
+ rc = INIT_IO(BBRID->source, rw, new_lsn, 1, buffer);
10274
+ // This replacement sector is bad. Try the next.
10275
+ LOG_ERROR("object %s: Replacement sector (%Lu) is bad. Skipping.\n",
10276
+ BBRID->node->name, new_lsn);
10277
+ atomic_inc(&BBRID->in_use_replacement_blks);
10281
+ // Add this new entry to the on-disk table.
10282
+ table_sector_index = new_lsn - BBRID->start_replacement_sect;
10283
+ table_sector_offset = table_sector_index / EVMS_BBR_ENTRIES_PER_SECT;
10284
+ index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT;
10286
+ bbr_table = &BBRID->bbr_table[table_sector_offset];
10287
+ bbr_table->entries[index].bad_sect = starting_lsn + lsn;
10288
+ bbr_table->entries[index].replacement_sect = new_lsn;
10289
+ bbr_table->in_use_cnt++;
10290
+ bbr_table->sequence_number++;
10291
+ bbr_table->crc = 0;
10292
+ bbr_table->crc = evms_cs_calculate_crc( EVMS_INITIAL_CRC,
10294
+ sizeof(evms_bbr_table_t));
10296
+ // Write the table to disk.
10297
+ cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
10298
+ if ( BBRID->lba_table1 ) {
10299
+ rc = INIT_IO(BBRID->source, WRITE, BBRID->lba_table1 + table_sector_offset, 1, bbr_table);
10301
+ if ( BBRID->lba_table2 ) {
10302
+ rc |= INIT_IO(BBRID->source, WRITE, BBRID->lba_table2 + table_sector_offset, 1, bbr_table);
10304
+ le_bbr_table_sector_to_cpu(bbr_table);
10307
+ // Error writing one of the tables to disk.
10308
+ LOG_ERROR("object %s: Error updating BBR tables on disk.\n",
10309
+ BBRID->node->name);
10313
+ // Insert a new entry in the remapping binary-tree.
10314
+ rc = bbr_insert_remap_entry(BBRID, &bbr_table->entries[index]);
10316
+ LOG_ERROR("object %s: Error adding new entry to remap tree.\n",
10317
+ BBRID->node->name);
10321
+ atomic_inc(&BBRID->in_use_replacement_blks);
10329
+/* bbr_io_process_request
10331
+ * For each sector in this request, check if the sector has already
10332
+ * been remapped. If so, process all previous sectors in the request,
10333
+ * followed by the remapped sector. Then reset the starting lsn and
10334
+ * count, and keep going with the rest of the request as if it were
10335
+ * a whole new request. If any of the INIT_IO's return an error,
10336
+ * call the remapper to relocate the bad sector(s).
10338
+static int bbr_io_process_request( bbr_bh_t * bbr_bh )
10340
+ bbr_instance_data_t * BBRID = bbr_bh->BBRID;
10341
+ evms_sector_t starting_lsn = bbr_bh->eio.rsector;
10342
+ evms_sector_t count = bbr_bh->eio.rsize;
10343
+ evms_sector_t lsn, remapped_lsn;
10344
+ char * buffer = bbr_bh->eio.bh->b_data;
10345
+ int rc = 0, rw = bbr_bh->rw;
10347
+ // For each sector in this request, check if this sector has already
10348
+ // been remapped. If so, process all previous sectors in this request,
10349
+ // followed by the remapped sector. Then reset the starting lsn and
10350
+ // count and keep going with the rest of the request as if it were
10351
+ // a whole new request.
10352
+ for ( lsn = 0; lsn < count && !(BBRID->flag & BBR_STOP_REMAP); lsn++ ) {
10353
+ remapped_lsn = starting_lsn + lsn;
10354
+ rc = bbr_remap(BBRID, &remapped_lsn);
10356
+ // Process all sectors in the request up to this one.
10358
+ rc = INIT_IO(BBRID->source, rw, starting_lsn, lsn, buffer);
10360
+ // If this I/O failed, then one of the
10361
+ // sectors in this request needs to be
10363
+ rc = bbr_io_remap_error(BBRID, rw, starting_lsn, lsn, buffer);
10368
+ buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT);
10371
+ // Process the remapped sector.
10372
+ rc = INIT_IO(BBRID->source, rw, remapped_lsn, 1, buffer);
10374
+ // BUGBUG - Need more processing if this caused an error.
10375
+ // If this I/O failed, then the existing remap
10376
+ // is now bad, and we need to find a new remap.
10377
+ // Can't use bbr_io_remap_error(), because the
10378
+ // existing map entry needs to be changed, not
10379
+ // added again, and the original table entry
10380
+ // also needs to be changed.
10384
+ buffer += EVMS_VSECTOR_SIZE;
10385
+ starting_lsn += (lsn + 1);
10386
+ count -= (lsn + 1);
10391
+ // Check for any remaining sectors after the last split. This could
10392
+ // potentially be the whole request, but that should be a rare case
10393
+ // because requests should only be processed by the thread if we know
10394
+ // an error occurred or they contained one or more remapped sectors.
10396
+ rc = INIT_IO(BBRID->source, rw, starting_lsn, count, buffer);
10398
+ // If this I/O failed, then one of the sectors in this
10399
+ // request needs to be relocated.
10400
+ rc = bbr_io_remap_error(BBRID, rw, starting_lsn, lsn, buffer);
10413
+ * This is the handler for the bbr_io_thread. It continuously loops,
10414
+ * taking I/O requests off its list and processing them. If nothing
10415
+ * is on the list, the thread goes back to sleep until specifically
10418
+ * I/O requests should only be sent to this thread if we know that:
10419
+ * a) the request contains at least one remapped sector.
10421
+ * b) the request caused an error on the normal I/O path.
10422
+ * This function uses synchronous I/O, so sending a request to this
10423
+ * thread that doesn't need special processing will cause severe
10424
+ * performance degredation.
10426
+static void bbr_io_handler( void * void_data )
10428
+ bbr_bh_t * bbr_bh;
10429
+ struct buffer_head * bh;
10430
+ unsigned long flags;
10434
+ // Process bbr_io_list, one entry at a time.
10435
+ spin_lock_irqsave(&bbr_io_list_lock, flags);
10436
+ bbr_bh = bbr_io_list;
10438
+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10439
+ break; // No more items on the list.
10441
+ bbr_io_list = bbr_bh->next;
10442
+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10444
+ rc = bbr_io_process_request(bbr_bh);
10446
+ // Clean up and complete the original I/O.
10447
+ bh = bbr_bh->eio.bh;
10448
+ if (bh->b_end_io) {
10449
+ // A normal request that originated from above EVMS.
10450
+ if ( ! (bbr_bh->flag & BBR_BH_USE_EVMS_CALLBACK) ) {
10451
+ evms_cs_volume_request_in_progress(bh->b_dev, -1, NULL);
10453
+ free_bbr_bh(bbr_bh);
10454
+ bh->b_end_io(bh, rc ? 0 : 1);
10457
+ // A request that originated from bbr_init_io.
10459
+ if ( waitqueue_active(&bh->b_wait) ) {
10460
+ atomic_dec(&bbr_bh->waiters);
10461
+ wake_up(&bh->b_wait);
10468
+/* bbr_schedule_io
10470
+ * Place the specified bbr_bh on the thread's processing list.
10472
+static void bbr_schedule_io( bbr_bh_t * bbr_bh )
10474
+ unsigned long flags;
10476
+ spin_lock_irqsave(&bbr_io_list_lock, flags);
10477
+ if (bbr_io_list == NULL)
10478
+ bbr_io_list_tail = &bbr_io_list;
10479
+ *bbr_io_list_tail = bbr_bh;
10480
+ bbr_io_list_tail = &bbr_bh->next;
10481
+ bbr_bh->next = NULL;
10482
+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10483
+ evms_cs_wakeup_thread(bbr_io_thread);
10489
+ * If there are any remapped sectors on this object, send this request over
10490
+ * to the thread for processing. Otherwise send it down the stack normally.
10492
+static void bbr_read( evms_logical_node_t * bbr_node,
10495
+ bbr_instance_data_t * BBRID = bbr_node->instance_data;
10496
+ bbr_bh_t * bbr_bh;
10498
+ if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors ) {
10499
+ if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10500
+ BBRID->flag & BBR_STOP_REMAP ||
10501
+ ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
10502
+ R_IO(BBRID->source, eio);
10505
+ bbr_bh = allocate_bbr_bh(BBRID, READ);
10507
+ bbr_bh->eio = *eio;
10508
+ evms_cs_volume_request_in_progress(bbr_bh->eio.bh->b_dev, +1, NULL);
10509
+ bbr_schedule_io(bbr_bh);
10512
+ // Can't get memory to track the I/O.
10513
+ EVMS_IO_ERROR(eio);
10518
+ // Request is off the end of the object.
10519
+ EVMS_IO_ERROR(eio);
10524
+/* bbr_write_callback
10526
+ * This is the callback for normal write requests. Check for an error
10527
+ * during the I/O, and send to the thread for processing if necessary.
10529
+static void bbr_write_callback( bbr_bh_t * bbr_bh,
10530
+ struct buffer_head * bh,
10534
+ if ( ! uptodate &&
10535
+ ! (bbr_bh->BBRID->flag & BBR_STOP_REMAP) ) {
10536
+ LOG_ERROR("object %s: Write failure on sector (%Lu). Scheduling for retry.\n",
10537
+ bbr_bh->BBRID->node->name, bbr_bh->eio.rsector);
10538
+ bbr_schedule_io(bbr_bh);
10542
+ free_bbr_bh(bbr_bh);
10549
+ * If there are any remapped sectors on this object, send the request over
10550
+ * to the thread for processing. Otherwise, register for callback
10551
+ * notification, and send the request down normally.
10553
+static void bbr_write(evms_logical_node_t *bbr_node, eio_t *eio)
10555
+ bbr_instance_data_t * BBRID = bbr_node->instance_data;
10556
+ bbr_bh_t * bbr_bh;
10558
+ if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors &&
10559
+ ! (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
10560
+ bbr_bh = allocate_bbr_bh(BBRID, WRITE);
10562
+ bbr_bh->eio = *eio;
10564
+ if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10565
+ BBRID->flag & BBR_STOP_REMAP ||
10566
+ ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
10567
+ bbr_bh->flag |= BBR_BH_USE_EVMS_CALLBACK;
10568
+ evms_cs_register_for_end_io_notification(bbr_bh, eio->bh, bbr_write_callback);
10569
+ W_IO(BBRID->source, eio);
10572
+ evms_cs_volume_request_in_progress(eio->bh->b_dev, +1, NULL);
10573
+ bbr_schedule_io(bbr_bh);
10577
+ // Can't get memory to track the I/O.
10578
+ EVMS_IO_ERROR(eio);
10582
+ // Request is off the end of the object, or this
10583
+ // is a read-only object.
10584
+ EVMS_IO_ERROR(eio);
10589
+/********************************************************/
10590
+/* Required Plugin Function Table Entry Point: */
10591
+/* Init_io function */
10592
+/********************************************************/
10595
+static int bbr_init_io_schedule_io( bbr_instance_data_t * BBRID,
10597
+ evms_sector_t lsn,
10598
+ evms_sector_t count,
10601
+ bbr_bh_t * bbr_bh;
10602
+ struct buffer_head * bh;
10605
+ if ( rw == WRITE ) {
10606
+ LOG_ERROR("object %s: init_io write failure (sector %Lu: count %Lu). Scheduling for retry.\n",
10607
+ BBRID->node->name, lsn, count);
10608
+ bbr_bh = allocate_bbr_bh(BBRID,rw);
10610
+ bbr_bh->eio.rsector = lsn;
10611
+ bbr_bh->eio.rsize = count;
10613
+ bh = evms_cs_allocate_from_pool(evms_bh_pool, TRUE);
10615
+ bbr_bh->eio.bh = bh;
10617
+ memset(bh, 0, sizeof(*bh));
10618
+ init_waitqueue_head(&bh->b_wait);
10619
+ bh->b_data = buffer;
10620
+ bh->b_end_io = NULL;
10622
+ atomic_inc(&bbr_bh->waiters);
10623
+ bbr_schedule_io(bbr_bh);
10624
+ wait_event(bh->b_wait, (atomic_read(&bbr_bh->waiters) == 0));
10628
+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
10631
+ // Couldn't get buffer head.
10635
+ free_bbr_bh(bbr_bh);
10638
+ // Couldn't get bbr_bh.
10643
+ // Nothing can be done about read failures.
10650
+static int bbr_init_io( evms_logical_node_t * bbr_node,
10652
+ evms_sector_t start_lsn,
10653
+ evms_sector_t count,
10656
+ bbr_instance_data_t * BBRID;
10657
+ evms_sector_t lsn;
10660
+ if ( start_lsn + count <= bbr_node->total_vsectors ) {
10661
+ BBRID = bbr_node->instance_data;
10663
+ if ( io_flag == WRITE && (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
10664
+ // Can't write to a read-only object.
10668
+ if ( BBRID->flag & BBR_STOP_REMAP ) {
10669
+ // Can't remap at all.
10670
+ rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
10672
+ else if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10673
+ ! bbr_remap_probe(BBRID, start_lsn, count) ) {
10674
+ // Normal case (no existing remaps)
10675
+ rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
10677
+ // Init_io error. Send request over to
10678
+ // thread for further processing.
10679
+ rc = bbr_init_io_schedule_io(BBRID, io_flag, start_lsn, count, buffer);
10683
+ // At least one sector in this request needs to
10684
+ // be remapped. Test and send each one down
10686
+ for ( lsn = start_lsn; lsn < start_lsn + count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
10687
+ bbr_remap(BBRID, &lsn);
10688
+ rc = INIT_IO(BBRID->source, io_flag, lsn, 1, buffer);
10690
+ // Init_io error. Send request
10691
+ // to thread for processing.
10692
+ rc = bbr_init_io_schedule_io(BBRID, io_flag, lsn, 1, buffer);
10702
+ // Request is off the end of the object.
10710
+/********************************************************/
10711
+/* Required Plugin Function Table Entry Point: */
10712
+/* IOCTL function */
10713
+/********************************************************/
10715
+static int bbr_direct_ioctl_sector_io( bbr_instance_data_t * BBRID,
10716
+ evms_notify_bbr_t * ioctl_arg )
10718
+ char * buffer, *user_buffer;
10719
+ evms_sector_t lsn;
10722
+ if ( evms_cs_allocate_memory((void**)&buffer, EVMS_VSECTOR_SIZE) ) {
10726
+ user_buffer = (char*)ioctl_arg->buffer;
10728
+ for ( lsn = 0; lsn < ioctl_arg->nr_sect; lsn++, user_buffer += EVMS_VSECTOR_SIZE ) {
10729
+ if ( ioctl_arg->rw == WRITE ) {
10730
+ if ( copy_from_user(buffer, user_buffer, EVMS_VSECTOR_SIZE) ) {
10736
+ rc = bbr_init_io(BBRID->node, ioctl_arg->rw, ioctl_arg->start_sect + lsn, 1, buffer);
10741
+ if ( ioctl_arg->rw == READ ) {
10742
+ if ( copy_to_user(user_buffer, buffer, EVMS_VSECTOR_SIZE) ) {
10749
+ evms_cs_deallocate_memory(buffer);
10753
+static int bbr_direct_ioctl (
10754
+ struct inode *inode,
10755
+ struct file *file,
10756
+ unsigned int cmd,
10757
+ unsigned long arg)
10760
+ bbr_instance_data_t *BBRID;
10761
+ evms_plugin_ioctl_t argument;
10762
+ evms_notify_bbr_t ioctl_arg, *usr_ioctl_arg;
10764
+ if ( copy_from_user(&argument, (evms_plugin_ioctl_t *)arg, sizeof(argument)) ) {
10768
+ if ( argument.feature_id != plugin_header.id ) {
10772
+ usr_ioctl_arg = (evms_notify_bbr_t*)argument.feature_ioctl_data;
10773
+ if ( copy_from_user(&ioctl_arg, usr_ioctl_arg, sizeof(ioctl_arg)) ) {
10777
+ BBRID = bbr_find_instance_data(ioctl_arg.object_name);
10783
+ switch(argument.feature_command) {
10785
+ case BBR_STOP_REMAP_CMD:
10786
+ BBRID->flag |= BBR_STOP_REMAP;
10789
+ case BBR_GET_INFO_CMD:
10790
+ ioctl_arg.count = atomic_read(&BBRID->in_use_replacement_blks);
10791
+ if ( copy_to_user(&usr_ioctl_arg->count,
10792
+ &ioctl_arg.count,
10793
+ sizeof(usr_ioctl_arg->count)) ) {
10798
+ case BBR_SECTOR_IO_CMD:
10799
+ rc = bbr_direct_ioctl_sector_io(BBRID, &ioctl_arg);
10808
+ argument.status = rc;
10809
+ copy_to_user((evms_plugin_ioctl_t*)arg, &argument, sizeof(argument));
10813
+static int bbr_ioctl (evms_logical_node_t *bbr_node,
10814
+ struct inode *inode,
10815
+ struct file *file,
10816
+ unsigned int cmd,
10817
+ unsigned long arg)
10819
+ bbr_instance_data_t *BBRID;
10823
+ BBRID = bbr_node->instance_data;
10827
+ case EVMS_PLUGIN_IOCTL:
10828
+ rc = bbr_direct_ioctl(inode,file,cmd,arg);
10830
+ case EVMS_GET_BMAP:
10832
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
10834
+ bbr_remap(BBRID, &bmap->rsector);
10839
+ rc = IOCTL(BBRID->source, inode, file, cmd, arg);
10844
+int bbr_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
10846
+ if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
10848
+ LOG_DEFAULT("%s unregister BBR threads\n", __FUNCTION__);
10849
+ if (bbr_io_thread)
10850
+ evms_cs_unregister_thread(bbr_io_thread);
10851
+ mdelay(1000*1); /* delay some */
10853
+ return NOTIFY_DONE;
10856
+static int __init bbr_init(void)
10858
+ /* Register for reboot notification */
10859
+ register_reboot_notifier(&bbr_notifier);
10861
+ return evms_cs_register_plugin(&plugin_header);
10864
+static void __exit bbr_exit(void)
10866
+ evms_cs_unregister_plugin(&plugin_header);
10870
+module_init(bbr_init);
10871
+module_exit(bbr_exit);
10872
+#ifdef MODULE_LICENSE
10873
+MODULE_LICENSE("GPL");
10876
diff -Naur linux-2002-03-28/drivers/evms/evms_drivelink.c evms-2002-03-28/drivers/evms/evms_drivelink.c
10877
--- linux-2002-03-28/drivers/evms/evms_drivelink.c Wed Dec 31 18:00:00 1969
10878
+++ evms-2002-03-28/drivers/evms/evms_drivelink.c Wed Mar 27 15:51:36 2002
10880
+/* -*- linux-c -*- */
10885
+ * Copyright (c) International Business Machines Corp., 2000
10887
+ * This program is free software; you can redistribute it and/or modify
10888
+ * it under the terms of the GNU General Public License as published by
10889
+ * the Free Software Foundation; either version 2 of the License, or
10890
+ * (at your option) any later version.
10892
+ * This program is distributed in the hope that it will be useful,
10893
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10894
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
10895
+ * the GNU General Public License for more details.
10897
+ * You should have received a copy of the GNU General Public License
10898
+ * along with this program; if not, write to the Free Software
10899
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
10904
+ * linux/drivers/evms/drvlink.c
10907
+ * EVMS Drive Linking Feature.
10909
+ * This feature provides the ability to link multiple storage objects
10910
+ * together as a single virtual storage object.
10914
+#include <linux/module.h>
10915
+#include <linux/kernel.h>
10916
+#include <linux/config.h>
10917
+#include <linux/genhd.h>
10918
+#include <linux/blk.h>
10919
+#include <linux/evms/evms_kernel.h>
10920
+#include <linux/evms/evms_drivelink.h>
10921
+#include <asm/uaccess.h>
10923
+#define LOG_PREFIX "drivelink: "
10925
+/* prototypes for mandatory plugin interface functions */
10926
+static int drivelink_discover(evms_logical_node_t **);
10927
+static int drivelink_delete(evms_logical_node_t *);
10928
+static void drivelink_read(evms_logical_node_t *, eio_t *);
10929
+static void drivelink_write(evms_logical_node_t *, eio_t *);
10930
+static int drivelink_ioctl(evms_logical_node_t *,
10935
+static int drivelink_init_io(evms_logical_node_t *,
10941
+/* plugin function table definition */
10942
+static evms_plugin_function_table_t function_table = {
10943
+ discover: &drivelink_discover,
10944
+ delete : &drivelink_delete,
10945
+ read : &drivelink_read,
10946
+ write : &drivelink_write,
10947
+ init_io : &drivelink_init_io,
10948
+ ioctl : &drivelink_ioctl
10951
+/* plugin header definition */
10952
+static evms_plugin_header_t plugin_header = {
10953
+ id : SetPluginID(
10955
+ EVMS_FEATURE, //FEATURE class
10956
+ EVMS_DRIVELINK_FEATURE_ID), // unique id for feature
10958
+ major : EVMS_DRIVELINK_VERSION_MAJOR,
10959
+ minor : EVMS_DRIVELINK_VERSION_MINOR,
10960
+ patchlevel : EVMS_DRIVELINK_VERSION_PATCHLEVEL
10962
+ required_common_services_version : {
10967
+ function_table : &function_table // function table for this plugin
10970
+/********************************************************/
10971
+/* Required Plugin Function Table Entry Point: */
10972
+/* Discover function & Support routines */
10973
+/********************************************************/
10978
+ * convert feature data from on-disk (Little Endian) format
10979
+ * to the native cpu endian format.
10982
+le_feature_data_to_cpu(evms_drivelink_metadata_t *DLMD)
10986
+ DLMD->signature = le32_to_cpu(DLMD->signature);
10987
+ DLMD->crc = le32_to_cpu(DLMD->crc);
10988
+ DLMD->version.major = le32_to_cpu(DLMD->version.major);
10989
+ DLMD->version.minor = le32_to_cpu(DLMD->version.minor);
10990
+ DLMD->version.patchlevel = le32_to_cpu(DLMD->version.patchlevel);
10991
+ DLMD->flags = le32_to_cpu(DLMD->flags);
10992
+ DLMD->sequence_number = le64_to_cpu(DLMD->sequence_number);
10993
+ DLMD->child_serial_number = le64_to_cpu(DLMD->child_serial_number);
10994
+ DLMD->parent_serial_number = le64_to_cpu(DLMD->parent_serial_number);
10995
+ DLMD->child_count = le64_to_cpu(DLMD->child_count);
10996
+ for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) {
10997
+ evms_dl_ordering_table_entry_t *child_entry;
10999
+ child_entry = &DLMD->ordering_table[i];
11000
+ child_entry->child_serial_number =
11001
+ le64_to_cpu(child_entry->child_serial_number);
11002
+ child_entry->child_vsize =
11003
+ le64_to_cpu(child_entry->child_vsize);
11008
+load_feature_data(
11009
+ evms_logical_node_t *node,
11010
+ evms_drivelink_metadata_t **DLMD)
11012
+ int i, rc = 0, rc_array[2] = {0,0}, size_in_bytes;
11013
+ u_int64_t real_metadata_size, feature_data_size;
11014
+ u_int64_t starting_sector;
11015
+ evms_drivelink_metadata_t *cur_DLMD, *DLMD1, *DLMD2;
11016
+ char *location_name;
11018
+ /* verify the feature metadata size from the */
11019
+ /* feature header agrees with the real size */
11020
+ /* of the current metadata structure. */
11021
+ real_metadata_size = evms_cs_size_in_vsectors(sizeof(**DLMD));
11023
+ /* allocate a buffer large enough to hold all */
11024
+ /* sectors containing the feature's metadata */
11025
+ size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE;
11026
+ rc = evms_cs_allocate_memory((void **)&DLMD1, size_in_bytes);
11028
+ rc = evms_cs_allocate_memory((void **)&DLMD2, size_in_bytes);
11029
+ if (rc) evms_cs_deallocate_memory(DLMD1);
11032
+ for (i = 0; i < 2; i++) {
11034
+ starting_sector = node->feature_header->feature_data1_start_lsn;
11035
+ feature_data_size = node->feature_header->feature_data1_size;
11036
+ cur_DLMD = DLMD1;
11037
+ location_name = evms_primary_string;
11039
+ starting_sector = node->feature_header->feature_data2_start_lsn;
11040
+ feature_data_size = node->feature_header->feature_data2_size;
11041
+ cur_DLMD = DLMD2;
11042
+ location_name = evms_secondary_string;
11044
+ /* check that real metadata size matches the */
11045
+ /* feature data size */
11046
+ if (real_metadata_size != feature_data_size) {
11047
+ LOG_ERROR("%s feature data size(%Lu bytes) doesn't match expected size(%Lu bytes).\n",
11049
+ feature_data_size << EVMS_VSECTOR_SIZE_SHIFT,
11050
+ real_metadata_size << EVMS_VSECTOR_SIZE_SHIFT);
11052
+ rc_array[i] = rc;
11055
+ /* load the node's feature data */
11056
+ rc = INIT_IO(node,
11059
+ feature_data_size,
11062
+ LOG_ERROR("error(%d) probing for %s feature data at sector(%Ld) on '%s'.\n",
11067
+ rc_array[i] = rc;
11070
+ /* check for valid metadata signature */
11071
+ if (le32_to_cpu(cur_DLMD->signature) != EVMS_DRIVELINK_SIGNATURE) {
11073
+ LOG_SERIOUS("error(%d) invalid signature in %s feature data on '%s'\n",
11077
+ rc_array[i] = rc;
11080
+ /* validate feature data CRC */
11081
+ if (cur_DLMD->crc != EVMS_MAGIC_CRC) {
11082
+ int org_crc, final_crc;
11083
+ org_crc = le32_to_cpu(cur_DLMD->crc);
11084
+ cur_DLMD->crc = 0;
11085
+ final_crc = evms_cs_calculate_crc(
11086
+ EVMS_INITIAL_CRC,
11087
+ cur_DLMD, sizeof(*cur_DLMD));
11088
+ if (final_crc != org_crc) {
11089
+ LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n",
11090
+ org_crc, final_crc,
11094
+ rc_array[i] = rc;
11098
+ LOG_WARNING("CRC disabled in %s feature data on '%s'.\n",
11102
+ /* convert feature data from on-disk
11103
+ * format (Little Endian) to native
11104
+ * cpu endian format.
11106
+ le_feature_data_to_cpu(cur_DLMD);
11107
+ /* check for valid structure version */
11108
+ rc = evms_cs_check_version(
11109
+ &plugin_header.version,
11110
+ &cur_DLMD->version);
11112
+ LOG_SERIOUS("error(%d) obsolete version(%d,%d,%d) detected in %s feature data on '%s'\n",
11114
+ cur_DLMD->version.major,
11115
+ cur_DLMD->version.minor,
11116
+ cur_DLMD->version.patchlevel,
11119
+ rc_array[i] = rc;
11122
+ /* getting same return code for both copies? */
11123
+ if (rc_array[0] == rc_array[1]) {
11124
+ rc = rc_array[0];
11125
+ /* if no errors on both copies,
11126
+ * check the sequence numbers.
11127
+ * use the highest sequence number.
11130
+ /* compare sequence numbers */
11131
+ if (DLMD1->sequence_number == DLMD2->sequence_number) {
11132
+ cur_DLMD = DLMD1;
11134
+ LOG_WARNING("sequence number mismatches between front(%Ld) and rear(%Ld) feature data copies on node(%s)!\n",
11135
+ DLMD2->sequence_number,
11136
+ DLMD1->sequence_number,
11138
+ if (DLMD1->sequence_number > DLMD2->sequence_number)
11139
+ cur_DLMD = DLMD1;
11141
+ cur_DLMD = DLMD2;
11142
+ LOG_WARNING("using %s feature data copy!\n",
11143
+ (cur_DLMD == DLMD1) ?
11144
+ evms_primary_string :
11145
+ evms_secondary_string);
11148
+ /* getting different return codes for each copy */
11149
+ } else if (rc_array[0] == 0) {
11150
+ /* use 1st (rear) copy if its good */
11152
+ cur_DLMD = DLMD1;
11153
+ } else if (rc_array[1] == 0) {
11154
+ /* use 2nd (front) copy if its good */
11156
+ cur_DLMD = DLMD2;
11157
+ } else if ((rc_array[0] == -EINVAL) ||
11158
+ (rc_array[1] == -EINVAL)) {
11159
+ /* fail if either give a fatal error */
11164
+ /* deallocate metadata buffers appropriately */
11165
+ if (rc || (cur_DLMD == DLMD1))
11166
+ evms_cs_deallocate_memory(DLMD2);
11167
+ if (rc || (cur_DLMD == DLMD2))
11168
+ evms_cs_deallocate_memory(DLMD1);
11170
+ /* save validated feature header pointer */
11172
+ *DLMD = cur_DLMD;
11178
+find_parent_node_for_child_node(
11179
+ evms_logical_node_t *child_node,
11180
+ evms_drivelink_metadata_t *DLMD,
11181
+ evms_logical_node_t **parent_node,
11182
+ evms_drivelink_runtime_data_t **drivelink_instance_data,
11183
+ evms_logical_node_t **discover_list)
11185
+ int rc = 0, parent_found = FALSE;
11186
+ evms_logical_node_t *parent = NULL;
11187
+ evms_drivelink_runtime_data_t *DLID = NULL;
11189
+ /* find the parent node for this child */
11190
+ for (parent = *discover_list; parent; parent = parent->next) {
11191
+ /* only parent nodes will have null feature headers */
11192
+ if (!parent->feature_header) {
11193
+ DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11194
+ if (DLID->parent_serial_number == DLMD->parent_serial_number) {
11195
+ parent_found = TRUE;
11200
+ /* if no parent node found, create it */
11201
+ if (parent_found == FALSE) {
11202
+ rc = evms_cs_allocate_logical_node(&parent);
11204
+ /* transpose info from child to parent */
11205
+ parent->flags |= child_node->flags;
11206
+ strcpy(parent->name, child_node->feature_header->object_name);
11207
+ /* copy evms system data to parent */
11208
+ parent->volume_info = child_node->volume_info;
11209
+ /* initialize the plugin id field */
11210
+ parent->plugin = &plugin_header;
11211
+ /* allocate parent's instance data */
11212
+ rc = evms_cs_allocate_memory(
11213
+ (void **)&parent->instance_data,
11217
+ /* initialize some instance data fields */
11218
+ DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11219
+ DLID->parent_serial_number = DLMD->parent_serial_number;
11220
+ DLID->child_count = DLMD->child_count;
11221
+ /* allocate the child table */
11222
+ rc = evms_cs_allocate_memory(
11223
+ (void **)&DLID->child_table,
11224
+ sizeof(evms_drivelink_runtime_entry_t) *
11225
+ DLID->child_count);
11228
+ /* add the parent node to the discover list */
11229
+ rc = evms_cs_add_logical_node_to_list(discover_list, parent);
11230
+ MOD_INC_USE_COUNT;
11232
+ /* if any errors encountered, try to clean up */
11234
+ LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n",
11235
+ rc, child_node->name);
11244
+ *drivelink_instance_data = DLID;
11245
+ *parent_node = parent;
11251
+compute_child_index(
11252
+ evms_logical_node_t *node,
11253
+ evms_drivelink_metadata_t *DLMD)
11255
+ int i, position = -1;
11257
+ for(i = 0; i < DLMD->child_count; i++) {
11258
+ if (DLMD->ordering_table[i].child_serial_number ==
11259
+ DLMD->child_serial_number) {
11264
+ if (position == -1) {
11265
+ LOG_SERIOUS("%s: child not found from '%s'\n",
11266
+ __FUNCTION__, node->name);
11268
+ return(position);
11272
+process_child_nodes(evms_logical_node_t **discover_list)
11274
+ int rc = 0, index = -1;
11275
+ evms_logical_node_t *node, *next_node, *parent;
11276
+ evms_drivelink_metadata_t *DLMD;
11277
+ evms_drivelink_runtime_data_t *DLID;
11278
+ evms_drivelink_runtime_entry_t *child_entry = NULL;
11280
+ for (node = *discover_list; node; node = next_node) {
11281
+ next_node = node->next;
11282
+ if ( (!node->feature_header) ||
11283
+ (node->feature_header->feature_id != plugin_header.id) ) {
11287
+ rc = evms_cs_remove_logical_node_from_list(discover_list, node);
11289
+ /* we need to load the feature data to */
11290
+ /* find the parent's serial number this */
11291
+ /* child node belongs to. */
11293
+ rc = load_feature_data(node,&DLMD);
11295
+ /* find the parent node for this child */
11297
+ rc = find_parent_node_for_child_node(
11298
+ node, DLMD, &parent, &DLID, discover_list);
11301
+ /* determine position of child in drive link object */
11302
+ index = compute_child_index(node, DLMD);
11307
+ /* check for multiple child index requests */
11308
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[index];
11309
+ /* check to see if this child index is
11310
+ * already in use.
11312
+ if (child_entry->child_node) {
11313
+ LOG_SERIOUS("attempt to put '%s' in child index(%d). Already occupied by '%s'.\n",
11314
+ node->name, index, child_entry->child_node->name);
11319
+ /* fill in child info in parent */
11321
+ /* check the sector size for this node */
11322
+ if (node->hardsector_size > parent->hardsector_size)
11323
+ parent->hardsector_size = node->hardsector_size;
11324
+ /* check the block size for this node */
11325
+ if (node->block_size > parent->block_size)
11326
+ parent->block_size = node->block_size;
11327
+ /* set the child node */
11328
+ child_entry->child_node = node;
11329
+ /* set the metadata for this node */
11330
+ child_entry->child_metadata = DLMD;
11333
+ /* on error, clean up accordingly */
11336
+ evms_cs_deallocate_memory(DLMD);
11337
+ LOG_SERIOUS("%s: rc(%d) from '%s'\n",
11338
+ __FUNCTION__, rc, node->name);
11339
+ LOG_SERIOUS("deleting child node '%s'.\n",
11341
+ rc = DELETE(node);
11343
+ LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
11349
+ /* errors are handled internal to this function */
11350
+ /* by deleting the failed node. This will get */
11351
+ /* picked up by finalize_parent_nodes as a */
11352
+ /* missing child node */
11356
+#define TEST_CHILD_PRESENCE 0
11357
+#define TEST_CHILD_COUNT 1
11358
+#define TEST_CHILD_PARENTS_SERIAL_NUM 2
11359
+#define TEST_CHILD_POSITION 3
11360
+#define TEST_CHILD_METADATA 4
11363
+test_parent_node(evms_logical_node_t *node)
11366
+ evms_drivelink_runtime_data_t *DLID;
11367
+ evms_drivelink_runtime_entry_t *child_entry;
11369
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11370
+ for(i = 0; i < DLID->child_count; i++) {
11371
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11373
+ /* insure each child entry is filled */
11374
+ if (!child_entry->child_node) {
11376
+ EVMS_VOLUME_SET_READ_ONLY |
11377
+ EVMS_VOLUME_PARTIAL;
11378
+ LOG_ERROR("%s: missing child(%d).\n",__FUNCTION__,i);
11380
+ /* insure child count is the same */
11381
+ /* in each child's metadata */
11382
+ if (child_entry->child_metadata->child_count !=
11383
+ DLID->child_count) {
11384
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11385
+ LOG_ERROR("%s: child count wrong for node '%s'\n",
11386
+ __FUNCTION__, node->name);
11388
+ /* insure parent serial number is */
11389
+ /* the same in each child's metadata */
11390
+ if (child_entry->child_metadata->parent_serial_number !=
11391
+ DLID->parent_serial_number) {
11392
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11393
+ LOG_ERROR("%s: incorrect [is(%Ld), should be(%Ld)] child serial number for node '%s'\n",
11395
+ child_entry->child_metadata->parent_serial_number,
11396
+ DLID->parent_serial_number,
11399
+ /* insure each is in the correct entry */
11400
+ if (child_entry->child_metadata->ordering_table[i].child_serial_number !=
11401
+ child_entry->child_metadata->child_serial_number) {
11402
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11403
+ LOG_ERROR("%s: child reports different index for node '%s'\n",
11404
+ __FUNCTION__, node->name);
11406
+ evms_drivelink_runtime_entry_t *other_child_entry;
11408
+ /* compare the children's metadata */
11410
+ /* look for another present child to
11411
+ * compare against.
11413
+ other_child_entry = NULL;
11414
+ for (j = 0; j < DLID->child_count; j++) {
11415
+ /* skip comparing to ourselves */
11419
+ /* is this child is present? */
11420
+ if (DLID->child_table[j].child_node) {
11421
+ /* yes, use it */
11422
+ other_child_entry = &DLID->child_table[j];
11426
+ /* if we can't find another valid
11427
+ * child node's metadata to compare
11428
+ * against, just skip this test.
11430
+ if (!other_child_entry) {
11434
+ other_child_entry->child_metadata->ordering_table,
11435
+ child_entry->child_metadata->ordering_table,
11436
+ sizeof(child_entry->child_metadata->ordering_table));
11438
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11439
+ LOG_ERROR("%s: mismatching child metadata for nodes '%s' and '%s'\n",
11440
+ __FUNCTION__, DLID->child_table[i-1].child_node->name,
11441
+ child_entry->child_node->name);
11444
+ /* stop if fatal error encountered */
11445
+ if (rc == -EVMS_FEATURE_FATAL_ERROR) {
11453
+ * function: perform_final_adjustments
11455
+ * This function does the following:
11456
+ * sets the vsize (in vsectors) field in each child node
11457
+ * sets the voffset (in vsectors) field in each child node
11458
+ * frees each child node's metadata
11459
+ * sets the parent's total size field
11462
+perform_final_adjustments(evms_logical_node_t *node)
11465
+ evms_drivelink_runtime_data_t *DLID;
11466
+ evms_drivelink_runtime_entry_t *child_entry = NULL;
11467
+ evms_drivelink_metadata_t *ref_data = NULL;
11469
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11470
+ /* find a valid copy of the ordering table.
11471
+ * since all the ordering tables are the same
11472
+ * we can just pick one to use for all the
11473
+ * child computations.
11475
+ for(i = 0; i < DLID->child_count; i++) {
11476
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11477
+ if (child_entry->child_node) {
11478
+ ref_data = child_entry->child_metadata;
11482
+ /* if we got this far, there should
11483
+ * always be at least one valid child.
11485
+ if (!ref_data) BUG();
11486
+ /* compute the parent's usable size,
11487
+ * and construct the table used to
11488
+ * remap parent I/Os to child I/Os */
11489
+ for(i = 0; i < DLID->child_count; i++) {
11490
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11491
+ /* set the LBA count for this child node */
11492
+ child_entry->vsize = ref_data->ordering_table[i].child_vsize;
11493
+ /* set the start LBA value for this child node */
11494
+ child_entry->voffset = node->total_vsectors;
11495
+ /* keep a running total of size in sectors */
11496
+ node->total_vsectors += child_entry->vsize;
11497
+ /* free the metadata for this child node */
11498
+ if (ref_data != child_entry->child_metadata) {
11499
+ evms_cs_deallocate_memory(child_entry->child_metadata);
11501
+ child_entry->child_metadata = NULL;
11502
+ /* free the feature header for this child node */
11503
+ if (child_entry->child_node) {
11504
+ evms_cs_deallocate_memory(child_entry->child_node->feature_header);
11505
+ child_entry->child_node->feature_header = NULL;
11508
+ /* free the reference data */
11509
+ evms_cs_deallocate_memory(ref_data);
11513
+finalize_parent_nodes(evms_logical_node_t **discover_list)
11516
+ evms_logical_node_t *node, *next_node;
11518
+ for (node = *discover_list; node; node = next_node) {
11519
+ next_node = node->next;
11520
+ /* only check parent nodes */
11521
+ if (!node->feature_header) {
11522
+ /* valid the children of this parent */
11523
+ rc = test_parent_node(node);
11525
+ /* compute parent size and
11526
+ * child remap table.
11528
+ perform_final_adjustments(node);
11530
+ /* fatal error encountered.
11531
+ * cleanup from this node and
11532
+ * delete it from memory.
11534
+ evms_cs_remove_logical_node_from_list(discover_list, node);
11535
+ rc2 = DELETE(node);
11537
+ LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
11538
+ rc2, node->name);
11547
+ * Function: discover drive linked storage objects
11551
+drivelink_discover(evms_logical_node_t **discover_list)
11555
+ rc = process_child_nodes(discover_list);
11557
+ rc = finalize_parent_nodes(discover_list);
11563
+/********************************************************/
11564
+/* Required Plugin Function Table Entry Point: */
11565
+/* Delete function */
11566
+/********************************************************/
11569
+ * Function: drivelink_delete
11573
+drivelink_delete(evms_logical_node_t * node)
11576
+ evms_drivelink_runtime_data_t *DLID;
11577
+ evms_drivelink_runtime_entry_t *child_entry;
11579
+ LOG_DETAILS("deleting '%s'.\n", node->name);
11581
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11583
+ for (i = 0; i < DLID->child_count; i++) {
11584
+ child_entry = &DLID->child_table[i];
11585
+ /* delete the child node */
11586
+ if (child_entry->child_node) {
11587
+ rc = DELETE(child_entry->child_node);
11589
+ child_entry->child_node = NULL;
11591
+ /* delete the child's metadata */
11592
+ if (child_entry->child_metadata) {
11593
+ evms_cs_deallocate_memory(child_entry->child_metadata);
11594
+ child_entry->child_metadata = NULL;
11598
+ /* delete the child table */
11599
+ if (DLID->child_table) {
11600
+ evms_cs_deallocate_memory(DLID->child_table);
11601
+ DLID->child_table = NULL;
11603
+ /* delete the instance data */
11604
+ evms_cs_deallocate_memory(DLID);
11605
+ node->instance_data = NULL;
11609
+ evms_cs_deallocate_logical_node(node);
11610
+ MOD_DEC_USE_COUNT;
11616
+/********************************************************/
11617
+/* Required Plugin Function Table Entry Point: */
11618
+/* Read function & Support routines */
11619
+/********************************************************/
11622
+ * function: which_child
11624
+ * This function find the child node a parent rsector maps to.
11625
+ * It then adjusts the rsector value to be child relative and
11626
+ * optionally computes the max # of sectors that can be access
11627
+ * from this starting point on the child. The child node, the
11628
+ * child relative rsector and max io size are returned to the
11632
+static evms_logical_node_t *
11634
+ evms_logical_node_t *parent,
11635
+ evms_sector_t *rsector,
11636
+ evms_sector_t *max_io_sects)
11639
+ evms_logical_node_t *child = NULL;
11640
+ evms_drivelink_runtime_data_t *DLID;
11641
+ evms_drivelink_runtime_entry_t *child_entry = NULL;
11643
+ DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11644
+ for (i = 0; i < DLID->child_count; i++) {
11645
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11647
+ if (*rsector >= child_entry->vsize) {
11648
+ *rsector -= child_entry->vsize;
11650
+ /* get the child node */
11651
+ child = child_entry->child_node;
11652
+ /* compute the sector count if requested */
11653
+ if (max_io_sects)
11654
+ /* this is only used for INIT I/O
11655
+ * to return the largest sector
11656
+ * count size for this child based
11657
+ * on first sector in the I/O.
11660
+ child_entry->vsize - *rsector;
11668
+ * function: drivelink_io_error
11670
+ * this function was primarily created because the function
11671
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
11672
+ * to be set on inline functions. Since this was an error path
11673
+ * and not mainline, I decided to add a trace statement to help
11674
+ * report on the failing condition.
11678
+drivelink_io_error(
11679
+ evms_logical_node_t *node,
11683
+ LOG_SERIOUS("sector remap error %sING on (%s), rsector(%Ld).\n",
11684
+ (io_flag) ? "WRIT" : "READ",
11688
+ EVMS_IO_ERROR(eio);
11692
+ * Function: drivelink_read
11695
+drivelink_read(evms_logical_node_t *node, eio_t *eio)
11697
+ evms_logical_node_t *child;
11699
+ child = which_child(node, &eio->rsector, NULL);
11701
+ R_IO(child, eio);
11703
+ drivelink_io_error(node, READ, eio);
11707
+/********************************************************/
11708
+/* Required Plugin Function Table Entry Point: */
11709
+/* Read function & Support routines */
11710
+/********************************************************/
11713
+ * Function: drivelink_write
11717
+drivelink_write(evms_logical_node_t *node, eio_t *eio)
11719
+ evms_logical_node_t *child;
11721
+ child = which_child(node, &eio->rsector, NULL);
11723
+ W_IO(child, eio);
11725
+ drivelink_io_error(node, WRITE, eio);
11729
+/********************************************************/
11730
+/* Required Plugin Function Table Entry Point: */
11731
+/* Init I/O function */
11732
+/********************************************************/
11735
+ * function: init_io
11737
+ * This function must determine which child or children a
11738
+ * specified I/O request must be passed to. Also if, when,
11739
+ * and how a request must be broken up.
11743
+drivelink_init_io(
11744
+ evms_logical_node_t * node,
11745
+ int io_flag, /* 0=read, 1=write*/
11746
+ evms_sector_t sect_nr, /* disk LBA */
11747
+ evms_sector_t num_sects, /* # of sectors */
11748
+ void * buf_addr ) /* buffer address */
11755
+ evms_sector_t starting_sector, remaining_sectors;
11757
+ evms_drivelink_runtime_data_t *DLID;
11759
+ if ( (sect_nr + num_sects) > node->total_vsectors) {
11760
+ LOG_SERIOUS("attempted out of bound(%Ld) %s on '%s' at sector(%Ld), count(%Ld).\n",
11761
+ node->total_vsectors,
11762
+ (io_flag) ? "WRITE" : "READ",
11764
+ sect_nr, num_sects);
11767
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11768
+ /* make working copies of input parameters */
11769
+ starting_sector = sect_nr;
11770
+ remaining_sectors = num_sects;
11771
+ io_buf = buf_addr;
11772
+ /* loop until all I/O is performed */
11773
+ while(remaining_sectors) {
11774
+ evms_sector_t io_start, io_size;
11775
+ evms_logical_node_t *child;
11777
+ /* compute the child relative io_start
11778
+ * and max io_size.
11780
+ io_start = starting_sector;
11781
+ child = which_child(node, &io_start, &io_size);
11782
+ /* adjust io_size based on
11783
+ * original remaining sectors
11786
+ if (io_size > remaining_sectors)
11787
+ io_size = remaining_sectors;
11789
+ rc = INIT_IO(child,
11795
+ /* if partial volume, return 0's
11796
+ * for missing children.
11798
+ if (io_flag == READ) {
11799
+ memset(io_buf, 0, io_size << EVMS_VSECTOR_SIZE_SHIFT);
11803
+ /* adjust working copies */
11804
+ starting_sector += io_size;
11805
+ remaining_sectors -= io_size;
11806
+ io_buf += io_size <<
11807
+ EVMS_VSECTOR_SIZE_SHIFT;
11817
+/********************************************************/
11818
+/* Required Plugin Function Table Entry Point: */
11819
+/* IOCTL function & Support routines */
11820
+/********************************************************/
11823
+drivelink_ioctl_cmd_plugin_ioctl(
11824
+ evms_logical_node_t *node,
11825
+ struct inode *inode, struct file *file,
11826
+ unsigned long cmd, unsigned long arg)
11829
+ evms_drivelink_runtime_data_t *DLID;
11830
+ evms_plugin_ioctl_t tmp, *user_parms;
11832
+ user_parms = (evms_plugin_ioctl_t *)arg;
11833
+ /* copy user's parameters to kernel space */
11834
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
11838
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11839
+ /* is this cmd targetted at this feature ? */
11840
+ if (tmp.feature_id == node->plugin->id) {
11841
+ switch(tmp.feature_command) {
11845
+ } else { /* broadcast this cmd to all children */
11846
+ for (i = 0; i < DLID->child_count; i++) {
11847
+ rc = IOCTL(DLID->child_table[i].child_node,
11848
+ inode, file, cmd, arg);
11852
+ /* copy info to userspace */
11853
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
11860
+drivelink_ioctl_cmd_broadcast(
11861
+ evms_logical_node_t *node,
11862
+ struct inode *inode, struct file *file,
11863
+ unsigned long cmd, unsigned long arg)
11866
+ evms_drivelink_runtime_data_t *DLID;
11868
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11869
+ /* broadcast this cmd to all children */
11870
+ for (i = 0; i < DLID->child_count; i++) {
11871
+ evms_logical_node_t *child_node;
11873
+ child_node = DLID->child_table[i].child_node;
11874
+ if (child_node) {
11875
+ rc |= IOCTL(child_node, inode, file, cmd, arg);
11882
+ * Function: drivelink_ioctl
11887
+ evms_logical_node_t * node,
11888
+ struct inode * inode,
11889
+ struct file * file,
11890
+ unsigned int cmd,
11891
+ unsigned long arg)
11894
+ evms_drivelink_runtime_data_t *DLID = NULL;
11895
+ struct hd_geometry hdgeo;
11897
+ if ( (!node) || (!inode) )
11901
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11903
+ case HDIO_GETGEO:
11904
+ hdgeo.heads = 255;
11905
+ hdgeo.sectors = 63;
11906
+ hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
11907
+ hdgeo.heads / hdgeo.sectors;
11909
+ if (copy_to_user((int *)arg,
11914
+ case EVMS_QUIESCE_VOLUME:
11915
+ case EVMS_GET_DISK_LIST:
11916
+ case EVMS_CHECK_MEDIA_CHANGE:
11917
+ case EVMS_REVALIDATE_DISK:
11918
+ case EVMS_OPEN_VOLUME:
11919
+ case EVMS_CLOSE_VOLUME:
11920
+ rc = drivelink_ioctl_cmd_broadcast(
11921
+ node, inode, file, cmd, arg);
11923
+ case EVMS_PLUGIN_IOCTL:
11924
+ rc = drivelink_ioctl_cmd_plugin_ioctl(
11925
+ node, inode, file, cmd, arg);
11927
+ case EVMS_GET_BMAP:
11929
+ evms_get_bmap_t *bmap;
11930
+ evms_sector_t io_start, io_size;
11931
+ evms_logical_node_t *child;
11933
+ bmap = (evms_get_bmap_t *)arg;
11934
+ io_start = bmap->rsector;
11935
+ child = which_child(node, &io_start, &io_size);
11937
+ if (node->block_size !=
11938
+ child->block_size) {
11939
+ bmap->status = -EPERM;
11941
+ bmap->rsector = io_start;
11942
+ rc = IOCTL(child,
11960
+/********************************************************/
11961
+/* Required Module Entry Point: */
11962
+/* drivelink_init */
11963
+/********************************************************/
11966
+ * Function: drivelink_init
11970
+drivelink_init(void)
11972
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
11976
+drivelink_exit(void)
11978
+ evms_cs_unregister_plugin(&plugin_header);
11981
+module_init(drivelink_init);
11982
+module_exit(drivelink_exit);
11983
+#ifdef MODULE_LICENSE
11984
+MODULE_LICENSE("GPL");
11987
diff -Naur linux-2002-03-28/drivers/evms/evms_ecr.c evms-2002-03-28/drivers/evms/evms_ecr.c
11988
--- linux-2002-03-28/drivers/evms/evms_ecr.c Wed Dec 31 18:00:00 1969
11989
+++ evms-2002-03-28/drivers/evms/evms_ecr.c Wed Mar 6 16:01:37 2002
11991
+/* -*- linux-c -*- */
11994
+ * Copyright (c) International Business Machines Corp., 2000
11996
+ * This program is free software; you can redistribute it and/or modify
11997
+ * it under the terms of the GNU General Public License as published by
11998
+ * the Free Software Foundation; either version 2 of the License, or
11999
+ * (at your option) any later version.
12001
+ * This program is distributed in the hope that it will be useful,
12002
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12003
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12004
+ * the GNU General Public License for more details.
12006
+ * You should have received a copy of the GNU General Public License
12007
+ * along with this program; if not, write to the Free Software
12008
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12011
+/* linux/driver/evms/evms_ecr.c
12013
+ * EVMS - Cluster enablement (ECR) module
12018
+#include <linux/kernel.h>
12019
+#include <linux/module.h>
12020
+#include <linux/init.h>
12021
+#include <linux/types.h>
12022
+#include <linux/evms/evms_ecr.h>
12024
+#define LOG_PREFIX "ecr: "
12030
+ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table,
12031
+ ecr_cred_t * cred, size_t size, ecr_instance_t *instance)
12041
+ * ecr_group_leave
12043
+void ecr_group_leave(ecr_group_t group)
12054
+int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message,
12055
+ size_t size, ecr_instance_t *instance,
12056
+ void callback(int ret, ecr_instance_t *instance))
12065
+ * ecr_group_send_wait
12067
+int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message,
12068
+ size_t size, int *ret)
12078
+ * ecr_group_broadcast
12080
+int ecr_group_broadcast(ecr_group_t group, void *message, size_t size,
12081
+ ecr_instance_t *instance,
12082
+ void callback(u_char ret, ecr_instance_t *instance))
12091
+ * ecr_group_broadcast_wait
12093
+int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size,
12104
+ * ecr_group_atomic_execute
12106
+int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size,
12107
+ ecr_instance_t *instance,
12108
+ void callback(ecr_instance_t *instance))
12117
+ * ecr_group_atomic_execute_wait
12119
+int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size)
12128
+ * ecr_group_success_response
12130
+void ecr_group_success_response(ecr_message_t *handle)
12140
+ * ecr_group_failure_response
12142
+void ecr_group_failure_response(ecr_message_t *handle, int ret)
12151
+ * ecr_lock_create
12153
+ecr_lock_t ecr_lock_create(char *lockname)
12162
+int ecr_lock(ecr_lock_t lock, u_int64_t start, u_int64_t length,
12163
+ ecr_lock_mode_t mode, u_char flag)
12174
+int ecr_unlock(ecr_lock_t lock, u_int64_t start, u_int64_t length)
12181
+/********************************************************/
12182
+/* Required Module Entry Point: */
12184
+/********************************************************/
12186
+static int __init ecr_init(void)
12192
+static void __exit ecr_exit(void)
12197
+module_init(ecr_init);
12198
+module_exit(ecr_exit);
12199
+#ifdef MODULE_LICENSE
12200
+MODULE_LICENSE("GPL");
12203
diff -Naur linux-2002-03-28/drivers/evms/evms_passthru.c evms-2002-03-28/drivers/evms/evms_passthru.c
12204
--- linux-2002-03-28/drivers/evms/evms_passthru.c Wed Dec 31 18:00:00 1969
12205
+++ evms-2002-03-28/drivers/evms/evms_passthru.c Mon Mar 18 17:39:22 2002
12207
+/* -*- linux-c -*- */
12212
+ * Copyright (c) International Business Machines Corp., 2000
12214
+ * This program is free software; you can redistribute it and/or modify
12215
+ * it under the terms of the GNU General Public License as published by
12216
+ * the Free Software Foundation; either version 2 of the License, or
12217
+ * (at your option) any later version.
12219
+ * This program is distributed in the hope that it will be useful,
12220
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12221
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12222
+ * the GNU General Public License for more details.
12224
+ * You should have received a copy of the GNU General Public License
12225
+ * along with this program; if not, write to the Free Software
12226
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12231
+ * linux/drivers/evms/evms_passthru.c
12233
+ * EVMS System Data Manager
12238
+#include <linux/module.h>
12239
+#include <linux/kernel.h>
12240
+#include <linux/config.h>
12241
+#include <linux/genhd.h>
12242
+#include <linux/major.h>
12243
+#include <linux/string.h>
12244
+#include <linux/blk.h>
12245
+#include <linux/init.h>
12246
+#include <linux/slab.h>
12247
+#include <linux/evms/evms_kernel.h>
12248
+#include <asm/system.h>
12250
+#define EVMS_PASSTHRU_ID 0
12251
+#define LOG_PREFIX "passthru: "
12253
+static int passthru_mgr_discover(evms_logical_node_t **);
12254
+static int passthru_mgr_delete(evms_logical_node_t *);
12255
+static void passthru_mgr_read(evms_logical_node_t *,
12257
+static void passthru_mgr_write(evms_logical_node_t *,
12259
+static int passthru_mgr_ioctl(evms_logical_node_t *,
12264
+static int passthru_mgr_init_io(evms_logical_node_t *,
12270
+static evms_plugin_function_table_t function_table = {
12271
+ discover: &passthru_mgr_discover,
12272
+ delete : &passthru_mgr_delete,
12273
+ read : &passthru_mgr_read,
12274
+ write : &passthru_mgr_write,
12275
+ init_io : &passthru_mgr_init_io,
12276
+ ioctl : &passthru_mgr_ioctl
12279
+static evms_plugin_header_t plugin_header = {
12280
+ id : SetPluginID(
12283
+ EVMS_PASSTHRU_ID),
12289
+ required_common_services_version : {
12294
+ function_table : &function_table // function table for this plugin
12297
+/*******************************/
12298
+/* discovery support functions */
12299
+/*******************************/
12302
+process_passthru_data(evms_logical_node_t **pp)
12304
+ int rc, size_in_sectors;
12305
+ evms_logical_node_t *node, *new_node;
12309
+ size_in_sectors = evms_cs_size_in_vsectors(
12310
+ sizeof(evms_feature_header_t));
12312
+ /* allocate "parent" node */
12313
+ rc = evms_cs_allocate_logical_node(&new_node);
12315
+ /* initialize "parent" node */
12316
+ new_node->instance_data = node;
12317
+ new_node->flags = node->flags;
12318
+ new_node->plugin = &plugin_header;
12319
+ new_node->system_id = node->system_id;
12320
+ new_node->block_size = node->block_size;
12321
+ new_node->hardsector_size = node->hardsector_size;
12322
+ new_node->total_vsectors = node->total_vsectors;
12323
+ new_node->total_vsectors -=
12324
+ (size_in_sectors << 1) +
12325
+ node->feature_header->alignment_padding;
12326
+ new_node->volume_info = node->volume_info;
12327
+ strcpy(new_node->name, node->name);
12328
+ if (strlen(node->feature_header->object_name))
12329
+ strcat(new_node->name, node->feature_header->object_name);
12331
+ strcat(new_node->name, "_Passthru");
12333
+ /* return "parent" node to caller */
12336
+ MOD_INC_USE_COUNT;
12338
+ LOG_DETAILS("feature header found on '%s', created '%s'.\n",
12339
+ node->name, new_node->name);
12340
+ /* we're done with the passthru feature headers
12341
+ * so lets delete them now.
12343
+ evms_cs_deallocate_memory(node->feature_header);
12344
+ node->feature_header = NULL;
12346
+ /* on any fatal error, delete the node */
12347
+ int rc2 = DELETE(node);
12349
+ LOG_DEFAULT("error(%d) attempting to delete node(%p,%s).\n",
12350
+ rc2, node, node->name);
12356
+/********** Required Plugin Functions **********/
12360
+ * Function: passthru_mgr_discover
12364
+passthru_mgr_discover(evms_logical_node_t **discover_list)
12367
+ evms_logical_node_t *node, *tmp_list_head;
12369
+ tmp_list_head = *discover_list;
12370
+ *discover_list = NULL;
12372
+ while(tmp_list_head) {
12373
+ node = tmp_list_head;
12374
+ rc = evms_cs_remove_logical_node_from_list(&tmp_list_head, node);
12376
+ rc = process_passthru_data(&node);
12379
+ rc = evms_cs_add_logical_node_to_list(discover_list, node);
12385
+ * Function: passthru_mgr_delete
12389
+passthru_mgr_delete(evms_logical_node_t * node)
12392
+ evms_logical_node_t *p;
12394
+ LOG_DETAILS("deleting '%s'.\n", node->name);
12396
+ p = node->instance_data;
12399
+ evms_cs_deallocate_logical_node(node);
12400
+ MOD_DEC_USE_COUNT;
12406
+ * function: passthru_io_error
12408
+ * this function was primarily created because the function
12409
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
12410
+ * to be set on inline functions. Since this was an error path
12411
+ * and not mainline, I decided to add a trace statement to help
12412
+ * report on the failing condition.
12416
+passthru_io_error(
12417
+ evms_logical_node_t *node,
12421
+ LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
12422
+ (io_flag) ? "WRITE" : "READ",
12423
+ node->total_vsectors - 1,
12427
+ EVMS_IO_ERROR(eio);
12431
+ * Function: passthru_mgr_read
12434
+passthru_mgr_read(
12435
+ evms_logical_node_t *node,
12438
+ if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
12439
+ R_IO(((evms_logical_node_t*)(node->instance_data)),
12442
+ passthru_io_error(node, READ, eio);
12446
+ * Function: passthru_mgr_write
12450
+passthru_mgr_write(
12451
+ evms_logical_node_t *node,
12454
+ if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
12455
+ W_IO(((evms_logical_node_t*)(node->instance_data)),
12458
+ passthru_io_error(node, WRITE, eio);
12462
+ * Function: passthru_mgr_ioctl
12466
+passthru_mgr_ioctl(
12467
+ evms_logical_node_t * node,
12468
+ struct inode * inode,
12469
+ struct file * file,
12470
+ unsigned int cmd,
12471
+ unsigned long arg)
12475
+ if ((!node) || (!inode))
12478
+ rc = IOCTL(((evms_logical_node_t*)(node->instance_data)), inode, file, cmd, arg);
12484
+passthru_mgr_init_io(
12485
+ evms_logical_node_t * node,
12486
+ int io_flag, /* 0=read, 1=write*/
12487
+ evms_sector_t sect_nr, /* disk LBA */
12488
+ evms_sector_t num_sects, /* # of sectors */
12489
+ void * buf_addr ) /* buffer address */
12492
+ if ((sect_nr + num_sects) <= node->total_vsectors) {
12493
+ rc = INIT_IO(((evms_logical_node_t*)(node->instance_data)),
12494
+ io_flag, sect_nr, num_sects, buf_addr);
12503
+ * Function: passthru_init
12507
+evms_passthru_manager_init(void)
12509
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
12513
+evms_passthru_manager_exit(void)
12515
+ evms_cs_unregister_plugin(&plugin_header);
12518
+module_init(evms_passthru_manager_init);
12519
+module_exit(evms_passthru_manager_exit);
12520
+#ifdef MODULE_LICENSE
12521
+MODULE_LICENSE("GPL");
12524
diff -Naur linux-2002-03-28/drivers/evms/ldev_mgr.c evms-2002-03-28/drivers/evms/ldev_mgr.c
12525
--- linux-2002-03-28/drivers/evms/ldev_mgr.c Wed Dec 31 18:00:00 1969
12526
+++ evms-2002-03-28/drivers/evms/ldev_mgr.c Wed Mar 27 16:25:55 2002
12528
+/* -*- linux-c -*- */
12531
+ * Copyright (c) International Business Machines Corp., 2000
12533
+ * This program is free software; you can redistribute it and/or modify
12534
+ * it under the terms of the GNU General Public License as published by
12535
+ * the Free Software Foundation; either version 2 of the License, or
12536
+ * (at your option) any later version.
12538
+ * This program is distributed in the hope that it will be useful,
12539
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12540
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12541
+ * the GNU General Public License for more details.
12543
+ * You should have received a copy of the GNU General Public License
12544
+ * along with this program; if not, write to the Free Software
12545
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12548
+/* linux/driver/evms/ldev_mgr.c
12550
+ * EVMS - Local Device (Hard Drive) Manager
12552
+ * This plugin walks the gendisk list and creates logical disk structures for each
12553
+ * local ide or scsi device.
12557
+#include <linux/config.h>
12558
+#include <linux/module.h>
12559
+#include <linux/errno.h>
12560
+#include <linux/kernel.h>
12561
+#include <linux/fs.h>
12562
+#include <linux/major.h>
12563
+#include <linux/slab.h>
12564
+#include <asm/uaccess.h>
12565
+#include <linux/blk.h> /* must be included by all block drivers */
12566
+#include <linux/genhd.h>
12567
+#include <linux/ide.h>
12568
+#include "../scsi/scsi.h"
12569
+#include "../scsi/sd.h"
12570
+#include <linux/init.h>
12571
+#include <linux/evms/evms_kernel.h>
12573
+#define LOG_PREFIX "ldev_mgr: "
12575
+#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
12577
+/* local instance data structure definition */
12578
+typedef struct ldev_mgr_instance_data_s {
12580
+ struct gendisk *gd;
12581
+ int media_changed;
12582
+} ldev_mgr_instance_data_t;
12584
+/* prototypes for mandatory plugin interface functions */
12585
+static int discover_disks(evms_logical_node_t **);
12586
+static int ldev_mgr_delete(evms_logical_node_t *);
12587
+static void ldev_mgr_read(evms_logical_node_t *, eio_t *);
12588
+static void ldev_mgr_write(evms_logical_node_t *, eio_t *);
12589
+static int ldev_mgr_ioctl(evms_logical_node_t *,
12594
+static int ldev_init_io(evms_logical_node_t *,
12600
+/* plugin function table definition */
12601
+static evms_plugin_function_table_t function_table = {
12602
+ discover : &discover_disks,
12603
+ delete : &ldev_mgr_delete,
12604
+ read : &ldev_mgr_read,
12605
+ write : &ldev_mgr_write,
12606
+ init_io : &ldev_init_io,
12607
+ ioctl : &ldev_mgr_ioctl
12610
+/* plugin header definition */
12611
+static evms_plugin_header_t plugin_header = {
12612
+ id : SetPluginID(
12614
+ EVMS_DEVICE_MANAGER,
12615
+ EVMS_LOCAL_DEVICE_MANAGER_ID),
12621
+ required_common_services_version : {
12626
+ function_table : &function_table
12629
+#define TYPE_NONE 0
12630
+#define TYPE_GENERIC 1
12631
+#define TYPE_IDE 2
12632
+#define TYPE_SCSI 3
12634
+#define INDEX_ALPHA 0
12635
+#define INDEX_NUMERIC 1
12637
+/********************************************************/
12638
+/* Required Plugin Function Table Entry Point: */
12639
+/* Discover function & Support routines */
12640
+/********************************************************/
12642
+#define MAX_NAME_BASE_SIZE 10
12643
+#define MAX_NAME_MODIFIER_SIZE 4
12644
+typedef struct blk_device_info_s {
12645
+ char devnode_name_base[MAX_NAME_BASE_SIZE];
12647
+ char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE];
12649
+ int devnode_name_index;
12650
+ int devnode_name_type;
12652
+} blk_device_info_t;
12654
+static blk_device_info_t *blk_dev_info = NULL;
12656
+#define BLK_DEV_INFO(a,b,c,d,e) \
12657
+ strncpy(blk_dev_info[a].devnode_name_base, b, MAX_NAME_BASE_SIZE); \
12658
+ blk_dev_info[a].null1 = 0; \
12659
+ strncpy(blk_dev_info[a].devnode_name_modifier, c, MAX_NAME_MODIFIER_SIZE); \
12660
+ blk_dev_info[a].null2 = 0; \
12661
+ blk_dev_info[a].devnode_name_index = 0; \
12662
+ blk_dev_info[a].device_type = d; \
12663
+ blk_dev_info[a].devnode_name_type = e;
12666
+init_blk_dev_info( blk_device_info_t *blk_dev_info )
12668
+ BLK_DEV_INFO( IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA );
12669
+ BLK_DEV_INFO( IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA );
12670
+ BLK_DEV_INFO( IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA );
12671
+ BLK_DEV_INFO( IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA );
12672
+ BLK_DEV_INFO( IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA );
12673
+ BLK_DEV_INFO( IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA );
12674
+ BLK_DEV_INFO( IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA );
12675
+ BLK_DEV_INFO( IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA );
12676
+ BLK_DEV_INFO( IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA );
12677
+ BLK_DEV_INFO( IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA );
12679
+ BLK_DEV_INFO( SCSI_DISK0_MAJOR, "sd", "a", TYPE_SCSI, INDEX_ALPHA );
12680
+ BLK_DEV_INFO( SCSI_DISK1_MAJOR, "sd", "q", TYPE_SCSI, INDEX_ALPHA );
12681
+ BLK_DEV_INFO( SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA );
12682
+ BLK_DEV_INFO( SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA );
12683
+ BLK_DEV_INFO( SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA );
12684
+ BLK_DEV_INFO( SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA );
12685
+ BLK_DEV_INFO( SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA );
12686
+ BLK_DEV_INFO( SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA );
12688
+// BLK_DEV_INFO( MD_MAJOR, "md", "0", TYPE_GENERIC, INDEX_NUMERIC );
12690
+ BLK_DEV_INFO( XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA );
12692
+ BLK_DEV_INFO( CYCLADES_MAJOR, "double", "0", TYPE_GENERIC, INDEX_NUMERIC );
12694
+ BLK_DEV_INFO( MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA );
12696
+ BLK_DEV_INFO( ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA );
12698
+ BLK_DEV_INFO( PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA );
12700
+ BLK_DEV_INFO( 40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA );
12701
+ BLK_DEV_INFO( 43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC );
12702
+ BLK_DEV_INFO( 44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA );
12703
+ BLK_DEV_INFO( 45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA );
12704
+ BLK_DEV_INFO( 47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC );
12706
+ BLK_DEV_INFO( DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12707
+ BLK_DEV_INFO( DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12708
+ BLK_DEV_INFO( DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12709
+ BLK_DEV_INFO( DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12710
+ BLK_DEV_INFO( DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12711
+ BLK_DEV_INFO( DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12712
+ BLK_DEV_INFO( DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12713
+ BLK_DEV_INFO( DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12715
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR, "ida/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12716
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12717
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12718
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12719
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12720
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12721
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12722
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12724
+ BLK_DEV_INFO( I2O_MAJOR + 0, "i2o/hd", "a", TYPE_GENERIC, INDEX_ALPHA );
12725
+ BLK_DEV_INFO( I2O_MAJOR + 1, "i2o/hd", "q", TYPE_GENERIC, INDEX_ALPHA );
12726
+ BLK_DEV_INFO( I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA );
12727
+ BLK_DEV_INFO( I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA );
12728
+ BLK_DEV_INFO( I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA );
12729
+ BLK_DEV_INFO( I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA );
12730
+ BLK_DEV_INFO( I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA );
12731
+ BLK_DEV_INFO( I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA );
12733
+ BLK_DEV_INFO( 92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12734
+ BLK_DEV_INFO( 93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA );
12736
+ BLK_DEV_INFO( DASD_MAJOR, "dasd", "a", TYPE_GENERIC, INDEX_ALPHA );
12737
+ BLK_DEV_INFO( MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA );
12739
+ BLK_DEV_INFO( 96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12740
+ BLK_DEV_INFO( 97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12742
+ BLK_DEV_INFO( UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12744
+ BLK_DEV_INFO( JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC );
12746
+ BLK_DEV_INFO( 101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC );
12748
+ BLK_DEV_INFO( 104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12749
+ BLK_DEV_INFO( 105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12750
+ BLK_DEV_INFO( 106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12751
+ BLK_DEV_INFO( 107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12752
+ BLK_DEV_INFO( 108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12753
+ BLK_DEV_INFO( 108, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12754
+ BLK_DEV_INFO( 110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12755
+ BLK_DEV_INFO( 111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12757
+ BLK_DEV_INFO( RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC );
12759
+ BLK_DEV_INFO( VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC );
12760
+ BLK_DEV_INFO( VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC );
12764
+is_in_device_list(
12765
+ struct gendisk *gd,
12766
+ int major, int minor)
12768
+ int found, done, rc;
12769
+ evms_logical_node_t *device = NULL;
12770
+ ldev_mgr_instance_data_t *LID;
12772
+ done = found = FALSE;
12773
+ while(done == FALSE) {
12774
+ rc = evms_cs_find_next_device(device, &device);
12775
+ if (rc || !device)
12778
+ LID = device->instance_data;
12779
+ if (LID->gd == gd)
12780
+ if (MAJOR(LID->dev) == major)
12781
+ if (MINOR(LID->dev) == minor)
12782
+ done = found = TRUE;
12789
+build_devnode_name(char *name_buf, int major)
12791
+ char buf[11], *modifier, *buf_ptr;
12793
+ blk_device_info_t *bdi;
12795
+ bdi = &blk_dev_info[major];
12797
+ /* convert the base name modifier to an integer */
12798
+ modifier = bdi->devnode_name_modifier;
12800
+ while (*modifier) {
12801
+ if (bdi->devnode_name_type == INDEX_ALPHA) {
12803
+ int_mod += *modifier - 'a';
12806
+ int_mod += *modifier - '0';
12810
+ /* add in device_index_value */
12811
+ int_mod += bdi->devnode_name_index;
12812
+ bdi->devnode_name_index++;
12814
+ /* convert integer modifier back to ALPHA/NUMERIC chars */
12815
+ memset(buf, 0, sizeof(buf));
12816
+ /* fill the buffer from the rear to front with the
12817
+ * ascii version of the modifier, leaving space for
12818
+ * NULL terminator at the end.
12820
+ buf_ptr = &buf[sizeof(buf) - 2];
12822
+ if (bdi->devnode_name_type == INDEX_ALPHA) {
12823
+ *buf_ptr = (int_mod % 26) + 'a';
12826
+ *buf_ptr = (int_mod % 10) + '0';
12830
+ } while (int_mod);
12832
+ /* find beginning of modifier in buffer */
12834
+ while (!*modifier)
12837
+ /* build the final device devnode name */
12838
+ sprintf(name_buf, "%s%s",
12839
+ bdi->devnode_name_base,
12843
+#define DEVICE_KNOWN 1234
12844
+#define DEVICE_UNINITIALIZED 1235
12845
+#define DEVICE_MEDIA_NOT_PRESENT 1236
12847
+create_logical_disk(
12848
+ evms_logical_node_t **disk_list,
12849
+ struct gendisk *gd,
12850
+ int device_index)
12852
+ int rc = 0, major, minor;
12853
+ evms_logical_node_t *new_disk;
12854
+ ldev_mgr_instance_data_t *InstData;
12855
+ char device_name[EVMS_VOLUME_NAME_SIZE + 1];
12857
+ major = gd->major;
12858
+ minor = device_index << gd->minor_shift;
12860
+ /* skip uninitialized devices */
12861
+ if (!blk_size[major])
12862
+ rc = DEVICE_UNINITIALIZED;
12863
+ else if (!blk_size[major][minor])
12864
+ rc = DEVICE_UNINITIALIZED;
12866
+ /* construct the devnode name for this device */
12867
+ build_devnode_name(device_name, major);
12869
+ /* skip devices we already know about */
12870
+ if (is_in_device_list(gd, major, minor) == TRUE)
12871
+ rc = DEVICE_KNOWN;
12873
+ /* allocate the new node & it's instance data */
12875
+ rc = evms_cs_allocate_logical_node(&new_disk);
12877
+ rc = evms_cs_allocate_memory((void **)&InstData,sizeof(ldev_mgr_instance_data_t));
12879
+ evms_cs_deallocate_logical_node(new_disk);
12881
+ /* initialize the new node */
12883
+ struct hd_geometry dev_geo;
12884
+ new_disk->plugin = &plugin_header;
12886
+ /* initialize the instance data */
12887
+ new_disk->instance_data = InstData;
12888
+ InstData->dev = MKDEV(major, minor);
12889
+ InstData->gd = gd;
12891
+ /* determine hardsector size */
12892
+ new_disk->hardsector_size = 512;
12893
+ if (hardsect_size[major])
12894
+ new_disk->hardsector_size = hardsect_size[major][minor];
12896
+ /* determine block size */
12897
+ new_disk->block_size = 1024;
12898
+ if (blksize_size[major])
12899
+ new_disk->block_size = blksize_size[major][minor];
12901
+ /* determine the device size in sectors */
12902
+ new_disk->total_vsectors = blk_size[major][minor] << 1;
12903
+ /* check the size based on the device geometry
12904
+ * and use this if its larger than the blk_size
12905
+ * info. because of odd(non-even) geometry, the
12906
+ * total sector count could be an odd number,
12907
+ * and we need to insure we truly reflect the
12908
+ * maximum size of the device.
12910
+ rc = evms_cs_kernel_ioctl(
12913
+ (unsigned long)&dev_geo);
12915
+ LOG_ERROR("error(%d) retrieving geometry for '%s'.\n",
12916
+ rc, device_name);
12920
+ dev_size = dev_geo.cylinders;
12921
+ dev_size *= (u64)dev_geo.heads;
12922
+ dev_size *= (u64)dev_geo.sectors;
12924
+ /* convert device size to 512 byte units */
12925
+ dev_size <<= evms_cs_log2(new_disk->hardsector_size) - 9;
12927
+ if (dev_size > new_disk->total_vsectors) {
12928
+ new_disk->total_vsectors = dev_size;
12930
+ LOG_DEBUG("blk_size(%Lu), geometry size(%Lu) in 512 byte units.\n",
12931
+ (u64)blk_size[major][minor] << 1,
12935
+ /* remember removable devices */
12937
+ if (gd->flags[device_index] & GENHD_FL_REMOVABLE)
12938
+ new_disk->flags |= EVMS_DEVICE_REMOVABLE;
12940
+ /* save the devnode name for this device */
12941
+ strcpy(new_disk->name, device_name);
12943
+ /* register this device with evms */
12944
+ evms_cs_register_device(new_disk);
12945
+ MOD_INC_USE_COUNT;
12947
+ /* append this record the linked list */
12948
+ evms_cs_add_logical_node_to_list(disk_list, new_disk);
12949
+ LOG_DETAILS("added logical disk(%s) for physical disk(%u,%u,%s), size(%Lu) in 512 byte units\n",
12953
+ new_disk->total_vsectors);
12956
+ /* reset the "benign" error codes for the caller */
12958
+ case DEVICE_UNINITIALIZED:
12959
+ case DEVICE_KNOWN:
12960
+ case DEVICE_MEDIA_NOT_PRESENT:
12967
+create_logical_generic_disks(
12968
+ evms_logical_node_t **disk_list,
12969
+ struct gendisk *gd)
12973
+ /* This is a generic device */
12976
+ LOG_DEBUG("major name = %s\n", gd->major_name);
12977
+ LOG_DEBUG("number of real devices = %i\n", gd->nr_real);
12978
+ for ( i = 0; i < gd->nr_real; i++ ) {
12979
+ LOG_DEBUG("device %d:\n", i);
12980
+ rc = create_logical_disk(disk_list, gd, i);
12987
+create_logical_ide_disks(
12988
+ evms_logical_node_t **disk_list,
12989
+ struct gendisk *gd)
12992
+ ide_hwif_t * ide_hwif;
12993
+ ide_drive_t * drive;
12995
+ /* This is an IDE device */
12996
+ LOG_DEBUG("found IDE major : %i - searching for disks\n",
12999
+ ide_hwif = gd->real_devices; /* IDE internal data */
13000
+ for (i = 0; i < MAX_DRIVES; i++) {
13001
+ drive = &(ide_hwif->drives[i]);
13002
+ if (drive->present && (drive->media == ide_disk)) {
13003
+ /* force the name index value on ide drives */
13004
+ blk_dev_info[gd->major].devnode_name_index = i;
13005
+ rc = create_logical_disk(disk_list, gd, i);
13013
+create_logical_scsi_disks(
13014
+ evms_logical_node_t **disk_list,
13015
+ struct gendisk *gd)
13018
+ Scsi_Disk *SDisks;
13019
+ Scsi_Device *SDev;
13021
+ /* This is an SCSI device */
13022
+ LOG_DEBUG("found SCSI major : %i - searching for disks\n",gd->major);
13023
+ LOG_DEBUG("scsi: major name = %s\n",gd->major_name);
13024
+ LOG_DEBUG("scsi: number of real devices = %i\n",gd->nr_real);
13025
+ SDisks = gd->real_devices; /* SCSI internal data */
13026
+ for ( i = 0; i < gd->nr_real; i++ ) {
13027
+ SDev = SDisks[i].device;
13028
+ LOG_DEBUG("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n",
13029
+ SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity);
13030
+ rc = create_logical_disk(disk_list, gd, i);
13037
+create_logical_disks(struct gendisk *gd,
13038
+ void * p_disk_list)
13041
+ evms_logical_node_t **disk_list = p_disk_list;
13043
+ /* create logical disks from all IDE & SCSI devices */
13044
+ switch(blk_dev_info[gd->major].device_type) {
13046
+ rc = create_logical_ide_disks(disk_list, gd);
13049
+ rc = create_logical_scsi_disks(disk_list, gd);
13051
+ case TYPE_GENERIC:
13052
+ rc = create_logical_generic_disks(disk_list, gd);
13055
+ LOG_DEBUG("unrecognized device major : %i\n",gd->major);
13063
+discover_disks(evms_logical_node_t **disk_list)
13067
+ LOG_ENTRY_EXIT("%s Entry\n", __FUNCTION__);
13069
+ if (blk_dev_info == NULL) {
13070
+ /* allocate space for device info array */
13071
+ rc = evms_cs_allocate_memory(
13072
+ (void **)&blk_dev_info,
13073
+ sizeof(blk_device_info_t) * (MAX_BLKDEV + 1));
13075
+ /* initialize device info array */
13076
+ init_blk_dev_info(blk_dev_info);
13079
+ /* create logical disks from the raw devices */
13080
+ rc = walk_gendisk(create_logical_disks, disk_list);
13082
+ /* free blk_dev_info table and null the ptr to it */
13083
+ evms_cs_deallocate_memory(blk_dev_info);
13084
+ blk_dev_info = NULL;
13086
+ LOG_ENTRY_EXIT("%s Exit\n", __FUNCTION__);
13090
+/********************************************************/
13091
+/* Required Plugin Function Table Entry Point: */
13092
+/* Delete function */
13093
+/********************************************************/
13096
+ldev_mgr_delete(evms_logical_node_t *disk)
13098
+ ldev_mgr_instance_data_t *LID;
13100
+ /* reset any evms volume related info from
13101
+ * the device node, because we can't predict
13102
+ * how this node will be used in the future.
13105
+ /* removed the feature header if its been used
13107
+ if (disk->feature_header) {
13108
+ evms_cs_deallocate_memory(disk->feature_header);
13109
+ disk->feature_header = NULL;
13111
+ /* remove the volume_info structure and flag
13112
+ * if this has been used directly by an evms
13115
+ evms_cs_deallocate_volume_info(disk);
13116
+ /* reset the flags field to the appropriate state
13118
+ disk->flags &= ~EVMS_VOLUME_FLAG;
13120
+ /* disk nodes only get deleted when:
13121
+ * 1) there are no references to the disk node
13123
+ * 2) the device is removable
13124
+ * 3) the device reported a media change
13126
+ * All three of these conditions must be true
13127
+ * before the disk node can be deleted.
13128
+ * evms_check_for_device_changes should set
13129
+ * and ensure these conditions before issuing
13132
+ * Newly installed removable media will be
13133
+ * picked up in this modules discover code.
13135
+ if (disk->flags & EVMS_MEDIA_CHANGED) {
13136
+ LOG_DETAILS("deleting '%s'.\n",disk->name);
13138
+ evms_cs_unregister_device(disk);
13139
+ MOD_DEC_USE_COUNT;
13140
+ LID = disk->instance_data;
13142
+ evms_cs_deallocate_memory(LID);
13144
+ evms_cs_deallocate_logical_node(disk);
13149
+/********************************************************/
13150
+/* Required Plugin Function Table Entry Point: */
13151
+/* Read function */
13152
+/********************************************************/
13155
+ * function: ldev_mgr_io_error
13157
+ * this function was primarily created because the function
13158
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
13159
+ * to be set on inline functions. Since this was an error path
13160
+ * and not mainline, I decided to add a trace statement to help
13161
+ * report on the failing condition.
13165
+ldev_mgr_io_error(
13166
+ evms_logical_node_t *disk,
13171
+ if (rc == -EOVERFLOW) {
13172
+ LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
13173
+ (io_flag) ? "WRITE" : "READ",
13174
+ disk->total_vsectors - 1,
13177
+ } else if (rc == -ENXIO) {
13178
+ LOG_SERIOUS("attempt to access a non-existent device(%s).\n",
13182
+ EVMS_IO_ERROR(eio);
13186
+ldev_mgr_read(evms_logical_node_t *disk, eio_t *eio)
13189
+ request_queue_t *q;
13190
+ ldev_mgr_instance_data_t *InstData;
13192
+ InstData = disk->instance_data;
13193
+ if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
13194
+ eio->bh->b_rsector = eio->rsector;
13195
+ eio->bh->b_size = eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
13196
+ eio->bh->b_rdev = InstData->dev;
13197
+ q = blk_get_queue(InstData->dev);
13199
+ q->make_request_fn(q, READ, eio->bh);
13203
+ disk->flags |= EVMS_VOLUME_CORRUPT |
13204
+ EVMS_VOLUME_GENDISK_GONE;
13210
+ ldev_mgr_io_error(disk, READ, eio, rc);
13214
+/********************************************************/
13215
+/* Required Plugin Function Table Entry Point: */
13216
+/* Write function */
13217
+/********************************************************/
13220
+ldev_mgr_write(evms_logical_node_t *disk, eio_t *eio)
13223
+ request_queue_t *q;
13224
+ ldev_mgr_instance_data_t *InstData;
13226
+ InstData = disk->instance_data;
13227
+ if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
13228
+ eio->bh->b_rsector = eio->rsector;
13229
+ eio->bh->b_size = eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
13230
+ eio->bh->b_rdev = InstData->dev;
13231
+ q = blk_get_queue(InstData->dev);
13233
+ q->make_request_fn(q, WRITE, eio->bh);
13237
+ disk->flags |= EVMS_VOLUME_CORRUPT |
13238
+ EVMS_VOLUME_GENDISK_GONE;
13244
+ ldev_mgr_io_error(disk, WRITE, eio, rc);
13248
+/********************************************************/
13249
+/* Required Plugin Function Table Entry Point: */
13250
+/* Init_io function & Support routines */
13251
+/********************************************************/
13254
+ * function: allocate_bh
13256
+ * This function obtains a buffer head from the private
13257
+ * buffer head pool (pre-allocated at EVMS initial
13258
+ * discovery time).
13260
+ * NOTE: All access to the buffer head pool are protected
13261
+ * by a private spinlock.
13264
+static inline struct buffer_head *
13267
+ struct buffer_head *bh =
13268
+ evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
13270
+ init_waitqueue_head(&bh->b_wait);
13276
+ * function: deallocate_bh
13278
+ * This function returns a buffer head to the private
13279
+ * buffer head pool (pre-allocated at EVMS initial
13280
+ * discovery time).
13282
+ * NOTE: All access to the buffer head pool are protected
13283
+ * by a private spinlock.
13286
+static inline void
13287
+deallocate_bh(struct buffer_head *bh)
13289
+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
13292
+/* this is the buffer head control block structure definition */
13293
+typedef struct bh_cb_s {
13295
+ atomic_t blks_allocated;
13296
+ wait_queue_head_t cb_wait;
13300
+ * function: __wait_on_bh_cb
13302
+ * This is a worker function to wait_on_bh_cb.
13303
+ * This function waits for a set of private buffer heads
13304
+ * associated to the specified buffer head control block
13305
+ * to return from I/O completion. On completion of the
13306
+ * last buffer head, the calling function is awakened
13307
+ * and continues running.
13309
+ * This is the worker function to the function wait_on_bh_cb.
13313
+__wait_on_bh_cb(bh_cb_t *bh_cb)
13315
+ struct task_struct *tsk = current;
13316
+ DECLARE_WAITQUEUE(wait, tsk);
13318
+ add_wait_queue(&bh_cb->cb_wait, &wait);
13320
+ run_task_queue(&tq_disk);
13321
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
13322
+ if (!atomic_read(&bh_cb->blks_allocated))
13325
+ } while (atomic_read(&bh_cb->blks_allocated));
13326
+ tsk->state = TASK_RUNNING;
13327
+ remove_wait_queue(&bh_cb->cb_wait, &wait);
13331
+ * function: wait_on_bh_cb
13333
+ * This function waits for a set of private buffer heads
13334
+ * associated to the specified buffer head control block
13335
+ * to return from I/O completion. On completion of the
13336
+ * last buffer head, the calling function is awakened
13337
+ * and continues running.
13341
+wait_on_bh_cb(bh_cb_t *bh_cb)
13343
+ if (atomic_read(&bh_cb->blks_allocated))
13344
+ __wait_on_bh_cb(bh_cb);
13346
+ /* if we ended up with no buffer heads on
13347
+ * this pass, lets wait a until a few buffer
13348
+ * heads have been freed and try again. This
13349
+ * should provide a reasonable delay.
13355
+ * function: end_bh_cb_io
13357
+ * This is the I/O completion function that is called for
13358
+ * each private buffer head obtained from the buffer head
13359
+ * pool. Control is return thru this routine so we can track
13360
+ * all outstanding requests to know when to awaken the caller,
13361
+ * and to regain control after all I/Os have been performed.
13365
+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
13367
+ bh_cb_t *bh_cb = (bh_cb_t *)bh->b_private;
13369
+ /* record that errors occurred */
13371
+ bh_cb->rc = -EIO;
13373
+ mark_buffer_uptodate(bh, uptodate);
13374
+ unlock_buffer(bh);
13376
+ deallocate_bh(bh);
13377
+ atomic_dec(&bh_cb->blks_allocated);
13378
+ if (!atomic_read(&bh_cb->blks_allocated))
13379
+ if (waitqueue_active(&bh_cb->cb_wait))
13380
+ wake_up(&bh_cb->cb_wait);
13384
+ * function: ldev_partial_sector_init_io
13386
+ * This function is a support function for ldev_init_io,
13387
+ * which handles the cases of performing I/O to only a part
13388
+ * of non-standard sized hardsector. This function is not
13389
+ * designed to be called directly, but via ldev_init_io.
13393
+ldev_partial_sector_init_io(
13394
+ evms_logical_node_t *node,
13397
+ u_int64_t next_lsn,
13398
+ u_int64_t sector_lsn,
13399
+ u_int64_t io_size,
13401
+ unsigned char **sector_buf )
13404
+ ldev_mgr_instance_data_t *InstData = node->instance_data;
13405
+ kdev_t dev = InstData->dev;
13406
+ struct buffer_head *bh;
13408
+ if (*sector_buf == NULL) {
13409
+ /* allocate buffer for incoming sector */
13410
+ rc = evms_cs_allocate_memory((void **)sector_buf,
13411
+ node->hardsector_size);
13412
+ if (rc) return(rc);
13414
+ /* allocate a buffer head from the pool */
13415
+ while((bh = allocate_bh()) == NULL)
13416
+ /* yielding the cpu is playing it
13417
+ * safe. it might be wiser to just
13418
+ * spin. requires more thought.
13422
+ /* set up the buffer head for this sector */
13423
+ bh->b_end_io = end_bh_cb_io_sync;
13424
+ bh->b_size = node->hardsector_size;
13425
+ bh->b_rdev = dev;
13426
+ bh->b_rsector = next_lsn - sector_lsn;
13427
+ bh->b_data = *sector_buf;
13428
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13430
+ set_bit(BH_Dirty, &bh->b_state);
13431
+ set_bit(BH_Lock, &bh->b_state);
13432
+ set_bit(BH_Req, &bh->b_state);
13433
+ set_bit(BH_Mapped, &bh->b_state);
13434
+ bh->b_private = (void *)bh_cb;
13435
+ atomic_inc(&bh_cb->blks_allocated);
13437
+ /* drive the buffer head down */
13438
+ /* to the device */
13439
+ generic_make_request(READ, bh);
13441
+ /* wait for all bh's I/O's to end */
13442
+ wait_on_bh_cb(bh_cb);
13444
+ /* copy data to/from user */
13445
+ if (io_flag != WRITE)
13448
+ *sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
13449
+ io_size << EVMS_VSECTOR_SIZE_SHIFT);
13452
+ memcpy(*sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
13454
+ io_size << EVMS_VSECTOR_SIZE_SHIFT);
13456
+ /* allocate a buffer head from the pool */
13457
+ while((bh = allocate_bh()) == NULL)
13458
+ /* yielding the cpu is playing it
13459
+ * safe. it might be wiser to just
13460
+ * spin. requires more thought.
13464
+ /* set up the buffer head for this sector */
13465
+ bh->b_end_io = end_bh_cb_io_sync;
13466
+ bh->b_size = node->hardsector_size;
13467
+ bh->b_rdev = dev;
13468
+ bh->b_rsector = next_lsn - sector_lsn;
13469
+ bh->b_data = *sector_buf;
13470
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13472
+ set_bit(BH_Dirty, &bh->b_state);
13473
+ set_bit(BH_Lock, &bh->b_state);
13474
+ set_bit(BH_Req, &bh->b_state);
13475
+ set_bit(BH_Mapped, &bh->b_state);
13476
+ bh->b_private = (void *)bh_cb;
13477
+ atomic_inc(&bh_cb->blks_allocated);
13479
+ /* drive the buffer head down */
13480
+ /* to the device */
13481
+ generic_make_request(WRITE, bh);
13483
+ /* wait for all bh's I/O's to end */
13484
+ wait_on_bh_cb(bh_cb);
13490
+ * function: ldev_init_io
13492
+ * This function provides support for synchronous I/O
13493
+ * operations to the underlying devices. These I/O
13494
+ * operations are NOT buffered in any way including the
13495
+ * operating system's buffer cache.
13497
+ * This function can work with any hardsector size that
13498
+ * is a power of 2.
13500
+ * node : logical node of the target logical disk
13501
+ * io_flag : 0 = read, 1 = write, 2 = read-a-head
13502
+ * starting_lsn : the 0-based (disk relative) logical
13503
+ * : (512 byte) sector number (lsn)
13504
+ * num_lsns : the total number of lsns in this I/O
13505
+ * bufptr : address of the memory to read/write the data
13510
+ evms_logical_node_t *node,
13512
+ u_int64_t starting_lsn,
13513
+ u_int64_t num_lsns,
13516
+ int rc = 0, lsns_per_hardsector, lsns_per_blocksize;
13517
+ unchar *sector_buf = NULL, *cur_bufptr;
13518
+ u_int64_t next_lsn, remaining_lsns, sector_lsn;
13519
+ ldev_mgr_instance_data_t *InstData = node->instance_data;
13520
+ kdev_t dev = InstData->dev;
13523
+ LOG_EVERYTHING("%s Entry: Disk(%u,%u), ioflag(%u), start_lsn(%Lu), num_lsns(%Lu), bufptr(0x%p)\n",
13524
+ __FUNCTION__, MAJOR(dev), MINOR(dev), io_flag, starting_lsn, num_lsns, bufptr);
13526
+ /* check for valid device */
13527
+ if (!blk_size[MAJOR(dev)][MINOR(dev)]) {
13528
+ node->flags |= EVMS_VOLUME_CORRUPT |
13529
+ EVMS_VOLUME_GENDISK_GONE;
13532
+ /* check for 0 length request */
13533
+ if ( num_lsns == 0 ) {
13534
+ LOG_ERROR("%s: error requesting 0 sectors.\n", __FUNCTION__);
13537
+ /* check for out of bound request */
13538
+ if ( (starting_lsn + num_lsns) > node->total_vsectors) {
13539
+ LOG_ERROR("%s: attempted %s beyond logical disk boundary(%Lu LSNs), requesting LSN(%Lu), total LSNs(%Lu).\n",
13540
+ __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
13541
+ node->total_vsectors,
13542
+ starting_lsn, num_lsns);
13545
+ /* check for invalid io_flag value */
13546
+ switch( io_flag ) {
13547
+ case READ: /* read... */
13548
+ case WRITE: /* write... */
13549
+ case READA: /* reada... */
13555
+ /* compute some per device info once up-front */
13556
+ lsns_per_hardsector = node->hardsector_size / EVMS_VSECTOR_SIZE;
13557
+ lsns_per_blocksize = node->block_size / EVMS_VSECTOR_SIZE;
13559
+ /* initialize the buffer head control block */
13560
+ memset(&bh_cb, 0, sizeof(bh_cb_t));
13561
+ init_waitqueue_head(&bh_cb.cb_wait);
13563
+ /* only update the local copy of variables */
13564
+ cur_bufptr = bufptr;
13565
+ next_lsn = starting_lsn;
13566
+ remaining_lsns = num_lsns;
13568
+ /* check for a mid-sector starting offset
13570
+ * if found, perform I/O on part of that
13573
+ sector_lsn = next_lsn & (lsns_per_hardsector - 1);
13574
+ if (sector_lsn) {
13575
+ u_int64_t io_size;
13577
+ /* determine bytes in IO to this sector */
13578
+ io_size = lsns_per_hardsector - sector_lsn;
13579
+ if (io_size > remaining_lsns)
13580
+ io_size = remaining_lsns;
13582
+ /* perform the partial sector io */
13583
+ rc = ldev_partial_sector_init_io(
13584
+ node,io_flag, &bh_cb,
13586
+ sector_lsn, io_size,
13587
+ cur_bufptr, §or_buf);
13590
+ /* update progress in local variables */
13591
+ cur_bufptr += io_size << EVMS_VSECTOR_SIZE_SHIFT;
13592
+ next_lsn += io_size;
13593
+ remaining_lsns -= io_size;
13597
+ /* continue if no errors found */
13599
+ /* perform I/O on all the complete sectors
13600
+ * in this request.
13602
+ * loop until there are no more complete sectors
13605
+ while(remaining_lsns >= lsns_per_hardsector) {
13606
+ /* this inner loop attempts to drive as many
13607
+ * bytes (in sector size multiples) down to
13608
+ * the device as possible using the available
13609
+ * buffer heads in the pool.
13611
+ while(remaining_lsns >= lsns_per_hardsector) {
13612
+ struct buffer_head *bh;
13614
+ /* allocate a buffer head from the pool */
13615
+ bh = allocate_bh();
13616
+ if (bh == NULL) break;
13618
+ /* set up the buffer head for this I/O */
13619
+ bh->b_end_io = end_bh_cb_io_sync;
13621
+ (remaining_lsns >= lsns_per_blocksize) ?
13622
+ node->block_size :
13623
+ node->hardsector_size;
13624
+ bh->b_data = cur_bufptr;
13625
+ bh->b_rdev = dev;
13626
+ bh->b_rsector = next_lsn;
13627
+ bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13629
+ set_bit(BH_Dirty, &bh->b_state);
13630
+ set_bit(BH_Lock, &bh->b_state);
13631
+ set_bit(BH_Req, &bh->b_state);
13632
+ set_bit(BH_Mapped, &bh->b_state);
13633
+ bh->b_private = (void *)&bh_cb;
13634
+ atomic_inc(&bh_cb.blks_allocated);
13636
+ /* drive the buffer head down */
13637
+ /* to the device */
13638
+ generic_make_request(io_flag, bh);
13640
+ /* update progress in local variables */
13641
+ cur_bufptr += bh->b_size;
13642
+ next_lsn += bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
13643
+ remaining_lsns -= bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
13645
+ /* wait for all bh's I/O's to end */
13646
+ wait_on_bh_cb(&bh_cb);
13650
+ /* continue if no errors found */
13652
+ /* check for a mid-sector ending offset
13654
+ * if found, perform I/O on part of that
13657
+ if (remaining_lsns)
13658
+ /* perform the partial sector io */
13659
+ rc = ldev_partial_sector_init_io(
13660
+ node, io_flag, &bh_cb,
13662
+ 0, remaining_lsns,
13663
+ cur_bufptr, §or_buf);
13665
+ /* free the sector buffer if it was allocated */
13667
+ evms_cs_deallocate_memory(sector_buf);
13669
+ /* coalesce return codes */
13672
+ LOG_EVERYTHING("%s Exit: rc(%u)\n", __FUNCTION__, rc);
13677
+/********************************************************/
13678
+/* Required Plugin Function Table Entry Point: */
13679
+/* IOCTL function & Support routines */
13680
+/********************************************************/
13684
+ evms_logical_node_t * disk,
13685
+ struct inode * inode,
13686
+ struct file * file,
13687
+ unsigned int cmd,
13688
+ unsigned long arg)
13691
+ ldev_mgr_instance_data_t *InstData = disk->instance_data;
13694
+ if (!inode || !disk)
13697
+ save_dev = inode->i_rdev;
13698
+ inode->i_rdev = InstData->dev;
13700
+ case EVMS_QUIESCE_VOLUME:
13701
+ case EVMS_PLUGIN_IOCTL:
13704
+ case EVMS_GET_BMAP:
13706
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
13707
+ bmap->dev = InstData->dev;
13708
+ bmap->status = 0;
13711
+ case EVMS_OPEN_VOLUME:
13712
+ rc = InstData->gd->fops->open(inode, file);
13714
+ case EVMS_CLOSE_VOLUME:
13715
+ rc = InstData->gd->fops->release(inode, file);
13717
+ case EVMS_CHECK_MEDIA_CHANGE:
13718
+ /* once we detect that media changed
13719
+ * is 'set', don't send any more ioctls
13720
+ * down to the device, until the
13721
+ * media change has been 'reset' by a
13722
+ * revalidate disk ioctl. when already
13723
+ * 'set', just return a 1 w/o actually
13724
+ * performing another ioctl call to the
13727
+ if (InstData->media_changed == TRUE) {
13731
+ rc = InstData->gd->fops->check_media_change(InstData->dev);
13733
+ InstData->media_changed = TRUE;
13734
+ disk->flags |= EVMS_MEDIA_CHANGED;
13737
+ case EVMS_REVALIDATE_DISK:
13738
+ /* don't actually send this ioctl down
13739
+ * to the device, until we know that
13740
+ * previous check media change ioctl
13743
+ * when we do actually send the ioctl
13744
+ * down, reset the local media_changed
13747
+ if (InstData->media_changed == FALSE)
13749
+ rc = InstData->gd->fops->revalidate(InstData->dev);
13750
+ InstData->media_changed = FALSE;
13752
+ case EVMS_GET_DISK_LIST:
13753
+ rc = evms_cs_add_item_to_list(
13754
+ (evms_list_node_t **)arg,
13760
+ rc = InstData->gd->fops->ioctl(inode, file, cmd, arg);
13763
+ inode->i_rdev = save_dev;
13768
+/********************************************************/
13769
+/* Required Module Entry Point: */
13770
+/* ldev_mgr_init */
13771
+/********************************************************/
13774
+ldev_mgr_init(void)
13776
+ return evms_cs_register_plugin(&plugin_header);
13779
+static void __exit
13780
+ldev_mgr_exit(void)
13782
+ evms_cs_unregister_plugin(&plugin_header);
13785
+module_init(ldev_mgr_init);
13786
+module_exit(ldev_mgr_exit);
13787
+#ifdef MODULE_LICENSE
13788
+MODULE_LICENSE("GPL");
13790
diff -Naur linux-2002-03-28/drivers/evms/lvm_vge.c evms-2002-03-28/drivers/evms/lvm_vge.c
13791
--- linux-2002-03-28/drivers/evms/lvm_vge.c Wed Dec 31 18:00:00 1969
13792
+++ evms-2002-03-28/drivers/evms/lvm_vge.c Thu Mar 28 10:20:25 2002
13794
+/* -*- linux-c -*- */
13797
+ * Copyright (c) International Business Machines Corp., 2000
13799
+ * This program is free software; you can redistribute it and/or modify
13800
+ * it under the terms of the GNU General Public License as published by
13801
+ * the Free Software Foundation; either version 2 of the License, or
13802
+ * (at your option) any later version.
13804
+ * This program is distributed in the hope that it will be useful,
13805
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13806
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13807
+ * the GNU General Public License for more details.
13809
+ * You should have received a copy of the GNU General Public License
13810
+ * along with this program; if not, write to the Free Software
13811
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
13814
+ * linux/drivers/evms/lvm_vge.c
13816
+ * EVMS Linux LVM Region Manager
13819
+#include <linux/module.h>
13820
+#include <linux/kernel.h>
13821
+#include <linux/config.h>
13822
+#include <linux/genhd.h>
13823
+#include <linux/major.h>
13824
+#include <linux/string.h>
13825
+#include <linux/blk.h>
13826
+#include <linux/init.h>
13827
+#include <linux/slab.h>
13828
+#include <linux/vmalloc.h>
13829
+#include <linux/evms/evms_kernel.h>
13830
+#include <linux/evms/evms_lvm.h>
13831
+#include <asm/system.h>
13832
+#include <asm/uaccess.h>
13834
+#define LOG_PREFIX "lvm: "
13836
+// Plugin API prototypes
13837
+static int lvm_discover( evms_logical_node_t ** evms_node_list );
13838
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list );
13839
+static int lvm_delete_node( evms_logical_node_t * logical_node );
13840
+static void lvm_read( evms_logical_node_t * node,
13842
+static void lvm_write( evms_logical_node_t * node,
13844
+static int lvm_init_io( evms_logical_node_t * node,
13846
+ evms_sector_t sect_nr,
13847
+ evms_sector_t num_sects,
13848
+ void * buf_addr );
13849
+static int lvm_ioctl( evms_logical_node_t * logical_node,
13850
+ struct inode * inode,
13851
+ struct file * file,
13852
+ unsigned int cmd,
13853
+ unsigned long arg);
13854
+static int lvm_direct_ioctl( struct inode * inode,
13855
+ struct file * file,
13856
+ unsigned int cmd,
13857
+ unsigned long args );
13859
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t org_sector,
13860
+ evms_sector_t snap_sector );
13863
+// Global LVM data structures
13864
+static evms_plugin_function_table_t lvm_function_table = {
13865
+ discover : lvm_discover,
13866
+ end_discover : lvm_discover_end,
13867
+ delete : lvm_delete_node,
13869
+ write : lvm_write,
13870
+ init_io : lvm_init_io,
13871
+ ioctl : lvm_ioctl,
13872
+ direct_ioctl : lvm_direct_ioctl
13875
+static evms_plugin_header_t lvm_plugin_header = {
13876
+ id : SetPluginID(
13878
+ EVMS_REGION_MANAGER,
13881
+ major : EVMS_LVM_VERSION_MAJOR,
13882
+ minor : EVMS_LVM_VERSION_MINOR,
13883
+ patchlevel : EVMS_LVM_VERSION_PATCH
13885
+ required_common_services_version: {
13890
+ function_table : &lvm_function_table
13893
+static lvm_volume_group_t * lvm_group_list = NULL;
13894
+static struct proc_dir_entry * lvm_proc = NULL;
13898
+/********** Miscellaneous Functions **********/
13902
+/* Function: remap sector
13904
+ * Common function to remap LV lba to PV lba in appropriate PE. This
13905
+ * function needs to deal with requests that span PEs and/or stripes. If
13906
+ * this occurs, the request will simply be chopped off at the boundary of
13907
+ * the first PE/stripe. It is up to the calling function to loop
13908
+ * accordingly to finish the full remapping. This function is now partially
13909
+ * 64-bit enabled. The striping section contains code that currently cannot
13910
+ * eliminate at least one mod operation on 64 bit values.
13912
+static int remap_sector(evms_logical_node_t * node,
13913
+ evms_sector_t org_sector, // logical sector to remap
13914
+ evms_sector_t size, // size (in sectors) of request to remap
13915
+ evms_sector_t * new_sector, // remapped sector
13916
+ evms_sector_t * new_size, // new size (in sectors)
13917
+ evms_sector_t * pe_start_sector,// starting sector of pe - needed for snapshotting
13918
+ lvm_physical_volume_t ** pv_entry ) // new node for which new_sector is relative
13920
+ lvm_logical_volume_t * volume = node->instance_data;
13921
+ le_table_entry_t * le_entry;
13923
+ u_int32_t offset_in_le;
13925
+ u_int32_t sectors_per_column;
13926
+ u_int32_t column;
13927
+ u_int32_t sector_in_column;
13928
+ u_int32_t stripe_in_column;
13929
+ u_int32_t le_in_column;
13930
+ u_int32_t columns;
13931
+ u_int32_t offset_in_stripe;
13932
+ u_int32_t stripe_in_le;
13933
+ u_int32_t org_sector32; // Needed for striping - not 64-bit enabled
13935
+ *new_size = size;
13937
+ // Check if volume is striped. Reset the size if the request
13938
+ // crosses a stripe boundary. Striping in LVM is not 64-bit
13940
+ if ( volume->stripes > 1 ) {
13941
+ org_sector32 = org_sector;
13942
+ sectors_per_column = volume->stripes * volume->pe_size;
13943
+ column = org_sector32 / sectors_per_column;
13944
+ sector_in_column = org_sector32 % sectors_per_column;
13945
+ stripe_in_column = sector_in_column / volume->stripe_size;
13946
+ le_in_column = stripe_in_column % volume->stripes;
13947
+ columns = volume->num_le / volume->stripes;
13948
+ le = column + (columns * le_in_column);
13950
+ offset_in_stripe = org_sector32 % volume->stripe_size;
13951
+ stripe_in_le = stripe_in_column / volume->stripes;
13952
+ offset_in_le = offset_in_stripe + stripe_in_le * volume->stripe_size;
13954
+ if ( offset_in_stripe + size > volume->stripe_size ) {
13955
+ *new_size = volume->stripe_size - offset_in_stripe;
13958
+ // Non-striped volume. Just find LE and offset. Reset the size if
13959
+ // the request crosses an LE boundary. This path is 64-bit safe.
13961
+ le = org_sector >> volume->pe_size_shift;
13962
+ offset_in_le = org_sector & (volume->pe_size - 1);
13964
+ if ( offset_in_le + size > volume->pe_size ) {
13965
+ *new_size = volume->pe_size - offset_in_le;
13969
+ le_entry = &volume->le_map[le];
13970
+ *pe_start_sector = le_entry->pe_sector_offset;
13971
+ *new_sector = le_entry->pe_sector_offset + offset_in_le;
13972
+ *pv_entry = le_entry->owning_pv;
13978
+/* Function: add_group_to_list
13980
+ * Add an LVM volume group to the global LVM list. This inserts at
13981
+ * the start of the list, since order isn't particularly important.
13983
+ * So, it appears that order is important. :) Now inserting at the
13984
+ * end of the list instead of the beginning.
13986
+static int add_group_to_list( lvm_volume_group_t * group )
13988
+ lvm_volume_group_t ** p_group;
13990
+ for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
13994
+ *p_group = group;
13995
+ group->next_group = NULL;
14001
+/* Function: remove_group_from_list
14003
+ * Remove an LVM volume group from the global LVM list.
14005
+static int remove_group_from_list( lvm_volume_group_t * group )
14007
+ lvm_volume_group_t ** p_group;
14009
+ for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
14010
+ if ( *p_group == group ) {
14011
+ *p_group = (*p_group)->next_group;
14012
+ group->next_group = NULL;
14021
+/* Function: find_group_by_uuid
14023
+ * Use the vg_uuid to find the desired volume group.
14025
+static int find_group_by_uuid( unsigned char * vg_uuid,
14026
+ lvm_volume_group_t ** group)
14028
+ lvm_volume_group_t * gp;
14030
+ for ( gp = lvm_group_list; gp; gp = gp->next_group ) {
14031
+ if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) {
14041
+/* Function: find_pv_by_number
14043
+ * Search the PV list of the specified volume group, looking for the
14044
+ * specified PV number. If found, return a pointer to that PV.
14046
+static lvm_physical_volume_t * find_pv_by_number(u_int32_t pv_number,
14047
+ lvm_volume_group_t * group )
14049
+ lvm_physical_volume_t * pv_entry;
14051
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
14052
+ if ( pv_entry->pv_number == pv_number ) {
14060
+/* Function: translate_lv_name
14062
+ * In LVM, volumes have names based on their dev-node, which follow the
14063
+ * pattern /dev/group_name/volume_name. In EVMS, the same volume needs
14064
+ * to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from
14065
+ * the lv_disk_t needs to be translated before copying to the associated
14066
+ * node. evms_node_name must point to a NAME_LEN sized buffer.
14068
+static int translate_lv_name( char * lvm_lv_name, char * evms_node_name )
14072
+ memset(evms_node_name, 0, NAME_LEN);
14074
+ // Make sure the string starts with /dev/, and skip over it.
14075
+ ptr = strstr(lvm_lv_name, DEV_DIRECTORY);
14076
+ if ( ptr != lvm_lv_name ) {
14077
+ LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name);
14080
+ ptr = &ptr[strlen(DEV_DIRECTORY)];
14082
+ // ptr now points to "group_name/volume_name".
14083
+ // Use this to create the name for the EVMS node.
14084
+ strcpy(evms_node_name, LVM_DEV_DIRECTORY);
14085
+ strncat(evms_node_name, ptr, NAME_LEN-strlen(evms_node_name)-1);
14091
+/* Function: check_pv_for_lv
14093
+ * Run through all LE maps of all LVs in this group, and make sure the
14094
+ * specified PV is not being pointed to by any LEs.
14096
+static int check_pv_for_lv( lvm_physical_volume_t * pv_entry,
14097
+ lvm_volume_group_t * group )
14099
+ lvm_logical_volume_t * volume;
14102
+ for ( i = 1; i <= MAX_LV; i++ ) {
14103
+ if ( (volume = group->volume_list[i]) ) {
14104
+ for ( j = 0; j < volume->num_le; j++ ) {
14105
+ if ( volume->le_map[j].owning_pv == pv_entry ) {
14116
+/********** Metadata I/O Functions **********/
14119
+/* Function: endian_convert_pv
14121
+ * Endian-neutral conversion for PV structures.
14123
+static inline void endian_convert_pv( pv_disk_t * pv )
14125
+ pv->version = le16_to_cpu(pv->version);
14126
+ pv->pv_on_disk.base = le32_to_cpu(pv->pv_on_disk.base);
14127
+ pv->pv_on_disk.size = le32_to_cpu(pv->pv_on_disk.size);
14128
+ pv->vg_on_disk.base = le32_to_cpu(pv->vg_on_disk.base);
14129
+ pv->vg_on_disk.size = le32_to_cpu(pv->vg_on_disk.size);
14130
+ pv->pv_uuidlist_on_disk.base = le32_to_cpu(pv->pv_uuidlist_on_disk.base);
14131
+ pv->pv_uuidlist_on_disk.size = le32_to_cpu(pv->pv_uuidlist_on_disk.size);
14132
+ pv->lv_on_disk.base = le32_to_cpu(pv->lv_on_disk.base);
14133
+ pv->lv_on_disk.size = le32_to_cpu(pv->lv_on_disk.size);
14134
+ pv->pe_on_disk.base = le32_to_cpu(pv->pe_on_disk.base);
14135
+ pv->pe_on_disk.size = le32_to_cpu(pv->pe_on_disk.size);
14136
+ pv->pv_major = le32_to_cpu(pv->pv_major);
14137
+ pv->pv_number = le32_to_cpu(pv->pv_number);
14138
+ pv->pv_status = le32_to_cpu(pv->pv_status);
14139
+ pv->pv_allocatable = le32_to_cpu(pv->pv_allocatable);
14140
+ pv->pv_size = le32_to_cpu(pv->pv_size);
14141
+ pv->lv_cur = le32_to_cpu(pv->lv_cur);
14142
+ pv->pe_size = le32_to_cpu(pv->pe_size);
14143
+ pv->pe_total = le32_to_cpu(pv->pe_total);
14144
+ pv->pe_allocated = le32_to_cpu(pv->pe_allocated);
14145
+ pv->pe_start = le32_to_cpu(pv->pe_start);
14149
+/* Function: read_pv
14151
+ * Read in the PV structure from the specified node. If it contains a
14152
+ * valid PV signature, allocate a new pv_disk_t and copy the data.
14154
+static int read_pv( evms_logical_node_t * node,
14155
+ pv_disk_t ** pv )
14157
+ pv_disk_t * pv_buffer;
14161
+ // Buffer for reading the PV metadata.
14162
+ pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO);
14163
+ if ( ! pv_buffer ) {
14164
+ LOG_CRITICAL("Memory error creating buffer to read PV metadata for node %s\n", node->name);
14168
+ // Read the first two sectors.
14169
+ if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE),
14170
+ evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer) ) {
14171
+ LOG_SERIOUS("Error reading PV metadata from node %s\n", node->name);
14172
+ kfree(pv_buffer);
14176
+ // Endian-neutral conversion of PV metadata.
14177
+ endian_convert_pv(pv_buffer);
14179
+ // Check for an LVM signature and make sure the sizes match.
14180
+ // Versions 1 and 2 are both valid now. Thanks LVM! :)
14181
+ if ( ! ( pv_buffer->id[0] == 'H' &&
14182
+ pv_buffer->id[1] == 'M' &&
14183
+ (pv_buffer->version == 1 || pv_buffer->version == 2) &&
14184
+ pv_buffer->pv_size == node->total_vsectors ) ) {
14185
+ LOG_EXTRA("Node %s is not an LVM PV\n", node->name);
14186
+ kfree(pv_buffer);
14190
+ // This is a valid PV. Allocate a new pv_disk_t.
14191
+ *pv = kmalloc(sizeof(pv_disk_t), GFP_NOIO);
14193
+ LOG_CRITICAL("Memory error creating new PV for node %s\n", node->name);
14194
+ kfree(pv_buffer);
14198
+ // Copy the metadata.
14199
+ memcpy(*pv, pv_buffer, sizeof(pv_disk_t));
14200
+ kfree(pv_buffer);
14205
+/* Function: endian_convert_vg
14207
+ * Endian-neutral conversion for VG structures
14209
+static inline void endian_convert_vg( vg_disk_t * vg )
14211
+ vg->vg_number = le32_to_cpu(vg->vg_number);
14212
+ vg->vg_access = le32_to_cpu(vg->vg_access);
14213
+ vg->vg_status = le32_to_cpu(vg->vg_status);
14214
+ vg->lv_max = le32_to_cpu(vg->lv_max);
14215
+ vg->lv_cur = le32_to_cpu(vg->lv_cur);
14216
+ vg->lv_open = le32_to_cpu(vg->lv_open);
14217
+ vg->pv_max = le32_to_cpu(vg->pv_max);
14218
+ vg->pv_cur = le32_to_cpu(vg->pv_cur);
14219
+ vg->pv_act = le32_to_cpu(vg->pv_act);
14220
+ vg->dummy = le32_to_cpu(vg->dummy);
14221
+ vg->vgda = le32_to_cpu(vg->vgda);
14222
+ vg->pe_size = le32_to_cpu(vg->pe_size);
14223
+ vg->pe_total = le32_to_cpu(vg->pe_total);
14224
+ vg->pe_allocated= le32_to_cpu(vg->pe_allocated);
14225
+ vg->pvg_total = le32_to_cpu(vg->pvg_total);
14229
+/* Function: read_vg
14231
+ * Read in the VG structure from the specified node. Allocate a new
14232
+ * vg_disk_t and copy the data.
14234
+static int read_vg( evms_logical_node_t * node,
14236
+ vg_disk_t ** vg )
14238
+ vg_disk_t * vg_buffer;
14239
+ unsigned long vg_sectors;
14241
+ // Allocate a buffer to read the VG metadata.
14242
+ vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size);
14243
+ vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
14244
+ if ( ! vg_buffer ) {
14245
+ LOG_CRITICAL("Memory error creating buffer to read VG metadata from node %s\n", node->name);
14249
+ // Read the VG metadata.
14250
+ if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base), vg_sectors, vg_buffer) ) {
14251
+ LOG_SERIOUS("Error reading VG metadata from node %s\n", node->name);
14252
+ kfree(vg_buffer);
14256
+ // Endian-neutral conversion of VG metadata.
14257
+ endian_convert_vg(vg_buffer);
14259
+ // Allocate a new vg_disk_t
14260
+ *vg = kmalloc(sizeof(vg_disk_t), GFP_NOIO);
14262
+ LOG_CRITICAL("Memory error creating new VG structure for node %s\n", node->name);
14263
+ kfree(vg_buffer);
14267
+ // Copy the metadata.
14268
+ memcpy(*vg, vg_buffer, sizeof(vg_disk_t));
14269
+ kfree(vg_buffer);
14274
+/* Function: read_uuid_list
14276
+static int read_uuid_list( evms_logical_node_t * node,
14278
+ lvm_volume_group_t * group )
14280
+ evms_sector_t start_sector;
14281
+ unsigned long total_sectors;
14282
+ unsigned char * uuid_buffer;
14283
+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14284
+ unsigned long uuid_list_size;
14287
+ if ( group->uuid_list ) {
14288
+ LOG_EXTRA("Already read PV UUIDs for group %s\n", group->vg_name);
14292
+ start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base);
14293
+ total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size);
14294
+ uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14296
+ // Allocate memory for the UUID array for this group.
14297
+ group->uuid_list = vmalloc(uuid_list_size);
14298
+ if ( ! group->uuid_list ) {
14299
+ LOG_CRITICAL("Memory error creating UUID list for group %s\n", group->vg_name);
14302
+ memset(group->uuid_list, 0, uuid_list_size);
14304
+ // Allocate a buffer to perform the I/Os.
14305
+ uuid_buffer = kmalloc(buffer_size, GFP_NOIO);
14306
+ if ( ! uuid_buffer ) {
14307
+ LOG_CRITICAL("Memory error creating I/O buffer for UUID list in group %s\n", group->vg_name);
14308
+ vfree(group->uuid_list);
14309
+ group->uuid_list = NULL;
14313
+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14314
+ if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, uuid_buffer) ) {
14315
+ LOG_SERIOUS("Error reading PV UUID list from node %s\n", node->name);
14316
+ kfree(uuid_buffer);
14317
+ vfree(group->uuid_list);
14318
+ group->uuid_list = NULL;
14322
+ // Copy the I/O buffer into the UUID array.
14323
+ memcpy(&(group->uuid_list[i*EVMS_VSECTOR_SIZE]), uuid_buffer, buffer_size);
14326
+ // Clear out the unused portion at the end of the uuid_list
14327
+ memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0, uuid_list_size - pv->pv_uuidlist_on_disk.size);
14329
+ kfree(uuid_buffer);
14334
+/* Function: endian_convert_lv
14336
+ * Endian-neutral conversion for LV structures
14338
+static inline void endian_convert_lv( lv_disk_t * lv )
14340
+ lv->lv_access = le32_to_cpu(lv->lv_access);
14341
+ lv->lv_status = le32_to_cpu(lv->lv_status);
14342
+ lv->lv_open = le32_to_cpu(lv->lv_open);
14343
+ lv->lv_dev = le32_to_cpu(lv->lv_dev);
14344
+ lv->lv_number = le32_to_cpu(lv->lv_number);
14345
+ lv->lv_mirror_copies = le32_to_cpu(lv->lv_mirror_copies);
14346
+ lv->lv_recovery = le32_to_cpu(lv->lv_recovery);
14347
+ lv->lv_schedule = le32_to_cpu(lv->lv_schedule);
14348
+ lv->lv_size = le32_to_cpu(lv->lv_size);
14349
+ lv->lv_snapshot_minor = le32_to_cpu(lv->lv_snapshot_minor);
14350
+ lv->lv_chunk_size = le16_to_cpu(lv->lv_chunk_size);
14351
+ lv->dummy = le16_to_cpu(lv->dummy);
14352
+ lv->lv_allocated_le = le32_to_cpu(lv->lv_allocated_le);
14353
+ lv->lv_stripes = le32_to_cpu(lv->lv_stripes);
14354
+ lv->lv_stripesize = le32_to_cpu(lv->lv_stripesize);
14355
+ lv->lv_badblock = le32_to_cpu(lv->lv_badblock);
14356
+ lv->lv_allocation = le32_to_cpu(lv->lv_allocation);
14357
+ lv->lv_io_timeout = le32_to_cpu(lv->lv_io_timeout);
14358
+ lv->lv_read_ahead = le32_to_cpu(lv->lv_read_ahead);
14361
+static inline void endian_convert_lvs( lvm_volume_group_t * group )
14364
+ for ( i = 0; i < group->vg->lv_max; i++ ) {
14365
+ endian_convert_lv(&(group->lv_array[i]));
14370
+/* Function: read_lv
14372
+ * Read in the LV structures for the specified group. Do the read from
14373
+ * the first PV in the group. If that one fails, keep trying on the
14374
+ * remaining PVs until one works. This function will allocate a buffer
14375
+ * for the group to read in the structures.
14377
+static int read_lv( lvm_volume_group_t * group )
14379
+ lvm_physical_volume_t * pv_entry = group->pv_list;
14380
+ unsigned char * lv_buffer = NULL;
14381
+ evms_sector_t start_sector;
14382
+ unsigned long total_sectors;
14383
+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14384
+ unsigned long lv_array_size;
14387
+ if ( group->lv_array ) {
14391
+ if ( ! pv_entry ) {
14392
+ LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n", group->vg_name);
14396
+ // Allocate a buffer to do the actual I/Os.
14397
+ lv_buffer = kmalloc(buffer_size, GFP_NOIO);
14398
+ if ( ! lv_buffer ) {
14399
+ LOG_CRITICAL("Memory error creating I/O buffer for LV structs for Group %s\n", group->vg_name);
14403
+ // Read in the LV structures 4k at a time. If one PV returns errors,
14404
+ // start over with the next PV in the group.
14405
+ while (rc && pv_entry) {
14406
+ start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base);
14407
+ total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size);
14408
+ lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14410
+ // Allocate the buffer for this group to hold the entire LV array.
14411
+ if ( group->lv_array ) {
14412
+ vfree(group->lv_array);
14413
+ group->lv_array = NULL;
14415
+ group->lv_array = vmalloc(lv_array_size);
14416
+ if ( ! group->lv_array ) {
14417
+ LOG_CRITICAL("Memory error creating lv_array buffer for Group %s\n", group->vg_name);
14418
+ kfree(lv_buffer);
14421
+ memset(group->lv_array, 0, lv_array_size);
14423
+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14424
+ rc = INIT_IO(pv_entry->logical_node, 0, start_sector + i, IO_BUFFER_SECTORS, lv_buffer);
14426
+ LOG_SERIOUS("Error reading LV metadata from node %s in Group %s\n",
14427
+ pv_entry->logical_node->name, group->vg_name);
14429
+ // Try the next PV if the current one caused any errors.
14430
+ pv_entry = pv_entry->next;
14434
+ // Copy the I/O buffer into the lv_array
14435
+ memcpy(&(((char*)(group->lv_array))[i*EVMS_VSECTOR_SIZE]), lv_buffer, buffer_size);
14440
+ LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n", group->vg_name);
14441
+ kfree(lv_buffer);
14442
+ vfree(group->lv_array);
14443
+ group->lv_array = NULL;
14447
+ // Clear out the unused portion at the end of the lv_array.
14448
+ memset(&(((char*)(group->lv_array))[pv_entry->pv->lv_on_disk.size]), 0, lv_array_size - pv_entry->pv->lv_on_disk.size);
14450
+ // Endian-neutral conversion of the LV metadata.
14451
+ endian_convert_lvs(group);
14453
+ kfree(lv_buffer);
14458
+/* Function: endian_convert_pe_map
14460
+ * Endian-neutral conversion for PE structures
14462
+static inline void endian_convert_pe_map( lvm_physical_volume_t * pv_entry )
14465
+ for ( i = 0; i < pv_entry->pv->pe_total; i++ ) {
14466
+ pv_entry->pe_map[i].lv_num = le16_to_cpu(pv_entry->pe_map[i].lv_num);
14467
+ pv_entry->pe_map[i].le_num = le16_to_cpu(pv_entry->pe_map[i].le_num);
14472
+/* Function: read_pe_map
14474
+ * Read in the PE map for the specified PV. This function will allocate a
14475
+ * buffer to read in the data.
14477
+static int read_pe_map( lvm_physical_volume_t * pv_entry )
14479
+ evms_logical_node_t * node = pv_entry->logical_node;
14480
+ pv_disk_t * pv = pv_entry->pv;
14481
+ unsigned char * pe_buffer;
14482
+ evms_sector_t start_sector;
14483
+ unsigned long total_sectors;
14484
+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14485
+ unsigned long pe_map_size;
14488
+ if ( pv_entry->pe_map ) {
14492
+ start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base);
14493
+ total_sectors = evms_cs_size_in_vsectors(pv->pe_total * sizeof(pe_disk_t));
14494
+ pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14496
+ // Allocate a buffer to hold the PE map for this PV.
14497
+ //pv_entry->pe_map = vmalloc(total_sectors << EVMS_VSECTOR_SIZE_SHIFT);
14498
+ pv_entry->pe_map = vmalloc(pe_map_size);
14499
+ if ( ! pv_entry->pe_map ) {
14500
+ LOG_CRITICAL("Memory error creating PE map for node %s\n", node->name);
14503
+ memset(pv_entry->pe_map, 0, pe_map_size);
14505
+ // Allocate a buffer for performing the I/O.
14506
+ pe_buffer = kmalloc(buffer_size, GFP_NOIO);
14507
+ if ( ! pe_buffer ) {
14508
+ LOG_CRITICAL("Memory error creating I/O buffer for PE maps for node %s\n", node->name);
14509
+ vfree(pv_entry->pe_map);
14510
+ pv_entry->pe_map = NULL;
14514
+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14515
+ if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, pe_buffer) ) {
14516
+ LOG_SERIOUS("Error reading PE maps from node %s.\n", node->name);
14517
+ kfree(pe_buffer);
14518
+ vfree(pv_entry->pe_map);
14519
+ pv_entry->pe_map = NULL;
14522
+ // Copy the data to the actual PE map.
14523
+ memcpy(&(((char*)(pv_entry->pe_map))[i*EVMS_VSECTOR_SIZE]), pe_buffer, buffer_size);
14526
+ // Clear out the unused portion at the end of the PE map.
14527
+ memset(&(((char*)(pv_entry->pe_map))[total_sectors*EVMS_VSECTOR_SIZE]), 0, pe_map_size - total_sectors*EVMS_VSECTOR_SIZE);
14529
+ // Endian-neutral conversion of the PE metadata.
14530
+ endian_convert_pe_map(pv_entry);
14532
+ kfree(pe_buffer);
14538
+/********** Snapshot Manipulation Functions **********/
14541
+/* Function: snapshot_check_quiesce_original
14543
+ * For this snapshot LV, check that both it and its original are quiesced.
14545
+static int snapshot_check_quiesce_original( lvm_logical_volume_t * snap_volume )
14547
+ lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
14549
+ if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) {
14553
+ if ( org_volume &&
14554
+ ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
14562
+/* Function: snapshot_check_quiesce_all
14564
+ * Go through the list of all snapshots for an original volume, and make
14565
+ * sure everyone is in a quiesced state.
14567
+static int snapshot_check_quiesce_all( lvm_logical_volume_t * org_volume )
14569
+ lvm_logical_volume_t * snap;
14571
+ if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
14575
+ for ( snap = org_volume->snapshot_next; snap; snap = snap->snapshot_next ) {
14576
+ if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) {
14585
+/* Function: invalidate_snapshot_volume
14587
+ * In the event a snapshot volume becomes full or corrupted, its metadata
14588
+ * must be altered in order to prevent it from being used again. Write some
14589
+ * invalid data into the first entry of the COW table. If this volume is
14590
+ * not fully deleted by the user/engine, this invalid COW entry will be
14591
+ * detected by build_snapshot_maps(), and will cause the volume to be
14592
+ * deleted before being exported to EVMS during discover. This is obviously
14593
+ * a hack, but it is the same hack currently used by LVM. We're just trying
14594
+ * to be compatible. :)
14596
+static int invalidate_snapshot_volume( lvm_logical_volume_t * snap_volume )
14598
+ evms_logical_node_t tmp_node;
14600
+ tmp_node.instance_data = snap_volume;
14601
+ tmp_node.total_vsectors = snap_volume->lv_size;
14603
+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
14604
+ LOG_WARNING("Volume %s is not a snapshot. Cannot invalidate\n", snap_volume->name);
14608
+ LOG_WARNING("Invalidating full/corrupted snapshot volume %s\n", snap_volume->name);
14609
+ LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n");
14611
+ if ( snap_volume->cow_table ) {
14612
+ snap_volume->cow_table[0].pv_org_rsector = cpu_to_le64(((evms_sector_t)1));
14613
+ if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) {
14614
+ LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
14618
+ LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
14621
+ snap_volume->lv_status &= ~LV_ACTIVE;
14627
+/* Function: remove_snapshot_from_chain
14629
+ * Remove a snapshot volume from its original's chain of snapshots. This
14630
+ * does not delete the snapshot volume. At runtime, we cannot delete
14631
+ * volumes at the region-manager level, because EVMS may have this volume
14632
+ * exported, and there is no way to notify EVMS of the deletion. It will
14633
+ * eventually need to be deleted in the engine, which will then tell the
14634
+ * EVMS kernel services to delete the volume in the kernel.
14636
+static int remove_snapshot_from_chain( lvm_logical_volume_t * snap_volume )
14638
+ lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
14639
+ lvm_logical_volume_t ** p_volume;
14641
+ if ( org_volume ) {
14642
+ for ( p_volume = &org_volume->snapshot_next; *p_volume; p_volume = &(*p_volume)->snapshot_next ) {
14643
+ if ( *p_volume == snap_volume ) {
14644
+ *p_volume = snap_volume->snapshot_next;
14650
+ snap_volume->snapshot_org = NULL;
14651
+ snap_volume->snapshot_next = NULL;
14656
+/* Function: snapshot_hash
14658
+ * The snapshot hash tables are NEVER going to have 4 billion entries, so
14659
+ * we can safely cast the org_sector to 32 bits and just mod it by the
14660
+ * hash table size.
14662
+static u_int32_t snapshot_hash( evms_sector_t org_sector,
14663
+ lvm_logical_volume_t * snap_volume )
14665
+ return( ((u_int32_t)org_sector) % snap_volume->hash_table_size);
14669
+/* Function: snapshot_search_hash_chain
14671
+ * Search the hash chain that is anchored at the specified head pointer.
14672
+ * If the sector number is found, the result pointer is set to that entry
14673
+ * in the chain, and a 1 is returned. If the sector is not found, the
14674
+ * result pointer is set to the previous entry and 0 is returned. If the
14675
+ * result pointer is NULL, this means either the list is empty, or the
14676
+ * specified sector should become the first list item.
14678
+static int snapshot_search_hash_chain( evms_sector_t org_sector,
14679
+ snapshot_map_entry_t * head,
14680
+ snapshot_map_entry_t ** result )
14682
+ snapshot_map_entry_t * curr = head;
14683
+ snapshot_map_entry_t * prev = head;
14684
+ while ( curr && curr->org_sector < org_sector ) {
14686
+ curr = curr->next;
14689
+ // Either an empty chain or went off the end of the chain.
14693
+ else if ( curr->org_sector != org_sector ) {
14694
+ *result = curr->prev;
14698
+ // Found the desired sector.
14705
+/* Function: insert_snapshot_map_entry
14707
+ * Insert a new entry into a snapshot hash chain, immediately following the
14708
+ * specified entry. This function should not be used to add an entry into
14709
+ * an empty list, or as the first entry in an existing list. For that case,
14710
+ * use insert_snapshot_map_entry_at_head().
14712
+static int insert_snapshot_map_entry( snapshot_map_entry_t * entry,
14713
+ snapshot_map_entry_t * base )
14715
+ entry->next = base->next;
14716
+ entry->prev = base;
14717
+ base->next = entry;
14718
+ if ( entry->next ) {
14719
+ entry->next->prev = entry;
14725
+/* Function: insert_snapshot_map_entry_at_head
14727
+ * Insert a new entry into a snapshot chain as the first entry.
14729
+static int insert_snapshot_map_entry_at_head( snapshot_map_entry_t * entry,
14730
+ snapshot_map_entry_t ** head )
14732
+ entry->next = *head;
14733
+ entry->prev = NULL;
14735
+ if ( entry->next ) {
14736
+ entry->next->prev = entry;
14742
+/* Function: add_cow_entry_to_snapshot_map
14744
+ * Convert a cow table entry (from the on-disk data) into an appropriate
14745
+ * entry for the snapshot map. Insert this new entry into the appropriate
14746
+ * map for the specified volume.
14748
+ * The cow_entry passed into this function must have already been
14749
+ * endian-converted from disk-order to cpu-order.
14751
+static int add_cow_entry_to_snapshot_map(lv_COW_table_disk_t * cow_entry,
14752
+ lvm_logical_volume_t * volume )
14754
+ snapshot_map_entry_t * new_entry;
14755
+ snapshot_map_entry_t ** hash_table;
14756
+ snapshot_map_entry_t * chain_head;
14757
+ snapshot_map_entry_t * target_entry;
14758
+ u_int32_t hash_value;
14760
+ if ( cow_entry->pv_org_number == 0 ) {
14763
+ new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector, cow_entry->pv_snap_rsector);
14764
+ if ( ! new_entry ) {
14767
+ new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number, volume->group);
14768
+ if ( ! new_entry->snap_pv ) {
14772
+ hash_value = snapshot_hash(new_entry->org_sector, volume);
14773
+ hash_table = volume->snapshot_map[cow_entry->pv_org_number];
14774
+ chain_head = hash_table[hash_value];
14775
+ if ( snapshot_search_hash_chain(new_entry->org_sector, chain_head, &target_entry) ) {
14776
+ // In general, we should not find this entry in the snapshot
14777
+ // map already. However, it could happen on a re-discover, but
14778
+ // the build_snapshot_maps function should weed out those cases.
14779
+ // In either event, we can simply ignore duplicates.
14780
+ LOG_WARNING("Detected a duplicate snapshot map entry\n");
14781
+ LOG_WARNING("Snap PV %Ld:%Ld, Org PV %Ld:%Ld\n", cow_entry->pv_snap_number, cow_entry->pv_snap_rsector,
14782
+ cow_entry->pv_org_number, cow_entry->pv_org_rsector);
14783
+ kfree(new_entry);
14786
+ if ( target_entry ) {
14787
+ insert_snapshot_map_entry(new_entry, target_entry);
14790
+ insert_snapshot_map_entry_at_head(new_entry, &hash_table[hash_value]);
14798
+/* Function: snapshot_remap_sector
14800
+ * Perform a sector remap on a snapshot volume. This should be called from
14801
+ * the I/O read path, after the LE-to-PE translation has already been
14802
+ * performed. First, determine the base sector of the chunk containing the
14803
+ * specified sector, and save the remainder. Then, perform a search through
14804
+ * the snapshot map for the specified volume. If a match is found, change
14805
+ * the PV and sector numbers to the new values. If no match is found, leave
14806
+ * the values alone, meaning the read should proceed down the original
14809
+static void snapshot_remap_sector( lvm_logical_volume_t * snap_volume,
14810
+ evms_sector_t pe_start_sector,
14811
+ evms_sector_t * sector,
14812
+ lvm_physical_volume_t ** pv_entry )
14814
+ snapshot_map_entry_t ** hash_table;
14815
+ snapshot_map_entry_t * chain_head;
14816
+ snapshot_map_entry_t * result;
14817
+ u_int32_t hash_value;
14818
+ evms_sector_t chunk_sector;
14819
+ evms_sector_t remainder;
14821
+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
14825
+ chunk_sector = ((*sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
14826
+ remainder = *sector - chunk_sector;
14827
+ hash_value = snapshot_hash(chunk_sector, snap_volume);
14828
+ hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number];
14829
+ chain_head = hash_table[hash_value];
14831
+ if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) {
14832
+ *pv_entry = result->snap_pv;
14833
+ *sector = result->snap_sector + remainder;
14838
+/* Function: snapshot_read_write_chunk
14840
+ * This function takes care of reading one chunk of data from the
14841
+ * original, and writing it to the snapshot. Since the original now has
14842
+ * a fixed sized buffer for this data, we may have to loop to get the
14843
+ * whole chunk copied.
14845
+static int snapshot_read_write_chunk( lvm_logical_volume_t * org_volume,
14846
+ lvm_physical_volume_t * org_pv,
14847
+ evms_sector_t chunk_sector,
14848
+ lvm_logical_volume_t * snap_volume,
14849
+ lvm_physical_volume_t ** snap_pv,
14850
+ evms_sector_t * snap_sector )
14852
+ u_int32_t io_size = snap_volume->chunk_size;
14853
+ evms_sector_t snap_pe_start_sector;
14854
+ evms_sector_t size;
14855
+ int i, iterations = 1;
14857
+ if ( org_volume->chunk_size < snap_volume->chunk_size ) {
14858
+ iterations = snap_volume->chunk_size / org_volume->chunk_size;
14859
+ io_size = org_volume->chunk_size;
14862
+ remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1, snap_sector, &size, &snap_pe_start_sector, snap_pv);
14864
+ // Check for an incomplete volume
14865
+ if ( ! *snap_sector || ! *snap_pv ) {
14866
+ invalidate_snapshot_volume(snap_volume);
14870
+ for ( i = 0; i < iterations; i++ ) {
14872
+ // Read the chunk from the original volume. This is a physical
14873
+ // read, not logical. Thus, stripe boundary considerations are
14874
+ // unnecessary. Also, chunks are always aligned with PEs, so PE
14875
+ // boundary considerations are unnecessary.
14876
+ if ( INIT_IO(org_pv->logical_node, 0, chunk_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
14880
+ // Write this chunk to the snapshot volume. This does duplicate
14881
+ // the local init_io code, but we need to have the remapped
14882
+ // sector later on, so this is slightly more efficient. Snapshot
14883
+ // volumes cannot be striped, so there is no need to consider
14884
+ // stripe-boundary conditions. And just like the read in the
14885
+ // previous line, chunks are always aligned with PEs, so we
14886
+ // don't have to consider PE-boundary conditions.
14887
+ if ( INIT_IO((*snap_pv)->logical_node, 1, *snap_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
14888
+ // An error writing the chunk to the snapshot is the
14889
+ // same situation as the snapshot being full.
14890
+ invalidate_snapshot_volume(snap_volume);
14899
+/* Function: snapshot_copy_data
14901
+ * On a write to a snapshotted volume, check all snapshots to see if the
14902
+ * specified chunk has already been remapped. If it has not, read the
14903
+ * original data from the volume, write the data to the next available
14904
+ * chunk on the snapshot, update the COW table, write the COW table to
14905
+ * the snapshot, and insert a new entry into the snapshot map.
14907
+ * Now converted to copy data to a single snapshot. The looping is left
14908
+ * up to lvm_write.
14910
+static int snapshot_copy_data( lvm_logical_volume_t * org_volume,
14911
+ lvm_logical_volume_t * snap_volume,
14912
+ evms_sector_t pe_start_sector,
14913
+ evms_sector_t org_sector,
14914
+ lvm_physical_volume_t * org_pv )
14916
+ lvm_physical_volume_t * snap_pv;
14917
+ snapshot_map_entry_t ** hash_table;
14918
+ snapshot_map_entry_t * chain_head;
14919
+ snapshot_map_entry_t * target_entry;
14920
+ snapshot_map_entry_t * new_map_entry;
14921
+ u_int32_t hash_value;
14922
+ evms_sector_t chunk_sector;
14923
+ evms_sector_t snap_sector;
14926
+ // Lock out this snapshot while we are remapping.
14927
+ down(&snap_volume->snap_semaphore);
14929
+ // Make sure the snapshot has not been deactivated.
14930
+ if ( ! (snap_volume->lv_status & LV_ACTIVE) ) {
14931
+ up(&snap_volume->snap_semaphore);
14935
+ // Search the hash table to see if this sector has already been
14936
+ // remapped on this snapshot.
14937
+ chunk_sector = ((org_sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
14938
+ hash_value = snapshot_hash(chunk_sector, snap_volume);
14939
+ hash_table = snap_volume->snapshot_map[org_pv->pv_number];
14940
+ chain_head = hash_table[hash_value];
14941
+ if ( snapshot_search_hash_chain(chunk_sector, chain_head, &target_entry) ) {
14942
+ // Chunk is already remapped.
14943
+ up(&snap_volume->snap_semaphore);
14947
+ // Is there room on the snapshot to remap this chunk?
14948
+ if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) {
14949
+ // At this point, the snapshot is full. Any further
14950
+ // writes to the original will cause the snapshot to
14951
+ // become "corrupt" because they can't be remapped.
14952
+ // Take this snapshot permanently offline.
14953
+ invalidate_snapshot_volume(snap_volume);
14954
+ up(&snap_volume->snap_semaphore);
14958
+ rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector, snap_volume, &snap_pv, &snap_sector);
14960
+ up(&snap_volume->snap_semaphore);
14963
+ else if ( rc < 0 ) {
14964
+ up(&snap_volume->snap_semaphore);
14968
+ // Fill in the appropriate COW table entry and write that
14969
+ // metadata sector back to the snapshot volume. Since we are
14970
+ // only writing one sector, there are no boundary conditions.
14971
+ // Must endian-convert each entry as it is added.
14972
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number = cpu_to_le64((evms_sector_t)(org_pv->pv_number));
14973
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector = cpu_to_le64(chunk_sector);
14974
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number = cpu_to_le64((evms_sector_t)(snap_pv->pv_number));
14975
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector = cpu_to_le64(snap_sector);
14976
+ if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
14977
+ // The data was written to the snapshot, but
14978
+ // writing the metadata failed.
14979
+ invalidate_snapshot_volume(snap_volume);
14980
+ up(&snap_volume->snap_semaphore);
14983
+ snap_volume->next_cow_entry++;
14984
+ if ( snap_volume->next_cow_entry >= (EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)) ) {
14985
+ snap_volume->next_cow_entry = 0;
14986
+ snap_volume->current_cow_sector++;
14987
+ memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
14988
+ if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
14989
+ // Can't clear out the next sector of metadata.
14990
+ invalidate_snapshot_volume(snap_volume);
14991
+ up(&snap_volume->snap_semaphore);
14995
+ snap_volume->next_free_chunk += snap_volume->chunk_size;
14997
+ // Create a new snapshot map entry and add it in the appropriate
14998
+ // place in the map.
14999
+ if ( ! (new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector)) ) {
15000
+ invalidate_snapshot_volume(snap_volume);
15001
+ up(&snap_volume->snap_semaphore);
15004
+ new_map_entry->snap_pv = snap_pv;
15005
+ if ( target_entry ) {
15006
+ insert_snapshot_map_entry(new_map_entry, target_entry);
15009
+ insert_snapshot_map_entry_at_head(new_map_entry, &(hash_table[hash_value]));
15012
+ up(&snap_volume->snap_semaphore);
15017
+/* Function: get_snapshot_stats
15019
+static int get_snapshot_stats( lvm_snapshot_stat_ioctl_t * snap_stats )
15021
+ lvm_logical_volume_t * volume;
15022
+ lvm_volume_group_t * group;
15024
+ // Make sure the parameters are in range.
15025
+ if ( snap_stats->lv_number < 1 ||
15026
+ snap_stats->lv_number > MAX_LV ) {
15030
+ // Make sure the specified group and volume exist, and that
15031
+ // this is a snapshot volume.
15032
+ find_group_by_uuid(snap_stats->vg_uuid, &group);
15034
+ ! (volume = group->volume_list[snap_stats->lv_number]) ||
15035
+ ! (volume->lv_access & LV_SNAPSHOT) ) {
15039
+ // Return the starting LBA of the next available chunk.
15040
+ snap_stats->next_free_chunk = volume->next_free_chunk;
15041
+ snap_stats->lv_status = volume->lv_status;
15047
+/********** Memory Allocation/Deallocation Functions **********/
15051
+/* Function: deallocate_physical_volume
15053
+ * Free the memory used by this physical volume. Do not delete the EVMS
15054
+ * node in this function, since this could be called during an error
15055
+ * path when we want to save the logical node.
15057
+static int deallocate_physical_volume( lvm_physical_volume_t * pv_entry )
15059
+ if ( pv_entry->pv ) {
15060
+ kfree(pv_entry->pv);
15061
+ pv_entry->pv = NULL;
15064
+ if ( pv_entry->pe_map ) {
15065
+ vfree(pv_entry->pe_map);
15066
+ pv_entry->pe_map = NULL;
15074
+/* Function: allocate_physical_volume
15076
+ * Create a new lvm_physical_volume_t for the specified volume group.
15077
+ * Initialize the new PV with the evms node and lvm pv information.
15079
+static lvm_physical_volume_t * allocate_physical_volume(evms_logical_node_t * node,
15082
+ lvm_physical_volume_t * new_pv;
15084
+ new_pv = kmalloc(sizeof(lvm_physical_volume_t), GFP_NOIO);
15085
+ if ( ! new_pv ) {
15086
+ LOG_CRITICAL("Memory error creating physical volume for node %s.\n", node->name);
15091
+ // Initialize the PV
15092
+ memset(new_pv, 0, sizeof(lvm_physical_volume_t));
15093
+ new_pv->logical_node = node;
15095
+ new_pv->pv_number = pv->pv_number;
15101
+/* Function: allocate_snapshot_map_entry
15103
+ * Allocate memory for a new entry in the snapshot map and fill in the
15104
+ * sector values. The PV pointer is not filled in here, but can easily
15105
+ * be found by using the find_pv_by_number function.
15107
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t org_sector,
15108
+ evms_sector_t snap_sector )
15110
+ snapshot_map_entry_t * new_entry;
15112
+ new_entry = kmalloc(sizeof(snapshot_map_entry_t), GFP_NOIO);
15113
+ if ( ! new_entry ) {
15116
+ memset(new_entry, 0, sizeof(snapshot_map_entry_t));
15117
+ new_entry->org_sector = org_sector;
15118
+ new_entry->snap_sector = snap_sector;
15119
+ return new_entry;
15123
+/* Function: deallocate_snapshot_map
15125
+ * This function will delete one hash table, which is part of the whole
15126
+ * snapshot remapping structure. Each hash table is an array of pointers
15127
+ * to linked lists of snapshot_map_entry_t's.
15129
+static int deallocate_snapshot_map( snapshot_map_entry_t ** table, u_int32_t table_size )
15131
+ snapshot_map_entry_t * entry;
15132
+ snapshot_map_entry_t * next;
15136
+ for ( i = 0; i < table_size; i++ ) {
15137
+ for ( entry = table[i]; entry; entry = next ) {
15138
+ next = entry->next;
15148
+/* Function: deallocate_logical_volume
15150
+ * Delete the in-memory representation of a single LVM logical volume,
15151
+ * including its PE map and any snapshot data. Do not alter the parent
15152
+ * volume group, except to remove this volume from its volume list.
15154
+static int deallocate_logical_volume( lvm_logical_volume_t * volume )
15156
+ lvm_volume_group_t * group = volume->group;
15157
+ lvm_logical_volume_t * org_volume;
15158
+ lvm_logical_volume_t * snap_volume;
15161
+ // If this volume is a snapshot, remove it from the linked list of
15162
+ // volumes that are snapshotting the original. First, the original
15163
+ // volume must be quiesced.
15164
+ if ( volume->lv_access & LV_SNAPSHOT ) {
15165
+ org_volume = volume->snapshot_org;
15167
+ if ( snapshot_check_quiesce_original(volume) ) {
15171
+ remove_snapshot_from_chain(volume);
15173
+ // If the snapshot that was just removed was the last/only
15174
+ // volume snapshotting the original, then mark the original
15175
+ // as no longer being snapshotted.
15176
+ if ( org_volume && ! org_volume->snapshot_next ) {
15177
+ org_volume->lv_access &= ~LV_SNAPSHOT_ORG;
15181
+ // If this volume is a snapshot original, all of its snapshots must also
15182
+ // be deleted. However, Those deletions need to be taken care of by the
15183
+ // engine. So just check that they have all been quiesced before
15184
+ // removing the original.
15185
+ else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15186
+ if ( snapshot_check_quiesce_all(volume) ) {
15190
+ // In case there are any snapshots remaining, we must clear out
15191
+ // their pointers to this original to prevent errors when those
15192
+ // snapshots are accessed or deleted.
15193
+ for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
15194
+ snap_volume->snapshot_org = NULL;
15198
+ LOG_DEBUG("Deleting volume %s\n", volume->name);
15200
+ // Free all the memory. This includes the LE-to-PE map, any snapshot
15201
+ // hash tables, the COW table, and chunk data buffer.
15202
+ if ( volume->le_map ) {
15203
+ vfree(volume->le_map);
15204
+ volume->le_map = NULL;
15206
+ if ( volume->snapshot_map ) {
15207
+ for ( i = 1; i <= group->pv_count; i++ ) {
15208
+ deallocate_snapshot_map(volume->snapshot_map[i], volume->hash_table_size);
15210
+ kfree(volume->snapshot_map);
15211
+ volume->snapshot_map = NULL;
15213
+ if ( volume->cow_table ) {
15214
+ kfree(volume->cow_table);
15215
+ volume->cow_table = NULL;
15217
+ if ( volume->chunk_data_buffer ) {
15218
+ kfree(volume->chunk_data_buffer);
15219
+ volume->chunk_data_buffer = NULL;
15222
+ // Remove this volume from the volume-group's list.
15223
+ if ( group && group->volume_list[volume->lv_number] == volume ) {
15224
+ group->volume_list[volume->lv_number] = NULL;
15225
+ group->volume_count--;
15234
+/* Function: allocate_logical_volume
15236
+ * Allocate space for a new LVM logical volume, including space for the
15237
+ * LE-to-PE map and any necessary snapshot data.
15239
+static lvm_logical_volume_t * allocate_logical_volume( lv_disk_t * lv,
15240
+ lvm_volume_group_t * group )
15242
+ lvm_logical_volume_t * new_volume;
15243
+ u_int32_t table_entries_per_chunk;
15244
+ u_int32_t table_chunks;
15247
+ // Allocate space for the new logical volume.
15248
+ new_volume = kmalloc(sizeof(lvm_logical_volume_t), GFP_NOIO);
15249
+ if ( ! new_volume ) {
15250
+ LOG_CRITICAL("Memory error creating new logical volume %s\n", lv->lv_name);
15253
+ memset(new_volume, 0, sizeof(lvm_logical_volume_t));
15255
+ // Allocate space for the LE to PE mapping table
15256
+ new_volume->le_map = vmalloc(lv->lv_allocated_le*sizeof(le_table_entry_t));
15257
+ if ( ! new_volume->le_map ) {
15258
+ LOG_CRITICAL("Memory error creating LE map for logical volume %s\n", lv->lv_name);
15259
+ kfree(new_volume);
15262
+ memset(new_volume->le_map, 0, lv->lv_allocated_le*sizeof(le_table_entry_t));
15264
+ // Initialize the rest of the new volume.
15265
+ new_volume->lv_number = lv->lv_number + 1; // Need the +1 to match the PE Map entries on the PV
15266
+ new_volume->lv_size = lv->lv_size;
15267
+ new_volume->lv_access = lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED; // All volumes start new and quiesced.
15268
+ new_volume->lv_status = lv->lv_status | LV_ACTIVE; // All LVs start as active.
15269
+ new_volume->lv_minor = MINOR(lv->lv_dev);
15270
+ new_volume->stripes = lv->lv_stripes;
15271
+ new_volume->stripe_size = lv->lv_stripesize;
15272
+ new_volume->stripe_size_shift = evms_cs_log2(lv->lv_stripesize);
15273
+ new_volume->pe_size = group->vg->pe_size;
15274
+ new_volume->pe_size_shift = evms_cs_log2(group->vg->pe_size);
15275
+ new_volume->num_le = lv->lv_allocated_le;
15276
+ new_volume->group = group;
15277
+ // Different naming scheme for EVMS nodes.
15278
+ if ( translate_lv_name(lv->lv_name, new_volume->name) ) {
15279
+ deallocate_logical_volume(new_volume);
15283
+ // If the volume is a snapshot, initialize the remaining data, and
15284
+ // allocate space for the remapping structures, and one sector's worth
15285
+ // of COW tables.
15286
+ if ( new_volume->lv_access & LV_SNAPSHOT ) {
15287
+ new_volume->chunk_size = lv->lv_chunk_size;
15288
+ new_volume->num_chunks = lv->lv_size / lv->lv_chunk_size;
15289
+ new_volume->snap_org_minor = lv->lv_snapshot_minor;
15290
+ new_volume->next_cow_entry = 0;
15291
+ new_volume->current_cow_sector = 0;
15292
+ table_entries_per_chunk = (new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT) / sizeof(lv_COW_table_disk_t);
15293
+ table_chunks = (new_volume->num_chunks + table_entries_per_chunk - 1) / table_entries_per_chunk;
15294
+ new_volume->next_free_chunk = table_chunks * new_volume->chunk_size;
15295
+ new_volume->hash_table_size = (lv->lv_size / lv->lv_chunk_size / MAX_HASH_CHAIN_ENTRIES) + 1;
15297
+ new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
15298
+ if ( ! new_volume->cow_table ) {
15299
+ LOG_CRITICAL("Memory error creating COW table for logical volume %s\n", lv->lv_name);
15300
+ deallocate_logical_volume(new_volume);
15303
+ memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
15305
+ new_volume->snapshot_map = kmalloc((group->pv_count+1) * sizeof(snapshot_map_entry_t**), GFP_NOIO);
15306
+ if ( ! new_volume->snapshot_map ) {
15307
+ LOG_CRITICAL("Memory error creating snapshot map for logical volume %s\n", lv->lv_name);
15308
+ deallocate_logical_volume(new_volume);
15312
+ new_volume->snapshot_map[0] = NULL;
15313
+ for ( i = 1; i <= group->pv_count; i++ ) {
15314
+ new_volume->snapshot_map[i] = vmalloc(new_volume->hash_table_size * sizeof(snapshot_map_entry_t*));
15315
+ if ( ! new_volume->snapshot_map[i] ) {
15316
+ LOG_CRITICAL("Memory error creating snapshot sub-map for logical volume %s\n", lv->lv_name);
15317
+ deallocate_logical_volume(new_volume);
15320
+ memset(new_volume->snapshot_map[i], 0, new_volume->hash_table_size*sizeof(snapshot_map_entry_t*));
15322
+ init_MUTEX(&new_volume->snap_semaphore);
15325
+ // If the volume is a snapshot original, allocate space to use for
15326
+ // copying snapshot chunks. This will now be a fixed size instead of
15327
+ // being based on the chunk size of the snapshots.
15328
+ else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) {
15329
+ new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE;
15330
+ new_volume->chunk_data_buffer = kmalloc(new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
15331
+ if ( ! new_volume->chunk_data_buffer ) {
15332
+ LOG_SERIOUS("Memory error creating snapshot chunk buffer for logical volume %s\n", lv->lv_name);
15333
+ deallocate_logical_volume(new_volume);
15336
+ memset(new_volume->chunk_data_buffer, 0, new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT);
15339
+ return new_volume;
15343
+/* Function: deallocate_volume_group
15345
+ * Delete the entire in-memory representation of an LVM volume group,
15346
+ * including all PVs and logical volumes. If this group is on LVM's
15347
+ * volume group list, remove it.
15349
+static int deallocate_volume_group( lvm_volume_group_t * group )
15351
+ lvm_physical_volume_t * pv_entry;
15352
+ lvm_physical_volume_t * next_pv;
15355
+ LOG_DEBUG("Deleting volume group %s\n", group->vg_name);
15357
+ // Remove the group from the global list.
15358
+ remove_group_from_list(group);
15360
+ // Delete the LV metadata array.
15361
+ if ( group->lv_array ) {
15362
+ vfree(group->lv_array);
15363
+ group->lv_array = NULL;
15366
+ // Delete the PV UUID list
15367
+ if ( group->uuid_list ) {
15368
+ vfree(group->uuid_list);
15369
+ group->uuid_list = NULL;
15372
+ // Delete all logical volumes.
15373
+ for ( i = 1; i <= MAX_LV; i++ ) {
15374
+ if ( group->volume_list[i] ) {
15375
+ deallocate_logical_volume(group->volume_list[i]);
15376
+ group->volume_list[i] = NULL;
15380
+ // Delete all PVs from the group's list.
15381
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
15382
+ next_pv = pv_entry->next;
15383
+ if ( pv_entry->logical_node ) {
15384
+ // Send a delete command down to the partition manager.
15385
+ LOG_DEBUG("Deleting PV %s from group %s\n", pv_entry->logical_node->name, group->vg_name);
15386
+ DELETE(pv_entry->logical_node);
15387
+ pv_entry->logical_node = NULL;
15389
+ deallocate_physical_volume(pv_entry);
15392
+ // Delete the VG metadata.
15393
+ if ( group->vg ) {
15394
+ kfree(group->vg);
15395
+ group->vg = NULL;
15404
+/* Function: allocate_volume_group
15406
+ * Allocate space for a new LVM volume group and all of its sub-fields.
15407
+ * Initialize the appropriate fields.
15408
+ * vg parameter should already have an allocate/initialized vg_disk_t.
15410
+static lvm_volume_group_t * allocate_volume_group( vg_disk_t * vg,
15411
+ unsigned char * vg_name )
15413
+ lvm_volume_group_t * new_group;
15415
+ // The volume group itself.
15416
+ new_group = kmalloc(sizeof(lvm_volume_group_t), GFP_NOIO);
15417
+ if ( ! new_group ) {
15422
+ // Initialize the new group.
15423
+ memset(new_group, 0, sizeof(lvm_volume_group_t));
15424
+ memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN);
15425
+ strncpy(new_group->vg_name, vg_name, NAME_LEN-1);
15426
+ new_group->vg = vg;
15427
+ new_group->hard_sect_size = 512; // Default value
15428
+ new_group->block_size = 1024; // Default value
15429
+ new_group->flags = EVMS_VG_DIRTY;
15431
+ LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name);
15433
+ return new_group;
15437
+/* Function: remove_pv_from_group
15439
+ * In the engine, when a PV is removed from a group (on a vgreduce), that
15440
+ * same PV must be removed from that group in the kernel. Otherwise, when
15441
+ * the rediscover occurs, that PV will still appear in the group, and
15442
+ * will cause segfaults when we try to read metadata from it.
15444
+static int remove_pv_from_group(int pv_number,
15445
+ unsigned char * vg_uuid )
15447
+ lvm_volume_group_t * group;
15448
+ lvm_physical_volume_t * pv_entry;
15449
+ lvm_physical_volume_t ** p_pv_entry;
15452
+ // Make sure the numbers are in range.
15453
+ if ( pv_number < 0 || pv_number > MAX_PV ) {
15457
+ // Make sure the group exists.
15458
+ find_group_by_uuid(vg_uuid, &group);
15463
+ // Make sure the PV is in this group.
15464
+ pv_entry = find_pv_by_number(pv_number, group);
15465
+ if ( ! pv_entry ) {
15466
+ LOG_WARNING("Did not find PV %d in group %s\n", pv_number, group->vg_name);
15470
+ // Make sure the PV is not in use by any volumes
15471
+ if ( check_pv_for_lv(pv_entry, group) ) {
15472
+ LOG_SERIOUS("PV %d in group %s still contains LVs\n", pv_number, group->vg_name);
15476
+ // Take this PV out of the group's list.
15477
+ for ( p_pv_entry = &group->pv_list; *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) {
15478
+ if ( *p_pv_entry == pv_entry ) {
15479
+ *p_pv_entry = (*p_pv_entry)->next;
15480
+ pv_entry->next = NULL;
15485
+ group->pv_count--;
15487
+ // There is no way that this PV was the last from this group, so the
15488
+ // group never needs to be deleted at this point. The only way this
15489
+ // group will exist in the kernel is if there are volumes exported from
15490
+ // it. If this was the last PV, then those volumes must be on that PV,
15491
+ // and it wouldn't be allowed to be removed from the group (above).
15493
+ // Free up the memory for this PV. Just drop the node.
15494
+ deallocate_physical_volume(pv_entry);
15496
+ LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name);
15502
+/********** Consistency Checking Functions **********/
15505
+/* Function: clear_le_entries_for_missing_pv
15507
+static void clear_le_entries_for_missing_pv( lvm_volume_group_t * group,
15508
+ lvm_physical_volume_t * pv_entry )
15510
+ lvm_logical_volume_t * volume;
15513
+ for ( i = 1; i <= MAX_LV; i++ ) {
15514
+ if ( group->volume_list[i] ) {
15515
+ volume = group->volume_list[i];
15516
+ for ( j = 0; j < volume->num_le; j++ ) {
15517
+ if ( volume->le_map[j].owning_pv == pv_entry ) {
15518
+ volume->le_map[j].owning_pv = NULL;
15519
+ volume->le_map[j].pe_sector_offset = 0;
15527
+/* Function: check_volume_groups
15529
+ * This function performs some simple consistency checks on all dirty
15530
+ * volume groups. Any groups that have no PVs are deleted. If any metadata
15531
+ * structures (PV or VG) are missing, they are read in from disk.
15533
+static int check_volume_groups( void )
15535
+ lvm_volume_group_t * group;
15536
+ lvm_volume_group_t * next_group;
15537
+ lvm_physical_volume_t * pv_entry;
15538
+ lvm_physical_volume_t * next_pv;
15541
+ for ( group = lvm_group_list; group; group = next_group) {
15542
+ next_group = group->next_group;
15544
+ LOG_DEBUG("Checking Group %s\n", group->vg_name);
15546
+ // If a group has no PVs, it can be safely deleted,
15547
+ // because we can't find any volumes on it.
15548
+ if ( ! group->pv_count ) {
15549
+ LOG_WARNING("No PVs found for Group %s.\n", group->vg_name);
15550
+ if ( ! group->volume_count ) {
15551
+ deallocate_volume_group(group);
15556
+ // Make sure all metadata for the PVs is present. On a
15557
+ // rediscover, it may be missing, because we delete it at the
15558
+ // end of discovery. If any is missing, read it in from disk.
15559
+ // This is only necessary in the kernel. It can't happen in
15561
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
15562
+ next_pv = pv_entry->next;
15563
+ if ( ! pv_entry->pv ) {
15564
+ LOG_DEBUG("Re-reading PV metadata for node %s\n", pv_entry->logical_node->name);
15565
+ rc = read_pv(pv_entry->logical_node, &pv_entry->pv);
15567
+ // What happens if we can't re-read the
15568
+ // PV metadata? This PV must be removed
15569
+ // from the group. Need to also clear
15570
+ // all LE entries in all LVs that are
15571
+ // pointing to this PV before it can be
15572
+ // removed from the list.
15573
+ LOG_SERIOUS("PV metadata is missing or cannot be read from node %s\n", pv_entry->logical_node->name);
15574
+ clear_le_entries_for_missing_pv(group, pv_entry);
15575
+ remove_pv_from_group(pv_entry->pv_number, group->vg_uuid);
15578
+ pv_entry->pv_number = pv_entry->pv->pv_number;
15580
+ // Check for a "stale" PV. This case should be
15581
+ // already be covered, as long as the Engine is
15582
+ // calling the PV_REMOVE ioctl when it does a
15583
+ // vgreduce or a pvremove. If this is the last
15584
+ // PV in the group, the group will be deleted.
15585
+ if ( ! pv_entry->pv_number ) {
15586
+ remove_pv_from_group(0, group->vg_uuid);
15591
+ if ( ! pv_entry->pe_map ) {
15592
+ LOG_DEBUG("Re-reading PE maps for node %s\n", pv_entry->logical_node->name);
15593
+ rc = read_pe_map(pv_entry);
15595
+ LOG_WARNING("Error reading PE maps for node %s\n", pv_entry->logical_node->name);
15596
+ LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
15601
+ // Make sure the metadata for the VG is present. If it's
15602
+ // missing, read it in from the first PV in the VG.
15603
+ if ( ! group->vg && group->pv_count ) {
15604
+ LOG_DEBUG("Re-reading VG metadata for Group %s\n", group->vg_name);
15605
+ pv_entry = group->pv_list;
15606
+ rc = read_vg(pv_entry->logical_node, pv_entry->pv, &group->vg);
15608
+ // What happens if we can't re-read the
15609
+ // VG metadata? It's definitely bad
15610
+ // news. Should we delete the VG?
15615
+ // Display a warning if the number of PVs found for the group
15616
+ // doesn't match the number of PVs recorded for the VG.
15617
+ if ( group->vg && group->pv_count != group->vg->pv_cur ) {
15618
+ LOG_WARNING("Group %s is incomplete.\n", group->vg_name);
15619
+ LOG_WARNING(" Only %d of %d PVs found.\n", group->pv_count, group->vg->pv_cur);
15620
+ LOG_WARNING(" Volumes in this group may be incomplete.\n");
15628
+/* Function: check_le_maps
15630
+ * Make sure all volumes in this group have valid LE-to-PE maps. Any
15631
+ * volume that doesn't is marked as incomplete. This is safe for
15632
+ * re-discovery because only new volumes could have corrupted LE maps.
15634
+static int check_le_maps( lvm_volume_group_t * group )
15636
+ lvm_logical_volume_t * volume;
15639
+ for ( i = 1; i <= MAX_LV; i++ ) {
15640
+ volume = group->volume_list[i];
15641
+ if ( ! volume ) {
15645
+ if ( ! volume->le_map ) {
15646
+ // No point in keeping the volume around if it has
15647
+ // no LE map at all.
15648
+ LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
15649
+ deallocate_logical_volume(volume);
15653
+ // If any entries in the LE map are missing, mark this volume
15654
+ // as incomplete.
15655
+ for ( j = 0, count = 0; j < volume->num_le; j++ ) {
15656
+ if ( ! volume->le_map[j].owning_pv ||
15657
+ ! volume->le_map[j].pe_sector_offset ) {
15662
+ LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
15663
+ LOG_SERIOUS(" Missing %d out of %d LEs.\n", count, volume->num_le);
15664
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
15671
+/* Function: check_snapshot_map
15673
+ * For snapshot volumes, make sure the snapshot map is intact, and that
15674
+ * any existing entries in the map are in the correct order and there
15675
+ * are no duplicate entries.
15677
+static int check_snapshot_map( lvm_logical_volume_t * snap_volume )
15679
+ snapshot_map_entry_t ** table;
15680
+ snapshot_map_entry_t * curr;
15683
+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
15686
+ if ( ! snap_volume->snapshot_map ) {
15687
+ snap_volume->lv_access |= EVMS_LV_INVALID;
15690
+ for ( i = 1; i <= snap_volume->group->pv_count; i++ ) {
15691
+ if ( ! snap_volume->snapshot_map[i] ) {
15692
+ snap_volume->lv_access |= EVMS_LV_INVALID;
15695
+ table = snap_volume->snapshot_map[i];
15696
+ for ( j = 0; j < snap_volume->hash_table_size; j++ ) {
15697
+ for ( curr = table[j]; curr; curr = curr->next ) {
15698
+ if ( curr->next && curr->org_sector >= curr->next->org_sector ) {
15699
+ snap_volume->lv_access |= EVMS_LV_INVALID;
15709
+/* Function: check_logical_volumes
15711
+ * Perform a consistency check on all of the logical volumes that have been
15712
+ * discovered. Any volume that has any inconsistencies will be marked as
15713
+ * incomplete or invalid, depending on the severity of the problem. At the
15714
+ * end, all invalid volumes are deleted. If the deleted_incompletes
15715
+ * parameter is set, those will also be deleted.
15717
+static int check_logical_volumes( int final_discovery )
15719
+ lvm_volume_group_t * group;
15720
+ lvm_logical_volume_t * volume;
15721
+ lvm_logical_volume_t * snap;
15722
+ lvm_logical_volume_t * next;
15726
+ // Check every valid, dirty volume group
15727
+ for ( group = lvm_group_list; group; group = group->next_group ) {
15728
+ if ( ! (group->flags & EVMS_VG_DIRTY) ) {
15732
+ // Check every valid volume in this group
15733
+ for ( i = 1; i <= MAX_LV; i++ ) {
15734
+ volume = group->volume_list[i];
15735
+ if ( ! volume ) {
15739
+ LOG_DEBUG("Checking logical volume %s\n", volume->name);
15741
+ if ( ! volume->group ) {
15742
+ volume->group = group;
15745
+ // All LE-map entries must have valid values. The I/O
15746
+ // paths now detect missing LE entries.
15747
+ if ( volume->le_map ) {
15748
+ for ( j = 0, count = 0; j < volume->num_le; j++ ) {
15749
+ if ( ! volume->le_map[j].owning_pv ||
15750
+ ! volume->le_map[j].pe_sector_offset ) {
15755
+ LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
15756
+ LOG_SERIOUS(" Missing %d out of %d LEs.\n", count, volume->num_le);
15757
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
15760
+ // In case this volume was previously
15761
+ // marked incomplete.
15762
+ volume->lv_access &= ~EVMS_LV_INCOMPLETE;
15766
+ // This should only ever happen due to
15767
+ // memory corruption.
15768
+ LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
15769
+ volume->lv_access |= EVMS_LV_INVALID;
15772
+ // For a snapshot original, check all snapshots in the
15773
+ // chain, to make sure they point back to the original.
15774
+ // Also, make sure there is memory for the chunk buffer.
15775
+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15776
+ for ( snap = volume->snapshot_next, count = 0; snap; snap = snap->snapshot_next, count++ ) {
15777
+ if ( snap->snapshot_org != volume ) {
15778
+ LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", volume->name);
15779
+ snap->snapshot_org = NULL;
15780
+ snap->lv_access |= EVMS_LV_INVALID;
15784
+ LOG_WARNING("No snapshots found for volume %s\n", volume->name);
15785
+ if ( final_discovery ) {
15786
+ volume->lv_access &= ~LV_SNAPSHOT_ORG;
15789
+ else if ( ! volume->chunk_data_buffer ) {
15790
+ volume->lv_access |= EVMS_LV_INVALID;
15794
+ // For a snapshot volume, make sure it points back to
15795
+ // its original. Also make sure there is memory for the
15796
+ // cow table, and that any existing snapshot entries in
15797
+ // the snapshot map are correctly ordered.
15798
+ else if ( volume->lv_access & LV_SNAPSHOT ) {
15799
+ // Is there a COW table?
15800
+ if ( ! volume->cow_table ) {
15801
+ LOG_SERIOUS("Snapshot volume %s has no COW table\n", volume->name);
15802
+ volume->lv_access |= EVMS_LV_INVALID;
15804
+ // Is the snapshot map in order?
15805
+ if ( check_snapshot_map(volume) ) {
15806
+ LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n", volume->name);
15807
+ volume->lv_access |= EVMS_LV_INVALID;
15809
+ // Is there an original volume? This is only
15810
+ // a real problem during final discovery.
15811
+ if ( ! volume->snapshot_org ) {
15812
+ LOG_SERIOUS("Snapshot volume %s not pointing at an original\n", volume->name);
15813
+ if ( final_discovery ) {
15814
+ volume->lv_access |= EVMS_LV_INVALID;
15817
+ // Is the original the correct one?
15818
+ else if ( volume->snap_org_minor != volume->snapshot_org->lv_minor ) {
15819
+ LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", volume->name);
15820
+ volume->lv_access |= EVMS_LV_INVALID;
15824
+ // Delete any invalid volumes from use. Delete
15825
+ // incomplete volumes as well if this is not final
15826
+ // discovery. If a snapshot original is bad, delete all
15827
+ // of its snapshots.
15828
+ if ( volume->lv_access & EVMS_LV_INVALID ||
15829
+ (!final_discovery &&
15830
+ (volume->lv_access & EVMS_LV_INCOMPLETE) &&
15831
+ (volume->lv_access & EVMS_LV_NEW) ) ) {
15832
+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15833
+ for ( snap = volume->snapshot_next; snap; snap = next ) {
15834
+ next = snap->snapshot_next;
15835
+ snap->snapshot_next = NULL;
15836
+ snap->snapshot_org = NULL;
15837
+ invalidate_snapshot_volume(snap);
15838
+ deallocate_logical_volume(snap);
15840
+ volume->snapshot_next = NULL;
15842
+ else if ( volume->lv_access & LV_SNAPSHOT ) {
15843
+ invalidate_snapshot_volume(volume);
15845
+ deallocate_logical_volume(volume);
15855
+/********** Volume Group Discovery Functions **********/
15859
+/* Function: find_group_for_pv
15861
+ * This is a discover-time function. It reads the VG metadata info for the
15862
+ * specified node, and locates the appropriate group that owns that
15863
+ * node. If that group does not already exist, it is created and
15866
+static int find_group_for_pv( evms_logical_node_t * node,
15868
+ lvm_volume_group_t ** group )
15875
+ // Check for an unassigned PV.
15876
+ if ( pv->vg_name[0] == 0 ) {
15880
+ // Read the VG on-disk info for this PV. If this succeeds, it
15881
+ // allocates a new VG metadata structure.
15882
+ rc = read_vg(node, pv, &vg);
15887
+ // Use the UUID from the VG metadata to determine if this group
15888
+ // has already been discovered and constructed.
15889
+ find_group_by_uuid(vg->vg_uuid, group);
15891
+ if ( ! *group ) {
15892
+ // Create a new group entry and add to the global list.
15893
+ *group = allocate_volume_group(vg, pv->vg_name);
15894
+ if ( ! *group ) {
15897
+ add_group_to_list(*group);
15899
+ else if ( ! (*group)->vg ) {
15900
+ // On a rediscover, the VG metadata for an existing group might
15901
+ // be missing. Fill it in if necessary. This check is also not
15902
+ // necessary in the engine, since the metadata is never deleted.
15903
+// Should we re-copy vg_name? (vg_uuid can not be allowed to change).
15904
+// Or should vg_name changes be done through direct ioctl only?
15905
+ (*group)->vg = vg;
15911
+ // Read in the UUID list for this group, if it isn't present.
15912
+ rc = read_uuid_list(node, pv, *group);
15914
+ LOG_WARNING("Error reading UUID list for group %s.\n", (*group)->vg_name);
15915
+ LOG_WARNING("May not be able to verify PV UUIDs for group %s\n", (*group)->vg_name);
15918
+ // In the kernel, any time we even see a PV for a group, that group
15919
+ // must be marked dirty so its volumes will be re-exported.
15920
+ (*group)->flags |= EVMS_VG_DIRTY;
15926
+/* Function: check_for_duplicate_pv
15928
+ * Search the list of PVs in the specified volume group. If the
15929
+ * specified node already exists in the list, we can discard it.
15931
+static int check_for_duplicate_pv( evms_logical_node_t * node,
15933
+ lvm_volume_group_t * group )
15935
+ lvm_physical_volume_t * pv_entry;
15937
+ // For re-discovery, we need to search all existing PVs in this VG to
15938
+ // make sure we didn't get a duplicate from the plugin below us. The
15939
+ // plugins below us should be re-exporting the same node on
15940
+ // re-discovery, instead of creating a new node to represent the same
15941
+ // objects, so just check the memory location.
15942
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
15943
+ if ( pv_entry->logical_node == node ) {
15945
+ // We found a duplicate. Just ignore the duplicate.
15946
+ LOG_DEBUG("PV %s is already in Group %s.\n", node->name, group->vg_name);
15948
+ // Even if the node was a duplicate, we may need to
15949
+ // fill in the pv entry for this partition, since we
15950
+ // always delete those at the end of discovery.
15951
+ if ( ! pv_entry->pv ) {
15952
+ pv_entry->pv = pv;
15953
+ pv_entry->pv_number = pv->pv_number;
15963
+ // No duplicate was found.
15968
+/* Function: verify_pv_uuid
15970
+ * Verify that the specified PV belongs in the specified group by
15971
+ * searching for the PV's UUID in the group's list.
15973
+static int verify_pv_uuid( lvm_physical_volume_t * pv_entry,
15974
+ lvm_volume_group_t * group )
15978
+ // Obviously the UUID list must be present in order to search.
15979
+ if ( ! group->uuid_list ) {
15980
+ LOG_WARNING("UUID list is missing from group %s.\n", group->vg_name);
15981
+ LOG_WARNING("Cannot verify UUID for PV %s\n", pv_entry->logical_node->name);
15985
+ // Start with the UUID entry for this PV's number
15986
+ if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[(pv_entry->pv_number-1)*NAME_LEN]), UUID_LEN) ) {
15990
+ // If it wasn't found there, then search the entire group's list.
15991
+ for ( i = 0; i < group->vg->pv_cur; i++ ) {
15992
+ if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[i*NAME_LEN]), UUID_LEN) ) {
15993
+ // Found the UUID.
15994
+ LOG_WARNING("Detected UUID mismatch for PV %s!\n", pv_entry->logical_node->name);
15995
+ LOG_WARNING("PV %s is recorded as being at index %d,\n", pv_entry->logical_node->name, pv_entry->pv_number);
15996
+ LOG_WARNING(" but Group %s has it recorded at index %d.\n", group->vg_name, i+1);
15997
+ LOG_WARNING("Run the EVMS Engine to correct the problem.\n");
15998
+ LOG_WARNING("If you have any snapshot regions in group %s\n", group->vg_name);
15999
+ LOG_WARNING(" it is recommended that you delete them immediately!\n");
16004
+ LOG_SERIOUS("Could not find UUID for PV %s in group %s\n", pv_entry->logical_node->name, group->vg_name);
16009
+/* Function: add_pv_to_group
16011
+ * Adds the physical volume to the appropriate volume group. The PV
16012
+ * passed into this function MUST be part of a valid VG.
16014
+static int add_pv_to_group( lvm_physical_volume_t * pv_entry,
16015
+ lvm_volume_group_t * group )
16019
+ // Make sure this PV's UUID is listed in the group.
16020
+ rc = verify_pv_uuid(pv_entry, group);
16022
+ LOG_SERIOUS("PV %s does not belong in group %s!\n", pv_entry->logical_node->name, group->vg_name);
16026
+ // Add this PV to the beginning of its group's list.
16027
+ pv_entry->next = group->pv_list;
16028
+ group->pv_list = pv_entry;
16029
+ group->pv_count++;
16031
+ // Update the group's block and hardsector sizes as appropriate.
16032
+ group->block_size = max(pv_entry->logical_node->block_size, group->block_size);
16033
+ group->hard_sect_size = max(pv_entry->logical_node->hardsector_size, group->hard_sect_size);
16035
+ // Check for the Partial or Removable flag on the PV.
16036
+ if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) {
16037
+ group->flags |= EVMS_VG_PARTIAL_PVS;
16039
+ if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) {
16040
+ group->flags |= EVMS_VG_REMOVABLE_PVS;
16043
+ LOG_DETAILS("PV %s added to Group %s\n", pv_entry->logical_node->name, group->vg_name);
16049
+/* Function: discover_volume_groups
16051
+ * Examine the list of logical nodes. Any node that contains a valid PV
16052
+ * structure is consumed and added to the appropriate volume group. PVs
16053
+ * which do not belong to any group are deleted. Everything else is left
16054
+ * on the discovery list.
16056
+static int discover_volume_groups( evms_logical_node_t ** evms_node_list )
16058
+ evms_logical_node_t * node;
16059
+ evms_logical_node_t * next_node;
16061
+ lvm_volume_group_t * group;
16062
+ lvm_physical_volume_t * pv_entry;
16065
+ LOG_EXTRA("Searching for PVs in the node list.\n");
16067
+ // Run through the discovery list
16068
+ for ( node = *evms_node_list; node; node = next_node ) {
16069
+ // Save the next node. We may remove this one from the list.
16070
+ next_node = node->next;
16072
+ // Read the PV metadata. This will also create a new pv_disk_t
16073
+ // if it finds the correct LVM signatures.
16074
+ rc = read_pv(node, &pv);
16076
+ // This node is not an LVM PV, or an error occurred.
16077
+ // Just leave the node on the discovery list.
16081
+ rc = find_group_for_pv(node, pv, &group);
16083
+ // Error getting the group for this PV.
16089
+ // This node is an unassigned PV.
16090
+ LOG_DETAILS("PV %s is unassigned.\n", node->name);
16095
+ rc = check_for_duplicate_pv(node, pv, group);
16097
+ // This node is already in the group. This check is also
16098
+ // only in the kernel because the engine has no notion
16099
+ // of rediscover, and thus can never get a duplicate.
16100
+ evms_cs_remove_logical_node_from_list(evms_node_list, node);
16104
+ // Allocate a PV entry for this node.
16105
+ pv_entry = allocate_physical_volume(node, pv);
16106
+ if ( ! pv_entry ) {
16110
+ // Add this PV to the appropriate volume group.
16111
+ rc = add_pv_to_group(pv_entry, group);
16113
+ deallocate_physical_volume(pv_entry);
16117
+ rc = read_pe_map(pv_entry);
16119
+ LOG_WARNING("Error reading PE maps for node %s\n", node->name);
16120
+ LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
16123
+ evms_cs_remove_logical_node_from_list(evms_node_list, node);
16126
+ LOG_EXTRA("Group discovery complete.\n");
16132
+/********** Logical Volume Discovery Functions **********/
16136
+/* Function: build_le_maps
16138
+ * After all logical volumes have been discovered, the mappings from
16139
+ * logical extents to physical extents must be constructed. Each PV
16140
+ * contains a map on-disk of its PEs. Each PE map entry contains the
16141
+ * logical volume number and the logical extent number on that volume.
16142
+ * Our internal map is the reverse of this map for each volume, listing
16143
+ * the PV node and sector offset for every logical extent on the volume.
16145
+static int build_le_maps( lvm_volume_group_t * group )
16147
+ lvm_logical_volume_t ** volume_list = group->volume_list;
16148
+ lvm_physical_volume_t * pv_entry;
16149
+ evms_logical_node_t * node;
16151
+ pe_disk_t * pe_map;
16152
+ evms_sector_t offset;
16153
+ u_int32_t lv_number;
16154
+ u_int32_t le_number;
16155
+ u_int32_t first_pe_sector;
16158
+ LOG_DEBUG("Building LE maps for new volumes in group %s.\n", group->vg_name);
16160
+ // For every PV in this VG
16161
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16162
+ node = pv_entry->logical_node;
16163
+ pv = pv_entry->pv;
16164
+ pe_map = pv_entry->pe_map;
16166
+ // Version 1 metadata uses pe_on_disk.base + .size to find start
16167
+ // of first PE. Version 2 uses pe_start.
16168
+ if ( pv->version == 1 ) {
16169
+ first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
16172
+ first_pe_sector = pv->pe_start;
16173
+ if ( ! first_pe_sector ) {
16174
+ first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
16178
+ // For every entry in the PE map, calculate the PE's sector offset
16179
+ // and update the correct LV's PE map. LV number of 0 marks an unused PE.
16180
+ // For re-discovery, only compute entries for new volumes. If a PV
16181
+ // is read-only, all LVs on that PV will also be read-only.
16182
+ for ( i = 0; i < pv->pe_total; i++ ) {
16183
+ lv_number = pe_map[i].lv_num;
16184
+ if ( lv_number &&
16185
+ volume_list[lv_number] &&
16186
+ volume_list[lv_number]->lv_access & (EVMS_LV_NEW|EVMS_LV_INCOMPLETE) ) {
16187
+ le_number = pe_map[i].le_num;
16188
+ offset = i * pv->pe_size + first_pe_sector;
16189
+ volume_list[lv_number]->le_map[le_number].owning_pv = pv_entry;
16190
+ volume_list[lv_number]->le_map[le_number].pe_sector_offset = offset;
16191
+ if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) {
16192
+ volume_list[lv_number]->lv_access &= ~LV_WRITE;
16202
+/* Function: build_snapshot_maps
16204
+ * For every volume in this group that is a snapshot, read all of the
16205
+ * existing entries in the COW table, and build up the snapshot mapping
16206
+ * structures accordingly.
16208
+ * For reference, the COW tables attached to the snapshot volumes will
16209
+ * always be in disk-order (little-endian), so that it can always be
16210
+ * immediately written to disk. Therefore, endian conversions are necessary
16211
+ * any time the COW table is accessed. This function will make a local
16212
+ * copy of each COW table sector, and convert the local copy before
16213
+ * building the snapshot maps.
16215
+static int build_snapshot_maps( lvm_volume_group_t * group )
16217
+ lvm_logical_volume_t * volume;
16218
+ evms_logical_node_t tmp_node;
16219
+ lv_COW_table_disk_t cow_table[EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)];
16220
+ unsigned long max_entries = EVMS_VSECTOR_SIZE / sizeof(lv_COW_table_disk_t);
16221
+ int i, j, rc = 0;
16223
+ // Check every volume in the group to see if it is a snapshot. Also
16224
+ // check to make sure it is a new volume in the case of re-discovery.
16225
+ for ( i = 1; i <= MAX_LV; i++ ) {
16227
+ // The volume must exist, must be new, and must be a snapshot
16228
+ volume = group->volume_list[i];
16230
+ ! (volume->lv_access & EVMS_LV_NEW) ||
16231
+ ! (volume->lv_access & LV_SNAPSHOT) ) {
16235
+ // Set up a temporary EVMS node
16236
+ tmp_node.instance_data = volume;
16239
+ LOG_DEBUG("Building snapshot map for volume %s\n", volume->name);
16242
+ // Read in one sector's worth of COW tables.
16243
+ if ( lvm_init_io(&tmp_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
16244
+ invalidate_snapshot_volume(volume);
16245
+ deallocate_logical_volume(volume);
16249
+ // Endian-conversion of this COW table to a local table.
16250
+ for ( j = 0; j < max_entries; j++ ) {
16251
+ cow_table[j].pv_org_number = le64_to_cpu(volume->cow_table[j].pv_org_number);
16252
+ cow_table[j].pv_org_rsector = le64_to_cpu(volume->cow_table[j].pv_org_rsector);
16253
+ cow_table[j].pv_snap_number = le64_to_cpu(volume->cow_table[j].pv_snap_number);
16254
+ cow_table[j].pv_snap_rsector = le64_to_cpu(volume->cow_table[j].pv_snap_rsector);
16258
+ // Translate every valid COW table entry into
16259
+ // a snapshot map entry.
16260
+ for ( volume->next_cow_entry = 0;
16261
+ volume->next_cow_entry < max_entries &&
16262
+ cow_table[volume->next_cow_entry].pv_org_number;
16263
+ volume->next_cow_entry++ ) {
16264
+ // org_rsector must be a valid sector number,
16265
+ // i.e. it can't be within a PVs metadata. This
16266
+ // is how we detect invalidated snapshots.
16267
+ if ( (cow_table[volume->next_cow_entry].pv_org_rsector < 10) ||
16268
+ (cow_table[volume->next_cow_entry].pv_org_number > group->pv_count) ||
16269
+ (add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]),volume)) ) {
16270
+ // This volume either has an invalid COW entry,
16271
+ // or had an error adding that COW entry to the
16272
+ // snapshot map. This snapshot is done.
16273
+ invalidate_snapshot_volume(volume);
16274
+ deallocate_logical_volume(volume);
16278
+ volume->next_free_chunk += volume->chunk_size;
16280
+ // Move on to the next sector if necessary.
16281
+ if ( !rc && volume->next_cow_entry == max_entries ) {
16282
+ volume->current_cow_sector++;
16294
+/* Function: link_snapshot_volumes
16296
+ * This function examines the list of logical volumes in this group and
16297
+ * sets up the necessary pointers to link snapshots and their originals.
16298
+ * A singly-linked list is created starting with the original volume. Also,
16299
+ * all snapshot volumes point directly back to their original. This
16300
+ * function should not be run until all volumes have been discovered.
16301
+ * In the case of re-discovery, all of these links/lists get rebuilt as if
16302
+ * they were not already there. Currently this should not pose a problem.
16304
+static int link_snapshot_volumes( lvm_volume_group_t * group )
16306
+ lvm_logical_volume_t * org_volume;
16307
+ lvm_logical_volume_t * snap_volume;
16308
+ u_int32_t org_minor;
16309
+ u_int32_t buffer_size = 0;
16312
+ for ( i = 1; i <= MAX_LV; i++ ) {
16314
+ // Only process snapshot-originals
16315
+ org_volume = group->volume_list[i];
16316
+ if ( ! org_volume ||
16317
+ ! (org_volume->lv_access & LV_SNAPSHOT_ORG) ) {
16321
+ // For snapshot-originals, look for all other volumes that
16322
+ // claim to be snapshotting it. For each one that is found,
16323
+ // insert it at the start of the original's list of snapshots.
16324
+ org_minor = org_volume->lv_minor;
16325
+ org_volume->snapshot_next = NULL; // This is necessary for rediscovery to work properly.
16326
+ // Could get circular snapshot lists otherwise.
16327
+ for ( j = 1; j <= MAX_LV; j++ ) {
16328
+ snap_volume = group->volume_list[j];
16329
+ if ( snap_volume &&
16330
+ snap_volume->lv_access & LV_SNAPSHOT &&
16331
+ (snap_volume->snap_org_minor == org_minor) ) {
16332
+ snap_volume->snapshot_org = org_volume;
16333
+ snap_volume->snapshot_next = org_volume->snapshot_next;
16334
+ org_volume->snapshot_next = snap_volume;
16335
+ if ( snap_volume->chunk_size > buffer_size ) {
16336
+ buffer_size = snap_volume->chunk_size;
16338
+ LOG_DEBUG("Linking snapshot (%s) to original (%s)\n", snap_volume->name, org_volume->name);
16342
+ // If no snapshots were found for a volume that claims to be
16343
+ // under snapshot, mark the group dirty. If this is final
16344
+ // discovery, the original will have the snapshot flag turned
16345
+ // off in check_logical_volumes().
16346
+ if ( ! org_volume->snapshot_next ) {
16347
+ LOG_WARNING("No snapshots found for original (%s)\n", org_volume->name);
16348
+ group->flags |= EVMS_VG_DIRTY;
16355
+/* Function: discover_volumes_in_group
16358
+static int discover_volumes_in_group( lvm_volume_group_t * group )
16360
+ lv_disk_t * lv_array = group->lv_array;
16361
+ lvm_logical_volume_t * new_volume;
16364
+ // Search through the LV structs for valid LV entries
16365
+ for ( i = 0; i < group->vg->lv_max; i++ ) {
16367
+ // Only discover valid, active volumes
16368
+ if ( ! lv_array[i].lv_name[0] ||
16369
+ lv_array[i].lv_number >= MAX_LV ) {
16373
+ // Make sure this volume isn't already in the list.
16374
+ if ( group->volume_list[lv_array[i].lv_number+1] ) {
16378
+ // Create a new logical volume and place it in the appropriate
16379
+ // spot in this VG's volume list.
16380
+ new_volume = allocate_logical_volume(&(lv_array[i]), group);
16381
+ if ( ! new_volume ) {
16382
+ // This volume will be missing, but other
16383
+ // volumes in this group can still be built.
16384
+ LOG_CRITICAL("Memory error creating LV %s in Group %s\n", lv_array[i].lv_name, group->vg_name);
16388
+ group->volume_list[new_volume->lv_number] = new_volume;
16389
+ group->volume_count++;
16390
+ group->flags |= EVMS_VG_DIRTY;
16392
+ LOG_DEBUG("Discovered volume %s in group %s.\n", new_volume->name, group->vg_name);
16399
+/* Function: discover_logical_volumes
16401
+ * After all PVs have been claimed and added to the appropriate VG list,
16402
+ * the volumes for each VG must be constructed. For each group, read all
16403
+ * the LV structs off the first PV in the list. Search this list of
16404
+ * structs for valid LVs. For each valid LV, create a new volume and add
16405
+ * it to the group.
16407
+static int discover_logical_volumes( void )
16409
+ lvm_volume_group_t * group;
16412
+ // Look for volumes in each valid VG entry. We even need to check ones
16413
+ // that aren't dirty - We could have deleted an incomplete volume on
16414
+ // the previous pass, and need to rediscover it in case this is final
16415
+ // discovery and we now want to export it.
16416
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16418
+ if ( ! group->vg ) {
16422
+ LOG_DEBUG("Searching for volumes in group %s\n", group->vg_name);
16424
+ // Read in the LV array from disk if necessary.
16425
+ rc = read_lv(group);
16427
+ LOG_WARNING("Unable to read LV metadata for group %s\n", group->vg_name);
16428
+ LOG_WARNING("No regions can be discovered for group %s\n", group->vg_name);
16432
+ // Assemble each volume in the group.
16433
+ discover_volumes_in_group(group);
16435
+ // Build the LE map for each LV discovered in this group. This
16436
+ // must be done after all LVS in the group are discovered.
16437
+ build_le_maps(group);
16438
+ check_le_maps(group);
16440
+ // Set up all of the initial snapshot maps. Only the kernel
16441
+ // keeps track of the snapshot maps.
16442
+ build_snapshot_maps(group);
16444
+ // Set up the pointers to link snapshot volumes
16445
+ // with their originals.
16446
+ link_snapshot_volumes(group);
16453
+/* Function: export_volumes
16455
+ * The last thing the plugin must do is take each newly constructed volume
16456
+ * and place it on the evms logical node list. A zero return-code from
16457
+ * this function means nothing new was added to the list, and a positive
16458
+ * return code means that many new items were added to the list.
16460
+static int export_volumes( evms_logical_node_t ** evms_node_list )
16462
+ lvm_volume_group_t * group;
16463
+ evms_logical_node_t * new_node;
16464
+ lvm_logical_volume_t * volume;
16468
+ LOG_EXTRA("Exporting volumes\n");
16470
+ // For every valid, dirty volume group
16471
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16472
+ if ( ! (group->flags & EVMS_VG_DIRTY) ) {
16476
+ // Export every valid volume in the group. For re-discovery,
16477
+ // we re-export the same logical node.
16478
+ for ( i = 1; i <= MAX_LV; i++ ) {
16479
+ volume = group->volume_list[i];
16480
+ if ( ! volume ) {
16484
+ // For new volumes, create a new EVMS node and
16485
+ // initialize the appropriate fields.
16486
+ if ( volume->lv_access & EVMS_LV_NEW ) {
16487
+ if ( evms_cs_allocate_logical_node(&new_node) ) {
16491
+ volume->volume_node = new_node;
16492
+ volume->lv_access &= (~EVMS_LV_QUIESCED & ~EVMS_LV_NEW);
16493
+ new_node->hardsector_size = group->hard_sect_size;
16494
+ new_node->block_size = group->block_size;
16495
+ new_node->plugin = &lvm_plugin_header;
16496
+ new_node->instance_data = volume;
16497
+ memcpy(new_node->name, volume->name, NAME_LEN);
16499
+ // Snapshot volumes should report the size of their original
16500
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16501
+ new_node->total_vsectors = volume->snapshot_org->lv_size;
16504
+ new_node->total_vsectors = volume->lv_size;
16507
+ // Is the volume read-only?
16508
+ if ( ! (volume->lv_access & LV_WRITE) ) {
16509
+ new_node->flags |= EVMS_VOLUME_READ_ONLY;
16510
+ LOG_DEBUG("LVM volume %s is read-only\n", volume->name);
16513
+ // Is the volume incomplete?
16514
+ if ( volume->lv_access & EVMS_LV_INCOMPLETE ) {
16515
+ new_node->flags |= (EVMS_VOLUME_READ_ONLY | EVMS_VOLUME_PARTIAL);
16516
+ LOG_DEBUG("LVM volume %s is incomplete\n", volume->name);
16519
+ // Does the volume group contain any partial or
16520
+ // removable PVs?
16521
+ if ( group->flags & EVMS_VG_PARTIAL_PVS ) {
16522
+ new_node->flags |= EVMS_VOLUME_PARTIAL;
16524
+ if ( group->flags & EVMS_VG_REMOVABLE_PVS ) {
16525
+ new_node->flags |= EVMS_DEVICE_REMOVABLE;
16528
+ MOD_INC_USE_COUNT;
16531
+ // Export the node. The add_to_list will catch it if
16532
+ // we try to add the same node to the list twice.
16533
+ if ( ! evms_cs_add_logical_node_to_list(evms_node_list, volume->volume_node) ) {
16534
+ LOG_DETAILS("Exporting LVM volume %s\n", volume->name);
16539
+ // The group is clean now.
16540
+ group->flags &= ~EVMS_VG_DIRTY;
16547
+/* Function: lvm_cleanup
16549
+ * This function runs through the entire lvm data structure, removing
16550
+ * all items that are not needed at runtime. Currently, this is just the
16551
+ * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
16552
+ * groups that don't contain any volumes are deleted. All of the other
16553
+ * volume_group, logical_volume and evms_logical_node structures will be
16554
+ * kept around at run-time.
16556
+static int lvm_cleanup( void )
16558
+ lvm_volume_group_t * group;
16559
+ lvm_volume_group_t * next_group;
16560
+ lvm_physical_volume_t * pv_entry;
16562
+ for ( group = lvm_group_list; group; group = next_group ) {
16563
+ next_group = group->next_group;
16565
+ // Delete groups with no volumes.
16566
+ if ( ! group->volume_count ) {
16567
+ LOG_WARNING("Group %s contains no logical volumes. Deleting.\n", group->vg_name);
16568
+ remove_group_from_list(group);
16569
+ deallocate_volume_group(group);
16570
+ // Need to go back to the start of the list,
16571
+ // just to be safe. :)
16572
+ next_group = lvm_group_list;
16576
+ // Delete data structures that aren't used at runtime.
16577
+ if ( group->vg ) {
16578
+ kfree(group->vg);
16579
+ group->vg = NULL;
16581
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16582
+ if ( pv_entry->pv ) {
16583
+ kfree(pv_entry->pv);
16584
+ pv_entry->pv = NULL;
16586
+ if ( pv_entry->pe_map ) {
16587
+ vfree(pv_entry->pe_map);
16588
+ pv_entry->pe_map = NULL;
16591
+ if ( group->lv_array ) {
16592
+ vfree(group->lv_array);
16593
+ group->lv_array = NULL;
16595
+ if ( group->uuid_list ) {
16596
+ vfree(group->uuid_list);
16597
+ group->uuid_list = NULL;
16604
+/* Function: lvm_get_bmap
16606
+ * Support for the BMAP ioctl used by LILO to translate filesystem blocks
16607
+ * to disk blocks to map kernel images for boot time.
16609
+static int lvm_get_bmap(evms_logical_node_t * node,
16610
+ evms_get_bmap_t * bmap,
16611
+ evms_logical_node_t ** pv_node )
16613
+ lvm_logical_volume_t * volume = node->instance_data;
16614
+ lvm_physical_volume_t * pv_entry;
16615
+ evms_sector_t new_sector = 0;
16616
+ evms_sector_t new_size = 0;
16617
+ evms_sector_t pe_start_sector;
16620
+ // No kernel images allowed on snapshot LVs.
16621
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16626
+ if ( bmap->rsector >= volume->lv_size ) {
16630
+ rc = remap_sector(node, bmap->rsector, 1, &new_sector, &new_size, &pe_start_sector, &pv_entry);
16632
+ if (rc || !pv_entry || !new_sector) {
16636
+ bmap->rsector = new_sector;
16637
+ *pv_node = pv_entry->logical_node;
16643
+/* Function: lvm_global_proc_read
16645
+ * A callback function for the lvm-global proc-fs entry. This will print
16646
+ * general info about all LVM VGs, PVs, and LVs.
16648
+static int lvm_global_proc_read(char * page,
16655
+ lvm_volume_group_t * group;
16656
+ lvm_physical_volume_t * pv_entry;
16657
+ lvm_logical_volume_t * volume;
16658
+ lvm_logical_volume_t * snap;
16665
+ PROCPRINT("Enterprise Volume Management System: LVM Plugin\n");
16666
+ PROCPRINT("Plugin ID: %x.%x.%x\n",
16667
+ GetPluginOEM(lvm_plugin_header.id),
16668
+ GetPluginType(lvm_plugin_header.id),
16669
+ GetPluginID(lvm_plugin_header.id));
16670
+ PROCPRINT("Plugin Version: %d.%d.%d\n",
16671
+ lvm_plugin_header.version.major,
16672
+ lvm_plugin_header.version.minor,
16673
+ lvm_plugin_header.version.patchlevel);
16674
+ PROCPRINT("Required EVMS Services Version: %d.%d.%d\n",
16675
+ lvm_plugin_header.required_common_services_version.major,
16676
+ lvm_plugin_header.required_common_services_version.minor,
16677
+ lvm_plugin_header.required_common_services_version.patchlevel);
16679
+ // Count all existing items.
16680
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16681
+ lvs += group->volume_count;
16682
+ pvs += group->pv_count;
16687
+ PROCPRINT("Total: %d VGs %d PVs %d LVs\n", vgs, pvs, lvs);
16689
+ // Print out specifics about each VG.
16690
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16692
+ PROCPRINT("VG: %s [%d PV, %d LV]\n",
16693
+ group->vg_name, group->pv_count, group->volume_count);
16694
+ PROCPRINT("PVs:\n");
16695
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16696
+ if ( pv_entry->logical_node ) {
16697
+ PROCPRINT("\t%s\t%10Ld KB\n",
16698
+ pv_entry->logical_node->name,
16699
+ pv_entry->logical_node->total_vsectors / 2);
16702
+ PROCPRINT("LVs:\n");
16703
+ for ( i = 1; i <= MAX_LV; i++ ) {
16704
+ if ( group->volume_list[i] ) {
16705
+ volume = group->volume_list[i];
16706
+ PROCPRINT("\t%s\t%10Ld KB / %5d LEs",
16708
+ volume->lv_size / 2,
16710
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16711
+ PROCPRINT("\tSnapshot of : ");
16712
+ if ( volume->snapshot_org ) {
16713
+ PROCPRINT("%s : ", volume->snapshot_org->name);
16716
+ PROCPRINT("(unknown) : ");
16718
+ PROCPRINT("%ld%% full : ", (long)(volume->next_free_chunk) * 100 / (long)(volume->lv_size));
16719
+ if ( volume->lv_status & LV_ACTIVE ) {
16720
+ PROCPRINT("active");
16723
+ PROCPRINT("disabled");
16726
+ else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
16727
+ PROCPRINT("\tSnapshotted by : ");
16728
+ for ( snap = volume->snapshot_next; snap; snap = snap->snapshot_next ) {
16729
+ PROCPRINT("%s ", snap->name);
16741
+/********** Required EVMS Plugin Functions **********/
16744
+/* Function: lvm_discover
16746
+ * This is the entry point into the LVM discovery process. It is a three
16747
+ * phase process. First, the list of nodes are examined for PVs, and the
16748
+ * appropriate volume groups are created. Then each volume group is
16749
+ * examined to find all available logical volumes. Finally, each LVM
16750
+ * logical volume has a new EVMS node created for it, and added to the
16753
+static int lvm_discover( evms_logical_node_t ** evms_node_list )
16757
+ LOG_EXTRA("Beginning discovery.\n");
16759
+ discover_volume_groups(evms_node_list);
16761
+ check_volume_groups();
16763
+ discover_logical_volumes();
16765
+ check_logical_volumes(0);
16767
+ rc = export_volumes(evms_node_list);
16769
+ LOG_EXTRA("Discovery complete.\n");
16774
+/* Function: lvm_discover_end
16776
+ * The discovery process at the region-manager level is now iterative,
16777
+ * much like the EVMS feature level. This allows the ability to stack
16778
+ * LVM on top of MD, or vice-versa. To accomplish this correctly, and
16779
+ * also to accomplish partial volume discovery, a second discover
16780
+ * entry point is needed, so EVMS can tell the region managers that
16781
+ * discovery is over, and to finish up any discovery that is not yet
16782
+ * complete. When this function is called, it should be assumed that
16783
+ * the node list has had nothing new added to it since the last call
16784
+ * of the regular discover function. Therefore, when this function is
16785
+ * called, we do not need to try to discovery any additional volume
16786
+ * groups. We will, however, look for logical volumes once more. This
16787
+ * gives us the ability to export (read-only) volumes that have
16788
+ * partially corrupted LE maps due to missing PVs in their VG.
16790
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list )
16794
+ LOG_EXTRA("Beginning final discovery\n");
16796
+ discover_volume_groups(evms_node_list);
16798
+ check_volume_groups();
16800
+ discover_logical_volumes();
16802
+ check_logical_volumes(1);
16804
+ rc = export_volumes(evms_node_list);
16808
+ LOG_EXTRA("Final discovery complete.\n");
16813
+/* Function: lvm_delete_node
16815
+ * This function deletes the in-memory representation of an LVM
16816
+ * logical volume.
16818
+static int lvm_delete_node( evms_logical_node_t * logical_node )
16820
+ lvm_logical_volume_t * volume = logical_node->instance_data;
16821
+ lvm_volume_group_t * group = volume->group;
16823
+ LOG_DEBUG("Deleting LVM node %s\n", logical_node->name);
16825
+ if ( deallocate_logical_volume(volume) ) {
16829
+ // If we just removed the last volume from this group, the entire group
16830
+ // must also be deleted.
16831
+ if ( group && group->volume_count == 0 ) {
16832
+ remove_group_from_list(group);
16833
+ deallocate_volume_group(group);
16836
+ // Free the logical node.
16837
+ evms_cs_deallocate_logical_node(logical_node);
16839
+ MOD_DEC_USE_COUNT;
16845
+/* Function: lvm_read
16847
+static void lvm_read( evms_logical_node_t * node,
16850
+ lvm_logical_volume_t * volume = node->instance_data;
16851
+ lvm_physical_volume_t * pv_entry;
16852
+ evms_sector_t pe_start_sector;
16853
+ evms_sector_t new_sector;
16854
+ evms_sector_t new_size;
16856
+ // Make sure the volume is active and readable
16857
+ if ( ! (volume->lv_access & LV_READ && volume->lv_status & LV_ACTIVE) ) {
16858
+ EVMS_IO_ERROR(eio);
16862
+ // If this volume is a snapshot, lock the volume, and do
16863
+ // the LE-PE translation on its original volume.
16864
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16865
+ down( &volume->snap_semaphore );
16866
+ if ( ! volume->snapshot_org ) {
16867
+ EVMS_IO_ERROR(eio);
16868
+ up( &volume->snap_semaphore );
16871
+ node = volume->snapshot_org->volume_node;
16874
+ // Check if I/O goes past end of logical volume. Must use the
16875
+ // node, not the volume, so snapshots will work correctly.
16876
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
16877
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16878
+ up( &volume->snap_semaphore );
16880
+ EVMS_IO_ERROR(eio);
16884
+ // Logical-to-Physical remapping. Check for incomplete volumes.
16885
+ if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
16886
+ ! pe_start_sector || ! pv_entry ) {
16887
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16888
+ up( &volume->snap_semaphore );
16890
+ EVMS_IO_ERROR(eio);
16894
+ // For snapshot volumes, check if this sector's chunk has been
16895
+ // remapped. If it has, new_sector and pv_entry will be changed
16896
+ // accordingly. If not, they remain the same.
16897
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16898
+ snapshot_remap_sector(volume, pe_start_sector , &new_sector, &pv_entry);
16901
+ eio->rsector = new_sector;
16902
+ eio->rsize = new_size;
16903
+ R_IO(pv_entry->logical_node, eio);
16905
+ // Unlock the snapshot
16906
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16907
+ up( &volume->snap_semaphore );
16912
+/* Function: lvm_write
16914
+static void lvm_write( evms_logical_node_t * node,
16917
+ lvm_logical_volume_t * volume = node->instance_data;
16918
+ lvm_logical_volume_t * snap_volume;
16919
+ lvm_physical_volume_t * pv_entry;
16920
+ evms_sector_t pe_start_sector;
16921
+ evms_sector_t new_sector;
16922
+ evms_sector_t new_size;
16924
+ // Make sure the volume is active and writable
16925
+ if ( ! (volume->lv_access & LV_WRITE && volume->lv_status & LV_ACTIVE) ) {
16926
+ EVMS_IO_ERROR(eio);
16930
+ // Check if I/O goes past end of logical volume.
16931
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
16932
+ EVMS_IO_ERROR(eio);
16936
+ // Logical-to-Physical remapping. Check for incomplete volumes.
16937
+ if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
16938
+ ! pe_start_sector || ! pv_entry ) {
16939
+ EVMS_IO_ERROR(eio);
16943
+ // Copy-on-write for snapshotting
16944
+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
16945
+ // Originals can be snapshotted multiple times
16946
+ for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
16947
+ if ( snapshot_copy_data(volume, snap_volume, pe_start_sector, new_sector, pv_entry) ) {
16948
+ EVMS_IO_ERROR(eio);
16954
+ eio->rsector = new_sector;
16955
+ eio->rsize = new_size;
16956
+ W_IO(pv_entry->logical_node, eio);
16960
+/* Function: lvm_init_io
16962
+ * Init_io on a snapshot volume treats it like a regular volume.
16964
+static int lvm_init_io( evms_logical_node_t * node,
16965
+ int io_flag, // 0=read, 1=write, 4=LVM-internal-write
16966
+ evms_sector_t sect_nr, // node LBA
16967
+ evms_sector_t num_sects, // # of sectors
16968
+ void * buf_addr ) // buffer address
16970
+ lvm_physical_volume_t * pv_entry;
16971
+ lvm_logical_volume_t * volume = node->instance_data;
16972
+ evms_sector_t pe_start_sector;
16973
+ evms_sector_t new_sector;
16974
+ evms_sector_t new_size;
16977
+ // Only allow internal writes to snapshots (io_flag==4). Disallow
16978
+ // writes to snapshot originals.
16979
+ if ( io_flag == 1 &&
16980
+ volume->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG) ) {
16983
+ // The node for a snapshot reports the size of the original. If a
16984
+ // request comes in in that range, just return.
16985
+ else if ( volume->lv_access & LV_SNAPSHOT &&
16986
+ sect_nr >= volume->lv_size &&
16987
+ sect_nr < node->total_vsectors ) {
16988
+ if ( io_flag == 0 ) {
16989
+ memset( buf_addr, 0, num_sects << EVMS_VSECTOR_SIZE_SHIFT );
16993
+ // Regular range check.
16994
+ else if ( sect_nr + num_sects > volume->lv_size ) {
16998
+ if ( io_flag == 4 ) {
17002
+ // Init IO needs to deal with the possibility of a request that spans
17003
+ // PEs or stripes. This is possible because there is no limit on
17004
+ // num_sects. To handle this, we loop through remap_sector and
17005
+ // INIT_IO until num_sects reaches zero.
17006
+ while ( num_sects ) {
17007
+ if ( remap_sector(node, sect_nr, num_sects, &new_sector, &new_size, &pe_start_sector, &pv_entry) ) {
17010
+ // If the volume is incomplete, clear the buffer (on a read).
17011
+ if ( !pe_start_sector || !pv_entry ) {
17012
+ if ( io_flag == 0 ) {
17013
+ memset(buf_addr, 0, new_size << EVMS_VSECTOR_SIZE_SHIFT);
17017
+ rc = INIT_IO(pv_entry->logical_node, io_flag, new_sector, new_size, buf_addr);
17019
+ num_sects -= new_size;
17020
+ sect_nr += new_size;
17021
+ buf_addr = (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
17028
+/* Function: lvm_ioctl
17030
+static int lvm_ioctl( evms_logical_node_t * logical_node,
17031
+ struct inode * inode,
17032
+ struct file * file,
17033
+ unsigned int cmd,
17034
+ unsigned long arg)
17036
+ lvm_logical_volume_t * volume = logical_node->instance_data;
17039
+ LOG_ENTRY_EXIT("--lvm: Ioctl %d\n",cmd);
17043
+ case HDIO_GETGEO:
17045
+ // Fixed geometry for all LVM volumes
17046
+ unsigned char heads = 64;
17047
+ unsigned char sectors = 32;
17049
+ struct hd_geometry *hd = (struct hd_geometry *)arg;
17051
+ cylinders = logical_node->total_vsectors;
17052
+ cylinders = (cylinders / heads) / sectors;
17054
+ if (hd == NULL) {
17058
+ if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
17059
+ copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
17060
+ copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
17061
+ copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
17067
+ case LV_SET_ACCESS:
17068
+ // Set access flags of a logical volume
17069
+ // If we decide to make a volume read-only, how do we
17070
+ // tell the EVMS level?
17072
+ if (!capable(CAP_SYS_ADMIN)) return -EACCES;
17073
+ lv_ptr->lv_access = (ulong) arg;
17074
+ if ( lv_ptr->lv_access & LV_WRITE)
17075
+ set_device_ro(lv_ptr->lv_dev, 0);
17077
+ set_device_ro(lv_ptr->lv_dev, 1);
17082
+ case LV_SET_STATUS:
17083
+ // Set status flags of a logical volume
17085
+ if (!capable(CAP_SYS_ADMIN)) return -EACCES;
17086
+ if (!((ulong) arg & LV_ACTIVE) && lv_ptr->lv_open > 1)
17088
+ lv_ptr->lv_status = (ulong) arg;
17093
+ case EVMS_QUIESCE_VOLUME:
17095
+ evms_quiesce_volume_t * tmp = (evms_quiesce_volume_t*)arg;
17096
+ if ( tmp->command ) { // Quiesce
17097
+ volume->lv_access |= EVMS_LV_QUIESCED;
17099
+ else { // Un-quiesce
17100
+ volume->lv_access &= ~EVMS_LV_QUIESCED;
17105
+ case EVMS_GET_BMAP:
17107
+ evms_get_bmap_t * bmap = (evms_get_bmap_t*)arg;
17108
+ evms_logical_node_t * pv_node;
17110
+ rc = lvm_get_bmap(logical_node, bmap, &pv_node);
17112
+ rc = IOCTL(pv_node, inode, file, cmd, (unsigned long)bmap);
17117
+ case EVMS_GET_DISK_LIST:
17118
+ case EVMS_CHECK_MEDIA_CHANGE:
17119
+ case EVMS_REVALIDATE_DISK:
17120
+ case EVMS_OPEN_VOLUME:
17121
+ case EVMS_CLOSE_VOLUME:
17123
+ // These five ioctl all need to be broadcast to all PVs.
17124
+ lvm_volume_group_t * group = volume->group;
17125
+ lvm_physical_volume_t * pv_entry;
17126
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
17127
+ rc |= IOCTL(pv_entry->logical_node, inode, file, cmd, arg);
17133
+ // Currently LVM does not send any ioctl's down to the
17134
+ // PVs. Which PV would they go to? What would we do with
17135
+ // the return codes?
17143
+/* Function: lvm_direct_ioctl
17145
+ * This function provides a method for user-space to communicate directly
17146
+ * with a plugin in the kernel.
17148
+static int lvm_direct_ioctl( struct inode * inode,
17149
+ struct file * file,
17150
+ unsigned int cmd,
17151
+ unsigned long args )
17153
+ evms_plugin_ioctl_t argument;
17156
+ // Copy user's parameters to kernel space
17157
+ if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) ) {
17161
+ // Make sure this is supposed to be our ioctl.
17162
+ if ( argument.feature_id != lvm_plugin_header.id ) {
17166
+ switch(argument.feature_command) {
17168
+ case EVMS_LVM_PV_REMOVE_IOCTL:
17170
+ lvm_pv_remove_ioctl_t pv_remove;
17171
+ if ( copy_from_user(&pv_remove, (lvm_pv_remove_ioctl_t*)argument.feature_ioctl_data, sizeof(pv_remove)) ) {
17175
+ rc = remove_pv_from_group(pv_remove.pv_number, pv_remove.vg_uuid);
17179
+ case EVMS_LVM_SNAPSHOT_STAT_IOCTL:
17181
+ lvm_snapshot_stat_ioctl_t snap_stats;
17182
+ if ( copy_from_user(&snap_stats, (lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, sizeof(snap_stats)) ) {
17186
+ rc = get_snapshot_stats(&snap_stats);
17187
+ if ( copy_to_user((lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, &snap_stats, sizeof(snap_stats)) ) {
17199
+ argument.status = rc;
17200
+ copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
17205
+/* Function: lvm_vge_init
17207
+int __init lvm_vge_init(void)
17209
+ struct proc_dir_entry * pde;
17211
+ lvm_group_list = NULL;
17214
+ // Register the global proc-fs entries.
17215
+ pde = evms_cs_get_evms_proc_dir();
17217
+ lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde);
17218
+ if ( lvm_proc ) {
17219
+ create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG, lvm_proc, lvm_global_proc_read, NULL);
17223
+ // Register this plugin with EVMS.
17224
+ return evms_cs_register_plugin(&lvm_plugin_header);
17228
+/* Function: lvm_vge_exit
17230
+void __exit lvm_vge_exit(void)
17232
+ lvm_volume_group_t * group;
17233
+ lvm_volume_group_t * next_group;
17234
+ struct proc_dir_entry * pde;
17237
+ // If LVM is called for module_exit, that means the reference
17238
+ // count must be zero, which means there should be no volumes,
17239
+ // and thus no volume groups. But, check anyway and delete
17240
+ // any volumes and groups that are still hanging around.
17241
+ if ( lvm_group_list ) {
17242
+ LOG_SERIOUS("Called for module_exit, but group list is not empty!\n");
17244
+ for ( group = lvm_group_list; group; group = next_group ) {
17245
+ next_group = group->next_group;
17247
+ LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n", group->vg_name);
17249
+ for ( i = 1; i <= MAX_LV; i++ ) {
17250
+ if ( group->volume_list[i] ) {
17251
+ lvm_delete_node(group->volume_list[i]->volume_node);
17256
+ // Unregister the proc-fs entries.
17257
+ pde = evms_cs_get_evms_proc_dir();
17259
+ remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc);
17260
+ remove_proc_entry(LVM_PROC_NAME, pde);
17263
+ // Unregister this plugin from EVMS.
17264
+ evms_cs_unregister_plugin(&lvm_plugin_header);
17268
+module_init(lvm_vge_init);
17269
+module_exit(lvm_vge_exit);
17270
+#ifdef MODULE_LICENSE
17271
+MODULE_LICENSE("GPL");
17274
diff -Naur linux-2002-03-28/drivers/evms/md_core.c evms-2002-03-28/drivers/evms/md_core.c
17275
--- linux-2002-03-28/drivers/evms/md_core.c Wed Dec 31 18:00:00 1969
17276
+++ evms-2002-03-28/drivers/evms/md_core.c Thu Mar 28 08:37:22 2002
17279
+ * Copyright (c) International Business Machines Corp., 2000
17281
+ * This program is free software; you can redistribute it and/or modify
17282
+ * it under the terms of the GNU General Public License as published by
17283
+ * the Free Software Foundation; either version 2 of the License, or
17284
+ * (at your option) any later version.
17286
+ * This program is distributed in the hope that it will be useful,
17287
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17288
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17289
+ * the GNU General Public License for more details.
17291
+ * You should have received a copy of the GNU General Public License
17292
+ * along with this program; if not, write to the Free Software
17293
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17296
+ * linux/drivers/evms/md_core.c
17298
+ * EVMS Linux MD Region Manager
17303
+#include <linux/module.h>
17304
+#include <linux/kmod.h>
17305
+#include <linux/kernel.h>
17306
+#include <linux/config.h>
17307
+#include <linux/genhd.h>
17308
+#include <linux/major.h>
17309
+#include <linux/string.h>
17310
+#include <linux/blk.h>
17311
+#include <linux/init.h>
17312
+#include <linux/slab.h>
17313
+#include <linux/vmalloc.h>
17314
+#include <linux/evms/evms_kernel.h>
17315
+#include <linux/evms/evms_md.h>
17316
+#include <linux/sysctl.h>
17317
+#include <asm/system.h>
17318
+#include <asm/uaccess.h>
17320
+#define LOG_PREFIX "md core: "
17323
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
17324
+ * is 100 KB/sec, so the extra system load does not show up that much.
17325
+ * Increase it if you want to have more _guaranteed_ speed. Note that
17326
+ * the RAID driver will use the maximum available bandwith if the IO
17327
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
17328
+ * speed limit - in case reconstruction slows down your system despite
17329
+ * idle IO detection.
17331
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
17334
+static MD_LIST_HEAD(all_raid_disks);
17335
+static MD_LIST_HEAD(pending_raid_disks);
17337
+static int sysctl_speed_limit_min = 100;
17338
+static int sysctl_speed_limit_max = 100000;
17341
+static mdk_personality_t *pers[MAX_PERSONALITY];
17343
+static int md_blocksizes[MAX_MD_DEVS];
17344
+static int md_hardsect_sizes[MAX_MD_DEVS];
17345
+int evms_md_size[MAX_MD_DEVS];
17346
+static evms_thread_t *evms_md_recovery_thread;
17349
+ * Enables to iterate over all existing md arrays
17351
+static MD_LIST_HEAD(all_mddevs);
17354
+ * The mapping between kdev and mddev is not necessary a simple
17355
+ * one! Eg. HSM uses several sub-devices to implement Logical
17356
+ * Volumes. All these sub-devices map to the same mddev.
17358
+dev_mapping_t evms_mddev_map[MAX_MD_DEVS];
17361
+static md_spinlock_t activate_spare_list_lock = MD_SPIN_LOCK_UNLOCKED;
17362
+static evms_md_activate_spare_t *evms_activate_spare_list = NULL, **evms_activate_spare_tail;
17364
+/* Support functions for discovery */
17365
+static int evms_md_import_device (evms_logical_node_t **discover_list,
17366
+ evms_logical_node_t *node,
17368
+static void evms_md_autostart_arrays(evms_logical_node_t **discover_list);
17369
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list,
17370
+ kdev_t countdev);
17371
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list,
17373
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
17374
+ mddev_t *mddev, uint flags);
17375
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev);
17376
+static int evms_md_analyze_sbs (mddev_t * mddev);
17377
+static mddev_t * alloc_mddev (kdev_t dev);
17378
+static void free_mddev(mddev_t * mddev);
17379
+static int do_md_run (mddev_t * mddev);
17380
+static int do_md_stop (mddev_t * mddev, int ro);
17382
+static void kick_rdev_from_array (mdk_rdev_t * rdev);
17383
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev);
17384
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb);
17386
+/* Plugin API prototypes */
17387
+static int md_discover( evms_logical_node_t ** discover_list );
17388
+static int md_end_discover( evms_logical_node_t ** discover_list );
17389
+static int md_delete( evms_logical_node_t * node);
17390
+static void md_read( evms_logical_node_t * node,
17392
+static void md_write( evms_logical_node_t * node,
17394
+static int md_init_io( evms_logical_node_t * node,
17396
+ evms_sector_t sect_nr,
17397
+ evms_sector_t num_sects,
17398
+ void * buf_addr );
17399
+static int md_ioctl( evms_logical_node_t * node,
17400
+ struct inode * inode,
17401
+ struct file * file,
17402
+ unsigned int cmd,
17403
+ unsigned long arg);
17404
+static int md_ioctl_cmd_broadcast(
17405
+ evms_logical_node_t *node,
17406
+ struct inode *inode,
17407
+ struct file *file,
17408
+ unsigned long cmd,
17409
+ unsigned long arg);
17411
+static int md_direct_ioctl(
17412
+ struct inode * inode,
17413
+ struct file * file,
17414
+ unsigned int cmd,
17415
+ unsigned long arg);
17417
+/* global MD data structures */
17418
+static evms_plugin_function_table_t md_function_table = {
17419
+ discover : &md_discover,
17420
+ end_discover : &md_end_discover,
17421
+ delete : &md_delete,
17423
+ write : &md_write,
17424
+ init_io : &md_init_io,
17425
+ ioctl : &md_ioctl,
17426
+ direct_ioctl : &md_direct_ioctl
17429
+static evms_plugin_header_t md_plugin_header = {
17430
+ id : SetPluginID(
17432
+ EVMS_REGION_MANAGER,
17435
+ major : MD_MAJOR_VERSION,
17436
+ minor : MD_MINOR_VERSION,
17437
+ patchlevel : MD_PATCHLEVEL_VERSION
17439
+ required_common_services_version: {
17440
+ major : EVMS_MD_COMMON_SERVICES_MAJOR,
17441
+ minor : EVMS_MD_COMMON_SERVICES_MINOR,
17442
+ patchlevel : EVMS_MD_COMMON_SERVICES_PATCHLEVEL
17444
+ function_table : &md_function_table
17447
+/* local instance data structure definition */
17448
+typedef struct md_instance_data_s {
17450
+} md_instance_data_t;
17452
+/* global variables */
17453
+static int exported_nodes; /* total # of exported devices
17454
+ * produced during this discovery.
17456
+static evms_logical_node_t **cur_discover_list = NULL;
17458
+/**********************************************************/
17459
+/* SYSCTL - EVMS/RAID folder */
17460
+/**********************************************************/
17462
+#ifdef CONFIG_PROC_FS
17463
+static struct ctl_table_header *md_table_header;
17465
+static ctl_table md_table[] = {
17466
+ {DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min",
17467
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
17468
+ {DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max",
17469
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
17473
+static ctl_table md_dir_table[] = {
17474
+ {DEV_EVMS_MD, "md", NULL, 0, 0555, md_table},
17478
+static ctl_table evms_dir_table[] = {
17479
+ {DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table},
17483
+static ctl_table dev_dir_table[] = {
17484
+ {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
17488
+/********** Required EVMS Plugin Functions **********/
17491
+ * Function: md_discover
17492
+ * We should only export complete MD device nodes
17494
+static int md_discover( evms_logical_node_t ** discover_list )
17496
+ LOG_ENTRY_EXIT("md_discover() ENTRY\n");
17498
+ /* initialize global variable */
17499
+ exported_nodes = 0;
17500
+ cur_discover_list = discover_list;
17501
+ evms_md_autostart_arrays(discover_list);
17503
+ LOG_ENTRY_EXIT("md_discover() EXIT (exported nodes: %d)\n", exported_nodes);
17504
+ cur_discover_list = NULL;
17505
+ return(exported_nodes);
17510
+ * Function: md_discover_end
17512
+static int md_end_discover( evms_logical_node_t ** discover_list )
17516
+ struct md_list_head *tmp;
17517
+ int done = FALSE;
17519
+ rc = md_discover(discover_list);
17523
+ ITERATE_MDDEV(mddev,tmp){
17524
+ if (!mddev->nr_raid_disks) {
17525
+ free_mddev(mddev);
17529
+ if (mddev->flag & EVMS_MD_INCOMPLETE) {
17530
+ LOG_DETAILS("trying to run incomplete array md%d\n", mdidx(mddev));
17531
+ evms_md_autorun_array(discover_list,mddev);
17543
+ * Function: md_delete_node
17545
+static int md_delete( evms_logical_node_t * node)
17547
+ md_instance_data_t *MDID;
17550
+ MDID = node->instance_data;
17551
+ mddev = MDID->mddev;
17553
+ LOG_DEFAULT("md_delete() name=%s\n", evms_md_partition_name(node));
17555
+ do_md_stop(mddev,0);
17557
+ evms_cs_deallocate_memory(MDID);
17558
+ evms_cs_deallocate_logical_node(node);
17564
+ * Function: md_read
17566
+static void md_read( evms_logical_node_t * node,
17569
+ md_instance_data_t *MDID;
17572
+ MDID = node->instance_data;
17573
+ mddev = MDID->mddev;
17574
+ if ((eio->rsector + eio->rsize) > node->total_vsectors)
17575
+ EVMS_IO_ERROR(eio);
17577
+ if (mddev && mddev->pers)
17578
+ mddev->pers->make_request(mddev, READ, eio);
17584
+ * Function: md_write
17586
+static void md_write( evms_logical_node_t * node,
17589
+ md_instance_data_t *MDID;
17592
+ MDID = node->instance_data;
17593
+ mddev = MDID->mddev;
17594
+ if ((eio->rsector + eio->rsize) > node->total_vsectors)
17595
+ EVMS_IO_ERROR(eio);
17597
+ if (mddev && mddev->pers)
17598
+ mddev->pers->make_request(mddev, WRITE, eio);
17604
+ * Function: md_init_io
17606
+static int md_init_io( evms_logical_node_t * node,
17608
+ evms_sector_t sect_nr,
17609
+ evms_sector_t num_sects, /* # of sectors */
17610
+ void * buf_addr ) /* buffer address */
17612
+ md_instance_data_t *MDID;
17616
+ MDID = node->instance_data;
17617
+ mddev = MDID->mddev;
17618
+ if (sect_nr + num_sects > node->total_vsectors) {
17619
+ LOG_ERROR(" md_init_io() attempt to %s beyond MD device(%s) boundary(%Lu) with sect_nr(%Lu) and num_sects(%Lu)\n",
17620
+ rw ? "WRITE" : "READ", evms_md_partition_name(node),node->total_vsectors,sect_nr,num_sects);
17623
+ if (!rc && mddev && mddev->pers)
17624
+ rc = mddev->pers->init_io(mddev, rw, sect_nr, num_sects, buf_addr);
17632
+ * Function: md_ioctl
17634
+static int md_ioctl(
17635
+ evms_logical_node_t * node,
17636
+ struct inode * inode,
17637
+ struct file * file,
17638
+ unsigned int cmd,
17639
+ unsigned long arg)
17641
+ md_instance_data_t * MDID = node->instance_data;
17645
+ if ((!inode) || (!MDID) )
17651
+ * We have a problem here : there is no easy way to give a CHS
17652
+ * virtual geometry. We currently pretend that we have a 2 heads
17653
+ * 4 sectors (with a BIG number of cylinders...). This drives
17654
+ * dosfs just mad... ;-)
17657
+ case HDIO_GETGEO:
17659
+ struct hd_geometry hdgeo;
17661
+ hdgeo.sectors = 4;
17662
+ hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
17663
+ hdgeo.heads / hdgeo.sectors;
17665
+ if (copy_to_user((int *)arg,
17671
+ case EVMS_QUIESCE_VOLUME:
17672
+ case EVMS_GET_DISK_LIST:
17673
+ case EVMS_CHECK_MEDIA_CHANGE:
17674
+ case EVMS_REVALIDATE_DISK:
17675
+ case EVMS_OPEN_VOLUME:
17676
+ case EVMS_CLOSE_VOLUME:
17677
+ rc = md_ioctl_cmd_broadcast(
17678
+ node, inode, file, cmd, arg);
17680
+ case EVMS_PLUGIN_IOCTL:
17681
+ rc = md_direct_ioctl(
17682
+ inode, file, cmd, arg);
17685
+ mddev = MDID->mddev;
17686
+ if (mddev == NULL) {
17688
+ } else if (mddev->pers->evms_ioctl == NULL) {
17691
+ rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg);
17698
+static int md_ioctl_cmd_broadcast(
17699
+ evms_logical_node_t *node,
17700
+ struct inode *inode,
17701
+ struct file *file,
17702
+ unsigned long cmd,
17703
+ unsigned long arg)
17706
+ md_instance_data_t *MDID;
17708
+ struct md_list_head *tmp;
17709
+ mdk_rdev_t *rdev;
17711
+ MDID = node->instance_data;
17712
+ mddev = MDID->mddev;
17714
+ /* broadcast this cmd to all children */
17715
+ ITERATE_RDEV(mddev,rdev,tmp) {
17716
+ if (!rdev->mddev) {
17720
+ if (!rdev->virtual_spare) {
17721
+ rc |= IOCTL(rdev->node, inode, file, cmd, arg);
17728
+static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev)
17730
+ mdk_rdev_t *rdev;
17731
+ mdp_disk_t *disk = NULL;
17734
+ if (evms_md_find_rdev(mddev,dev))
17737
+ LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__);
17738
+ if( evms_cs_allocate_memory((void**)&rdev, sizeof(*rdev)))
17741
+ memset(rdev, 0, sizeof(*rdev));
17743
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
17744
+ disk = mddev->sb->disks + i;
17745
+ if (!disk->major && !disk->minor)
17747
+ if (disk_removed(disk))
17750
+ if (i == MD_SB_DISKS) {
17751
+ LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev));
17752
+ evms_cs_deallocate_memory(rdev);
17756
+ if (disk_removed(disk)) {
17760
+ if (disk->number != i) {
17762
+ evms_cs_deallocate_memory(rdev);
17766
+ disk->number = i;
17769
+ disk->raid_disk = disk->number;
17770
+ disk->major = MAJOR(dev);
17771
+ disk->minor = MINOR(dev);
17773
+ mark_disk_spare(disk);
17775
+ rdev->mddev = mddev;
17777
+ rdev->desc_nr = disk->number;
17778
+ rdev->virtual_spare = 1;
17780
+ /* bind rdev to mddev array */
17781
+ md_list_add(&rdev->all, &all_raid_disks);
17782
+ md_list_add(&rdev->same_set, &mddev->disks);
17783
+ MD_INIT_LIST_HEAD(&rdev->pending);
17785
+ mddev->sb->nr_disks++;
17786
+ mddev->sb->spare_disks++;
17787
+ mddev->sb->working_disks++;
17790
+ mddev->sb_dirty = 1;
17792
+ evms_md_update_sb(mddev);
17797
+static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev)
17799
+ mdk_rdev_t *rdev = NULL;
17800
+ mdp_disk_t *disk;
17803
+ disk = evms_md_find_disk(mddev,dev);
17807
+ rdev = evms_md_find_rdev(mddev,dev);
17809
+ if (rdev && !rdev->faulty) {
17811
+ * The disk is active in the array,
17812
+ * must ask the personality to do it
17814
+ if (mddev->pers && mddev->pers->diskop) {
17815
+ /* Assume spare, try to remove it first. */
17816
+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
17818
+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
17824
+ remove_descriptor(disk,mddev->sb);
17826
+ kick_rdev_from_array(rdev);
17827
+ mddev->sb_dirty = 1;
17828
+ evms_md_update_sb(mddev);
17834
+static int evms_md_activate_spare(mddev_t *mddev, kdev_t dev)
17836
+ mdk_rdev_t *rdev = NULL;
17837
+ evms_md_activate_spare_t activate_spare;
17838
+ unsigned long flags;
17841
+ rdev = evms_md_find_rdev(mddev,dev);
17843
+ if (mddev->recovery_running) {
17846
+ activate_spare.mddev = mddev;
17847
+ activate_spare.spare = &mddev->sb->disks[rdev->sb->this_disk.number];
17848
+ md_spin_lock_irqsave(&activate_spare_list_lock, flags);
17849
+ if (evms_activate_spare_list == NULL)
17850
+ evms_activate_spare_tail = &evms_activate_spare_list;
17851
+ *evms_activate_spare_tail = &activate_spare;
17852
+ evms_activate_spare_tail = &activate_spare.next;
17853
+ activate_spare.next = NULL;
17854
+ md_spin_unlock_irqrestore(&activate_spare_list_lock, flags);
17856
+ mddev->sb->raid_disks++;
17857
+ evms_md_recover_arrays();
17865
+static int evms_md_deactivate_disk(mddev_t *mddev, kdev_t dev)
17867
+ mdk_rdev_t *rdev = NULL;
17868
+ mdp_disk_t *disk;
17871
+ disk = evms_md_find_disk(mddev,dev);
17872
+ rdev = evms_md_find_rdev(mddev,dev);
17873
+ if (!disk || !rdev || rdev->faulty)
17876
+ /* Make sure it's not a spare */
17877
+ if (disk_spare(disk))
17880
+ * The disk is active in the array,
17881
+ * must ask the personality to do it
17883
+ if (mddev->pers && mddev->pers->diskop) {
17884
+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_DEACTIVATE_DISK);
17886
+ mark_disk_spare(disk);
17887
+ mddev->sb->active_disks--;
17888
+ mddev->sb->raid_disks--;
17889
+ mddev->sb->spare_disks++;
17890
+ mddev->sb_dirty = 1;
17891
+ evms_md_update_sb(mddev);
17901
+ * Function: md_direct_ioctl
17903
+ * This function provides a method for user-space to communicate directly
17904
+ * with a plugin in the kernel.
17906
+static int md_direct_ioctl(
17907
+ struct inode * inode,
17908
+ struct file * file,
17909
+ unsigned int cmd,
17910
+ unsigned long args )
17912
+ evms_plugin_ioctl_t argument;
17914
+ mddev_t *mddev = NULL;
17915
+ evms_md_ioctl_t ioctl_arg;
17916
+ evms_md_kdev_t device;
17917
+ evms_md_array_info_t array_info, *usr_array_info;
17920
+ // Copy user's parameters to kernel space
17921
+ if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) )
17924
+ // Make sure this is supposed to be our ioctl.
17925
+ if ( argument.feature_id != md_plugin_header.id )
17928
+ // Copy user's md ioclt parmeters to kernel space
17929
+ if ( copy_from_user(&ioctl_arg,
17930
+ (evms_md_ioctl_t*)argument.feature_ioctl_data,
17931
+ sizeof(ioctl_arg)) )
17934
+ if (ioctl_arg.mddev_idx < MAX_MD_DEVS) {
17935
+ md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx);
17936
+ mddev = kdev_to_mddev(md_kdev);
17937
+ if (mddev == NULL)
17944
+ switch(argument.feature_command) {
17945
+ case EVMS_MD_PERS_IOCTL_CMD:
17946
+ if (mddev->pers->md_pers_ioctl == NULL)
17948
+ rc = mddev->pers->md_pers_ioctl(mddev,
17951
+ copy_to_user((evms_md_ioctl_t*)argument.feature_ioctl_data,
17953
+ sizeof(ioctl_arg));
17956
+ case EVMS_MD_ADD:
17957
+ if ( copy_from_user(&device,
17958
+ (evms_md_kdev_t*)ioctl_arg.arg,
17959
+ sizeof(device)) )
17962
+ rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor));
17965
+ case EVMS_MD_REMOVE:
17966
+ if ( copy_from_user(&device,
17967
+ (evms_md_kdev_t*)ioctl_arg.arg,
17968
+ sizeof(device)) )
17971
+ rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor));
17974
+ case EVMS_MD_ACTIVATE:
17975
+ if ( copy_from_user(&device,
17976
+ (evms_md_kdev_t*)ioctl_arg.arg,
17977
+ sizeof(device)) )
17980
+ rc = evms_md_activate_spare(mddev,MKDEV(device.major, device.minor));
17983
+ case EVMS_MD_DEACTIVATE:
17984
+ if ( copy_from_user(&device,
17985
+ (evms_md_kdev_t*)ioctl_arg.arg,
17986
+ sizeof(device)) )
17989
+ rc = evms_md_deactivate_disk(mddev,MKDEV(device.major, device.minor));
17992
+ case EVMS_MD_GET_ARRAY_INFO:
17994
+ usr_array_info = (evms_md_array_info_t*)ioctl_arg.arg;
17995
+ if ( copy_from_user(&array_info, usr_array_info,
17996
+ sizeof(array_info)) )
17999
+ array_info.state = 0;
18000
+ if (mddev->curr_resync)
18001
+ array_info.state |= EVMS_MD_ARRAY_SYNCING;
18002
+ copy_to_user(&usr_array_info->state, &array_info.state,
18003
+ sizeof(usr_array_info->state));
18004
+ if (copy_to_user(array_info.sb, mddev->sb,
18005
+ sizeof(mdp_super_t)))
18015
+ argument.status = rc;
18016
+ copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
18023
+void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
18025
+ unsigned int minor = MINOR(dev);
18027
+ if (MAJOR(dev) != MD_MAJOR) {
18031
+ if (evms_mddev_map[minor].mddev != NULL) {
18035
+ evms_mddev_map[minor].mddev = mddev;
18036
+ evms_mddev_map[minor].data = data;
18039
+void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev)
18041
+ unsigned int minor = MINOR(dev);
18043
+ if (MAJOR(dev) != MD_MAJOR) {
18047
+ if (evms_mddev_map[minor].mddev != mddev) {
18051
+ evms_mddev_map[minor].mddev = NULL;
18052
+ evms_mddev_map[minor].data = NULL;
18055
+static mddev_t * alloc_mddev (kdev_t dev)
18059
+ if (MAJOR(dev) != MD_MAJOR) {
18063
+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
18067
+ memset(mddev, 0, sizeof(*mddev));
18069
+ mddev->__minor = MINOR(dev);
18070
+ init_MUTEX(&mddev->reconfig_sem);
18071
+ init_MUTEX(&mddev->recovery_sem);
18072
+ init_MUTEX(&mddev->resync_sem);
18073
+ MD_INIT_LIST_HEAD(&mddev->disks);
18074
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
18075
+ atomic_set(&mddev->active, 0);
18078
+ * The 'base' mddev is the one with data NULL.
18079
+ * personalities can create additional mddevs
18082
+ evms_md_add_mddev_mapping(mddev, dev, 0);
18083
+ md_list_add(&mddev->all_mddevs, &all_mddevs);
18085
+ MOD_INC_USE_COUNT;
18090
+mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr)
18092
+ mdk_rdev_t * rdev;
18093
+ struct md_list_head *tmp;
18095
+ ITERATE_RDEV(mddev,rdev,tmp) {
18096
+ if (rdev->desc_nr == nr)
18103
+mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev)
18105
+ struct md_list_head *tmp;
18106
+ mdk_rdev_t *rdev;
18108
+ ITERATE_RDEV(mddev,rdev,tmp) {
18109
+ if (rdev->dev == dev)
18115
+mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, evms_logical_node_t * node)
18117
+ struct md_list_head *tmp;
18118
+ mdk_rdev_t *rdev;
18120
+ ITERATE_RDEV(mddev,rdev,tmp) {
18121
+ if (rdev->node == node)
18127
+static MD_LIST_HEAD(device_names);
18129
+static char * org_partition_name (kdev_t dev)
18131
+ struct gendisk *hd;
18132
+ static char nomem [] = "<nomem>";
18133
+ dev_name_t *dname;
18134
+ struct md_list_head *tmp = device_names.next;
18136
+ while (tmp != &device_names) {
18137
+ dname = md_list_entry(tmp, dev_name_t, list);
18138
+ if (dname->dev == dev)
18139
+ return dname->name;
18143
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
18148
+ * ok, add this new device name to the list
18150
+ hd = get_gendisk (dev);
18151
+ dname->name = NULL;
18153
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
18154
+ if (!dname->name) {
18155
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
18156
+ dname->name = dname->namebuf;
18159
+ dname->dev = dev;
18160
+ MD_INIT_LIST_HEAD(&dname->list);
18161
+ md_list_add(&dname->list, &device_names);
18163
+ return dname->name;
18167
+#define EVMS_MD_NULL_PARTITION_NAME "<EVMS_NODE_NO_NAME>"
18168
+char * evms_md_partition_name (evms_logical_node_t *node)
18170
+ if (node && node->name)
18171
+ return node->name;
18173
+ return EVMS_MD_NULL_PARTITION_NAME;
18176
+static char * get_partition_name (mdk_rdev_t *rdev)
18179
+ return evms_md_partition_name(rdev->node);
18181
+ return org_partition_name(rdev->dev);
18185
+ * Function: evms_md_calc_dev_sboffset
18186
+ * return the LSN for md super block.
18188
+static u_int64_t evms_md_calc_dev_sboffset (evms_logical_node_t *node,mddev_t *mddev, int persistent)
18190
+ u_int64_t size = 0;
18192
+ size = node->total_vsectors;
18193
+ if (persistent) {
18194
+ size = MD_NEW_SIZE_SECTORS(size);
18196
+ return size; /* size in sectors */
18200
+ * Function: evms_md_calc_dev_size
18201
+ * return data size (in blocks) for an "extended" device.
18203
+static unsigned long evms_md_calc_dev_size (evms_logical_node_t *node,
18207
+ unsigned long size;
18208
+ u_int64_t size_in_sectors;
18210
+ size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent);
18211
+ size = size_in_sectors >> 1;
18212
+ if (!mddev->sb) {
18216
+ if (mddev->sb->chunk_size)
18217
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
18221
+static unsigned int zoned_raid_size (mddev_t *mddev)
18223
+ unsigned int mask;
18224
+ mdk_rdev_t * rdev;
18225
+ struct md_list_head *tmp;
18227
+ if (!mddev->sb) {
18232
+ * do size and offset calculations.
18234
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
18236
+ ITERATE_RDEV(mddev,rdev,tmp) {
18237
+ rdev->size &= mask;
18238
+ evms_md_size[mdidx(mddev)] += rdev->size;
18244
+ * We check wether all devices are numbered from 0 to nb_dev-1. The
18245
+ * order is guaranteed even after device name changes.
18247
+ * Some personalities (raid0, linear) use this. Personalities that
18248
+ * provide data have to be able to deal with loss of individual
18249
+ * disks, so they do their checking themselves.
18251
+int evms_md_check_ordering (mddev_t *mddev)
18254
+ mdk_rdev_t *rdev;
18255
+ struct md_list_head *tmp;
18258
+ * First, all devices must be fully functional
18260
+ ITERATE_RDEV(mddev,rdev,tmp) {
18261
+ if (rdev->faulty) {
18262
+ LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n",
18263
+ mdidx(mddev), get_partition_name(rdev));
18269
+ ITERATE_RDEV(mddev,rdev,tmp) {
18272
+ if (c != mddev->nb_dev) {
18276
+ if (mddev->nb_dev != mddev->sb->raid_disks) {
18277
+ LOG_ERROR("[md%d] array needs %d disks, has %d, aborting.\n",
18278
+ mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
18282
+ * Now the numbering check
18284
+ for (i = 0; i < mddev->nb_dev; i++) {
18286
+ ITERATE_RDEV(mddev,rdev,tmp) {
18287
+ if (rdev->desc_nr == i)
18291
+ LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i);
18295
+ LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i);
18304
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
18306
+ if (disk_active(disk)) {
18307
+ sb->working_disks--;
18309
+ if (disk_spare(disk)) {
18310
+ sb->spare_disks--;
18311
+ sb->working_disks--;
18313
+ sb->failed_disks--;
18317
+ disk->major = disk->minor = 0;
18318
+ mark_disk_removed(disk);
18321
+#define BAD_MAGIC \
18322
+"invalid raid superblock magic on %s\n"
18324
+#define BAD_MINOR \
18325
+"%s: invalid raid minor (%x)\n"
18328
+"disabled device %s, could not read superblock.\n"
18330
+#define BAD_CSUM \
18331
+"invalid superblock checksum on %s\n"
18334
+static int alloc_array_sb (mddev_t * mddev)
18341
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
18342
+ if (!mddev->sb) {
18343
+ LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
18346
+ md_clear_page(mddev->sb);
18350
+static int alloc_disk_sb (mdk_rdev_t * rdev)
18355
+ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
18357
+ LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
18360
+ md_clear_page(rdev->sb);
18366
+ * Function: free_disk_sb
18369
+static void free_disk_sb (mdk_rdev_t * rdev)
18372
+ free_page((unsigned long) rdev->sb);
18374
+ rdev->sb_offset = 0;
18377
+ if (!rdev->virtual_spare && !rdev->faulty)
18383
+ * Function: evms_md_read_disk_sb
18384
+ * Read the MD superblock.
18386
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev)
18389
+ evms_logical_node_t *node = rdev->node;
18390
+ u_int64_t sb_offset_in_sectors;
18396
+ if (node->total_vsectors <= MD_RESERVED_SECTORS) {
18397
+ LOG_DETAILS("%s is too small, total_vsectors(%Lu)\n",
18398
+ evms_md_partition_name(node), node->total_vsectors);
18403
+ * Calculate the position of the superblock,
18404
+ * it's at the end of the disk
18406
+ sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1);
18407
+ rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1);
18408
+ LOG_DEBUG("(read) %s's sb offset(%Lu) total_vsectors(%Lu)\n",
18409
+ evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors);
18412
+ * Read superblock
18414
+ rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb);
18417
+ LOG_DEBUG(" [events: %x]\n", rdev->sb->events_lo);
18419
+ LOG_ERROR(NO_SB, evms_md_partition_name(node));
18424
+static unsigned int calc_sb_csum (mdp_super_t * sb)
18426
+ unsigned int disk_csum, csum;
18428
+ disk_csum = sb->sb_csum;
18430
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
18431
+ sb->sb_csum = disk_csum;
18438
+ * Check one RAID superblock for generic plausibility
18441
+static int check_disk_sb (mdk_rdev_t * rdev)
18444
+ int ret = -EINVAL;
18452
+ if (sb->md_magic != MD_SB_MAGIC) {
18453
+ LOG_DEBUG(BAD_MAGIC, get_partition_name(rdev));
18457
+ if (sb->md_minor >= MAX_MD_DEVS) {
18458
+ LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor);
18461
+ if (calc_sb_csum(sb) != sb->sb_csum) {
18462
+ LOG_ERROR(BAD_CSUM, get_partition_name(rdev));
18470
+static kdev_t dev_unit(kdev_t dev)
18472
+ unsigned int mask;
18473
+ struct gendisk *hd = get_gendisk(dev);
18477
+ mask = ~((1 << hd->minor_shift) - 1);
18479
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
18482
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
18484
+ struct md_list_head *tmp;
18485
+ mdk_rdev_t *rdev;
18487
+ ITERATE_RDEV(mddev,rdev,tmp)
18488
+ if (dev_unit(rdev->dev) == dev_unit(dev))
18494
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
18496
+ struct md_list_head *tmp;
18497
+ mdk_rdev_t *rdev;
18499
+ ITERATE_RDEV(mddev1,rdev,tmp)
18500
+ if (match_dev_unit(mddev2, rdev->dev))
18507
+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
18509
+ mdk_rdev_t *same_pdev;
18511
+ if (rdev->mddev) {
18516
+ same_pdev = match_dev_unit(mddev, rdev->dev);
18518
+ LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. True\n"
18519
+ " protection against single-disk failure might be compromised.\n",
18520
+ mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev));
18522
+ md_list_add(&rdev->same_set, &mddev->disks);
18523
+ rdev->mddev = mddev;
18525
+ if (rdev->sb && disk_active(&rdev->sb->this_disk))
18526
+ mddev->nr_raid_disks++;
18527
+ LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
18530
+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
18532
+ if (!rdev->mddev) {
18536
+ md_list_del(&rdev->same_set);
18537
+ MD_INIT_LIST_HEAD(&rdev->same_set);
18538
+ rdev->mddev->nb_dev--;
18539
+ if (rdev->sb && disk_active(&rdev->sb->this_disk))
18540
+ rdev->mddev->nr_raid_disks--;
18541
+ LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
18542
+ rdev->mddev = NULL;
18547
+ * Function: evms_md_export_rdev
18548
+ * EVMS MD version of export_rdev()
18549
+ * Discard this MD "extended" device
18551
+static void evms_md_export_rdev (mdk_rdev_t * rdev)
18553
+ LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev));
18556
+ free_disk_sb(rdev);
18557
+ md_list_del(&rdev->all);
18558
+ MD_INIT_LIST_HEAD(&rdev->all);
18559
+ if (rdev->pending.next != &rdev->pending) {
18560
+ LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev));
18561
+ md_list_del(&rdev->pending);
18562
+ MD_INIT_LIST_HEAD(&rdev->pending);
18564
+ if (rdev->node) {
18565
+ LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev));
18566
+ if (cur_discover_list) {
18567
+ LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__,
18568
+ get_partition_name(rdev));
18569
+ evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node);
18571
+ DELETE(rdev->node);
18572
+ rdev->node = NULL;
18575
+ rdev->faulty = 0;
18580
+static void kick_rdev_from_array (mdk_rdev_t * rdev)
18582
+ LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev));
18583
+ unbind_rdev_from_array(rdev);
18584
+ evms_md_export_rdev(rdev);
18587
+static void export_array (mddev_t *mddev)
18589
+ struct md_list_head *tmp;
18590
+ mdk_rdev_t *rdev;
18591
+ mdp_super_t *sb = mddev->sb;
18593
+ LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev));
18595
+ mddev->sb = NULL;
18596
+ free_page((unsigned long) sb);
18599
+ LOG_DEBUG("%s: removing all extended devices belong to md%d\n",__FUNCTION__,mdidx(mddev));
18600
+ ITERATE_RDEV(mddev,rdev,tmp) {
18601
+ if (!rdev->mddev) {
18605
+ kick_rdev_from_array(rdev);
18607
+ if (mddev->nb_dev)
18611
+static void free_mddev (mddev_t *mddev)
18618
+ export_array(mddev);
18619
+ evms_md_size[mdidx(mddev)] = 0;
18623
+ * Make sure nobody else is using this mddev
18624
+ * (careful, we rely on the global kernel lock here)
18626
+ while (md_atomic_read(&mddev->resync_sem.count) != 1)
18628
+ while (md_atomic_read(&mddev->recovery_sem.count) != 1)
18631
+ evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
18632
+ md_list_del(&mddev->all_mddevs);
18633
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
18635
+ MOD_DEC_USE_COUNT;
18639
+static void print_desc(mdp_disk_t *desc)
18641
+ printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number,
18642
+ desc->raid_disk,desc->state);
18645
+static void print_sb(mdp_super_t *sb)
18649
+ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
18650
+ sb->major_version, sb->minor_version, sb->patch_version,
18651
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
18653
+ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
18654
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
18655
+ sb->layout, sb->chunk_size);
18656
+ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n",
18657
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
18658
+ sb->failed_disks, sb->spare_disks,
18659
+ sb->sb_csum, sb->events_lo);
18661
+ for (i = 0; i < MD_SB_DISKS; i++) {
18662
+ mdp_disk_t *desc;
18664
+ desc = sb->disks + i;
18665
+ if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
18666
+ printk(" D %2d: ", i);
18667
+ print_desc(desc);
18670
+ printk(" THIS: ");
18671
+ print_desc(&sb->this_disk);
18675
+static void print_rdev(mdk_rdev_t *rdev)
18677
+ printk("rdev %s: SZ:%08ld F:%d DN:%d ",
18678
+ get_partition_name(rdev),
18679
+ rdev->size, rdev->faulty, rdev->desc_nr);
18681
+ printk("rdev superblock:\n");
18682
+ print_sb(rdev->sb);
18684
+ printk("no rdev superblock!\n");
18687
+void evms_md_print_devices (void)
18689
+ struct md_list_head *tmp, *tmp2;
18690
+ mdk_rdev_t *rdev;
18694
+ printk(": **********************************\n");
18695
+ printk(": * <COMPLETE RAID STATE PRINTOUT> *\n");
18696
+ printk(": **********************************\n");
18697
+ ITERATE_MDDEV(mddev,tmp) {
18698
+ printk("md%d: ", mdidx(mddev));
18700
+ ITERATE_RDEV(mddev,rdev,tmp2)
18701
+ printk("<%s>", get_partition_name(rdev));
18704
+ printk(" array superblock:\n");
18705
+ print_sb(mddev->sb);
18707
+ printk(" no array superblock.\n");
18709
+ ITERATE_RDEV(mddev,rdev,tmp2)
18710
+ print_rdev(rdev);
18712
+ printk(": **********************************\n");
18716
+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
18719
+ mdp_super_t *tmp1, *tmp2;
18721
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
18722
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
18724
+ if (!tmp1 || !tmp2) {
18726
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
18734
+ * nr_disks is not constant
18736
+ tmp1->nr_disks = 0;
18737
+ tmp2->nr_disks = 0;
18739
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
18753
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
18755
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
18756
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
18757
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
18758
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
18766
+ * Function: evms_md_find_rdev_all
18767
+ * EVMS MD version of find_rdev_all() above
18768
+ * Search entire all_raid_disks for "node"
18769
+ * Return the MD "extended" device if found.
18771
+static mdk_rdev_t * evms_md_find_rdev_all (evms_logical_node_t *node)
18773
+ struct md_list_head *tmp;
18774
+ mdk_rdev_t *rdev;
18776
+ tmp = all_raid_disks.next;
18777
+ while (tmp != &all_raid_disks) {
18778
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
18779
+ if (rdev->node == node)
18788
+ * Function: evms_md_write_disk_sb
18789
+ * EVMS MD version of write_disk_sb
18791
+static int evms_md_write_disk_sb(mdk_rdev_t * rdev)
18793
+ unsigned long size;
18794
+ u_int64_t sb_offset_in_sectors;
18800
+ if (rdev->faulty) {
18804
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
18809
+ sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1);
18810
+ if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) {
18811
+ LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n",
18812
+ get_partition_name(rdev),
18814
+ (unsigned long)(sb_offset_in_sectors >> 1));
18818
+ * If the disk went offline meanwhile and it's just a spare, then
18819
+ * its size has changed to zero silently, and the MD code does
18820
+ * not yet know that it's faulty.
18822
+ size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1);
18823
+ if (size != rdev->size) {
18824
+ LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n",
18825
+ get_partition_name(rdev), rdev->size, size);
18829
+ LOG_DETAILS("(write) %s's sb offset: %Lu\n",get_partition_name(rdev), sb_offset_in_sectors);
18831
+ INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb);
18837
+static int evms_md_sync_sbs(mddev_t * mddev)
18839
+ mdk_rdev_t *rdev;
18840
+ struct md_list_head *tmp;
18841
+ mdp_disk_t * disk;
18843
+ ITERATE_RDEV(mddev,rdev,tmp) {
18844
+ if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
18847
+ /* copy everything from the master */
18848
+ *rdev->sb = *mddev->sb;
18850
+ /* this_disk is unique, copy it from the master */
18851
+// rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr];
18852
+ // use the SB disk array since if update occurred on normal shutdown
18853
+ // the rdevs may be out of date.
18854
+ disk = evms_md_find_disk(mddev, rdev->dev);
18856
+ rdev->sb->this_disk = *disk;
18859
+ rdev->sb->sb_csum = calc_sb_csum(rdev->sb);
18864
+int evms_md_update_sb_sync(mddev_t * mddev)
18866
+ mdk_rdev_t *rdev;
18867
+ struct md_list_head *tmp;
18869
+ ITERATE_RDEV(mddev,rdev,tmp) {
18870
+ if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
18873
+ /* found first good device, so read the new SB */
18874
+ if (!evms_md_read_disk_sb(rdev)){
18875
+ /* this_disk is unique, copy it from the master */
18876
+ if (rdev->sb->md_magic == MD_SB_MAGIC) {
18877
+ *mddev->sb = *rdev->sb;
18878
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
18879
+ evms_md_update_sb(mddev);
18888
+int evms_md_update_sb(mddev_t * mddev)
18890
+ int err, count = 100;
18891
+ struct md_list_head *tmp;
18892
+ mdk_rdev_t *rdev;
18896
+ mddev->sb->utime = CURRENT_TIME;
18897
+ if ((++mddev->sb->events_lo)==0)
18898
+ ++mddev->sb->events_hi;
18900
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
18902
+ * oops, this 64-bit counter should never wrap.
18903
+ * Either we are in around ~1 trillion A.C., assuming
18904
+ * 1 reboot per second, or we have a bug:
18907
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
18909
+ evms_md_sync_sbs(mddev);
18912
+ * do not write anything to disk if using
18913
+ * nonpersistent superblocks
18915
+ if (mddev->sb->not_persistent)
18918
+ LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev));
18921
+ ITERATE_RDEV(mddev,rdev,tmp) {
18922
+ if (!rdev->virtual_spare && !rdev->faulty && !rdev->alias_device) {
18923
+ LOG_DETAILS(" %s [events: %x]",
18924
+ get_partition_name(rdev),
18925
+ rdev->sb->events_lo);
18926
+ err += evms_md_write_disk_sb(rdev);
18928
+ if (rdev->faulty)
18929
+ LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev));
18930
+ if (rdev->alias_device)
18931
+ LOG_DETAILS(" skipping alias %s\n", get_partition_name(rdev));
18932
+ if (rdev->virtual_spare)
18933
+ LOG_DETAILS(" skipping virtual spare.\n");
18938
+ LOG_WARNING("errors occurred during superblock update, repeating\n");
18941
+ LOG_ERROR("excessive errors occurred during superblock update, exiting\n");
18947
+ * Function: evms_md_import_device
18948
+ * Insure that node is not yet imported.
18949
+ * Read and validate the MD super block on this device
18950
+ * Add to the global MD "extended" devices list (all_raid_disks)
18953
+static int evms_md_import_device (evms_logical_node_t **discover_list,
18954
+ evms_logical_node_t *node,
18958
+ mdk_rdev_t *rdev;
18960
+ LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node));
18962
+ if (evms_md_find_rdev_all(node)) {
18963
+ LOG_DEBUG("%s exists\n", evms_md_partition_name(node));
18967
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
18969
+ LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node));
18972
+ memset(rdev, 0, sizeof(*rdev));
18974
+ if ((err = alloc_disk_sb(rdev)))
18977
+ rdev->node = node; /* set this for evms_md_read_disk_sb() */
18979
+ rdev->desc_nr = -1;
18980
+ rdev->faulty = 0;
18982
+ if (!node->total_vsectors) {
18983
+ LOG_ERROR("%s has zero size, marking faulty!\n", evms_md_partition_name(node));
18989
+ if ((err = evms_md_read_disk_sb(rdev))) {
18990
+ LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node));
18993
+ if ((err = check_disk_sb(rdev))) {
18994
+ LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node));
18997
+ if (rdev->sb->level != -4) {
18998
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
18999
+ rdev->sb->this_disk.minor);
19000
+ rdev->desc_nr = rdev->sb->this_disk.number;
19002
+ rdev->old_dev = MKDEV(0, 0);
19003
+ rdev->desc_nr = -1;
19005
+ rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
19006
+ LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr);
19008
+ md_list_add(&rdev->all, &all_raid_disks);
19009
+ MD_INIT_LIST_HEAD(&rdev->pending);
19011
+ if (rdev->faulty && rdev->sb)
19012
+ free_disk_sb(rdev);
19018
+ free_disk_sb(rdev);
19027
+ * Function: evms_md_analyze_sbs
19028
+ * EVMS MD version of analyze_sbs()
19030
+static int evms_md_analyze_sbs (mddev_t * mddev)
19032
+ int out_of_date = 0, i;
19033
+ struct md_list_head *tmp, *tmp2;
19034
+ mdk_rdev_t *rdev, *rdev2, *freshest;
19037
+ LOG_ENTRY_EXIT("Analyzing all superblocks...\n");
19039
+ * Verify the RAID superblock on each real device
19041
+ ITERATE_RDEV(mddev,rdev,tmp) {
19042
+ if (rdev->faulty) {
19050
+ if (check_disk_sb(rdev))
19055
+ * The superblock constant part has to be the same
19056
+ * for all disks in the array.
19060
+ ITERATE_RDEV(mddev,rdev,tmp) {
19065
+ if (!sb_equal(sb, rdev->sb)) {
19066
+ LOG_WARNING("kick out %s\n",get_partition_name(rdev));
19067
+ kick_rdev_from_array(rdev);
19073
+ * OK, we have all disks and the array is ready to run. Let's
19074
+ * find the freshest superblock, that one will be the superblock
19075
+ * that represents the whole array.
19078
+ if (alloc_array_sb(mddev))
19083
+ ITERATE_RDEV(mddev,rdev,tmp) {
19086
+ * if the checksum is invalid, use the superblock
19087
+ * only as a last resort. (decrease it's age by
19090
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
19091
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
19092
+ if ((rdev->sb->events_lo--)==0)
19093
+ rdev->sb->events_hi--;
19095
+ LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo);
19102
+ * Find the newest superblock version
19104
+ ev1 = md_event(rdev->sb);
19105
+ ev2 = md_event(freshest->sb);
19106
+ if (ev1 != ev2) {
19112
+ if (out_of_date) {
19113
+ LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest));
19115
+ memcpy (sb, freshest->sb, sizeof(*sb));
19118
+ * at this point we have picked the 'best' superblock
19119
+ * from all available superblocks.
19120
+ * now we validate this superblock and kick out possibly
19123
+ ITERATE_RDEV(mddev,rdev,tmp) {
19125
+ * Kick all non-fresh devices
19128
+ ev1 = md_event(rdev->sb);
19129
+ ev2 = md_event(sb);
19132
+ LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev));
19133
+ kick_rdev_from_array(rdev);
19136
+ LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev));
19142
+ * Remove unavailable and faulty devices ...
19144
+ * note that if an array becomes completely unrunnable due to
19145
+ * missing devices, we do not write the superblock back, so the
19146
+ * administrator has a chance to fix things up. The removal thus
19147
+ * only happens if it's nonfatal to the contents of the array.
19149
+ for (i = 0; i < MD_SB_DISKS; i++) {
19151
+ mdp_disk_t *desc;
19153
+ desc = sb->disks + i;
19156
+ * We kick faulty devices/descriptors immediately.
19158
+ * Note: multipath devices are a special case. Since we
19159
+ * were able to read the superblock on the path, we don't
19160
+ * care if it was previously marked as faulty, it's up now
19163
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
19165
+ ITERATE_RDEV(mddev,rdev,tmp) {
19166
+ if (rdev->desc_nr != desc->number)
19168
+ LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev));
19169
+ kick_rdev_from_array(rdev);
19174
+ LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n",
19175
+ __FUNCTION__ ,mdidx(mddev), desc->number);
19178
+ * Don't call remove_descriptor(),
19179
+ * let the administrator remove it from the user-land */
19180
+ /* remove_descriptor(desc, sb); */
19182
+ } else if (disk_faulty(desc)) {
19184
+ * multipath entry marked as faulty, unfaulty it
19188
+ dev = MKDEV(desc->major, desc->minor);
19190
+ rdev = evms_md_find_rdev(mddev, dev);
19192
+ mark_disk_spare(desc);
19194
+ LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n",
19195
+ __FUNCTION__ ,mdidx(mddev), desc->number);
19197
+ * Don't call remove_descriptor(),
19198
+ * let the administrator remove it from the user-land */
19199
+ /* remove_descriptor(desc, sb); */
19204
+ * Is this device present in the rdev ring?
19207
+ ITERATE_RDEV(mddev,rdev,tmp) {
19209
+ * Multi-path IO special-case: since we have no
19210
+ * this_disk descriptor at auto-detect time,
19211
+ * we cannot check rdev->number.
19212
+ * We can check the device though.
19214
+ if ((sb->level == -4) && (rdev->dev ==
19215
+ MKDEV(desc->major,desc->minor))) {
19219
+ if (rdev->desc_nr == desc->number) {
19227
+ LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n",
19228
+ mdidx(mddev), desc->number);
19230
+ * Don't call remove_descriptor(),
19231
+ * let the administrator remove it from the user-land */
19232
+ /* remove_descriptor(desc, sb); */
19236
+ * Kick all rdevs that are not in the
19237
+ * descriptor array:
19239
+ ITERATE_RDEV(mddev,rdev,tmp) {
19240
+ if (rdev->desc_nr == -1)
19241
+ kick_rdev_from_array(rdev);
19245
+ * Do a final reality check.
19247
+ if (mddev->sb->level != -4) {
19248
+ ITERATE_RDEV(mddev,rdev,tmp) {
19249
+ if (rdev->desc_nr == -1) {
19254
+ * is the desc_nr unique?
19256
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
19257
+ if ((rdev2 != rdev) &&
19258
+ (rdev2->desc_nr == rdev->desc_nr)) {
19266
+#define OLD_VERSION KERN_ALERT \
19267
+"md%d: unsupported raid array version %d.%d.%d\n"
19269
+#define NOT_CLEAN_IGNORE KERN_ERR \
19270
+"md%d: raid array is not clean -- starting background reconstruction\n"
19273
+ * Check if we can support this RAID array
19275
+ if (sb->major_version != MD_MAJOR_VERSION ||
19276
+ sb->minor_version > MD_MINOR_VERSION) {
19278
+ LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n",
19280
+ sb->major_version,
19281
+ sb->minor_version,
19282
+ sb->patch_version);
19286
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
19287
+ (sb->level == 4) || (sb->level == 5)))
19288
+ LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n",
19289
+ mdidx(mddev), sb->level);
19291
+ LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n");
19294
+ LOG_WARNING("ABORT analyze_sbs()!!!\n");
19299
+static int device_size_calculation (mddev_t * mddev)
19301
+ int data_disks = 0, persistent;
19302
+ //unsigned int readahead;
19303
+ mdp_super_t *sb = mddev->sb;
19304
+ struct md_list_head *tmp;
19305
+ mdk_rdev_t *rdev;
19308
+ * Do device size calculation. Bail out if too small.
19309
+ * (we have to do this after having validated chunk_size,
19310
+ * because device size has to be modulo chunk_size)
19312
+ persistent = !mddev->sb->not_persistent;
19313
+ ITERATE_RDEV(mddev,rdev,tmp) {
19314
+ if (rdev->faulty)
19316
+ if (rdev->size) {
19320
+ rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent);
19321
+ if (rdev->size < sb->chunk_size / 1024) {
19322
+ LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n",
19323
+ get_partition_name(rdev), rdev->size, sb->chunk_size / 1024);
19328
+ switch (sb->level) {
19339
+ zoned_raid_size(mddev);
19343
+ zoned_raid_size(mddev);
19344
+ data_disks = sb->raid_disks;
19351
+ data_disks = sb->raid_disks-1;
19354
+ LOG_ERROR("[md%d] unkown level %d\n", mdidx(mddev), sb->level);
19357
+ if (!evms_md_size[mdidx(mddev)])
19358
+ evms_md_size[mdidx(mddev)] = sb->size * data_disks;
19366
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
19367
+"too big chunk_size: %d > %d\n"
19369
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
19370
+"too small chunk_size: %d < %ld\n"
19372
+#define BAD_CHUNKSIZE KERN_ERR \
19373
+"no chunksize specified, see 'man raidtab'\n"
19375
+static int do_md_run (mddev_t * mddev)
19379
+ struct md_list_head *tmp;
19380
+ mdk_rdev_t *rdev;
19383
+ if (!mddev->nb_dev) {
19392
+ * Resize disks to align partitions size on a given
19395
+ evms_md_size[mdidx(mddev)] = 0;
19398
+ * Analyze all RAID superblock(s)
19400
+ if (evms_md_analyze_sbs(mddev)) {
19405
+ chunk_size = mddev->sb->chunk_size;
19406
+ pnum = level_to_pers(mddev->sb->level);
19408
+ mddev->param.chunk_size = chunk_size;
19409
+ mddev->param.personality = pnum;
19411
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
19412
+ if (!chunk_size) {
19414
+ * 'default chunksize' in the old md code used to
19415
+ * be PAGE_SIZE, baaad.
19416
+ * we abort here to be on the safe side. We dont
19417
+ * want to continue the bad practice.
19419
+ printk(BAD_CHUNKSIZE);
19422
+ if (chunk_size > MAX_CHUNK_SIZE) {
19423
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
19427
+ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
19429
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
19433
+ if (chunk_size < PAGE_SIZE) {
19434
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
19439
+ printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
19441
+ if (pnum >= MAX_PERSONALITY) {
19447
+#ifdef CONFIG_KMOD
19448
+ char module_name[80];
19449
+ sprintf (module_name, "md-personality-%d", pnum);
19450
+ request_module (module_name);
19454
+ printk(KERN_ERR "personality %d is not loaded!\n",
19459
+ if (device_size_calculation(mddev))
19463
+ * Drop all container device buffers, from now on
19464
+ * the only valid external interface is through the md
19466
+ * Also find largest hardsector size
19468
+ md_hardsect_sizes[mdidx(mddev)] = 512;
19469
+ ITERATE_RDEV(mddev,rdev,tmp) {
19470
+ if (rdev->faulty)
19472
+ invalidate_device(rdev->dev, 1);
19473
+/* if (get_hardsect_size(rdev->dev)
19474
+ > md_hardsect_sizes[mdidx(mddev)])
19475
+ md_hardsect_sizes[mdidx(mddev)] =
19476
+ get_hardsect_size(rdev->dev); */
19477
+ if (rdev->node->hardsector_size > md_hardsect_sizes[mdidx(mddev)]) {
19478
+ md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size;
19482
+ md_blocksizes[mdidx(mddev)] = 1024;
19483
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
19484
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
19486
+ mddev->pers = pers[pnum];
19488
+ err = mddev->pers->run(mddev);
19490
+ printk("pers->run() failed ...\n");
19491
+ mddev->pers = NULL;
19494
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
19496
+ evms_md_update_sb(mddev);
19498
+ mddev->flag &= ~EVMS_MD_INCOMPLETE; /* Clear incomplete flag */
19503
+#undef TOO_BIG_CHUNKSIZE
19504
+#undef BAD_CHUNKSIZE
19507
+#define OUT(x) do { err = (x); goto out; } while (0)
19510
+#define STILL_MOUNTED KERN_WARNING \
19511
+"md%d still mounted.\n"
19512
+#define STILL_IN_USE \
19513
+"md%d still in use.\n"
19515
+static int do_md_stop (mddev_t * mddev, int ro)
19517
+ int err = 0, resync_interrupted = 0;
19518
+ kdev_t dev = mddev_to_kdev(mddev);
19520
+ if (atomic_read(&mddev->active)>1) {
19521
+ printk(STILL_IN_USE, mdidx(mddev));
19525
+ if (mddev->pers) {
19527
+ * It is safe to call stop here, it only frees private
19528
+ * data. Also, it tells us if a device is unstoppable
19529
+ * (eg. resyncing is in progress)
19531
+ if (mddev->pers->stop_resync)
19532
+ if (mddev->pers->stop_resync(mddev))
19533
+ resync_interrupted = 1;
19535
+ if (mddev->recovery_running)
19536
+ evms_cs_interrupt_thread(evms_md_recovery_thread);
19539
+ * This synchronizes with signal delivery to the
19540
+ * resync or reconstruction thread. It also nicely
19541
+ * hangs the process if some reconstruction has not
19544
+ down(&mddev->recovery_sem);
19545
+ up(&mddev->recovery_sem);
19547
+ invalidate_device(dev, 1);
19555
+ set_device_ro(dev, 0);
19556
+ if (mddev->pers->stop(mddev)) {
19558
+ set_device_ro(dev, 1);
19566
+ * mark it clean only if there was no resync
19569
+ if (!mddev->recovery_running && !resync_interrupted) {
19570
+ printk("marking sb clean...\n");
19571
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
19573
+ evms_md_update_sb_sync(mddev);
19576
+ set_device_ro(dev, 1);
19580
+ * Free resources if final stop
19583
+ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
19584
+ free_mddev(mddev);
19587
+ printk (KERN_INFO
19588
+ "md%d switched to read-only mode.\n", mdidx(mddev));
19594
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list, mddev_t *mddev)
19596
+ mdk_rdev_t *rdev;
19597
+ struct md_list_head *tmp;
19601
+ if (mddev->disks.prev == &mddev->disks) {
19606
+ LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) );
19608
+ ITERATE_RDEV(mddev,rdev,tmp) {
19609
+ LOG_DETAILS(" <%s>\n", get_partition_name(rdev));
19612
+ err = do_md_run (mddev);
19615
+ * remove all nodes consumed by this md device from the discover list
19617
+ ITERATE_RDEV(mddev,rdev,tmp) {
19618
+ LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev));
19619
+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19620
+ flags |= rdev->node->flags;
19622
+ err = evms_md_create_logical_node(discover_list,mddev,flags);
19624
+ exported_nodes++;
19627
+ LOG_WARNING("%s: cannot run array md%d\n",__FUNCTION__,mdidx(mddev));
19628
+ mddev->sb_dirty = 0;
19629
+ do_md_stop (mddev, 0);
19634
+ * lets try to run arrays based on all disks that have arrived
19635
+ * until now. (those are in the ->pending list)
19637
+ * the method: pick the first pending disk, collect all disks with
19638
+ * the same UUID, remove all from the pending list and put them into
19639
+ * the 'same_array' list. Then order this list based on superblock
19640
+ * update time (freshest comes first), kick out 'old' disks and
19641
+ * compare superblocks. If everything's fine then run it.
19643
+ * If "unit" is allocated, then bump its reference count
19645
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list, kdev_t countdev)
19647
+ struct md_list_head candidates;
19648
+ struct md_list_head *tmp;
19649
+ mdk_rdev_t *rdev0, *rdev;
19654
+ LOG_DETAILS("autorun ...\n");
19655
+ while (pending_raid_disks.next != &pending_raid_disks) {
19656
+ rdev0 = md_list_entry(pending_raid_disks.next,
19657
+ mdk_rdev_t, pending);
19658
+ LOG_DETAILS("considering %s ...\n",get_partition_name(rdev0));
19659
+ MD_INIT_LIST_HEAD(&candidates);
19660
+ ITERATE_RDEV_PENDING(rdev,tmp) {
19661
+ if (uuid_equal(rdev0, rdev)) {
19662
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
19663
+ LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\
19664
+ get_partition_name(rdev),get_partition_name(rdev0));
19667
+ LOG_DETAILS(" adding %s ...\n", get_partition_name(rdev));
19668
+ md_list_del(&rdev->pending);
19669
+ md_list_add(&rdev->pending, &candidates);
19674
+ * now we have a set of devices, with all of them having
19675
+ * mostly sane superblocks. It's time to allocate the
19678
+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
19679
+ mddev = kdev_to_mddev(md_kdev);
19680
+ if (mddev && (!(mddev->flag & EVMS_MD_INCOMPLETE))) {
19681
+ LOG_DETAILS("md%d already running, cannot run %s\n",
19682
+ mdidx(mddev), get_partition_name(rdev0));
19684
+ * This is EVMS re-discovery!
19685
+ * Remove all nodes consumed by this md device from the discover list
19687
+ ITERATE_RDEV(mddev,rdev,tmp)
19688
+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19689
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
19690
+ evms_md_export_rdev(rdev);
19695
+ mddev = alloc_mddev(md_kdev);
19696
+ if (mddev == NULL) {
19697
+ LOG_ERROR("cannot allocate memory for md drive.\n");
19700
+ LOG_DETAILS("created md%d\n", mdidx(mddev));
19702
+ LOG_DETAILS("found INCOMPLETE md%d\n", mdidx(mddev));
19705
+ if (md_kdev == countdev)
19706
+ atomic_inc(&mddev->active);
19708
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
19709
+ bind_rdev_to_array(rdev, mddev);
19710
+ md_list_del(&rdev->pending);
19711
+ MD_INIT_LIST_HEAD(&rdev->pending);
19714
+ if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) ||
19715
+ (mddev->nb_dev == rdev0->sb->nr_disks)) {
19716
+ evms_md_autorun_array(discover_list,mddev);
19718
+ mddev->flag |= EVMS_MD_INCOMPLETE;
19719
+ LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n",
19720
+ mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks);
19721
+ ITERATE_RDEV(mddev,rdev,tmp) {
19722
+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19726
+ LOG_DETAILS("... autorun DONE.\n");
19729
+void evms_md_recover_arrays(void)
19731
+ if (!evms_md_recovery_thread) {
19735
+ evms_cs_wakeup_thread(evms_md_recovery_thread);
19738
+int evms_md_error(
19740
+ evms_logical_node_t *node)
19742
+ mdk_rdev_t * rrdev;
19744
+ LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n",
19745
+ mdidx(mddev), node->name,
19746
+ __builtin_return_address(0),__builtin_return_address(1),
19747
+ __builtin_return_address(2),__builtin_return_address(3));
19753
+ rrdev = evms_md_find_rdev_from_node(mddev, node);
19754
+ if (!rrdev || rrdev->faulty)
19756
+ if (!mddev->pers->error_handler
19757
+ || mddev->pers->error_handler(mddev,node) <= 0) {
19758
+ free_disk_sb(rrdev);
19759
+ rrdev->faulty = 1;
19763
+ * if recovery was running, stop it now.
19765
+ if (mddev->pers->stop_resync)
19766
+ mddev->pers->stop_resync(mddev);
19767
+ if (mddev->recovery_running)
19768
+ evms_cs_interrupt_thread(evms_md_recovery_thread);
19769
+ evms_md_recover_arrays();
19774
+int evms_register_md_personality (int pnum, mdk_personality_t *p)
19776
+ if (pnum >= MAX_PERSONALITY) {
19781
+ if (pers[pnum]) {
19787
+ LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum);
19791
+int evms_unregister_md_personality (int pnum)
19793
+ if (pnum >= MAX_PERSONALITY) {
19798
+ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
19799
+ pers[pnum] = NULL;
19803
+mdp_disk_t *evms_md_get_spare(mddev_t *mddev)
19805
+ mdp_super_t *sb = mddev->sb;
19806
+ mdp_disk_t *disk;
19807
+ mdk_rdev_t *rdev;
19808
+// struct md_list_head *tmp;
19811
+ for (i = 0, j = 0; j < mddev->nb_dev; i++) {
19812
+ rdev = evms_md_find_rdev_nr(mddev, i);
19813
+ if (rdev == NULL)
19816
+ if (rdev->faulty)
19819
+ if (!rdev->virtual_spare)
19823
+ disk = &sb->disks[rdev->desc_nr];
19824
+ if (disk_faulty(disk)) {
19828
+ if (disk_active(disk))
19835
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev)
19837
+ mdp_super_t *sb = mddev->sb;
19838
+ mdp_disk_t *disk;
19841
+ for (i=0; i < MD_SB_DISKS; i++) {
19842
+ disk = &sb->disks[i];
19843
+ if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev)))
19849
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
19850
+void evms_md_sync_acct(
19852
+ unsigned long nr_sectors)
19854
+ unsigned int major = MAJOR(dev);
19855
+ unsigned int index;
19857
+ index = disk_index(dev);
19858
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
19861
+ sync_io[major][index] += nr_sectors;
19864
+static int is_mddev_idle(mddev_t *mddev)
19866
+ mdk_rdev_t * rdev;
19867
+ struct md_list_head *tmp;
19869
+ unsigned long curr_events;
19872
+ ITERATE_RDEV(mddev,rdev,tmp) {
19873
+ int major = MAJOR(rdev->dev);
19874
+ int idx = disk_index(rdev->dev);
19876
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
19879
+ curr_events = kstat.dk_drive_rblk[major][idx] +
19880
+ kstat.dk_drive_wblk[major][idx] ;
19881
+ curr_events -= sync_io[major][idx];
19882
+ if ((curr_events - rdev->last_events) > 32) {
19883
+ rdev->last_events = curr_events;
19890
+MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait);
19892
+void evms_md_done_sync(mddev_t *mddev, int blocks, int ok)
19894
+ /* another "blocks" (512byte) blocks have been synced */
19895
+ atomic_sub(blocks, &mddev->recovery_active);
19896
+ wake_up(&mddev->recovery_wait);
19898
+ // stop recovery, signal do_sync ....
19902
+#define SYNC_MARKS 10
19903
+#define SYNC_MARK_STEP (3*HZ)
19904
+int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
19907
+ unsigned int max_sectors, currspeed,
19908
+ j, window, err, serialize;
19909
+ unsigned long mark[SYNC_MARKS];
19910
+ unsigned long mark_cnt[SYNC_MARKS];
19912
+ struct md_list_head *tmp;
19913
+ unsigned long last_check;
19916
+ err = down_interruptible(&mddev->resync_sem);
19922
+ ITERATE_MDDEV(mddev2,tmp) {
19923
+ if (mddev2 == mddev)
19925
+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
19926
+ LOG_DEFAULT("delaying resync of md%d until md%d "
19927
+ "has finished resync (they share one or more physical units)\n",
19928
+ mdidx(mddev), mdidx(mddev2));
19934
+ interruptible_sleep_on(&evms_resync_wait);
19935
+ if (md_signal_pending(current)) {
19936
+ md_flush_signals();
19943
+ mddev->curr_resync = 1;
19945
+ max_sectors = mddev->sb->size<<1;
19947
+ LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev));
19948
+ LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
19949
+ sysctl_speed_limit_min);
19950
+ LOG_DEFAULT("using maximum available idle IO bandwith "
19951
+ "(but not more than %d KB/sec) for reconstruction.\n",
19952
+ sysctl_speed_limit_max);
19955
+ * Resync has low priority.
19957
+ current->nice = 19;
19959
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
19960
+ for (m = 0; m < SYNC_MARKS; m++) {
19961
+ mark[m] = jiffies;
19965
+ mddev->resync_mark = mark[last_mark];
19966
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
19969
+ * Tune reconstruction:
19971
+ window = MAX_READAHEAD*(PAGE_SIZE/512);
19972
+ LOG_DEFAULT("using %dk window, over a total of %d blocks.\n",
19973
+ window/2,max_sectors/2);
19975
+ atomic_set(&mddev->recovery_active, 0);
19976
+ init_waitqueue_head(&mddev->recovery_wait);
19978
+ for (j = 0; j < max_sectors;) {
19981
+ sectors = mddev->pers->sync_request(mddev, j);
19983
+ if (sectors < 0) {
19987
+ atomic_add(sectors, &mddev->recovery_active);
19989
+ mddev->curr_resync = j;
19991
+ if (last_check + window > j)
19996
+ run_task_queue(&tq_disk);
19999
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
20001
+ int next = (last_mark+1) % SYNC_MARKS;
20003
+ mddev->resync_mark = mark[next];
20004
+ mddev->resync_mark_cnt = mark_cnt[next];
20005
+ mark[next] = jiffies;
20006
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
20007
+ last_mark = next;
20011
+ if (md_signal_pending(current)) {
20013
+ * got a signal, exit.
20015
+ mddev->curr_resync = 0;
20016
+ LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n");
20017
+ md_flush_signals();
20023
+ * this loop exits only if either when we are slower than
20024
+ * the 'hard' speed limit, or the system was IO-idle for
20026
+ * the system might be non-idle CPU-wise, but we only care
20027
+ * about not overloading the IO subsystem. (things like an
20028
+ * e2fsck being done on the RAID array should execute fast)
20030
+ if (md_need_resched(current))
20033
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
20035
+ if (currspeed > sysctl_speed_limit_min) {
20036
+ current->nice = 19;
20038
+ if ((currspeed > sysctl_speed_limit_max) ||
20039
+ !is_mddev_idle(mddev)) {
20040
+ current->state = TASK_INTERRUPTIBLE;
20041
+ md_schedule_timeout(HZ/4);
20045
+ current->nice = -20;
20047
+ LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev));
20050
+ * this also signals 'finished resyncing' to md_stop
20053
+ wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
20054
+ up(&mddev->resync_sem);
20056
+ mddev->curr_resync = 0;
20057
+ wake_up(&evms_resync_wait);
20064
+ * This is a kernel thread which syncs a spare disk with the active array
20066
+ * the amount of foolproofing might seem to be a tad excessive, but an
20067
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
20068
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
20069
+ * i'm a bit nervous ;)
20071
+void evms_md_do_recovery(void *data)
20076
+ mdp_disk_t *spare;
20077
+ struct md_list_head *tmp;
20078
+ unsigned long flags;
20079
+ evms_md_activate_spare_t *activate_spare;
20081
+ LOG_DEFAULT("recovery thread got woken up ...\n");
20083
+ ITERATE_MDDEV(mddev,tmp) {
20088
+ if (mddev->recovery_running)
20090
+ if (sb->active_disks == sb->raid_disks)
20092
+ if (!sb->spare_disks) {
20093
+ LOG_ERROR(" [md%d] no spare disk to reconstruct array! "
20094
+ "-- continuing in degraded mode\n", mdidx(mddev));
20099
+ activate_spare = NULL;
20101
+ spin_lock_irqsave(&activate_spare_list_lock, flags);
20102
+ activate_spare = evms_activate_spare_list;
20103
+ if (activate_spare && (activate_spare->mddev == mddev)) {
20104
+ spare = activate_spare->spare;
20105
+ evms_activate_spare_list = activate_spare->next;
20107
+ spin_unlock_irqrestore(&activate_spare_list_lock, flags);
20111
+ * now here we get the spare and resync it.
20113
+ spare = evms_md_get_spare(mddev);
20118
+ LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n",
20119
+ mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
20120
+ if (!mddev->pers->diskop)
20123
+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
20126
+ down(&mddev->recovery_sem);
20127
+ mddev->recovery_running = 1;
20128
+ err = evms_md_do_sync(mddev, spare);
20129
+ if (err == -EIO) {
20130
+ LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n",
20131
+ mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
20132
+ if (!disk_faulty(spare)) {
20133
+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
20134
+ mark_disk_faulty(spare);
20135
+ mark_disk_nonsync(spare);
20136
+ mark_disk_inactive(spare);
20137
+ sb->spare_disks--;
20138
+ sb->working_disks--;
20139
+ sb->failed_disks++;
20142
+ if (disk_faulty(spare))
20143
+ mddev->pers->diskop(mddev, &spare,
20144
+ DISKOP_SPARE_INACTIVE);
20145
+ if (err == -EINTR || err == -ENOMEM) {
20147
+ * Recovery got interrupted, or ran out of mem ...
20148
+ * signal back that we have finished using the array.
20150
+ mddev->pers->diskop(mddev, &spare,
20151
+ DISKOP_SPARE_INACTIVE);
20152
+ up(&mddev->recovery_sem);
20153
+ mddev->recovery_running = 0;
20156
+ mddev->recovery_running = 0;
20157
+ up(&mddev->recovery_sem);
20159
+ if (!disk_faulty(spare)) {
20161
+ * the SPARE_ACTIVE diskop possibly changes the
20164
+ if (activate_spare)
20165
+ mddev->pers->diskop(mddev, &spare, DISKOP_HOT_SPARE_ACTIVE);
20167
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
20168
+ mark_disk_sync(spare);
20169
+ mark_disk_active(spare);
20170
+ sb->active_disks++;
20171
+ sb->spare_disks--;
20173
+ mddev->sb_dirty = 1;
20174
+ evms_md_update_sb(mddev);
20177
+ LOG_DEFAULT("recovery thread finished ...\n");
20181
+int evms_md_notify_reboot(struct notifier_block *this,
20182
+ unsigned long code, void *x)
20184
+ struct md_list_head *tmp;
20187
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
20188
+ || (code == MD_SYS_POWER_OFF)) {
20190
+ LOG_DEFAULT("stopping all md devices.\n");
20192
+ ITERATE_MDDEV(mddev,tmp)
20193
+ do_md_stop (mddev, 1);
20195
+ * certain more exotic SCSI devices are known to be
20196
+ * volatile wrt too early system reboots. While the
20197
+ * right place to handle this issue is the given
20198
+ * driver, we do want to have a safe RAID driver ...
20200
+ md_mdelay(1000*1);
20202
+ return NOTIFY_DONE;
20205
+static struct notifier_block md_notifier = {
20206
+ notifier_call: evms_md_notify_reboot,
20208
+ priority: INT_MAX, /* before any real devices */
20214
+ * Function: evms_md_create_logical_node
20216
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
20217
+ mddev_t *mddev, uint flags)
20220
+ md_instance_data_t *MDID = NULL;
20221
+ evms_logical_node_t *newnode = NULL;
20223
+ rc = evms_cs_allocate_logical_node(&newnode);
20225
+ rc = evms_cs_allocate_memory((void**)&MDID,sizeof(*MDID));
20228
+ memset(newnode,0,sizeof(*MDID));
20229
+ newnode->plugin = &md_plugin_header;
20230
+ newnode->total_vsectors = (u_int64_t)evms_md_size[mdidx(mddev)] * 2;
20231
+ newnode->block_size = md_blocksizes[mdidx(mddev)];
20232
+ newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)];
20233
+ sprintf(newnode->name,"md/md%d",mdidx(mddev));
20234
+ MDID->mddev = mddev;
20235
+ newnode->instance_data = MDID;
20236
+ newnode->flags = flags;
20239
+ rc = evms_cs_add_logical_node_to_list(discover_list, newnode);
20241
+ LOG_ERROR("could not add md node %s\n",newnode->name);
20243
+ LOG_DETAILS("added our md node %s to discover list (total_vsectors=%Lu, blk_size=%d, sector_size=%d)\n",
20244
+ newnode->name, newnode->total_vsectors, newnode->block_size, newnode->hardsector_size);
20249
+ mddev->node = newnode;
20252
+ evms_cs_deallocate_memory(MDID);
20254
+ evms_cs_deallocate_logical_node(newnode);
20260
+ * Function: evms_md_autostart_arrays
20261
+ * Discover MD "extended" devices
20262
+ * Add MD "extended" devices to pending list for further processing
20264
+static void evms_md_autostart_arrays (evms_logical_node_t **discover_list)
20266
+ evms_logical_node_t *node, *next_node;
20267
+ mdk_rdev_t *rdev;
20270
+ LOG_ENTRY_EXIT(":autostart_arrays() ENTRY\n");
20272
+ /* examine each node on the discover list */
20273
+ next_node = *discover_list;
20274
+ while(next_node) {
20275
+ node = next_node;
20276
+ next_node = node->next;
20278
+ rc = evms_md_import_device(discover_list, node,1);
20279
+ if (rc && (rc != -EEXIST)) {
20280
+ LOG_EXTRA("autostart_arrrays() Not %s!\n",evms_md_partition_name(node));
20287
+ rdev = evms_md_find_rdev_all(node);
20289
+ LOG_ERROR("find_rdev_all() failed\n");
20292
+ if (rdev->faulty) {
20298
+ md_list_add(&rdev->pending, &pending_raid_disks);
20299
+ } else if (rc == -EEXIST) {
20300
+ evms_logical_node_t *md_node;
20302
+ * Must be in a re-discovery process here.
20303
+ * Find the EVMS MD node that this rdev is a member of
20305
+ if (rdev->mddev) {
20306
+ md_node = rdev->mddev->node;
20308
+ rc = evms_cs_add_logical_node_to_list(discover_list,md_node);
20311
+ exported_nodes++;
20312
+ LOG_DETAILS("Added MD node (%s) to discover list\n",
20315
+ case 1: /* already on the list */
20316
+ case 2: /* already on the list */
20319
+ LOG_WARNING("could not add md node (%s), rc=%d\n",
20320
+ md_node->name, rc);
20323
+ LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n",
20324
+ rdev->mddev->__minor);
20327
+ LOG_ERROR("This device [%s] does not belong to any array!\n",
20328
+ get_partition_name(rdev));
20329
+ evms_md_export_rdev(rdev);
20331
+ evms_cs_remove_logical_node_from_list(discover_list,node);
20335
+ evms_md_autorun_devices(discover_list, -1);
20336
+ LOG_DETAILS("EVMD MD:autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes);
20339
+#ifdef CONFIG_PROC_FS
20340
+static int status_resync(char * page, mddev_t * mddev)
20343
+ unsigned long max_blocks, resync, res, dt, db, rt;
20345
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
20346
+ max_blocks = mddev->sb->size;
20349
+ * Should not happen.
20351
+ if (!max_blocks) {
20355
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
20357
+ int i, x = res/50, y = 20-x;
20359
+ for (i = 0; i < x; i++)
20361
+ sz += sprintf(page + sz, ">");
20362
+ for (i = 0; i < y; i++)
20366
+ if (!mddev->recovery_running)
20370
+ PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)",
20371
+ res/10, res % 10, resync, max_blocks);
20376
+ PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)",
20377
+ res/10, res % 10, resync, max_blocks);
20380
+ * We do not want to overflow, so the order of operands and
20381
+ * the * 100 / 100 trick are important. We do a +1 to be
20382
+ * safe against division by zero. We only estimate anyway.
20384
+ * dt: time from mark until now
20385
+ * db: blocks written from mark until now
20386
+ * rt: remaining time
20388
+ dt = ((jiffies - mddev->resync_mark) / HZ);
20390
+ db = resync - (mddev->resync_mark_cnt/2);
20391
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
20393
+ PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6);
20395
+ PROCPRINT(" speed=%ldK/sec", db/dt);
20400
+static int evms_md_status_read_proc(char *page, char **start, off_t off,
20401
+ int count, int *eof, void *data)
20403
+ int sz = 0, j, size;
20404
+ struct md_list_head *tmp, *tmp2;
20405
+ mdk_rdev_t *rdev;
20408
+ PROCPRINT("Enterprise Volume Management System: MD Status\n");
20409
+ PROCPRINT("Personalities : ");
20410
+ for (j = 0; j < MAX_PERSONALITY; j++)
20412
+ PROCPRINT("[%s] ", pers[j]->name);
20417
+ ITERATE_MDDEV(mddev,tmp) {
20418
+ PROCPRINT("md%d : %sactive", mdidx(mddev),
20419
+ mddev->pers ? "" : "in");
20420
+ if (mddev->pers) {
20422
+ PROCPRINT(" (read-only)");
20423
+ PROCPRINT(" %s", mddev->pers->name);
20427
+ ITERATE_RDEV(mddev,rdev,tmp2) {
20428
+ PROCPRINT(" %s[%d]",
20429
+ rdev->node->name, rdev->desc_nr);
20430
+ if (rdev->faulty) {
20431
+ PROCPRINT("(F)");
20434
+ size += rdev->size;
20437
+ if (mddev->nb_dev) {
20439
+ PROCPRINT("\n %Ld blocks",
20440
+ mddev->node->total_vsectors >> 1);
20442
+ PROCPRINT("\n %d blocks", size);
20445
+ if (!mddev->pers) {
20450
+ sz += mddev->pers->status (page+sz, mddev);
20452
+ PROCPRINT("\n ");
20453
+ if (mddev->curr_resync) {
20454
+ sz += status_resync (page+sz, mddev);
20456
+ if (atomic_read(&mddev->resync_sem.count) != 1)
20457
+ PROCPRINT(" resync=DELAYED");
20467
+/* Function: md_core_init
20469
+int __init md_core_init(void)
20471
+ static char * name = "evms_mdrecoveryd";
20472
+#ifdef CONFIG_PROC_FS
20473
+ struct proc_dir_entry *evms_proc_dir;
20476
+ // Increment the use count, so it never goes to zero.
20477
+ // This is necessary for now because we don't have code
20478
+ // to shut down the MD threads. When that is written,
20479
+ // this line should be removed.
20480
+ MOD_INC_USE_COUNT;
20482
+#ifdef CONFIG_PROC_FS
20483
+ evms_proc_dir = evms_cs_get_evms_proc_dir();
20484
+ if (evms_proc_dir) {
20485
+ create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL);
20487
+ md_table_header = register_sysctl_table(dev_dir_table, 1);
20490
+ /* Create MD recovery thread */
20491
+ evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name);
20492
+ if (!evms_md_recovery_thread)
20493
+ LOG_SERIOUS("%s: evms_cs_recovery_thread failed\n", __FUNCTION__);
20495
+ /* Register for reboot notification */
20496
+ md_register_reboot_notifier(&md_notifier);
20498
+ return evms_cs_register_plugin(&md_plugin_header);
20501
+static void __exit md_core_exit(void)
20503
+#ifdef CONFIG_PROC_FS
20504
+ struct proc_dir_entry *evms_proc_dir;
20506
+ evms_proc_dir = evms_cs_get_evms_proc_dir();
20507
+ if (evms_proc_dir) {
20508
+ remove_proc_entry("mdstat", evms_proc_dir);
20510
+ unregister_sysctl_table(md_table_header);
20512
+ evms_cs_unregister_plugin(&md_plugin_header);
20515
+module_init(md_core_init);
20516
+module_exit(md_core_exit);
20517
+#ifdef MODULE_LICENSE
20518
+MODULE_LICENSE("GPL");
20522
+ * In order to have the coexistence of this EVMS plugin and the orginal MD
20523
+ * module, the symbols exported by this plugin are prefixed with "evms_"
20526
+MD_EXPORT_SYMBOL(evms_md_size);
20527
+MD_EXPORT_SYMBOL(evms_register_md_personality);
20528
+MD_EXPORT_SYMBOL(evms_unregister_md_personality);
20529
+ /* Export the following function for use with rdev->node in evms_md_k.h */
20530
+MD_EXPORT_SYMBOL(evms_md_partition_name);
20531
+ /* Export the following function for use with disks[] in md_p.h */
20532
+//MD_EXPORT_SYMBOL(get_partition_name);
20533
+MD_EXPORT_SYMBOL(evms_md_error);
20534
+MD_EXPORT_SYMBOL(evms_md_update_sb);
20535
+MD_EXPORT_SYMBOL(evms_md_find_rdev_nr);
20536
+MD_EXPORT_SYMBOL(evms_md_print_devices);
20537
+MD_EXPORT_SYMBOL(evms_mddev_map);
20538
+MD_EXPORT_SYMBOL(evms_md_check_ordering);
20539
+MD_EXPORT_SYMBOL(evms_md_do_sync);
20540
+MD_EXPORT_SYMBOL(evms_md_sync_acct);
20541
+MD_EXPORT_SYMBOL(evms_md_done_sync);
20542
+MD_EXPORT_SYMBOL(evms_md_recover_arrays);
20543
+MD_EXPORT_SYMBOL(evms_md_get_spare);
20545
diff -Naur linux-2002-03-28/drivers/evms/md_linear.c evms-2002-03-28/drivers/evms/md_linear.c
20546
--- linux-2002-03-28/drivers/evms/md_linear.c Wed Dec 31 18:00:00 1969
20547
+++ evms-2002-03-28/drivers/evms/md_linear.c Thu Mar 28 16:28:59 2002
20550
+ linear.c : Multiple Devices driver for Linux
20551
+ Copyright (C) 1994-96 Marc ZYNGIER
20552
+ <zyngier@ufr-info-p7.ibp.fr> or
20553
+ <maz@gloups.fdn.fr>
20555
+ Linear mode management functions.
20557
+ This program is free software; you can redistribute it and/or modify
20558
+ it under the terms of the GNU General Public License as published by
20559
+ the Free Software Foundation; either version 2, or (at your option)
20560
+ any later version.
20562
+ You should have received a copy of the GNU General Public License
20563
+ (for example /usr/src/linux/COPYING); if not, write to the Free
20564
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20567
+#include <linux/module.h>
20568
+#include <linux/evms/evms_md.h>
20569
+#include <linux/evms/evms_linear.h>
20570
+#include <linux/slab.h>
20573
+#define MAJOR_NR MD_MAJOR
20575
+#define MD_PERSONALITY
20577
+#define LOG_PREFIX "md linear: "
20578
+static int linear_run (mddev_t *mddev)
20580
+ linear_conf_t *conf;
20581
+ struct linear_hash *table;
20582
+ mdk_rdev_t *rdev;
20583
+ int size, i, j, nb_zone;
20584
+ unsigned int curr_offset;
20586
+ MOD_INC_USE_COUNT;
20588
+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
20591
+ mddev->private = conf;
20593
+ if (evms_md_check_ordering(mddev)) {
20594
+ printk("linear: disks are not ordered, aborting!\n");
20599
+ * Find the smallest device.
20602
+ conf->smallest = NULL;
20604
+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
20605
+ dev_info_t *disk = conf->disks + j;
20606
+ disk->node = rdev->node;
20607
+ LOG_DETAILS(__FUNCTION__" is taking %s, total_vsectors=%Lu\n",
20608
+ disk->node->name,disk->node->total_vsectors);
20609
+ disk->dev = rdev->dev;
20610
+ disk->size = rdev->size;
20611
+ disk->offset = curr_offset;
20613
+ curr_offset += disk->size;
20615
+ if (!conf->smallest || (disk->size < conf->smallest->size))
20616
+ conf->smallest = disk;
20619
+ nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size +
20620
+ ((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
20622
+ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
20624
+ if (!conf->hash_table)
20628
+ * Here we generate the linear hash table
20630
+ table = conf->hash_table;
20633
+ for (j = 0; j < mddev->nb_dev; j++) {
20634
+ dev_info_t *disk = conf->disks + j;
20637
+ table[-1].dev1 = disk;
20639
+ size += disk->size;
20642
+ table->dev0 = disk;
20643
+ table->dev1 = NULL;
20644
+ size -= conf->smallest->size;
20648
+ if (table-conf->hash_table != nb_zone)
20650
+ LOG_DETAILS(__FUNCTION__" EXIT nr_zones=%d, smallest=%lu\n",
20651
+ conf->nr_zones,conf->smallest->size);
20657
+ MOD_DEC_USE_COUNT;
20661
+static int linear_stop (mddev_t *mddev)
20663
+ linear_conf_t *conf = mddev_to_conf(mddev);
20665
+ kfree(conf->hash_table);
20668
+ MOD_DEC_USE_COUNT;
20674
+ * Function: linear_map
20676
+static int linear_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN)
20678
+ linear_conf_t *conf = mddev_to_conf(mddev);
20679
+ struct linear_hash *hash;
20680
+ dev_info_t *tmp_dev;
20683
+ block = (long)(*LSN >> 1);
20684
+ hash = conf->hash_table + (block / conf->smallest->size);
20685
+ if (block >= (hash->dev0->size + hash->dev0->offset)) {
20686
+ if (!hash->dev1) {
20687
+ LOG_ERROR(__FUNCTION__ " hash->dev1==NULL for block %ld\n",block);
20690
+ tmp_dev = hash->dev1;
20692
+ tmp_dev = hash->dev0;
20694
+ if (block >= (tmp_dev->size + tmp_dev->offset)
20695
+ || block < tmp_dev->offset) {
20696
+ LOG_ERROR(__FUNCTION__" Block %ld out of bounds on node %s size %ld offset %ld\n",
20698
+ tmp_dev->node->name,
20700
+ tmp_dev->offset);
20703
+ *LSN -= (evms_sector_t)(tmp_dev->offset << 1);
20704
+ *node = tmp_dev->node;
20708
+static int linear_init_io(mddev_t *mddev,
20710
+ evms_sector_t LSN,
20711
+ evms_sector_t nr_sects,
20715
+ evms_logical_node_t *node;
20717
+ LOG_ENTRY_EXIT(__FUNCTION__" LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
20718
+ rc = linear_map(mddev, &node, &LSN);
20720
+ rc = INIT_IO(node, rw, LSN, nr_sects, data);
20724
+static int linear_make_request (mddev_t *mddev,
20728
+ evms_logical_node_t *node;
20731
+ rc = linear_map(mddev, &node, &eio->rsector);
20734
+ if (rw == READ) {
20739
+ return 1; /* success */
20741
+ LOG_ERROR(__FUNCTION__ " FAILED %s node(%s) rsector(%Lu)\n",
20742
+ (rw == READ) ? "READ" : "WRITE",node->name,eio->rsector);
20744
+ EVMS_IO_ERROR(eio);
20749
+static int linear_status (char *page, mddev_t *mddev)
20756
+ linear_conf_t *conf = mddev_to_conf(mddev);
20758
+ sz += sprintf(page+sz, " ");
20759
+ for (j = 0; j < conf->nr_zones; j++)
20761
+ sz += sprintf(page+sz, "[%s",
20762
+ partition_name(conf->hash_table[j].dev0->dev));
20764
+ if (conf->hash_table[j].dev1)
20765
+ sz += sprintf(page+sz, "/%s] ",
20766
+ partition_name(conf->hash_table[j].dev1->dev));
20768
+ sz += sprintf(page+sz, "] ");
20770
+ sz += sprintf(page+sz, "\n");
20772
+ sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
20776
+static int linear_evms_ioctl (
20778
+ struct inode * inode,
20779
+ struct file * file,
20780
+ unsigned int cmd,
20781
+ unsigned long arg)
20784
+ evms_logical_node_t *node;
20787
+ case EVMS_GET_BMAP:
20789
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
20790
+ rc = linear_map(mddev,&node, &bmap->rsector);
20793
+ rc = IOCTL(node, inode, file, cmd, arg);
20806
+static mdk_personality_t linear_personality=
20808
+ name: "evms_linear",
20809
+ init_io: linear_init_io,
20810
+ make_request: linear_make_request,
20812
+ stop: linear_stop,
20813
+ status: linear_status,
20814
+ evms_ioctl: linear_evms_ioctl
20817
+static int md__init linear_init (void)
20819
+ return evms_register_md_personality (LINEAR, &linear_personality);
20822
+static void linear_exit (void)
20824
+ evms_unregister_md_personality (LINEAR);
20828
+module_init(linear_init);
20829
+module_exit(linear_exit);
20830
+#ifdef MODULE_LICENSE
20831
+MODULE_LICENSE("GPL");
20833
diff -Naur linux-2002-03-28/drivers/evms/md_raid0.c evms-2002-03-28/drivers/evms/md_raid0.c
20834
--- linux-2002-03-28/drivers/evms/md_raid0.c Wed Dec 31 18:00:00 1969
20835
+++ evms-2002-03-28/drivers/evms/md_raid0.c Thu Mar 28 16:28:46 2002
20838
+ raid0.c : Multiple Devices driver for Linux
20839
+ Copyright (C) 1994-96 Marc ZYNGIER
20840
+ <zyngier@ufr-info-p7.ibp.fr> or
20841
+ <maz@gloups.fdn.fr>
20842
+ Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
20845
+ RAID-0 management functions.
20847
+ This program is free software; you can redistribute it and/or modify
20848
+ it under the terms of the GNU General Public License as published by
20849
+ the Free Software Foundation; either version 2, or (at your option)
20850
+ any later version.
20852
+ You should have received a copy of the GNU General Public License
20853
+ (for example /usr/src/linux/COPYING); if not, write to the Free
20854
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20857
+#include <linux/module.h>
20858
+#include <linux/evms/evms_raid0.h>
20860
+#define MAJOR_NR MD_MAJOR
20862
+#define MD_PERSONALITY
20864
+#define LOG_PREFIX "md raid0: "
20866
+static int create_strip_zones (mddev_t *mddev)
20868
+ int i, c, j, j1, j2;
20869
+ unsigned long current_offset, curr_zone_offset;
20870
+ raid0_conf_t *conf = mddev_to_conf(mddev);
20871
+ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
20874
+ * The number of 'same size groups'
20876
+ conf->nr_strip_zones = 0;
20878
+ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
20879
+ LOG_DETAILS(" looking at %s\n", evms_md_partition_name(rdev1->node));
20881
+ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
20882
+ LOG_DETAILS(" comparing %s(%ld) with %s(%ld)\n",
20883
+ evms_md_partition_name(rdev1->node), rdev1->size,
20884
+ evms_md_partition_name(rdev2->node), rdev2->size);
20885
+ if (rdev2 == rdev1) {
20886
+ LOG_DETAILS(" END\n");
20889
+ if (rdev2->size == rdev1->size)
20892
+ * Not unique, dont count it as a new
20895
+ LOG_DETAILS(" EQUAL\n");
20899
+ LOG_DETAILS(" NOT EQUAL\n");
20902
+ LOG_DETAILS(" ==> UNIQUE\n");
20903
+ conf->nr_strip_zones++;
20904
+ LOG_DETAILS(" %d zones\n",conf->nr_strip_zones);
20907
+ LOG_DETAILS(" FINAL %d zones\n",conf->nr_strip_zones);
20909
+ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
20910
+ conf->nr_strip_zones);
20911
+ if (!conf->strip_zone)
20915
+ conf->smallest = NULL;
20916
+ current_offset = 0;
20917
+ curr_zone_offset = 0;
20919
+ for (i = 0; i < conf->nr_strip_zones; i++)
20921
+ struct strip_zone *zone = conf->strip_zone + i;
20923
+ LOG_DETAILS(" zone %d\n", i);
20924
+ zone->dev_offset = current_offset;
20928
+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
20930
+ LOG_DETAILS(" checking %s ...",evms_md_partition_name(rdev->node));
20931
+ if (rdev->size > current_offset)
20933
+ LOG_DETAILS(" contained as device %d\n", c);
20934
+ zone->dev[c] = rdev;
20936
+ if (!smallest || (rdev->size <smallest->size)) {
20938
+ LOG_DETAILS(" (%ld) is smallest!.\n", rdev->size);
20941
+ LOG_DETAILS(" nope.\n");
20944
+ zone->nb_dev = c;
20945
+ zone->size = (smallest->size - current_offset) * c;
20946
+ LOG_DETAILS(" zone->nb_dev: %d, size: %ld\n",
20947
+ zone->nb_dev,zone->size);
20949
+ if (!conf->smallest || (zone->size < conf->smallest->size))
20950
+ conf->smallest = zone;
20952
+ zone->zone_offset = curr_zone_offset;
20953
+ curr_zone_offset += zone->size;
20955
+ current_offset = smallest->size;
20956
+ LOG_DETAILS(" current zone offset: %ld\n",current_offset);
20958
+ LOG_DETAILS(" done.\n");
20962
+static int raid0_run (mddev_t *mddev)
20964
+ unsigned long cur=0, i=0, size, zone0_size, nb_zone;
20965
+ raid0_conf_t *conf;
20967
+ MOD_INC_USE_COUNT;
20969
+ conf = vmalloc(sizeof (raid0_conf_t));
20972
+ mddev->private = (void *)conf;
20974
+ if (evms_md_check_ordering(mddev)) {
20975
+ LOG_ERROR("disks are not ordered, aborting!\n");
20976
+ goto out_free_conf;
20979
+ if (create_strip_zones (mddev))
20980
+ goto out_free_conf;
20982
+ LOG_DETAILS("evms_md_size is %d blocks.\n", evms_md_size[mdidx(mddev)]);
20983
+ LOG_DETAILS("conf->smallest->size is %ld blocks.\n", conf->smallest->size);
20984
+ nb_zone = evms_md_size[mdidx(mddev)]/conf->smallest->size +
20985
+ (evms_md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
20986
+ LOG_DETAILS("nb_zone is %ld.\n", nb_zone);
20987
+ conf->nr_zones = nb_zone;
20989
+ LOG_DETAILS("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash));
20991
+ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
20992
+ if (!conf->hash_table)
20993
+ goto out_free_zone_conf;
20994
+ size = conf->strip_zone[cur].size;
20997
+ while (cur < conf->nr_strip_zones) {
20998
+ conf->hash_table[i].zone0 = conf->strip_zone + cur;
21001
+ * If we completely fill the slot
21003
+ if (size >= conf->smallest->size) {
21004
+ conf->hash_table[i++].zone1 = NULL;
21005
+ size -= conf->smallest->size;
21008
+ if (++cur == conf->nr_strip_zones)
21010
+ size = conf->strip_zone[cur].size;
21014
+ if (++cur == conf->nr_strip_zones) {
21016
+ * Last dev, set unit1 as NULL
21018
+ conf->hash_table[i].zone1=NULL;
21023
+ * Here we use a 2nd dev to fill the slot
21025
+ zone0_size = size;
21026
+ size = conf->strip_zone[cur].size;
21027
+ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
21028
+ size -= (conf->smallest->size - zone0_size);
21032
+out_free_zone_conf:
21033
+ vfree(conf->strip_zone);
21034
+ conf->strip_zone = NULL;
21038
+ mddev->private = NULL;
21040
+ MOD_DEC_USE_COUNT;
21044
+static int raid0_stop (mddev_t *mddev)
21046
+ raid0_conf_t *conf = mddev_to_conf(mddev);
21048
+ vfree (conf->hash_table);
21049
+ conf->hash_table = NULL;
21050
+ vfree (conf->strip_zone);
21051
+ conf->strip_zone = NULL;
21053
+ mddev->private = NULL;
21055
+ MOD_DEC_USE_COUNT;
21061
+ * Function: raid0_map
21063
+ * Return 0 for success, else error
21065
+ * Comment from original code:
21067
+ * FIXME - We assume some things here :
21068
+ * - requested buffers NEVER bigger than chunk size,
21069
+ * - requested buffers NEVER cross stripes limits.
21070
+ * Of course, those facts may not be valid anymore (and surely won't...)
21071
+ * Hey guys, there's some work out there ;-)
21074
+static inline int raid0_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN, evms_sector_t size)
21076
+ unsigned int sect_in_chunk, chunksize_bits, chunk_size;
21077
+ raid0_conf_t *conf = mddev_to_conf(mddev);
21078
+ struct raid0_hash *hash;
21079
+ struct strip_zone *zone;
21080
+ mdk_rdev_t *tmp_dev;
21081
+ unsigned long chunk, block, rsect;
21082
+ unsigned long b_rsector;
21083
+ unsigned int b_size;
21085
+ b_rsector = (unsigned long)*LSN;
21086
+ b_size = (unsigned int)size;
21088
+ chunk_size = mddev->param.chunk_size >> 10;
21089
+ chunksize_bits = ffz(~chunk_size);
21090
+ block = b_rsector >> 1;
21091
+ hash = conf->hash_table + block / conf->smallest->size;
21093
+ /* Sanity check */
21094
+ if (chunk_size < (block % chunk_size) + (b_size >> 10))
21100
+ if (!hash->zone0)
21103
+ if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
21104
+ if (!hash->zone1)
21106
+ zone = hash->zone1;
21108
+ zone = hash->zone0;
21110
+ sect_in_chunk = b_rsector & ((chunk_size<<1) -1);
21111
+ chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
21112
+ tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
21113
+ rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1)
21117
+ * The new BH_Lock semantics in ll_rw_blk.c guarantee that this
21118
+ * is the only IO operation happening on this bh.
21120
+ *LSN = (evms_sector_t)rsect;
21121
+ *node = tmp_dev->node;
21125
+ LOG_ERROR(__FUNCTION__ " bug: can't convert block across chunks or bigger than %dk %ld %d\n",
21126
+ chunk_size, b_rsector, b_size >> 10);
21129
+ LOG_ERROR(__FUNCTION__ " bug: hash==NULL for block %ld\n",block);
21132
+ LOG_ERROR(__FUNCTION__ " bug: hash->zone0==NULL for block %ld\n", block);
21135
+ LOG_ERROR(__FUNCTION__ " bug: hash->zone1==NULL for block %ld\n",block);
21141
+ * Function: raid0_init_io
21143
+static int raid0_init_io(
21146
+ evms_sector_t LSN,
21147
+ evms_sector_t nr_sects,
21151
+ evms_logical_node_t *node;
21153
+ LOG_ENTRY_EXIT(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
21154
+ rc = raid0_map(mddev, &node, &LSN, nr_sects);
21156
+ rc = INIT_IO(node, rw, LSN, nr_sects, data);
21160
+static int raid0_make_request (
21165
+ evms_logical_node_t *node;
21168
+ rc = raid0_map(mddev, &node, &eio->rsector, eio->rsize);
21170
+ if (rw == READ) {
21175
+ return 1; /* success */
21177
+ LOG_ERROR(__FUNCTION__ " FAILED %s node(%s) rsector(%Lu)\n",
21178
+ (rw == READ) ? "READ" : "WRITE",node->name,eio->rsector);
21180
+ EVMS_IO_ERROR(eio);
21186
+static int raid0_status (char *page, mddev_t *mddev)
21192
+ raid0_conf_t *conf = mddev_to_conf(mddev);
21194
+ sz += sprintf(page + sz, " ");
21195
+ for (j = 0; j < conf->nr_zones; j++) {
21196
+ sz += sprintf(page + sz, "[z%d",
21197
+ conf->hash_table[j].zone0 - conf->strip_zone);
21198
+ if (conf->hash_table[j].zone1)
21199
+ sz += sprintf(page+sz, "/z%d] ",
21200
+ conf->hash_table[j].zone1 - conf->strip_zone);
21202
+ sz += sprintf(page+sz, "] ");
21205
+ sz += sprintf(page + sz, "\n");
21207
+ for (j = 0; j < conf->nr_strip_zones; j++) {
21208
+ sz += sprintf(page + sz, " z%d=[", j);
21209
+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
21210
+ sz += sprintf (page+sz, "%s/", partition_name(
21211
+ conf->strip_zone[j].dev[k]->dev));
21213
+ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
21214
+ conf->strip_zone[j].zone_offset,
21215
+ conf->strip_zone[j].dev_offset,
21216
+ conf->strip_zone[j].size);
21219
+ sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
21223
+static int raid0_evms_ioctl (
21225
+ struct inode * inode,
21226
+ struct file * file,
21227
+ unsigned int cmd,
21228
+ unsigned long arg)
21231
+ evms_logical_node_t *node;
21234
+ case EVMS_GET_BMAP:
21236
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
21237
+ rc = raid0_map(mddev,&node, &bmap->rsector, mddev->node->block_size);
21240
+ rc = IOCTL(node, inode, file, cmd, arg);
21253
+static mdk_personality_t raid0_personality=
21255
+ name: "evms_raid0",
21256
+ init_io: raid0_init_io,
21257
+ make_request: raid0_make_request,
21259
+ stop: raid0_stop,
21260
+ status: raid0_status,
21261
+ evms_ioctl: raid0_evms_ioctl
21264
+static int md__init raid0_init (void)
21266
+ return evms_register_md_personality (RAID0, &raid0_personality);
21269
+static void raid0_exit (void)
21271
+ evms_unregister_md_personality (RAID0);
21274
+module_init(raid0_init);
21275
+module_exit(raid0_exit);
21276
+#ifdef MODULE_LICENSE
21277
+MODULE_LICENSE("GPL");
21279
diff -Naur linux-2002-03-28/drivers/evms/md_raid1.c evms-2002-03-28/drivers/evms/md_raid1.c
21280
--- linux-2002-03-28/drivers/evms/md_raid1.c Wed Dec 31 18:00:00 1969
21281
+++ evms-2002-03-28/drivers/evms/md_raid1.c Wed Mar 27 09:07:59 2002
21284
+ * md_raid1.c : Multiple Devices driver for Linux
21286
+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
21288
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
21290
+ * RAID-1 management functions.
21292
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
21294
+ * Fixes to reconstruction by Jakob �stergaard" <jakob@ostenfeld.dk>
21295
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
21297
+ * 'md_raid1.c' is an EVMS version of linux/drivers/md/raid1.c modified
21298
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
21300
+ * This program is free software; you can redistribute it and/or modify
21301
+ * it under the terms of the GNU General Public License as published by
21302
+ * the Free Software Foundation; either version 2, or (at your option)
21303
+ * any later version.
21305
+ * You should have received a copy of the GNU General Public License
21306
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
21307
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21310
+#include <linux/module.h>
21311
+#include <linux/slab.h>
21312
+#include <linux/evms/evms_raid1.h>
21313
+#include <asm/atomic.h>
21315
+#define MAJOR_NR MD_MAJOR
21317
+#define MD_PERSONALITY
21319
+#define MAX_WORK_PER_DISK 128
21321
+#define NR_RESERVED_BUFS 32
21323
+#define LOG_PREFIX "md raid1: "
21325
+ * The following can be used to debug the driver
21327
+#define RAID1_DEBUG 0
21330
+#define PRINTK(x...) LOG_DEFAULT(x)
21332
+#define __inline__
21334
+#define PRINTK(x...) do { } while (0)
21338
+static mdk_personality_t raid1_personality;
21339
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
21340
+struct raid1_bh *evms_raid1_retry_list = NULL, **evms_raid1_retry_tail;
21342
+static inline void add_node_mapping(
21343
+ struct raid1_bh *r1_bh,
21344
+ evms_logical_node_t *node,
21345
+ struct buffer_head *bh)
21348
+ for (i=0; i<MD_SB_DISKS; i++) {
21349
+ if (!r1_bh->mirror_node_map[i].node) {
21350
+ r1_bh->mirror_node_map[i].node = node;
21351
+ r1_bh->mirror_node_map[i].bh = bh;
21355
+ LOG_ERROR(__FUNCTION__" Cannot create mapping for %s\n",node->name);
21358
+static inline evms_logical_node_t * bh_to_node(
21359
+ struct raid1_bh *r1_bh,
21360
+ struct buffer_head *bh)
21363
+ for (i=0; i<MD_SB_DISKS; i++) {
21364
+ if (r1_bh->mirror_node_map[i].bh == bh) {
21365
+ return r1_bh->mirror_node_map[i].node;
21368
+ LOG_ERROR(__FUNCTION__" Cannot find mapping for bh(%p)\n",bh);
21372
+static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
21374
+ /* return a linked list of "cnt" struct buffer_heads.
21375
+ * don't take any off the free list unless we know we can
21376
+ * get all we need, otherwise we could deadlock
21378
+ struct buffer_head *bh=NULL;
21381
+ struct buffer_head *t;
21382
+ md_spin_lock_irq(&conf->device_lock);
21383
+ if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
21385
+ t = conf->freebh;
21386
+ conf->freebh = t->b_next;
21390
+ conf->freebh_cnt--;
21393
+ md_spin_unlock_irq(&conf->device_lock);
21396
+ t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
21402
+ PRINTK("raid1: waiting for %d bh\n", cnt);
21403
+ conf->freebh_blocked = 1;
21404
+ wait_disk_event(conf->wait_buffer,
21405
+ !conf->freebh_blocked ||
21406
+ conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
21407
+ conf->freebh_blocked = 0;
21413
+static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
21415
+ unsigned long flags;
21416
+ spin_lock_irqsave(&conf->device_lock, flags);
21418
+ struct buffer_head *t = bh;
21420
+ if (t->b_pprev == NULL)
21421
+ kmem_cache_free(bh_cachep, t);
21423
+ t->b_next= conf->freebh;
21424
+ conf->freebh = t;
21425
+ conf->freebh_cnt++;
21428
+ spin_unlock_irqrestore(&conf->device_lock, flags);
21429
+ wake_up(&conf->wait_buffer);
21432
+static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
21434
+ /* allocate cnt buffer_heads, possibly less if kmalloc fails */
21437
+ while (i < cnt) {
21438
+ struct buffer_head *bh;
21439
+ bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
21442
+ md_spin_lock_irq(&conf->device_lock);
21443
+ bh->b_pprev = &conf->freebh;
21444
+ bh->b_next = conf->freebh;
21445
+ conf->freebh = bh;
21446
+ conf->freebh_cnt++;
21447
+ md_spin_unlock_irq(&conf->device_lock);
21454
+static void raid1_shrink_bh(raid1_conf_t *conf)
21456
+ /* discard all buffer_heads */
21458
+ md_spin_lock_irq(&conf->device_lock);
21459
+ while (conf->freebh) {
21460
+ struct buffer_head *bh = conf->freebh;
21461
+ conf->freebh = bh->b_next;
21462
+ kmem_cache_free(bh_cachep, bh);
21463
+ conf->freebh_cnt--;
21465
+ md_spin_unlock_irq(&conf->device_lock);
21469
+static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
21471
+ struct raid1_bh *r1_bh = NULL;
21474
+ md_spin_lock_irq(&conf->device_lock);
21475
+ if (!conf->freer1_blocked && conf->freer1) {
21476
+ r1_bh = conf->freer1;
21477
+ conf->freer1 = r1_bh->next_r1;
21478
+ conf->freer1_cnt--;
21479
+ r1_bh->next_r1 = NULL;
21480
+ r1_bh->state = (1 << R1BH_PreAlloc);
21481
+ r1_bh->bh_req.b_state = 0;
21482
+ memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
21484
+ md_spin_unlock_irq(&conf->device_lock);
21487
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
21489
+ memset(r1_bh, 0, sizeof(*r1_bh));
21492
+ conf->freer1_blocked = 1;
21493
+ wait_disk_event(conf->wait_buffer,
21494
+ !conf->freer1_blocked ||
21495
+ conf->freer1_cnt > NR_RESERVED_BUFS/2
21497
+ conf->freer1_blocked = 0;
21501
+static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
21503
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
21504
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
21506
+ r1_bh->mirror_bh_list = NULL;
21508
+ if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
21509
+ unsigned long flags;
21510
+ spin_lock_irqsave(&conf->device_lock, flags);
21511
+ r1_bh->next_r1 = conf->freer1;
21512
+ conf->freer1 = r1_bh;
21513
+ conf->freer1_cnt++;
21514
+ spin_unlock_irqrestore(&conf->device_lock, flags);
21515
+ /* don't need to wakeup wait_buffer because
21516
+ * raid1_free_bh below will do that
21521
+ raid1_free_bh(conf, bh);
21524
+static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
21528
+ while (i < cnt) {
21529
+ struct raid1_bh *r1_bh;
21530
+ r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
21533
+ memset(r1_bh, 0, sizeof(*r1_bh));
21534
+ set_bit(R1BH_PreAlloc, &r1_bh->state);
21535
+ r1_bh->mddev = conf->mddev;
21537
+ raid1_free_r1bh(r1_bh);
21543
+static void raid1_shrink_r1bh(raid1_conf_t *conf)
21545
+ md_spin_lock_irq(&conf->device_lock);
21546
+ while (conf->freer1) {
21547
+ struct raid1_bh *r1_bh = conf->freer1;
21548
+ conf->freer1 = r1_bh->next_r1;
21549
+ conf->freer1_cnt--;
21552
+ md_spin_unlock_irq(&conf->device_lock);
21557
+static inline void raid1_free_buf(struct raid1_bh *r1_bh)
21559
+ unsigned long flags;
21560
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
21561
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
21562
+ r1_bh->mirror_bh_list = NULL;
21564
+ spin_lock_irqsave(&conf->device_lock, flags);
21565
+ r1_bh->next_r1 = conf->freebuf;
21566
+ conf->freebuf = r1_bh;
21567
+ spin_unlock_irqrestore(&conf->device_lock, flags);
21568
+ raid1_free_bh(conf, bh);
21571
+static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
21573
+ struct raid1_bh *r1_bh;
21575
+ md_spin_lock_irq(&conf->device_lock);
21576
+ wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
21577
+ r1_bh = conf->freebuf;
21578
+ conf->freebuf = r1_bh->next_r1;
21579
+ r1_bh->next_r1= NULL;
21580
+ md_spin_unlock_irq(&conf->device_lock);
21581
+ memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
21585
+static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
21589
+ md_spin_lock_irq(&conf->device_lock);
21590
+ while (i < cnt) {
21591
+ struct raid1_bh *r1_bh;
21592
+ struct page *page;
21594
+ page = alloc_page(GFP_KERNEL);
21598
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
21600
+ __free_page(page);
21603
+ memset(r1_bh, 0, sizeof(*r1_bh));
21604
+ r1_bh->bh_req.b_page = page;
21605
+ r1_bh->bh_req.b_data = page_address(page);
21606
+ r1_bh->next_r1 = conf->freebuf;
21607
+ conf->freebuf = r1_bh;
21610
+ md_spin_unlock_irq(&conf->device_lock);
21614
+static void raid1_shrink_buffers (raid1_conf_t *conf)
21616
+ md_spin_lock_irq(&conf->device_lock);
21617
+ while (conf->freebuf) {
21618
+ struct raid1_bh *r1_bh = conf->freebuf;
21619
+ conf->freebuf = r1_bh->next_r1;
21620
+ __free_page(r1_bh->bh_req.b_page);
21623
+ md_spin_unlock_irq(&conf->device_lock);
21628
+ * EVMS raid1 version of raid1_map()
21630
+static int evms_raid1_map (mddev_t *mddev, evms_logical_node_t **node)
21632
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21636
+ * Later we do read balancing on the read side
21637
+ * now we use the first available disk.
21640
+ for (i = 0; i < MD_SB_DISKS; i++) {
21641
+ if (conf->mirrors[i].operational) {
21642
+ *node = conf->mirrors[i].node;
21647
+ LOG_ERROR("huh, no more operational devices?\n");
21652
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
21654
+ unsigned long flags;
21655
+ mddev_t *mddev = r1_bh->mddev;
21656
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21658
+ md_spin_lock_irqsave(&retry_list_lock, flags);
21659
+ if (evms_raid1_retry_list == NULL)
21660
+ evms_raid1_retry_tail = &evms_raid1_retry_list;
21661
+ *evms_raid1_retry_tail = r1_bh;
21662
+ evms_raid1_retry_tail = &r1_bh->next_r1;
21663
+ r1_bh->next_r1 = NULL;
21664
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
21665
+ evms_cs_wakeup_thread(conf->thread);
21669
+static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
21671
+ unsigned long flags;
21672
+ spin_lock_irqsave(&conf->segment_lock, flags);
21673
+ if (sector < conf->start_active)
21674
+ conf->cnt_done--;
21675
+ else if (sector >= conf->start_future && conf->phase == phase)
21676
+ conf->cnt_future--;
21677
+ else if (!--conf->cnt_pending)
21678
+ wake_up(&conf->wait_ready);
21680
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
21683
+static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
21685
+ unsigned long flags;
21686
+ spin_lock_irqsave(&conf->segment_lock, flags);
21687
+ if (sector >= conf->start_ready)
21688
+ --conf->cnt_ready;
21689
+ else if (sector >= conf->start_active) {
21690
+ if (!--conf->cnt_active) {
21691
+ conf->start_active = conf->start_ready;
21692
+ wake_up(&conf->wait_done);
21695
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
21699
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
21700
+ * operation and are ready to return a success/failure code to the buffer
21703
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
21705
+ struct buffer_head *bh = r1_bh->master_bh;
21706
+ unsigned long rsector = (unsigned long)r1_bh->eio.rsector;
21708
+ //io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
21709
+ io_request_done(rsector, mddev_to_conf(r1_bh->mddev),
21710
+ test_bit(R1BH_SyncPhase, &r1_bh->state));
21712
+ bh->b_end_io(bh, uptodate);
21713
+ raid1_free_r1bh(r1_bh);
21716
+void evms_raid1_end_request (struct buffer_head *bh, int uptodate)
21718
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
21721
+ * this branch is our 'one mirror IO has finished' event handler:
21726
+ evms_md_error (r1_bh->mddev, r1_bh->node);
21727
+ else { /* WRITE */
21728
+ evms_logical_node_t *node;
21729
+ node = bh_to_node(r1_bh,bh);
21731
+ evms_md_error (r1_bh->mddev, node);
21735
+ * Set R1BH_Uptodate in our master buffer_head, so that
21736
+ * we will return a good error code for to the higher
21737
+ * levels even if IO on some other mirrored buffer fails.
21739
+ * The 'master' represents the complex operation to
21740
+ * user-side. So if something waits for IO, then it will
21741
+ * wait for the 'master' buffer_head.
21743
+ set_bit (R1BH_Uptodate, &r1_bh->state);
21746
+ * We split up the read and write side, imho they are
21747
+ * conceptually different.
21750
+ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
21752
+ * we have only one buffer_head on the read side
21756
+ raid1_end_bh_io(r1_bh, uptodate);
21760
+ * oops, read error:
21762
+ LOG_ERROR("rescheduling block %lu\n", bh->b_blocknr);
21763
+ raid1_reschedule_retry(r1_bh);
21770
+ * Let's see if all mirrored write operations have finished
21774
+ if (atomic_dec_and_test(&r1_bh->remaining))
21775
+ raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
21779
+ * This routine returns the disk from which the requested read should
21780
+ * be done. It bookkeeps the last read position for every disk
21781
+ * in array and when new read requests come, the disk which last
21782
+ * position is nearest to the request, is chosen.
21784
+ * TODO: now if there are 2 mirrors in the same 2 devices, performance
21785
+ * degrades dramatically because position is mirror, not device based.
21786
+ * This should be changed to be device based. Also atomic sequential
21787
+ * reads should be somehow balanced.
21790
+//static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
21791
+static int raid1_read_balance (raid1_conf_t *conf, eio_t *eio)
21793
+ int new_disk = conf->last_used;
21794
+ //const int sectors = bh->b_size >> 9;
21795
+ const int sectors = (int)eio->rsize;
21796
+ //const unsigned long this_sector = bh->b_rsector;
21797
+ const unsigned long this_sector = (unsigned long)eio->rsector;
21798
+ int disk = new_disk;
21799
+ unsigned long new_distance;
21800
+ unsigned long current_distance;
21803
+ * Check if it is sane at all to balance
21806
+ if (conf->resync_mirrors)
21810
+ /* make sure that disk is operational */
21811
+ while( !conf->mirrors[new_disk].operational) {
21812
+ if (new_disk <= 0) new_disk = conf->raid_disks;
21814
+ if (new_disk == disk) {
21816
+ * This means no working disk was found
21817
+ * Nothing much to do, lets not change anything
21818
+ * and hope for the best...
21821
+ new_disk = conf->last_used;
21827
+ /* now disk == new_disk == starting point for search */
21830
+ * Don't touch anything for sequential reads.
21833
+ if (this_sector == conf->mirrors[new_disk].head_position)
21837
+ * If reads have been done only on a single disk
21838
+ * for a time, lets give another disk a change.
21839
+ * This is for kicking those idling disks so that
21840
+ * they would find work near some hotspot.
21843
+ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
21844
+ conf->sect_count = 0;
21848
+ new_disk = conf->raid_disks;
21850
+ if (new_disk == disk)
21852
+ } while ((conf->mirrors[new_disk].write_only) ||
21853
+ (!conf->mirrors[new_disk].operational));
21858
+ current_distance = abs(this_sector -
21859
+ conf->mirrors[disk].head_position);
21861
+ /* Find the disk which is closest */
21865
+ disk = conf->raid_disks;
21868
+ if ((conf->mirrors[disk].write_only) ||
21869
+ (!conf->mirrors[disk].operational))
21872
+ new_distance = abs(this_sector -
21873
+ conf->mirrors[disk].head_position);
21875
+ if (new_distance < current_distance) {
21876
+ conf->sect_count = 0;
21877
+ current_distance = new_distance;
21880
+ } while (disk != conf->last_used);
21883
+ conf->mirrors[new_disk].head_position = this_sector + sectors;
21885
+ conf->last_used = new_disk;
21886
+ conf->sect_count += sectors;
21892
+static int raid1_init_io(mddev_t *mddev,
21894
+ evms_sector_t LSN,
21895
+ evms_sector_t nr_sects,
21899
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21900
+ struct mirror_info *mirror;
21902
+ LOG_EXTRA(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
21904
+ if (rw == READ) {
21906
+ * read balancing logic:
21909
+ eio.rsector = LSN;
21910
+ eio.rsize = nr_sects;
21911
+ mirror = conf->mirrors + raid1_read_balance(conf, &eio);
21913
+ return INIT_IO(mirror->node, rw, LSN, nr_sects, data);
21916
+ int saved_rc = 0;
21917
+ for (i=0; i< MD_SB_DISKS; i++) {
21918
+ if (!conf->mirrors[i].operational)
21920
+ rc = INIT_IO(conf->mirrors[i].node, rw, LSN, nr_sects, data);
21922
+ LOG_ERROR(__FUNCTION__ " WRITE failed on %s, rc=%d\n",
21923
+ conf->mirrors[i].node->name, rc);
21934
+static int raid1_make_request (mddev_t *mddev,
21938
+ struct buffer_head *bh = eio->bh;
21939
+ unsigned long rsector = (unsigned long)eio->rsector;
21940
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21941
+ struct buffer_head *bh_req;
21942
+ struct raid1_bh * r1_bh;
21943
+ int disks = MD_SB_DISKS;
21944
+ struct buffer_head *bhl;
21945
+ int i, sum_bhs = 0;
21946
+ struct mirror_info *mirror;
21948
+ if (!buffer_locked(bh))
21952
+ * make_request() can abort the operation when READA is being
21953
+ * used and no empty request is available.
21955
+ * Currently, just replace the command with READ/WRITE.
21960
+ r1_bh = raid1_alloc_r1bh (conf);
21962
+ spin_lock_irq(&conf->segment_lock);
21963
+ wait_event_lock_irq(conf->wait_done,
21964
+ rsector < conf->start_active ||
21965
+ rsector >= conf->start_future,
21966
+ conf->segment_lock);
21967
+ if (rsector < conf->start_active)
21968
+ conf->cnt_done++;
21970
+ conf->cnt_future++;
21972
+ set_bit(R1BH_SyncPhase, &r1_bh->state);
21974
+ spin_unlock_irq(&conf->segment_lock);
21977
+ * i think the read and write branch should be separated completely,
21978
+ * since we want to do read balancing on the read side for example.
21979
+ * Alternative implementations? :) --mingo
21982
+ r1_bh->master_bh = bh;
21983
+ r1_bh->mddev = mddev;
21986
+ if (rw == READ) {
21988
+ * read balancing logic:
21990
+ //mirror = conf->mirrors + raid1_read_balance(conf, bh);
21991
+ mirror = conf->mirrors + raid1_read_balance(conf, eio);
21993
+ bh_req = &r1_bh->bh_req;
21994
+ memcpy(bh_req, bh, sizeof(*bh));
21995
+ bh_req->b_blocknr = rsector;
21996
+ bh_req->b_dev = mirror->dev;
21997
+ bh_req->b_rdev = mirror->dev;
21998
+ /* bh_req->b_rsector = bh->n_rsector; */
21999
+ bh_req->b_end_io = evms_raid1_end_request;
22000
+ bh_req->b_private = r1_bh;
22001
+ //generic_make_request (rw, bh_req);
22002
+ eio->bh = bh_req;
22003
+ r1_bh->node = mirror->node;
22004
+ r1_bh->eio = *eio;
22005
+ R_IO(mirror->node, eio);
22013
+ bhl = raid1_alloc_bh(conf, conf->raid_disks);
22014
+ r1_bh->node = NULL;
22015
+ r1_bh->eio = *eio;
22016
+ for (i = 0; i < disks; i++) {
22017
+ struct buffer_head *mbh;
22018
+ if (!conf->mirrors[i].operational)
22022
+ * We should use a private pool (size depending on NR_REQUEST),
22023
+ * to avoid writes filling up the memory with bhs
22025
+ * Such pools are much faster than kmalloc anyways (so we waste
22026
+ * almost nothing by not using the master bh when writing and
22027
+ * win alot of cleanness) but for now we are cool enough. --mingo
22029
+ * It's safe to sleep here, buffer heads cannot be used in a shared
22030
+ * manner in the write branch. Look how we lock the buffer at the
22031
+ * beginning of this function to grok the difference ;)
22034
+ if (mbh == NULL) {
22038
+ bhl = mbh->b_next;
22039
+ mbh->b_next = NULL;
22040
+ mbh->b_this_page = (struct buffer_head *)1;
22043
+ * prepare mirrored mbh (fields ordered for max mem throughput):
22045
+ mbh->b_blocknr = rsector;
22046
+ mbh->b_dev = conf->mirrors[i].dev;
22047
+ mbh->b_rdev = conf->mirrors[i].dev;
22048
+ mbh->b_rsector = rsector;
22049
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
22050
+ (1<<BH_Mapped) | (1<<BH_Lock);
22052
+ atomic_set(&mbh->b_count, 1);
22053
+ mbh->b_size = bh->b_size;
22054
+ mbh->b_page = bh->b_page;
22055
+ mbh->b_data = bh->b_data;
22056
+ mbh->b_list = BUF_LOCKED;
22057
+ mbh->b_end_io = evms_raid1_end_request;
22058
+ //mbh->b_private = r1_bh;
22059
+ mbh->b_private = conf->mirrors[i].node;
22061
+ mbh->b_next = r1_bh->mirror_bh_list;
22062
+ r1_bh->mirror_bh_list = mbh;
22065
+ if (bhl) raid1_free_bh(conf,bhl);
22067
+ /* Gag - all mirrors non-operational.. */
22068
+ raid1_end_bh_io(r1_bh, 0);
22071
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
22074
+ * We have to be a bit careful about the semaphore above, thats
22075
+ * why we start the requests separately. Since kmalloc() could
22076
+ * fail, sleep and make_request() can sleep too, this is the
22077
+ * safer solution. Imagine, end_request decreasing the semaphore
22078
+ * before we could have set it up ... We could play tricks with
22079
+ * the semaphore (presetting it and correcting at the end if
22080
+ * sum_bhs is not 'n' but we have to do end_request by hand if
22081
+ * all requests finish until we had a chance to set up the
22082
+ * semaphore correctly ... lots of races).
22084
+ bh = r1_bh->mirror_bh_list;
22086
+ evms_logical_node_t *node;
22088
+ struct buffer_head *bh2 = bh;
22091
+ node = (evms_logical_node_t *)bh2->b_private;
22092
+ bh2->b_private = r1_bh;
22093
+ this_eio = r1_bh->eio;
22094
+ this_eio.bh = bh2;
22095
+ add_node_mapping(r1_bh, node, bh2);
22096
+ W_IO(node, &this_eio);
22097
+ //generic_make_request(rw, bh2);
22103
+static int raid1_status (char *page, mddev_t *mddev)
22105
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22108
+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
22109
+ conf->working_disks);
22110
+ for (i = 0; i < conf->raid_disks; i++)
22111
+ sz += sprintf (page+sz, "%s",
22112
+ conf->mirrors[i].operational ? "U" : "_");
22113
+ sz += sprintf (page+sz, "]");
22117
+#define LAST_DISK KERN_ALERT \
22118
+"EVMS raid1: only one disk left and IO error.\n"
22120
+#define NO_SPARE_DISK KERN_ALERT \
22121
+"EVMS raid1: no spare disk left, degrading mirror level by one.\n"
22123
+#define DISK_FAILED KERN_ALERT \
22124
+"EVMS raid1: Disk failure on %s, disabling device. \n" \
22125
+" Operation continuing on %d devices\n"
22127
+#define START_SYNCING KERN_ALERT \
22128
+"EVMS raid1: start syncing spare disk.\n"
22130
+#define ALREADY_SYNCING KERN_INFO \
22131
+"EVMS raid1: syncing already in progress.\n"
22133
+static void mark_disk_bad (mddev_t *mddev, int failed)
22135
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22136
+ struct mirror_info *mirror = conf->mirrors+failed;
22137
+ mdp_super_t *sb = mddev->sb;
22139
+ mirror->operational = 0;
22140
+ mark_disk_faulty(sb->disks+mirror->number);
22141
+ mark_disk_nonsync(sb->disks+mirror->number);
22142
+ mark_disk_inactive(sb->disks+mirror->number);
22143
+ if (!mirror->write_only)
22144
+ sb->active_disks--;
22145
+ sb->working_disks--;
22146
+ sb->failed_disks++;
22147
+ mddev->sb_dirty = 1;
22148
+ evms_cs_wakeup_thread(conf->thread);
22149
+ if (!mirror->write_only)
22150
+ conf->working_disks--;
22151
+ LOG_SERIOUS(DISK_FAILED, evms_md_partition_name(mirror->node),conf->working_disks);
22154
+static int raid1_error (
22156
+ evms_logical_node_t *node)
22158
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22159
+ struct mirror_info * mirrors = conf->mirrors;
22160
+ int disks = MD_SB_DISKS;
22163
+ /* Find the drive.
22164
+ * If it is not operational, then we have already marked it as dead
22165
+ * else if it is the last working disks, ignore the error, let the
22166
+ * next level up know.
22167
+ * else mark the drive as failed
22170
+ for (i = 0; i < disks; i++)
22171
+ if (mirrors[i].node==node && mirrors[i].operational)
22176
+ if (i < conf->raid_disks && conf->working_disks == 1) {
22177
+ /* Don't fail the drive, act as though we were just a
22178
+ * normal single drive
22183
+ mark_disk_bad(mddev, i);
22188
+#undef NO_SPARE_DISK
22189
+#undef DISK_FAILED
22190
+#undef START_SYNCING
22193
+static void print_raid1_conf (raid1_conf_t *conf)
22196
+ struct mirror_info *tmp;
22198
+ LOG_DEFAULT("RAID1 conf printout:\n");
22200
+ LOG_DEFAULT("(conf==NULL)\n");
22203
+ LOG_DEFAULT(" --- wd:%d rd:%d nd:%d\n",
22204
+ conf->working_disks,conf->raid_disks, conf->nr_disks);
22206
+ for (i = 0; i < conf->nr_disks; i++) {
22207
+ tmp = conf->mirrors + i;
22208
+ LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
22209
+ i, tmp->spare,tmp->operational,
22210
+ tmp->number,tmp->raid_disk,tmp->used_slot,
22211
+ evms_md_partition_name(tmp->node));
22215
+static void close_sync(raid1_conf_t *conf)
22217
+ mddev_t *mddev = conf->mddev;
22218
+ /* If reconstruction was interrupted, we need to close the "active" and "pending"
22220
+ * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
22222
+ /* this is really needed when recovery stops too... */
22223
+ spin_lock_irq(&conf->segment_lock);
22224
+ conf->start_active = conf->start_pending;
22225
+ conf->start_ready = conf->start_pending;
22226
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
22227
+ conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
22228
+ conf->start_future = mddev->sb->size+1;
22229
+ conf->cnt_pending = conf->cnt_future;
22230
+ conf->cnt_future = 0;
22231
+ conf->phase = conf->phase ^1;
22232
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
22233
+ conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
22235
+ conf->cnt_future = conf->cnt_done;;
22236
+ conf->cnt_done = 0;
22237
+ spin_unlock_irq(&conf->segment_lock);
22238
+ wake_up(&conf->wait_done);
22241
+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
22244
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
22245
+ raid1_conf_t *conf = mddev->private;
22246
+ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk;
22247
+ mdp_super_t *sb = mddev->sb;
22248
+ mdp_disk_t *failed_desc, *spare_desc;
22249
+ mdk_rdev_t *spare_rdev, *failed_rdev;
22251
+ print_raid1_conf(conf);
22252
+ md_spin_lock_irq(&conf->device_lock);
22254
+ * find the disk ...
22258
+ case DISKOP_SPARE_ACTIVE:
22261
+ * Find the failed disk within the RAID1 configuration ...
22262
+ * (this can only be in the first conf->working_disks part)
22264
+ for (i = 0; i < conf->raid_disks; i++) {
22265
+ tmp = conf->mirrors + i;
22266
+ if ((!tmp->operational && !tmp->spare) ||
22267
+ !tmp->used_slot) {
22273
+ * When we activate a spare disk we _must_ have a disk in
22274
+ * the lower (active) part of the array to replace.
22276
+/* if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
22281
+ */ /* fall through */
22283
+ case DISKOP_HOT_SPARE_ACTIVE:
22284
+ case DISKOP_SPARE_WRITE:
22285
+ case DISKOP_SPARE_INACTIVE:
22288
+ * Find the spare disk ... (can only be in the 'high'
22289
+ * area of the array)
22290
+ ##### Actually it can be sooner now that we have improved MD #####
22291
+ This support required for expanding number of active mirrors.
22293
+ for (i = 0; i < MD_SB_DISKS; i++) {
22294
+ tmp = conf->mirrors + i;
22295
+ if (tmp->spare && tmp->number == (*d)->number) {
22300
+ if (spare_disk == -1) {
22307
+ case DISKOP_HOT_REMOVE_SPARE:
22309
+ for (i = 0; i < MD_SB_DISKS; i++) {
22310
+ tmp = conf->mirrors + i;
22311
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
22312
+ if (tmp->operational) {
22315
+ } else if (!tmp->spare){
22320
+ removed_disk = i;
22324
+ if (removed_disk == -1) {
22331
+ case DISKOP_HOT_REMOVE_DISK:
22332
+ if (conf->working_disks <= 1) {
22336
+ for (i = 0; i < MD_SB_DISKS; i++) {
22337
+ tmp = conf->mirrors + i;
22338
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
22339
+ removed_disk = i;
22343
+ if (removed_disk == -1) {
22350
+ case DISKOP_HOT_ADD_DISK:
22358
+ * Switch the spare disk to write-only mode:
22360
+ case DISKOP_SPARE_WRITE:
22361
+ sdisk = conf->mirrors + spare_disk;
22362
+ sdisk->operational = 1;
22363
+ sdisk->write_only = 1;
22366
+ * Deactivate a spare disk:
22368
+ case DISKOP_SPARE_INACTIVE:
22369
+ close_sync(conf);
22370
+ sdisk = conf->mirrors + spare_disk;
22371
+ sdisk->operational = 0;
22372
+ sdisk->write_only = 0;
22375
+ * Activate (mark read-write) the (now sync) spare disk,
22376
+ * which means we switch it's 'raid position' (->raid_disk)
22377
+ * with the failed disk. (only the first 'conf->nr_disks'
22378
+ * slots are used for 'real' disks and we must preserve this
22381
+ case DISKOP_SPARE_ACTIVE:
22382
+ close_sync(conf);
22383
+ sdisk = conf->mirrors + spare_disk;
22384
+ if (failed_disk < 0) {
22385
+ // preset failed disk to itself if no failed disk.
22386
+ failed_disk = spare_disk;
22387
+ // try to find spare earlier in array
22388
+ for (i = conf->raid_disks; i < spare_disk; i++) {
22389
+ tmp = conf->mirrors + i;
22390
+ if ((tmp->spare) || !tmp->used_slot) {
22396
+ fdisk = conf->mirrors + failed_disk;
22398
+ spare_desc = &sb->disks[sdisk->number];
22399
+ failed_desc = &sb->disks[fdisk->number];
22401
+ if (spare_desc != *d) {
22407
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
22413
+ if (sdisk->raid_disk != spare_disk) {
22419
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
22425
+ if (fdisk->raid_disk != failed_disk) {
22432
+ * do the switch finally
22434
+ spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
22435
+ failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
22437
+ /* There must be a spare_rdev, but there may not be a
22438
+ * failed_rdev. That slot might be empty...
22440
+ spare_rdev->desc_nr = failed_desc->number;
22442
+ failed_rdev->desc_nr = spare_desc->number;
22444
+ xchg_values(*spare_desc, *failed_desc);
22445
+ xchg_values(*fdisk, *sdisk);
22448
+ * (careful, 'failed' and 'spare' are switched from now on)
22450
+ * we want to preserve linear numbering and we want to
22451
+ * give the proper raid_disk number to the now activated
22452
+ * disk. (this means we switch back these values)
22455
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
22456
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
22457
+ xchg_values(spare_desc->number, failed_desc->number);
22458
+ xchg_values(sdisk->number, fdisk->number);
22460
+ *d = failed_desc;
22462
+ if (sdisk->dev == MKDEV(0,0))
22463
+ sdisk->used_slot = 0;
22465
+ * this really activates the spare.
22467
+ fdisk->spare = 0;
22468
+ fdisk->write_only = 0;
22471
+ * if we activate a spare, we definitely replace a
22472
+ * non-operational disk slot in the 'low' area of
22473
+ * the disk array.
22476
+ conf->working_disks++;
22480
+ /* Activate a spare disk without a failed disk */
22481
+ case DISKOP_HOT_SPARE_ACTIVE:
22482
+ sdisk = conf->mirrors + spare_disk;
22483
+ sdisk->spare = 0;
22484
+ sdisk->write_only = 0;
22485
+ conf->working_disks++;
22486
+ conf->raid_disks++;
22487
+ if (raid1_grow_bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS)
22488
+ LOG_WARNING("%s: Cannot grow BH pool\n", __FUNCTION__);
22491
+ case DISKOP_HOT_REMOVE_SPARE:
22492
+ rdisk = conf->mirrors + removed_disk;
22494
+ if (removed_disk < conf->raid_disks) {
22500
+ LOG_WARNING("%s: removing spare %s, [md%d] nr_disks=%d\n",
22501
+ __FUNCTION__, evms_md_partition_name(rdisk->node),
22502
+ conf->mddev->__minor, conf->nr_disks-1);
22504
+ rdisk->dev = MKDEV(0,0);
22505
+ rdisk->node = NULL;
22506
+ rdisk->used_slot = 0;
22507
+ conf->nr_disks--;
22510
+ case DISKOP_HOT_REMOVE_DISK:
22511
+ rdisk = conf->mirrors + removed_disk;
22513
+ LOG_WARNING("%s: removing active disk %s, [md%d] nr_disks=%d\n",
22514
+ __FUNCTION__, evms_md_partition_name(rdisk->node),
22515
+ conf->mddev->__minor, conf->nr_disks-1);
22517
+ rdisk->dev = MKDEV(0,0);
22518
+ rdisk->node = NULL;
22519
+ rdisk->used_slot = 0;
22520
+ rdisk->operational = 0;
22521
+ conf->working_disks--;
22522
+ conf->nr_disks--;
22523
+ sb->raid_disks--; //decrement raid disks. md_core now increments
22524
+ //when activating new spare, don't assume add spare here
22532
+ md_spin_unlock_irq(&conf->device_lock);
22533
+ if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
22534
+ /* should move to "END_REBUILD" when such exists */
22535
+ raid1_shrink_buffers(conf);
22537
+ print_raid1_conf(conf);
22542
+#define IO_ERROR KERN_ALERT \
22543
+"EVMS raid1: %s: unrecoverable I/O read error for block %lu\n"
22545
+#define REDIRECT_SECTOR KERN_ERR \
22546
+"EVMS raid1: %s: redirecting sector %lu to another mirror\n"
22549
+ * This is a kernel thread which:
22551
+ * 1. Retries failed read operations on working mirrors.
22552
+ * 2. Updates the raid superblock when problems encounter.
22553
+ * 3. Performs writes following reads for array syncronising.
22555
+static void end_sync_write(struct buffer_head *bh, int uptodate);
22556
+static void end_sync_read(struct buffer_head *bh, int uptodate);
22558
+static void raid1d (void *data)
22560
+ struct raid1_bh *r1_bh;
22561
+ struct buffer_head *bh;
22562
+ unsigned long flags;
22564
+#ifdef ORG_RAID1_CODE
22569
+ md_spin_lock_irqsave(&retry_list_lock, flags);
22570
+ r1_bh = evms_raid1_retry_list;
22573
+ evms_raid1_retry_list = r1_bh->next_r1;
22574
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
22576
+ mddev = r1_bh->mddev;
22577
+ if (mddev->sb_dirty) {
22578
+ LOG_DEFAULT("EVMS raid1: dirty sb detected, updating.\n");
22579
+ mddev->sb_dirty = 0;
22580
+ evms_md_update_sb(mddev);
22582
+ bh = &r1_bh->bh_req;
22583
+ switch(r1_bh->cmd) {
22585
+ /* have to allocate lots of bh structures and
22586
+ * schedule writes
22588
+ if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
22589
+ int i, sum_bhs = 0;
22590
+ int disks = MD_SB_DISKS;
22591
+ struct buffer_head *bhl, *mbh;
22592
+ raid1_conf_t *conf;
22594
+ conf = mddev_to_conf(mddev);
22595
+ bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
22596
+ for (i = 0; i < disks ; i++) {
22597
+ if (!conf->mirrors[i].operational)
22599
+ if (i==conf->last_used)
22600
+ /* we read from here, no need to write */
22602
+ if (i < conf->raid_disks
22603
+ && !conf->resync_mirrors
22604
+ && !conf->mirrors[i].write_only)
22605
+ /* don't need to write this,
22606
+ * we are just rebuilding */
22613
+ bhl = mbh->b_next;
22614
+ mbh->b_this_page = (struct buffer_head *)1;
22618
+ * prepare mirrored bh (fields ordered for max mem throughput):
22620
+ mbh->b_blocknr = bh->b_blocknr;
22621
+ mbh->b_dev = conf->mirrors[i].dev;
22622
+ mbh->b_rdev = conf->mirrors[i].dev;
22623
+ mbh->b_rsector = bh->b_blocknr;
22624
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
22625
+ (1<<BH_Mapped) | (1<<BH_Lock);
22626
+ atomic_set(&mbh->b_count, 1);
22627
+ mbh->b_size = bh->b_size;
22628
+ mbh->b_page = bh->b_page;
22629
+ mbh->b_data = bh->b_data;
22630
+ mbh->b_list = BUF_LOCKED;
22631
+ mbh->b_end_io = end_sync_write;
22632
+ //mbh->b_private = r1_bh;
22633
+ mbh->b_private = conf->mirrors[i].node;
22635
+ mbh->b_next = r1_bh->mirror_bh_list;
22636
+ r1_bh->mirror_bh_list = mbh;
22640
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
22641
+ if (bhl) raid1_free_bh(conf, bhl);
22642
+ mbh = r1_bh->mirror_bh_list;
22645
+ /* nowhere to write this too... I guess we
22648
+ sync_request_done(bh->b_blocknr, conf);
22649
+ evms_md_done_sync(mddev, bh->b_size>>9, 0);
22650
+ raid1_free_buf(r1_bh);
22653
+ evms_logical_node_t *node;
22655
+ struct buffer_head *bh1 = mbh;
22657
+ mbh = mbh->b_next;
22658
+ node = (evms_logical_node_t *)bh1->b_private;
22659
+ bh1->b_private = r1_bh;
22660
+ eio = r1_bh->eio;
22662
+ add_node_mapping(r1_bh, node, bh1);
22663
+ W_IO(node, &eio);
22664
+ evms_md_sync_acct(bh1->b_dev, bh1->b_size/512);
22667
+ /* There is no point trying a read-for-reconstruct
22668
+ * as reconstruct is about to be aborted
22671
+ LOG_ERROR(IO_ERROR, evms_md_partition_name(r1_bh->node), bh->b_blocknr);
22672
+ evms_md_done_sync(mddev, bh->b_size>>9, 0);
22679
+ evms_logical_node_t *node, *new_node;
22681
+ node = r1_bh->node;
22682
+ evms_raid1_map(mddev,&new_node);
22683
+ if (new_node == node) {
22684
+ LOG_ERROR(" unrecoverable read error on %s at LBA(%Lu)\n",
22685
+ node->name, r1_bh->eio.rsector);
22686
+ raid1_end_bh_io(r1_bh, 0);
22688
+ /* retry I/O on new device */
22690
+ eio = r1_bh->eio;
22691
+ R_IO(new_node, &eio);
22697
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
22700
+#undef REDIRECT_SECTOR
22703
+ * Private kernel thread to reconstruct mirrors after an unclean
22706
+static void raid1syncd (void *data)
22708
+ raid1_conf_t *conf = data;
22709
+ mddev_t *mddev = conf->mddev;
22711
+ if (!conf->resync_mirrors)
22713
+ if (conf->resync_mirrors == 2)
22715
+ down(&mddev->recovery_sem);
22716
+ if (!evms_md_do_sync(mddev, NULL)) {
22718
+ * Only if everything went Ok.
22720
+ conf->resync_mirrors = 0;
22723
+ close_sync(conf);
22725
+ up(&mddev->recovery_sem);
22726
+ raid1_shrink_buffers(conf);
22730
+ * perform a "sync" on one "block"
22732
+ * We need to make sure that no normal I/O request - particularly write
22733
+ * requests - conflict with active sync requests.
22734
+ * This is achieved by conceptually dividing the device space into a
22735
+ * number of sections:
22736
+ * DONE: 0 .. a-1 These blocks are in-sync
22737
+ * ACTIVE: a.. b-1 These blocks may have active sync requests, but
22738
+ * no normal IO requests
22739
+ * READY: b .. c-1 These blocks have no normal IO requests - sync
22740
+ * request may be happening
22741
+ * PENDING: c .. d-1 These blocks may have IO requests, but no new
22742
+ * ones will be added
22743
+ * FUTURE: d .. end These blocks are not to be considered yet. IO may
22744
+ * be happening, but not sync
22747
+ * phase which flips (0 or 1) each time d moves and
22749
+ * z = active io requests in FUTURE since d moved - marked with
22751
+ * y = active io requests in FUTURE before d moved, or PENDING -
22752
+ * marked with previous phase
22753
+ * x = active sync requests in READY
22754
+ * w = active sync requests in ACTIVE
22755
+ * v = active io requests in DONE
22757
+ * Normally, a=b=c=d=0 and z= active io requests
22758
+ * or a=b=c=d=END and v= active io requests
22759
+ * Allowed changes to a,b,c,d:
22760
+ * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
22762
+ * C: b=c, w+=x, x=0
22764
+ * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
22766
+ * At start of sync we apply A.
22767
+ * When y reaches 0, we apply B then A then being sync requests
22768
+ * When sync point reaches c-1, we wait for y==0, and W==0, and
22769
+ * then apply apply B then A then D then C.
22770
+ * Finally, we apply E
22772
+ * The sync request simply issues a "read" against a working drive
22773
+ * This is marked so that on completion the raid1d thread is woken to
22774
+ * issue suitable write requests
22777
+static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
22779
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22780
+ struct mirror_info *mirror;
22781
+ struct raid1_bh *r1_bh;
22782
+ struct buffer_head *bh;
22788
+ spin_lock_irq(&conf->segment_lock);
22789
+ if (!sector_nr) {
22790
+ /* initialize ...*/
22792
+ conf->start_active = 0;
22793
+ conf->start_ready = 0;
22794
+ conf->start_pending = 0;
22795
+ conf->start_future = 0;
22797
+ /* we want enough buffers to hold twice the window of 128*/
22798
+ buffs = 128 *2 / (PAGE_SIZE>>9);
22799
+ buffs = raid1_grow_buffers(conf, buffs);
22803
+ conf->window = buffs*(PAGE_SIZE>>9)/2;
22804
+ conf->cnt_future += conf->cnt_done+conf->cnt_pending;
22805
+ conf->cnt_done = conf->cnt_pending = 0;
22806
+ if (conf->cnt_ready || conf->cnt_active)
22809
+ while (sector_nr >= conf->start_pending) {
22810
+ PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
22811
+ sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
22812
+ conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
22813
+ wait_event_lock_irq(conf->wait_done,
22814
+ !conf->cnt_active,
22815
+ conf->segment_lock);
22816
+ wait_event_lock_irq(conf->wait_ready,
22817
+ !conf->cnt_pending,
22818
+ conf->segment_lock);
22819
+ conf->start_active = conf->start_ready;
22820
+ conf->start_ready = conf->start_pending;
22821
+ conf->start_pending = conf->start_future;
22822
+ conf->start_future = conf->start_future+conf->window;
22823
+ // Note: falling off the end is not a problem
22824
+ conf->phase = conf->phase ^1;
22825
+ conf->cnt_active = conf->cnt_ready;
22826
+ conf->cnt_ready = 0;
22827
+ conf->cnt_pending = conf->cnt_future;
22828
+ conf->cnt_future = 0;
22829
+ wake_up(&conf->wait_done);
22831
+ conf->cnt_ready++;
22832
+ spin_unlock_irq(&conf->segment_lock);
22835
+ /* If reconstructing, and >1 working disc,
22836
+ * could dedicate one to rebuild and others to
22837
+ * service read requests ..
22839
+ disk = conf->last_used;
22840
+ /* make sure disk is operational */
22841
+ while (!conf->mirrors[disk].operational) {
22842
+ if (disk <= 0) disk = conf->raid_disks;
22844
+ if (disk == conf->last_used)
22847
+ conf->last_used = disk;
22849
+ mirror = conf->mirrors+conf->last_used;
22851
+ r1_bh = raid1_alloc_buf (conf);
22852
+ r1_bh->master_bh = NULL;
22853
+ r1_bh->mddev = mddev;
22854
+ r1_bh->cmd = SPECIAL;
22855
+ bh = &r1_bh->bh_req;
22857
+ block_nr = sector_nr;
22859
+ while (!(block_nr & 1) && bsize < PAGE_SIZE
22860
+ && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
22864
+ bh->b_size = bsize;
22865
+ bh->b_list = BUF_LOCKED;
22866
+ bh->b_dev = mirror->dev;
22867
+ bh->b_rdev = mirror->dev;
22868
+ bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
22873
+ if (bh->b_data != page_address(bh->b_page))
22875
+ bh->b_end_io = end_sync_read;
22876
+ bh->b_private = r1_bh;
22877
+ bh->b_blocknr = sector_nr;
22878
+ bh->b_rsector = sector_nr;
22879
+ r1_bh->node = mirror->node;
22880
+ r1_bh->eio.bh = bh;
22881
+ r1_bh->eio.rsector = bh->b_rsector;
22882
+ r1_bh->eio.rsize = bh->b_size/512;
22883
+ eio = r1_bh->eio;
22884
+ init_waitqueue_head(&bh->b_wait);
22886
+ R_IO(mirror->node,&eio);
22887
+ evms_md_sync_acct(bh->b_dev, bh->b_size/512);
22889
+ return (bsize >> 9);
22892
+ raid1_shrink_buffers(conf);
22893
+ spin_unlock_irq(&conf->segment_lock);
22897
+static void end_sync_read(struct buffer_head *bh, int uptodate)
22899
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
22901
+ /* we have read a block, now it needs to be re-written,
22902
+ * or re-read if the read failed.
22903
+ * We don't do much here, just schedule handling by raid1d
22907
+ evms_md_error (r1_bh->mddev, r1_bh->node);
22910
+ set_bit(R1BH_Uptodate, &r1_bh->state);
22911
+ raid1_reschedule_retry(r1_bh);
22914
+static void end_sync_write(struct buffer_head *bh, int uptodate)
22916
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
22919
+ evms_logical_node_t *node;
22920
+ node = bh_to_node(r1_bh,bh);
22922
+ evms_md_error (r1_bh->mddev, node);
22924
+ if (atomic_dec_and_test(&r1_bh->remaining)) {
22925
+ mddev_t *mddev = r1_bh->mddev;
22926
+ unsigned long sect = bh->b_blocknr;
22927
+ int size = bh->b_size;
22928
+ raid1_free_buf(r1_bh);
22929
+ sync_request_done(sect, mddev_to_conf(mddev));
22930
+ evms_md_done_sync(mddev,size>>9, uptodate);
22934
+#define INVALID_LEVEL KERN_WARNING \
22935
+"EVMS raid1: md%d: raid level not set to mirroring (%d)\n"
22937
+#define NO_SB KERN_ERR \
22938
+"EVMS raid1: disabled mirror %s (couldn't access raid superblock)\n"
22940
+#define ERRORS KERN_ERR \
22941
+"EVMS raid1: disabled mirror %s (errors detected)\n"
22943
+#define NOT_IN_SYNC KERN_ERR \
22944
+"EVMS raid1: disabled mirror %s (not in sync)\n"
22946
+#define INCONSISTENT KERN_ERR \
22947
+"EVMS raid1: disabled mirror %s (inconsistent descriptor)\n"
22949
+#define ALREADY_RUNNING KERN_ERR \
22950
+"EVMS raid1: disabled mirror %s (mirror %d already operational)\n"
22952
+#define OPERATIONAL KERN_INFO \
22953
+"EVMS raid1: device %s operational as mirror %d\n"
22955
+#define MEM_ERROR KERN_ERR \
22956
+"EVMS raid1: couldn't allocate memory for md%d\n"
22958
+#define SPARE KERN_INFO \
22959
+"EVMS raid1: spare disk %s\n"
22961
+#define NONE_OPERATIONAL KERN_ERR \
22962
+"EVMS raid1: no operational mirrors for md%d\n"
22964
+#define ARRAY_IS_ACTIVE KERN_INFO \
22965
+"EVMS raid1: raid set md%d active with %d out of %d mirrors\n"
22967
+#define THREAD_ERROR KERN_ERR \
22968
+"EVMS raid1: couldn't allocate thread for md%d\n"
22970
+#define START_RESYNC KERN_WARNING \
22971
+"EVMS raid1: raid set md%d not clean; reconstructing mirrors\n"
22973
+static int raid1_run (mddev_t *mddev)
22975
+ raid1_conf_t *conf;
22976
+ int i, j, disk_idx;
22977
+ struct mirror_info *disk;
22978
+ mdp_super_t *sb = mddev->sb;
22979
+ mdp_disk_t *descriptor;
22980
+ mdk_rdev_t *rdev;
22981
+ struct md_list_head *tmp;
22982
+ int start_recovery = 0;
22984
+ MOD_INC_USE_COUNT;
22986
+ LOG_EXTRA(__FUNCTION__" ENTRY\n");
22987
+ if (sb->level != 1) {
22988
+ LOG_ERROR(INVALID_LEVEL, mdidx(mddev), sb->level);
22992
+ * copy the already verified devices into our private RAID1
22993
+ * bookkeeping area. [whatever we allocate in raid1_run(),
22994
+ * should be freed in raid1_stop()]
22997
+ conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
22998
+ mddev->private = conf;
23000
+ LOG_ERROR(MEM_ERROR, mdidx(mddev));
23003
+ memset(conf, 0, sizeof(*conf));
23005
+ ITERATE_RDEV(mddev,rdev,tmp) {
23006
+ if (rdev->faulty) {
23007
+ LOG_ERROR(ERRORS, evms_md_partition_name(rdev->node));
23014
+ if (rdev->desc_nr == -1) {
23018
+ descriptor = &sb->disks[rdev->desc_nr];
23019
+ disk_idx = descriptor->raid_disk;
23020
+ disk = conf->mirrors + disk_idx;
23022
+ if (disk_faulty(descriptor)) {
23023
+ disk->number = descriptor->number;
23024
+ disk->raid_disk = disk_idx;
23025
+ disk->node = rdev->node;
23026
+ disk->dev = rdev->dev;
23027
+ disk->sect_limit = MAX_WORK_PER_DISK;
23028
+ disk->operational = 0;
23029
+ disk->write_only = 0;
23031
+ disk->used_slot = 1;
23032
+ disk->head_position = 0;
23035
+ if (disk_active(descriptor)) {
23036
+ if (!disk_sync(descriptor)) {
23037
+ LOG_ERROR(NOT_IN_SYNC, evms_md_partition_name(rdev->node));
23040
+ if ((descriptor->number > MD_SB_DISKS) ||
23041
+ (disk_idx > sb->raid_disks)) {
23043
+ LOG_ERROR(INCONSISTENT,evms_md_partition_name(rdev->node));
23046
+ if (disk->operational) {
23047
+ LOG_ERROR(ALREADY_RUNNING, evms_md_partition_name(rdev->node), disk_idx);
23050
+ LOG_DEFAULT(OPERATIONAL, evms_md_partition_name(rdev->node), disk_idx);
23051
+ disk->number = descriptor->number;
23052
+ disk->raid_disk = disk_idx;
23053
+ disk->node = rdev->node;
23054
+ disk->dev = rdev->dev;
23055
+ disk->sect_limit = MAX_WORK_PER_DISK;
23056
+ disk->operational = 1;
23057
+ disk->write_only = 0;
23059
+ disk->used_slot = 1;
23060
+ disk->head_position = 0;
23061
+ conf->working_disks++;
23064
+ * Must be a spare disk ..
23066
+ LOG_DEFAULT(SPARE, evms_md_partition_name(rdev->node));
23067
+ disk->number = descriptor->number;
23068
+ disk->raid_disk = disk_idx;
23069
+ disk->node = rdev->node;
23070
+ disk->dev = rdev->dev;
23071
+ disk->sect_limit = MAX_WORK_PER_DISK;
23072
+ disk->operational = 0;
23073
+ disk->write_only = 0;
23075
+ disk->used_slot = 1;
23076
+ disk->head_position = 0;
23079
+ conf->raid_disks = sb->raid_disks;
23080
+ conf->nr_disks = sb->nr_disks;
23081
+ conf->mddev = mddev;
23082
+ conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
23084
+ conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
23085
+ init_waitqueue_head(&conf->wait_buffer);
23086
+ init_waitqueue_head(&conf->wait_done);
23087
+ init_waitqueue_head(&conf->wait_ready);
23089
+ if (!conf->working_disks) {
23090
+ LOG_ERROR(NONE_OPERATIONAL, mdidx(mddev));
23091
+ goto out_free_conf;
23095
+ /* pre-allocate some buffer_head structures.
23096
+ * As a minimum, 1 r1bh and raid_disks buffer_heads
23097
+ * would probably get us by in tight memory situations,
23098
+ * but a few more is probably a good idea.
23099
+ * For now, try NR_RESERVED_BUFS r1bh and
23100
+ * NR_RESERVED_BUFS*raid_disks bufferheads
23101
+ * This will allow at least NR_RESERVED_BUFS concurrent
23102
+ * reads or writes even if kmalloc starts failing
23104
+ if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
23105
+ raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
23106
+ < NR_RESERVED_BUFS*conf->raid_disks) {
23107
+ LOG_ERROR(MEM_ERROR, mdidx(mddev));
23108
+ goto out_free_conf;
23111
+ for (i = 0; i < MD_SB_DISKS; i++) {
23113
+ descriptor = sb->disks+i;
23114
+ disk_idx = descriptor->raid_disk;
23115
+ disk = conf->mirrors + disk_idx;
23117
+ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
23118
+ !disk->used_slot) {
23120
+ disk->number = descriptor->number;
23121
+ disk->raid_disk = disk_idx;
23122
+ disk->dev = MKDEV(0,0);
23124
+ disk->operational = 0;
23125
+ disk->write_only = 0;
23127
+ disk->used_slot = 1;
23128
+ disk->head_position = 0;
23133
+ * find the first working one and use it as a starting point
23134
+ * to read balancing.
23136
+ for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
23138
+ conf->last_used = j;
23141
+ if (conf->working_disks != sb->raid_disks) {
23142
+ LOG_SERIOUS(" md%d, not all disks are operational -- trying to recover array\n",
23144
+ start_recovery = 1;
23148
+ const char * name = "evms_raid1d";
23150
+ conf->thread = evms_cs_register_thread(raid1d, conf, name);
23151
+ if (!conf->thread) {
23152
+ LOG_ERROR(THREAD_ERROR, mdidx(mddev));
23153
+ goto out_free_conf;
23157
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
23158
+ (conf->working_disks > 1)) {
23159
+ const char * name = "evms_raid1syncd";
23161
+ conf->resync_thread = evms_cs_register_thread(raid1syncd, conf,name);
23162
+ if (!conf->resync_thread) {
23163
+ LOG_ERROR(THREAD_ERROR, mdidx(mddev));
23164
+ goto out_free_conf;
23167
+ LOG_WARNING(START_RESYNC, mdidx(mddev));
23168
+ conf->resync_mirrors = 1;
23169
+ evms_cs_wakeup_thread(conf->resync_thread);
23173
+ * Regenerate the "device is in sync with the raid set" bit for
23176
+ for (i = 0; i < MD_SB_DISKS; i++) {
23177
+ mark_disk_nonsync(sb->disks+i);
23178
+ for (j = 0; j < sb->raid_disks; j++) {
23179
+ if (!conf->mirrors[j].operational)
23181
+ if (sb->disks[i].number == conf->mirrors[j].number)
23182
+ mark_disk_sync(sb->disks+i);
23185
+ sb->active_disks = conf->working_disks;
23187
+ if (start_recovery)
23188
+ evms_md_recover_arrays();
23191
+ LOG_DEFAULT(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
23193
+ * Ok, everything is just fine now
23198
+ raid1_shrink_r1bh(conf);
23199
+ raid1_shrink_bh(conf);
23200
+ raid1_shrink_buffers(conf);
23202
+ mddev->private = NULL;
23204
+ MOD_DEC_USE_COUNT;
23208
+#undef INVALID_LEVEL
23211
+#undef NOT_IN_SYNC
23212
+#undef INCONSISTENT
23213
+#undef ALREADY_RUNNING
23214
+#undef OPERATIONAL
23216
+#undef NONE_OPERATIONAL
23217
+#undef ARRAY_IS_ACTIVE
23219
+static int raid1_stop_resync (mddev_t *mddev)
23221
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23223
+ LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
23224
+ if (conf->resync_thread) {
23225
+ if (conf->resync_mirrors) {
23226
+ conf->resync_mirrors = 2;
23227
+ evms_cs_interrupt_thread(conf->resync_thread);
23228
+ LOG_WARNING(" mirror resync was not fully finished, restarting next time.\n");
23236
+static int raid1_restart_resync (mddev_t *mddev)
23238
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23240
+ LOG_DEFAULT(__FUNCTION__" ENTRY\n");
23241
+ if (conf->resync_mirrors) {
23242
+ if (!conf->resync_thread) {
23246
+ conf->resync_mirrors = 1;
23247
+ evms_cs_wakeup_thread(conf->resync_thread);
23253
+static int raid1_stop (mddev_t *mddev)
23255
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23257
+ LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
23258
+ evms_cs_unregister_thread(conf->thread);
23259
+ if (conf->resync_thread)
23260
+ evms_cs_unregister_thread(conf->resync_thread);
23261
+ raid1_shrink_r1bh(conf);
23262
+ raid1_shrink_bh(conf);
23263
+ raid1_shrink_buffers(conf);
23265
+ mddev->private = NULL;
23266
+ MOD_DEC_USE_COUNT;
23270
+static int raid1_evms_ioctl (
23272
+ struct inode * inode,
23273
+ struct file * file,
23274
+ unsigned int cmd,
23275
+ unsigned long arg)
23278
+ evms_logical_node_t *node = NULL;
23279
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23282
+ case EVMS_GET_BMAP:
23284
+ for (i = 0; i < MD_SB_DISKS; i++) {
23285
+ if (conf->mirrors[i].operational) {
23286
+ node = conf->mirrors[i].node;
23292
+ rc = IOCTL(node, inode, file, cmd, arg);
23305
+static mdk_personality_t raid1_personality=
23307
+ name: "evms_raid1",
23308
+ init_io: raid1_init_io,
23309
+ make_request: raid1_make_request,
23311
+ stop: raid1_stop,
23312
+ status: raid1_status,
23313
+ error_handler: raid1_error,
23314
+ diskop: raid1_diskop,
23315
+ stop_resync: raid1_stop_resync,
23316
+ restart_resync: raid1_restart_resync,
23317
+ sync_request: raid1_sync_request,
23318
+ evms_ioctl: raid1_evms_ioctl
23321
+static int md__init raid1_init (void)
23323
+ return evms_register_md_personality (RAID1, &raid1_personality);
23326
+static void raid1_exit (void)
23328
+ evms_unregister_md_personality (RAID1);
23331
+module_init(raid1_init);
23332
+module_exit(raid1_exit);
23333
+#ifdef MODULE_LICENSE
23334
+MODULE_LICENSE("GPL");
23336
diff -Naur linux-2002-03-28/drivers/evms/md_raid5.c evms-2002-03-28/drivers/evms/md_raid5.c
23337
--- linux-2002-03-28/drivers/evms/md_raid5.c Wed Dec 31 18:00:00 1969
23338
+++ evms-2002-03-28/drivers/evms/md_raid5.c Thu Mar 28 16:28:37 2002
23341
+ * md_raid5.c : Multiple Devices driver for Linux
23342
+ * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
23343
+ * Copyright (C) 1999, 2000 Ingo Molnar
23345
+ * RAID-5 management functions.
23347
+ * 'md_raid5.c' is an EVMS version of linux/drivers/md/raid5.c modified
23348
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
23350
+ * This program is free software; you can redistribute it and/or modify
23351
+ * it under the terms of the GNU General Public License as published by
23352
+ * the Free Software Foundation; either version 2, or (at your option)
23353
+ * any later version.
23355
+ * You should have received a copy of the GNU General Public License
23356
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
23357
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23361
+#include <linux/config.h>
23362
+#include <linux/module.h>
23363
+#include <linux/locks.h>
23364
+#include <linux/slab.h>
23365
+#include <linux/evms/evms_raid5.h>
23366
+#include <asm/bitops.h>
23367
+#include <asm/atomic.h>
23369
+#define LOG_PREFIX "md raid5: "
23371
+static mdk_personality_t raid5_personality;
23377
+#define NR_STRIPES 256
23378
+#define IO_THRESHOLD 1
23379
+#define HASH_PAGES 1
23380
+#define HASH_PAGES_ORDER 0
23381
+#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
23382
+#define HASH_MASK (NR_HASH - 1)
23383
+#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
23386
+ * The following can be used to debug the driver
23388
+#define RAID5_DEBUG 0
23389
+#define RAID5_PARANOIA 1
23390
+#if RAID5_PARANOIA && CONFIG_SMP
23391
+# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
23393
+# define CHECK_DEVLOCK()
23397
+static void print_raid5_conf (raid5_conf_t *conf);
23399
+static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
23401
+ if (atomic_dec_and_test(&sh->count)) {
23402
+ if (!list_empty(&sh->lru))
23404
+ if (atomic_read(&conf->active_stripes)==0)
23406
+ if (test_bit(STRIPE_HANDLE, &sh->state)) {
23407
+ if (test_bit(STRIPE_DELAYED, &sh->state))
23408
+ list_add_tail(&sh->lru, &conf->delayed_list);
23410
+ list_add_tail(&sh->lru, &conf->handle_list);
23411
+ evms_cs_wakeup_thread(conf->thread);
23413
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
23414
+ atomic_dec(&conf->preread_active_stripes);
23415
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
23416
+ evms_cs_wakeup_thread(conf->thread);
23418
+ list_add_tail(&sh->lru, &conf->inactive_list);
23419
+ atomic_dec(&conf->active_stripes);
23420
+ if (!conf->inactive_blocked ||
23421
+ atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
23422
+ wake_up(&conf->wait_for_stripe);
23426
+static void release_stripe(struct stripe_head *sh)
23428
+ raid5_conf_t *conf = sh->raid_conf;
23429
+ unsigned long flags;
23431
+ spin_lock_irqsave(&conf->device_lock, flags);
23432
+ __release_stripe(conf, sh);
23433
+ spin_unlock_irqrestore(&conf->device_lock, flags);
23436
+static void remove_hash(struct stripe_head *sh)
23438
+ LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
23440
+ if (sh->hash_pprev) {
23441
+ if (sh->hash_next)
23442
+ sh->hash_next->hash_pprev = sh->hash_pprev;
23443
+ *sh->hash_pprev = sh->hash_next;
23444
+ sh->hash_pprev = NULL;
23448
+static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
23450
+ struct stripe_head **shp = &stripe_hash(conf, sh->sector);
23452
+ LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
23455
+ if ((sh->hash_next = *shp) != NULL)
23456
+ (*shp)->hash_pprev = &sh->hash_next;
23458
+ sh->hash_pprev = shp;
23462
+/* find an idle stripe, make sure it is unhashed, and return it. */
23463
+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
23465
+ struct stripe_head *sh = NULL;
23466
+ struct list_head *first;
23469
+ if (list_empty(&conf->inactive_list))
23471
+ first = conf->inactive_list.next;
23472
+ sh = list_entry(first, struct stripe_head, lru);
23473
+ list_del_init(first);
23475
+ atomic_inc(&conf->active_stripes);
23480
+static void shrink_buffers(struct stripe_head *sh, int num)
23482
+ struct buffer_head *bh;
23485
+ for (i=0; i<num ; i++) {
23486
+ bh = sh->bh_cache[i];
23489
+ sh->bh_cache[i] = NULL;
23490
+ free_page((unsigned long) bh->b_data);
23495
+static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
23497
+ struct buffer_head *bh;
23500
+ for (i=0; i<num; i++) {
23501
+ struct page *page;
23502
+ bh = kmalloc(sizeof(struct buffer_head), priority);
23505
+ memset(bh, 0, sizeof (struct buffer_head));
23506
+ init_waitqueue_head(&bh->b_wait);
23507
+ if ((page = alloc_page(priority)))
23508
+ bh->b_data = page_address(page);
23513
+ atomic_set(&bh->b_count, 0);
23514
+ bh->b_page = page;
23515
+ sh->bh_cache[i] = bh;
23521
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
23523
+static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
23525
+ raid5_conf_t *conf = sh->raid_conf;
23526
+ int disks = conf->raid_disks, i;
23528
+ if (atomic_read(&sh->count) != 0)
23530
+ if (test_bit(STRIPE_HANDLE, &sh->state))
23534
+ LOG_EXTRA("init_stripe called, stripe %lu\n", sh->sector);
23538
+ sh->sector = sector;
23539
+ sh->size = conf->buffer_size;
23542
+ for (i=disks; i--; ) {
23543
+ if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
23544
+ buffer_locked(sh->bh_cache[i])) {
23545
+ LOG_ERROR("sector=%lx i=%d %p %p %p %d\n",
23546
+ sh->sector, i, sh->bh_read[i],
23547
+ sh->bh_write[i], sh->bh_written[i],
23548
+ buffer_locked(sh->bh_cache[i]));
23551
+ clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
23552
+ raid5_build_block(sh, i);
23554
+ insert_hash(conf, sh);
23557
+/* the buffer size has changed, so unhash all stripes
23558
+ * as active stripes complete, they will go onto inactive list
23560
+static void shrink_stripe_cache(raid5_conf_t *conf)
23564
+ if (atomic_read(&conf->active_stripes))
23566
+ for (i=0; i < NR_HASH; i++) {
23567
+ struct stripe_head *sh;
23568
+ while ((sh = conf->stripe_hashtbl[i]))
23573
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
23575
+ struct stripe_head *sh;
23578
+ LOG_DEBUG("%s: sector %lu\n", __FUNCTION__, sector);
23579
+ for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
23580
+ if (sh->sector == sector)
23582
+ LOG_DEBUG("%s: %lu not in cache\n", __FUNCTION__, sector);
23586
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
23588
+ struct stripe_head *sh;
23591
+ md_spin_lock_irq(&conf->device_lock);
23594
+ if (conf->buffer_size == 0 ||
23595
+ (size && size != conf->buffer_size)) {
23596
+ /* either the size is being changed (buffer_size==0) or
23597
+ * we need to change it.
23598
+ * If size==0, we can proceed as soon as buffer_size gets set.
23599
+ * If size>0, we can proceed when active_stripes reaches 0, or
23600
+ * when someone else sets the buffer_size to size.
23601
+ * If someone sets the buffer size to something else, we will need to
23602
+ * assert that we want to change it again
23605
+ wait_event_lock_irq(conf->wait_for_stripe,
23606
+ conf->buffer_size,
23607
+ conf->device_lock);
23609
+ while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
23610
+ conf->buffer_size = 0;
23611
+ wait_event_lock_irq(conf->wait_for_stripe,
23612
+ atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
23613
+ conf->device_lock);
23616
+ if (conf->buffer_size != size) {
23617
+ shrink_stripe_cache(conf);
23618
+ if (size==0) BUG();
23619
+ conf->buffer_size = size;
23624
+ sector -= sector & ((conf->buffer_size>>9)-1);
23626
+ sh = __find_stripe(conf, sector);
23628
+ if (!conf->inactive_blocked)
23629
+ sh = get_free_stripe(conf);
23630
+ if (noblock && sh == NULL)
23633
+ conf->inactive_blocked = 1;
23634
+ wait_event_lock_irq(conf->wait_for_stripe,
23635
+ !list_empty(&conf->inactive_list) &&
23636
+ (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
23637
+ || !conf->inactive_blocked),
23638
+ conf->device_lock);
23639
+ conf->inactive_blocked = 0;
23641
+ init_stripe(sh, sector);
23643
+ if (atomic_read(&sh->count)) {
23644
+ if (!list_empty(&sh->lru))
23647
+ if (!test_bit(STRIPE_HANDLE, &sh->state))
23648
+ atomic_inc(&conf->active_stripes);
23649
+ if (list_empty(&sh->lru))
23651
+ list_del_init(&sh->lru);
23654
+ } while (sh == NULL);
23657
+ atomic_inc(&sh->count);
23659
+ md_spin_unlock_irq(&conf->device_lock);
23663
+static int grow_stripes(raid5_conf_t *conf, int num, int priority)
23665
+ struct stripe_head *sh;
23668
+ sh = kmalloc(sizeof(struct stripe_head), priority);
23671
+ memset(sh, 0, sizeof(*sh));
23672
+ sh->raid_conf = conf;
23673
+ sh->lock = SPIN_LOCK_UNLOCKED;
23675
+ if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
23676
+ shrink_buffers(sh, conf->raid_disks);
23680
+ /* we just created an active stripe so... */
23681
+ atomic_set(&sh->count, 1);
23682
+ atomic_inc(&conf->active_stripes);
23683
+ INIT_LIST_HEAD(&sh->lru);
23684
+ release_stripe(sh);
23689
+static void shrink_stripes(raid5_conf_t *conf, int num)
23691
+ struct stripe_head *sh;
23694
+ spin_lock_irq(&conf->device_lock);
23695
+ sh = get_free_stripe(conf);
23696
+ spin_unlock_irq(&conf->device_lock);
23699
+ if (atomic_read(&sh->count))
23701
+ shrink_buffers(sh, conf->raid_disks);
23703
+ atomic_dec(&conf->active_stripes);
23708
+static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
23710
+ struct stripe_head *sh = bh->b_private;
23711
+ raid5_conf_t *conf = sh->raid_conf;
23712
+ int disks = conf->raid_disks, i;
23713
+ unsigned long flags;
23715
+ for (i=0 ; i<disks; i++)
23716
+ if (bh == sh->bh_cache[i])
23719
+ if (i == disks) {
23725
+ struct buffer_head *buffer;
23726
+ spin_lock_irqsave(&conf->device_lock, flags);
23727
+ /* we can return a buffer if we bypassed the cache or
23728
+ * if the top buffer is not in highmem. If there are
23729
+ * multiple buffers, leave the extra work to
23732
+ buffer = sh->bh_read[i];
23734
+ (!PageHighMem(buffer->b_page)
23735
+ || buffer->b_page == bh->b_page )
23737
+ sh->bh_read[i] = buffer->b_reqnext;
23738
+ buffer->b_reqnext = NULL;
23741
+ spin_unlock_irqrestore(&conf->device_lock, flags);
23742
+ if (sh->bh_page[i]==NULL)
23743
+ set_bit(BH_Uptodate, &bh->b_state);
23745
+ if (buffer->b_page != bh->b_page)
23746
+ memcpy(buffer->b_data, bh->b_data, bh->b_size);
23747
+ buffer->b_end_io(buffer, 1);
23752
+ evms_md_error(conf->mddev, sh->node[i]);
23754
+ LOG_WARNING("NODE was not set, skipping evms_md_error()\n");
23755
+ clear_bit(BH_Uptodate, &bh->b_state);
23757
+ /* must restore b_page before unlocking buffer... */
23758
+ if (sh->bh_page[i]) {
23759
+ bh->b_page = sh->bh_page[i];
23760
+ bh->b_data = page_address(bh->b_page);
23761
+ sh->bh_page[i] = NULL;
23762
+ clear_bit(BH_Uptodate, &bh->b_state);
23764
+ clear_bit(BH_Lock, &bh->b_state);
23765
+ set_bit(STRIPE_HANDLE, &sh->state);
23766
+ release_stripe(sh);
23767
+ if (sh->node[i]) {
23768
+ sh->node[i] = NULL;
23770
+ LOG_WARNING(" evms node was not set.\n");
23775
+static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
23777
+ struct stripe_head *sh = bh->b_private;
23778
+ raid5_conf_t *conf = sh->raid_conf;
23779
+ int disks = conf->raid_disks, i;
23780
+ unsigned long flags;
23782
+ for (i=0 ; i<disks; i++)
23783
+ if (bh == sh->bh_cache[i])
23786
+ if (i == disks) {
23791
+ md_spin_lock_irqsave(&conf->device_lock, flags);
23795
+ evms_md_error(conf->mddev, sh->node[i]);
23797
+ LOG_WARNING(" NODE was not set, skipping evms_md_error()\n");
23799
+ clear_bit(BH_Lock, &bh->b_state);
23800
+ set_bit(STRIPE_HANDLE, &sh->state);
23801
+ __release_stripe(conf, sh);
23802
+ md_spin_unlock_irqrestore(&conf->device_lock, flags);
23803
+ if (sh->node[i]) {
23804
+ sh->node[i] = NULL;
23806
+ LOG_WARNING(" evms node was not set.\n");
23812
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
23814
+ raid5_conf_t *conf = sh->raid_conf;
23815
+ struct buffer_head *bh = sh->bh_cache[i];
23816
+ unsigned long block = sh->sector / (sh->size >> 9);
23818
+ init_buffer(bh, raid5_end_read_request, sh);
23819
+ bh->b_dev = conf->disks[i].dev;
23820
+ bh->b_blocknr = block;
23822
+ bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
23823
+ bh->b_size = sh->size;
23824
+ bh->b_list = BUF_LOCKED;
23828
+static int raid5_error (
23830
+ evms_logical_node_t *node)
23832
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
23833
+ mdp_super_t *sb = mddev->sb;
23834
+ struct disk_info *disk;
23837
+ LOG_WARNING("%s: called\n", __FUNCTION__);
23839
+ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
23840
+ if (disk->node == node) {
23841
+ if (disk->operational) {
23842
+ disk->operational = 0;
23843
+ mark_disk_faulty(sb->disks+disk->number);
23844
+ mark_disk_nonsync(sb->disks+disk->number);
23845
+ mark_disk_inactive(sb->disks+disk->number);
23846
+ sb->active_disks--;
23847
+ sb->working_disks--;
23848
+ sb->failed_disks++;
23849
+ mddev->sb_dirty = 1;
23850
+ conf->working_disks--;
23851
+ conf->failed_disks++;
23852
+ evms_cs_wakeup_thread(conf->thread);
23853
+ LOG_WARNING("Disk failure on %s, disabling device."
23854
+ " Operation continuing on %d devices\n",
23855
+ evms_md_partition_name (disk->node), conf->working_disks);
23861
+ * handle errors in spares (during reconstruction)
23863
+ if (conf->spare) {
23864
+ disk = conf->spare;
23865
+ if (disk->node == node) {
23866
+ LOG_WARNING("EVMS RAID5: Disk failure on spare %s\n",
23867
+ evms_md_partition_name (disk->node));
23868
+ if (!conf->spare->operational) {
23869
+ /* probably a SET_DISK_FAULTY ioctl */
23872
+ disk->operational = 0;
23873
+ disk->write_only = 0;
23874
+ conf->spare = NULL;
23875
+ mark_disk_faulty(sb->disks+disk->number);
23876
+ mark_disk_nonsync(sb->disks+disk->number);
23877
+ mark_disk_inactive(sb->disks+disk->number);
23878
+ sb->spare_disks--;
23879
+ sb->working_disks--;
23880
+ sb->failed_disks++;
23882
+ mddev->sb_dirty = 1;
23883
+ evms_cs_wakeup_thread(conf->thread);
23893
+ * Input: a 'big' sector number,
23894
+ * Output: index of the data and parity disk, and the sector # in them.
23896
+static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
23897
+ unsigned int data_disks, unsigned int * dd_idx,
23898
+ unsigned int * pd_idx, raid5_conf_t *conf)
23900
+ unsigned long stripe;
23901
+ unsigned long chunk_number;
23902
+ unsigned int chunk_offset;
23903
+ unsigned long new_sector;
23904
+ int sectors_per_chunk = conf->chunk_size >> 9;
23906
+ /* First compute the information on this sector */
23909
+ * Compute the chunk number and the sector offset inside the chunk
23911
+ chunk_number = r_sector / sectors_per_chunk;
23912
+ chunk_offset = r_sector % sectors_per_chunk;
23915
+ * Compute the stripe number
23917
+ stripe = chunk_number / data_disks;
23920
+ * Compute the data disk and parity disk indexes inside the stripe
23922
+ *dd_idx = chunk_number % data_disks;
23925
+ * Select the parity disk based on the user selected algorithm.
23927
+ if (conf->level == 4)
23928
+ *pd_idx = data_disks;
23929
+ else switch (conf->algorithm) {
23930
+ case ALGORITHM_LEFT_ASYMMETRIC:
23931
+ *pd_idx = data_disks - stripe % raid_disks;
23932
+ if (*dd_idx >= *pd_idx)
23935
+ case ALGORITHM_RIGHT_ASYMMETRIC:
23936
+ *pd_idx = stripe % raid_disks;
23937
+ if (*dd_idx >= *pd_idx)
23940
+ case ALGORITHM_LEFT_SYMMETRIC:
23941
+ *pd_idx = data_disks - stripe % raid_disks;
23942
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
23944
+ case ALGORITHM_RIGHT_SYMMETRIC:
23945
+ *pd_idx = stripe % raid_disks;
23946
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
23949
+ LOG_ERROR(" unsupported algorithm %d\n", conf->algorithm);
23953
+ * Finally, compute the new sector number
23955
+ new_sector = stripe * sectors_per_chunk + chunk_offset;
23956
+ return new_sector;
23959
+#define check_xor() do { \
23960
+ if (count == MAX_XOR_BLOCKS) { \
23961
+ evms_md_xor_block(count, bh_ptr); \
23967
+static void compute_block(struct stripe_head *sh, int dd_idx)
23969
+ raid5_conf_t *conf = sh->raid_conf;
23970
+ int i, count, disks = conf->raid_disks;
23971
+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
23973
+ memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
23974
+ bh_ptr[0] = sh->bh_cache[dd_idx];
23976
+ for (i = disks ; i--; ) {
23979
+ bh = sh->bh_cache[i];
23980
+ if (buffer_uptodate(bh))
23981
+ bh_ptr[count++] = bh;
23983
+ LOG_ERROR("%s: %d, stripe %lu, %d not present\n",
23984
+ __FUNCTION__, dd_idx, sh->sector, i);
23989
+ evms_md_xor_block(count, bh_ptr);
23990
+ set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
23993
+static void compute_parity(struct stripe_head *sh, int method)
23995
+ raid5_conf_t *conf = sh->raid_conf;
23996
+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
23997
+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
23998
+ struct buffer_head *chosen[MD_SB_DISKS];
24000
+ memset(chosen, 0, sizeof(chosen));
24003
+ bh_ptr[0] = sh->bh_cache[pd_idx];
24005
+ case READ_MODIFY_WRITE:
24006
+ if (!buffer_uptodate(sh->bh_cache[pd_idx]))
24008
+ for (i=disks ; i-- ;) {
24011
+ if (sh->bh_write[i] &&
24012
+ buffer_uptodate(sh->bh_cache[i])) {
24013
+ bh_ptr[count++] = sh->bh_cache[i];
24014
+ chosen[i] = sh->bh_write[i];
24015
+ sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
24016
+ chosen[i]->b_reqnext = sh->bh_written[i];
24017
+ sh->bh_written[i] = chosen[i];
24022
+ case RECONSTRUCT_WRITE:
24023
+ memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
24024
+ for (i= disks; i-- ;)
24025
+ if (i!=pd_idx && sh->bh_write[i]) {
24026
+ chosen[i] = sh->bh_write[i];
24027
+ sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
24028
+ chosen[i]->b_reqnext = sh->bh_written[i];
24029
+ sh->bh_written[i] = chosen[i];
24032
+ case CHECK_PARITY:
24036
+ evms_md_xor_block(count, bh_ptr);
24040
+ for (i = disks; i--;)
24042
+ struct buffer_head *bh = sh->bh_cache[i];
24044
+ bdata = bh_kmap(chosen[i]);
24045
+ memcpy(bh->b_data,
24047
+ bh_kunmap(chosen[i]);
24048
+ set_bit(BH_Lock, &bh->b_state);
24049
+ mark_buffer_uptodate(bh, 1);
24053
+ case RECONSTRUCT_WRITE:
24054
+ case CHECK_PARITY:
24055
+ for (i=disks; i--;)
24056
+ if (i != pd_idx) {
24057
+ bh_ptr[count++] = sh->bh_cache[i];
24061
+ case READ_MODIFY_WRITE:
24062
+ for (i = disks; i--;)
24064
+ bh_ptr[count++] = sh->bh_cache[i];
24069
+ evms_md_xor_block(count, bh_ptr);
24071
+ if (method != CHECK_PARITY) {
24072
+ mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
24073
+ set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
24075
+ mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
24078
+static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
24080
+ struct buffer_head **bhp;
24081
+ raid5_conf_t *conf = sh->raid_conf;
24083
+ spin_lock(&sh->lock);
24084
+ spin_lock_irq(&conf->device_lock);
24085
+ bh->b_reqnext = NULL;
24087
+ bhp = &sh->bh_read[dd_idx];
24089
+ bhp = &sh->bh_write[dd_idx];
24091
+ LOG_DEFAULT("EVMS RAID5: multiple %d requests for sector %ld\n", rw, sh->sector);
24092
+ bhp = & (*bhp)->b_reqnext;
24095
+ spin_unlock_irq(&conf->device_lock);
24096
+ spin_unlock(&sh->lock);
24105
+ * handle_stripe - do things to a stripe.
24107
+ * We lock the stripe and then examine the state of various bits
24108
+ * to see what needs to be done.
24109
+ * Possible results:
24110
+ * return some read request which now have data
24111
+ * return some write requests which are safely on disc
24112
+ * schedule a read on some buffers
24113
+ * schedule a write of some buffers
24114
+ * return confirmation of parity correctness
24116
+ * Parity calculations are done inside the stripe lock
24117
+ * buffers are taken off read_list or write_list, and bh_cache buffers
24118
+ * get BH_Lock set before the stripe lock is released.
24122
+static void handle_stripe(struct stripe_head *sh)
24124
+ raid5_conf_t *conf = sh->raid_conf;
24125
+ int disks = conf->raid_disks;
24126
+ struct buffer_head *return_ok= NULL, *return_fail = NULL;
24127
+ int action[MD_SB_DISKS];
24130
+ int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
24131
+ int failed_num=0;
24132
+ struct buffer_head *bh;
24134
+ memset(action, 0, sizeof(action));
24136
+ spin_lock(&sh->lock);
24137
+ clear_bit(STRIPE_HANDLE, &sh->state);
24138
+ clear_bit(STRIPE_DELAYED, &sh->state);
24140
+ syncing = test_bit(STRIPE_SYNCING, &sh->state);
24141
+ /* Now to look around and see what can be done */
24143
+ for (i=disks; i--; ) {
24144
+ bh = sh->bh_cache[i];
24145
+ /* maybe we can reply to a read */
24146
+ if (buffer_uptodate(bh) && sh->bh_read[i]) {
24147
+ struct buffer_head *rbh, *rbh2;
24148
+ spin_lock_irq(&conf->device_lock);
24149
+ rbh = sh->bh_read[i];
24150
+ sh->bh_read[i] = NULL;
24151
+ spin_unlock_irq(&conf->device_lock);
24154
+ bdata = bh_kmap(rbh);
24155
+ memcpy(bdata, bh->b_data, bh->b_size);
24157
+ rbh2 = rbh->b_reqnext;
24158
+ rbh->b_reqnext = return_ok;
24164
+ /* now count some things */
24165
+ if (buffer_locked(bh)) locked++;
24166
+ if (buffer_uptodate(bh)) uptodate++;
24169
+ if (sh->bh_read[i]) to_read++;
24170
+ if (sh->bh_write[i]) to_write++;
24171
+ if (sh->bh_written[i]) written++;
24172
+ if (!conf->disks[i].operational) {
24177
+ /* check if the array has lost two devices and, if so, some requests might
24178
+ * need to be failed
24180
+ if (failed > 1 && to_read+to_write) {
24181
+ for (i=disks; i--; ) {
24182
+ /* fail all writes first */
24183
+ if (sh->bh_write[i]) to_write--;
24184
+ while ((bh = sh->bh_write[i])) {
24185
+ sh->bh_write[i] = bh->b_reqnext;
24186
+ bh->b_reqnext = return_fail;
24187
+ return_fail = bh;
24189
+ /* fail any reads if this device is non-operational */
24190
+ if (!conf->disks[i].operational) {
24191
+ spin_lock_irq(&conf->device_lock);
24192
+ if (sh->bh_read[i]) to_read--;
24193
+ while ((bh = sh->bh_read[i])) {
24194
+ sh->bh_read[i] = bh->b_reqnext;
24195
+ bh->b_reqnext = return_fail;
24196
+ return_fail = bh;
24198
+ spin_unlock_irq(&conf->device_lock);
24202
+ if (failed > 1 && syncing) {
24203
+ evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
24204
+ clear_bit(STRIPE_SYNCING, &sh->state);
24208
+ /* might be able to return some write requests if the parity block
24209
+ * is safe, or on a failed drive
24211
+ bh = sh->bh_cache[sh->pd_idx];
24213
+ ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
24214
+ || (failed == 1 && failed_num == sh->pd_idx))
24216
+ /* any written block on a uptodate or failed drive can be returned */
24217
+ for (i=disks; i--; )
24218
+ if (sh->bh_written[i]) {
24219
+ bh = sh->bh_cache[i];
24220
+ if (!conf->disks[sh->pd_idx].operational ||
24221
+ (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
24222
+ /* maybe we can return some write requests */
24223
+ struct buffer_head *wbh, *wbh2;
24224
+ wbh = sh->bh_written[i];
24225
+ sh->bh_written[i] = NULL;
24227
+ wbh2 = wbh->b_reqnext;
24228
+ wbh->b_reqnext = return_ok;
24236
+ /* Now we might consider reading some blocks, either to check/generate
24237
+ * parity, or to satisfy requests
24239
+ if (to_read || (syncing && (uptodate+failed < disks))) {
24240
+ for (i=disks; i--;) {
24241
+ bh = sh->bh_cache[i];
24242
+ if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
24243
+ (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
24244
+ /* we would like to get this block, possibly
24245
+ * by computing it, but we might not be able to
24247
+ if (uptodate == disks-1) {
24248
+ compute_block(sh, i);
24250
+ } else if (conf->disks[i].operational) {
24251
+ set_bit(BH_Lock, &bh->b_state);
24252
+ action[i] = READ+1;
24253
+ /* if I am just reading this block and we don't have
24254
+ a failed drive, or any pending writes then sidestep the cache */
24255
+ if (sh->bh_page[i]) BUG();
24256
+ if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
24257
+ ! syncing && !failed && !to_write) {
24258
+ sh->bh_page[i] = sh->bh_cache[i]->b_page;
24259
+ sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
24260
+ sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
24264
+ evms_md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
24268
+ set_bit(STRIPE_HANDLE, &sh->state);
24271
+ /* now to consider writing and what else, if anything should be read */
24273
+ int rmw=0, rcw=0;
24274
+ for (i=disks ; i--;) {
24275
+ /* would I have to read this buffer for read_modify_write */
24276
+ bh = sh->bh_cache[i];
24277
+ if ((sh->bh_write[i] || i == sh->pd_idx) &&
24278
+ (!buffer_locked(bh) || sh->bh_page[i]) &&
24279
+ !buffer_uptodate(bh)) {
24280
+ if (conf->disks[i].operational
24281
+/* && !(conf->resync_parity && i == sh->pd_idx) */
24284
+ else rmw += 2*disks; /* cannot read it */
24286
+ /* Would I have to read this buffer for reconstruct_write */
24287
+ if (!sh->bh_write[i] && i != sh->pd_idx &&
24288
+ (!buffer_locked(bh) || sh->bh_page[i]) &&
24289
+ !buffer_uptodate(bh)) {
24290
+ if (conf->disks[i].operational) rcw++;
24291
+ else rcw += 2*disks;
24294
+ set_bit(STRIPE_HANDLE, &sh->state);
24295
+ if (rmw < rcw && rmw > 0)
24296
+ /* prefer read-modify-write, but need to get some data */
24297
+ for (i=disks; i--;) {
24298
+ bh = sh->bh_cache[i];
24299
+ if ((sh->bh_write[i] || i == sh->pd_idx) &&
24300
+ !buffer_locked(bh) && !buffer_uptodate(bh) &&
24301
+ conf->disks[i].operational) {
24302
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24304
+ set_bit(BH_Lock, &bh->b_state);
24305
+ action[i] = READ+1;
24308
+ set_bit(STRIPE_DELAYED, &sh->state);
24309
+ set_bit(STRIPE_HANDLE, &sh->state);
24313
+ if (rcw <= rmw && rcw > 0)
24314
+ /* want reconstruct write, but need to get some data */
24315
+ for (i=disks; i--;) {
24316
+ bh = sh->bh_cache[i];
24317
+ if (!sh->bh_write[i] && i != sh->pd_idx &&
24318
+ !buffer_locked(bh) && !buffer_uptodate(bh) &&
24319
+ conf->disks[i].operational) {
24320
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24322
+ set_bit(BH_Lock, &bh->b_state);
24323
+ action[i] = READ+1;
24326
+ set_bit(STRIPE_DELAYED, &sh->state);
24327
+ set_bit(STRIPE_HANDLE, &sh->state);
24331
+ /* now if nothing is locked, and if we have enough data, we can start a write request */
24332
+ if (locked == 0 && (rcw == 0 ||rmw == 0)) {
24333
+ compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
24334
+ /* now every locked buffer is ready to be written */
24335
+ for (i=disks; i--;)
24336
+ if (buffer_locked(sh->bh_cache[i])) {
24338
+ action[i] = WRITE+1;
24339
+ if (!conf->disks[i].operational
24340
+ || (i==sh->pd_idx && failed == 0))
24341
+ set_bit(STRIPE_INSYNC, &sh->state);
24343
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
24344
+ atomic_dec(&conf->preread_active_stripes);
24345
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
24346
+ evms_cs_wakeup_thread(conf->thread);
24351
+ /* maybe we need to check and possibly fix the parity for this stripe
24352
+ * Any reads will already have been scheduled, so we just see if enough data
24355
+ if (syncing && locked == 0 &&
24356
+ !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
24357
+ set_bit(STRIPE_HANDLE, &sh->state);
24358
+ if (failed == 0) {
24359
+ if (uptodate != disks)
24361
+ compute_parity(sh, CHECK_PARITY);
24363
+ bh = sh->bh_cache[sh->pd_idx];
24364
+ if ((*(u32*)bh->b_data) == 0 &&
24365
+ !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
24366
+ /* parity is correct (on disc, not in buffer any more) */
24367
+ set_bit(STRIPE_INSYNC, &sh->state);
24370
+ if (!test_bit(STRIPE_INSYNC, &sh->state)) {
24371
+ struct disk_info *spare;
24373
+ failed_num = sh->pd_idx;
24374
+ /* should be able to compute the missing block and write it to spare */
24375
+ if (!buffer_uptodate(sh->bh_cache[failed_num])) {
24376
+ if (uptodate+1 != disks)
24378
+ compute_block(sh, failed_num);
24381
+ if (uptodate != disks)
24383
+ bh = sh->bh_cache[failed_num];
24384
+ set_bit(BH_Lock, &bh->b_state);
24385
+ action[failed_num] = WRITE+1;
24387
+ set_bit(STRIPE_INSYNC, &sh->state);
24388
+ if (conf->disks[failed_num].operational)
24389
+ evms_md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
24390
+ else if ((spare=conf->spare))
24391
+ evms_md_sync_acct(spare->dev, bh->b_size>>9);
24395
+ if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
24396
+ evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
24397
+ clear_bit(STRIPE_SYNCING, &sh->state);
24401
+ spin_unlock(&sh->lock);
24403
+ while ((bh=return_ok)) {
24404
+ return_ok = bh->b_reqnext;
24405
+ bh->b_reqnext = NULL;
24406
+ bh->b_end_io(bh, 1);
24408
+ while ((bh=return_fail)) {
24409
+ return_fail = bh->b_reqnext;
24410
+ bh->b_reqnext = NULL;
24411
+ bh->b_end_io(bh, 0);
24413
+ for (i=disks; i-- ;)
24415
+ struct buffer_head *bh = sh->bh_cache[i];
24416
+ struct disk_info *spare = conf->spare;
24417
+ evms_logical_node_t *node = NULL;
24420
+ if (action[i] == READ+1)
24421
+ bh->b_end_io = raid5_end_read_request;
24423
+ bh->b_end_io = raid5_end_write_request;
24424
+ if (conf->disks[i].operational) {
24425
+ bh->b_dev = conf->disks[i].dev;
24426
+ node = conf->disks[i].node;
24427
+ } else if (spare && action[i] == WRITE+1) {
24428
+ bh->b_dev = spare->dev;
24429
+ node = spare->node;
24432
+ atomic_inc(&sh->count);
24433
+ bh->b_rdev = bh->b_dev;
24434
+ bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
24436
+ eio.rsector = bh->b_rsector;
24437
+ eio.rsize = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24438
+ sh->node[i] = node;
24439
+ if (action[i] == READ+1)
24440
+ R_IO(node, &eio);
24442
+ W_IO(node, &eio);
24444
+ clear_bit(BH_Lock, &bh->b_state);
24445
+ set_bit(STRIPE_HANDLE, &sh->state);
24450
+static inline void raid5_activate_delayed(raid5_conf_t *conf)
24452
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
24453
+ while (!list_empty(&conf->delayed_list)) {
24454
+ struct list_head *l = conf->delayed_list.next;
24455
+ struct stripe_head *sh;
24456
+ sh = list_entry(l, struct stripe_head, lru);
24457
+ list_del_init(l);
24458
+ clear_bit(STRIPE_DELAYED, &sh->state);
24459
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24460
+ atomic_inc(&conf->preread_active_stripes);
24461
+ list_add_tail(&sh->lru, &conf->handle_list);
24465
+static void raid5_unplug_device(void *data)
24467
+ raid5_conf_t *conf = (raid5_conf_t *)data;
24468
+ unsigned long flags;
24470
+ spin_lock_irqsave(&conf->device_lock, flags);
24472
+ raid5_activate_delayed(conf);
24474
+ conf->plugged = 0;
24475
+ evms_cs_wakeup_thread(conf->thread);
24477
+ spin_unlock_irqrestore(&conf->device_lock, flags);
24480
+static inline void raid5_plug_device(raid5_conf_t *conf)
24482
+ spin_lock_irq(&conf->device_lock);
24483
+ if (list_empty(&conf->delayed_list))
24484
+ if (!conf->plugged) {
24485
+ conf->plugged = 1;
24486
+ queue_task(&conf->plug_tq, &tq_disk);
24488
+ spin_unlock_irq(&conf->device_lock);
24492
+static int raid5_make_request (mddev_t *mddev,
24496
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24497
+ const unsigned int raid_disks = conf->raid_disks;
24498
+ const unsigned int data_disks = raid_disks - 1;
24499
+ unsigned int dd_idx, pd_idx;
24500
+ unsigned long new_sector;
24501
+ int read_ahead = 0;
24502
+ struct buffer_head *bh = eio->bh;
24504
+ struct stripe_head *sh;
24506
+ /* Note: Need to add 64-bit support in the future */
24507
+ bh->b_size = (unsigned short)eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
24508
+ bh->b_rsector = (unsigned long)eio->rsector;
24509
+ if (rw == READA) {
24514
+ new_sector = raid5_compute_sector(bh->b_rsector,
24515
+ raid_disks, data_disks, &dd_idx, &pd_idx, conf);
24517
+ sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
24519
+ sh->pd_idx = pd_idx;
24521
+ add_stripe_bh(sh, bh, dd_idx, rw);
24523
+ raid5_plug_device(conf);
24524
+ handle_stripe(sh);
24525
+ release_stripe(sh);
24527
+ bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
24532
+ * function: allocate_bh
24534
+ * This function obtains a buffer head from the private
24535
+ * buffer head pool (pre-allocated at EVMS initial
24536
+ * discovery time).
24538
+ * NOTE: All access to the buffer head pool are protected
24539
+ * by a private spinlock.
24542
+static inline struct buffer_head *
24545
+ struct buffer_head *bh =
24546
+ evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
24548
+ init_waitqueue_head(&bh->b_wait);
24554
+ * function: deallocate_bh
24556
+ * This function returns a buffer head to the private
24557
+ * buffer head pool (pre-allocated at EVMS initial
24558
+ * discovery time).
24560
+ * NOTE: All access to the buffer head pool are protected
24561
+ * by a private spinlock.
24564
+static inline void
24565
+deallocate_bh(struct buffer_head *bh)
24567
+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
24570
+/* this is the buffer head control block structure definition */
24571
+typedef struct bh_cb_s {
24573
+ atomic_t blks_allocated;
24574
+ wait_queue_head_t cb_wait;
24578
+ * function: __wait_on_bh_cb
24580
+ * This is a worker function to wait_on_bh_cb.
24581
+ * This function waits for a set of private buffer heads
24582
+ * associated to the specified buffer head control block
24583
+ * to return from I/O completion. On completion of the
24584
+ * last buffer head, the calling function is awakened
24585
+ * and continues running.
24587
+ * This is the worker function to the function wait_on_bh_cb.
24591
+__wait_on_bh_cb(bh_cb_t *bh_cb)
24593
+ struct task_struct *tsk = current;
24594
+ DECLARE_WAITQUEUE(wait, tsk);
24596
+ add_wait_queue(&bh_cb->cb_wait, &wait);
24598
+ run_task_queue(&tq_disk);
24599
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
24600
+ if (!atomic_read(&bh_cb->blks_allocated))
24603
+ } while (atomic_read(&bh_cb->blks_allocated));
24604
+ tsk->state = TASK_RUNNING;
24605
+ remove_wait_queue(&bh_cb->cb_wait, &wait);
24609
+ * function: wait_on_bh_cb
24611
+ * This function waits for a set of private buffer heads
24612
+ * associated to the specified buffer head control block
24613
+ * to return from I/O completion. On completion of the
24614
+ * last buffer head, the calling function is awakened
24615
+ * and continues running.
24619
+wait_on_bh_cb(bh_cb_t *bh_cb)
24621
+ if (atomic_read(&bh_cb->blks_allocated))
24622
+ __wait_on_bh_cb(bh_cb);
24624
+ /* if we ended up with no buffer heads on
24625
+ * this pass, lets wait a until a few buffer
24626
+ * heads have been freed and try again. This
24627
+ * should provide a reasonable delay.
24633
+ * function: end_bh_cb_io
24635
+ * This is the I/O completion function that is called for
24636
+ * each private buffer head obtained from the buffer head
24637
+ * pool. Control is return thru this routine so we can track
24638
+ * all outstanding requests to know when to awaken the caller,
24639
+ * and to regain control after all I/Os have been performed.
24643
+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
24645
+ bh_cb_t *bh_cb = (bh_cb_t *)bh->b_private;
24647
+ /* record that errors occurred */
24649
+ bh_cb->rc = -EIO;
24651
+ mark_buffer_uptodate(bh, uptodate);
24652
+ unlock_buffer(bh);
24654
+ deallocate_bh(bh);
24655
+ atomic_dec(&bh_cb->blks_allocated);
24656
+ if (!atomic_read(&bh_cb->blks_allocated))
24657
+ if (waitqueue_active(&bh_cb->cb_wait))
24658
+ wake_up(&bh_cb->cb_wait);
24662
+ * function: md_raid5_internal_partial_sector_io
24664
+ * This function is a support function for md_raid5_internal_io,
24665
+ * which handles the cases of performing I/O to only a part
24666
+ * of sector. This function is not designed to be called
24667
+ * directly, other than by md_raid5_internal_io.
24671
+md_raid5_internal_partial_sector_io(
24675
+ u_int64_t next_offset,
24676
+ u_int64_t sector_offset,
24677
+ u_int64_t io_size,
24679
+ unsigned char **sector_buf )
24682
+ struct buffer_head *bh;
24684
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24686
+ if (*sector_buf == NULL)
24687
+ /* allocate buffer for incoming sector */
24688
+ rc = evms_cs_allocate_memory((void **)sector_buf,
24689
+ conf->buffer_size);
24691
+ /* allocate a buffer head from the pool */
24692
+ while((bh = allocate_bh()) == NULL)
24693
+ /* yielding the cpu is playing it
24694
+ * safe. it might be wiser to just
24695
+ * spin. requires more thought.
24699
+ /* set up the buffer head for this sector */
24700
+ bh->b_end_io = end_bh_cb_io_sync;
24701
+ bh->b_size = conf->buffer_size;
24703
+ bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
24704
+ bh->b_data = *sector_buf;
24705
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24707
+ set_bit(BH_Dirty, &bh->b_state);
24708
+ set_bit(BH_Lock, &bh->b_state);
24709
+ set_bit(BH_Req, &bh->b_state);
24710
+ set_bit(BH_Mapped, &bh->b_state);
24711
+ bh->b_private = (void *)bh_cb;
24712
+ atomic_inc(&bh_cb->blks_allocated);
24714
+ /* drive the buffer head down */
24715
+ /* to the device */
24717
+ eio.rsector = bh->b_rsector;
24718
+ eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24719
+ raid5_make_request(mddev, READ, &eio);
24721
+ /* wait for all bh's I/O's to end */
24722
+ wait_on_bh_cb(bh_cb);
24724
+ /* copy data to/from user */
24725
+ if (io_flag != WRITE)
24728
+ *sector_buf + sector_offset,
24732
+ memcpy(*sector_buf + sector_offset,
24736
+ /* allocate a buffer head from the pool */
24737
+ while((bh = allocate_bh()) == NULL)
24738
+ /* yielding the cpu is playing it
24739
+ * safe. it might be wiser to just
24740
+ * spin. requires more thought.
24744
+ /* set up the buffer head for this sector */
24745
+ bh->b_end_io = end_bh_cb_io_sync;
24746
+ bh->b_size = conf->buffer_size;
24748
+ bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
24749
+ bh->b_data = *sector_buf;
24750
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24752
+ set_bit(BH_Dirty, &bh->b_state);
24753
+ set_bit(BH_Lock, &bh->b_state);
24754
+ set_bit(BH_Req, &bh->b_state);
24755
+ set_bit(BH_Mapped, &bh->b_state);
24756
+ bh->b_private = (void *)bh_cb;
24757
+ atomic_inc(&bh_cb->blks_allocated);
24759
+ /* drive the buffer head down */
24760
+ /* to the device */
24762
+ eio.rsector = bh->b_rsector;
24763
+ eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24764
+ raid5_make_request(mddev, WRITE, &eio);
24766
+ /* wait for all bh's I/O's to end */
24767
+ wait_on_bh_cb(bh_cb);
24774
+ * function: md_raid5_internal_io
24776
+ * This function provides support for synchronous I/O
24777
+ * operations to the underlying devices. These I/O
24778
+ * operations are NOT buffered in any way including the
24779
+ * operating system's buffer cache.
24781
+ * This function can work with any hardsector size that
24782
+ * is a power of 2.
24784
+ * node : logical node of the target logical disk
24785
+ * io_flag : 0 = read, 1 = write, 2 = read-a-head
24786
+ * starting_offset: the 0-based (disk relative) byte offset
24787
+ * num_bytes : the total number of bytes in this I/O
24788
+ * bufptr : address of the memory to read/write the data
24792
+md_raid5_internal_io(
24795
+ u_int64_t starting_offset,
24796
+ u_int64_t num_bytes,
24800
+ u_int64_t next_offset, remaining_bytes;
24801
+ char *cur_bufptr;
24803
+ unsigned char *sector_buf = NULL;
24804
+ evms_logical_node_t *node = mddev->node;
24805
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24807
+ LOG_EVERYTHING("%s: node(%s), ioflag(%u), start_offset(%Lu), num_bytes(%Lu), bufptr(0x%p)\n",
24808
+ __FUNCTION__, node->name, io_flag, starting_offset, num_bytes, bufptr);
24810
+ /* check for 0 length request */
24811
+ if ( num_bytes == 0 ) {
24812
+ LOG_ERROR("%s: error requesting 0 bytes.\n", __FUNCTION__);
24815
+ /* check for out of bound request */
24817
+ u64 node_total_bytes =
24818
+ node->total_vsectors <<
24819
+ EVMS_VSECTOR_SIZE_SHIFT;
24820
+ if ( (starting_offset + num_bytes) > node_total_bytes) {
24821
+ LOG_ERROR("%s: attempted %s beyond boundary(%Lu bytes), requesting offset(%Lu), length(%Lu).\n",
24822
+ __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
24823
+ node_total_bytes, starting_offset, num_bytes);
24827
+ /* check for invalid io_flag value */
24829
+ switch( io_flag ) {
24830
+ case READ: /* read... */
24831
+ case WRITE: /* write... */
24832
+ case READA: /* reada... */
24839
+ /* initialize the buffer head control block */
24840
+ memset(&bh_cb, 0, sizeof(bh_cb_t));
24841
+ init_waitqueue_head(&bh_cb.cb_wait);
24843
+ /* only update the local copy of variables */
24844
+ cur_bufptr = bufptr;
24845
+ next_offset = starting_offset;
24846
+ remaining_bytes = num_bytes;
24848
+ /* continue if no errors found */
24850
+ u_int64_t sector_offset;
24852
+ /* check for a mid-sector starting offset
24854
+ * if found, perform I/O on part of that
24857
+ sector_offset = next_offset & (conf->buffer_size - 1);
24858
+ if (sector_offset) {
24859
+ u_int64_t io_size;
24861
+ /* determine bytes in IO to this sector */
24862
+ io_size = conf->buffer_size - sector_offset;
24863
+ if (io_size > remaining_bytes)
24864
+ io_size = remaining_bytes;
24866
+ /* perform the partial sector io */
24867
+ rc = md_raid5_internal_partial_sector_io(
24868
+ mddev,io_flag,&bh_cb,
24870
+ sector_offset, io_size,
24871
+ cur_bufptr, §or_buf);
24874
+ /* update progress in local variables */
24875
+ cur_bufptr += io_size;
24876
+ next_offset += io_size;
24877
+ remaining_bytes -= io_size;
24882
+ /* continue if no errors found */
24884
+ /* perform I/O on all the complete sectors
24885
+ * in this request.
24887
+ * loop until there are no more complete sectors
24890
+ while(remaining_bytes >= conf->buffer_size) {
24891
+ /* this inner loop attempts to drive as many
24892
+ * bytes (in sector size multiples) down to
24893
+ * the device as possible using the available
24894
+ * buffer heads in the pool.
24896
+ while(remaining_bytes >= conf->buffer_size) {
24897
+ struct buffer_head *bh;
24900
+ /* allocate a buffer head from the pool */
24901
+ bh = allocate_bh();
24902
+ if (bh == NULL) break;
24904
+ /* set up the buffer head for this I/O */
24905
+ bh->b_end_io = end_bh_cb_io_sync;
24906
+ bh->b_size = conf->buffer_size;
24907
+ bh->b_data = cur_bufptr;
24909
+ bh->b_rsector = next_offset >> EVMS_VSECTOR_SIZE_SHIFT;
24910
+ bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24912
+ set_bit(BH_Dirty, &bh->b_state);
24913
+ set_bit(BH_Lock, &bh->b_state);
24914
+ set_bit(BH_Req, &bh->b_state);
24915
+ set_bit(BH_Mapped, &bh->b_state);
24916
+ bh->b_private = (void *)&bh_cb;
24917
+ atomic_inc(&bh_cb.blks_allocated);
24919
+ /* drive the buffer head down */
24920
+ /* to the device */
24922
+ eio.rsector = bh->b_rsector;
24923
+ eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24924
+ raid5_make_request(mddev, io_flag, &eio);
24926
+ /* update progress in local variables */
24927
+ cur_bufptr += bh->b_size;
24928
+ next_offset += bh->b_size;
24929
+ remaining_bytes -= bh->b_size;
24931
+ /* wait for all bh's I/O's to end */
24932
+ wait_on_bh_cb(&bh_cb);
24936
+ /* continue if no errors found */
24938
+ /* check for a mid-sector ending offset
24940
+ * if found, perform I/O on part of that
24943
+ if (remaining_bytes)
24944
+ /* perform the partial sector io */
24945
+ rc = md_raid5_internal_partial_sector_io(
24946
+ mddev, io_flag, &bh_cb,
24948
+ 0, remaining_bytes,
24949
+ cur_bufptr, §or_buf);
24951
+ /* free the sector buffer if it was allocated */
24953
+ evms_cs_deallocate_memory(sector_buf);
24955
+ /* coalesce return codes */
24958
+ LOG_EVERYTHING("%s: rc(%u)\n", __FUNCTION__, rc);
24966
+ evms_sector_t startingLSN,
24967
+ evms_sector_t numLSNs,
24971
+ u_int64_t starting_offset, num_bytes;
24973
+ starting_offset = startingLSN;
24974
+ starting_offset <<= EVMS_VSECTOR_SIZE_SHIFT;
24975
+ num_bytes = numLSNs;
24976
+ num_bytes <<= EVMS_VSECTOR_SIZE_SHIFT;
24977
+ rc = md_raid5_internal_io(mddev,io_flag,starting_offset,
24978
+ num_bytes, bufptr);
24982
+static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
24984
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24985
+ struct stripe_head *sh;
24986
+ int sectors_per_chunk = conf->chunk_size >> 9;
24987
+ unsigned long stripe = sector_nr/sectors_per_chunk;
24988
+ int chunk_offset = sector_nr % sectors_per_chunk;
24989
+ int dd_idx, pd_idx;
24990
+ unsigned long first_sector;
24991
+ int raid_disks = conf->raid_disks;
24992
+ int data_disks = raid_disks-1;
24996
+ sh = get_active_stripe(conf, sector_nr, 0, 0);
24997
+ bufsize = sh->size;
24998
+ redone = sector_nr - sh->sector;
24999
+ first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
25000
+ + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
25001
+ sh->pd_idx = pd_idx;
25002
+ spin_lock(&sh->lock);
25003
+ set_bit(STRIPE_SYNCING, &sh->state);
25004
+ clear_bit(STRIPE_INSYNC, &sh->state);
25005
+ sh->sync_redone = redone;
25006
+ spin_unlock(&sh->lock);
25008
+ handle_stripe(sh);
25009
+ release_stripe(sh);
25011
+ return (bufsize>>9)-redone;
25015
+ * This is our raid5 kernel thread.
25017
+ * We scan the hash table for stripes which can be handled now.
25018
+ * During the scan, completed stripes are saved for us by the interrupt
25019
+ * handler, so that they will not have to wait for our next wakeup.
25021
+static void raid5d (void *data)
25023
+ struct stripe_head *sh;
25024
+ raid5_conf_t *conf = data;
25025
+ mddev_t *mddev = conf->mddev;
25028
+ LOG_ENTRY_EXIT("+++ raid5d active\n");
25032
+ if (mddev->sb_dirty) {
25033
+ mddev->sb_dirty = 0;
25034
+ evms_md_update_sb(mddev);
25036
+ md_spin_lock_irq(&conf->device_lock);
25038
+ struct list_head *first;
25040
+ if (list_empty(&conf->handle_list) &&
25041
+ atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
25042
+ !conf->plugged &&
25043
+ !list_empty(&conf->delayed_list))
25044
+ raid5_activate_delayed(conf);
25046
+ if (list_empty(&conf->handle_list))
25049
+ first = conf->handle_list.next;
25050
+ sh = list_entry(first, struct stripe_head, lru);
25052
+ list_del_init(first);
25053
+ atomic_inc(&sh->count);
25054
+ if (atomic_read(&sh->count)!= 1)
25056
+ md_spin_unlock_irq(&conf->device_lock);
25059
+ handle_stripe(sh);
25060
+ release_stripe(sh);
25062
+ md_spin_lock_irq(&conf->device_lock);
25064
+ LOG_DEBUG("%d stripes handled\n", handled);
25066
+ md_spin_unlock_irq(&conf->device_lock);
25068
+ LOG_ENTRY_EXIT("+++ raid5d inactive\n");
25072
+ * Private kernel thread for parity reconstruction after an unclean
25073
+ * shutdown. Reconstruction on spare drives in case of a failed drive
25074
+ * is done by the generic mdsyncd.
25076
+static void raid5syncd (void *data)
25078
+ raid5_conf_t *conf = data;
25079
+ mddev_t *mddev = conf->mddev;
25081
+ if (!conf->resync_parity)
25083
+ if (conf->resync_parity == 2)
25085
+ down(&mddev->recovery_sem);
25086
+ if (evms_md_do_sync(mddev,NULL)) {
25087
+ up(&mddev->recovery_sem);
25088
+ LOG_WARNING("resync aborted!\n");
25091
+ conf->resync_parity = 0;
25092
+ up(&mddev->recovery_sem);
25093
+ LOG_DEFAULT("resync finished.\n");
25096
+static int raid5_run (mddev_t *mddev)
25098
+ raid5_conf_t *conf;
25099
+ int i, j, raid_disk, memory;
25100
+ mdp_super_t *sb = mddev->sb;
25101
+ mdp_disk_t *desc;
25102
+ mdk_rdev_t *rdev;
25103
+ struct disk_info *disk;
25104
+ struct md_list_head *tmp;
25105
+ int start_recovery = 0;
25107
+ MOD_INC_USE_COUNT;
25109
+ if (sb->level != 5 && sb->level != 4) {
25110
+ LOG_ERROR("%s: [md%d] raid level not set to 4/5 (%d)\n",
25111
+ __FUNCTION__, mdidx(mddev), sb->level);
25112
+ MOD_DEC_USE_COUNT;
25116
+ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
25117
+ if ((conf = mddev->private) == NULL)
25119
+ memset (conf, 0, sizeof (*conf));
25120
+ conf->mddev = mddev;
25122
+ if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
25124
+ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
25126
+ conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
25127
+ md_init_waitqueue_head(&conf->wait_for_stripe);
25128
+ INIT_LIST_HEAD(&conf->handle_list);
25129
+ INIT_LIST_HEAD(&conf->delayed_list);
25130
+ INIT_LIST_HEAD(&conf->inactive_list);
25131
+ atomic_set(&conf->active_stripes, 0);
25132
+ atomic_set(&conf->preread_active_stripes, 0);
25133
+ conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
25135
+ conf->plugged = 0;
25136
+ conf->plug_tq.sync = 0;
25137
+ conf->plug_tq.routine = &raid5_unplug_device;
25138
+ conf->plug_tq.data = conf;
25140
+ ITERATE_RDEV(mddev,rdev,tmp) {
25142
+ * This is important -- we are using the descriptor on
25143
+ * the disk only to get a pointer to the descriptor on
25144
+ * the main superblock, which might be more recent.
25146
+ desc = sb->disks + rdev->desc_nr;
25147
+ raid_disk = desc->raid_disk;
25148
+ disk = conf->disks + raid_disk;
25150
+ if (disk_faulty(desc)) {
25151
+ LOG_ERROR("%s: disabled device %s (errors detected)\n",
25152
+ __FUNCTION__, evms_md_partition_name(rdev->node));
25153
+ if (!rdev->faulty) {
25157
+ disk->number = desc->number;
25158
+ disk->raid_disk = raid_disk;
25159
+ disk->dev = rdev->dev;
25160
+ disk->node = rdev->node;
25162
+ disk->operational = 0;
25163
+ disk->write_only = 0;
25165
+ disk->used_slot = 1;
25168
+ if (disk_active(desc)) {
25169
+ if (!disk_sync(desc)) {
25170
+ LOG_ERROR("%s: disabled device %s (not in sync)\n",
25171
+ __FUNCTION__, evms_md_partition_name(rdev->node));
25175
+ if (raid_disk > sb->raid_disks) {
25176
+ LOG_ERROR("%s: disabled device %s (inconsistent descriptor)\n",
25177
+ __FUNCTION__, evms_md_partition_name(rdev->node));
25180
+ if (disk->operational) {
25181
+ LOG_ERROR("%s: disabled device %s (device %d already operational)\n",
25182
+ __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
25185
+ LOG_DEFAULT("%s: device %s operational as raid disk %d\n",
25186
+ __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
25188
+ disk->number = desc->number;
25189
+ disk->raid_disk = raid_disk;
25190
+ disk->dev = rdev->dev;
25191
+ disk->node = rdev->node;
25192
+ disk->operational = 1;
25193
+ disk->used_slot = 1;
25195
+ conf->working_disks++;
25198
+ * Must be a spare disk ..
25200
+ LOG_DEFAULT(" spare disk %s\n", evms_md_partition_name(rdev->node));
25201
+ disk->number = desc->number;
25202
+ disk->raid_disk = raid_disk;
25203
+ disk->dev = rdev->dev;
25204
+ disk->node = rdev->node;
25206
+ disk->operational = 0;
25207
+ disk->write_only = 0;
25209
+ disk->used_slot = 1;
25213
+ for (i = 0; i < MD_SB_DISKS; i++) {
25214
+ desc = sb->disks + i;
25215
+ raid_disk = desc->raid_disk;
25216
+ disk = conf->disks + raid_disk;
25218
+ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
25219
+ !conf->disks[raid_disk].used_slot) {
25221
+ disk->number = desc->number;
25222
+ disk->raid_disk = raid_disk;
25223
+ disk->dev = MKDEV(0,0);
25224
+ disk->node = NULL;
25226
+ disk->operational = 0;
25227
+ disk->write_only = 0;
25229
+ disk->used_slot = 1;
25233
+ conf->raid_disks = sb->raid_disks;
25235
+ * faied_disks: 0 for a fully functional array, 1 for a degraded array.
25237
+ conf->failed_disks = conf->raid_disks - conf->working_disks;
25238
+ conf->mddev = mddev;
25239
+ conf->chunk_size = sb->chunk_size;
25240
+ conf->level = sb->level;
25241
+ conf->algorithm = sb->layout;
25242
+ conf->max_nr_stripes = NR_STRIPES;
25245
+ * If chunk_size is validated in md_core.c, why do it again?
25246
+ * And the check in md_core is:
25247
+ * chunk_size has to be a power of 2 and multiples of PAGE_SIZE
25250
+ if (!conf->chunk_size ||
25251
+ ( (1 << ffz(~conf->chunk_size)) != conf->chunk_size) ||
25252
+ (conf->chunk_size < PAGE_SIZE)) {
25253
+ LOG_ERROR("%s: invalid chunk size %d for md%d\n", __FUNCTION__, conf->chunk_size, mdidx(mddev));
25256
+ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
25257
+ LOG_ERROR(" unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
25260
+ if (conf->failed_disks > 1) {
25261
+ LOG_ERROR(" not enough operational devices for md%d (%d/%d failed)\n",
25262
+ mdidx(mddev), conf->failed_disks, conf->raid_disks);
25266
+ if (conf->working_disks != sb->raid_disks) {
25267
+ LOG_WARNING(" md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
25268
+ start_recovery = 1;
25272
+ const char * name = "evms_raid5d";
25274
+ conf->thread = evms_cs_register_thread(raid5d, conf, name);
25275
+ if (!conf->thread) {
25276
+ LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
25281
+ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
25282
+ conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
25283
+ if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
25284
+ LOG_ERROR("%s: couldn't allocate %dkB for buffers\n", __FUNCTION__, memory);
25285
+ shrink_stripes(conf, conf->max_nr_stripes);
25288
+ LOG_DETAILS("%s: allocated %dkB for md%d\n", __FUNCTION__, memory, mdidx(mddev));
25291
+ * Regenerate the "device is in sync with the raid set" bit for
25294
+ for (i = 0; i < MD_SB_DISKS ; i++) {
25295
+ mark_disk_nonsync(sb->disks + i);
25296
+ for (j = 0; j < sb->raid_disks; j++) {
25297
+ if (!conf->disks[j].operational)
25299
+ if (sb->disks[i].number == conf->disks[j].number)
25300
+ mark_disk_sync(sb->disks + i);
25303
+ sb->active_disks = conf->working_disks;
25305
+ if (sb->active_disks == sb->raid_disks) {
25306
+ LOG_DETAILS("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
25307
+ __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
25309
+ LOG_WARNING("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
25310
+ __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
25313
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
25314
+ const char * name = "evms_raid5syncd";
25316
+ conf->resync_thread = evms_cs_register_thread(raid5syncd, conf,name);
25317
+ if (!conf->resync_thread) {
25318
+ LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
25322
+ LOG_WARNING("%s: raid set md%d not clean; reconstructing parity\n", __FUNCTION__, mdidx(mddev));
25323
+ conf->resync_parity = 1;
25324
+ evms_cs_wakeup_thread(conf->resync_thread);
25327
+ print_raid5_conf(conf);
25328
+ if (start_recovery)
25329
+ evms_md_recover_arrays();
25330
+ print_raid5_conf(conf);
25332
+ /* Ok, everything is just fine now */
25336
+ print_raid5_conf(conf);
25337
+ if (conf->stripe_hashtbl)
25338
+ free_pages((unsigned long) conf->stripe_hashtbl,
25339
+ HASH_PAGES_ORDER);
25342
+ mddev->private = NULL;
25343
+ LOG_WARNING("%s: failed to run raid set md%d\n", __FUNCTION__, mdidx(mddev));
25344
+ MOD_DEC_USE_COUNT;
25348
+static int raid5_stop_resync (mddev_t *mddev)
25350
+ raid5_conf_t *conf = mddev_to_conf(mddev);
25351
+ evms_thread_t *thread;
25353
+ if (conf == NULL) {
25357
+ thread = conf->resync_thread;
25360
+ if (conf->resync_parity) {
25361
+ conf->resync_parity = 2;
25362
+ evms_cs_interrupt_thread(thread);
25363
+ LOG_WARNING("%s: parity resync was not fully finished, restarting next time.\n", __FUNCTION__);
25371
+static int raid5_restart_resync (mddev_t *mddev)
25373
+ raid5_conf_t *conf = mddev_to_conf(mddev);
25375
+ if (conf->resync_parity) {
25376
+ if (!conf->resync_thread) {
25380
+ LOG_DEFAULT("%s: waking up raid5resync.\n", __FUNCTION__);
25381
+ conf->resync_parity = 1;
25382
+ evms_cs_wakeup_thread(conf->resync_thread);
25385
+ LOG_DEFAULT("%s: no restart-resync needed.\n", __FUNCTION__);
25390
+static int raid5_stop (mddev_t *mddev)
25392
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25394
+ if (conf != NULL) {
25395
+ if (conf->resync_thread)
25396
+ evms_cs_unregister_thread(conf->resync_thread);
25397
+ evms_cs_unregister_thread(conf->thread);
25398
+ shrink_stripes(conf, conf->max_nr_stripes);
25399
+ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
25401
+ mddev->private = NULL;
25403
+ MOD_DEC_USE_COUNT;
25408
+static void print_sh (struct stripe_head *sh)
25412
+ LOG_DEFAULT("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
25413
+ LOG_DEFAULT("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
25414
+ LOG_DEFAULT("sh %lu, ", sh->sector);
25415
+ for (i = 0; i < MD_SB_DISKS; i++) {
25416
+ if (sh->bh_cache[i])
25417
+ LOG_DEFAULT("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
25419
+ LOG_DEFAULT("\n");
25422
+static void printall (raid5_conf_t *conf)
25424
+ struct stripe_head *sh;
25427
+ md_spin_lock_irq(&conf->device_lock);
25428
+ for (i = 0; i < NR_HASH; i++) {
25429
+ sh = conf->stripe_hashtbl[i];
25430
+ for (; sh; sh = sh->hash_next) {
25431
+ if (sh->raid_conf != conf)
25436
+ md_spin_unlock_irq(&conf->device_lock);
25440
+static int raid5_status (char *page, mddev_t *mddev)
25442
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25443
+ mdp_super_t *sb = mddev->sb;
25446
+ sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
25447
+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
25448
+ for (i = 0; i < conf->raid_disks; i++)
25449
+ sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
25450
+ sz += sprintf (page+sz, "]");
25453
+ sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
25459
+static void print_raid5_conf (raid5_conf_t *conf)
25462
+ struct disk_info *tmp;
25464
+ LOG_DEFAULT("RAID5 conf printout:\n");
25466
+ LOG_DEFAULT("(conf==NULL)\n");
25469
+ LOG_DEFAULT(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
25470
+ conf->working_disks, conf->failed_disks);
25473
+ for (i = 0; i < MD_SB_DISKS; i++) {
25475
+ for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
25477
+ tmp = conf->disks + i;
25478
+ LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
25479
+ i, tmp->spare,tmp->operational,
25480
+ tmp->number,tmp->raid_disk,tmp->used_slot,
25481
+ evms_md_partition_name(tmp->node));
25485
+static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
25488
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
25489
+ raid5_conf_t *conf = mddev->private;
25490
+ struct disk_info *tmp, *sdisk, *fdisk, *rdisk;
25491
+ mdp_super_t *sb = mddev->sb;
25492
+ mdp_disk_t *failed_desc, *spare_desc;
25493
+ mdk_rdev_t *spare_rdev, *failed_rdev;
25495
+ print_raid5_conf(conf);
25496
+ md_spin_lock_irq(&conf->device_lock);
25498
+ * find the disk ...
25502
+ case DISKOP_SPARE_ACTIVE:
25505
+ * Find the failed disk within the RAID5 configuration ...
25506
+ * (this can only be in the first conf->raid_disks part)
25508
+ for (i = 0; i < conf->raid_disks; i++) {
25509
+ tmp = conf->disks + i;
25510
+ if ((!tmp->operational && !tmp->spare) ||
25511
+ !tmp->used_slot) {
25517
+ * When we activate a spare disk we _must_ have a disk in
25518
+ * the lower (active) part of the array to replace.
25520
+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
25525
+ /* fall through */
25527
+ case DISKOP_SPARE_WRITE:
25528
+ case DISKOP_SPARE_INACTIVE:
25531
+ * Find the spare disk ... (can only be in the 'high'
25532
+ * area of the array)
25534
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
25535
+ tmp = conf->disks + i;
25536
+ if (tmp->spare && tmp->number == (*d)->number) {
25541
+ if (spare_disk == -1) {
25548
+ case DISKOP_HOT_REMOVE_SPARE:
25550
+ for (i = 0; i < MD_SB_DISKS; i++) {
25551
+ tmp = conf->disks + i;
25552
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
25553
+ if (tmp->operational) {
25556
+ } else if (!tmp->spare) {
25561
+ removed_disk = i;
25565
+ if (removed_disk == -1) {
25572
+ case DISKOP_HOT_REMOVE_DISK:
25573
+ for (i = 0; i < MD_SB_DISKS; i++) {
25574
+ tmp = conf->disks + i;
25575
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
25576
+ if (i < conf->raid_disks) {
25577
+ if (conf->working_disks != conf->raid_disks) {
25579
+ * Can't remove a disk from an
25580
+ * array that is running in
25586
+ if (sb->spare_disks == 0) {
25588
+ * Must have a spare ready
25589
+ * before removing an active
25596
+ removed_disk = i;
25600
+ if (removed_disk == -1) {
25607
+ case DISKOP_HOT_ADD_DISK:
25615
+ * Switch the spare disk to write-only mode:
25617
+ case DISKOP_SPARE_WRITE:
25618
+ if (conf->spare) {
25623
+ sdisk = conf->disks + spare_disk;
25624
+ sdisk->operational = 1;
25625
+ sdisk->write_only = 1;
25626
+ conf->spare = sdisk;
25629
+ * Deactivate a spare disk:
25631
+ case DISKOP_SPARE_INACTIVE:
25632
+ sdisk = conf->disks + spare_disk;
25633
+ sdisk->operational = 0;
25634
+ sdisk->write_only = 0;
25636
+ * Was the spare being resynced?
25638
+ if (conf->spare == sdisk)
25639
+ conf->spare = NULL;
25642
+ * Activate (mark read-write) the (now sync) spare disk,
25643
+ * which means we switch it's 'raid position' (->raid_disk)
25644
+ * with the failed disk. (only the first 'conf->raid_disks'
25645
+ * slots are used for 'real' disks and we must preserve this
25648
+ case DISKOP_SPARE_ACTIVE:
25649
+ if (!conf->spare) {
25654
+ sdisk = conf->disks + spare_disk;
25655
+ fdisk = conf->disks + failed_disk;
25657
+ spare_desc = &sb->disks[sdisk->number];
25658
+ failed_desc = &sb->disks[fdisk->number];
25660
+ if (spare_desc != *d) {
25666
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
25672
+ if (sdisk->raid_disk != spare_disk) {
25678
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
25684
+ if (fdisk->raid_disk != failed_disk) {
25691
+ * do the switch finally
25693
+ spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
25694
+ failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
25696
+ /* There must be a spare_rdev, but there may not be a
25697
+ * failed_rdev. That slot might be empty...
25699
+ spare_rdev->desc_nr = failed_desc->number;
25701
+ failed_rdev->desc_nr = spare_desc->number;
25703
+ xchg_values(*spare_desc, *failed_desc);
25704
+ xchg_values(*fdisk, *sdisk);
25707
+ * (careful, 'failed' and 'spare' are switched from now on)
25709
+ * we want to preserve linear numbering and we want to
25710
+ * give the proper raid_disk number to the now activated
25711
+ * disk. (this means we switch back these values)
25714
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
25715
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
25716
+ xchg_values(spare_desc->number, failed_desc->number);
25717
+ xchg_values(sdisk->number, fdisk->number);
25719
+ *d = failed_desc;
25721
+ //if (sdisk->dev == MKDEV(0,0))
25722
+ if (sdisk->node == NULL)
25723
+ sdisk->used_slot = 0;
25726
+ * this really activates the spare.
25728
+ fdisk->spare = 0;
25729
+ fdisk->write_only = 0;
25732
+ * if we activate a spare, we definitely replace a
25733
+ * non-operational disk slot in the 'low' area of
25734
+ * the disk array.
25736
+ conf->failed_disks--;
25737
+ conf->working_disks++;
25738
+ conf->spare = NULL;
25742
+ case DISKOP_HOT_REMOVE_SPARE:
25743
+ rdisk = conf->disks + removed_disk;
25745
+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
25750
+ if (conf->spare != NULL) {
25751
+ if (conf->spare->number == removed_disk) {
25752
+ conf->spare = NULL;
25756
+ rdisk->dev = MKDEV(0,0);
25757
+ rdisk->node = NULL;
25758
+ rdisk->used_slot = 0;
25762
+ case DISKOP_HOT_REMOVE_DISK:
25763
+ rdisk = conf->disks + removed_disk;
25764
+ if (rdisk->operational) {
25765
+ /* We're removing a running disk in the array. */
25766
+ conf->working_disks--;
25767
+ conf->failed_disks++;
25769
+ rdisk->dev = MKDEV(0,0);
25770
+ rdisk->node = NULL;
25771
+ rdisk->used_slot = 0;
25772
+ rdisk->operational = 0;
25781
+ md_spin_unlock_irq(&conf->device_lock);
25782
+ print_raid5_conf(conf);
25786
+static int raid5_bmap(mddev_t *mddev, evms_sector_t *rsector, evms_logical_node_t **node)
25788
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25789
+ const unsigned int raid_disks = conf->raid_disks;
25790
+ const unsigned int data_disks = raid_disks - 1;
25791
+ unsigned int dd_idx, pd_idx;
25793
+ *rsector = (evms_sector_t)raid5_compute_sector((unsigned long)*rsector,
25799
+ *node = conf->disks[dd_idx].node;
25800
+ return 0; /* always successful */
25803
+static int raid5_evms_ioctl (
25805
+ struct inode * inode,
25806
+ struct file * file,
25807
+ unsigned int cmd,
25808
+ unsigned long arg)
25811
+ evms_logical_node_t *node;
25814
+ case EVMS_GET_BMAP:
25816
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
25817
+ rc = raid5_bmap(mddev,&bmap->rsector,&node);
25820
+ rc = IOCTL(node, inode, file, cmd, arg);
25833
+static int raid5_pers_ioctl(mddev_t *mddev, int cmd, void * args){
25836
+ raid5_ioctl_init_io_t init_io_args;
25839
+ LOG_DETAILS("%s: cmd == %d.\n", __FUNCTION__, cmd);
25841
+ case EVMS_MD_RAID5_INIT_IO:
25843
+ if (copy_from_user(&init_io_args, (raid5_ioctl_init_io_t*)args, sizeof(init_io_args)) ) {
25847
+ rc = evms_cs_allocate_memory(&data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT);
25852
+ if (copy_from_user(data, init_io_args.data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT)) {
25853
+ evms_cs_deallocate_memory(data);
25857
+ rc = raid5_init_io(mddev, init_io_args.rw,
25858
+ init_io_args.lsn, init_io_args.nr_sects,data);
25860
+ copy_to_user(init_io_args.data, data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT);
25861
+ evms_cs_deallocate_memory(data);
25863
+ copy_to_user((raid5_ioctl_init_io_t*)args, &init_io_args, sizeof(init_io_args));
25874
+static mdk_personality_t raid5_personality=
25876
+ name: "evms_raid5",
25877
+ init_io: raid5_init_io,
25878
+ make_request: raid5_make_request,
25880
+ stop: raid5_stop,
25881
+ status: raid5_status,
25882
+ error_handler: raid5_error,
25883
+ diskop: raid5_diskop,
25884
+ stop_resync: raid5_stop_resync,
25885
+ restart_resync: raid5_restart_resync,
25886
+ sync_request: raid5_sync_request,
25887
+ evms_ioctl: raid5_evms_ioctl,
25888
+ md_pers_ioctl: raid5_pers_ioctl
25891
+static int md__init raid5_init (void)
25893
+ return evms_register_md_personality (RAID5, &raid5_personality);
25896
+static void raid5_exit (void)
25898
+ evms_unregister_md_personality (RAID5);
25901
+module_init(raid5_init);
25902
+module_exit(raid5_exit);
25903
+#ifdef MODULE_LICENSE
25904
+MODULE_LICENSE("GPL");
25906
diff -Naur linux-2002-03-28/drivers/evms/md_xor.c evms-2002-03-28/drivers/evms/md_xor.c
25907
--- linux-2002-03-28/drivers/evms/md_xor.c Wed Dec 31 18:00:00 1969
25908
+++ evms-2002-03-28/drivers/evms/md_xor.c Fri Mar 1 11:50:58 2002
25911
+ * md_xor.c : Multiple Devices driver for Linux
25913
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
25914
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
25916
+ * Dispatch optimized RAID-5 checksumming functions.
25918
+ * 'md_xor.c' is an EVMS version of linux/drivers/md/xor.c modified
25919
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
25921
+ * This program is free software; you can redistribute it and/or modify
25922
+ * it under the terms of the GNU General Public License as published by
25923
+ * the Free Software Foundation; either version 2, or (at your option)
25924
+ * any later version.
25926
+ * You should have received a copy of the GNU General Public License
25927
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
25928
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25931
+#define BH_TRACE 0
25932
+#include <linux/module.h>
25933
+#include <linux/evms/evms_md.h>
25934
+#include <linux/evms/evms_xor.h>
25935
+#include <asm/xor.h>
25937
+#define LOG_PREFIX "md raid5: "
25938
+/* The xor routines to use. */
25939
+static struct xor_block_template *active_template;
25942
+evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr)
25944
+ unsigned long *p0, *p1, *p2, *p3, *p4;
25945
+ unsigned long bytes = bh_ptr[0]->b_size;
25947
+ p0 = (unsigned long *) bh_ptr[0]->b_data;
25948
+ p1 = (unsigned long *) bh_ptr[1]->b_data;
25949
+ if (count == 2) {
25950
+ active_template->do_2(bytes, p0, p1);
25954
+ p2 = (unsigned long *) bh_ptr[2]->b_data;
25955
+ if (count == 3) {
25956
+ active_template->do_3(bytes, p0, p1, p2);
25960
+ p3 = (unsigned long *) bh_ptr[3]->b_data;
25961
+ if (count == 4) {
25962
+ active_template->do_4(bytes, p0, p1, p2, p3);
25966
+ p4 = (unsigned long *) bh_ptr[4]->b_data;
25967
+ active_template->do_5(bytes, p0, p1, p2, p3, p4);
25970
+/* Set of all registered templates. */
25971
+static struct xor_block_template *template_list;
25973
+#define BENCH_SIZE (PAGE_SIZE)
25976
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
25979
+ unsigned long now;
25980
+ int i, count, max;
25982
+ tmpl->next = template_list;
25983
+ template_list = tmpl;
25986
+ * Count the number of XORs done during a whole jiffy, and use
25987
+ * this to calculate the speed of checksumming. We use a 2-page
25988
+ * allocation to have guaranteed color L1-cache layout.
25991
+ for (i = 0; i < 5; i++) {
25994
+ while (jiffies == now) {
25996
+ tmpl->do_2(BENCH_SIZE, b1, b2);
26005
+ speed = max * (HZ * BENCH_SIZE / 1024);
26006
+ tmpl->speed = speed;
26008
+ LOG_DEFAULT(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
26009
+ speed / 1000, speed % 1000);
26013
+calibrate_xor_block(void)
26016
+ struct xor_block_template *f, *fastest;
26018
+ b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
26020
+ LOG_ERROR("Yikes! No memory available.\n");
26023
+ b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
26025
+ LOG_DEFAULT("measuring checksumming speed\n");
26028
+#define xor_speed(templ) do_xor_speed((templ), b1, b2)
26030
+ XOR_TRY_TEMPLATES;
26034
+ free_pages((unsigned long)b1, 2);
26036
+ fastest = template_list;
26037
+ for (f = fastest; f; f = f->next)
26038
+ if (f->speed > fastest->speed)
26041
+#ifdef XOR_SELECT_TEMPLATE
26042
+ fastest = XOR_SELECT_TEMPLATE(fastest);
26045
+ active_template = fastest;
26046
+ LOG_DEFAULT("using function: %s (%d.%03d MB/sec)\n",
26047
+ fastest->name, fastest->speed / 1000, fastest->speed % 1000);
26052
+MD_EXPORT_SYMBOL(evms_md_xor_block);
26054
+#ifdef MODULE_LICENSE
26055
+MODULE_LICENSE("GPL");
26058
+module_init(calibrate_xor_block);
26059
diff -Naur linux-2002-03-28/drivers/evms/os2lvm_vge.c evms-2002-03-28/drivers/evms/os2lvm_vge.c
26060
--- linux-2002-03-28/drivers/evms/os2lvm_vge.c Wed Dec 31 18:00:00 1969
26061
+++ evms-2002-03-28/drivers/evms/os2lvm_vge.c Thu Mar 28 12:50:56 2002
26065
+ * Copyright (c) International Business Machines Corp., 2001
26067
+ * This program is free software; you can redistribute it and/or modify
26068
+ * it under the terms of the GNU General Public License as published by
26069
+ * the Free Software Foundation; either version 2 of the License, or
26070
+ * (at your option) any later version.
26072
+ * This program is distributed in the hope that it will be useful,
26073
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26074
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
26075
+ * the GNU General Public License for more details.
26077
+ * You should have received a copy of the GNU General Public License
26078
+ * along with this program; if not, write to the Free Software
26079
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26084
+ * linux/drivers/evms/os2lvm_vge.c
26086
+ * EVMS OS/2 LVM Emulator
26088
+ * This Volume Group Emulator will take the type 0x35 partitions created by
26089
+ * OS/2 versions 4.5 and later and build them into volumes. It emulates
26090
+ * the Drive Linking and Bad Block Relocation features and therefore
26091
+ * provides binary compatibility with the OS/2 version. Of course, if
26092
+ * you select to mkfs a file system OS/2 doesn't support, you're on your
26095
+ * Since OS/2 LVM volumes can only exist on DOS-style partitioned disks,
26096
+ * this VGE has a dependency on dospart.c to report a list of the
26097
+ * candidate partitions. This module will then take the appropriate partitions
26098
+ * from the list and use them to build the OS/2-style volumes.
26100
+ * Change Activity:
26102
+ * 7/01/2001 John Stiles getting started.
26103
+ * 9/14/2001 John Stiles original version.
26104
+ * 11/01/2001 John Stiles new naming scheme.
26105
+ * 11/21/2001 John Stiles i/o path changes.
26108
+#define EVMS_DEBUG 1
26109
+#define EVMS_OS2_DEBUG 1
26111
+#include <linux/module.h>
26112
+#include <linux/kernel.h>
26113
+#include <linux/config.h>
26114
+#include <linux/genhd.h>
26115
+#include <linux/major.h>
26116
+#include <linux/string.h>
26117
+#include <linux/blk.h>
26118
+#include <linux/init.h>
26119
+#include <linux/evms/evms_kernel.h>
26120
+#include <linux/evms/evms_os2.h>
26121
+#include <asm/uaccess.h>
26123
+#define LOG_PREFIX "os2lvm: "
26125
+// Global Structure and Type definitions
26126
+typedef struct BBR_IO_Transfer_Record_s{
26127
+ int Write_Flag; /* 0 = read, 1 = write */
26128
+ os2_drivelink_runtime_entry_t * Partition_Data;
26130
+ struct BBR_IO_Transfer_Record_s * Next;
26131
+} BBR_IO_Transfer_Record_t;
26133
+typedef struct DL_IO_Tracking_Record_s{ /* structure used to track IO requests that must be broken into two pieces due to drive linking */
26134
+ unsigned int IO_In_Progress;
26136
+ eio_t Original; /* Original IO */
26137
+ eio_t Link1; /* First child. */
26138
+ os2_drivelink_runtime_entry_t * Link1_Partition_Data;
26139
+ BBR_IO_Transfer_Record_t * Link1_Transfer_Record;
26140
+ int Link1_BBR_Attempted;
26141
+ eio_t Link2; /* Second child */
26142
+ os2_drivelink_runtime_entry_t * Link2_Partition_Data;
26143
+ BBR_IO_Transfer_Record_t * Link2_Transfer_Record;
26144
+ int Link2_BBR_Attempted;
26145
+} DL_IO_Tracking_Record_t;
26147
+// Prototypes for local VGE functions
26148
+static int discover_os2lvm_partitions( evms_logical_node_t ** );
26149
+static evms_logical_node_t * find_os2_volume( u_int32_t );
26150
+static int add_os2link( os2_drivelink_runtime_entry_t *, evms_logical_node_t * );
26151
+static os2_drivelink_runtime_entry_t * find_link_data( os2_drivelink_runtime_entry_t **, u_int32_t );
26152
+static int find_drive_link( evms_logical_node_t *, os2_drivelink_runtime_entry_t **, evms_sector_t *, evms_sector_t * );
26153
+static int validate_signaturesector( evms_logical_node_t *, LVM_Signature_Sector *, u_int32_t );
26154
+static int validate_drivelinksector( void *, int, u_int32_t);
26155
+static int validate_bbrtablesector( void *, int, u_int32_t );
26156
+static u_int32_t check_for_os2_bbr_relocations( char * );
26157
+static int check_os2_volumes( evms_logical_node_t ** );
26158
+static int OS2_ioctl_cmd_broadcast(
26159
+ evms_logical_node_t *node,
26160
+ struct inode *inode, struct file *file,
26161
+ unsigned long cmd, unsigned long arg);
26162
+static int os2_ioctl_cmd_plugin_ioctl(
26163
+ evms_logical_node_t *node,
26164
+ struct inode *inode, struct file *file,
26165
+ unsigned long cmd, unsigned long arg);
26166
+static void BBR_Worker( void *);
26167
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
26168
+ struct buffer_head * bh,
26171
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record);
26172
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate);
26173
+static int Sector_Is_Remapped(os2_drivelink_runtime_entry_t * io_dlentry,
26174
+ evms_sector_t Source_Sector,
26175
+ evms_sector_t * Replacement_Sector);
26176
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t * io_dlentry,
26177
+ evms_sector_t Source_Sector,
26178
+ int Replacement_Sector_Is_Bad);
26179
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t * io_dlentry,
26180
+ evms_sector_t starting_lsn,
26181
+ unsigned int count,
26183
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child);
26186
+// Prototypes for local memory allocation/deallocation functions
26187
+static os2_drivelink_runtime_entry_t * new_os2_drive_link( LVM_Signature_Sector *, evms_logical_node_t * );
26188
+static char * new_os2_link_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t * );
26189
+static char * new_os2_bbr_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t * );
26190
+static evms_logical_node_t * new_os2volume( u_int32_t, char * );
26191
+static int delete_os2lvm_volume( evms_logical_node_t * );
26192
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t *, int );
26195
+// Prototypes for Function Table interface
26196
+static int discover_os2lvm( evms_logical_node_t ** );
26197
+static int delete_os2lvm( evms_logical_node_t * );
26198
+static void read_os2lvm( evms_logical_node_t *, eio_t * );
26199
+static void write_os2lvm( evms_logical_node_t *, eio_t * );
26200
+static int init_io_os2lvm( evms_logical_node_t *, int, evms_sector_t, evms_sector_t, void * );
26201
+static int ioctl_os2lvm( evms_logical_node_t *, struct inode *, struct file *, unsigned int, unsigned long );
26202
+static int do_os2_bbr_io( os2_drivelink_runtime_entry_t *, int, evms_sector_t, evms_sector_t, void * );
26205
+// Global data structures
26206
+static evms_logical_node_t * os2lvm_nodes = NULL;
26207
+static evms_thread_t * BBR_Worker_Thread = NULL;
26208
+static spinlock_t BBR_Queue_Lock = SPIN_LOCK_UNLOCKED;
26209
+static const char * BBR_Worker_Name = "evms_os2_bbr_io";
26210
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Head = NULL;
26211
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Tail = NULL;
26212
+static evms_pool_mgmt_t * BBR_Transfer_Pool = NULL;
26213
+static char * BBR_Transfer_Pool_Name = "OS-2 Transfer Pool";
26214
+static char * DL_Tracking_Pool_Name = "OS-2 Tracking Pool";
26215
+static evms_pool_mgmt_t * DL_Tracking_Pool = NULL;
26218
+// Required plug-in Function Table definition
26219
+static evms_plugin_function_table_t function_table = {
26220
+ discover: &discover_os2lvm,
26221
+ delete : &delete_os2lvm,
26222
+ read : &read_os2lvm,
26223
+ write : &write_os2lvm,
26224
+ init_io : &init_io_os2lvm,
26225
+ ioctl : &ioctl_os2lvm
26229
+// Required plug-in Header definition
26230
+static evms_plugin_header_t plugin_header = {
26231
+ id : SetPluginID(
26233
+ EVMS_REGION_MANAGER, // Region Manger class
26234
+ 2 ), // Unique ID within VGEs
26240
+ required_common_services_version: {
26241
+ major : EVMS_COMMON_SERVICES_MAJOR,
26242
+ minor : EVMS_COMMON_SERVICES_MINOR,
26243
+ patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
26245
+ function_table : &function_table // Function table for this plugin
26249
+// Required Plugin Functions
26253
+ * Function: discover_os2lvm
26255
+ * This is the entry point into the discovery process.
26257
+static int discover_os2lvm( evms_logical_node_t ** evms_partition_list )
26261
+ if ( ! BBR_Transfer_Pool ) {
26262
+ BBR_Transfer_Pool = evms_cs_create_pool( sizeof(BBR_IO_Transfer_Record_t), BBR_Transfer_Pool_Name, NULL, NULL);
26263
+ if ( ! BBR_Transfer_Pool ) {
26268
+ if ( ! DL_Tracking_Pool ) {
26269
+ DL_Tracking_Pool = evms_cs_create_pool( sizeof(DL_IO_Tracking_Record_t), DL_Tracking_Pool_Name, NULL, NULL);
26270
+ if ( ! DL_Tracking_Pool ) {
26275
+ rc = discover_os2lvm_partitions( evms_partition_list );
26278
+ rc = check_os2_volumes( evms_partition_list );
26286
+ * Function: delete_os2lvm
26288
+ * This is the entry point for deleting a node.
26290
+static int delete_os2lvm( evms_logical_node_t * logical_node )
26292
+ LOG_EXTRA("Deleting volume: %s\n", logical_node->name );
26294
+ return delete_os2lvm_volume( logical_node );
26299
+ * Function: read_os2lvm
26301
+static void read_os2lvm( evms_logical_node_t * node,
26305
+ evms_sector_t sector_count;
26306
+ struct buffer_head * Link1 = NULL;
26307
+ struct buffer_head * Link2 = NULL;
26308
+ DL_IO_Tracking_Record_t * Tracking_Record = NULL;
26309
+ os2_drivelink_runtime_entry_t * cur_dlentry = NULL;
26310
+ BBR_IO_Transfer_Record_t * Transfer_Record;
26312
+ sector_count = eio->rsize;
26313
+ rc = find_drive_link( node, &cur_dlentry, &eio->rsector, §or_count );
26316
+ if ( cur_dlentry->bbr_is_active ) {
26317
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26318
+ /* Transfer the IO to the BBR Worker Thread. */
26319
+ Transfer_Record->Write_Flag = 0;
26320
+ Transfer_Record->Partition_Data = cur_dlentry;
26321
+ Transfer_Record->eio = *eio;
26322
+ Transfer_Record->Next = NULL;
26323
+ BBR_Transfer_IO(Transfer_Record);
26326
+ R_IO( cur_dlentry->link_partition, eio );
26329
+ /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
26330
+ Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1); /* Block until we get a tracking record. */
26331
+ Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26332
+ Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26334
+ /* Initialize the tracking record so we can associate the two new I/Os with the original. */
26335
+ Tracking_Record->IO_In_Progress = 2;
26336
+ Tracking_Record->Up_To_Date = 0;
26337
+ Tracking_Record->Original = *eio;
26339
+ /* Create the I/O to the first link. */
26340
+ Clone_Bufferhead(eio->bh,Link1);
26341
+ Link1->b_private = Tracking_Record;
26342
+ Link1->b_end_io = OS2_DL_Callback;
26343
+ Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26344
+ Tracking_Record->Link1.rsector = eio->rsector;
26345
+ Tracking_Record->Link1.rsize = sector_count;
26346
+ Tracking_Record->Link1.bh = Link1;
26347
+ Tracking_Record->Link1_Partition_Data = cur_dlentry;
26348
+ Tracking_Record->Link1_BBR_Attempted = 0;
26349
+ Tracking_Record->Link1_Transfer_Record = NULL;
26351
+ /* Create the I/O to the second link */
26352
+ Clone_Bufferhead(eio->bh,Link2);
26353
+ Link2->b_private = Tracking_Record;
26354
+ Link2->b_end_io = OS2_DL_Callback;
26355
+ Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26356
+ Tracking_Record->Link2.bh = Link2;
26357
+ Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
26358
+ Link2->b_rsector = 0;
26359
+ Tracking_Record->Link2.rsector = 0;
26360
+ Tracking_Record->Link2.rsize = eio->rsize - sector_count;
26361
+ Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
26362
+ Tracking_Record->Link2_BBR_Attempted = 0;
26363
+ Tracking_Record->Link2_Transfer_Record = NULL;
26365
+ /* Process the I/O to the first link. */
26366
+ if ( cur_dlentry->bbr_is_active ) {
26367
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26368
+ /* Transfer the IO to the BBR Worker Thread. */
26369
+ Transfer_Record->Write_Flag = 0;
26370
+ Transfer_Record->Partition_Data = cur_dlentry;
26371
+ Transfer_Record->eio = Tracking_Record->Link1;
26372
+ Transfer_Record->Next = NULL;
26373
+ BBR_Transfer_IO(Transfer_Record);
26376
+ R_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
26378
+ /* Process the I/O to the second link. */
26379
+ cur_dlentry = cur_dlentry->next;
26380
+ if ( cur_dlentry->bbr_is_active ) {
26381
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26382
+ /* Transfer the IO to the BBR Worker Thread. */
26383
+ Transfer_Record->Write_Flag = 0;
26384
+ Transfer_Record->Partition_Data = cur_dlentry;
26385
+ Transfer_Record->eio = Tracking_Record->Link2;
26386
+ Transfer_Record->Next = NULL;
26387
+ BBR_Transfer_IO(Transfer_Record);
26390
+ R_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
26394
+ LOG_SERIOUS("READ error, request exceeds volume size.\n" );
26395
+ EVMS_IO_ERROR(eio);
26402
+ * Function: write_os2lvm
26404
+static void write_os2lvm( evms_logical_node_t * node,
26408
+ evms_sector_t sector_count;
26409
+ struct buffer_head * Link1 = NULL;
26410
+ struct buffer_head * Link2 = NULL;
26411
+ DL_IO_Tracking_Record_t * Tracking_Record = NULL;
26412
+ os2_drivelink_runtime_entry_t * cur_dlentry = NULL;
26413
+ BBR_IO_Transfer_Record_t * Transfer_Record;
26415
+ sector_count = eio->rsize;
26416
+ rc = find_drive_link( node, &cur_dlentry, &eio->rsector, §or_count );
26419
+ /* Set up a Transfer Record. If there are Bad Blocks on the partition that this I/O is
26420
+ directed to, then we will need the Transfer Record to put the I/O in the queue for the
26421
+ BBR Worker Thread. If there are no bad blocks, then we will need the Transfer Record
26422
+ for the OS2_BBR_Write_Callback function. This function expects the Transfer Record to
26423
+ be pre-allocated and available because it is running on an interrupt thread and should
26424
+ not do memory allocation. If there is an error during the write, then the
26425
+ OS2_BBR_Write_Callback function will use the Transfer Record to transfer the I/O
26426
+ to the BBR worker thread for further processing. If there are no errors during the I/O,
26427
+ then the OS2_BBR_Write_Callback will deallocate the Transfer Record. */
26428
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26429
+ Transfer_Record->Write_Flag = 1;
26430
+ Transfer_Record->Partition_Data = cur_dlentry;
26431
+ Transfer_Record->eio = *eio;
26432
+ Transfer_Record->Next = NULL;
26433
+ if ( cur_dlentry->bbr_is_active ) {
26434
+ /* Transfer the IO to the BBR Worker Thread. */
26435
+ BBR_Transfer_IO(Transfer_Record);
26438
+ evms_cs_register_for_end_io_notification(Transfer_Record,eio->bh,OS2_BBR_Write_Callback);
26439
+ W_IO( cur_dlentry->link_partition, eio );
26443
+ /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
26444
+ Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1); /* Block until we get a tracking record. */
26445
+ Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26446
+ Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26448
+ /* Initialize the tracking record so we can associate the two new I/Os with the original. */
26449
+ Tracking_Record->IO_In_Progress = 2;
26450
+ Tracking_Record->Up_To_Date = 0;
26451
+ Tracking_Record->Original = *eio;
26453
+ /* Create the I/O to the first link. */
26454
+ Clone_Bufferhead(eio->bh,Link1);
26455
+ Link1->b_private = Tracking_Record;
26456
+ Link1->b_end_io = OS2_DL_Callback;
26457
+ Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26458
+ Tracking_Record->Link1.rsector = eio->rsector;
26459
+ Tracking_Record->Link1.rsize = sector_count;
26460
+ Tracking_Record->Link1.bh = Link1;
26461
+ Tracking_Record->Link1_Partition_Data = cur_dlentry;
26463
+ /* Create the I/O to the second link */
26464
+ Clone_Bufferhead(eio->bh,Link2);
26465
+ Link2->b_private = Tracking_Record;
26466
+ Link2->b_end_io = OS2_DL_Callback;
26467
+ Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26468
+ Tracking_Record->Link2.bh = Link2;
26469
+ Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
26470
+ Link2->b_rsector = 0;
26471
+ Tracking_Record->Link2.rsector = 0;
26472
+ Tracking_Record->Link2.rsize = eio->rsize - sector_count;
26473
+ Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
26475
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26476
+ Transfer_Record->Write_Flag = 1;
26477
+ Transfer_Record->Partition_Data = cur_dlentry;
26478
+ Transfer_Record->eio = Tracking_Record->Link1;
26479
+ Transfer_Record->Next = NULL;
26480
+ Tracking_Record->Link1_Transfer_Record = Transfer_Record;
26481
+ /* Process the I/O to the first link. */
26482
+ if ( cur_dlentry->bbr_is_active ) {
26483
+ /* Transfer the IO to the BBR Worker Thread. */
26484
+ Tracking_Record->Link1_BBR_Attempted = 1;
26485
+ BBR_Transfer_IO(Transfer_Record);
26488
+ Tracking_Record->Link1_BBR_Attempted = 0;
26489
+ W_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
26492
+ /* Process the I/O to the second link. */
26493
+ cur_dlentry = cur_dlentry->next;
26494
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26495
+ Transfer_Record->Write_Flag = 1;
26496
+ Transfer_Record->Partition_Data = cur_dlentry;
26497
+ Transfer_Record->eio = Tracking_Record->Link2;
26498
+ Transfer_Record->Next = NULL;
26499
+ Tracking_Record->Link2_Transfer_Record= Transfer_Record;
26500
+ if ( cur_dlentry->bbr_is_active ) {
26501
+ /* Transfer the IO to the BBR Worker Thread. */
26502
+ Tracking_Record->Link2_BBR_Attempted = 1;
26503
+ BBR_Transfer_IO(Transfer_Record);
26506
+ Tracking_Record->Link2_BBR_Attempted = 0;
26507
+ W_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
26512
+ LOG_SERIOUS("WRITE error, request exceeds volume size.\n" );
26513
+ EVMS_IO_ERROR(eio);
26519
+static int os2_ioctl_cmd_plugin_ioctl( evms_logical_node_t *node,
26520
+ struct inode *inode,
26521
+ struct file *file,
26522
+ unsigned long cmd,
26523
+ unsigned long arg)
26526
+ os2_volume_runtime_entry_t * Node_Data;
26527
+ os2_drivelink_runtime_entry_t * curlink, * nextlink;
26528
+ evms_plugin_ioctl_t tmp, *user_parms;
26530
+ user_parms = (evms_plugin_ioctl_t *)arg;
26531
+ /* copy user's parameters to kernel space */
26532
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
26536
+ Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
26537
+ /* is this cmd targetted at this feature ? */
26538
+ if (tmp.feature_id == node->plugin->id) {
26539
+ switch(tmp.feature_command) {
26543
+ } else { /* broadcast this cmd to all children */
26544
+ curlink = Node_Data->drive_link;
26546
+ /* broadcast this cmd to all children */
26547
+ while ( curlink ) {
26548
+ nextlink = curlink->next;
26550
+ rc = IOCTL(curlink->link_partition,inode,file,cmd,arg);
26555
+ curlink = nextlink;
26559
+ /* copy info to userspace */
26560
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
26567
+static int OS2_ioctl_cmd_broadcast( evms_logical_node_t *node,
26568
+ struct inode *inode,
26569
+ struct file *file,
26570
+ unsigned long cmd,
26571
+ unsigned long arg)
26574
+ os2_volume_runtime_entry_t * Node_Data;
26575
+ os2_drivelink_runtime_entry_t * curlink, * nextlink;
26577
+ Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
26578
+ curlink = Node_Data->drive_link;
26580
+ /* broadcast this cmd to all children */
26581
+ while ( curlink ) {
26582
+ nextlink = curlink->next;
26584
+ rc |= IOCTL(curlink->link_partition,inode,file,cmd,arg);
26586
+ curlink = nextlink;
26594
+ * Function: ioctl_os2lvm
26596
+static int ioctl_os2lvm( evms_logical_node_t * logical_node,
26597
+ struct inode * inode,
26598
+ struct file * file,
26599
+ unsigned int cmd,
26600
+ unsigned long arg )
26603
+ evms_sector_t Sectors_Per_Cylinder;
26604
+ evms_sector_t Total_Sectors;
26605
+ evms_logical_node_t * partition_node;
26607
+ partition_node = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link->link_partition;
26612
+ LOG_EVERYTHING("Ioctl %d\n", cmd );
26615
+ case HDIO_GETGEO:
26617
+ // Return fake geometry
26618
+ struct hd_geometry *hd = ( struct hd_geometry * )arg;
26620
+ unsigned char heads = 255;
26621
+ unsigned char sectors = OS2LVM_SYNTHETIC_SECTORS_PER_TRACK;
26624
+ /* OS/2 always created a fake geometry using the maximum cylinder size. */
26625
+ Sectors_Per_Cylinder = heads * sectors;
26626
+ for ( cylinders = 0, Total_Sectors = 0; Total_Sectors < ( ( os2_volume_runtime_entry_t * )logical_node->instance_data )->size_in_sectors; cylinders++ )
26627
+ Total_Sectors += Sectors_Per_Cylinder;
26631
+ if ( copy_to_user(( short * )( &hd->cylinders ), &cylinders, sizeof( cylinders )) ||
26632
+ copy_to_user(( char * )( &hd->heads ), &heads, sizeof( heads )) ||
26633
+ copy_to_user(( char * )( &hd->sectors ), §ors, sizeof( sectors )) ||
26634
+ copy_to_user(( long * )( &hd->start ), &start, sizeof( start )) ) {
26640
+ case EVMS_GET_BMAP:
26641
+ // No kernel images allowed on OS/2 volumes right now.
26645
+ case EVMS_QUIESCE_VOLUME:
26646
+ case EVMS_GET_DISK_LIST:
26647
+ case EVMS_CHECK_MEDIA_CHANGE:
26648
+ case EVMS_REVALIDATE_DISK:
26649
+ case EVMS_OPEN_VOLUME:
26650
+ case EVMS_CLOSE_VOLUME:
26651
+ rc = OS2_ioctl_cmd_broadcast(logical_node, inode, file, cmd, arg);
26653
+ case EVMS_PLUGIN_IOCTL:
26654
+ rc = os2_ioctl_cmd_plugin_ioctl( logical_node, inode, file, cmd, arg);
26666
+ * Function: init_io_os2lvm
26668
+static int init_io_os2lvm( evms_logical_node_t * node,
26669
+ int io_flag, /* 0=read, 1=write */
26670
+ evms_sector_t sect_nr, /* disk LBA */
26671
+ evms_sector_t num_sects, /* # of sectors */
26672
+ void * buf_addr ) /* buffer address */
26675
+ evms_sector_t sector_count;
26676
+ evms_logical_node_t * partition_node;
26677
+ os2_drivelink_runtime_entry_t * cur_dlentry = NULL;
26679
+ sector_count = num_sects;
26680
+ rc = find_drive_link( node, &cur_dlentry, §_nr, §or_count );
26683
+ partition_node = cur_dlentry->link_partition;
26684
+ if ( cur_dlentry->bbr_is_active )
26685
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
26687
+ rc = INIT_IO( partition_node, io_flag, sect_nr, num_sects, buf_addr );
26688
+ if ( rc && io_flag ) {
26689
+ cur_dlentry->bbr_is_active = 1;
26690
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
26695
+ partition_node = cur_dlentry->link_partition;
26696
+ if ( cur_dlentry->bbr_is_active )
26697
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
26699
+ rc = INIT_IO( partition_node, io_flag, sect_nr, sector_count, buf_addr );
26700
+ if ( rc && io_flag) {
26701
+ cur_dlentry->bbr_is_active = 1;
26702
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
26707
+ cur_dlentry = cur_dlentry->next;
26708
+ partition_node = cur_dlentry->link_partition;
26709
+ num_sects -= sector_count;
26710
+ buf_addr += sector_count << OS2_SECTOR_SHIFT;
26712
+ if ( cur_dlentry->bbr_is_active )
26713
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
26715
+ rc = INIT_IO( partition_node, io_flag, 0, num_sects, buf_addr );
26716
+ if ( rc && io_flag ) {
26717
+ cur_dlentry->bbr_is_active = 1;
26718
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
26725
+ LOG_SERIOUS("INITIO error, request exceeds volume size.\n" );
26734
+ * Function: do_os2_bbr_io
26736
+ * Check the Bad Block Relocation list for relocated sectors. If any are found,
26737
+ * this function will do the i/o directly.
26738
+ * Return values: 0 == i/o done, 1 == unable to complete i/o
26740
+static int do_os2_bbr_io( os2_drivelink_runtime_entry_t * io_dlentry,
26741
+ int rw, /* 0=read, 1=write */
26742
+ evms_sector_t starting_lsn, /* disk LBA */
26743
+ evms_sector_t count, /* # of sectors */
26744
+ void * buffer ) /* buffer address */
26746
+ evms_sector_t lsn, remapped_lsn;
26749
+ // For each sector in this request, check if this sector has already
26750
+ // been remapped. If so, process all previous sectors in this request,
26751
+ // followed by the remapped sector. Then reset the starting lsn and
26752
+ // count and keep going with the rest of the request as if it were
26753
+ // a whole new request.
26754
+ for ( lsn = 0; lsn < count; lsn++ ) {
26755
+ remapped_lsn = starting_lsn + lsn;
26756
+ rc = Sector_Is_Remapped(io_dlentry,remapped_lsn, &remapped_lsn);
26758
+ // Process all sectors in the request up to this one.
26760
+ rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, lsn, buffer);
26762
+ /* If this is a read, then we are done. */
26767
+ /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
26768
+ if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, lsn, buffer) ) {
26769
+ /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
26773
+ buffer += (lsn * OS2_BYTES_PER_SECTOR);
26776
+ // Process the remapped sector.
26777
+ rc = INIT_IO(io_dlentry->link_partition, rw, remapped_lsn, 1, buffer);
26779
+ /* If this is a read, then we are done. */
26784
+ /* Get the original sector that was remapped. */
26785
+ remapped_lsn = starting_lsn + lsn;
26787
+ /* Invalidate the current remapping. */
26788
+ Invalidate_Mapping(io_dlentry,remapped_lsn,1);
26790
+ /* Try to remap the bad sector to another replacement sector. */
26791
+ if ( !Create_New_BBR_Table_Entry(io_dlentry, remapped_lsn, 1, buffer) ) {
26792
+ /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
26798
+ buffer += OS2_BYTES_PER_SECTOR;
26800
+ starting_lsn += (lsn + 1);
26801
+ count -= (lsn + 1);
26807
+ /* Are there any sectors left to process? */
26808
+ if ( count > 0 ) {
26809
+ rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, count, buffer);
26811
+ /* If this is a read, then we are done. */
26816
+ /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
26817
+ if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, count, buffer) ) {
26818
+ /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
26831
+ * Function: os2lvm_vge_init
26833
+int __init os2lvm_vge_init( void )
26835
+ /* Should I be allocating the pools and BBR Worker Thread here? */
26836
+ return evms_cs_register_plugin( &plugin_header );/* register with EVMS*/
26839
+void __exit os2lvm_vge_exit( void )
26841
+ /* BUGBUG - Is there where I need to kill the BBR Worker Thread and free any memory I am still holding? */
26843
+ evms_cs_unregister_plugin(&plugin_header);
26846
+module_init(os2lvm_vge_init);
26847
+module_exit(os2lvm_vge_exit);
26848
+#ifdef MODULE_LICENSE
26849
+MODULE_LICENSE("GPL");
26855
+// Local VGE Functions
26859
+ * Function: discover_os2lvm_partitions
26861
+ * Examine the list of logical partitions. Any type 0x35 partition that contains
26862
+ * a valid OS/2 signature sector is consumed and added to the appropriate logical
26865
+static int discover_os2lvm_partitions( evms_logical_node_t ** evms_partition_list )
26867
+ evms_logical_node_t * evms_partition;
26868
+ evms_logical_node_t * next_partition;
26869
+ evms_logical_node_t * new_volume;
26870
+ evms_sector_t sectornum = 0;
26871
+ u_int32_t volumeserial;
26873
+ char * volumename;
26874
+ char driveletter[8];
26875
+ LVM_Signature_Sector * sigsector;
26876
+ os2_drivelink_runtime_entry_t * new_dlentry;
26878
+ LOG_ENTRY_EXIT("Discovering OS/2 Logical Volumes\n" );
26879
+ if ( evms_cs_allocate_memory(( void** )&sigsect, OS2_BYTES_PER_SECTOR ) ) {
26880
+ LOG_SERIOUS("Could not allocate Signature sector data\n" );
26884
+ for ( evms_partition = *evms_partition_list; evms_partition; evms_partition = next_partition ) {
26885
+ // Save the next node. We may remove this one from the list.
26886
+ next_partition = evms_partition->next;
26888
+ // The node must not have the OS/2 vge id.
26889
+ if ( evms_partition->plugin->id == plugin_header.id ) {
26893
+ LOG_EXTRA("Examining partition serial %s\n", evms_partition->name );
26895
+ // Have to go to the last accessible sector of the partition and
26896
+ // read it in. It should be the LVM Signature Sector.
26897
+ sectornum = evms_partition->total_vsectors - 1;
26898
+ if ( INIT_IO( evms_partition, 0, sectornum, 1, sigsect ) ) {
26899
+ // On an I/O error, continue on to the next partition.
26900
+ // This means that the volume it belongs to will be incomplete
26901
+ // and later deleted in the completeness check.
26902
+ LOG_SERIOUS("I/O error on Signature sector read\n" );
26905
+ sigsector = ( LVM_Signature_Sector * )sigsect;
26907
+ // Validate the Signature Sector
26908
+ if ( validate_signaturesector( evms_partition, sigsector, OS2_BYTES_PER_SECTOR )) {
26909
+ LOG_EXTRA("Signature sector is not valid\n" );
26912
+// Bugbug - At this point, we have validated an OS/2 LVM Signature Sector. However, if the partition
26913
+// is not marked as a type 0x35, then this Signature Sector may be erroneous. The problem here is that
26914
+// there is currently no way to find out if this partition was marked as a type 0x35. Also, if we
26915
+// should reject this partition due to some problem with the drive linking or BBR metadata, should we
26916
+// leave the partition in the evms partition list or not? If the partition was marked as a type 0x35
26917
+// and the Signature Sector was valid, then I would say that we should remove it from the evms partition
26918
+// partition list. If the partition is not marked as a type 0x35 but the Signature Sector is valid, then
26919
+// we could have a stray Signature Sector, in which case the partition should remain in the evms partition
26920
+// list. The OS/2 LVM Signature Sector does have additional information that could be used to resolve
26921
+// this issue, such as the starting LBA of the partition that the Signature Sector belongs to, but
26922
+// we can not get the starting LBA of the partition to compare against. If we leave the partition in
26923
+// the evms partition list when we should not, then an extraneous compatibility volume could result.
26924
+ // Build the Metadata for this partition
26925
+ if ( !( new_dlentry = new_os2_drive_link( sigsector, evms_partition )) ) {
26929
+ // Search for the parent Volume for this partition
26930
+ volumeserial = sigsector->Volume_Serial_Number;
26931
+ if ( !( new_volume = find_os2_volume( volumeserial )) ) {
26933
+ // If not found, allocate a new Volume
26934
+ LOG_EVERYTHING("Parent not found, allocate new.\n" );
26935
+ if ( sigsector->Drive_Letter != '\0' ) {
26936
+ driveletter[0] = sigsector->Drive_Letter;
26937
+ driveletter[1] = '\0';
26938
+ volumename = driveletter;
26941
+ volumename = sigsector->Volume_Name;
26943
+ if ( !( new_volume = new_os2volume( volumeserial, volumename )) ) {
26944
+ delete_os2_drive_link( new_dlentry, 0 );
26945
+ new_dlentry = NULL;
26950
+ // Now remove the partition from the List
26951
+ evms_cs_remove_logical_node_from_list( evms_partition_list, evms_partition );
26953
+ if ( (( os2_volume_runtime_entry_t * )new_volume->instance_data )->complete ) {
26954
+ // Volume is complete, delete this duplicate
26955
+ delete_os2_drive_link( new_dlentry, 0 );
26956
+ LOG_EVERYTHING("Deleting duplicate node.\n" );
26957
+ (( os2_volume_runtime_entry_t * )new_volume->instance_data )->Export_Needed = 1; //We must export this volume again!
26959
+ else /* Add this partition to its parent Volume */
26960
+ add_os2link( new_dlentry, new_volume );
26964
+ evms_cs_deallocate_memory(( void* )sigsect );
26965
+ LOG_ENTRY_EXIT("Finished Discovering OS/2 Logical Volumes\n" );
26972
+ * Function: find_os2_volume
26974
+ * Search for the OS/2 volume that matches the volume serial.
26976
+static evms_logical_node_t * find_os2_volume( u_int32_t volumeserial )
26978
+ os2_volume_runtime_entry_t * cur_volume;
26979
+ evms_logical_node_t * cur_node;
26981
+ cur_node = os2lvm_nodes;
26983
+ while ( cur_node ) {
26984
+ cur_volume = ( os2_volume_runtime_entry_t * )cur_node->instance_data;
26985
+ if ( cur_volume->Volume_Serial_Number == volumeserial ) {
26986
+ LOG_EVERYTHING("%s: found volser match.\n", __FUNCTION__ );
26989
+ LOG_EVERYTHING("%s: volser does not match.\n", __FUNCTION__ );
26990
+ cur_node = cur_volume->next_os2lvm_node;
26998
+ * Function: add_os2link
27000
+ * Add the Drive Link metadata to the parent OS/2 volume.
27002
+static int add_os2link( os2_drivelink_runtime_entry_t * newlink,
27003
+ evms_logical_node_t * parent_volume )
27005
+ os2_volume_runtime_entry_t * parent_metadata = ( os2_volume_runtime_entry_t * )parent_volume->instance_data;
27006
+ os2_drivelink_runtime_entry_t * curlink = parent_metadata->drive_link, * nextlink;
27009
+ nextlink = curlink->next;
27010
+ while ( nextlink ) {
27011
+ curlink = nextlink;
27012
+ nextlink = curlink->next;
27014
+ curlink->next = newlink;
27017
+ parent_metadata->drive_link = newlink;
27019
+ parent_metadata->drive_link_count++;
27020
+ parent_metadata->size_in_sectors += newlink->sector_count;
27021
+ parent_volume->total_vsectors += newlink->sector_count;
27027
+ * Function: find_link_data
27029
+ * Find the Drive Link metadata that matches the partition serial number.
27030
+ * Remove it from the link_list passed in.
27032
+static os2_drivelink_runtime_entry_t * find_link_data( os2_drivelink_runtime_entry_t ** link_list,
27033
+ u_int32_t partitionser )
27035
+ os2_drivelink_runtime_entry_t * curlink = *link_list, * prevlink = NULL;
27037
+ while ( curlink ) {
27038
+ if ( curlink->Partition_Serial_Number == partitionser ) {
27039
+ if ( prevlink ) {
27040
+ prevlink->next = curlink->next;
27043
+ *link_list = curlink->next;
27045
+ curlink->next = NULL;
27048
+ prevlink = curlink;
27049
+ curlink = prevlink->next;
27057
+ * Function: find_drive_link
27059
+ * Walk the linked list of drive links to find the proper
27060
+ * target partition. Returns the metadata associated with
27061
+ * the drive link.
27062
+ * Return values: 1 == data contained in 1 partition, 2 == data crosses 2 partitions,
27063
+ * 0 == target partition not found
27065
+static int find_drive_link( evms_logical_node_t * node,
27066
+ os2_drivelink_runtime_entry_t ** dlentry,
27067
+ evms_sector_t * sector,
27068
+ evms_sector_t * num_sectors )
27070
+ evms_sector_t last_link_sector, cur_last_sector;
27071
+ os2_drivelink_runtime_entry_t * curlink = (( os2_volume_runtime_entry_t * )node->instance_data )->drive_link, * nextlink;
27073
+ while ( curlink ) {
27074
+ nextlink = curlink->next;
27075
+ last_link_sector = curlink->start_sector + curlink->sector_count;
27076
+ if ( *sector < last_link_sector ) {
27077
+ *dlentry = curlink;
27078
+ cur_last_sector = *sector + *num_sectors;
27079
+ *sector -= curlink->start_sector;
27080
+ LOG_EVERYTHING("I/O start_RBA == %Ld , sector_count == %Ld\n", *sector, *num_sectors );
27081
+ if ( cur_last_sector <= last_link_sector )
27084
+ if ( (*dlentry)->next )
27085
+ *num_sectors -= cur_last_sector - last_link_sector;
27092
+ curlink = nextlink;
27100
+// Allocation/Deallocation Functions
27104
+ * Function: new_os2_drive_link
27106
+ * Allocate space for a new OS/2 drive link structure.
27107
+ * Initialize the appropriate fields.
27108
+ * Note: since the BBR info applies to each link, the BBR structures
27109
+ * are also initialized here.
27111
+static os2_drivelink_runtime_entry_t * new_os2_drive_link( LVM_Signature_Sector * signature_sector,
27112
+ evms_logical_node_t * evms_partition )
27115
+ u_int32_t feature, feature_size, sectoroffset;
27116
+ os2_drivelink_runtime_entry_t * new_dlentry;
27118
+ if ( evms_cs_allocate_memory(( void** )&new_dlentry, sizeof( os2_drivelink_runtime_entry_t )) ) {
27119
+ LOG_SERIOUS("Could not allocate drivelink metadata\n" );
27122
+ new_dlentry->sector_count = signature_sector->Partition_Size_To_Report_To_User;
27123
+ new_dlentry->Partition_Serial_Number = signature_sector->Partition_Serial_Number;
27124
+ new_dlentry->bbr_is_active = 0; // initialize to not active
27125
+ new_dlentry->link_partition = evms_partition;
27126
+ init_MUTEX( &(new_dlentry->BBR_Table_Lock) );
27128
+ sectoroffset = signature_sector->Partition_Start;
27129
+ LOG_EVERYTHING("Partition Start is at LBA %i\n", sectoroffset );
27130
+ for ( i = 0 ; i < OS2LVM_MAX_FEATURES_PER_VOLUME ; i++ ) {
27131
+ feature = signature_sector->LVM_Feature_Array[i].Feature_ID;
27133
+ feature_size = signature_sector->LVM_Feature_Array[i].Feature_Data_Size;
27134
+ LOG_EVERYTHING("Entry %d in Feature Table is valid,\n", i+1 );
27135
+ LOG_EVERYTHING("Feature Data size is %i sectors.\n", feature_size );
27136
+ if ( feature == DRIVE_LINKING_FEATURE_ID ) {
27137
+ if ( !new_dlentry->link_data ) {
27138
+ new_dlentry->Drive_Link_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data - sectoroffset;
27139
+ new_dlentry->Drive_Link_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data - sectoroffset;
27140
+ new_dlentry->link_data = new_os2_link_data( new_dlentry->Drive_Link_Data_Copy1, new_dlentry->Drive_Link_Data_Copy2, feature_size, evms_partition );
27141
+ if ( new_dlentry->link_data == NULL) {
27142
+ delete_os2_drive_link(new_dlentry,0);
27143
+ new_dlentry = NULL;
27147
+ LOG_WARNING("os2lvm_vge: Drive Linking Feature encountered twice in the same Feature Array!\n");
27148
+ delete_os2_drive_link(new_dlentry,0);
27149
+ new_dlentry = NULL;
27152
+ else if ( feature == BBR_FEATURE_ID ) {
27153
+ if ( !new_dlentry->bbr_data ) {
27154
+ new_dlentry->BBR_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data;
27155
+ new_dlentry->BBR_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data;
27156
+ new_dlentry->BBR_Feature_Size = feature_size;
27157
+ new_dlentry->bbr_data = new_os2_bbr_data( new_dlentry->BBR_Data_Copy1, new_dlentry->BBR_Data_Copy2, feature_size, evms_partition );
27158
+ if ( new_dlentry->bbr_data == NULL) {
27159
+ delete_os2_drive_link(new_dlentry,0);
27160
+ new_dlentry = NULL;
27162
+ else if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
27163
+ new_dlentry->bbr_is_active = check_for_os2_bbr_relocations( new_dlentry->bbr_data );
27167
+ LOG_WARNING("os2lvm_vge: BBR Feature encountered twice in the same Feature Array!\n");
27168
+ delete_os2_drive_link(new_dlentry,0);
27169
+ new_dlentry = NULL;
27173
+ LOG_WARNING("os2lvm_vge: Unknown Feature entry %d found.\n", feature );
27174
+ delete_os2_drive_link(new_dlentry,0);
27175
+ new_dlentry = NULL;
27178
+ if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
27179
+ LOG_EVERYTHING("Feature is active.\n" );
27184
+ if ( new_dlentry &&
27185
+ ( ( ! new_dlentry->bbr_data ) || ( ! new_dlentry->link_data ) )
27187
+ LOG_WARNING("os2lvm_vge: Incomplete Feature Data found.\n" );
27188
+ delete_os2_drive_link(new_dlentry,0);
27189
+ new_dlentry = NULL;
27191
+ return new_dlentry;
27196
+ * Function: new_os2_link_data
27198
+ * Allocate space for OS/2 drive link information.
27199
+ * Read in and validate the information from disk.
27200
+ * Note: assumes 512 byte sectors.
27202
+static char * new_os2_link_data( u_int32_t linksector1,
27203
+ u_int32_t linksector2,
27204
+ u_int32_t linknumsectors,
27205
+ evms_logical_node_t * link_partition )
27207
+ char * new_data1; /* Buffer used to hold the primary copy of the drive linking data. */
27208
+ char * new_data2; /* Buffer used to hold the secondary copy of the drive linking data. */
27209
+ char * p1; /* Used to access individual sectors of data within new_data1. */
27210
+ char * p2; /* Used to access individual sectors of data within new_data2. */
27211
+ int memsize = linknumsectors * OS2_BYTES_PER_SECTOR;
27212
+ u_int32_t i, seq1, seq2;
27214
+ /* Allocate Memory for the buffers to hold the drive linking data. */
27215
+ LOG_EVERYTHING("Drive Linking Feature entry found.\n" );
27216
+ if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
27217
+ LOG_SERIOUS("Could not allocate Primary Link data\n" );
27220
+ if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
27221
+ LOG_SERIOUS("Could not allocate Secondary Link data\n" );
27222
+ evms_cs_deallocate_memory(( void* )new_data1 );
27226
+ LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", linksector1 );
27227
+ LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", linksector2 );
27229
+ /* Read the drive linking data into memory. */
27230
+ if ( INIT_IO( link_partition, 0, linksector1, linknumsectors, new_data1 ) ) {
27231
+ LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
27236
+ /* Set up access to the buffer. Extract the Master Sequence Number from the buffer. */
27238
+ seq1 = (( LVM_Link_Table_First_Sector * )p1 )->Sequence_Number;
27241
+ if ( INIT_IO( link_partition, 0, linksector2, linknumsectors, new_data2 ) ) {
27242
+ LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
27247
+ /* Set up access to the second buffer. Extract its copy of the Master Sequence Number. */
27249
+ seq2 = (( LVM_Link_Table_Sector * )p2 )->Sequence_Number;
27252
+ /* Validate both copies of the drive linking data one sector at a time. */
27253
+ for ( i = 0; i < linknumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
27254
+ if ( (seq1 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p1, i, seq1 )) {
27255
+ LOG_SERIOUS("The primary copy of the drive link data is invalid! Sector %i is not valid\n", i );
27259
+ if ( (seq2 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p2, i, seq2 )) {
27260
+ LOG_SERIOUS("The secondary copy of the drive link data is invalid! Sector %i is not valid\n", i );
27266
+ LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
27267
+ LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
27269
+ /* Choose which copy of the drive linking data to use. If both sequence numbers are 0, then both copies
27270
+ of the drive linking data are bad. If both are equal and non-zero, then both copies are good and it
27271
+ really doesn't matter which one you choose. Otherwise, choose the copy with the highest sequence number. */
27272
+ if ( seq2 > seq1 ) {
27273
+ evms_cs_deallocate_memory(( void* )new_data1 );
27274
+ return new_data2;
27277
+ evms_cs_deallocate_memory(( void* )new_data2 );
27279
+ evms_cs_deallocate_memory(( void* )new_data1 );
27280
+ new_data1 = NULL;
27283
+ return new_data1;
27288
+ * Function: new_os2_bbr_data
27290
+ * Allocate space for OS/2 bad block relocation information.
27291
+ * Read in and validate the information from disk.
27292
+ * Note: assumes 512 byte sectors.
27294
+static char * new_os2_bbr_data( u_int32_t bbrsector1,
27295
+ u_int32_t bbrsector2,
27296
+ u_int32_t bbrnumsectors,
27297
+ evms_logical_node_t * bbr_partition )
27299
+ char * new_data1; /* Buffer to hold the primary copy of the BBR data. */
27300
+ char * new_data2; /* Buffer to hold the secondary copy of the BBR data. */
27301
+ char * p1; /* Used to examine the individual sectors of BBR data within new_data1. */
27302
+ char * p2; /* Used to examine the individual sectors of BBR data within new_data2. */
27303
+ int memsize = bbrnumsectors * OS2_BYTES_PER_SECTOR;
27304
+ u_int32_t i, seq1, seq2;
27306
+ LOG_EVERYTHING("BBR Feature entry found.\n" );
27308
+ /* Allocate memory for the buffers. */
27309
+ if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
27310
+ LOG_SERIOUS("Could not allocate Primary BBR data\n" );
27313
+ if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
27314
+ LOG_SERIOUS("Could not allocate Secondary BBR data\n" );
27315
+ evms_cs_deallocate_memory(( void* )new_data1 );
27319
+ LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", bbrsector1 );
27320
+ LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", bbrsector2 );
27322
+ /* Read in both copies of the BBR data. */
27323
+ if ( INIT_IO( bbr_partition, 0, bbrsector1, bbrnumsectors, new_data1 ) ) {
27324
+ LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
27329
+ /* Establish access to the first sector of the BBR data. Extract the Master Sequence Number
27330
+ for this copy of the BBR data. */
27332
+ seq1 = (( LVM_BBR_Table_First_Sector * )p1 )->Sequence_Number;
27335
+ if ( INIT_IO( bbr_partition, 0, bbrsector2, bbrnumsectors, new_data2 ) ) {
27336
+ LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
27341
+ /* Establish access to the first sector of the second copy of the BBR data. Extract the
27342
+ Master Sequence Number for this copy of the BBR data. */
27344
+ seq2 = (( LVM_BBR_Table_Sector * )p2 )->Sequence_Number;
27347
+ /* Validate both copies of the BBR Data, one sector at a time. */
27348
+ for ( i = 0; i < bbrnumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
27349
+ if ( (seq1 > 0) && validate_bbrtablesector( p1, i, seq1 )) {
27350
+ LOG_SERIOUS("The primary BBR data is invalid! Sector %i is not valid\n", i );
27354
+ if ( (seq2 > 0) && validate_bbrtablesector( p2, i, seq2 )) {
27355
+ LOG_SERIOUS("The secondary BBR data is invalid! Sector %i is not valid\n", i );
27361
+ LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
27362
+ LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
27364
+ /* Choose which copy of the BBR Data to use based upon the sequence number. If both sequence numbers
27365
+ are 0, then there is no valid BBR data. If both are non-zero and equal, then it really doesn't
27366
+ matter which copy is used. Otherwise, choose the copy with the highest sequence number. */
27367
+ if ( seq2 > seq1 ) {
27368
+ evms_cs_deallocate_memory(( void* )new_data1 );
27369
+ return new_data2;
27372
+ evms_cs_deallocate_memory(( void* )new_data2 );
27374
+ evms_cs_deallocate_memory(( void* )new_data1 );
27375
+ new_data1 = NULL;
27378
+ return new_data1;
27383
+ * Function: new_os2volume
27385
+ * Allocate space for a new OS/2 logical volume.
27386
+ * Initialize the appropriate fields.
27388
+static evms_logical_node_t * new_os2volume( u_int32_t volumeserial,
27389
+ char * volume_name )
27391
+ evms_logical_node_t * new_node;
27392
+ os2_volume_runtime_entry_t * cur_volume;
27394
+ if ( evms_cs_allocate_logical_node( &new_node ) ) {
27395
+ LOG_SERIOUS("Could not allocate new volume\n" );
27398
+ if ( evms_cs_allocate_memory( &new_node->instance_data, sizeof( os2_volume_runtime_entry_t )) ) {
27399
+ LOG_SERIOUS("Could not allocate volume metadata\n" );
27400
+ evms_cs_deallocate_logical_node( new_node );
27403
+ new_node->plugin = &plugin_header;
27404
+ new_node->system_id = LVM_PARTITION_INDICATOR;
27405
+ sprintf( new_node->name, "os2/%s", volume_name );
27406
+ cur_volume = ( os2_volume_runtime_entry_t * )new_node->instance_data;
27407
+ cur_volume->Volume_Serial_Number = volumeserial;
27408
+ cur_volume->Export_Needed = 1;
27410
+ if ( os2lvm_nodes == NULL )
27411
+ os2lvm_nodes = new_node;
27413
+ // This is the first node discovered. Start the BBR thread.
27414
+ if ( ! BBR_Worker_Thread ) {
27415
+ BBR_Worker_Thread = evms_cs_register_thread(BBR_Worker, NULL, BBR_Worker_Name);
27416
+ if ( ! BBR_Worker_Thread ) {
27417
+ evms_cs_deallocate_memory(new_node->instance_data);
27418
+ evms_cs_deallocate_logical_node(new_node);
27419
+ os2lvm_nodes = NULL;
27424
+ cur_volume = ( os2_volume_runtime_entry_t * )os2lvm_nodes->instance_data;
27425
+ while ( cur_volume->next_os2lvm_node )
27426
+ cur_volume = ( os2_volume_runtime_entry_t * )cur_volume->next_os2lvm_node->instance_data;
27427
+ cur_volume->next_os2lvm_node = new_node;
27430
+ MOD_INC_USE_COUNT;
27437
+ * Function: delete_os2lvm_volume
27439
+ * This function deletes the in-memory representation of an OS/2
27440
+ * logical volume.
27442
+static int delete_os2lvm_volume( evms_logical_node_t * logical_node )
27444
+ os2_drivelink_runtime_entry_t * curdrvlink = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link, * nextdrvlink;
27445
+ os2_volume_runtime_entry_t * cur_volume, * next_volume;
27447
+ while ( curdrvlink ) {
27448
+ nextdrvlink = curdrvlink->next;
27449
+ delete_os2_drive_link( curdrvlink, 1 );
27450
+ curdrvlink = nextdrvlink;
27453
+ cur_volume = ( os2_volume_runtime_entry_t * )os2lvm_nodes->instance_data;
27454
+ if ( os2lvm_nodes == logical_node )
27455
+ os2lvm_nodes = cur_volume->next_os2lvm_node;
27457
+ while ( cur_volume->next_os2lvm_node ) {
27458
+ next_volume = ( os2_volume_runtime_entry_t * )cur_volume->next_os2lvm_node->instance_data;
27459
+ if ( cur_volume->next_os2lvm_node == logical_node ) {
27460
+ cur_volume->next_os2lvm_node = next_volume->next_os2lvm_node;
27466
+ if ( os2lvm_nodes == NULL ) {
27467
+ // Just deleted the last os2 node. Stop the BBR thread.
27468
+ if ( BBR_Worker_Thread ) {
27469
+ evms_cs_unregister_thread(BBR_Worker_Thread);
27470
+ BBR_Worker_Thread = NULL;
27474
+ evms_cs_deallocate_memory( logical_node->instance_data );
27475
+ evms_cs_deallocate_logical_node( logical_node );
27477
+ MOD_DEC_USE_COUNT;
27484
+ * Function: delete_os2_drive_link
27486
+ * This function deletes the drive link runtime structure and any
27487
+ * other structures it points to.
27489
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t * drive_link,
27490
+ int delete_link_partition )
27492
+ if ( drive_link->link_data )
27493
+ evms_cs_deallocate_memory( drive_link->link_data );
27494
+ if ( drive_link->bbr_data )
27495
+ evms_cs_deallocate_memory( drive_link->bbr_data );
27496
+ if ( delete_link_partition )
27497
+ DELETE( drive_link->link_partition );
27498
+ evms_cs_deallocate_memory( drive_link );
27505
+// Consistency Checking Functions
27509
+ * Function: validate_signaturesector
27511
+ * This function checks the OS/2 LVM Signature Sector
27513
+static int validate_signaturesector(evms_logical_node_t * evms_partition,
27514
+ LVM_Signature_Sector * signature_sector,
27515
+ u_int32_t sectorsize )
27517
+ u_int32_t crc_hold, crc_new;
27519
+ /* In order for a signature sector to be considered valid, its signature and CRC must
27520
+ be correct. Also, OS/2 stores the starting LBA of the partition and the size of
27521
+ the partition that this signature sector corresponds to. These should be checked
27522
+ as well. However, since the starting LBA of the partition that this belongs to is
27523
+ not available to us as part of an evms_logical_node_t, we can only check the size
27524
+ of the partition against what is stored in the signature sector. */
27526
+ /* The signature used is in two parts. Test the first part. */
27527
+ if ( signature_sector->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE ) {
27528
+ LOG_EVERYTHING("Primary LVM Signature failed.\n" );
27532
+ /* Test the second part of the signature. */
27533
+ if ( signature_sector->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE ) {
27534
+ LOG_EVERYTHING("Secondary LVM Signature failed.\n" );
27538
+ /* Calculate the CRC and compare it against the stored CRC. */
27539
+ crc_hold = signature_sector->Signature_Sector_CRC;
27540
+ signature_sector->Signature_Sector_CRC = 0;
27541
+ crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, ( void * )signature_sector, sectorsize );
27542
+ if ( crc_hold != crc_new ) {
27543
+ LOG_EVERYTHING("Signature sector crc failed.\n" );
27544
+ LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27548
+ // The partition size must == that found in the Signature Sector
27549
+ if ( evms_partition->total_vsectors != signature_sector->Partition_Sector_Count ) {
27550
+ LOG_EXTRA("Partition size is not valid\n" );
27559
+ * Function: validate_drivelinksector
27561
+ * This function checks the OS/2 LVM Drivelink Feature Sector
27563
+static int validate_drivelinksector( void * Sector_To_Validate,
27564
+ int Sector_Index,
27565
+ u_int32_t Master_Sequence_Number )
27567
+ u_int32_t crc_hold, crc_new;
27568
+ LVM_Link_Table_First_Sector * First_Sector = (LVM_Link_Table_First_Sector * ) Sector_To_Validate;
27569
+ LVM_Link_Table_Sector * Link_Sector = (LVM_Link_Table_Sector * ) Sector_To_Validate;
27571
+ /* The OS/2 drive linking data covers several sectors. The format of the first sector is slightly
27572
+ different from the following sectors because it contains additional information about how many
27573
+ drive links are actually in use. The following sectors just contain portions of the drive link
27574
+ table. Each sector of OS/2 drive linking data contains a signature, crc, and sequence number
27575
+ which must be validated. */
27577
+ if ( Sector_Index == 0 ) {
27579
+ /* Link Table Master Signature Check */
27580
+ if ( LINK_TABLE_MASTER_SIGNATURE != First_Sector->Link_Table_Signature ) {
27581
+ LOG_EVERYTHING("Link Table Master Signature Test failed.\n" );
27585
+ /* We will NOT check the sequence number here as the first sector of drive link data is the
27586
+ source of the Master_Sequence_Number which was passed in to us. */
27588
+ /* Set up for the CRC Check */
27589
+ crc_hold = First_Sector->Link_Table_CRC;
27590
+ First_Sector->Link_Table_CRC = 0;
27593
+ /* Link Table Internal Signature Check */
27594
+ if ( LINK_TABLE_SIGNATURE != Link_Sector->Link_Table_Signature ) {
27595
+ LOG_EVERYTHING("Link Table Internal Signature Test failed.\n" );
27599
+ /* Check the sequence number. */
27600
+ if ( Master_Sequence_Number != Link_Sector->Sequence_Number ) {
27601
+ LOG_EVERYTHING("Link Table Internal Sequence Number Test failed.\n" );
27605
+ /* Set up for the CRC Check */
27606
+ crc_hold = Link_Sector->Link_Table_CRC;
27607
+ Link_Sector->Link_Table_CRC = 0;
27610
+ crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
27611
+ if ( crc_hold != crc_new ) {
27612
+ LOG_EVERYTHING("Link Table crc failed.\n" );
27613
+ LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27622
+ * Function: validate_bbrtablesector
27624
+ * This function checks the OS/2 LVM Bad Block Relocation Feature Sector
27626
+static int validate_bbrtablesector( void * Sector_To_Validate,
27627
+ int Sector_Index,
27628
+ u_int32_t Master_Sequence_Number )
27630
+ u_int32_t crc_hold, crc_new;
27631
+ LVM_BBR_Table_First_Sector * First_Sector = (LVM_BBR_Table_First_Sector * ) Sector_To_Validate;
27632
+ LVM_BBR_Table_Sector * BBR_Sector = (LVM_BBR_Table_Sector * ) Sector_To_Validate;
27634
+ /* The OS/2 bad block relocation (BBR) data covers several sectors. The format of the first sector
27635
+ is different from the following sectors because it contains additional information about how many
27636
+ relocations are actually in use and the size and location of the block of replacement sectors.
27637
+ The following sectors just contain portions of the BBR remap table. Each sector of OS/2 BBR data
27638
+ contains a signature, crc, and sequence number which must be validated. */
27640
+ if ( Sector_Index == 0 ) {
27642
+ /* BBR Table Master Signature Check */
27643
+ if ( BBR_TABLE_MASTER_SIGNATURE != First_Sector->Signature ) {
27644
+ LOG_EVERYTHING("BBR Table Master Signature Test failed.\n" );
27648
+ /* We will NOT check the sequence number here as the first sector of BBR data is the
27649
+ source of the Master_Sequence_Number which was passed in to us. */
27651
+ /* Set up for the CRC Check */
27652
+ crc_hold = First_Sector->CRC;
27653
+ First_Sector->CRC = 0;
27657
+ /* BBR Table Internal Signature Check */
27658
+ if ( BBR_TABLE_SIGNATURE != BBR_Sector->Signature ) {
27659
+ LOG_EVERYTHING("BBR Table Internal Signature Test failed.\n" );
27663
+ /* Check the sequence number. */
27664
+ if ( Master_Sequence_Number != BBR_Sector->Sequence_Number ) {
27665
+ LOG_EVERYTHING("BBR Table Internal Sequence Number Test failed.\n" );
27669
+ /* Set up for the CRC Check */
27670
+ crc_hold = BBR_Sector->CRC;
27671
+ BBR_Sector->CRC = 0;
27674
+ crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
27675
+ if ( crc_hold != crc_new ) {
27676
+ LOG_EVERYTHING("BBRTable crc failed.\n" );
27677
+ LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27686
+ * Function: check_for_os2_bbr_relocations
27688
+ * This function checks the OS/2 LVM Bad Block Relocation Tables
27689
+ * for any active relocation sectors. The bbr table is reformatted in memory
27690
+ * to make searches faster.
27691
+ * Return values: 0 == no active relocations, 1 == contains active relocations
27693
+static u_int32_t check_for_os2_bbr_relocations( char * bbr_data_ptr )
27695
+ LVM_BBR_Feature * feature_data = ( LVM_BBR_Feature * )bbr_data_ptr;
27697
+ if ( feature_data->control.Table_Entries_In_Use ) {
27698
+ LOG_EVERYTHING("There are %d active relocations.\n", feature_data->control.Table_Entries_In_Use );
27707
+ * Function: check_os2_volumes
27709
+ * This function performs a consistency check on all existing OS/2
27710
+ * Logical Volumes. The list of constituent partitions ( links )
27711
+ * is checked and ordered according to the Link Table. If any link
27712
+ * is missing or inconsistent, the entire volume will be deleted.
27714
+static int check_os2_volumes( evms_logical_node_t ** node_list )
27716
+ os2_volume_runtime_entry_t * cur_volume;
27717
+ os2_volume_runtime_entry_t * previous_volume;
27718
+ evms_logical_node_t * cur_node;
27719
+ evms_logical_node_t * previous_node = NULL;
27720
+ os2_drivelink_runtime_entry_t * link_list, * link_hold;
27721
+ LVM_Link_Table_First_Sector * psector1;
27723
+ u_int32_t numlinks, countlinks, linkser;
27724
+ u_int32_t Master_Sequence_Number; /* Used to check whether or not all of the copies of Drive Linking data match. */
27725
+ evms_sector_t partition_offset;
27728
+ LOG_ENTRY_EXIT("Checking OS/2 Logical Volumes\n" );
27730
+ cur_node = os2lvm_nodes;
27732
+ while ( cur_node ) {
27733
+ cur_volume = ( os2_volume_runtime_entry_t * )cur_node->instance_data;
27734
+ link_list = NULL;
27735
+ if ( !cur_volume->complete ) { /* need to verify this one */
27736
+ cur_volume->complete = 1;
27737
+ LOG_EVERYTHING("Checking volume %s\n", cur_node->name );
27739
+ // Reset fields for sort operation
27740
+ cur_volume->size_in_sectors = 0;
27741
+ numlinks = cur_volume->drive_link_count;
27742
+ cur_volume->drive_link_count = 0;
27743
+ cur_node->total_vsectors = 0;
27744
+ link_list = cur_volume->drive_link;
27745
+ cur_volume->drive_link = NULL;
27747
+ // Access the link data to order the drive links
27748
+ psector1 = ( LVM_Link_Table_First_Sector * )link_list->link_data;
27749
+ Master_Sequence_Number = psector1->Sequence_Number;
27751
+ if ( numlinks != psector1->Links_In_Use ) {
27752
+ LOG_SERIOUS("Link Count mismatch vol=%i, table=%i\n", numlinks, psector1->Links_In_Use );
27753
+ cur_volume->complete = 0;
27757
+ if ( numlinks > LINKS_IN_FIRST_SECTOR ) {
27758
+ countlinks = LINKS_IN_FIRST_SECTOR;
27759
+ numlinks -= LINKS_IN_FIRST_SECTOR;
27762
+ countlinks = numlinks;
27768
+ partition_offset = 0;
27769
+ for ( i = 0; (i < countlinks) && (cur_volume->complete == 1); i++ ) {
27770
+ linkser = psector1->Link_Table[i].Partition_Serial_Number;
27771
+ if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
27772
+ // Add this partition to its parent Volume
27773
+ add_os2link( link_hold, cur_node );
27774
+ LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
27775
+ partition_offset, link_hold->sector_count );
27776
+ link_hold->start_sector = partition_offset;
27777
+ partition_offset += link_hold->sector_count;
27780
+ LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
27781
+ cur_volume->complete = 0;
27786
+ sect_ptr = ( char * )psector1;
27788
+ while ( numlinks && (cur_volume->complete == 1) ) {
27789
+ if ( numlinks > LINKS_IN_NEXT_SECTOR ) {
27790
+ countlinks = LINKS_IN_NEXT_SECTOR;
27791
+ numlinks -= LINKS_IN_NEXT_SECTOR;
27794
+ countlinks = numlinks;
27797
+ sect_ptr += OS2_BYTES_PER_SECTOR;
27798
+ if ( Master_Sequence_Number != (( LVM_Link_Table_Sector * )sect_ptr )->Sequence_Number ) {
27799
+ cur_volume->complete = 0;
27800
+ LOG_SERIOUS("Bad Sequence Number for Drive Linking Metadata!\n");
27803
+ for ( i = 0; i < countlinks; i++ ) {
27804
+ linkser = (( LVM_Link_Table_Sector * )sect_ptr )->Link_Table[i].Partition_Serial_Number;
27805
+ if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
27806
+ // Add this partition to its parent Volume
27807
+ add_os2link( link_hold, cur_node );
27808
+ LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
27809
+ partition_offset, link_hold->sector_count );
27810
+ link_hold->start_sector = partition_offset;
27811
+ partition_offset += link_hold->sector_count;
27814
+ LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
27815
+ cur_volume->complete = 0;
27823
+ /* If the volume is complete we can export it for use. */
27824
+ if ( cur_volume->complete && (link_list == NULL) ) {
27826
+ // Link new volume into the node list
27827
+ if ( cur_volume->Export_Needed &&
27828
+ ( !evms_cs_add_logical_node_to_list( node_list, cur_node ) )
27831
+ cur_volume->Export_Needed = 0;
27834
+ previous_node = cur_node;
27835
+ cur_node = cur_volume->next_os2lvm_node;
27838
+ /* Remove the volume from os2lvm_nodes list and delete it. */
27839
+ if ( previous_node != NULL ) {
27841
+ previous_volume = ( os2_volume_runtime_entry_t * )previous_node->instance_data;
27842
+ previous_volume->next_os2lvm_node = cur_volume->next_os2lvm_node;
27843
+ cur_volume->next_os2lvm_node = NULL;
27845
+ delete_os2lvm_volume(cur_node);
27847
+ cur_node = previous_volume->next_os2lvm_node;
27850
+ previous_node = cur_volume->next_os2lvm_node;
27851
+ delete_os2lvm_volume(cur_node);
27852
+ cur_node = previous_node;
27853
+ previous_node = NULL;
27854
+ os2lvm_nodes = cur_node;
27857
+ /* If any items remain in link_list, delete those as well. */
27858
+ while (link_list) {
27859
+ link_hold = link_list->next;
27860
+ delete_os2_drive_link(link_list,1);
27861
+ link_list = link_hold;
27868
+ LOG_ENTRY_EXIT("Finished Checking OS/2 Logical Volumes\n" );
27875
+/* BBR_Transfer_IO
27877
+ * Transfer the responsibility for completing the specified IO from
27878
+ * the thread that requested it to the BBR Worker Thread
27880
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record)
27882
+ unsigned long flags;
27883
+ int Wake_Worker_Thread = 0; /* Assume that the worker is already awake. */
27885
+ spin_lock_irqsave(&BBR_Queue_Lock, flags);
27887
+ /* The BBR IO List is a singly linked list. BBR_IO_List_Head points
27888
+ to the first item in the list, and BBR_IO_List_Tail points to the
27889
+ last item in the list. */
27890
+ Transfer_Record->Next = NULL;
27891
+ if ( !BBR_IO_List_Tail ) { /* Empty list */
27892
+ BBR_IO_List_Head = Transfer_Record;
27893
+ Wake_Worker_Thread = 1; /* Wake up the worker thread. */
27895
+ else /* Items already in the list. */
27896
+ BBR_IO_List_Tail->Next = Transfer_Record;
27898
+ BBR_IO_List_Tail = Transfer_Record;
27900
+ spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
27901
+ if ( Wake_Worker_Thread )
27902
+ evms_cs_wakeup_thread(BBR_Worker_Thread);
27908
+/* OS2_DL_Callback
27910
+ * This is the callback function used when an I/O request has to be broken
27911
+ * into two parts because it crosses a drive link boundary.
27914
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate)
27917
+ DL_IO_Tracking_Record_t * Tracking_Record;
27918
+ struct buffer_head * Original;
27920
+ Tracking_Record = bh->b_private;
27922
+ /* Is this a read or a write? */
27923
+ if ( Tracking_Record->Link1_Transfer_Record ||
27924
+ Tracking_Record->Link2_Transfer_Record ) {
27925
+ /* We have a write here. Was it successful? */
27926
+ if ( ! uptodate) {
27927
+ /* Have we tried BBR yet? */
27928
+ if ( ( bh == Tracking_Record->Link1.bh ) &&
27929
+ ( ! Tracking_Record->Link1_BBR_Attempted ) ){
27930
+ /* Attempt BBR. */
27931
+ BBR_Transfer_IO(Tracking_Record->Link1_Transfer_Record);
27932
+ Tracking_Record->Link1_BBR_Attempted = 1;
27935
+ else if ( ( bh == Tracking_Record->Link2.bh ) &&
27936
+ ( ! Tracking_Record->Link2_BBR_Attempted ) ) {
27937
+ /* Attempt BBR. */
27938
+ BBR_Transfer_IO(Tracking_Record->Link2_Transfer_Record);
27939
+ Tracking_Record->Link2_BBR_Attempted = 1;
27947
+ Tracking_Record->IO_In_Progress -= 1;
27948
+ if ( Tracking_Record->IO_In_Progress) {
27949
+ Tracking_Record->Up_To_Date = uptodate;
27951
+ Original = Tracking_Record->Original.bh;
27953
+ if ( ! Tracking_Record->IO_In_Progress ) {
27954
+ uptodate &= Tracking_Record->Up_To_Date;
27955
+ /* If this is a write, then Transfer Records will have been set up for both Link1 and Link2.
27956
+ If the transfer records were used because of BBR, then the BBR worker thread will have
27957
+ disposed of the transfer records. If the transfer records were not used, then we must
27958
+ dispose of them here to prevent memory leaks. */
27959
+ if ( Tracking_Record->Link1_Transfer_Record &&
27960
+ ( ! Tracking_Record->Link1_BBR_Attempted) ) {
27961
+ evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link1_Transfer_Record);
27963
+ if ( Tracking_Record->Link2_Transfer_Record &&
27964
+ ( ! Tracking_Record->Link2_BBR_Attempted) ) {
27965
+ evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link2_Transfer_Record);
27967
+ evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link1.bh);
27968
+ evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link2.bh);
27969
+ evms_cs_deallocate_to_pool(DL_Tracking_Pool,Tracking_Record);
27970
+ Original->b_end_io(Original,uptodate);
27976
+/* OS2_BBR_Write_Callback
27978
+ * This is the callback for normal write requests. Check for an error
27979
+ * during the I/O, and send to the worker thread for processing if necessary.
27981
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
27982
+ struct buffer_head * bh,
27986
+ if ( ! uptodate ) {
27987
+ BBR_Transfer_IO(Transfer_Record);
27991
+ evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Transfer_Record);
28000
+/* Worker thread to handle:
28002
+ I/O to drive/partitions/objects where bad blocks are known to exist
28003
+ I/O to drive/partition/object where a new bad block has been discovered and the I/O must be redriven.
28006
+static void BBR_Worker( void * Not_Used)
28008
+ unsigned long flags;
28009
+ BBR_IO_Transfer_Record_t * Current_IO;
28013
+ // Process bbr_io_list, one entry at a time.
28014
+ spin_lock_irqsave(&BBR_Queue_Lock, flags);
28016
+ /* Is there any work for us? */
28017
+ if ( ! BBR_IO_List_Head ) {
28018
+ spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
28019
+ break; /* List empty - nothing to do. */
28022
+ /* Get the IO to perform. */
28023
+ Current_IO = BBR_IO_List_Head;
28024
+ BBR_IO_List_Head = Current_IO->Next;
28025
+ if (! BBR_IO_List_Head )
28026
+ BBR_IO_List_Tail = BBR_IO_List_Head;
28028
+ spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
28030
+ /* Now lets process the I/O request. */
28031
+ complete = do_os2_bbr_io(Current_IO->Partition_Data,Current_IO->Write_Flag, Current_IO->eio.rsector, Current_IO->eio.rsize, Current_IO->eio.bh->b_data);
28033
+ /* We need to do the callback. */
28034
+ Current_IO->eio.bh->b_end_io(Current_IO->eio.bh, (complete == 0) );
28036
+ /* Now cleanup */
28037
+ evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Current_IO);
28040
+ return; /* Go to sleep. */
28046
+ * Sector_Is_Remapped
28048
+ * This function returns 1 if the specified sector has been remapped, 0 if it has not
28050
+ * If the sector has been remapped, then the new sector is returned in Replacement_Sector
28053
+static int Sector_Is_Remapped(os2_drivelink_runtime_entry_t * io_dlentry, evms_sector_t Source_Sector, evms_sector_t * Replacement_Sector)
28055
+ LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )io_dlentry->bbr_data;
28056
+ unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
28057
+ unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
28058
+ unsigned int BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
28059
+ BBR_Table_Entry * BBR_Table_Entry;
28060
+ unsigned int Guard1;
28062
+ /* Default value is no remap. */
28063
+ *Replacement_Sector = Source_Sector;
28066
+ Guard1 = io_dlentry->Guard1; /* Lamport's Theorem */
28068
+ for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
28069
+ Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
28070
+ BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
28071
+ if ( BBR_Table_Entry->BadSector == Source_Sector ){
28072
+ *Replacement_Sector = BBR_Table_Entry->ReplacementSector;
28077
+ } while ( Guard1 != io_dlentry->Guard2 ); /* Lamport's Theorem */
28079
+ if ( *Replacement_Sector != Source_Sector )
28087
+ * Invalidate_Mapping
28089
+ * This function either frees a replacement sector to be reused, or it
28090
+ * marks the replacement sector as bad.
28093
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t * dlentry,
28094
+ evms_sector_t Source_Sector,
28095
+ int Replacement_Sector_Is_Bad)
28097
+ LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )dlentry->bbr_data;
28098
+ unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
28099
+ unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
28100
+ unsigned int BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
28101
+ BBR_Table_Entry * BBR_Table_Entry = NULL;
28103
+ /* Lock for the BBR Table. */
28104
+ down( &(dlentry->BBR_Table_Lock) );
28106
+ /* Find the entry to invalidate. */
28107
+ for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
28108
+ Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
28109
+ BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
28110
+ if ( BBR_Table_Entry->BadSector == Source_Sector ){
28115
+ /* Now that we have found the entry, we must invalidate it. */
28116
+ if ( Replacement_Sector_Is_Bad ) {
28117
+ BBR_Table_Entry->BadSector = (u_int32_t) -1;
28119
+ /* OS/2 supported a method for clearing out bad block remappings if the filesystem on the volume supported
28120
+ the tracking of bad blocks. We don't support that under Linux, so there is no else case here. */
28122
+ /* Unlock the BBR Table */
28123
+ up( &(dlentry->BBR_Table_Lock) );
28129
+ * Create_New_BBR_Table_Entry
28131
+ * Finds bad blocks within the range specified, allocates replacement sectors,
28132
+ * writes the data to the replacement sectors, and updates the BBR metadata on
28133
+ * disk to reflect the new mapping. Returns 1 if successful, 0 otherwise.
28136
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t * dlentry,
28137
+ evms_sector_t starting_lsn,
28138
+ unsigned int count,
28141
+ evms_sector_t lsn;
28142
+ BBR_Table_Entry *Table_Entry;
28143
+ unsigned int Sector_Index;
28144
+ unsigned int Table_Index;
28147
+ u_int32_t New_Sequence_Number;
28148
+ LVM_BBR_Feature *BBR_Data = (LVM_BBR_Feature*) dlentry->bbr_data;
28150
+ for ( lsn = starting_lsn; lsn < (starting_lsn + count); lsn++) {
28151
+ rc = INIT_IO(dlentry->link_partition, 1, lsn, 1, buffer);
28154
+ /* Lock for the BBR Table. */
28155
+ down( &(dlentry->BBR_Table_Lock) );
28157
+ /* Increment the second guard value. This will cause those reading the BBR Table to spin.*/
28158
+ dlentry->Guard2++;
28160
+ /* Ensure that the bbr active flag is set. */
28161
+ dlentry->bbr_is_active = 1;
28163
+ /* Allocate a replacement sector */
28164
+ if ( BBR_Data->control.Table_Entries_In_Use < BBR_Data->control.Table_Size ) {
28165
+ Sector_Index = BBR_Data->control.Table_Entries_In_Use / BBR_TABLE_ENTRIES_PER_SECTOR;
28166
+ Table_Index = BBR_Data->control.Table_Entries_In_Use % BBR_TABLE_ENTRIES_PER_SECTOR;
28167
+ BBR_Data->control.Table_Entries_In_Use = BBR_Data->control.Table_Entries_In_Use + 1;
28168
+ Table_Entry = (BBR_Table_Entry *) &(BBR_Data->remap[Sector_Index].BBR_Table[Table_Index]);
28169
+ Table_Entry->BadSector = lsn;
28172
+ /* There are no more replacement sectors available! Time to bail ... */
28173
+ up( &(dlentry->BBR_Table_Lock) );
28177
+ /* Now that we have a replacement sector, increment the first guard value. This will free any
28178
+ threads reading the BBR Table. */
28179
+ dlentry->Guard1++;
28181
+ /* Release the lock now that we have a replacement sector. */
28182
+ up( &(dlentry->BBR_Table_Lock) );
28184
+ /* Test the replacement sector. */
28185
+ rc = INIT_IO(dlentry->link_partition, 1, Table_Entry->ReplacementSector, 1, buffer);
28187
+ /* The replacement sector was bad. Lets mark it bad in the table and try again. */
28188
+ Table_Entry->BadSector = (u_int32_t) -1;
28191
+ } /* End of processing for the current sector. */
28193
+ } /* end of loop to test each sector in the I/O and remap any bad ones found. */
28195
+ /* Need to write the modified BBR Table back to disk. This includes updating the sequence numbers and CRCs. */
28197
+ /* Lock for the BBR Table. */
28198
+ down( &(dlentry->BBR_Table_Lock) );
28200
+ /* Increment the sequence numbers. */
28201
+ New_Sequence_Number = BBR_Data->control.Sequence_Number + 1;
28202
+ BBR_Data->control.Sequence_Number = New_Sequence_Number;
28203
+ for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
28204
+ BBR_Data->remap[Sector_Index].Sequence_Number = New_Sequence_Number;
28207
+ /* Calculate the new CRC values. */
28208
+ BBR_Data->control.CRC = 0;
28209
+ BBR_Data->control.CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->control),OS2_BYTES_PER_SECTOR);
28210
+ for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
28211
+ BBR_Data->remap[Sector_Index].CRC = 0;
28212
+ BBR_Data->remap[Sector_Index].CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->remap[Sector_Index]),OS2_BYTES_PER_SECTOR);
28215
+ /* Now we must write the table back to the partition from whence it came. */
28217
+ /* Write the first copy. */
28218
+ rc = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy1,dlentry->BBR_Feature_Size,BBR_Data);
28220
+ /* Write the second copy. */
28221
+ rc2 = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy2,dlentry->BBR_Feature_Size,BBR_Data);
28223
+ /* If both copies failed to reach the disk, then fail the I/O. */
28224
+ if ( rc && rc2 ) {
28230
+ /* Unlock the BBR Table */
28231
+ up( &(dlentry->BBR_Table_Lock) );
28233
+ /* Indicate success. */
28239
+ * Clone_Bufferhead
28241
+ * Prepares a usable copy of an existing bufferhead.
28244
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child)
28246
+ Child->b_next = NULL;
28247
+ Child->b_blocknr = Source->b_blocknr;
28248
+ Child->b_size = Source->b_size;
28249
+ Child->b_list = 0;
28250
+ Child->b_dev = Source->b_dev;
28251
+ Child->b_count = Source->b_count;
28252
+ Child->b_rdev = Source->b_rdev;
28253
+ Child->b_state = Source->b_state;
28254
+ Child->b_flushtime = 0;
28255
+ Child->b_next_free = NULL;
28256
+ Child->b_prev_free = NULL;
28257
+ Child->b_this_page = NULL;
28258
+ Child->b_reqnext = NULL;
28259
+ Child->b_pprev = NULL;
28260
+ Child->b_data = Source->b_data;
28261
+ Child->b_page = Source->b_page;
28262
+ Child->b_end_io = Source->b_end_io;
28263
+ Child->b_private = Source->b_private;
28264
+ Child->b_rsector = Source->b_rsector;
28265
+ Child->b_inode = NULL;
28266
+ Child->b_inode_buffers.next = NULL;
28267
+ Child->b_inode_buffers.prev = NULL;
28270
diff -Naur linux-2002-03-28/drivers/evms/s390_part.c evms-2002-03-28/drivers/evms/s390_part.c
28271
--- linux-2002-03-28/drivers/evms/s390_part.c Wed Dec 31 18:00:00 1969
28272
+++ evms-2002-03-28/drivers/evms/s390_part.c Tue Mar 26 14:28:49 2002
28274
+/* -*- linux-c -*- */
28278
+ * Copyright (c) International Business Machines Corp., 2000
28280
+ * This program is free software; you can redistribute it and/or modify
28281
+ * it under the terms of the GNU General Public License as published by
28282
+ * the Free Software Foundation; either version 2 of the License, or
28283
+ * (at your option) any later version.
28285
+ * This program is distributed in the hope that it will be useful,
28286
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28287
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
28288
+ * the GNU General Public License for more details.
28290
+ * You should have received a copy of the GNU General Public License
28291
+ * along with this program; if not, write to the Free Software
28292
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28297
+ * linux/drivers/evms/s390_part.c
28299
+ * EVMS S/390 partition manager
28301
+ * Partial code extracted from
28303
+ * linux/fs/partitions/ibm.c
28307
+#include <linux/config.h>
28308
+#include <linux/module.h>
28309
+#include <linux/kernel.h>
28310
+#include <linux/config.h>
28311
+#include <linux/string.h>
28312
+#include <linux/blk.h>
28313
+#include <asm/ebcdic.h>
28314
+#include <asm/uaccess.h>
28315
+#include <asm/dasd.h>
28316
+#include <asm/vtoc.h>
28317
+#include <linux/evms/evms_kernel.h>
28319
+/* prefix used in logging messages */
28320
+#define LOG_PREFIX "s390_part: "
28322
+/* Private instance data structure for node we produced */
28323
+typedef struct local_instance_data_s {
28324
+ evms_logical_node_t * source_disk;
28325
+ evms_sector_t start_sect; /* starting LBA */
28326
+ evms_sector_t nr_sects; /* number of sectors */
28327
+ unsigned char type; /* partition type or filesystem format indicator, can be set to 0 */
28328
+} local_instance_data_t;
28330
+static int exported_nodes; /* total # of exported segments
28331
+ * produced during this discovery.
28335
+static int s390_partition_discover(evms_logical_node_t **);
28336
+static int s390_partition_delete(evms_logical_node_t *);
28337
+static void s390_partition_read(evms_logical_node_t *,
28339
+static void s390_partition_write(evms_logical_node_t *,
28341
+static int s390_partition_ioctl(evms_logical_node_t *,
28346
+static int s390_partition_init_io(evms_logical_node_t *,
28352
+static evms_plugin_function_table_t function_table = {
28353
+ discover: &s390_partition_discover,
28354
+ delete : &s390_partition_delete,
28355
+ read : &s390_partition_read,
28356
+ write : &s390_partition_write,
28357
+ init_io : &s390_partition_init_io,
28358
+ ioctl : &s390_partition_ioctl
28361
+#define EVMS_S390_PARTITION_MANAGER_ID 2
28363
+static evms_plugin_header_t plugin_header = {
28364
+ id : SetPluginID(
28366
+ EVMS_SEGMENT_MANAGER,
28367
+ EVMS_S390_PARTITION_MANAGER_ID),
28373
+ required_common_services_version : {
28378
+ function_table : &function_table
28381
+/***************************************************/
28382
+/* List Support - Typedefs, Variables, & Functions */
28383
+/***************************************************/
28387
+typedef struct local_segment_list_node_s {
28388
+ evms_logical_node_t *segment;
28389
+ struct local_segment_list_node_s *next;
28390
+} local_segment_list_node_t;
28392
+typedef struct local_disk_list_node_s {
28393
+ evms_logical_node_t *disk;
28394
+ local_segment_list_node_t *segment_list;
28395
+ struct local_disk_list_node_s *next;
28396
+} local_disk_list_node_t;
28400
+static local_disk_list_node_t *my_disk_list;
28404
+static local_disk_list_node_t **
28406
+ evms_logical_node_t *disk)
28408
+ local_disk_list_node_t **ldln;
28410
+ ldln = &my_disk_list;
28412
+ if ((*ldln)->disk == disk)
28414
+ ldln = &(*ldln)->next;
28419
+static local_segment_list_node_t **
28421
+ local_disk_list_node_t *disk,
28422
+ evms_logical_node_t *segment)
28424
+ local_segment_list_node_t **lsln;
28426
+ lsln = &disk->segment_list;
28428
+ if ((*lsln)->segment == segment)
28430
+ lsln = &(*lsln)->next;
28435
+static evms_logical_node_t *
28436
+find_segment_on_disk(
28437
+ evms_logical_node_t *disk,
28438
+ u_int64_t start_sect,
28439
+ u_int64_t nr_sects)
28441
+ evms_logical_node_t *rc = NULL;
28442
+ local_disk_list_node_t **ldln;
28443
+ local_segment_list_node_t **lsln;
28444
+ local_instance_data_t *lid;
28446
+ ldln = lookup_disk(disk);
28448
+ /* disk found in list */
28449
+ /* attempt to find segment */
28451
+ lsln = &(*ldln)->segment_list;
28453
+ lid = (*lsln)->segment->instance_data;
28454
+ if (lid->start_sect == start_sect)
28455
+ if (lid->nr_sects == nr_sects)
28457
+ lsln = &(*lsln)->next;
28460
+ rc = (*lsln)->segment;
28465
+/* function description: add_segment_to_disk
28467
+ * this function attempts to add a segment to the segment
28468
+ * list of a disk. if the specified disk is not found, it
28469
+ * will be added to the global disk list. this function will
28470
+ * return a pointer to the matching segment in the disk's
28471
+ * segment list. the caller must compare the returned pointer
28472
+ * to the specified segment to see if the
28473
+ * specified segment was already present in the disk's segment
28474
+ * list. if the return pointer matches the specified segment,
28475
+ * then the specified segment was added to the list. if the
28476
+ * return segment pointer to does not match the specified
28477
+ * segment pointer, then the specified segment pointer was
28478
+ * a duplicate and can be thrown away.
28481
+add_segment_to_disk(
28482
+ evms_logical_node_t *disk,
28483
+ evms_logical_node_t *segment)
28486
+ local_disk_list_node_t **ldln, *new_disk;
28487
+ local_segment_list_node_t **lsln, *new_segment;
28489
+ ldln = lookup_disk(disk);
28490
+ if (*ldln == NULL) {
28491
+ /* disk not in list, add disk */
28492
+ rc = evms_cs_allocate_memory((void **)&new_disk,
28493
+ sizeof(*new_disk));
28495
+ new_disk->disk = disk;
28496
+ *ldln = new_disk;
28500
+ /* attempt to add segment */
28501
+ lsln = lookup_segment(*ldln, segment);
28502
+ if (*lsln == NULL) {
28503
+ /* segment not in list, add segment */
28504
+ rc = evms_cs_allocate_memory((void **)&new_segment,
28505
+ sizeof(*new_segment));
28507
+ new_segment->segment = segment;
28508
+ *lsln = new_segment;
28517
+remove_segment_from_disk(
28518
+ evms_logical_node_t *disk,
28519
+ evms_logical_node_t *segment,
28520
+ evms_logical_node_t **empty_disk)
28523
+ local_disk_list_node_t **ldln, *tmp_disk_node;
28524
+ local_segment_list_node_t **lsln, *tmp_segment_node;
28526
+ *empty_disk = NULL;
28527
+ ldln = lookup_disk(disk);
28528
+ if (*ldln == NULL) {
28531
+ /* disk found in list */
28532
+ /* attempt to add segment */
28533
+ lsln = lookup_segment(*ldln, segment);
28534
+ if (*lsln == NULL) {
28537
+ tmp_segment_node = *lsln;
28538
+ /* remove segment from list */
28539
+ *lsln = (*lsln)->next;
28540
+ /* free the segment list node */
28541
+ evms_cs_deallocate_memory(tmp_segment_node);
28543
+ if ((*ldln)->segment_list == NULL) {
28544
+ tmp_disk_node = *ldln;
28545
+ *empty_disk = tmp_disk_node->disk;
28546
+ /* remove disk from list */
28547
+ *ldln = (*ldln)->next;
28548
+ /* free the disk list node */
28549
+ evms_cs_deallocate_memory(tmp_disk_node);
28557
+ * Function: add_segment
28560
+s390_process_segment(
28561
+ evms_logical_node_t **discover_list,
28562
+ evms_logical_node_t *node,
28563
+ u_int64_t start_sect,
28564
+ u_int64_t nr_sects,
28565
+ unsigned char type,
28568
+ local_instance_data_t *InstData = NULL;
28569
+ evms_logical_node_t *segment;
28572
+ segment = find_segment_on_disk(node, start_sect, nr_sects);
28574
+ LOG_DETAILS("exporting segment '%s'.\n",
28577
+ rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
28579
+ InstData->source_disk = node;
28580
+ InstData->start_sect = start_sect;
28581
+ InstData->nr_sects = nr_sects;
28582
+ InstData->type = type;
28583
+ rc = evms_cs_allocate_logical_node(&segment);
28586
+ segment->plugin = &plugin_header;
28587
+ segment->system_id = (unsigned int)type;
28588
+ segment->total_vsectors = nr_sects;
28589
+ segment->block_size = node->block_size;
28590
+ segment->hardsector_size = node->hardsector_size;
28591
+ segment->instance_data = InstData;
28592
+ segment->flags = node->flags;
28593
+ strcpy(segment->name, node->name);
28594
+ sprintf(segment->name + strlen(segment->name), "%d", part_num);
28595
+ LOG_DETAILS("creating segment '%s'.\n",
28597
+ rc = add_segment_to_disk(node, segment);
28599
+ LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
28600
+ __FUNCTION__, rc, segment->name);
28603
+ MOD_INC_USE_COUNT;
28608
+ evms_cs_deallocate_memory(InstData);
28610
+ evms_cs_deallocate_logical_node(segment);
28614
+ evms_cs_add_logical_node_to_list(discover_list, segment);
28615
+ exported_nodes++;
28621
+ ibm_partition_lnx1 = 0,
28622
+ ibm_partition_vol1 = 1,
28623
+ ibm_partition_cms1 = 2,
28624
+ ibm_partition_none = 3
28625
+} ibm_partition_t;
28627
+static char* part_names[] = {
28628
+ [ibm_partition_lnx1] = "LNX1",
28629
+ [ibm_partition_vol1] = "VOL1",
28630
+ [ibm_partition_cms1] = "CMS1",
28631
+ [ibm_partition_none] = "(nonl)"
28634
+static ibm_partition_t
28635
+get_partition_type ( char * type )
28638
+ for ( i = 0; i < 3; i ++) {
28639
+ if ( ! strncmp (type,part_names[i],4) )
28646
+ * compute the block number from a
28647
+ * cyl-cyl-head-head structure
28650
+cchh2blk (cchh_t *ptr, struct hd_geometry *geo) {
28651
+ return ptr->cc * geo->heads * geo->sectors +
28652
+ ptr->hh * geo->sectors;
28657
+ * compute the block number from a
28658
+ * cyl-cyl-head-head-block structure
28661
+cchhb2blk (cchhb_t *ptr, struct hd_geometry *geo) {
28662
+ return ptr->cc * geo->heads * geo->sectors +
28663
+ ptr->hh * geo->sectors +
28667
+void print_mem( void *buffer, int length )
28670
+ unsigned char *bufptr;
28672
+ bufptr = (unsigned char *)buffer;
28675
+ if ( (i % 16) == 0 )
28676
+ printk(KERN_INFO "\n0x%p->", buffer + i);
28677
+ printk(KERN_INFO "%02x ", bufptr[i]);
28678
+ if ( ++i >= length )
28681
+ printk(KERN_INFO "\n");
28685
+s390_probe_for_segments(
28686
+ evms_logical_node_t **discover_list,
28687
+ evms_logical_node_t *disk)
28689
+ char type[5] = {0,}, name[7] = {0,};
28690
+ int rc, vsects_per_hardsect = 0;
28691
+ unsigned int blk;
28693
+ dasd_information_t *info = NULL;
28694
+ struct hd_geometry *geo = NULL;
28695
+ unchar *data = NULL;
28697
+ /* allocate space for DASD ioctl packet
28699
+ rc = evms_cs_allocate_memory((void **)&info, sizeof(dasd_information_t));
28701
+ LOG_DEBUG("probing '%s' for 390 DASD info...\n",
28703
+ /* issue DASD info ioctl
28705
+ rc = evms_cs_kernel_ioctl(disk, BIODASDINFO, (unsigned long)info);
28707
+ LOG_DEBUG("error(%d) from BIODASDINFO ioctl.\n", rc);
28708
+ LOG_DEBUG("assuming '%s' is not a valid 390 device!\n",
28713
+ /* if we successfully completed the previous
28714
+ * get DASD info ioctl, we will assume that
28715
+ * the device is a valid 390 disk.
28717
+ * remove it from the discover list.
28719
+ rc = evms_cs_remove_logical_node_from_list(
28720
+ discover_list, disk);
28722
+ LOG_ERROR("error(%d) removing disk(%s) from discover list.\n",
28727
+ /* allocate space for the geometry packet
28729
+ rc = evms_cs_allocate_memory((void **)&geo, sizeof(struct hd_geometry));
28731
+ /* issue the Get GEO ioctl
28733
+ rc = evms_cs_kernel_ioctl(disk, HDIO_GETGEO, (unsigned long)geo);
28735
+ LOG_ERROR("error(%d) from HDIO_GETGEO ioctl.\n", rc);
28739
+ /* retrieve the vsects_per_hardsect (hardsector size)
28741
+ vsects_per_hardsect = disk->hardsector_size;
28742
+ vsects_per_hardsect >>= EVMS_VSECTOR_SIZE_SHIFT;
28743
+ rc = evms_cs_allocate_memory((void **)&data, EVMS_VSECTOR_SIZE);
28746
+ /* go read the 1st block on the disk
28748
+ io_start = info->label_block * vsects_per_hardsect;
28749
+ rc = INIT_IO(disk, READ, io_start, 1, data);
28751
+ LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28752
+ rc, io_start, disk->name);
28754
+// print_mem(data, EVMS_VSECTOR_SIZE);
28758
+ int offset, size, psize, counter = 0;
28759
+ format1_label_t f1;
28760
+ volume_label_t vlabel;
28761
+ ibm_partition_t partition_type;
28763
+ /* determine the format type
28766
+ strncpy (type, data, 4);
28767
+ if ((!info->FBA_layout) && (!strcmp(info->type,"ECKD"))) {
28768
+ strncpy ( name, data + 8, 6);
28770
+ strncpy ( name, data + 4, 6);
28772
+ memcpy (&vlabel, data, sizeof(volume_label_t));
28776
+ partition_type = get_partition_type(type);
28777
+ LOG_DETAILS("disk: raw type(%s), type(%s), name(%s)\n",
28778
+ type, part_names[partition_type], name);
28779
+ switch ( partition_type ) {
28780
+ case ibm_partition_cms1:
28781
+ if (*((long *)data + 13) != 0) {
28782
+ /* disk is reserved minidisk */
28783
+ long *label=(long*)data;
28784
+ vsects_per_hardsect = label[3] >> EVMS_VSECTOR_SIZE_SHIFT;
28785
+ offset = label[13];
28786
+ size = (label[7] - 1) * vsects_per_hardsect;
28787
+ LOG_DEBUG("(MDSK)");
28789
+ offset = info->label_block + 1;
28790
+ size = disk->total_vsectors;
28792
+ offset *= vsects_per_hardsect;
28793
+ /* adjust for 0 thru label block offset
28796
+ rc = s390_process_segment(discover_list,
28803
+ case ibm_partition_lnx1:
28804
+ case ibm_partition_none:
28805
+ offset = info->label_block + 1;
28806
+ offset *= vsects_per_hardsect;
28807
+ size = disk->total_vsectors;
28808
+ /* adjust for 0 thru label block offset
28811
+ rc = s390_process_segment(discover_list,
28818
+ case ibm_partition_vol1:
28819
+ /* get block number and read then first format1 label */
28820
+ blk = cchhb2blk(&vlabel.vtoc, geo) + 1;
28821
+ io_start = blk * vsects_per_hardsect;
28822
+ rc = INIT_IO(disk, READ, io_start, 1, data);
28824
+ LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28825
+ rc, io_start, disk->name);
28828
+// print_mem(data, EVMS_VSECTOR_SIZE);
28830
+ memcpy (&f1, data, sizeof(format1_label_t));
28832
+ while (f1.DS1FMTID == _ascebc['1']) {
28833
+ offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
28834
+ psize = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
28835
+ offset + geo->sectors;
28838
+ rc = s390_process_segment(discover_list,
28840
+ offset * vsects_per_hardsect,
28841
+ psize * vsects_per_hardsect,
28846
+ io_start = blk * vsects_per_hardsect;
28847
+ rc = INIT_IO(disk, READ, io_start, 1, data);
28849
+ LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28850
+ rc, io_start, disk->name);
28853
+// print_mem(data, EVMS_VSECTOR_SIZE);
28855
+ memcpy (&f1, data, sizeof(format1_label_t));
28859
+ rc = s390_process_segment(discover_list,
28860
+ disk, 0, 0, 0, 1);
28865
+ evms_cs_deallocate_memory(info);
28868
+ evms_cs_deallocate_memory(geo);
28871
+ evms_cs_deallocate_memory(data);
28877
+ * Function: s390_partition_discover
28881
+s390_partition_discover(evms_logical_node_t **discover_list)
28884
+ evms_logical_node_t *node, *next_node;
28886
+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
28888
+ /* initialize global variable */
28889
+ exported_nodes = 0;
28891
+ /* examine each node on the discover list */
28892
+ next_node = *discover_list;
28893
+ while(next_node) {
28894
+ node = next_node;
28895
+ next_node = node->next;
28896
+ if (GetPluginType(node->plugin->id) != EVMS_DEVICE_MANAGER)
28897
+ /* only process disk nodes
28900
+ s390_probe_for_segments(discover_list, node);
28903
+ LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
28904
+ __FUNCTION__, exported_nodes, rc);
28905
+ if (exported_nodes)
28906
+ rc = exported_nodes;
28911
+ * Function: s390_partition_delete
28915
+s390_partition_delete(evms_logical_node_t *segment)
28918
+ local_instance_data_t *LID;
28919
+ evms_logical_node_t *empty_disk = NULL;
28921
+ LOG_DETAILS("deleting segment '%s'.\n",segment->name);
28926
+ LID = segment->instance_data;
28928
+ /* remove the segment from the
28929
+ * disk's segment list
28931
+ rc = remove_segment_from_disk(
28932
+ LID->source_disk,
28935
+ /* free the local instance data */
28936
+ evms_cs_deallocate_memory(LID);
28938
+ /* free the segment node */
28939
+ evms_cs_deallocate_logical_node(segment);
28940
+ MOD_DEC_USE_COUNT;
28941
+ /* if the last segment on the disk was
28942
+ * deleted, delete the disk node too
28945
+ DELETE(empty_disk);
28951
+ * function: s390_partition_io_error
28953
+ * this function was primarily created because the function
28954
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
28955
+ * to be set on inline functions. Since this was an error path
28956
+ * and not mainline, I decided to add a trace statement to help
28957
+ * report on the failing condition.
28961
+s390_partition_io_error(
28962
+ evms_logical_node_t *node,
28966
+ LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
28967
+ (io_flag) ? "WRITE" : "READ",
28968
+ node->total_vsectors - 1,
28972
+ EVMS_IO_ERROR(eio);
28976
+ * Function: s390_partition_read
28980
+s390_partition_read(
28981
+ evms_logical_node_t *partition,
28984
+ local_instance_data_t *LID = partition->instance_data;
28986
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
28987
+ eio->rsector += LID->start_sect;
28988
+ R_IO(LID->source_disk, eio);
28990
+ s390_partition_io_error(partition, READ, eio);
28994
+ * Function: s390_partition_write
28998
+s390_partition_write(
28999
+ evms_logical_node_t *partition,
29002
+ local_instance_data_t *LID = partition->instance_data;
29004
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
29005
+ eio->rsector += LID->start_sect;
29006
+ W_IO(LID->source_disk, eio);
29008
+ s390_partition_io_error(partition, WRITE, eio);
29012
+ * Function: s390_partition_init_io
29016
+s390_partition_init_io(
29017
+ evms_logical_node_t *partition,
29018
+ int io_flag, /* 0=read, 1=write*/
29019
+ evms_sector_t sect_nr, /* disk LBA */
29020
+ evms_sector_t num_sects, /* # of sectors */
29021
+ void *buf_addr) /* buffer address */
29024
+ local_instance_data_t *LID = partition->instance_data;
29026
+ if ((sect_nr + num_sects) <= partition->total_vsectors) {
29027
+ rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
29029
+ LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
29030
+ (io_flag) ? "WRITE" : "READ",
29032
+ (LID->nr_sects - 1),
29033
+ sect_nr, num_sects);
29041
+ * Function: s390_partition_ioctl
29045
+s390_partition_ioctl (
29046
+ evms_logical_node_t *partition,
29047
+ struct inode *inode,
29048
+ struct file *file,
29049
+ unsigned int cmd,
29050
+ unsigned long arg)
29052
+ local_instance_data_t *LID;
29053
+ struct hd_geometry hd_geo;
29057
+ LID = partition->instance_data;
29061
+ case HDIO_GETGEO:
29063
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
29065
+ if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
29068
+ hd_geo.start = LID->start_sect;
29069
+ if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
29073
+ case EVMS_GET_BMAP:
29075
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
29076
+ bmap->rsector += LID->start_sect;
29077
+ /* intentionally fall thru to
29078
+ * default ioctl down to device
29083
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
29089
+ * Function: s390_part_init
29093
+s390_part_init(void)
29095
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
29098
+static void __exit
29099
+s390_part_exit(void)
29101
+ evms_cs_unregister_plugin(&plugin_header);
29104
+module_init(s390_part_init);
29105
+module_exit(s390_part_exit);
29106
+#ifdef MODULE_LICENSE
29107
+MODULE_LICENSE("GPL");
29110
diff -Naur linux-2002-03-28/drivers/evms/snapshot.c evms-2002-03-28/drivers/evms/snapshot.c
29111
--- linux-2002-03-28/drivers/evms/snapshot.c Wed Dec 31 18:00:00 1969
29112
+++ evms-2002-03-28/drivers/evms/snapshot.c Thu Mar 21 16:17:47 2002
29114
+/* -*- linux-c -*- */
29119
+ * Copyright (c) International Business Machines Corp., 2000
29121
+ * This program is free software; you can redistribute it and/or modify
29122
+ * it under the terms of the GNU General Public License as published by
29123
+ * the Free Software Foundation; either version 2 of the License, or
29124
+ * (at your option) any later version.
29126
+ * This program is distributed in the hope that it will be useful,
29127
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
29128
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
29129
+ * the GNU General Public License for more details.
29131
+ * You should have received a copy of the GNU General Public License
29132
+ * along with this program; if not, write to the Free Software
29133
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29138
+ * linux/drivers/evms/snapshot.c
29141
+ * EVMS SnapShot Feature.
29143
+ * This feature provides the ability to Snapshot ANY existing EVMS volume(including compatibility)
29144
+ * to a new EVMS volume that is created when the SnapShot is enabled.
29146
+ * This feature will appear in the call stack for both the original and the snapshot volume.
29149
+#include <linux/module.h>
29150
+#include <linux/kernel.h>
29151
+#include <linux/config.h>
29152
+#include <linux/genhd.h>
29153
+#include <linux/major.h>
29154
+#include <linux/string.h>
29155
+#include <linux/blk.h>
29156
+#include <linux/init.h>
29157
+#include <linux/slab.h>
29158
+#include <linux/vmalloc.h>
29159
+#include <linux/evms/evms_kernel.h>
29160
+#include <linux/evms/evms_snapshot.h>
29161
+#include <asm/system.h>
29162
+#include <asm/uaccess.h>
29164
+#define LOG_PREFIX "snapshot: "
29166
+static struct proc_dir_entry * snap_proc = NULL;
29168
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list );
29169
+static int delete_snapshot_volume( evms_logical_node_t * node );
29170
+static void read_snap( evms_logical_node_t * node,
29172
+static void write_snap( evms_logical_node_t * node,
29174
+static int init_io_snap( evms_logical_node_t * node,
29176
+ evms_sector_t sect_nr,
29177
+ evms_sector_t num_sects,
29178
+ void * buf_addr );
29179
+static int ioctl_snap( evms_logical_node_t * node,
29180
+ struct inode * inode,
29181
+ struct file * file,
29182
+ unsigned int cmd,
29183
+ unsigned long arg );
29184
+static int add_snapshot(evms_logical_node_t * node,
29185
+ snapshot_metadata_t * metadata,
29186
+ evms_logical_node_t ** evms_node_list );
29187
+static int snap_proc_read(char * page,
29195
+/********** Required Plugin Functions **********/
29198
+static evms_plugin_function_table_t function_table = {
29199
+ discover: &discover_snapshot_volumes,
29200
+ delete : &delete_snapshot_volume,
29201
+ read : &read_snap,
29202
+ write : &write_snap,
29203
+ init_io : &init_io_snap,
29204
+ ioctl : &ioctl_snap
29208
+static evms_plugin_header_t plugin_header = {
29209
+ id : SetPluginID(
29211
+ EVMS_ASSOCIATIVE_FEATURE, // Feature class
29212
+ EVMS_SNAPSHOT_FEATURE_ID ), // Unique ID within features
29218
+ required_common_services_version : {
29219
+ major : EVMS_COMMON_SERVICES_MAJOR,
29220
+ minor : EVMS_COMMON_SERVICES_MINOR,
29221
+ patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
29223
+ function_table : &function_table // function table for this plugin
29227
+ * Function: convert_metadata
29229
+ * Performs endian conversion on metadata sector.
29231
+static int convert_metadata( snapshot_metadata_t * metadata ){
29233
+ metadata->chunk_size = le32_to_cpu(metadata->chunk_size);
29234
+ metadata->flags = le32_to_cpu(metadata->flags);
29235
+ metadata->lba_of_COW_table = le64_to_cpu(metadata->lba_of_COW_table);
29236
+ metadata->lba_of_first_chunk = le64_to_cpu(metadata->lba_of_first_chunk);
29237
+ metadata->original_size = le64_to_cpu(metadata->original_size);
29238
+ metadata->signature = le32_to_cpu(metadata->signature);
29239
+ metadata->total_chunks = le32_to_cpu(metadata->total_chunks);
29240
+ metadata->version.major = le32_to_cpu(metadata->version.major);
29241
+ metadata->version.minor = le32_to_cpu(metadata->version.minor);
29242
+ metadata->version.patchlevel = le32_to_cpu(metadata->version.patchlevel);
29243
+ metadata->CRC = le32_to_cpu(metadata->CRC);
29249
+ * Function: insert_snapshot_hash_entry
29251
+ * This function inserts a new entry into a snapshot hash chain, immediately
29252
+ * following the specified entry. This function should not be used to add an
29253
+ * entry into an empty list, or as the first entry in an existing list. For
29254
+ * that case, use insert_snapshot_map_entry_at_head().
29256
+static int insert_snapshot_hash_entry( snapshot_hash_entry_t * entry,
29257
+ snapshot_hash_entry_t * base )
29259
+ entry->next = base->next;
29260
+ entry->prev = base;
29261
+ base->next = entry;
29262
+ if ( entry->next ) {
29263
+ entry->next->prev = entry;
29269
+ * Function: insert_snapshot_hash_entry_at_head
29271
+ * This function inserts a new entry into a snapshot chain as the first
29272
+ * entry in the chain.
29274
+static int insert_snapshot_hash_entry_at_head( snapshot_hash_entry_t * entry,
29275
+ snapshot_hash_entry_t ** head )
29277
+ entry->next = *head;
29278
+ entry->prev = NULL;
29280
+ if ( entry->next ) {
29281
+ entry->next->prev = entry;
29288
+ * Function: set_snapshot_flags
29290
+ * Set a bit in the flags field of the metadata to mark the snapshot node
29291
+ * as either disabled or full, and write the metadata sector to the
29292
+ * snapshot volume. The node passed in to this function should be the
29293
+ * "lower" of the snapshot nodes, meaning the one passed into the snapshot
29294
+ * plugin, not the one exported from the plugin. Currently, appropriate
29295
+ * values for "flag" are EVMS_SNAPSHOT_DISABLED and EVMS_SNAPSHOT_FULL.
29297
+static int set_snapshot_flags( evms_logical_node_t * snap_node,
29298
+ unsigned long flag )
29300
+ unsigned char data[EVMS_VSECTOR_SIZE] = {0};
29301
+ snapshot_metadata_t * metadata = (snapshot_metadata_t*)data;
29303
+ // Read the metadata sector
29304
+ if ( INIT_IO( snap_node, 0, snap_node->total_vsectors-3, 1, data ) ) {
29307
+ // Set the appropriate flag.
29308
+ // do endian conversion on the fly
29309
+ metadata->flags |= cpu_to_le32(flag);
29310
+ metadata->CRC = 0;
29311
+ metadata->CRC = evms_cs_calculate_crc(
29312
+ EVMS_INITIAL_CRC,
29313
+ metadata, sizeof(snapshot_metadata_t));
29314
+ // Write the metadata sector back to the volume
29315
+ if ( INIT_IO( snap_node, 1, snap_node->total_vsectors-3, 1, data ) ) {
29323
+ * Function: discover_snapshot_volumes
29325
+ * Inspect the global node list, looking for volumes with a valid
29326
+ * snapshot metadata sector.
29328
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list )
29330
+ evms_logical_node_t * node;
29331
+ evms_logical_node_t * next_node;
29332
+ snapshot_metadata_t * metadata = NULL;
29334
+ int org_crc, final_crc;
29336
+ if ( evms_cs_allocate_memory( (void**)&metadata, EVMS_VSECTOR_SIZE )) {
29340
+ for ( node = *evms_node_list; node && (rc == 0); node = next_node) {
29341
+ next_node = node->next;
29342
+ // if the id of this node is ours, skip to next node because this
29343
+ // must be one we put back on the list
29344
+ if (node->plugin->id == plugin_header.id) {
29347
+ if (node->feature_header && node->feature_header->feature_id == plugin_header.id) {
29348
+ // Read next to last sector for the snapshot metadata. Check for
29349
+ // a valid snapshot signature.
29350
+ if ( INIT_IO(node, 0, node->total_vsectors-3, 1, metadata) ) {
29351
+ LOG_ERROR("IO error on '%s' sector %Ld.\n",
29352
+ node->name, node->total_vsectors-3);
29353
+ rc = -EVMS_FEATURE_FATAL_ERROR;
29354
+ evms_cs_remove_logical_node_from_list(evms_node_list,node);
29358
+ if ( le32_to_cpu(metadata->signature) == EVMS_SNAPSHOT_SIGNATURE ) {
29359
+ org_crc = le32_to_cpu(metadata->CRC);
29360
+ metadata->CRC = 0;
29361
+ final_crc = evms_cs_calculate_crc(
29362
+ EVMS_INITIAL_CRC,
29363
+ metadata, sizeof(snapshot_metadata_t));
29364
+ if (final_crc != org_crc) {
29365
+ LOG_ERROR("CRC error in feature data on '%s'.\n", node->name);
29366
+ rc = -EVMS_FEATURE_FATAL_ERROR;
29367
+ evms_cs_remove_logical_node_from_list(evms_node_list,node);
29370
+ convert_metadata(metadata);
29371
+ if (metadata->version.major > plugin_header.version.major) {
29372
+ LOG_ERROR("ERROR: unsuppoprted version of feature in meta data on '%s'.\n",
29374
+ rc = -EVMS_FEATURE_FATAL_ERROR;
29375
+ evms_cs_remove_logical_node_from_list(evms_node_list,node);
29378
+ rc = add_snapshot(node, metadata, evms_node_list);
29385
+ evms_cs_deallocate_memory(metadata);
29392
+ * Function: check_quiesce
29394
+ * Make sure a snapshot and it's original volume quiesced.
29396
+static int check_quiesce( snapshot_volume_t * org_volume )
29398
+ snapshot_volume_t * next_vol;
29399
+ for ( next_vol = org_volume; next_vol; next_vol = next_vol->snapshot_next ) {
29400
+ if ( ! (next_vol->flags & EVMS_SNAPSHOT_QUIESCED) ) {
29401
+ LOG_ERROR("Can't delete snapshot, volume '%s' not quiesced.\n",
29402
+ next_vol->logical_node->name);
29411
+ * Function: remove_snapshot_from_chain
29413
+ * Remove the specified snapshot volume from its original's chain of
29416
+static int remove_snapshot_from_chain( snapshot_volume_t * snap_volume )
29418
+ snapshot_volume_t * org_volume = snap_volume->snapshot_org;
29420
+ if ( org_volume ) {
29421
+ while ( org_volume->snapshot_next && org_volume->snapshot_next != snap_volume ) {
29422
+ org_volume = org_volume->snapshot_next;
29424
+ if ( org_volume->snapshot_next ) {
29425
+ org_volume->snapshot_next = org_volume->snapshot_next->snapshot_next;
29428
+ snap_volume->snapshot_org = NULL;
29429
+ snap_volume->snapshot_next = NULL;
29435
+ * Function: delete_snapshot_hash_chain
29437
+ * Delete all items in a single chain in the hash table.
29439
+static int delete_snapshot_hash_chain( snapshot_hash_entry_t * head )
29441
+ snapshot_hash_entry_t * next;
29444
+ next = head->next;
29445
+ evms_cs_deallocate_memory(head);
29453
+ * Function: delete_snapshot_volume
29455
+ * Delete the in-memory representation of a volume. The specified node
29456
+ * can actually be either a snapshot or an original. Deleting a snapshot
29457
+ * causes it to be removed from its original's chain of snapshots.
29459
+static int delete_snapshot_volume(evms_logical_node_t * node)
29461
+ snapshot_volume_t * volume = (snapshot_volume_t *) node->instance_data;
29462
+ snapshot_volume_t * org_volume = volume->snapshot_org;
29463
+ snapshot_volume_t * next_vol;
29467
+ // Delete the instance data
29469
+ if (volume->flags & EVMS_SNAPSHOT) {
29470
+ // This node is a snapshot. Remove it from the
29471
+ // original's list. Check all snapshots in the chain
29472
+ // for quiesce before this is done.
29473
+ if ( !(volume->flags & EVMS_SNAPSHOT_QUIESCED) ){
29476
+ if ( volume->snapshot_org &&
29477
+ !(org_volume->flags & EVMS_SNAPSHOT_QUIESCED)) {
29481
+ remove_snapshot_from_chain( volume );
29483
+ // If we just deleted the only/last snapshot for this
29484
+ // original, the original will not be modified. It is
29485
+ // the engine's responsibility to delete the original
29486
+ // and rediscover in order to clear it of its snapshot
29487
+ // information. Even if that doesn't happen, the state
29488
+ // of the kernel will still be safe. I/O's coming into
29489
+ // this plugin for the original will just be passed
29490
+ // down without any other action or modification.
29492
+ // Unregister the proc-fs entry for this node.
29493
+ if ( snap_proc ) {
29494
+ remove_proc_entry(node->volume_info->volume_name, snap_proc);
29498
+ // This is an original. It's the engine's responsibility
29499
+ // to delete all snapshots before deleting an original.
29500
+ // Otherwise, a snapshot could be left pointing to an
29501
+ // original that no longer exists. Thus, we just need to
29502
+ // make sure there are no snapshots in the chain.
29503
+ if ( (rc = check_quiesce(volume)) ) {
29504
+// if ( volume->snapshot_next ) {
29507
+ // loop through all snapshots left on this original, and
29508
+ // NULL out their org pointer and mark disabled, in case they don't get deleted.
29509
+ for ( next_vol = volume->snapshot_next;
29510
+ next_vol; next_vol = next_vol->snapshot_next ) {
29511
+ next_vol->snapshot_org = NULL;
29512
+ next_vol->flags |= EVMS_SNAPSHOT_DISABLED; // disable in memory only.
29516
+ // Free up all memory used by the instance data, including
29517
+ // the underlying node, the hash table, and the data buffer.
29518
+ if (volume->logical_node) {
29519
+ if ( (rc = DELETE(volume->logical_node)) ) {
29523
+ if (volume->snapshot_map) {
29524
+ // Delete all of the hash chains, then the actual table.
29525
+ for ( i = 0; i < volume->hash_table_size; i++ ) {
29526
+ delete_snapshot_hash_chain( volume->snapshot_map[i] );
29528
+ vfree(volume->snapshot_map);
29530
+ if (volume->chunk_data_buffer) {
29531
+ evms_cs_deallocate_memory(volume->chunk_data_buffer);
29534
+ evms_cs_deallocate_memory(volume);
29537
+ evms_cs_deallocate_logical_node(node);
29539
+ MOD_DEC_USE_COUNT;
29545
+ * Function: search_snapshot_hash_chain
29547
+ * This function will search the hash chain that is anchored at the
29548
+ * specified head pointer. If the sector number is found, a pointer to that
29549
+ * entry in the chain is set, and a 1 is returned. If the sector is not
29550
+ * found, a pointer to the previous entry is set and 0 is returned. If the
29551
+ * return pointer is NULL, this means either the list is empty, or the
29552
+ * specified sector should become the first list item.
29554
+static int search_snapshot_hash_chain( u_int64_t chunk,
29555
+ snapshot_hash_entry_t * head,
29556
+ snapshot_hash_entry_t ** result )
29558
+ snapshot_hash_entry_t * curr = head;
29559
+ snapshot_hash_entry_t * prev = head;
29560
+ while ( curr && curr->org_chunk < chunk ) {
29562
+ curr = curr->next;
29564
+ if (!curr) { // Either an empty chain or went off the end of the chain.
29568
+ else if ( curr->org_chunk != chunk ) {
29569
+ *result = curr->prev;
29580
+ * Function: snapshot_remap_chunk
29582
+ * This function performs a sector remap on a snapshot volume. This should
29583
+ * be called from the I/O read path, It first determines the base sector of
29584
+ * the chunk containing the specified sector, and saves the remainder. Then
29585
+ * it performs a search through the snapshot map for the specified volume.
29586
+ * If a match is found, the sector number is changed to the new value. If
29587
+ * no match is found, the value is left the same, meaning the read should
29588
+ * proceed down the original volume.
29590
+static int snapshot_remap_chunk(snapshot_volume_t * snap_volume,
29591
+ evms_sector_t * sector )
29593
+ snapshot_hash_entry_t * result;
29594
+ unsigned long hash_value;
29596
+ unsigned long remainder;
29598
+ remainder = *sector & (u_int64_t)( snap_volume->chunk_size -1);
29599
+ chunk = *sector >> snap_volume->chunk_shift;
29600
+ hash_value = ((unsigned long)chunk) % snap_volume->hash_table_size;
29602
+ if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &result ) ) {
29603
+ *sector = (result->snap_chunk << snap_volume->chunk_shift) + remainder;
29611
+ * Function: read_snap
29613
+static void read_snap( evms_logical_node_t * node, eio_t *eio)
29615
+ snapshot_volume_t * volume = (snapshot_volume_t * ) node->instance_data;
29618
+ if ( (eio->rsector + eio->rsize) > node->total_vsectors ) {
29619
+ EVMS_IO_ERROR(eio);
29623
+ // On a read to the original, we can just pass it through completely
29624
+ // untouched. Only reads to the snapshot can be broken up.
29625
+ if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
29626
+ R_IO(volume->logical_node,eio);
29630
+ // Lock the snapshot before processing the request.
29631
+ down(&volume->snap_semaphore);
29633
+ // Make sure the snapshot is not full/disabled, and that
29634
+ // the original is present.
29635
+ if ( (volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL)) ||
29636
+ (! volume->snapshot_org) ) {
29637
+ EVMS_IO_ERROR(eio);
29638
+ up(&volume->snap_semaphore);
29643
+ // Check if this sector has been remapped
29644
+ if ( snapshot_remap_chunk(volume, &eio->rsector)){
29645
+ // Has not been remapped. Send IO to the original.
29646
+ R_IO(volume->snapshot_org->logical_node,eio);
29648
+ // Sector was remapped. Send IO to the snapshot.
29649
+ R_IO(volume->logical_node,eio);
29652
+ up(&volume->snap_semaphore);
29656
+static int snapshot_copy_1( snapshot_volume_t * snap_volume, evms_sector_t org_sector,
29657
+ u_int64_t * remap_chunk) {
29659
+ snapshot_hash_entry_t * target_entry;
29660
+ snapshot_hash_entry_t * new_map_entry;
29661
+ snapshot_volume_t * org_volume = snap_volume->snapshot_org;
29662
+ unsigned long hash_value;
29664
+ u_int32_t io_size = snap_volume->chunk_size;
29665
+ int i, iterations = 1;
29667
+ if ( SNAPSHOT_CHUNK_BUFFER_SIZE < snap_volume->chunk_size ) {
29668
+ iterations = snap_volume->chunk_size / org_volume->chunk_size;
29669
+ io_size = org_volume->chunk_size;
29672
+ // Lock out this snapshot while we are remapping.
29673
+ down(&snap_volume->snap_semaphore);
29675
+ // Make sure the snapshot has not been disabled.
29676
+ if ( snap_volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL) ) {
29677
+ up(&snap_volume->snap_semaphore);
29681
+ // Search the hash table to see if this sector has already been
29682
+ // remapped on this snapshot.
29683
+ chunk = org_sector >> snap_volume->chunk_shift;
29684
+ hash_value = (long)chunk % snap_volume->hash_table_size;
29685
+ if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &target_entry ) ) {
29686
+ // Chunk is already remapped.
29687
+ up(&snap_volume->snap_semaphore);
29688
+ *remap_chunk = target_entry->snap_chunk;
29692
+ // Is there enough room remaining on the snapshot to
29693
+ // remap this chunk?
29694
+ if ( snap_volume->next_free_chunk >= snap_volume->num_chunks ) {
29695
+ // Once the snapshot becomes full, further writes to the
29696
+ // original can't be remapped, and thus this snapshot
29697
+ // will become "corrupted".
29698
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_FULL);
29699
+ snap_volume->flags |= EVMS_SNAPSHOT_FULL;
29700
+ up(&snap_volume->snap_semaphore);
29705
+ for ( i = 0; i < iterations; i++ ) {
29706
+ // Read the part of all chunk from the original volume.
29707
+ if ( INIT_IO( org_volume->logical_node, 0, chunk * snap_volume->chunk_size + i*io_size, io_size, org_volume->chunk_data_buffer ) ) {
29708
+ // An error reading from the original volume is very bad.
29709
+ // If the read fails, the original write will likely fail
29710
+ // as well, so let's just return an error.
29711
+ up(&snap_volume->snap_semaphore);
29715
+ // save of chunk number of the destination in snapshot of where this remap is going.
29716
+ *remap_chunk = snap_volume->next_free_chunk;
29717
+ // Write this chunk to the snapshot volume.
29718
+ if ( INIT_IO( snap_volume->logical_node, 1, (snap_volume->next_free_chunk * snap_volume->chunk_size + i*io_size), io_size, org_volume->chunk_data_buffer) ) {
29719
+ // An error writing to the snapshot is the same
29720
+ // situation as a full snapshot.
29721
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29722
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29723
+ up(&snap_volume->snap_semaphore);
29724
+ LOG_ERROR("I/O error on COW on '%s' disabling snapshot.\n",
29725
+ snap_volume->logical_node->name);
29729
+ // Fill in the appropriate COW table entry and write that
29730
+ // metadata sector back to the snapshot volume.
29731
+ // convert to little endian on disk
29732
+ snap_volume->cow_table[snap_volume->next_cow_entry] = cpu_to_le64(chunk);
29733
+ if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
29734
+ // The data was written to the snapshot, but writing the
29735
+ // metadata failed.
29736
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29737
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29738
+ up(&snap_volume->snap_semaphore);
29739
+ LOG_ERROR("I/O error on COW table on '%s' disabling snapshot.\n",
29740
+ snap_volume->logical_node->name);
29743
+ snap_volume->next_cow_entry++;
29744
+ if ( snap_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u_int64_t)) ) {
29745
+ snap_volume->next_cow_entry = 0;
29746
+ snap_volume->current_cow_sector++;
29747
+ memset( snap_volume->cow_table, 0xff, SECTOR_SIZE );
29748
+ if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
29749
+ // Can't clear out the next sector of metadata. This
29750
+ // is bad and would kill us on a new discover, so
29751
+ // disable the snapshot now before we really screw up.
29752
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29753
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29754
+ up(&snap_volume->snap_semaphore);
29755
+ LOG_ERROR("I/O error on COW table init on '%s' disabling snapshot.\n",
29756
+ snap_volume->logical_node->name);
29761
+ // Create a new snapshot map entry and add it in the appropriate
29762
+ // place in the map.
29763
+ if ( evms_cs_allocate_memory((void **)&new_map_entry, sizeof(snapshot_hash_entry_t)) ) {
29764
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29765
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29766
+ up(&snap_volume->snap_semaphore);
29767
+ LOG_ERROR("no memory for remap entry, on '%s' disabling snapshot.\n",
29768
+ snap_volume->logical_node->name);
29771
+ new_map_entry->org_chunk = chunk;
29772
+ new_map_entry->snap_chunk = snap_volume->next_free_chunk;
29774
+ if ( target_entry ) {
29775
+ insert_snapshot_hash_entry( new_map_entry, target_entry );
29778
+ insert_snapshot_hash_entry_at_head( new_map_entry, &(snap_volume->snapshot_map[hash_value]) );
29780
+ snap_volume->next_free_chunk++;
29782
+ up(&snap_volume->snap_semaphore);
29787
+ * Function: snapshot_copy_data
29789
+ * On a write to a snapshotted volume, check all snapshots to see if the
29790
+ * specified chunk has already been remapped. If it has not, read the
29791
+ * original data from the volume, write the data to the next available
29792
+ * chunk on the snapshot, update the COW table, write the COW table to
29793
+ * the snapshot, and insert a new entry into the snapshot map.
29795
+static int snapshot_copy_data( snapshot_volume_t * org_volume,
29796
+ evms_sector_t org_sector)
29798
+ snapshot_volume_t * snap_volume;
29799
+ snapshot_volume_t * next_volume;
29800
+ u_int64_t remap_chunk; // unused here, needed for call to copy1
29802
+ // Volumes can be snapshotted multiple times. Check every snapshot.
29803
+ for ( snap_volume = org_volume->snapshot_next; snap_volume; snap_volume = next_volume ) {
29804
+ next_volume = snap_volume->snapshot_next;
29805
+ snapshot_copy_1(snap_volume, org_sector, &remap_chunk);
29814
+ * Function: write_snap
29816
+static void write_snap( evms_logical_node_t * node, eio_t * eio)
29818
+ snapshot_volume_t * volume = (snapshot_volume_t *) node->instance_data;
29820
+ u_int64_t remap_chunk;
29821
+ u_int64_t remainder;
29825
+ if ( eio->rsector + eio->rsize > node->total_vsectors) {
29826
+ EVMS_IO_ERROR(eio);
29830
+ // if this is a snapshot
29831
+ if ( volume->flags & EVMS_SNAPSHOT ) {
29832
+ if ( volume->flags & EVMS_SNAPSHOT_WRITEABLE) {
29833
+ if (snapshot_copy_1(volume, eio->rsector, &remap_chunk)){
29834
+ EVMS_IO_ERROR(eio);
29836
+ remainder = eio->rsector & (u_int64_t)(volume->chunk_size -1);
29837
+ eio->rsector = (remap_chunk * volume->chunk_size) + remainder;
29838
+ W_IO(volume->logical_node,eio);
29841
+ EVMS_IO_ERROR(eio);
29845
+ } else{ // write to original
29846
+ // Remap this sector if necessary.
29847
+ if ( (rc = snapshot_copy_data(volume, eio->rsector)) ) {
29850
+ W_IO(volume->logical_node,eio);
29857
+ * Function: ioctl_snap
29860
+static int ioctl_snap( evms_logical_node_t * logical_node,
29861
+ struct inode * inode,
29862
+ struct file * file,
29863
+ unsigned int cmd,
29864
+ unsigned long arg)
29867
+ snapshot_volume_t * volume = (snapshot_volume_t*)logical_node->instance_data;
29869
+ if (!inode || !logical_node) {
29873
+ case EVMS_QUIESCE_VOLUME:
29875
+ evms_quiesce_volume_t *tmp = (evms_quiesce_volume_t*)arg;
29876
+ if ( tmp->command ) { // Quiesce
29877
+ volume->flags |= EVMS_SNAPSHOT_QUIESCED;
29879
+ else { // Un-quiesce
29880
+ volume->flags &= ~EVMS_SNAPSHOT_QUIESCED;
29885
+ case EVMS_GET_BMAP:
29887
+ if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
29888
+ rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29896
+ case EVMS_PLUGIN_IOCTL:
29898
+ evms_plugin_ioctl_t tmp, *user_parms;
29899
+ int percent_full;
29900
+ user_parms = (evms_plugin_ioctl_t *)arg;
29902
+ /* copy user's parameters to kernel space */
29903
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
29907
+ /* is this cmd targetted at this feature ? */
29908
+ if (tmp.feature_id == logical_node->plugin->id) {
29909
+ switch(tmp.feature_command) {
29910
+ case SNAPSHOT_QUERY_PERCENT_FULL:
29911
+ if (volume->flags & EVMS_SNAPSHOT_FULL) {
29912
+ percent_full = -1;
29913
+ } else if (volume->flags & EVMS_SNAPSHOT_DISABLED) {
29914
+ percent_full = -2;
29916
+ percent_full = (volume->next_free_chunk * 100) / volume->num_chunks;
29918
+ rc = copy_to_user(tmp.feature_ioctl_data, &percent_full, sizeof(percent_full));
29922
+ } else { /* broadcast this cmd to all children */
29923
+ rc = IOCTL(logical_node,inode, file, cmd, arg);
29929
+ case EVMS_CHECK_MEDIA_CHANGE:
29930
+ case EVMS_REVALIDATE_DISK:
29931
+ case EVMS_GET_DISK_LIST:
29933
+ if (!(volume->flags & EVMS_SNAPSHOT_ORG)) {
29934
+ volume = volume->snapshot_org;
29936
+ while ( volume ) {
29937
+ rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29938
+ volume = volume->snapshot_next;
29943
+ rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29950
+static int init_io_snap(evms_logical_node_t * node,
29951
+ int io_flag, /* 0=read, 1=write*/
29952
+ evms_sector_t sect_nr, /* disk LBA */
29953
+ evms_sector_t num_sects, /* # of sectors */
29954
+ void * buf_addr ) /* buffer address */
29956
+ snapshot_volume_t * volume = (snapshot_volume_t *)(node->instance_data);
29958
+ // no init io access to snapshot, and no writes allowed to original
29959
+ // since they would not be snapshotted.
29960
+ if (io_flag || (volume->flags & EVMS_SNAPSHOT)) {
29963
+ return INIT_IO(volume->logical_node, io_flag, sect_nr, num_sects, buf_addr);
29969
+ * Function: snapshot_init
29972
+int __init snapshot_init(void)
29974
+ struct proc_dir_entry * pde;
29976
+ // Register a directory in proc-fs.
29977
+ pde = evms_cs_get_evms_proc_dir();
29979
+ snap_proc = create_proc_entry("snapshot", S_IFDIR, pde);
29982
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
29986
+ * Function: snapshot_exit
29988
+void __exit snapshot_exit(void)
29990
+ struct proc_dir_entry * pde;
29992
+ // Unregister the directory in proc-fs.
29993
+ pde = evms_cs_get_evms_proc_dir();
29995
+ remove_proc_entry("snapshot", pde);
29998
+ evms_cs_unregister_plugin(&plugin_header);
30001
+module_init(snapshot_init);
30002
+module_exit(snapshot_exit);
30003
+#ifdef MODULE_LICENSE
30004
+MODULE_LICENSE("GPL");
30008
+/********** SnapShot Functions **********/
30013
+ * Function: add_cow_entry_to_snapshot_map
30015
+ * This function takes a cow table entry (from the on-disk data), and
30016
+ * converts it into an appropriate entry for the snapshot map, and
30017
+ * inserts it into the appropriate map for the specified volume.
30019
+static int add_cow_entry_to_snapshot_map( u_int64_t org_chunk,
30020
+ u_int64_t snap_chunk,
30021
+ snapshot_volume_t * volume )
30023
+ snapshot_hash_entry_t * new_entry;
30024
+ snapshot_hash_entry_t * target_entry;
30025
+ unsigned long hash_value;
30027
+ evms_cs_allocate_memory((void **)&new_entry,sizeof (snapshot_hash_entry_t));
30028
+ if (!new_entry) {
30031
+ new_entry->org_chunk = org_chunk;
30032
+ new_entry->snap_chunk = snap_chunk;
30034
+ hash_value = (long)org_chunk % volume->hash_table_size;
30035
+ if ( search_snapshot_hash_chain( org_chunk, volume->snapshot_map[hash_value], &target_entry ) ) {
30036
+ // This means a duplicate mapping was found. This should not happen.
30039
+ if ( target_entry ) {
30040
+ insert_snapshot_hash_entry( new_entry, target_entry );
30043
+ insert_snapshot_hash_entry_at_head( new_entry, &(volume->snapshot_map[hash_value]) );
30051
+ * Function: build_snapshot_maps
30053
+ * Construct the initial hash table state based on
30054
+ * existing COW tables on the disk.
30056
+static int build_snapshot_maps(snapshot_volume_t * volume)
30062
+ // Read in one sector's worth of COW tables.
30063
+ if ( INIT_IO(volume->logical_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
30066
+ // Translate every valid COW table entry into
30067
+ // a snapshot map entry.
30068
+ for ( volume->next_cow_entry = 0;
30069
+ volume->next_cow_entry < (SECTOR_SIZE/sizeof(u_int64_t)) &&
30070
+ volume->cow_table[volume->next_cow_entry] != 0xffffffffffffffff;
30071
+ volume->next_cow_entry++, volume->next_free_chunk++ ) {
30072
+ if ( (rc = add_cow_entry_to_snapshot_map( le64_to_cpu(volume->cow_table[volume->next_cow_entry]),
30073
+ volume->next_free_chunk, volume ))) {
30077
+ // Move on to the next sector if necessary.
30078
+ if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u_int64_t)) ) {
30079
+ volume->current_cow_sector++;
30090
+ * Function: add_snapshot
30092
+ * Initializes a snapshot instance and exports an evms_logical_node to
30093
+ * the global list.
30095
+static int add_snapshot(evms_logical_node_t * snap_node,
30096
+ snapshot_metadata_t * metadata,
30097
+ evms_logical_node_t ** evms_node_list )
30099
+ evms_logical_node_t * new_snap_node;
30100
+ evms_logical_node_t * new_org_node;
30101
+ evms_logical_node_t * org_node;
30102
+ snapshot_volume_t * snap_volume;
30103
+ snapshot_volume_t * org_volume;
30104
+ snapshot_volume_t * tmp_volume;
30107
+ evms_cs_remove_logical_node_from_list(evms_node_list,snap_node);
30109
+ // Make sure the snapshot is not full or disabled.
30110
+ if ( metadata->flags & (EVMS_SNAPSHOT_DISABLED | EVMS_SNAPSHOT_FULL) ) {
30111
+ LOG_WARNING("Error: Snapshot %s discovered as disabled/full.\n", snap_node->name);
30112
+ LOG_WARNING(" Deleting from further use.\n");
30113
+ DELETE(snap_node);
30117
+ // Inspect the global list until a node is found with the name of
30118
+ // this snapshot's original. There can only be one original for
30119
+ // each snapshot.
30120
+ for ( org_node = *evms_node_list;
30122
+ strncmp(EVMS_GET_NODE_NAME(org_node), metadata->original_volume, EVMS_VOLUME_NAME_SIZE);
30123
+ org_node = org_node->next ) {
30127
+ // No original was found. Disable and delete the snapshot.
30128
+ LOG_WARNING("Error: No original found for snapshot %s, looking for %s\n", snap_node->name,metadata->original_volume);
30129
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30130
+ DELETE(snap_node);
30134
+ LOG_EXTRA("Adding snapshot for volume %s\n",org_node->name);
30136
+ // ok, we found the original on the list.
30137
+ // verify the size to be sure the name didn't change for compatibility
30138
+ if (org_node->total_vsectors != metadata->original_size) {
30139
+ LOG_WARNING("Error: Original volume size does not match\n");
30140
+ LOG_WARNING(" vol=%s: org_size=%d, current size=%d\n",
30141
+ org_node->name, (int)(metadata->original_size), (int)(org_node->total_vsectors));
30142
+ // The snapshot no longer points at a valid original.
30143
+ // Disable and delete the snapshot.
30144
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30145
+ DELETE(snap_node);
30149
+ // New EVMS node for the snapshot
30150
+ if ( evms_cs_allocate_logical_node( &new_snap_node ) ) {
30151
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30152
+ DELETE( snap_node );
30156
+ MOD_INC_USE_COUNT;
30158
+ // Instance data for the snapshot
30159
+ if ( evms_cs_allocate_memory( (void**)&snap_volume, sizeof(snapshot_volume_t) )) {
30160
+ delete_snapshot_volume( new_snap_node );
30161
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30162
+ DELETE( snap_node );
30166
+ // Initialize the snapshot node
30167
+ if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
30168
+ new_snap_node->flags = snap_node->flags;
30169
+ }else { // if not writeable, set read only
30170
+ new_snap_node->flags = snap_node->flags | EVMS_VOLUME_SET_READ_ONLY;
30172
+ new_snap_node->flags = new_snap_node->flags |
30173
+ (org_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30174
+ new_snap_node->system_id = 0x536e4170; // SnAp
30175
+ new_snap_node->total_vsectors = org_node->total_vsectors; // Lying about the size.
30176
+ new_snap_node->block_size = snap_node->block_size;
30177
+ new_snap_node->hardsector_size = snap_node->hardsector_size;
30178
+ new_snap_node->plugin = &plugin_header;
30179
+ new_snap_node->instance_data = (void*)snap_volume;
30180
+ // Get the new node's name from the consumed node's feature
30182
+ strcpy(new_snap_node->name, snap_node->feature_header->object_name);
30183
+ // No problem with propagating the volume name up.
30184
+ new_snap_node->volume_info = snap_node->volume_info;
30186
+ // Initialize the instance data
30187
+ snap_volume->logical_node = snap_node;
30188
+ snap_volume->chunk_size = metadata->chunk_size;
30189
+ snap_volume->chunk_shift = evms_cs_log2((u_int64_t)metadata->chunk_size);
30190
+ snap_volume->num_chunks = metadata->total_chunks;
30191
+ snap_volume->current_cow_sector = metadata->lba_of_COW_table;
30192
+ snap_volume->hash_table_size = (metadata->total_chunks)/MAX_HASH_CHAIN_ENTRIES + 1;
30193
+ snap_volume->flags = EVMS_SNAPSHOT;
30194
+ if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
30195
+ snap_volume->flags |= EVMS_SNAPSHOT_WRITEABLE;
30198
+ // Snapshot hash table
30199
+ snap_volume->snapshot_map = vmalloc(snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
30200
+ if ( !snap_volume->snapshot_map) {
30201
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30202
+ delete_snapshot_volume( new_snap_node );
30206
+ memset(snap_volume->snapshot_map, 0, snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
30208
+ if ( (rc = build_snapshot_maps(snap_volume)) ){
30209
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30210
+ delete_snapshot_volume( new_snap_node );
30214
+ // check to see if the node we found is one we put back on the list due to
30215
+ // another snapshot of the original, if so then don't allocate a new
30216
+ // node and volume info, just get the old
30217
+ if (org_node->plugin->id != plugin_header.id) {
30219
+ // New EVMS node for the original
30220
+ if ( evms_cs_allocate_logical_node( &new_org_node ) ) {
30221
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30222
+ delete_snapshot_volume( new_snap_node );
30226
+ MOD_INC_USE_COUNT;
30228
+ // Instance data for the original
30229
+ if ( evms_cs_allocate_memory( (void**)&org_volume, sizeof(snapshot_volume_t) )) {
30230
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30231
+ delete_snapshot_volume( new_snap_node );
30232
+ delete_snapshot_volume( new_org_node );
30236
+ // Initialize the new node
30237
+ new_org_node->flags = org_node->flags |
30238
+ (snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30239
+ new_org_node->system_id = 0x4f724967; // OrIg
30240
+ new_org_node->total_vsectors = org_node->total_vsectors;
30241
+ new_org_node->block_size = org_node->block_size;
30242
+ new_org_node->hardsector_size = org_node->hardsector_size;
30243
+ new_org_node->plugin = &plugin_header;
30244
+ new_org_node->instance_data = (void*)org_volume;
30245
+ // Must reuse the original node's name
30246
+ strcpy(new_org_node->name, org_node->name);
30247
+ new_org_node->volume_info = org_node->volume_info;
30249
+ // Initialize the instance data
30250
+ org_volume->chunk_size = SNAPSHOT_CHUNK_BUFFER_SIZE;
30251
+ org_volume->num_chunks = 0;
30252
+ org_volume->current_cow_sector = 0;
30253
+ org_volume->flags = EVMS_SNAPSHOT_ORG;
30254
+ org_volume->snapshot_next = snap_volume;
30255
+ snap_volume->snapshot_next = NULL;
30257
+ // Buffer for copying data from the original to the snapshot
30258
+ if ( evms_cs_allocate_memory( (void**)(&org_volume->chunk_data_buffer), SNAPSHOT_CHUNK_BUFFER_SIZE * SECTOR_SIZE)) {
30259
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30260
+ delete_snapshot_volume( new_snap_node );
30261
+ delete_snapshot_volume( new_org_node );
30265
+ // remove the original volume from the global list, then
30266
+ // add the new version of the original to the global list.
30267
+ evms_cs_remove_logical_node_from_list(evms_node_list,org_node);
30268
+ org_volume->logical_node = org_node;
30269
+ evms_cs_add_logical_node_to_list(evms_node_list,new_org_node);
30272
+ // There is already at least one snapshot for this original.
30273
+ new_org_node = org_node;
30274
+ org_volume = (snapshot_volume_t*)org_node->instance_data;
30276
+ // propagate the flags from the new snapshot node to the original, and then to every other snapshot
30277
+ for (tmp_volume=org_volume; tmp_volume;tmp_volume=tmp_volume->snapshot_next) {
30278
+ tmp_volume->logical_node->flags = org_node->flags |
30279
+ (snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30281
+ // Insert the new snapshot at the start of the original's chain.
30282
+ snap_volume->snapshot_next = org_volume->snapshot_next;
30283
+ org_volume->snapshot_next = snap_volume;
30286
+ if ( snap_proc ) {
30287
+ create_proc_read_entry(snap_node->feature_header->volume_name, S_IFREG, snap_proc, snap_proc_read, new_snap_node);
30290
+ init_MUTEX( &snap_volume->snap_semaphore );
30291
+ snap_volume->snapshot_org = org_volume;
30292
+ evms_cs_add_logical_node_to_list(evms_node_list,new_snap_node);
30299
+/* Function: snap_proc_read
30301
+ * Callback function for the proc-fs entry for each snapshot node.
30302
+ * Print out pertinent information about this snapshot. The "data"
30303
+ * parameter is a pointer to an EVMS logical node.
30305
+static int snap_proc_read(char * page,
30312
+ evms_logical_node_t * snap_node = data;
30313
+ snapshot_volume_t * snap_volume = snap_node->instance_data;
30316
+ PROCPRINT("Snapshot of : %s\n", (snap_volume->snapshot_org) ? EVMS_GET_NODE_NAME(snap_volume->snapshot_org->logical_node) : "Unknown");
30317
+ PROCPRINT("Size (KB) : %ld\n", (snap_volume->num_chunks * snap_volume->chunk_size)/2);
30318
+ PROCPRINT("Chunk Size (KB): %ld\n", (snap_volume->chunk_size)/2);
30319
+ PROCPRINT("Writeable : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_WRITEABLE) ? "True" : "False");
30320
+ PROCPRINT("Usage : %ld%%\n", (snap_volume->next_free_chunk * 100) / snap_volume->num_chunks);
30321
+ PROCPRINT("Status : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_FULL) ? "Full / Disabled" : (snap_volume->flags & EVMS_SNAPSHOT_DISABLED) ? "Disabled" : "Active");
30326
diff -Naur linux-2002-03-28/include/linux/evms/evms.h evms-2002-03-28/include/linux/evms/evms.h
30327
--- linux-2002-03-28/include/linux/evms/evms.h Wed Dec 31 18:00:00 1969
30328
+++ evms-2002-03-28/include/linux/evms/evms.h Mon Mar 25 15:51:13 2002
30330
+/* -*- linux-c -*- */
30333
+ * Copyright (c) International Business Machines Corp., 2000
30335
+ * This program is free software; you can redistribute it and/or modify
30336
+ * it under the terms of the GNU General Public License as published by
30337
+ * the Free Software Foundation; either version 2 of the License, or
30338
+ * (at your option) any later version.
30340
+ * This program is distributed in the hope that it will be useful,
30341
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
30342
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
30343
+ * the GNU General Public License for more details.
30345
+ * You should have received a copy of the GNU General Public License
30346
+ * along with this program; if not, write to the Free Software
30347
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30350
+ * linux/include/linux/evms/evms.h
30352
+ * EVMS public kernel header file
30356
+#ifndef __EVMS_INCLUDED__
30357
+#define __EVMS_INCLUDED__
30359
+#include <linux/genhd.h>
30360
+#include <linux/fs.h>
30361
+#include <linux/iobuf.h>
30362
+#include <linux/kdev_t.h>
30363
+#include <linux/hdreg.h>
30364
+#include <linux/slab.h>
30365
+#include <linux/proc_fs.h>
30370
+/* tracing info */
30371
+#define EVMS_INFO_CRITICAL 0
30372
+#define EVMS_INFO_SERIOUS 1
30373
+#define EVMS_INFO_ERROR 2
30374
+#define EVMS_INFO_WARNING 3
30375
+#define EVMS_INFO_DEFAULT 5
30376
+#define EVMS_INFO_DETAILS 6
30377
+#define EVMS_INFO_DEBUG 7
30378
+#define EVMS_INFO_EXTRA 8
30379
+#define EVMS_INFO_ENTRY_EXIT 9
30380
+#define EVMS_INFO_EVERYTHING 10
30382
+extern int evms_info_level;
30383
+/* information message: e.g., configuration, major event */
30384
+#define evmsTRACE(info_level,prspec) { if (evms_info_level >= info_level) printk prspec; }
30385
+#define evmsTRACE2(info_level,statement) { if (evms_info_level >= info_level) statement; }
30386
+// sample - be sure to use enclose "prspec" or "statement" with parens ()
30387
+// evmsTRACE(info_level,(KERN_INFO "evms_myfunction: name = %s\n", name));
30388
+// evmsTRACE2(info_level,(print_mem( buffer_address, buffer_length)));
30390
+/* LOG MACROS to make evms log messages look much
30391
+ * cleaner in the source.
30393
+#define EVMS_LOG_PREFIX "evms: "
30394
+#define LOG_CRITICAL(msg, args...) evmsTRACE(EVMS_INFO_CRITICAL, (KERN_CRIT EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30395
+#define LOG_SERIOUS(msg, args...) evmsTRACE(EVMS_INFO_SERIOUS, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30396
+#define LOG_ERROR(msg, args...) evmsTRACE(EVMS_INFO_ERROR, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30397
+#define LOG_WARNING(msg, args...) evmsTRACE(EVMS_INFO_WARNING, (KERN_WARNING EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30398
+#define LOG_DEFAULT(msg, args...) evmsTRACE(EVMS_INFO_DEFAULT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30399
+#define LOG_DETAILS(msg, args...) evmsTRACE(EVMS_INFO_DETAILS, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30400
+#define LOG_DEBUG(msg, args...) evmsTRACE(EVMS_INFO_DEBUG, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30401
+#define LOG_EXTRA(msg, args...) evmsTRACE(EVMS_INFO_EXTRA, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30402
+#define LOG_ENTRY_EXIT(msg, args...) evmsTRACE(EVMS_INFO_ENTRY_EXIT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30403
+#define LOG_EVERYTHING(msg, args...) evmsTRACE(EVMS_INFO_EVERYTHING, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30405
+#ifdef CONFIG_PROC_FS
30406
+#define PROCPRINT(msg, args...) (sz += sprintf(page + sz, msg, ## args))
30409
+#define EVMS_HANDLE_KEY 0x89ABCDEF
30411
+/* Plugin structure definitions */
30413
+typedef struct evms_plugin_header_s {
30415
+ evms_version_t version;
30416
+ evms_version_t required_common_services_version;
30417
+ struct evms_plugin_function_table_s *function_table;
30418
+} evms_plugin_header_t;
30420
+typedef struct evms_volume_info_s {
30421
+/* 0*/ u_int64_t volume_serial_number;
30422
+/* 8*/ u_int32_t volume_system_id; /* the minor is stored here */
30423
+/* 12*/ char volume_name[EVMS_VOLUME_NAME_SIZE+1];
30425
+} evms_volume_info_t;
30427
+/* flags field bit definitions in evms_common.h */
30428
+/* iflags field used internally by the kernel only */
30429
+#define EVMS_FEATURE_BOTTOM (1<<0)
30430
+typedef struct evms_logical_node_s {
30431
+/* 0*/ evms_sector_t total_vsectors;
30432
+/* 8*/ evms_plugin_header_t * plugin;
30433
+/* 12*/ void * instance_data; /* ptr to private instance data */
30434
+/* 16*/ unsigned int flags;
30435
+/* 20*/ unsigned int iflags;
30436
+/* 24*/ int hardsector_size;
30437
+/* 28*/ int block_size;
30438
+/* 32*/ unsigned int system_id;
30439
+/* 36*/ evms_volume_info_t * volume_info;
30440
+/* 40*/ evms_feature_header_t * feature_header;
30441
+/* 44*/ struct evms_logical_node_s * next;
30442
+/* 48*/ char name[EVMS_VOLUME_NAME_SIZE+1];
30444
+} evms_logical_node_t;
30446
+/* this macro will retrieve the appropriate kernel node name
30447
+ * based on the node type.
30449
+#define EVMS_GET_NODE_NAME(node) \
30450
+ ((node->flags & EVMS_VOLUME_FLAG) ? \
30451
+ node->volume_info->volume_name : \
30454
+/* bit definitions of FLAGS field in logical volume struct */
30455
+/* NOTE: these bit field definitions can be found in
30456
+ * evms_ioctl.h above the evms_volume_data_t structure
30458
+typedef struct evms_logical_volume_s {
30459
+ char * name; /* devfs name if any */
30460
+ evms_logical_node_t * node; /* ptr to top logical node */
30463
+ int vfs_quiesced;
30464
+ atomic_t requests_in_progress;
30465
+ wait_queue_head_t wait_queue;
30466
+ devfs_handle_t devfs_handle;
30468
+ request_queue_t request_queue;
30470
+} evms_logical_volume_t;
30472
+/* EVMS generic I/O structure */
30473
+typedef struct eio_s {
30474
+ evms_sector_t rsector;
30475
+ evms_sector_t rsize;
30476
+ struct buffer_head *bh;
30479
+/* Abstraction MACROs */
30480
+#define EVMS_IO_ERROR(eio) (buffer_IO_error(eio->bh))
30483
+ * The following function table is used for all plugins.
30485
+typedef struct evms_plugin_function_table_s {
30486
+ int (* discover)(evms_logical_node_t **);
30487
+ int (* end_discover)(evms_logical_node_t **);
30488
+ int (* delete) (evms_logical_node_t *);
30489
+ void (* read) (evms_logical_node_t *, eio_t *);
30490
+ void (* write) (evms_logical_node_t *, eio_t *);
30491
+ int (* init_io) (evms_logical_node_t *, int, evms_sector_t,
30492
+ evms_sector_t, void *);
30493
+ int (* ioctl) (evms_logical_node_t *, struct inode *,
30494
+ struct file *, unsigned int, unsigned long);
30495
+ int (* direct_ioctl)(struct inode *, struct file *,
30496
+ unsigned int, unsigned long);
30497
+} evms_plugin_function_table_t;
30500
+ * These macros facilitate easier use of the
30501
+ * entry points in the function table
30503
+#define DISCOVER(node, list) ((node)->plugin->function_table->discover(list))
30504
+#define END_DISCOVER(node, list) ((node)->plugin->function_table->end_discover(list))
30505
+#define DELETE(node) ((node)->plugin->function_table->delete(node))
30506
+#define R_IO(node, eio) ((node)->plugin->function_table->read(node, eio))
30507
+#define W_IO(node, eio) ((node)->plugin->function_table->write(node, eio))
30508
+#define INIT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->init_io(node, rw_flag, start_sec, num_secs, buf_addr))
30509
+#define INT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->int_io(node, rw_flag, start_sec, num_secs, buf_addr))
30510
+#define IOCTL(node, inode, file, cmd, arg) ((node)->plugin->function_table->ioctl(node, inode, file, cmd, arg))
30511
+#define DIRECT_IOCTL(reg_record, inode, file, cmd, arg) ((reg_record)->plugin->function_table->direct_ioctl(inode, file, cmd, arg))
30513
+typedef struct evms_list_node_s {
30515
+ struct evms_list_node_s *next;
30516
+} evms_list_node_t;
30518
+/* pool management structure */
30519
+typedef struct evms_pool_mgmt_s {
30520
+ kmem_cache_t *cachep;
30523
+ atomic_t waiters;
30524
+ wait_queue_head_t wait_queue;
30525
+ /* WARNING!!! pool name MUST be less than 20 chars */
30527
+} evms_pool_mgmt_t;
30531
+ * All of the following kernel thread functions belong to EVMS base.
30532
+ * These functions were copied from md_core.c
30534
+#define EVMS_THREAD_WAKEUP 0
30535
+typedef struct evms_thread_s {
30536
+ void (*run) (void *data);
30538
+ wait_queue_head_t wqueue;
30539
+ unsigned long flags;
30540
+ struct completion *event;
30541
+ struct task_struct *tsk;
30542
+ const char *name;
30545
+/* EVMS (common services) exported functions prototypes */
30546
+#define EVMS_COMMON_SERVICES_MAJOR 0
30547
+#define EVMS_COMMON_SERVICES_MINOR 6
30548
+#define EVMS_COMMON_SERVICES_PATCHLEVEL 0
30550
+void evms_cs_get_version(int *, int *);
30551
+int evms_cs_check_version(evms_version_t *, evms_version_t *);
30552
+int evms_cs_register_plugin(evms_plugin_header_t *);
30553
+int evms_cs_unregister_plugin(evms_plugin_header_t *);
30554
+#ifdef EVMS_MEM_DEBUG
30555
+int evms_cs_verify_memory_integrity(int);
30557
+int evms_cs_allocate_memory(void **, int);
30558
+int evms_cs_deallocate_memory(void *);
30559
+int evms_cs_allocate_logical_node(evms_logical_node_t **);
30560
+void evms_cs_deallocate_volume_info(evms_logical_node_t *);
30561
+int evms_cs_deallocate_logical_node(evms_logical_node_t *);
30562
+int evms_cs_add_logical_node_to_list(evms_logical_node_t **,
30563
+ evms_logical_node_t *);
30564
+int evms_cs_remove_logical_node_from_list(evms_logical_node_t **,
30565
+ evms_logical_node_t *);
30566
+int evms_cs_kernel_ioctl(evms_logical_node_t *, unsigned int,
30568
+int evms_cs_get_hardsect_size(evms_logical_node_t *, int *);
30569
+int evms_cs_get_blocksize_size(evms_logical_node_t *, int *);
30570
+unsigned long evms_cs_size_in_sectors(unsigned long, unsigned long);
30571
+unsigned long evms_cs_size_in_vsectors(long long);
30572
+int evms_cs_log2(long long);
30573
+u_int32_t evms_cs_calculate_crc(u_int32_t, void *, u_int32_t);
30574
+int evms_cs_register_for_end_io_notification(void *,
30575
+ struct buffer_head *,
30576
+ void *callback_function);
30577
+evms_pool_mgmt_t * evms_cs_create_pool(
30580
+ void (*ctor)(void*, kmem_cache_t *, unsigned long),
30581
+ void (*dtor)(void*, kmem_cache_t *, unsigned long));
30582
+#define EVMS_BLOCKABLE TRUE
30583
+void * evms_cs_allocate_from_pool(evms_pool_mgmt_t *, int);
30584
+void evms_cs_deallocate_to_pool(evms_pool_mgmt_t *, void *);
30585
+void evms_cs_destroy_pool(evms_pool_mgmt_t *);
30586
+int evms_cs_add_item_to_list(evms_list_node_t **, void *);
30587
+int evms_cs_remove_item_from_list(evms_list_node_t **, void *);
30588
+int evms_cs_register_device(evms_logical_node_t *);
30589
+int evms_cs_unregister_device(evms_logical_node_t *);
30590
+int evms_cs_find_next_device(evms_logical_node_t *,
30591
+ evms_logical_node_t **);
30592
+void evms_cs_signal_event(int);
30593
+evms_thread_t * evms_cs_register_thread (
30594
+ void (*run) (void *),
30596
+ const char *name);
30597
+void evms_cs_unregister_thread (evms_thread_t *thread);
30598
+void evms_cs_wakeup_thread(evms_thread_t *thread);
30599
+void evms_cs_interrupt_thread (evms_thread_t *thread);
30600
+struct proc_dir_entry *evms_cs_get_evms_proc_dir(void);
30601
+int evms_cs_volume_request_in_progress(kdev_t, int, int *);
30604
+/* EVMS exported global variables */
30605
+extern evms_pool_mgmt_t *evms_bh_pool;
30606
+extern char *evms_primary_string;
30607
+extern char *evms_secondary_string;
30609
diff -Naur linux-2002-03-28/include/linux/evms/evms_aix.h evms-2002-03-28/include/linux/evms/evms_aix.h
30610
--- linux-2002-03-28/include/linux/evms/evms_aix.h Wed Dec 31 18:00:00 1969
30611
+++ evms-2002-03-28/include/linux/evms/evms_aix.h Wed Mar 27 19:27:56 2002
30614
+* The following structures are nested within the structures used by the
30615
+* system management routines. These structures and sizes were pulled from the AIX
30618
+#define LVM_MAXLPS 65535 /* max number of logical partitions allowed */
30619
+#define LVM_NAMESIZ 64 /* maximum size for the logical volume name */
30620
+#define LVM_NUMCOPIES 3 /* max number of copies allowed of a logical partition */
30621
+#define LVM_MAXVGS 255
30622
+#define LVM_MAXPVS 32
30623
+#define LVM_MAXLVS 256
30624
+#define AIX_MIN_BLOCK_SIZE 4096
30625
+#define VGSA_BT_PV 127
30628
+#define OFFSET_CONSTANT 144
30629
+#define SLEEP_TIME 0
30630
+#define MAXLVS_OFFSET 16
30631
+#define PHYS_VOL_OFFSET 34
30632
+#define AIX_PVHPP_LENGTH PHYS_VOL_OFFSET
30633
+#define MAX_SECTORS_NAMELIST 32
30634
+#define AIX_DEFAULT_MIRRORING 1
30635
+#define AIX_FIRST_MIRROR 2
30636
+#define AIX_MAX_MIRRORS 3 // AIX defines ALL copies as mirrors - 3 mirrors MAX - 1 orig and 2 copies
30638
+#define PSN_LVM_REC 7
30639
+#define PSN_VGSA_REC 128
30640
+#define PSN_NAMELIST_REC 2065
30641
+#define PSN_VGT_TRAILER 135
30642
+#define PSN_LVE_REC 1
30643
+#define PSN_PPH_OFFSET 17
30644
+#define PSN_PVH_INCREMENT 34
30645
+#define AIX_SECTOR_SIZE 512
30646
+#define MAX_PPENT_SECTOR 16
30647
+#define NAME_LEN 128 /* don't change!!! */
30648
+#define UUID_LEN 32 /* don't change!!! */
30649
+#define MAX_SECTORS_LV_ENTRIES 16
30650
+#define AIX_MIN_MIRROR_POOL 10
30651
+#define AIX_MIRROR_POOL_CHANGE 10
30653
+#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1)
30654
+#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1)
30655
+#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1)
30656
+#define LV_BMAP _IOWR ( 0xfe, 0x30, 1)
30658
+#define LV_ACTIVE 0x01 /* lv_status */
30659
+#define LV_SPINDOWN 0x02 /* " */
30660
+#define LV_ERROR 0x99 /* " */
30662
+#define VG_ACTIVE 0x01 /* vg_status */
30664
+#define AIX_LV_READ 0x00 /* lv_access */
30665
+#define AIX_LV_WRITE 0x01 /* " */
30666
+#define EVMS_LV_NEW 0x10 // volume was created during the current discovery pass
30667
+#define EVMS_LV_INCOMPLETE 0x20 // volume has an incomplete LE map
30668
+#define EVMS_LV_INVALID 0x40 // volume has a memory-corruption problem
30669
+#define EVMS_VG_DIRTY 0x01 // group has had a new PV added during this discovery
30670
+#define AIX_VG_INCOMPLETE 0x20 // volume group is incomplete
30673
+#define LOG_PREFIX "--AIXlvm: "
30675
+// Entries in the list of physical volumes (PV)
30676
+// in a volume group (VG)
30678
+typedef struct unique_id_s {
30685
+typedef struct _partition_list_entry {
30686
+ evms_logical_node_t * logical_node;
30687
+ u_int32_t pv_number;
30688
+ u_int32_t block_size; // bytes
30689
+ u_int32_t hard_sect_size; // bytes
30690
+ struct _partition_list_entry * next;
30692
+} partition_list_entry_t;
30694
+// Table for mapping logical extents (LE) to physical extents (PE)
30695
+typedef struct _pe_table_entry {
30696
+ partition_list_entry_t * owning_pv;
30697
+ u_int64_t pe_sector_offset;
30698
+} pe_table_entry_t;
30700
+// Logical volumes (LV) in a volume group (VG)
30701
+typedef struct _aix_logical_volume {
30702
+ u_int32_t lv_number;
30703
+ u_int64_t lv_size; // Sectors
30704
+ u_int32_t lv_access; // Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_QUIESCE
30705
+ u_int32_t lv_status; // Flags: LV_ACTIVE, LV_SPINDOWN
30706
+ u_int32_t lv_minor; // Device minor number
30707
+ u_int32_t mirror_copies; // Do we have mirroring and how many ?
30708
+ u_int32_t mirror_number; // mirror number - which copy is this ?
30709
+ u_int32_t mirror_iterations; // Which mirror should we be writing to ?
30710
+ u_int32_t stripes;
30711
+ u_int32_t stripe_size; // Sectors
30712
+ u_int32_t stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
30713
+ u_int32_t pe_size; // Sectors
30714
+ u_int32_t pe_size_shift; // Number of bits to shift right instead of dividing by pe_size
30715
+ u_int32_t num_le; // Number of entries in the le_to_pe_map
30716
+ u_int32_t new_volume; // Flag to indicate if this volume needs to be exported
30717
+ struct _aix_volume_group * group; // Pointer back to parent volume group
30718
+ unsigned char name[EVMS_VOLUME_NAME_SIZE+1]; // Dev-tree volume name (eg: /dev/group0/vol0)
30719
+ pe_table_entry_t * le_to_pe_map; // Mapping of logical to physical extents
30720
+ pe_table_entry_t * le_to_pe_map_mir1; // Mapping of logical to physical extents for mirror 1
30721
+ pe_table_entry_t * le_to_pe_map_mir2; // Mapping of logical to physical extents for mirror 2
30722
+ evms_logical_node_t * volume_node; // Pointer to the parent EVMS node representing this volume
30724
+} aix_logical_volume_t;
30726
+// Volume groups (VG)
30727
+typedef struct _aix_volume_group {
30728
+ unique_id vg_id; // volume group number */
30729
+ u_int32_t numpvs; // Number of PVs found on this VG.
30730
+ u_int32_t numlvs; // Number of LVs found on this VG.
30731
+ u_int32_t hard_sect_size; // The largest hard_sect_size and block_size
30732
+ u_int32_t block_size; // values of all partitions in this group.
30733
+ u_int32_t flags; //
30734
+ u_int32_t lv_max; // maximum logical volumes */
30735
+ u_int32_t pe_size; // physical extent size in sectors */
30736
+ partition_list_entry_t * partition_list; // List of partitions/segments/PVs that make up this VG
30737
+ u_int32_t partition_count;
30738
+ struct _aix_logical_volume ** volume_list; // Array of volumes found in this VG.
30739
+ struct _aix_volume_group * next; // Pointer to the next VG
30740
+ u_int32_t CleanVGInfo; // Do we have a clean VG Info to work with ?
30741
+ daddr_t vgda_psn; // Which VGDA we should use
30742
+ long vgda_len; // length of the volume group descriptor area */
30743
+ struct _vg_header * AIXvgh; // Pointer to valid data area on disk for the VG
30744
+} aix_volume_group_t;
30746
+typedef struct _aix_mirror_bh {
30747
+ atomic_t remaining;
30748
+ int iteration; // 'have we finished' count, used from IRQ handlers
30750
+ u_int64_t mir_sector1;
30751
+ u_int64_t mir_sector2;
30752
+ struct buffer_head *master_bh;
30753
+ struct buffer_head bh_req;
30754
+ struct _aix_mirror_bh *mirror_bh_list;
30755
+ evms_logical_node_t *node; // map to evms node (READ only)
30756
+ evms_logical_node_t *mir_node1; //
30757
+ evms_logical_node_t *mir_node2; //
30759
+ struct _aix_mirror_bh *next_r1; // next for retry or in free list
30760
+} aix_mirror_bh_t;
30762
+typedef struct _timestruc_t
30769
+typedef struct ipl_rec_area
30771
+ unsigned int IPL_record_id; /* This physical volume contains a */
30772
+ /* valid IPL record if and only if */
30773
+ /* this field contains IPLRECID */
30775
+#define IPLRECID 0xc9c2d4c1 /* Value is EBCIDIC 'IBMA' */
30777
+ char reserved1[20];
30778
+ unsigned int formatted_cap; /* Formatted capacity. The number of */
30779
+ /* sectors available after formatting*/
30780
+ /* The presence or absence of bad */
30781
+ /* blocks does not alter this value. */
30783
+ char last_head; /* THIS IS DISKETTE INFORMATION */
30784
+ /* The number of heads minus 1. Heads*/
30785
+ /* are number from 0 to last_head. */
30787
+ char last_sector; /* THIS IS DISKETTE INFORMATION */
30788
+ /* The number of sectors per track. */
30789
+ /* Sectors are numbered from 1 to */
30790
+ /* last_sector. */
30792
+ char reserved2[6];
30794
+ unsigned int boot_code_length; /* Boot code length in sectors. A 0 */
30795
+ /* value implies no boot code present*/
30797
+ unsigned int boot_code_offset; /* Boot code offset. Must be 0 if no */
30798
+ /* boot code present, else contains */
30799
+ /* byte offset from start of boot */
30800
+ /* code to first instruction. */
30802
+ unsigned int boot_lv_start; /* Contains the PSN of the start of */
30805
+ unsigned int boot_prg_start; /* Boot code start. Must be 0 if no */
30806
+ /* boot code present, else contains */
30807
+ /* the PSN of the start of boot code.*/
30809
+ unsigned int boot_lv_length; /* BLV length in sectors. */
30811
+ unsigned int boot_load_add; /* 512 byte boundary load address for*/
30814
+ char boot_frag; /* Boot code fragmentation flag. Must*/
30815
+ /* be 0 if no fragmentation allowed, */
30816
+ /* else must be 0x01. */
30818
+ char boot_emulation; /* ROS network emulation flag */
30819
+ /* 0x0 => not an emul support image */
30820
+ /* 0x1 => ROS network emulation code */
30821
+ /* 0x2 => AIX code supporting ROS emul*/
30823
+ char reserved3[2];
30825
+ ushort basecn_length; /* Number of sectors for base */
30826
+ /* customization. Normal mode. */
30828
+ ushort basecs_length; /* Number of sectors for base */
30829
+ /* customization. Service mode. */
30831
+ unsigned int basecn_start; /* Starting PSN value for base */
30832
+ /* customization. Normal mode. */
30834
+ unsigned int basecs_start; /* Starting PSN value for base */
30835
+ /* customization. Service mode. */
30837
+ char reserved4[24];
30839
+ unsigned int ser_code_length; /* Service code length in sectors. */
30840
+ /* A 0 value implies no service code */
30843
+ unsigned int ser_code_offset; /* Service code offset. Must be 0 if */
30844
+ /* no service code is present, else */
30845
+ /* contains byte offset from start of*/
30846
+ /* service code to first instruction.*/
30848
+ unsigned int ser_lv_start; /* Contains the PSN of the start of */
30851
+ unsigned int ser_prg_start; /* Service code start. Must be 0 if */
30852
+ /* service code is not present, else */
30853
+ /* contains the PSN of the start of */
30854
+ /* service code. */
30856
+ unsigned int ser_lv_length; /* SLV length in sectors. */
30858
+ unsigned int ser_load_add; /* 512 byte boundary load address for*/
30859
+ /* service code. */
30861
+ char ser_frag; /* Service code fragmentation flag. */
30862
+ /* Must be 0 if no fragmentation */
30863
+ /* allowed, else must be 0x01. */
30865
+ char ser_emulation; /* ROS network emulation flag */
30866
+ /* 0x0 => not an emul support image */
30867
+ /* 0x1 => ROS network emulation code */
30868
+ /* 0x2 => AIX code supporting ROS emul*/
30870
+ char reserved5[2];
30872
+ unique_id pv_id; /* The unique identifier for this */
30873
+ /* physical volume. */
30874
+ char dummy[512 - 128 - sizeof(unique_id)];
30875
+}AIXIPL_REC, *AIXIPL_REC_PTR;
30878
+typedef struct AIXlvm_rec_s
30879
+ /* structure which describes the physical volume LVM record */
30881
+ long lvm_id; /* LVM id field which identifies whether the PV is a member of a volume group */
30883
+#define AIX_LVM_LVMID 0x5F4C564D /* LVM id field of ASCII "_LVM" */
30885
+ unique_id vg_id; /* the id of the volume group to which this physical volume belongs */
30886
+ long lvmarea_len; /* the length of the LVM reserved area */
30887
+ long vgda_len; /* length of the volume group descriptor area */
30888
+ daddr_t vgda_psn [2]; /* the physical sector numbers of the beginning of the volume group descriptor area copies on this disk */
30889
+ daddr_t reloc_psn; /* the physical sector number of the beginning of a pool of blocks */
30890
+ /* (located at the end of the PV) which are reserved for the relocation of bad blocks */
30891
+ long reloc_len; /* the length in number of sectors of the pool of bad block relocation blocks */
30892
+ short int pv_num; /* the physical volume number within the volume group of this physical volume */
30893
+ short int pp_size; /* the size in bytes for the partition, expressed as a power of 2 (i.e., the partition size is 2 to the power pp_size) */
30894
+ long vgsa_len; /* length of the volume group status area */
30895
+ daddr_t vgsa_psn [2]; /* the physical sector numbers of the beginning of the volume group status area copies on this disk */
30896
+ short int version; /* the version number of this volume group descriptor and status area */
30898
+#define LVM_VERSION_1 1 /* first version - AIX 3.0 */
30899
+#define LVM_STRIPE_ENHANCE 2 /* version with striped lv's - AIX 4.1 */
30900
+#define LVM_1024_PPSIZE 3 /* ppsizes of 512 and 1024 */
30901
+#define LVM_GT_1016 4 /* version with support for > 1016 pps/pv */
30902
+#define LVM_MAX_VERSION LVM_GT_1016 /* max version # */
30904
+ char res1 [450]; /* reserved area */
30910
+/* II.Volume Group Descriptor Area */
30912
+typedef struct _vgsa_area
30914
+ timestruc_t b_tmstamp; /* Beginning timestamp */
30915
+ unsigned int pv_missing [(LVM_MAXPVS + (NBPI -1)) / NBPI]; /* Bit per PV */
30916
+ unsigned char stalepp [LVM_MAXPVS] [VGSA_BT_PV];
30918
+ char resv[10]; /* Padding */
30919
+ timestruc_t e_tmstamp; /* Ending timestamp */
30923
+typedef struct _vg_header
30925
+ timestruc_t vg_timestamp; /* time of last update */
30926
+ unique_id vg_id; /* unique id for volume group */
30927
+ short numlvs; /* number of lvs in vg */
30928
+ short maxlvs; /* max number of lvs allowed in vg */
30929
+ short pp_size; /* size of pps in the vg */
30930
+ short numpvs; /* number of pvs in the vg */
30931
+ short total_vgdas; /* number of copies of vg */
30932
+ /* descriptor area on disk */
30933
+ short vgda_size; /* size of volume group descriptor */
30936
+ short auto_varyon;
30941
+typedef struct _lv_entries
30943
+ short lvname; /* name of LV */
30944
+ short res1; /* reserved area */
30945
+ int maxsize; /* maximum number of partitions allowed */
30946
+ char lv_state; /* state of logical volume */
30947
+ char mirror; /* none,single, or double */
30948
+ short mirror_policy; /* type of writing used to write */
30949
+ int num_lps; /* number of logical partitions on the lv */
30951
+ char permissions; /* read write or read only */
30952
+ char bb_relocation; /* specifies if bad block */
30953
+ /* relocation is desired */
30954
+ char write_verify; /* verify all writes to the LV */
30955
+ char mirwrt_consist; /* mirror write consistency flag */
30956
+ unsigned short stripe_exp; /* stripe size in exponent value */
30957
+ unsigned short striping_width; /* stripe width */
30958
+ unsigned short lv_avoid;
30959
+ unsigned short child_minor_num;
30960
+ char res4[4]; /* reserved area on disk */
30964
+typedef struct _pv_header
30966
+ unique_id pv_id; /* unique identifier of PV */
30967
+ unsigned short pp_count; /* number of physical partitions */
30969
+ char pv_state; /* state of physical volume */
30970
+ char res1; /* reserved area on disk */
30971
+ daddr_t psn_part1; /* physical sector number of 1st pp */
30972
+ short pvnum_vgdas;/* number of vg descriptor areas */
30973
+ /* on the physical volume */
30974
+ short pv_num; /* PV number */
30975
+ long res2; /* reserved area on disk */
30979
+typedef struct _pp_entries
30981
+ short lv_index; /* index to lv pp is on */
30982
+ short res_1; /* reserved area on disk */
30983
+ long lp_num; /* log. part. number */
30984
+ char copy; /* the copy of the logical partition */
30985
+ /* that this pp is allocated for */
30986
+ char pp_state; /* current state of pp */
30987
+ char fst_alt_vol; /* pv where partition allocation for*/
30988
+ /* first mirror begins */
30989
+ char snd_alt_vol; /* pv where partition allocation for*/
30990
+ /* second mirror begins */
30991
+ short fst_alt_part; /* partition to begin first mirror */
30992
+ short snd_alt_part; /*partition to begin second mirror */
30993
+ double res_3; /* reserved area on disk */
30994
+ double res_4; /* reserved area on disk */
30997
+typedef struct _namelist
30999
+ char name[LVM_MAXLVS][LVM_NAMESIZ];
31002
+typedef struct _vg_trailer
31004
+ timestruc_t timestamp; /* time of last update */
31005
+ short concurrency;
31006
+ /* MS Nibble = concurrent capable */
31007
+ /* LS Nibble = concurrent auto-varyon */
31009
+ int res_3; /* reserved area on disk */
31010
+ double res_4; /* reserved area on disk */
31011
+ double res_5; /* reserved area on disk */
31014
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr.h evms-2002-03-28/include/linux/evms/evms_bbr.h
31015
--- linux-2002-03-28/include/linux/evms/evms_bbr.h Wed Dec 31 18:00:00 1969
31016
+++ evms-2002-03-28/include/linux/evms/evms_bbr.h Tue Mar 26 16:04:31 2002
31020
+ * Copyright (c) International Business Machines Corp., 2000
31022
+ * This program is free software; you can redistribute it and/or modify
31023
+ * it under the terms of the GNU General Public License as published by
31024
+ * the Free Software Foundation; either version 2 of the License, or
31025
+ * (at your option) any later version.
31027
+ * This program is distributed in the hope that it will be useful,
31028
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31029
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31030
+ * the GNU General Public License for more details.
31032
+ * You should have received a copy of the GNU General Public License
31033
+ * along with this program; if not, write to the Free Software
31034
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31038
+ * linux/include/linux/evms_bbr.h
31040
+ * EVMS Bad Block Relocation Feature kernel header file
31044
+#ifndef EVMS_BBR_INCLUDED
31046
+#define EVMS_BBR_INCLUDED
31048
+#define EVMS_BBR_VERSION_MAJOR 1
31049
+#define EVMS_BBR_VERSION_MINOR 0
31050
+#define EVMS_BBR_VERSION_PATCHLEVEL 0
31052
+#define EVMS_BBR_FEATURE_ID 6
31053
+#define EVMS_BBR_SIGNATURE 0x42627246 /* BbrF */
31055
+/* The following defines establish the minimum and maximum number of
31056
+ * replacement sectors which can be allocated for Bad Block Relocation.
31057
+ * Otherwise, 1 replacement sector per MB of disk space is allocated. */
31058
+#define EVMS_BBR_ENTRIES_PER_SECT 31 /* Assume sector size is 512 bytes*/
31059
+#define EVMS_BBR_LIMIT 4096
31061
+#define EVMS_BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
31063
+typedef struct evms_bbr_table_entry_s {
31064
+ u_int64_t bad_sect;
31065
+ u_int64_t replacement_sect;
31066
+} evms_bbr_table_entry_t;
31068
+typedef struct evms_bbr_table_s {
31069
+ u_int32_t signature; /* Signature for a sector of the bbr table (EVMS_BBR_TABLE_SIGNATURE) */
31070
+ u_int32_t crc; /* CRC for this sector of the BBR Table. */
31071
+ u_int32_t sequence_number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
31072
+ u_int32_t in_use_cnt; /* number of in-use entries */
31073
+ evms_bbr_table_entry_t entries[EVMS_BBR_ENTRIES_PER_SECT]; /* BBR table entries available for this sector of the BBR table */
31074
+} evms_bbr_table_t;
31076
+/* description of on disk meta data sector for bbr feature */
31077
+typedef struct evms_bbr_metadata_s {
31078
+/* 0*/ u_int32_t signature; /* EVMS_BBR_SIGNATURE */
31079
+/* 4*/ u_int32_t crc;
31080
+/* 8*/ u_int32_t block_size; /* block size in bytes */
31081
+/*12*/ u_int32_t flags; /* Global flag used by BBR */
31082
+/*16*/ u_int64_t sequence_number;
31083
+/*24*/ u_int64_t start_sect_bbr_table; /* start 64-bit LBA of the BBR table */
31084
+/*32*/ u_int64_t nr_sects_bbr_table; /* number of sectors to hold the BBR table */
31085
+/*40*/ u_int64_t start_replacement_sect; /* start 64-bit LBA of the replacement sectors */
31086
+/*48*/ u_int64_t nr_replacement_blks; /* number of replacement blocks. */
31087
+/*56*/ char pads[456]; /* padding for 512-byte sector alignment */
31088
+} evms_bbr_metadata_t;
31091
+// BBR direct ioctl commands.
31092
+#define BBR_GET_INFO_CMD 1 // Return the total number of sectors
31093
+ // that are currently remapped for the
31095
+#define BBR_STOP_REMAP_CMD 2 // Stop ... do not remap any new sectors
31096
+ // or even honor any existing remaps for
31097
+ // the bbr object until after the next
31098
+ // rediscover command is received.
31099
+#define BBR_SECTOR_IO_CMD 3 // Process an I/O from the engine directly
31100
+ // through the bbr object.
31102
+typedef struct evms_notify_bbr_s {
31103
+ char object_name[EVMS_VOLUME_NAME_SIZE+1]; // Input - Name of bbr object from feature header
31104
+ u_int64_t count; // Output - Count of remapped sectors
31105
+ u_int64_t start_sect; // Input - Starting sector for sector_io
31106
+ u_int64_t nr_sect; // Input - Number of sectors for sector_io
31107
+ unsigned long buffer; // Input - Pointer to buffer for sector_io
31108
+ int rw; // Input - READ or WRITE for sector_io
31109
+} evms_notify_bbr_t;
31114
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr_k.h evms-2002-03-28/include/linux/evms/evms_bbr_k.h
31115
--- linux-2002-03-28/include/linux/evms/evms_bbr_k.h Wed Dec 31 18:00:00 1969
31116
+++ evms-2002-03-28/include/linux/evms/evms_bbr_k.h Wed Mar 27 16:08:55 2002
31118
+#ifndef __EVMS_BBR_K__
31119
+#define __EVMS_BBR_K__
31123
+ * Copyright (c) International Business Machines Corp., 2000
31125
+ * This program is free software; you can redistribute it and/or modify
31126
+ * it under the terms of the GNU General Public License as published by
31127
+ * the Free Software Foundation; either version 2 of the License, or
31128
+ * (at your option) any later version.
31130
+ * This program is distributed in the hope that it will be useful,
31131
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31132
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31133
+ * the GNU General Public License for more details.
31135
+ * You should have received a copy of the GNU General Public License
31136
+ * along with this program; if not, write to the Free Software
31137
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31140
+/* linux/include/linux/evms/evms_bbr_k.h
31142
+ * Kernel header file for Bad Block Relocation (BBR) Feature
31144
+ * BBR feature is designed to remap I/O write failures to another safe location on disk.
31145
+ * Note that most disk drives have BBR built into them, this means that our software BBR
31146
+ * will be only activated when all hardware BBR replacement sectors have been used.
31149
+#include <linux/config.h>
31150
+#include <linux/module.h>
31151
+#include <linux/kernel.h>
31152
+#include <linux/sched.h>
31153
+#include <linux/smp_lock.h>
31154
+#include <linux/locks.h>
31155
+#include <linux/delay.h>
31156
+#include <linux/reboot.h>
31157
+#include <linux/completion.h>
31158
+#include <linux/vmalloc.h>
31159
+#include <asm/uaccess.h>
31160
+#include <linux/blk.h>
31162
+#include <linux/evms/evms_kernel.h>
31163
+#include <linux/evms/evms_bbr.h>
31165
+#define BBR_POOL_NAME_LENGTH 20
31167
+/* Required common services version */
31168
+#define EVMS_BBR_COMMON_SERVICES_MAJOR 0
31169
+#define EVMS_BBR_COMMON_SERVICES_MINOR 6
31170
+#define EVMS_BBR_COMMON_SERVICES_PATCHLEVEL 0
31173
+static int bbr_notify_reboot(
31174
+ struct notifier_block *this,
31175
+ unsigned long code,
31178
+typedef struct bbr_runtime_remap_s {
31179
+ evms_bbr_table_entry_t remap;
31180
+ struct bbr_runtime_remap_s *left; /** for binary tree */
31181
+ struct bbr_runtime_remap_s *right; /** for binary tree */
31182
+}bbr_runtime_remap_t;
31185
+/* local instance data structure definition */
31187
+#define BBR_STOP_REMAP (1<<0)
31189
+typedef struct bbr_instance_data_s {
31190
+ struct bbr_instance_data_s *next; /* link all bbr_instances */
31191
+ evms_logical_node_t *node; /* bbr_node */
31192
+ evms_logical_node_t *source; /* consumed node */
31193
+ evms_bbr_table_t *bbr_table;
31194
+ u_int64_t lba_table1;
31195
+ u_int64_t lba_table2;
31196
+ u_int64_t nr_sects_bbr_table;
31197
+ u_int64_t nr_replacement_blks;
31198
+ u_int64_t start_replacement_sect;
31199
+ u_int32_t blksize_in_sects;
31200
+ evms_pool_mgmt_t *bbr_bh_pool;
31201
+ char bh_pool_name[BBR_POOL_NAME_LENGTH+1];
31202
+ evms_pool_mgmt_t *remap_pool;
31203
+ char remap_pool_name[BBR_POOL_NAME_LENGTH+1];
31204
+ atomic_t in_use_replacement_blks;
31205
+ bbr_runtime_remap_t *remap_root; /* for binary tree */
31206
+ spinlock_t bbr_id_lock; /* lock for runtime remap table */
31208
+ evms_sector_t total_vsectors;
31209
+} bbr_instance_data_t;
31211
+#define BBR_BH_USE_EVMS_CALLBACK (1<<0) // Set if an EVMS callback was registered for this I/O
31213
+typedef struct bbr_bh_s {
31214
+ struct bbr_bh_s *next; // Used by bbr_io_list.
31215
+ bbr_instance_data_t *BBRID; // Object for this request.
31216
+ eio_t eio; // Original eio.
31217
+ atomic_t waiters; // Used by bbr_init_io.
31218
+ int rw; // READ or WRITE
31219
+ int rc; // Return code from bbr_io_handler.
31220
+ unsigned long flag;
31224
+/* --- discovery support functions --- */
31225
+static int load_feature_data(
31226
+ evms_logical_node_t *node,
31227
+ bbr_instance_data_t **ID);
31229
+static int load_meta_data(
31230
+ evms_logical_node_t *node,
31231
+ evms_sector_t LSN,
31232
+ evms_bbr_metadata_t **md,
31233
+ evms_bbr_table_t **bbr_table);
31235
+static int validate_meta_data(evms_bbr_metadata_t *md);
31236
+static int validate_bbr_table_sector(evms_bbr_table_t *p);
31237
+static u_int32_t validate_bbr_table(
31238
+ evms_bbr_metadata_t *md,
31239
+ evms_bbr_table_t *p);
31240
+static u_int32_t validate_bbr_tables(
31241
+ evms_logical_node_t *node,
31242
+ evms_bbr_metadata_t *MD1,
31243
+ evms_bbr_metadata_t *MD2,
31244
+ evms_bbr_table_t *p1,
31245
+ evms_bbr_table_t *p2);
31246
+void update_invalid_bbr_table_sector(
31247
+ evms_logical_node_t *node,
31248
+ evms_bbr_table_t *valid,
31249
+ evms_bbr_table_t *invalid,
31250
+ evms_sector_t LSN);
31252
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID);
31254
+static int bbr_create_pools(bbr_instance_data_t *BBRID);
31255
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID);
31257
+#ifdef EVMS_BBR_DEBUG
31258
+static void print_meta_data(evms_bbr_metadata_t *md);
31259
+static void print_bbr_table_sector(evms_bbr_table_t *bbr_table);
31260
+static void print_remap_list(bbr_instance_data_t *BBRID);
31261
+#define BBR_DEBUG_PRINT_META_DATA(md) print_meta_data(md)
31262
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) print_bbr_table_sector(table)
31263
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID) print_remap_list(BBRID)
31265
+#define BBR_DEBUG_PRINT_META_DATA(md)
31266
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table)
31267
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID)
31270
+#define BBR_BUG(msg) LOG_SERIOUS(__FUNCTION__ msg "\n")
31272
+/* -- Mapping functions -- */
31273
+void bbr_binary_tree_insert(
31274
+ bbr_runtime_remap_t **node,
31275
+ bbr_runtime_remap_t *newnode);
31276
+bbr_runtime_remap_t * bbr_binary_search(
31277
+ bbr_runtime_remap_t *node,
31278
+ evms_sector_t bad_sect);
31279
+static int bbr_insert_remap_entry(
31280
+ bbr_instance_data_t *BBRID,
31281
+ evms_bbr_table_entry_t *new_bbr_entry);
31282
+static evms_bbr_table_entry_t * bbr_search_remap_entry(
31283
+ bbr_instance_data_t *BBRID,
31284
+ evms_sector_t sect);
31285
+static inline int bbr_remap(
31286
+ bbr_instance_data_t *BBRID,
31287
+ evms_sector_t *lsn);
31288
+static void bbr_free_remap(bbr_instance_data_t *BBRID);
31289
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID);
31290
+static inline void bbr_list_add(bbr_instance_data_t *BBRID);
31291
+static void bbr_list_remove(bbr_instance_data_t *BBRID);
31292
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name);
31294
+/* --- runtime support functions --- */
31295
+static bbr_bh_t * allocate_bbr_bh(
31296
+ bbr_instance_data_t *BBRID,
31298
+static void bbr_io_handler( void * void_data );
31300
+/* -- EVMS Plugin interface functions -- */
31301
+static int bbr_discover(evms_logical_node_t **);
31302
+static int bbr_delete(evms_logical_node_t *);
31303
+static void bbr_read(evms_logical_node_t *, eio_t *);
31304
+static void bbr_write(evms_logical_node_t *, eio_t *);
31305
+static int bbr_ioctl (
31306
+ evms_logical_node_t *bbr_node,
31307
+ struct inode *inode,
31308
+ struct file *file,
31309
+ unsigned int cmd,
31310
+ unsigned long arg);
31311
+static int bbr_direct_ioctl (
31312
+ struct inode *inode,
31313
+ struct file *file,
31314
+ unsigned int cmd,
31315
+ unsigned long arg);
31317
+static int bbr_init_io(
31318
+ evms_logical_node_t * bbr_node,
31320
+ evms_sector_t startLSN,
31321
+ evms_sector_t nr_sects,
31325
diff -Naur linux-2002-03-28/include/linux/evms/evms_common.h evms-2002-03-28/include/linux/evms/evms_common.h
31326
--- linux-2002-03-28/include/linux/evms/evms_common.h Wed Dec 31 18:00:00 1969
31327
+++ evms-2002-03-28/include/linux/evms/evms_common.h Wed Mar 27 15:51:36 2002
31329
+/* -*- linux-c -*- */
31332
+ * Copyright (c) International Business Machines Corp., 2000
31334
+ * This program is free software; you can redistribute it and/or modify
31335
+ * it under the terms of the GNU General Public License as published by
31336
+ * the Free Software Foundation; either version 2 of the License, or
31337
+ * (at your option) any later version.
31339
+ * This program is distributed in the hope that it will be useful,
31340
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31341
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31342
+ * the GNU General Public License for more details.
31344
+ * You should have received a copy of the GNU General Public License
31345
+ * along with this program; if not, write to the Free Software
31346
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31349
+ * linux/include/linux/evms/evms_common.h
31351
+ * EVMS common (kernel and user) header file
31355
+#ifndef __EVMS_COMMON_INCLUDED__
31356
+#define __EVMS_COMMON_INCLUDED__
31358
+/* version info */
31359
+#define EVMS_MAJOR 63 /* use experimental major 63 for now */
31360
+#define EVMS_MAJOR_VERSION 1
31361
+#define EVMS_MINOR_VERSION 0
31362
+#define EVMS_PATCHLEVEL_VERSION 0
31364
+#define MAX_EVMS_VOLUMES 256 /* There are 256 minors */
31365
+#define EVMS_VOLUME_NAME_SIZE 127
31367
+#define IBM_OEM_ID 8112 // could be anything, but used
31368
+ // I=8, B=1, M=12
31369
+// this one going away as well.
31370
+#define EVMS_OEM_IBM IBM_OEM_ID
31372
+#define EVMS_INITIAL_CRC 0xFFFFFFFF
31373
+#define EVMS_MAGIC_CRC 0x31415926
31375
+#define EVMS_VSECTOR_SIZE 512
31376
+#define EVMS_VSECTOR_SIZE_SHIFT 9
31378
+#define DEV_PATH "/dev"
31379
+#define EVMS_DIR_NAME "evms"
31380
+#define EVMS_DEV_NAME "block_device"
31381
+#define EVMS_DEV_NODE_PATH DEV_PATH "/" EVMS_DIR_NAME "/"
31382
+#define EVMS_DEVICE_NAME DEV_PATH "/" EVMS_DIR_NAME "/" EVMS_DEV_NAME
31384
+/* EVMS will always use 64-bit fields */
31385
+typedef u_int64_t evms_sector_t;
31387
+typedef struct evms_version_s {
31388
+ /* major changes when incompatible differences are introduced */
31390
+ /* minor changes when additions are made */
31392
+ /* patchlevel changes when bugs are fixed */
31393
+ u_int32_t patchlevel;
31396
+typedef enum evms_plugin_code_s {
31397
+ EVMS_NO_PLUGIN, // 0
31398
+ EVMS_DEVICE_MANAGER, // 1
31399
+ EVMS_SEGMENT_MANAGER, // 2
31400
+ EVMS_REGION_MANAGER, // 3
31401
+ EVMS_FEATURE, // 4
31402
+ EVMS_ASSOCIATIVE_FEATURE, // 5
31403
+ EVMS_FILESYSTEM_INTERFACE_MODULE, // 6
31404
+ EVMS_CLUSTER_MANAGER_INTERFACE_MODULE, // 7
31405
+ EVMS_DISTRIBUTED_LOCK_MANAGER_INTERFACE_MODULE // 8
31406
+} evms_plugin_code_t;
31408
+#define SetPluginID(oem, type, id) ((oem << 16) | (type << 12) | id)
31409
+#define GetPluginOEM(pluginid) (pluginid >> 16)
31410
+#define GetPluginType(pluginid) ((pluginid >> 12) & 0xf)
31411
+#define GetPluginID(pluginid) (pluginid & 0xfff)
31413
+/* bit definitions for the flags field in
31414
+ * the EVMS LOGICAL NODE (kernel) and
31415
+ * the EVMS LOGICAL VOLUME (user) structures.
31417
+#define EVMS_FLAGS_WIDTH 32
31418
+#define EVMS_VOLUME_FLAG (1<<0)
31419
+#define EVMS_VOLUME_PARTIAL_FLAG (1<<1)
31420
+#define EVMS_VOLUME_PARTIAL (1<<1)
31421
+#define EVMS_VOLUME_SET_READ_ONLY (1<<2)
31422
+#define EVMS_VOLUME_READ_ONLY (1<<2)
31423
+/* queued flags bits */
31424
+#define EVMS_REQUESTED_DELETE (1<<5)
31425
+#define EVMS_REQUESTED_QUIESCE (1<<6)
31426
+#define EVMS_REQUESTED_VFS_QUIESCE (1<<7)
31427
+/* this bit indicates corruption */
31428
+#define EVMS_VOLUME_CORRUPT (1<<8)
31429
+/* these bits define the source of the corruption */
31430
+#define EVMS_VOLUME_SOFT_DELETED (1<<9)
31431
+#define EVMS_VOLUME_GENDISK_GONE (1<<10)
31432
+/* these bits define volume status */
31433
+#define EVMS_MEDIA_CHANGED (1<<20)
31434
+#define EVMS_DEVICE_UNPLUGGED (1<<21)
31435
+/* these bits used for removable status */
31436
+#define EVMS_DEVICE_MEDIA_PRESENT (1<<24)
31437
+#define EVMS_DEVICE_PRESENT (1<<25)
31438
+#define EVMS_DEVICE_LOCKABLE (1<<26)
31439
+#define EVMS_DEVICE_REMOVABLE (1<<27)
31441
+/* version info for evms_feature_header_t */
31442
+#define EVMS_FEATURE_HEADER_MAJOR 3
31443
+#define EVMS_FEATURE_HEADER_MINOR 0
31444
+#define EVMS_FEATURE_HEADER_PATCHLEVEL 0
31446
+/* bit definitions of FEATURE HEADER bits in the FLAGS field */
31447
+#define EVMS_FEATURE_ACTIVE (1<<0)
31448
+#define EVMS_FEATURE_VOLUME_COMPLETE (1<<1)
31449
+/* bit definitions for VOLUME bits in the FLAGS field */
31450
+#define EVMS_VOLUME_DATA_OBJECT (1<<16)
31451
+#define EVMS_VOLUME_DATA_STOP (1<<17)
31453
+#define EVMS_FEATURE_HEADER_SIGNATURE 0x54414546 //FEAT
31454
+typedef struct evms_feature_header_s {
31455
+/* 0*/ u_int32_t signature;
31456
+/* 4*/ u_int32_t crc;
31457
+/* 8*/ evms_version_t version; /* structure version */
31458
+/* 20*/ evms_version_t engine_version; /* version of the Engine that */
31459
+ /* wrote this feature header */
31460
+/* 32*/ u_int32_t flags;
31461
+/* 36*/ u_int32_t feature_id;
31462
+/* 40*/ u_int64_t sequence_number;
31463
+/* 48*/ u_int64_t alignment_padding;
31464
+ //required: starting lsn to 1st copy of feature's metadata.
31465
+/* 56*/ evms_sector_t feature_data1_start_lsn;
31466
+/* 64*/ evms_sector_t feature_data1_size; //in 512 byte units
31467
+ //optional: starting lsn to 2nd copy of feature's metadata.
31468
+ // if unused set size field to 0.
31469
+/* 72*/ evms_sector_t feature_data2_start_lsn;
31470
+/* 80*/ evms_sector_t feature_data2_size; //in 512 byte units
31471
+/* 88*/ u_int64_t volume_serial_number;
31472
+/* 96*/ u_int32_t volume_system_id; /* the minor is stored here */
31473
+/*100*/ u_int32_t object_depth; /* depth of object in the volume tree */
31474
+/*104*/ char object_name[EVMS_VOLUME_NAME_SIZE+1];
31475
+/*232*/ char volume_name[EVMS_VOLUME_NAME_SIZE+1];
31476
+/*360*/ unsigned char pad[152];
31478
+} evms_feature_header_t;
31480
+/* EVMS specific error codes */
31481
+#define EVMS_FEATURE_FATAL_ERROR 257
31482
+#define EVMS_VOLUME_FATAL_ERROR 258
31484
+#define EVMS_FEATURE_INCOMPLETE_ERROR 259
31487
diff -Naur linux-2002-03-28/include/linux/evms/evms_drivelink.h evms-2002-03-28/include/linux/evms/evms_drivelink.h
31488
--- linux-2002-03-28/include/linux/evms/evms_drivelink.h Wed Dec 31 18:00:00 1969
31489
+++ evms-2002-03-28/include/linux/evms/evms_drivelink.h Wed Dec 12 09:37:43 2001
31491
+/* -*- linux-c -*- */
31494
+ * Copyright (c) International Business Machines Corp., 2000
31496
+ * This program is free software; you can redistribute it and/or modify
31497
+ * it under the terms of the GNU General Public License as published by
31498
+ * the Free Software Foundation; either version 2 of the License, or
31499
+ * (at your option) any later version.
31501
+ * This program is distributed in the hope that it will be useful,
31502
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31503
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31504
+ * the GNU General Public License for more details.
31506
+ * You should have received a copy of the GNU General Public License
31507
+ * along with this program; if not, write to the Free Software
31508
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31511
+ * linux/include/linux/evms_drvlink.h
31513
+ * EVMS DriveLink Feature kernel header file
31517
+#ifndef __EVMS_DRIVELINK_INCLUDED__
31518
+#define __EVMS_DRIVELINK_INCLUDED__
31520
+#define EVMS_DRIVELINK_VERSION_MAJOR 2
31521
+#define EVMS_DRIVELINK_VERSION_MINOR 0
31522
+#define EVMS_DRIVELINK_VERSION_PATCHLEVEL 0
31524
+#define EVMS_DRIVELINK_FEATURE_ID 1
31525
+#define EVMS_DRIVELINK_SIGNATURE 0x4C767244 //DrvL
31526
+#define EVMS_DRIVELINK_MAX_ENTRIES 60
31528
+// description of on disk meta data sector for drivelink feature
31530
+typedef struct evms_dl_ordering_table_entry_s {
31531
+ u_int64_t child_serial_number;
31532
+ evms_sector_t child_vsize;
31533
+} evms_dl_ordering_table_entry_t;
31535
+typedef struct evms_drivelink_metadata_s {
31536
+/* 0*/ u_int32_t signature;
31537
+/* 4*/ u_int32_t crc;
31538
+/* 8*/ evms_version_t version;
31539
+/* 20*/ u_int32_t flags;
31540
+/* 24*/ u_int64_t sequence_number;
31541
+/* 32*/ u_int64_t child_serial_number;
31542
+/* 40*/ u_int64_t parent_serial_number;
31543
+/* 48*/ u_int64_t child_count;
31544
+/* 56*/ u_int64_t pad;
31545
+/* 64*/ evms_dl_ordering_table_entry_t ordering_table[EVMS_DRIVELINK_MAX_ENTRIES];
31547
+} evms_drivelink_metadata_t;
31550
+// description of in memory meta data for drivelink feature
31551
+typedef struct evms_drivelink_runtime_entry_s {
31552
+ u_int64_t block_size;
31553
+ evms_sector_t voffset;
31554
+ evms_sector_t vsize;
31555
+ evms_logical_node_t *child_node;
31556
+ evms_drivelink_metadata_t *child_metadata;
31557
+} evms_drivelink_runtime_entry_t;
31559
+typedef struct evms_drivelink_runtime_data_s {
31560
+ u_int64_t block_size;
31561
+ // keep the fields below this point in order
31562
+ u_int64_t parent_serial_number;
31563
+ u_int64_t child_count;
31564
+ evms_drivelink_runtime_entry_t *child_table;
31565
+} evms_drivelink_runtime_data_t;
31569
diff -Naur linux-2002-03-28/include/linux/evms/evms_ecr.h evms-2002-03-28/include/linux/evms/evms_ecr.h
31570
--- linux-2002-03-28/include/linux/evms/evms_ecr.h Wed Dec 31 18:00:00 1969
31571
+++ evms-2002-03-28/include/linux/evms/evms_ecr.h Wed Nov 7 14:32:21 2001
31575
+ * Copyright (c) International Business Machines Corp., 2000
31577
+ * This program is free software; you can redistribute it and/or modify
31578
+ * it under the terms of the GNU General Public License as published by
31579
+ * the Free Software Foundation; either version 2 of the License, or
31580
+ * (at your option) any later version.
31582
+ * This program is distributed in the hope that it will be useful,
31583
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31584
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31585
+ * the GNU General Public License for more details.
31587
+ * You should have received a copy of the GNU General Public License
31588
+ * along with this program; if not, write to the Free Software
31589
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31593
+ * linux/include/linux/evms_ecr.h
31595
+ * EVMS Cluster enablement kernel header file
31599
+#ifndef __EVMS_ECR__
31601
+#define __EVMS_ECR__
31603
+#define ECR_SUCCESS 0
31604
+#define ECR_FAIL -1
31607
+ * Beginning of group messaging API
31609
+typedef int ecr_group_t;
31610
+typedef int ecr_nodeid_t;
31611
+typedef void ecr_cred_t;
31612
+typedef void ecr_instance_t;
31613
+typedef void ecr_message_t;
31615
+typedef enum ecr_type_s {
31616
+ ECR_GROUP_START, /* 0th entry is reserved */
31617
+ ECR_P2P, /* Point to Point message type */
31618
+ ECR_BROADCAST, /* Broadcast message type */
31619
+ ECR_ATOMIC_EXECUTE, /* Atomic execute type */
31620
+ ECR_GROUP_LAST /* Just a last enum type, not a message type */
31623
+typedef struct ecr_table_s {
31624
+ void (*join) (ecr_nodeid_t, uint, ecr_nodeid_t *, ecr_instance_t *);
31625
+ int (*can_join)(ecr_nodeid_t, ecr_cred_t *, size_t, ecr_instance_t *);
31626
+ void (*leave) (ecr_nodeid_t, ecr_instance_t *);
31627
+ void (*recover)(ecr_nodeid_t, ecr_instance_t *);
31628
+ void (*message)(ecr_message_t *, ecr_type_t, ecr_nodeid_t,
31629
+ void *, size_t, ecr_instance_t *);
31630
+ void (*vol_leave)(ecr_nodeid_t, ecr_instance_t *);
31634
+#define ECR_GROUPNAME_MAX_SIZE NAME_SIZE /* maximum size of a group name */
31636
+ecr_group_t ecr_group_join(char *, ecr_table_t *, ecr_cred_t *, size_t,
31637
+ ecr_instance_t *);
31638
+void ecr_group_leave(ecr_group_t);
31639
+int ecr_group_send(ecr_group_t, ecr_nodeid_t, void *, size_t,
31640
+ ecr_instance_t *,
31641
+ void callback(int, ecr_instance_t *));
31642
+int ecr_group_send_wait(ecr_group_t, ecr_nodeid_t, void *, size_t,
31644
+int ecr_group_broadcast(ecr_group_t, void *, size_t, ecr_instance_t *,
31645
+ void callback(u_char, ecr_instance_t *));
31646
+int ecr_group_broadcast_wait(ecr_group_t, void *, size_t, u_char *);
31647
+int ecr_group_atomic_execute(ecr_group_t, void *, size_t,
31648
+ ecr_instance_t *,
31649
+ void callback(ecr_instance_t *));
31650
+int ecr_group_atomic_execute_wait(ecr_group_t, void *, size_t);
31651
+void ecr_group_success_response(ecr_message_t *);
31652
+void ecr_group_failure_response(ecr_message_t *, int);
31657
+ * Beginning of distributed lock API
31660
+typedef int ecr_lock_t;
31661
+typedef enum ecr_lock_mode_s {
31662
+ ECR_LOCK_START, /* 0th entry is reserved */
31663
+ ECR_LOCK_CONCURRENT, /* concurrent access */
31664
+ ECR_LOCK_EXCLUSIVE, /* exclusive access */
31665
+ ECR_LOCK_LAST /* Just a last enum type, not a lock type */
31666
+} ecr_lock_mode_t;
31668
+typedef u_char ecr_mode_t;
31671
+#define ECR_LOCKNAME_MAX_SIZE NAME_SIZE /* maximum size of a lock name */
31672
+#define ECR_BLOCK 1 /* waitflag set */
31674
+ecr_lock_t ecr_lock_create(char * /* lock name */);
31675
+int ecr_lock(ecr_lock_t, u_int64_t, u_int64_t, ecr_lock_mode_t,
31676
+ u_char /*waitflag*/);
31677
+int ecr_unlock(ecr_lock_t, u_int64_t, u_int64_t);
31679
+#endif /* __EVMS_ECR__ */
31680
diff -Naur linux-2002-03-28/include/linux/evms/evms_ioctl.h evms-2002-03-28/include/linux/evms/evms_ioctl.h
31681
--- linux-2002-03-28/include/linux/evms/evms_ioctl.h Wed Dec 31 18:00:00 1969
31682
+++ evms-2002-03-28/include/linux/evms/evms_ioctl.h Thu Mar 21 14:08:50 2002
31684
+/* -*- linux-c -*- */
31687
+ * Copyright (c) International Business Machines Corp., 2000
31689
+ * This program is free software; you can redistribute it and/or modify
31690
+ * it under the terms of the GNU General Public License as published by
31691
+ * the Free Software Foundation; either version 2 of the License, or
31692
+ * (at your option) any later version.
31694
+ * This program is distributed in the hope that it will be useful,
31695
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31696
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31697
+ * the GNU General Public License for more details.
31699
+ * You should have received a copy of the GNU General Public License
31700
+ * along with this program; if not, write to the Free Software
31701
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31704
+ * linux/include/linux/evms.h
31706
+ * EVMS public kernel header file
31710
+#ifndef __EVMS_IOCTL_INCLUDED__
31711
+#define __EVMS_IOCTL_INCLUDED__
31713
+#include <linux/hdreg.h>
31715
+/* IOCTL interface version definitions */
31716
+#define EVMS_IOCTL_INTERFACE_MAJOR 10
31717
+#define EVMS_IOCTL_INTERFACE_MINOR 0
31718
+#define EVMS_IOCTL_INTERFACE_PATCHLEVEL 0
31720
+/* IOCTL definitions */
31721
+typedef enum evms_ioctl_cmds_s {
31722
+ /* version commands */
31723
+ EVMS_GET_IOCTL_VERSION_NUMBER = 0,
31724
+ EVMS_GET_VERSION_NUMBER,
31726
+ /* EVMS internal commands */
31727
+ EVMS_GET_DISK_LIST_NUMBER = 0x40,
31728
+ EVMS_CHECK_MEDIA_CHANGE_NUMBER,
31729
+ EVMS_REVALIDATE_DISK_NUMBER,
31730
+ EVMS_OPEN_VOLUME_NUMBER,
31731
+ EVMS_CLOSE_VOLUME_NUMBER,
31732
+ EVMS_QUIESCE_VOLUME_NUMBER,
31734
+ /* configuration commands */
31735
+ EVMS_GET_INFO_LEVEL_NUMBER = 0x80,
31736
+ EVMS_SET_INFO_LEVEL_NUMBER,
31737
+ EVMS_REDISCOVER_VOLUMES_NUMBER,
31738
+ EVMS_DELETE_VOLUME_NUMBER,
31739
+ EVMS_PLUGIN_IOCTL_NUMBER,
31740
+ EVMS_PROCESS_NOTIFY_EVENT_NUMBER,
31741
+ /* query info commands */
31742
+ EVMS_GET_LOGICAL_DISK_NUMBER = 0xC0,
31743
+ EVMS_GET_LOGICAL_DISK_INFO_NUMBER,
31744
+ EVMS_SECTOR_IO_NUMBER,
31745
+ EVMS_GET_MINOR_NUMBER,
31746
+ EVMS_GET_VOLUME_DATA_NUMBER,
31747
+ EVMS_GET_PLUGIN_NUMBER,
31748
+ EVMS_COMPUTE_CSUM_NUMBER,
31749
+ EVMS_GET_BMAP_NUMBER,
31750
+} evms_ioctl_cmds_t;
31752
+/* version commands */
31753
+#define EVMS_GET_IOCTL_VERSION_STRING "EVMS_GET_IOCTL_VERSION"
31754
+#define EVMS_GET_IOCTL_VERSION _IOR(EVMS_MAJOR, EVMS_GET_IOCTL_VERSION_NUMBER, evms_version_t)
31756
+#define EVMS_GET_VERSION_STRING "EVMS_GET_VERSION"
31757
+#define EVMS_GET_VERSION _IOR(EVMS_MAJOR, EVMS_GET_VERSION_NUMBER, evms_version_t)
31761
+/* EVMS internal commands */
31762
+#define EVMS_GET_DISK_LIST_STRING "EVMS_GET_DISK_LIST"
31763
+#define EVMS_GET_DISK_LIST _IOWR(EVMS_MAJOR, EVMS_GET_DISK_LIST_NUMBER, evms_list_node_t **)
31765
+#define EVMS_CHECK_MEDIA_CHANGE_STRING "EVMS_CHECK_MEDIA_CHANGE"
31766
+#define EVMS_CHECK_MEDIA_CHANGE _IO(EVMS_MAJOR, EVMS_CHECK_MEDIA_CHANGE_NUMBER)
31768
+#define EVMS_REVALIDATE_DISK_STRING "EVMS_REVALIDATE_DISK"
31769
+#define EVMS_REVALIDATE_DISK _IO(EVMS_MAJOR, EVMS_REVALIDATE_DISK_NUMBER)
31771
+#define EVMS_OPEN_VOLUME_STRING "EVMS_OPEN_VOLUME"
31772
+#define EVMS_OPEN_VOLUME _IO(EVMS_MAJOR, EVMS_OPEN_VOLUME_NUMBER)
31774
+#define EVMS_CLOSE_VOLUME_STRING "EVMS_CLOSE_VOLUME"
31775
+#define EVMS_CLOSE_VOLUME _IO(EVMS_MAJOR, EVMS_CLOSE_VOLUME_NUMBER)
31777
+/* field: command: defines */
31778
+#define EVMS_UNQUIESCE 0
31779
+#define EVMS_QUIESCE 1
31781
+/* field: do_vfs: defines */
31782
+/* see evms_delete_volume */
31783
+typedef struct evms_quiesce_volume_s {
31784
+ int command; /* 0 = unquiesce, 1 = quiesce */
31785
+ int minor; /* minor device number of target volume */
31786
+ int do_vfs; /* 0 = do nothing, 1 = also perform equivalent VFS operation */
31787
+ int status; /* 0 = success */
31788
+} evms_quiesce_volume_t;
31790
+#define EVMS_QUIESCE_VOLUME_STRING "EVMS_QUIESCE_VOLUME"
31791
+#define EVMS_QUIESCE_VOLUME _IOR(EVMS_MAJOR, EVMS_QUIESCE_VOLUME_NUMBER, evms_quiesce_volume_t)
31795
+/* configuration commands */
31796
+#define EVMS_GET_INFO_LEVEL_STRING "EVMS_GET_INFO_LEVEL"
31797
+#define EVMS_GET_INFO_LEVEL _IOR(EVMS_MAJOR, EVMS_GET_INFO_LEVEL_NUMBER, int)
31799
+#define EVMS_SET_INFO_LEVEL_STRING "EVMS_SET_INFO_LEVEL"
31800
+#define EVMS_SET_INFO_LEVEL _IOW(EVMS_MAJOR, EVMS_SET_INFO_LEVEL_NUMBER, int)
31802
+/* field: drive_count: defines */
31803
+#define REDISCOVER_ALL_DEVICES 0xFFFFFFFF
31804
+typedef struct evms_rediscover_s {
31806
+ unsigned int drive_count; /* 0xffffffff = rediscover all known disks */
31807
+ unsigned long *drive_array;
31808
+} evms_rediscover_t;
31810
+#define EVMS_REDISCOVER_VOLUMES_STRING "EVMS_REDISCOVER_VOLUMES"
31811
+#define EVMS_REDISCOVER_VOLUMES _IOWR(EVMS_MAJOR, EVMS_REDISCOVER_VOLUMES_NUMBER, evms_rediscover_t)
31813
+/* field: command: defines */
31814
+#define EVMS_SOFT_DELETE 0
31815
+#define EVMS_HARD_DELETE 1
31817
+/* field: do_vfs: defines */
31818
+#define EVMS_VFS_DO_NOTHING 0
31819
+#define EVMS_VFS_DO 1
31820
+typedef struct evms_delete_volume_s {
31821
+ int command; /* 0 = "temp", 1 = "permanent" */
31822
+ int minor; /* minor device number of target volume */
31823
+ int do_vfs; /* 0 = do nothing, 1 = perform VFS operations */
31824
+ int associative_minor; /* optional minor of associative volume */
31825
+ /* must be 0 when not in use */
31826
+ int status; /* 0 = success, other is error */
31827
+} evms_delete_volume_t;
31829
+#define EVMS_DELETE_VOLUME_STRING "EVMS_DELETE_VOLUME"
31830
+#define EVMS_DELETE_VOLUME _IOR(EVMS_MAJOR, EVMS_DELETE_VOLUME_NUMBER, evms_delete_volume_t)
31832
+typedef struct evms_plugin_ioctl_s {
31833
+ unsigned long feature_id; /* ID of feature to receive this ioctl */
31834
+ int feature_command; /* feature specific ioctl command */
31835
+ int status; /* 0 = completed, non-0 = error */
31836
+ void *feature_ioctl_data; /* ptr to feature specific struct */
31837
+} evms_plugin_ioctl_t;
31839
+#define EVMS_PLUGIN_IOCTL_STRING "EVMS_PLUGIN_IOCTL"
31840
+#define EVMS_PLUGIN_IOCTL _IOR(EVMS_MAJOR, EVMS_PLUGIN_IOCTL_NUMBER, evms_plugin_ioctl_t)
31842
+/* field: eventid: defines */
31843
+#define EVMS_EVENT_END_OF_DISCOVERY 0
31844
+typedef struct evms_event_s {
31845
+ int pid; /* PID to act on */
31846
+ int eventid; /* event id to respond to */
31847
+ int signo; /* signal # to send when event occurs */
31850
+/* field: command: defines */
31851
+#define EVMS_EVENT_UNREGISTER 0
31852
+#define EVMS_EVENT_REGISTER 1
31853
+typedef struct evms_notify_s {
31854
+ int command; /* 0 = unregister, 1 = register */
31855
+ evms_event_t eventry; /* event structure */
31856
+ int status; /* return status */
31859
+#define EVMS_PROCESS_NOTIFY_EVENT_STRING "EVMS_PROCESS_NOTIFY_EVENT"
31860
+#define EVMS_PROCESS_NOTIFY_EVENT _IOWR(EVMS_MAJOR, EVMS_PROCESS_NOTIFY_EVENT_NUMBER, evms_notify_t)
31862
+/* query info commands */
31864
+/* field: command: defines */
31865
+#define EVMS_FIRST_DISK 0
31866
+#define EVMS_NEXT_DISK 1
31868
+/* field: status: defines */
31869
+#define EVMS_DISK_INVALID 0
31870
+#define EVMS_DISK_VALID 1
31871
+typedef struct evms_user_disk_s {
31872
+ int command; /* 0 = first disk, 1 = next disk */
31873
+ int status; /* 0 = no more disks, 1 = valid disk info */
31874
+ unsigned long disk_handle; /* only valid when status == 1 */
31875
+} evms_user_disk_t;
31877
+#define EVMS_GET_LOGICAL_DISK_STRING "EVMS_GET_LOGICAL_DISK"
31878
+#define EVMS_GET_LOGICAL_DISK _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_NUMBER, evms_user_disk_t)
31880
+/* flags fields described in evms_common.h */
31881
+typedef struct evms_user_disk_info_s {
31882
+ unsigned int status;
31883
+ unsigned int flags;
31884
+ unsigned long disk_handle;
31885
+ unsigned int disk_dev;
31886
+ struct hd_geometry geometry;
31887
+ unsigned int block_size;
31888
+ unsigned int hardsect_size;
31889
+ u_int64_t total_sectors;
31890
+ char disk_name[EVMS_VOLUME_NAME_SIZE];
31891
+} evms_user_disk_info_t;
31893
+#define EVMS_GET_LOGICAL_DISK_INFO_STRING "EVMS_GET_LOGICAL_DISK_INFO"
31894
+#define EVMS_GET_LOGICAL_DISK_INFO _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_INFO_NUMBER, evms_user_disk_info_t)
31896
+/* field: io_flag: defines */
31897
+#define EVMS_SECTOR_IO_READ 0
31898
+#define EVMS_SECTOR_IO_WRITE 1
31899
+typedef struct evms_sector_io_s {
31900
+ unsigned long disk_handle; /* valid disk handle */
31901
+ int io_flag; /* 0 = READ, 1 = WRITE */
31902
+ evms_sector_t starting_sector; /* disk relative LBA */
31903
+ evms_sector_t sector_count; /* number of sectors in IO */
31904
+ unsigned char *buffer_address; /* IO address */
31905
+ int status; /* 0 = success, not 0 = error */
31906
+} evms_sector_io_t;
31908
+#define EVMS_SECTOR_IO_STRING "EVMS_SECTOR_IO"
31909
+#define EVMS_SECTOR_IO _IOWR(EVMS_MAJOR, EVMS_SECTOR_IO_NUMBER, evms_sector_io_t)
31911
+/* field: command: defines */
31912
+#define EVMS_FIRST_VOLUME 0
31913
+#define EVMS_NEXT_VOLUME 1
31915
+/* field: status: defines */
31916
+#define EVMS_VOLUME_INVALID 0
31917
+#define EVMS_VOLUME_VALID 1
31918
+typedef struct evms_user_minor_s {
31919
+ int command; /* 0 = first volume, 1 = next volume */
31920
+ int status; /* 0 = no more, 1 = valid info */
31921
+ int minor; /* only valid when status == 1 */
31922
+} evms_user_minor_t;
31924
+#define EVMS_GET_MINOR_STRING "EVMS_GET_MINOR"
31925
+#define EVMS_GET_MINOR _IOWR(EVMS_MAJOR, EVMS_GET_MINOR_NUMBER, evms_user_minor_t)
31927
+/* flags field described in evms_common.h */
31928
+typedef struct evms_volume_data_s {
31929
+ int minor; /* minor of target volume */
31931
+ char volume_name[EVMS_VOLUME_NAME_SIZE + 1];
31933
+} evms_volume_data_t;
31935
+#define EVMS_GET_VOLUME_DATA_STRING "EVMS_GET_VOLUME_DATA"
31936
+#define EVMS_GET_VOLUME_DATA _IOWR(EVMS_MAJOR, EVMS_GET_VOLUME_DATA_NUMBER, evms_volume_data_t)
31938
+/* field: command: defines */
31939
+#define EVMS_FIRST_PLUGIN 0
31940
+#define EVMS_NEXT_PLUGIN 1
31942
+/* field: status: defines */
31943
+#define EVMS_PLUGIN_INVALID 0
31944
+#define EVMS_PLUGIN_VALID 1
31945
+typedef struct evms_kernel_plugin_s {
31946
+ int command; /* 0 = first item, 1 = next item */
31947
+ u_int32_t id; /* returned plugin id */
31948
+ evms_version_t version; /* maj,min,patch of plugin */
31949
+ int status; /* 0 = no more, 1 = valid info */
31950
+} evms_kernel_plugin_t;
31952
+#define EVMS_GET_PLUGIN_STRING "EVMS_GET_PLUGIN"
31953
+#define EVMS_GET_PLUGIN _IOWR(EVMS_MAJOR, EVMS_GET_PLUGIN_NUMBER, evms_kernel_plugin_t)
31955
+typedef struct evms_compute_csum_s {
31956
+ unsigned char *buffer_address; /* IO address */
31957
+ int buffer_size; /* byte size of buffer */
31958
+ unsigned int insum; /* previous csum to be factored in */
31959
+ unsigned int outsum; /* resulting csum value of buffer */
31960
+ int status; /* 0 = success, not 0 = error */
31961
+} evms_compute_csum_t;
31963
+#define EVMS_COMPUTE_CSUM_STRING "EVMS_COMPUTE_CSUM"
31964
+#define EVMS_COMPUTE_CSUM _IOWR(EVMS_MAJOR, EVMS_COMPUTE_CSUM_NUMBER, evms_compute_csum_t)
31966
+typedef struct evms_get_bmap_s {
31967
+ u_int64_t rsector; /* input: volume relative rsector value */
31968
+ /* output: disk relative rsector value */
31969
+ u_int32_t dev; /* output = physical device */
31970
+ int status; /* 0 = success, not 0 = error */
31971
+} evms_get_bmap_t;
31973
+#define EVMS_GET_BMAP_STRING "EVMS_GET_BMAP"
31974
+#define EVMS_GET_BMAP _IOWR(EVMS_MAJOR, EVMS_GET_BMAP_NUMBER, evms_get_bmap_t)
31977
diff -Naur linux-2002-03-28/include/linux/evms/evms_kernel.h evms-2002-03-28/include/linux/evms/evms_kernel.h
31978
--- linux-2002-03-28/include/linux/evms/evms_kernel.h Wed Dec 31 18:00:00 1969
31979
+++ evms-2002-03-28/include/linux/evms/evms_kernel.h Wed May 16 13:40:56 2001
31981
+/* -*- linux-c -*- */
31984
+ * Copyright (c) International Business Machines Corp., 2000
31986
+ * This program is free software; you can redistribute it and/or modify
31987
+ * it under the terms of the GNU General Public License as published by
31988
+ * the Free Software Foundation; either version 2 of the License, or
31989
+ * (at your option) any later version.
31991
+ * This program is distributed in the hope that it will be useful,
31992
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31993
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31994
+ * the GNU General Public License for more details.
31996
+ * You should have received a copy of the GNU General Public License
31997
+ * along with this program; if not, write to the Free Software
31998
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32001
+ * linux/include/linux/evms_kernel.h
32003
+ * EVMS (master) kernel header file
32007
+#include <linux/evms/evms_common.h>
32008
+#include <linux/evms/evms.h>
32009
+#include <linux/evms/evms_ioctl.h>
32010
diff -Naur linux-2002-03-28/include/linux/evms/evms_linear.h evms-2002-03-28/include/linux/evms/evms_linear.h
32011
--- linux-2002-03-28/include/linux/evms/evms_linear.h Wed Dec 31 18:00:00 1969
32012
+++ evms-2002-03-28/include/linux/evms/evms_linear.h Thu Jan 10 12:51:50 2002
32014
+#ifndef __EVMS_LINEAR_H
32015
+#define __EVMS_LINEAR_H
32017
+#include <linux/evms/evms_md.h>
32020
+ evms_logical_node_t *node;
32022
+ unsigned long size;
32023
+ unsigned long offset;
32026
+typedef struct dev_info dev_info_t;
32028
+struct linear_hash
32030
+ dev_info_t *dev0, *dev1;
32033
+struct linear_private_data
32035
+ struct linear_hash *hash_table;
32036
+ dev_info_t disks[MD_SB_DISKS];
32037
+ dev_info_t *smallest;
32042
+typedef struct linear_private_data linear_conf_t;
32044
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
32047
diff -Naur linux-2002-03-28/include/linux/evms/evms_lvm.h evms-2002-03-28/include/linux/evms/evms_lvm.h
32048
--- linux-2002-03-28/include/linux/evms/evms_lvm.h Wed Dec 31 18:00:00 1969
32049
+++ evms-2002-03-28/include/linux/evms/evms_lvm.h Thu Mar 21 16:30:34 2002
32051
+/* -*- linux-c -*- */
32053
+ * Copyright (c) International Business Machines Corp., 2000
32055
+ * This program is free software; you can redistribute it and/or modify
32056
+ * it under the terms of the GNU General Public License as published by
32057
+ * the Free Software Foundation; either version 2 of the License, or
32058
+ * (at your option) any later version.
32060
+ * This program is distributed in the hope that it will be useful,
32061
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32062
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32063
+ * the GNU General Public License for more details.
32065
+ * You should have received a copy of the GNU General Public License
32066
+ * along with this program; if not, write to the Free Software
32067
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32070
+ * linux/include/linux/evms_lvm.h
32072
+ * EVMS LVM VGE kernel header file
32076
+#ifndef __EVMS_LVM_H__
32077
+#define __EVMS_LVM_H__
32079
+#define EVMS_LVM_VERSION_MAJOR 1
32080
+#define EVMS_LVM_VERSION_MINOR 0
32081
+#define EVMS_LVM_VERSION_PATCH 0
32083
+// The following definitions and data structures are copied from lvm.h and
32084
+// liblvm.h from the LVM 0.9.1beta8 distribution. Since the metadata format
32085
+// changed in beta8, lvm.h changed significantly enough that this module would
32086
+// no longer compile. Instead of requiring evms users to install the latest lvm
32087
+// release, the required definitions and data structures will now be included
32088
+// in this header file.
32090
+#ifndef SECTOR_SIZE
32091
+#define SECTOR_SIZE 512
32094
+#define MAX_LV 256
32095
+#define MAX_PV 256 /* caused by 8 bit minor */
32096
+#define NAME_LEN 128 /* don't change!!! */
32097
+#define UUID_LEN 32 /* don't change!!! */
32098
+#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1)
32099
+#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1)
32100
+#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1)
32101
+#define LV_SNAPSHOT_USE_RATE _IOWR ( 0xfe, 0x2c, 1)
32102
+#define LV_BMAP _IOWR ( 0xfe, 0x30, 1)
32103
+#define LVM_VGDA_ALIGN 4096UL /* some metadata on the disk need to be aligned */
32104
+#define LVM_PV_DISK_BASE 0L /* base of PV structure in disk partition */
32105
+#define LVM_PV_DISK_SIZE 1024L /* size reserved for PV structure on disk */
32106
+#define LVM_VG_DISK_BASE round_up(LVM_PV_DISK_BASE + LVM_PV_DISK_SIZE, LVM_VGDA_ALIGN)
32107
+ /* base of VG structure in disk partition */
32108
+#define LVM_VG_DISK_SIZE (8*512L) /* size reserved for VG structure */
32113
+/* logical volume */
32114
+#define LV_ACTIVE 0x01 /* lv_status */
32115
+#define LV_READ 0x01 /* lv_access */
32116
+#define LV_WRITE 0x02 /* " */
32117
+#define LV_SNAPSHOT 0x04 /* " */
32118
+#define LV_SNAPSHOT_ORG 0x08 /* " */
32120
+/* copy on write tables in disk format */
32121
+typedef struct lv_COW_table_disk_v1 {
32122
+ uint64_t pv_org_number;
32123
+ uint64_t pv_org_rsector;
32124
+ uint64_t pv_snap_number;
32125
+ uint64_t pv_snap_rsector;
32126
+} lv_COW_table_disk_t;
32128
+/* disk stored pe information */
32134
+/* disk stored PV, VG, LV and PE size and offset information */
32138
+} lvm_disk_data_t;
32141
+typedef struct pv_disk_v2 {
32142
+ uint8_t id[2]; /* Identifier */
32143
+ uint16_t version; /* HM lvm version */
32144
+ lvm_disk_data_t pv_on_disk;
32145
+ lvm_disk_data_t vg_on_disk;
32146
+ lvm_disk_data_t pv_uuidlist_on_disk;
32147
+ lvm_disk_data_t lv_on_disk;
32148
+ lvm_disk_data_t pe_on_disk;
32149
+ uint8_t pv_uuid[NAME_LEN];
32150
+ uint8_t vg_name[NAME_LEN];
32151
+ uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */
32152
+ uint32_t pv_major;
32153
+ uint32_t pv_number;
32154
+ uint32_t pv_status;
32155
+ uint32_t pv_allocatable;
32156
+ uint32_t pv_size; /* HM */
32158
+ uint32_t pe_size;
32159
+ uint32_t pe_total;
32160
+ uint32_t pe_allocated;
32162
+ /* new in struct version 2 */
32163
+ uint32_t pe_start; /* in sectors */
32168
+typedef struct lv_disk_v3 {
32169
+ uint8_t lv_name[NAME_LEN];
32170
+ uint8_t vg_name[NAME_LEN];
32171
+ uint32_t lv_access;
32172
+ uint32_t lv_status;
32173
+ uint32_t lv_open; /* HM */
32174
+ uint32_t lv_dev; /* HM */
32175
+ uint32_t lv_number; /* HM */
32176
+ uint32_t lv_mirror_copies; /* for future use */
32177
+ uint32_t lv_recovery; /* " */
32178
+ uint32_t lv_schedule; /* " */
32179
+ uint32_t lv_size;
32180
+ uint32_t lv_snapshot_minor;/* minor number of original */
32181
+ uint16_t lv_chunk_size; /* chunk size of snapshot */
32183
+ uint32_t lv_allocated_le;
32184
+ uint32_t lv_stripes;
32185
+ uint32_t lv_stripesize;
32186
+ uint32_t lv_badblock; /* for future use */
32187
+ uint32_t lv_allocation;
32188
+ uint32_t lv_io_timeout; /* for future use */
32189
+ uint32_t lv_read_ahead; /* HM */
32193
+typedef struct vg_disk_v2 {
32194
+ uint8_t vg_uuid[UUID_LEN]; /* volume group UUID */
32195
+ uint8_t vg_name_dummy[NAME_LEN-UUID_LEN]; /* rest of v1 VG name */
32196
+ uint32_t vg_number; /* volume group number */
32197
+ uint32_t vg_access; /* read/write */
32198
+ uint32_t vg_status; /* active or not */
32199
+ uint32_t lv_max; /* maximum logical volumes */
32200
+ uint32_t lv_cur; /* current logical volumes */
32201
+ uint32_t lv_open; /* open logical volumes */
32202
+ uint32_t pv_max; /* maximum physical volumes */
32203
+ uint32_t pv_cur; /* current physical volumes FU */
32204
+ uint32_t pv_act; /* active physical volumes */
32206
+ uint32_t vgda; /* volume group descriptor arrays FU */
32207
+ uint32_t pe_size; /* physical extent size in sectors */
32208
+ uint32_t pe_total; /* total of physical extents */
32209
+ uint32_t pe_allocated; /* allocated physical extents */
32210
+ uint32_t pvg_total; /* physical volume groups FU */
32213
+/* useful inlines */
32214
+static inline ulong round_up(ulong n, ulong size) {
32216
+ return (n + size) & ~size;
32219
+static inline ulong div_up(ulong n, ulong size) {
32220
+ return round_up(n, size) / size;
32223
+// End of lvm.h imported data structures
32226
+#define DEV_DIRECTORY "/dev/"
32227
+#define LVM_DEV_DIRECTORY "lvm/"
32228
+#define LVM_PROC_NAME "lvm"
32229
+#define LVM_PROC_VG_NAME "VGs"
32230
+#define LVM_PROC_LV_NAME "LVs"
32231
+#define LVM_PROC_PV_NAME "PVs"
32232
+#define LVM_PROC_GLOBAL_NAME "global"
32233
+#define IO_BUFFER_SECTORS 8
32235
+// Structure for doing PV remove ioctls
32237
+#define EVMS_LVM_PV_REMOVE_IOCTL 0x01
32238
+#define EVMS_LVM_SNAPSHOT_STAT_IOCTL 0x02
32240
+typedef struct lvm_pv_remove_ioctl_s {
32241
+ unsigned char vg_uuid[UUID_LEN];
32243
+ struct lvm_pv_remove_ioctl_s * next;
32244
+} lvm_pv_remove_ioctl_t;
32247
+// Structure for doing snapshot stat ioctls
32248
+typedef struct lvm_snapshot_stat_ioctl_s {
32249
+ unsigned char vg_uuid[UUID_LEN];
32251
+ evms_sector_t next_free_chunk;
32252
+ u_int32_t lv_status;
32253
+} lvm_snapshot_stat_ioctl_t;
32256
+// Entries in the list of physical volumes (PV)
32257
+// in a volume group (VG)
32258
+typedef struct lvm_physical_volume_s {
32259
+ evms_logical_node_t * logical_node;
32260
+ pv_disk_t * pv; // Copy of on-disk PV struct
32261
+ pe_disk_t * pe_map;
32262
+ u_int32_t pv_number;
32263
+ struct lvm_physical_volume_s * next;
32264
+} lvm_physical_volume_t;
32267
+// Table for mapping logical extents (LE) to physical extents (PE)
32268
+typedef struct le_table_entry_s {
32269
+ lvm_physical_volume_t * owning_pv;
32270
+ evms_sector_t pe_sector_offset;
32271
+} le_table_entry_t;
32274
+// Entries in the snapshot remapping structure
32275
+typedef struct snapshot_map_entry_s {
32276
+ evms_sector_t org_sector;
32277
+ evms_sector_t snap_sector;
32278
+ lvm_physical_volume_t * snap_pv;
32279
+ struct snapshot_map_entry_s * next;
32280
+ struct snapshot_map_entry_s * prev;
32281
+} snapshot_map_entry_t;
32284
+// Logical volumes (LV) in a volume group (VG)
32285
+#define EVMS_LV_NEW 0x10 // volume was created during the current discovery pass
32286
+#define EVMS_LV_INCOMPLETE 0x20 // volume has an incomplete LE map
32287
+#define EVMS_LV_INVALID 0x40 // volume has a memory-corruption problem
32288
+#define EVMS_LV_QUIESCED 0x80 // volume is in quiesced state
32289
+#define MAX_HASH_CHAIN_ENTRIES 10
32290
+#define CHUNK_DATA_BUFFER_SIZE 64 // 32k in sectors. Feel free to change, but must be power of 2!
32292
+typedef struct lvm_logical_volume_s {
32293
+ u_int32_t lv_number;
32294
+ evms_sector_t lv_size; // Sectors
32295
+ u_int32_t lv_access; // Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_LV_*
32296
+ u_int32_t lv_status; // Flags: LV_ACTIVE, LV_SPINDOWN
32297
+ u_int32_t lv_minor; // Device minor number
32298
+ u_int32_t stripes;
32299
+ u_int32_t stripe_size; // Sectors
32300
+ u_int32_t stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
32301
+ u_int32_t pe_size; // Sectors
32302
+ u_int32_t pe_size_shift; // Number of bits to shift right instead of dividing by pe_size
32303
+ u_int32_t num_le; // Number of entries in the le_to_pe_map
32304
+ struct lvm_volume_group_s * group; // Pointer back to parent volume group
32305
+ unsigned char name[NAME_LEN]; // Dev-tree volume name (eg: /dev/group0/vol0)
32306
+ le_table_entry_t * le_map; // Mapping of logical to physical extents
32307
+ evms_logical_node_t * volume_node; // Pointer to the parent EVMS node representing this volume
32309
+ // Snapshotting information
32310
+ u_int32_t chunk_size; // Sectors
32311
+ u_int32_t num_chunks; // lv_size/chunk_size
32312
+ u_int32_t snap_org_minor; // Minor number of snapshot original
32313
+ u_int32_t next_cow_entry; // Index into current COW table
32314
+ evms_sector_t current_cow_sector; // LOGICAL sector of current COW table
32315
+ evms_sector_t next_free_chunk; // Starting LOGICAL sector of next free chunk
32316
+ u_int32_t hash_table_size; // Number of pointers in each hash table
32317
+ lv_COW_table_disk_t * cow_table; // Pointer to one sector's worth of COW tables
32318
+ unsigned char * chunk_data_buffer; // Buffer for reading data when doing a copy-on-write
32319
+ struct semaphore snap_semaphore; // For locking during snapshot I/O operations
32320
+ snapshot_map_entry_t *** snapshot_map; // Pointer to the remapping hash tables
32321
+ struct lvm_logical_volume_s * snapshot_next; // Linked list of volumes snapshotting the original
32322
+ struct lvm_logical_volume_s * snapshot_org; // Pointer to volume being snapshotted
32323
+} lvm_logical_volume_t;
32326
+// Volume groups (VG)
32328
+#define EVMS_VG_DIRTY (1 << 0) // group is new or has had a PV added during this discovery
32329
+#define EVMS_VG_PARTIAL_PVS (1 << 1) // group contains at least one partial PV.
32330
+#define EVMS_VG_REMOVABLE_PVS (1 << 2) // group contains at least one removeable PV.
32332
+typedef struct lvm_volume_group_s {
32333
+ vg_disk_t * vg; // Copy of on-disk VG metadata
32334
+ lvm_physical_volume_t * pv_list; // List of PVs that make up this group
32335
+ lvm_logical_volume_t * volume_list[MAX_LV+1]; // Array of volumes
32336
+ lv_disk_t * lv_array; // Array of LV metadata
32337
+ unsigned char * uuid_list; // List of PV UUIDs
32338
+ unsigned char vg_uuid[UUID_LEN]; // UUID from the VG metadata
32339
+ char vg_name[NAME_LEN]; // Name from the PV metadata
32340
+ u_int32_t pv_count; // Number of PVs found in this group
32341
+ u_int32_t volume_count; // Number of LVs found in this group
32342
+ int hard_sect_size; // The largest hard_sect_size and block_size
32343
+ int block_size; // values of all PVs in this group.
32344
+ u_int32_t flags; // EVMS_VG_?
32345
+ struct lvm_volume_group_s * next_group;
32346
+} lvm_volume_group_t;
32351
diff -Naur linux-2002-03-28/include/linux/evms/evms_md.h evms-2002-03-28/include/linux/evms/evms_md.h
32352
--- linux-2002-03-28/include/linux/evms/evms_md.h Wed Dec 31 18:00:00 1969
32353
+++ evms-2002-03-28/include/linux/evms/evms_md.h Thu Mar 14 17:01:39 2002
32356
+ * Copyright (c) International Business Machines Corp., 2000
32358
+ * This program is free software; you can redistribute it and/or modify
32359
+ * it under the terms of the GNU General Public License as published by
32360
+ * the Free Software Foundation; either version 2 of the License, or
32361
+ * (at your option) any later version.
32363
+ * This program is distributed in the hope that it will be useful,
32364
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32365
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32366
+ * the GNU General Public License for more details.
32368
+ * You should have received a copy of the GNU General Public License
32369
+ * along with this program; if not, write to the Free Software
32370
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32372
+ * linux/include/linux/evms/evms_md.h
32374
+ * EVMS Linux MD Region Manager Public Header File
32376
+ * 'evms_md.h' is an EVMS version of linux/include/linux/raid/md.h modified
32377
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
32381
+#ifndef __EVMS_MD_INCLUDED
32382
+#define __EVMS_MD_INCLUDED
32384
+#include <linux/mm.h>
32385
+#include <linux/fs.h>
32386
+#include <linux/blkdev.h>
32387
+#include <asm/semaphore.h>
32388
+#include <linux/major.h>
32389
+#include <linux/ioctl.h>
32390
+#include <linux/types.h>
32391
+#include <asm/bitops.h>
32392
+#include <linux/module.h>
32393
+#include <linux/hdreg.h>
32394
+#include <linux/proc_fs.h>
32395
+#include <linux/smp_lock.h>
32396
+#include <linux/delay.h>
32397
+#include <net/checksum.h>
32398
+#include <linux/random.h>
32399
+#include <linux/locks.h>
32400
+#include <linux/kernel_stat.h>
32401
+#include <asm/io.h>
32402
+#include <linux/completion.h>
32404
+#include <linux/evms/evms_kernel.h>
32406
+#include <linux/raid/md_compatible.h>
32408
+ * 'md_p.h' holds the 'physical' layout of RAID devices
32409
+ * 'md_u.h' holds the user <=> kernel API
32411
+ * 'md_k.h' holds kernel internal definitions
32414
+#include <linux/evms/evms_md_p.h>
32415
+#include <linux/evms/evms_md_u.h>
32416
+#include <linux/evms/evms_md_k.h>
32418
+#ifndef MAX_READAHEAD /* The following #defines were removed as of 2.4.16 kernel */
32420
+#define MAX_READAHEAD 31
32421
+#define MIN_READAHEAD 3
32426
+ * Different major versions are not compatible.
32427
+ * Different minor versions are only downward compatible.
32428
+ * Different patchlevel versions are downward and upward compatible.
32430
+#define MD_MAJOR_VERSION 0
32431
+#define MD_MINOR_VERSION 90
32432
+#define MD_PATCHLEVEL_VERSION 0
32434
+#define EVMS_MD_COMMON_SERVICES_MAJOR 0
32435
+#define EVMS_MD_COMMON_SERVICES_MINOR 5
32436
+#define EVMS_MD_COMMON_SERVICES_PATCHLEVEL 0
32439
+extern int evms_md_size[MAX_MD_DEVS];
32441
+extern void evms_md_add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
32442
+extern void evms_md_del_mddev_mapping (mddev_t *mddev, kdev_t dev);
32443
+extern char * evms_md_partition_name (evms_logical_node_t *node);
32444
+extern int evms_register_md_personality (int p_num, mdk_personality_t *p);
32445
+extern int evms_unregister_md_personality (int p_num);
32447
+extern int evms_md_update_sb (mddev_t *mddev);
32448
+extern int evms_md_check_ordering (mddev_t *mddev);
32449
+extern void evms_md_print_devices (void);
32451
+extern int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
32452
+extern void evms_md_done_sync(mddev_t *mddev, int blocks, int ok);
32453
+extern void evms_md_sync_acct(kdev_t dev, unsigned long nr_sectors);
32454
+extern void evms_md_recover_arrays (void);
32455
+extern int evms_md_error (mddev_t *mddev, evms_logical_node_t *node);
32457
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); evms_md_print_devices(); }
32462
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_k.h evms-2002-03-28/include/linux/evms/evms_md_k.h
32463
--- linux-2002-03-28/include/linux/evms/evms_md_k.h Wed Dec 31 18:00:00 1969
32464
+++ evms-2002-03-28/include/linux/evms/evms_md_k.h Mon Mar 11 22:58:16 2002
32467
+ * Copyright (c) International Business Machines Corp., 2000
32469
+ * This program is free software; you can redistribute it and/or modify
32470
+ * it under the terms of the GNU General Public License as published by
32471
+ * the Free Software Foundation; either version 2 of the License, or
32472
+ * (at your option) any later version.
32474
+ * This program is distributed in the hope that it will be useful,
32475
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32476
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32477
+ * the GNU General Public License for more details.
32479
+ * You should have received a copy of the GNU General Public License
32480
+ * along with this program; if not, write to the Free Software
32481
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32484
+ * linux/include/linux/evms/evms_md_k.h
32486
+ * EVMS Linux MD Region Manager Public Header File
32488
+ * 'evms_md_k.h' is an EVMS version of linux/include/linux/raid/md_k.h modified
32489
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
32493
+#ifndef __EVMS_MD_K_INC__
32494
+#define __EVMS_MD_K_INC__
32496
+#define MD_RESERVED 0UL
32497
+#define LINEAR 1UL
32501
+#define TRANSLUCENT 5UL
32503
+#define MULTIPATH 7UL
32504
+#define MAX_PERSONALITY 8UL
32506
+static inline int pers_to_level (int pers)
32509
+ case MULTIPATH: return -4;
32510
+ case HSM: return -3;
32511
+ case TRANSLUCENT: return -2;
32512
+ case LINEAR: return -1;
32513
+ case RAID0: return 0;
32514
+ case RAID1: return 1;
32515
+ case RAID5: return 5;
32518
+ return MD_RESERVED;
32521
+static inline int level_to_pers (int level)
32524
+ case -3: return HSM;
32525
+ case -2: return TRANSLUCENT;
32526
+ case -1: return LINEAR;
32527
+ case 0: return RAID0;
32528
+ case 1: return RAID1;
32530
+ case 5: return RAID5;
32532
+ return MD_RESERVED;
32535
+typedef struct mddev_s mddev_t;
32536
+typedef struct mdk_rdev_s mdk_rdev_t;
32538
+#if (MINORBITS != 8)
32539
+#error MD doesnt handle bigger kdev yet
32542
+#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
32545
+ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
32546
+ * the personality. (eg. HSM uses this to identify individual LVs)
32548
+typedef struct dev_mapping_s {
32554
+extern dev_mapping_t evms_mddev_map [MAX_MD_DEVS];
32555
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
32557
+ if (MAJOR(dev) != MD_MAJOR)
32559
+ return evms_mddev_map[MINOR(dev)].mddev;
32563
+ * options passed in raidrun:
32566
+#define MAX_CHUNK_SIZE (4096*1024)
32569
+ * default readahead
32571
+#define MD_READAHEAD MAX_READAHEAD
32573
+static inline int disk_faulty(mdp_disk_t * d)
32575
+ return d->state & (1 << MD_DISK_FAULTY);
32578
+static inline int disk_active(mdp_disk_t * d)
32580
+ return d->state & (1 << MD_DISK_ACTIVE);
32583
+static inline int disk_sync(mdp_disk_t * d)
32585
+ return d->state & (1 << MD_DISK_SYNC);
32588
+static inline int disk_spare(mdp_disk_t * d)
32590
+ return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
32593
+static inline int disk_removed(mdp_disk_t * d)
32595
+ return d->state & (1 << MD_DISK_REMOVED);
32598
+static inline void mark_disk_faulty(mdp_disk_t * d)
32600
+ d->state |= (1 << MD_DISK_FAULTY);
32603
+static inline void mark_disk_active(mdp_disk_t * d)
32605
+ d->state |= (1 << MD_DISK_ACTIVE);
32606
+ d->state &= ~(1 << MD_DISK_PENDING_ACTIVE);
32609
+static inline void mark_disk_sync(mdp_disk_t * d)
32611
+ d->state |= (1 << MD_DISK_SYNC);
32614
+static inline void mark_disk_spare(mdp_disk_t * d)
32619
+static inline void mark_disk_removed(mdp_disk_t * d)
32621
+ d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
32624
+static inline void mark_disk_inactive(mdp_disk_t * d)
32626
+ d->state &= ~(1 << MD_DISK_ACTIVE);
32629
+static inline void mark_disk_nonsync(mdp_disk_t * d)
32631
+ d->state &= ~(1 << MD_DISK_SYNC);
32635
+ * MD's 'extended' device
32639
+ struct md_list_head same_set; /* RAID devices within the same set */
32640
+ struct md_list_head all; /* all RAID devices */
32641
+ struct md_list_head pending; /* undetected RAID devices */
32642
+ evms_logical_node_t *node; /* EVMS device node */
32643
+ kdev_t dev; /* Device number */
32644
+ kdev_t old_dev; /* "" when it was last imported */
32645
+ unsigned long size; /* Device size (in blocks) */
32646
+ mddev_t *mddev; /* RAID array if running */
32647
+ unsigned long last_events; /* IO event timestamp */
32649
+ struct block_device *bdev; /* block device handle */
32652
+ unsigned long sb_offset; /* in blocks */
32654
+ int virtual_spare; /* "virtual" spare added via IOCTL */
32655
+ int alias_device; /* device alias to the same disk */
32656
+ int faulty; /* if faulty do not issue IO requests */
32657
+ int desc_nr; /* descriptor index in the superblock */
32662
+ * disk operations in a working array:
32664
+#define DISKOP_SPARE_INACTIVE 0
32665
+#define DISKOP_SPARE_WRITE 1
32666
+#define DISKOP_SPARE_ACTIVE 2
32667
+#define DISKOP_HOT_SPARE_ACTIVE 3
32668
+#define DISKOP_HOT_REMOVE_SPARE 4
32669
+#define DISKOP_HOT_REMOVE_DISK 5
32670
+#define DISKOP_HOT_ADD_DISK 6
32671
+#define DISKOP_HOT_DEACTIVATE_DISK 7
32673
+typedef struct mdk_personality_s mdk_personality_t;
32675
+#define EVMS_MD_INCOMPLETE (1<<0)
32680
+ mdk_personality_t *pers;
32681
+ evms_logical_node_t *node; /* evms node */
32682
+ unsigned long flag;
32683
+ int nr_raid_disks;
32687
+ struct md_list_head disks;
32689
+ mdu_param_t param;
32691
+ unsigned long curr_resync; /* blocks scheduled */
32692
+ unsigned long resync_mark; /* a recent timestamp */
32693
+ unsigned long resync_mark_cnt;/* blocks written at resync_mark */
32695
+ int recovery_running;
32696
+ struct semaphore reconfig_sem;
32697
+ struct semaphore recovery_sem;
32698
+ struct semaphore resync_sem;
32701
+ atomic_t recovery_active; /* blocks scheduled, but not written */
32702
+ md_wait_queue_head_t recovery_wait;
32704
+ struct md_list_head all_mddevs;
32707
+struct mdk_personality_s
32710
+ int (* init_io) (mddev_t *mddev, int rw, evms_sector_t LSN, evms_sector_t nr_sects, void *data);
32711
+ int (*make_request)(mddev_t *mddev, int rw, eio_t *eio);
32712
+ int (*run)(mddev_t *mddev);
32713
+ int (*stop)(mddev_t *mddev);
32714
+ int (*status)(char *page, mddev_t *mddev);
32715
+ int (*error_handler)(mddev_t *mddev, evms_logical_node_t *node);
32718
+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
32719
+ * hot-removed. Hot removal is different from failure. (failure marks
32720
+ * a disk inactive, but the disk is still part of the array) The interface
32721
+ * to such operations is the 'pers->diskop()' function, can be NULL.
32723
+ * the diskop function can change the pointer pointing to the incoming
32724
+ * descriptor, but must do so very carefully. (currently only
32725
+ * SPARE_ACTIVE expects such a change)
32727
+ int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
32729
+ int (*stop_resync)(mddev_t *mddev);
32730
+ int (*restart_resync)(mddev_t *mddev);
32731
+ int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
32732
+ int (*evms_ioctl)(mddev_t *mddev, struct inode *inode, struct file *file,
32733
+ unsigned int cmd, unsigned long arg);
32734
+ int (*md_pers_ioctl)(mddev_t *mddev, int cmd, void* pers_arg);
32737
+/* This structure is required for activating a spare device */
32738
+typedef struct evms_md_activate_spare_s {
32739
+ struct evms_md_activate_spare_s *next; /* next entry */
32740
+ mddev_t *mddev; /* target mddev */
32741
+ mdp_disk_t *spare; /* spare to activate */
32742
+} evms_md_activate_spare_t;
32745
+ * Currently we index md_array directly, based on the minor
32746
+ * number. This will have to change to dynamic allocation
32747
+ * once we start supporting partitioning of md devices.
32749
+static inline int mdidx (mddev_t * mddev)
32751
+ return mddev->__minor;
32754
+static inline kdev_t mddev_to_kdev(mddev_t * mddev)
32756
+ return MKDEV(MD_MAJOR, mdidx(mddev));
32759
+extern mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev);
32760
+extern mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr);
32761
+extern mdp_disk_t *get_spare(mddev_t *mddev);
32764
+ * iterates through some rdev ringlist. It's safe to remove the
32765
+ * current 'rdev'. Dont touch 'tmp' though.
32767
+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
32769
+ for (tmp = head.next; \
32770
+ rdev = md_list_entry(tmp, mdk_rdev_t, field), \
32771
+ tmp = tmp->next, tmp->prev != &head \
32774
+ * iterates through the 'same array disks' ringlist
32776
+#define ITERATE_RDEV(mddev,rdev,tmp) \
32777
+ ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
32780
+ * Same as above, but assumes that the device has rdev->desc_nr numbered
32781
+ * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
32783
+#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
32784
+ for (i = 0; rdev = evms_md_find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
32788
+ * Iterates through all 'RAID managed disks'
32790
+#define ITERATE_RDEV_ALL(rdev,tmp) \
32791
+ ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
32794
+ * Iterates through 'pending RAID disks'
32796
+#define ITERATE_RDEV_PENDING(rdev,tmp) \
32797
+ ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
32800
+ * iterates through all used mddevs in the system.
32802
+#define ITERATE_MDDEV(mddev,tmp) \
32804
+ for (tmp = all_mddevs.next; \
32805
+ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
32806
+ tmp = tmp->next, tmp->prev != &all_mddevs \
32809
+static inline int lock_mddev (mddev_t * mddev)
32811
+ return down_interruptible(&mddev->reconfig_sem);
32814
+static inline void unlock_mddev (mddev_t * mddev)
32816
+ up(&mddev->reconfig_sem);
32819
+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
32820
+ x = y; y = __tmp; } while (0)
32822
+#define MAX_DISKNAME_LEN 64
32824
+typedef struct dev_name_s {
32825
+ struct md_list_head list;
32827
+ char namebuf [MAX_DISKNAME_LEN];
32832
+#define __wait_event_lock_irq(wq, condition, lock) \
32834
+ wait_queue_t __wait; \
32835
+ init_waitqueue_entry(&__wait, current); \
32837
+ add_wait_queue(&wq, &__wait); \
32839
+ set_current_state(TASK_UNINTERRUPTIBLE); \
32842
+ spin_unlock_irq(&lock); \
32843
+ run_task_queue(&tq_disk); \
32845
+ spin_lock_irq(&lock); \
32847
+ current->state = TASK_RUNNING; \
32848
+ remove_wait_queue(&wq, &__wait); \
32851
+#define wait_event_lock_irq(wq, condition, lock) \
32855
+ __wait_event_lock_irq(wq, condition, lock); \
32859
+#define __wait_disk_event(wq, condition) \
32861
+ wait_queue_t __wait; \
32862
+ init_waitqueue_entry(&__wait, current); \
32864
+ add_wait_queue(&wq, &__wait); \
32866
+ set_current_state(TASK_UNINTERRUPTIBLE); \
32869
+ run_task_queue(&tq_disk); \
32872
+ current->state = TASK_RUNNING; \
32873
+ remove_wait_queue(&wq, &__wait); \
32876
+#define wait_disk_event(wq, condition) \
32880
+ __wait_disk_event(wq, condition); \
32885
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_p.h evms-2002-03-28/include/linux/evms/evms_md_p.h
32886
--- linux-2002-03-28/include/linux/evms/evms_md_p.h Wed Dec 31 18:00:00 1969
32887
+++ evms-2002-03-28/include/linux/evms/evms_md_p.h Tue Mar 26 18:58:57 2002
32890
+ * Copyright (c) International Business Machines Corp., 2000
32892
+ * This program is free software; you can redistribute it and/or modify
32893
+ * it under the terms of the GNU General Public License as published by
32894
+ * the Free Software Foundation; either version 2 of the License, or
32895
+ * (at your option) any later version.
32897
+ * This program is distributed in the hope that it will be useful,
32898
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32899
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32900
+ * the GNU General Public License for more details.
32902
+ * You should have received a copy of the GNU General Public License
32903
+ * along with this program; if not, write to the Free Software
32904
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32907
+ * linux/include/linux/evms/evms_md_p.h
32909
+ * EVMS Linux MD Region Manager Public Header File
32911
+ * 'evms_md_p.h' is an EVMS version of linux/include/linux/raid/md_p.h modified
32912
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, March 2002.
32916
+#ifndef __EVMS_MD_P_INC__
32917
+#define __EVMS_MD_P_INC__
32920
+ * RAID superblock.
32922
+ * The RAID superblock maintains some statistics on each RAID configuration.
32923
+ * Each real device in the RAID set contains it near the end of the device.
32924
+ * Some of the ideas are copied from the ext2fs implementation.
32926
+ * We currently use 4096 bytes as follows:
32928
+ * word offset function
32930
+ * 0 - 31 Constant generic RAID device information.
32931
+ * 32 - 63 Generic state information.
32932
+ * 64 - 127 Personality specific information.
32933
+ * 128 - 511 12 32-words descriptors of the disks in the raid set.
32934
+ * 512 - 911 Reserved.
32935
+ * 912 - 1023 Disk specific descriptor.
32939
+ * If x is the real device size in bytes, we return an apparent size of:
32941
+ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
32943
+ * and place the 4kB superblock at offset y.
32945
+#define MD_RESERVED_BYTES (64 * 1024)
32946
+#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
32947
+#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
32949
+#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
32950
+#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
32952
+#define MD_SB_BYTES 4096
32953
+#define MD_SB_WORDS (MD_SB_BYTES / 4)
32954
+#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
32955
+#define MD_SB_SECTORS (MD_SB_BYTES / 512)
32958
+ * The following are counted in 32-bit words
32960
+#define MD_SB_GENERIC_OFFSET 0
32961
+#define MD_SB_PERSONALITY_OFFSET 64
32962
+#define MD_SB_DISKS_OFFSET 128
32963
+#define MD_SB_DESCRIPTOR_OFFSET 992
32965
+#define MD_SB_GENERIC_CONSTANT_WORDS 32
32966
+#define MD_SB_GENERIC_STATE_WORDS 32
32967
+#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
32968
+#define MD_SB_PERSONALITY_WORDS 64
32969
+#define MD_SB_DESCRIPTOR_WORDS 32
32970
+#define MD_SB_DISKS 27
32971
+#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
32972
+#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
32973
+#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
32976
+ * Device "operational" state bits
32978
+#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
32979
+#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
32980
+#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
32981
+#define MD_DISK_REMOVED 3 /* disk has kind of been removed, but not really or it would not be here */
32982
+#define MD_DISK_NEW 4 /* disk has just been added to the raid set */
32983
+#define MD_DISK_PENDING_ACTIVE 5 /* disk was spare, but should be activated */
32985
+typedef struct mdp_device_descriptor_s {
32986
+ __u32 number; /* 0 Device number in the entire set */
32987
+ __u32 major; /* 1 Device major number */
32988
+ __u32 minor; /* 2 Device minor number */
32989
+ __u32 raid_disk; /* 3 The role of the device in the raid set */
32990
+ __u32 state; /* 4 Operational state */
32991
+ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
32994
+#define MD_SB_MAGIC 0xa92b4efc
32997
+ * Superblock state bits
32999
+#define MD_SB_CLEAN 0
33000
+#define MD_SB_ERRORS 1
33002
+typedef struct mdp_superblock_s {
33004
+ * Constant generic information
33006
+ __u32 md_magic; /* 0 MD identifier */
33007
+ __u32 major_version; /* 1 major version to which the set conforms */
33008
+ __u32 minor_version; /* 2 minor version ... */
33009
+ __u32 patch_version; /* 3 patchlevel version ... */
33010
+ __u32 gvalid_words; /* 4 Number of used words in this section */
33011
+ __u32 set_uuid0; /* 5 Raid set identifier */
33012
+ __u32 ctime; /* 6 Creation time */
33013
+ __u32 level; /* 7 Raid personality */
33014
+ __u32 size; /* 8 Apparent size of each individual disk */
33015
+ __u32 nr_disks; /* 9 total disks in the raid set */
33016
+ __u32 raid_disks; /* 10 disks in a fully functional raid set */
33017
+ __u32 md_minor; /* 11 preferred MD minor device number */
33018
+ __u32 not_persistent; /* 12 does it have a persistent superblock */
33019
+ __u32 set_uuid1; /* 13 Raid set identifier #2 */
33020
+ __u32 set_uuid2; /* 14 Raid set identifier #3 */
33021
+ __u32 set_uuid3; /* 15 Raid set identifier #4 */
33022
+ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
33025
+ * Generic state information
33027
+ __u32 utime; /* 0 Superblock update time */
33028
+ __u32 state; /* 1 State bits (clean, ...) */
33029
+ __u32 active_disks; /* 2 Number of currently active disks */
33030
+ __u32 working_disks; /* 3 Number of working disks */
33031
+ __u32 failed_disks; /* 4 Number of failed disks */
33032
+ __u32 spare_disks; /* 5 Number of spare disks */
33033
+ __u32 sb_csum; /* 6 checksum of the whole superblock */
33035
+#ifdef __BIG_ENDIAN
33036
+ __u32 events_hi; /* 7 high-order of superblock update count */
33037
+ __u32 events_lo; /* 8 low-order of superblock update count */
33039
+ __u32 events_lo; /* 7 low-order of superblock update count */
33040
+ __u32 events_hi; /* 8 high-order of superblock update count */
33043
+#if __BYTE_ORDER == __BIG_ENDIAN
33044
+ __u32 events_hi; /* 7 high-order of superblock update count */
33045
+ __u32 events_lo; /* 8 low-order of superblock update count */
33047
+ __u32 events_lo; /* 7 low-order of superblock update count */
33048
+ __u32 events_hi; /* 8 high-order of superblock update count */
33051
+ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
33054
+ * Personality information
33056
+ __u32 layout; /* 0 the array's physical layout */
33057
+ __u32 chunk_size; /* 1 chunk size in bytes */
33058
+ __u32 root_pv; /* 2 LV root PV */
33059
+ __u32 root_block; /* 3 LV root block */
33060
+ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
33063
+ * Disks information
33065
+ mdp_disk_t disks[MD_SB_DISKS];
33070
+ __u32 reserved[MD_SB_RESERVED_WORDS];
33073
+ * Active descriptor
33075
+ mdp_disk_t this_disk;
33079
+static inline __u64 md_event(mdp_super_t *sb) {
33080
+ __u64 ev = sb->events_hi;
33081
+ return (ev<<32)| sb->events_lo;
33086
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_u.h evms-2002-03-28/include/linux/evms/evms_md_u.h
33087
--- linux-2002-03-28/include/linux/evms/evms_md_u.h Wed Dec 31 18:00:00 1969
33088
+++ evms-2002-03-28/include/linux/evms/evms_md_u.h Wed Mar 6 17:08:40 2002
33091
+ * Copyright (c) International Business Machines Corp., 2000
33093
+ * This program is free software; you can redistribute it and/or modify
33094
+ * it under the terms of the GNU General Public License as published by
33095
+ * the Free Software Foundation; either version 2 of the License, or
33096
+ * (at your option) any later version.
33098
+ * This program is distributed in the hope that it will be useful,
33099
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
33100
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33101
+ * the GNU General Public License for more details.
33103
+ * You should have received a copy of the GNU General Public License
33104
+ * along with this program; if not, write to the Free Software
33105
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33108
+ * linux/include/linux/evms/evms_md_u.h
33110
+ * EVMS MD Region Manager, User <-> Kernel common file
33114
+#ifndef _EVMS_MD_U_INC_
33115
+#define _EVMS_MD_U_INC_
33117
+#define EVMS_MD_ID 4
33118
+#define MD_SET_PLUGIN_ID SetPluginID(IBM_OEM_ID,EVMS_REGION_MANAGER,EVMS_MD_ID)
33120
+#define EVMS_MD_PERS_IOCTL_CMD 1 /* personality specific ioctl command */
33121
+#define EVMS_MD_ADD 2
33122
+#define EVMS_MD_REMOVE 3
33123
+#define EVMS_MD_ACTIVATE 4
33124
+#define EVMS_MD_DEACTIVATE 5
33125
+#define EVMS_MD_GET_ARRAY_INFO 6
33127
+/* structure definition to use with MD_ADD, MD_REMOVE, MD_ACTIVATE */
33128
+typedef struct evms_md_kdev_s {
33129
+ u_int32_t major; /* 1 Device major number */
33130
+ u_int32_t minor; /* 2 Device minor number */
33133
+/* structure definition to use with MD_GET_ARRAY_INFO */
33134
+#define EVMS_MD_ARRAY_DEGRADED (1<<0)
33135
+#define EVMS_MD_ARRAY_SYNCING (1<<1)
33136
+typedef struct evms_md_array_info_s {
33137
+ unsigned long state; /* degraded mode, syncing,...*/
33138
+ mdp_super_t *sb; /* array super block */
33139
+} evms_md_array_info_t;
33141
+typedef struct evms_md_ioctl_s {
33142
+ int mddev_idx; /* same as __minor in mddev_s struct */
33143
+ int cmd; /* Command for personality */
33144
+ void *arg; /* Command specific ioctl command structure */
33145
+} evms_md_ioctl_t;
33147
+/* Needed by mddev_s structure in evms_md_k.h */
33148
+typedef struct mdu_param_s
33150
+ int personality; /* 1,2,3,4 */
33151
+ int chunk_size; /* in bytes */
33152
+ int max_fault; /* unused for now */
33158
diff -Naur linux-2002-03-28/include/linux/evms/evms_os2.h evms-2002-03-28/include/linux/evms/evms_os2.h
33159
--- linux-2002-03-28/include/linux/evms/evms_os2.h Wed Dec 31 18:00:00 1969
33160
+++ evms-2002-03-28/include/linux/evms/evms_os2.h Wed Mar 27 23:55:42 2002
33164
+ * Copyright (c) International Business Machines Corp., 2000
33166
+ * This program is free software; you can redistribute it and/or modify
33167
+ * it under the terms of the GNU General Public License as published by
33168
+ * the Free Software Foundation; either version 2 of the License, or
33169
+ * (at your option) any later version.
33171
+ * This program is distributed in the hope that it will be useful,
33172
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
33173
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33174
+ * the GNU General Public License for more details.
33176
+ * You should have received a copy of the GNU General Public License
33177
+ * along with this program; if not, write to the Free Software
33178
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33180
+ * Module: linux/include/linux/evms/evms_os2.h
33184
+ * Change History:
33189
+ * Description: This module defines the disk structures used by the OS/2
33190
+ * Logical Volume Manager, including that of the Master
33191
+ * Boot Record (MBR) and Extended Boot Records (EBR).
33193
+ * Notes: LVM Drive Letter Assignment Tables (DLA_Tables) appear on the
33194
+ * last sector of each track containing a valid MBR or EBR. Since
33195
+ * partitions must be track aligned, any track containing an MBR or
33196
+ * EBR will be almost all empty sectors. We will grab the last
33197
+ * of these empty sectors for our DLT_Tables.
33202
+#ifndef OS2LVM_INCLUDED__
33203
+#define OS2LVM_INCLUDED__
33205
+/* The following define the values used to indicate that a partition table entry is for an EBR, not a partition. */
33206
+#define EBR_BOOT_INDICATOR 0
33207
+#define EBR_FORMAT_INDICATOR 5
33209
+/* The following define is used as the default Format_Indicator for new non-primary partitions. */
33210
+#define NEW_LOGICAL_DRIVE_FORMAT_INDICATOR 0x6
33212
+/* The following define is used as the default Format_Indicator for a new non-active primary partitions. */
33213
+#define NEW_PRIMARY_PARTITION_FORMAT_INDICATOR 0x16
33215
+/* The following define is used as the default Format_Indicator for a new active primary partition. */
33216
+#define NEW_ACTIVE_PRIMARY_PARTITION_FORMAT_INDICATOR 0x06
33218
+/* The following define is used to hold the value of the Boot_Indicator for active partitions. */
33219
+#define ACTIVE_PARTITION 0x80
33221
+/* Define the size of a Partition Name. Partition Names are user defined names given to a partition. */
33222
+#define PARTITION_NAME_SIZE 20
33224
+/* Define the size of a volume name. Volume Names are user defined names given to a volume. */
33225
+#define VOLUME_NAME_SIZE 20
33227
+/* Define the size of a disk name. Disk Names are user defined names given to physical disk drives in the system. */
33228
+#define DISK_NAME_SIZE 20
33230
+/* The name of the filesystem in use on a partition. This name may be up to 12 ( + NULL terminator) characters long. */
33231
+#define FILESYSTEM_NAME_SIZE 20
33233
+/* The comment field is reserved but is not currently used. This is for future expansion and use. */
33234
+#define COMMENT_SIZE 81
33237
+/* Define the minimum number of sectors to reserve on the disk for Boot Manager. */
33238
+#define BOOT_MANAGER_SIZE 2048
33240
+#define OS2_BYTES_PER_SECTOR 512
33241
+#define OS2_SECTOR_SHIFT 9
33244
+/*--------------------------------------------------
33245
+ * Type definitions
33246
+ --------------------------------------------------*/
33248
+/* The following definitions define the drive letter assignment table used by LVM.
33249
+ For each partition table on the disk, there will be a drive letter assignment table in the last sector
33250
+ of the track containing the partition table. */
33252
+/* NOTE: DLA stands for Drive Letter Assignment. */
33254
+#define DLA_TABLE_SIGNATURE1 0x424D5202L
33255
+#define DLA_TABLE_SIGNATURE2 0x44464D50L
33258
+typedef struct _DLA_Entry { /* DE */
33259
+ u_int32_t Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
33260
+ u_int32_t Partition_Serial_Number; /* The serial number of this partition. */
33261
+ u_int32_t Partition_Size; /* The size of the partition, in sectors. */
33262
+ u_int32_t Partition_Start; /* The starting sector of the partition. */
33263
+ unsigned char On_Boot_Manager_Menu; /* Set to TRUE if this volume/partition is on the Boot Manager Menu. */
33264
+ unsigned char Installable; /* Set to TRUE if this volume is the one to install the operating system on. */
33265
+ char Drive_Letter; /* The drive letter assigned to the partition. */
33266
+ unsigned char Reserved;
33267
+ char Volume_Name[VOLUME_NAME_SIZE]; /* The name assigned to the volume by the user. */
33268
+ char Partition_Name[PARTITION_NAME_SIZE]; /* The name assigned to the partition. */
33271
+typedef struct _DLA_Table_Sector { /* DTS */
33272
+ u_int32_t DLA_Signature1; /* The magic signature (part 1) of a Drive Letter Assignment Table. */
33273
+ u_int32_t DLA_Signature2; /* The magic signature (part 2) of a Drive Letter Assignment Table. */
33274
+ u_int32_t DLA_CRC; /* The 32 bit CRC for this sector. Calculated assuming that this field and all unused space in the sector is 0. */
33275
+ u_int32_t Disk_Serial_Number; /* The serial number assigned to this disk. */
33276
+ u_int32_t Boot_Disk_Serial_Number; /* The serial number of the disk used to boot the system. This is for conflict resolution when multiple volumes
33277
+ want the same drive letter. Since LVM.EXE will not let this situation happen, the only way to get this situation
33278
+ is for the disk to have been altered by something other than LVM.EXE, or if a disk drive has been moved from one
33279
+ machine to another. If the drive has been moved, then it should have a different Boot_Disk_Serial_Number. Thus,
33280
+ we can tell which disk drive is the "foreign" drive and therefore reject its claim for the drive letter in question.
33281
+ If we find that all of the claimants have the same Boot_Disk_Serial_Number, then we must assign drive letters on
33282
+ a first come, first serve basis.*/
33283
+ u_int32_t Install_Flags; /* Used by the Install program. */
33284
+ u_int32_t Cylinders;
33285
+ u_int32_t Heads_Per_Cylinder;
33286
+ u_int32_t Sectors_Per_Track;
33287
+ char Disk_Name[DISK_NAME_SIZE]; /* The name assigned to the disk containing this sector. */
33288
+ unsigned char Reboot; /* For use by Install. Used to keep track of reboots initiated by install. */
33289
+ unsigned char Reserved[3]; /* Alignment. */
33290
+ DLA_Entry DLA_Array[4]; /* These are the four entries which correspond to the entries in the partition table. */
33291
+} DLA_Table_Sector;
33294
+/* The following definitions define the LVM signature sector which will appear as the last sector in an LVM partition. */
33297
+#define OS2LVM_PRIMARY_SIGNATURE 0x4A435332L
33298
+#define OS2LVM_SECONDARY_SIGNATURE 0x4252444BL
33301
+#define CURRENT_OS2LVM_MAJOR_VERSION_NUMBER 2 /* Define as appropriate. */
33302
+#define CURRENT_OS2LVM_MINOR_VERSION_NUMBER 0 /* Define as appropriate. */
33305
+/* The following definitions limit the number of LVM features that can be applied to a volume, as well as defining a "NULL" feature for use in feature table entries that are not being used. */
33306
+#define OS2LVM_MAX_FEATURES_PER_VOLUME 10 /* The maximum number of LVM features that can be applied to a volume. */
33307
+#define OS2LVM_NULL_FEATURE 0 /* No feature. Used in all unused entries of the feature array in the LVM Signature sector. */
33310
+/* The following structure is used to hold the location of the feature specific data for LVM features. */
33311
+typedef struct _LVM_Feature_Data { /* LFD */
33312
+ u_int32_t Feature_ID; /* The ID of the feature. */
33313
+ u_int32_t Location_Of_Primary_Feature_Data; /* The u_int32_t of the starting sector of the private data for this feature. */
33314
+ u_int32_t Location_Of_Secondary_Feature_Data; /* The u_int32_t of the starting sector of the backup copy of the private data for this feature. */
33315
+ u_int32_t Feature_Data_Size; /* The number of sectors used by this feature for its private data. */
33316
+ u_int16_t Feature_Major_Version_Number; /* The integer portion of the version number of this feature. */
33317
+ u_int16_t Feature_Minor_Version_Number; /* The decimal portion of the version number of this feature. */
33318
+ unsigned char Feature_Active; /* TRUE if this feature is active on this partition/volume, FALSE otherwise. */
33319
+ unsigned char Reserved[3]; /* Alignment. */
33320
+} LVM_Feature_Data;
33323
+/* The following structure defines the LVM Signature Sector. This is the last sector of every partition which is part of an LVM volume. It gives vital
33324
+ information about the version of LVM used to create the LVM volume that it is a part of, as well as which LVM features (BBR, drive linking, etc.) are
33325
+ active on the volume that this partition is a part of. */
33326
+typedef struct _LVM_Signature_Sector { /* LSS */
33327
+ u_int32_t LVM_Signature1; /* The first part of the magic LVM signature. */
33328
+ u_int32_t LVM_Signature2; /* The second part of the magic LVM signature. */
33329
+ u_int32_t Signature_Sector_CRC; /* 32 bit CRC for this sector. Calculated using 0 for this field. */
33330
+ u_int32_t Partition_Serial_Number; /* The LVM assigned serial number for this partition. */
33331
+ u_int32_t Partition_Start; /* u_int32_t of the first sector of this partition. */
33332
+ u_int32_t Partition_End; /* u_int32_t of the last sector of this partition. */
33333
+ u_int32_t Partition_Sector_Count; /* The number of sectors in this partition. */
33334
+ u_int32_t LVM_Reserved_Sector_Count; /* The number of sectors reserved for use by LVM. */
33335
+ u_int32_t Partition_Size_To_Report_To_User; /* The size of the partition as the user sees it - i.e. (the actual size of the partition - LVM reserved sectors) rounded to a track boundary. */
33336
+ u_int32_t Boot_Disk_Serial_Number; /* The serial number of the boot disk for the system. If the system contains Boot Manager, then this is the serial number of the disk containing the active copy of Boot Manager. */
33337
+ u_int32_t Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
33338
+ u_int32_t Fake_EBR_Location; /* The location, on disk, of a Fake EBR, if one has been allocated. */
33339
+ u_int16_t LVM_Major_Version_Number; /* Major version number of the LVM that created this partition. */
33340
+ u_int16_t LVM_Minor_Version_Number; /* Minor version number of the LVM that created this partition. */
33341
+ char Partition_Name[PARTITION_NAME_SIZE]; /* User defined partition name. */
33342
+ char Volume_Name[VOLUME_NAME_SIZE]; /* The name of the volume that this partition belongs to. */
33343
+ LVM_Feature_Data LVM_Feature_Array[OS2LVM_MAX_FEATURES_PER_VOLUME]; /* The feature array. This indicates which LVM features, if any, are active on this volume
33344
+ and what order they should be applied in. */
33345
+ char Drive_Letter; /* The drive letter assigned to the volume that this partition is part of. */
33346
+ unsigned char Fake_EBR_Allocated; /* If TRUE, then a fake EBR has been allocated. */
33347
+ char Comment[COMMENT_SIZE]; /* User comment. */
33348
+ char Disk_Name[DISK_NAME_SIZE]; /* Added to allow BBR to report the name of a disk when bad sectors are encountered on that disk. */
33349
+ u_int32_t Sequence_Number; /* This indicates the order that partitions within a volume are used. This number is 1 based. A 0 here indicates that the volume was made by LVM Ver. 1. */
33350
+ u_int32_t Next_Aggregate_Number; /* Used during volume creation and expansion when creating unique names for aggregates. */
33351
+ /* The remainder of the sector is reserved for future use and should be all zero or else the CRC will not come out correctly. */
33352
+} LVM_Signature_Sector;
33355
+/* The following definitions define the format of a partition table and the Master Boot Record (MBR). */
33356
+typedef struct _Partition_Record { /* PR */
33357
+ unsigned char Boot_Indicator; /* 80h = active partition. */
33358
+ unsigned char Starting_Head;
33359
+ unsigned char Starting_Sector; /* Bits 0-5 are the sector. Bits 6 and 7 are the high order bits of the starting cylinder. */
33360
+ unsigned char Starting_Cylinder; /* The cylinder number is a 10 bit value. The high order bits of the 10 bit value come from bits 6 & 7 of the Starting_Sector field. */
33361
+ unsigned char Format_Indicator; /* An indicator of the format/operation system on this partition. */
33362
+ unsigned char Ending_Head;
33363
+ unsigned char Ending_Sector;
33364
+ unsigned char Ending_Cylinder;
33365
+ u_int32_t Sector_Offset; /* The number of sectors on the disk which are prior to the start of this partition. */
33366
+ u_int32_t Sector_Count; /* The number of sectors in this partition. */
33367
+} Partition_Record;
33369
+typedef struct _Master_Boot_Record { /* MBR */
33370
+ unsigned char Reserved[446];
33371
+ Partition_Record Partition_Table[4];
33372
+ u_int16_t Signature; /* AA55h in this field indicates that this is a valid partition table/MBR. */
33373
+} Master_Boot_Record;
33375
+typedef Master_Boot_Record Extended_Boot_Record;
33377
+/* The following definition covers the Boot Manager Alias Table in the EBR.
33379
+ The Alias Table in the EBR has 2 entries in it, although only the first one is actually used. */
33380
+#define ALIAS_NAME_SIZE 8
33381
+typedef struct _AliasTableEntry { /* ATE */
33382
+ unsigned char On_Boot_Manager_Menu;
33383
+ char Name[ALIAS_NAME_SIZE];
33384
+} AliasTableEntry;
33386
+#define ALIAS_TABLE_OFFSET 0x18A
33389
+/* The following text is used for the Boot Manager Alias for items that were placed on the Boot Manager Menu by FDISK and
33390
+ which have since been migrated to the new LVM format. This text is put into the Name field of an AliasTableEntry so
33391
+ that, if FDISK ( or another program which understands the old Boot Manager Menu format) is run, it will display
33392
+ something for those partitions/volumes which are on the Boot Manager Menu.
33394
+ NOTE: This text must be exactly ALIAS_NAME_SIZE characters in length! */
33395
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT "--> LVM "
33396
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT2 "--> LVM*"
33400
+/* The following is the signature used for an Master Boot Record, an Extended Boot Record, and a Boot Sector. */
33401
+#define MBR_EBR_SIGNATURE 0xAA55
33404
+/* The following list of definitions defines the values of interest for the Format_Indicator in a Partition_Record. */
33405
+#define EBR_INDICATOR 0x5
33406
+#define WINDOZE_EBR_INDICATOR 0xF
33407
+#define UNUSED_INDICATOR 0x0
33408
+#define IFS_INDICATOR 0x7
33409
+#define FAT12_INDICATOR 0x1
33410
+#define FAT16_SMALL_PARTITION_INDICATOR 0x4
33411
+#define FAT16_LARGE_PARTITION_INDICATOR 0x6
33412
+#define BOOT_MANAGER_HIDDEN_PARTITION_FLAG 0x10
33413
+#define LVM_PARTITION_INDICATOR 0x35
33414
+#define BOOT_MANAGER_INDICATOR 0x0A
33417
+/* The following is the signature used in the Boot Sector for Boot Manager. */
33418
+#define OS2LVM_BOOT_MANAGER_SIGNATURE "APJ&WN"
33421
+/* The following is used for determining the synthetic geometry reported for Volumes employing drive linking. */
33422
+#define OS2LVM_SYNTHETIC_SECTORS_PER_TRACK 63
33425
+/*--------------------------------------------------
33426
+ * Declares for Drive Linking feature:
33427
+ *--------------------------------------------------*/
33429
+/* The following defines uniquely identify Drive Linking. */
33430
+#define DRIVE_LINKING_FEATURE_ID 100
33431
+#define DRIVE_LINKING_MAJOR_VERSION 1
33432
+#define DRIVE_LINKING_MINOR_VERSION 0
33434
+/* The following definitions are used for the disk structures supporting drive linking. */
33436
+#define LINK_TABLE_MASTER_SIGNATURE 0x434E4157L
33437
+#define LINK_TABLE_SIGNATURE 0X4D4D5652L
33439
+#define MAXIMUM_LINKS 246
33441
+#define DRIVE_LINKING_RESERVED_SECTOR_COUNT 4
33443
+#define LINKS_IN_FIRST_SECTOR 60
33445
+#define LINKS_IN_NEXT_SECTOR 62
33447
+typedef struct _Drive_Link {
33448
+ u_int32_t Drive_Serial_Number;
33449
+ u_int32_t Partition_Serial_Number;
33452
+typedef struct _LVM_Link_Table_First_Sector {
33453
+ u_int32_t Link_Table_Signature; /* Use the LINK_TABLE_MASTER_SIGNATURE here. */
33454
+ u_int32_t Link_Table_CRC;
33455
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
33456
+ u_int32_t Links_In_Use;
33457
+ Drive_Link Link_Table[LINKS_IN_FIRST_SECTOR];
33458
+} LVM_Link_Table_First_Sector;
33460
+typedef struct _LVM_Link_Table_Sector {
33461
+ u_int32_t Link_Table_Signature; /* Use LINK_TABLE_SIGNATURE here. */
33462
+ u_int32_t Link_Table_CRC;
33463
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
33464
+ Drive_Link Link_Table[LINKS_IN_NEXT_SECTOR];
33465
+} LVM_Link_Table_Sector;
33468
+/*--------------------------------------------------
33469
+ * Declares for Bad Block Relocation feature:
33470
+ *--------------------------------------------------*/
33472
+/* The following definition is the numeric ID for Bad Block Relocation. */
33473
+#define BBR_FEATURE_ID 101
33475
+#define BBR_FEATURE_MAJOR_VERSION 0x0001
33476
+#define BBR_FEATURE_MINOR_VERSION 0x0000
33478
+/* The following definitions are used for the disk structures supporting bad block relocation. */
33480
+/* NOTE: BBR stands for Bad Block Relocation. */
33482
+#define BBR_TABLE_MASTER_SIGNATURE 0x00726D62
33483
+#define BBR_TABLE_SIGNATURE 0x01726276
33486
+typedef struct _BBR_Table_Entry {
33487
+ u_int32_t BadSector;
33488
+ u_int32_t ReplacementSector;
33489
+} BBR_Table_Entry;
33491
+typedef struct _LVM_BBR_Table_First_Sector {
33492
+ u_int32_t Signature; /* Signature for the first sector of the BBR Table. Use BBR_TABLE_MASTER_SIGNATURE here.*/
33493
+ u_int32_t CRC;/* CRC for this sector.*/
33494
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match.*/
33495
+ u_int32_t Table_Size; /* The number of BBR_Table_Entries in the BBR Table.*/
33496
+ u_int32_t Table_Entries_In_Use;/* The number of BBR Table entries which are in use.*/
33497
+ u_int32_t Sectors_Per_Table; /* The number of LVM_BBR_Table_Sectors used to hold the BBR Table.*/
33498
+ u_int32_t First_Replacement_Sector; /* The location of the first replacement sector.*/
33499
+ u_int32_t Last_Replacement_Sector; /* The location of the last replacement sector.*/
33500
+ u_int32_t Replacement_Sector_Count; /* The number of replacement sectors.*/
33501
+ u_int32_t Flags; /* Flags global to the Bad Block Relocation Feature.*/
33502
+} LVM_BBR_Table_First_Sector;
33504
+/* Flags for LVM_BBR_Table_First_Sector */
33505
+#define BBR_Flag_Write_Verify 0x00000001/* Indicate convert Write I/O to Write/Verify*/
33507
+#define BBR_TABLE_ENTRIES_PER_SECTOR 62
33509
+typedef struct _LVM_BBR_Table_Sector {
33510
+ u_int32_t Signature;/* Signature for a sector of the BBR_Table which is not the first sector of the BBR Table. Use BBR_TABLE_SIGNATURE here.*/
33511
+ u_int32_t CRC;/* CRC for this sector of the BBR Table.*/
33512
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match.*/
33513
+ BBR_Table_Entry BBR_Table[BBR_TABLE_ENTRIES_PER_SECTOR];
33514
+ u_int32_t reserved1;/* for block alignment*/
33515
+} LVM_BBR_Table_Sector;
33518
+// Combined structure to hold entire BBR feature data as it exists on disk.
33519
+typedef struct _LVM_BBR_Feature
33521
+ LVM_BBR_Table_First_Sector control;
33522
+ char reserved1[OS2_BYTES_PER_SECTOR - sizeof(LVM_BBR_Table_First_Sector)];
33523
+ LVM_BBR_Table_Sector remap[1];
33524
+} LVM_BBR_Feature;
33526
+/* The following defines establish the minimum and maximum number of replacement sectors which can be allocated for
33527
+ Bad Block Relocation. Otherwise, 1 replacement sector per MB of disk space is allocated. */
33528
+#define BBR_FLOOR 62
33529
+#define BBR_LIMIT 4096
33533
+// In-memory Meta Data for Bad Block Relocation
33534
+// In-memory Meta Data for Drive Linking
33535
+typedef struct os2_drivelink_runtime_entry_s {
33536
+ evms_sector_t start_sector;
33537
+ evms_sector_t sector_count;
33538
+ evms_sector_t Drive_Link_Data_Copy1; /* LSN of first on-disk copy of drive linking data. */
33539
+ evms_sector_t Drive_Link_Data_Copy2; /* LSN of the second on-disk copy of drive linking data. */
33541
+ u_int32_t Partition_Serial_Number;
33542
+ evms_sector_t BBR_Data_Copy1; /* LSN of the first on-disk copy of the BBR data.*/
33543
+ evms_sector_t BBR_Data_Copy2; /* LSN of the second on-disk copy of the BBR data.*/
33544
+ u_int32_t BBR_Feature_Size; /* # of sectors of BBR data. */
33545
+ u_int32_t bbr_is_active;
33546
+ struct semaphore BBR_Table_Lock; /* Used to serialize writers */
33547
+ unsigned int Guard1; /* Lamport's Theorem for mutual exclusion */
33549
+ unsigned int Guard2; /* Lamport's Theorem for mutual exclusion */
33550
+ evms_logical_node_t *link_partition;
33551
+ struct os2_drivelink_runtime_entry_s *next;
33552
+} os2_drivelink_runtime_entry_t;
33554
+// In-memory Meta Data for each OS/2 LVM Volume:
33555
+typedef struct os2_volume_runtime_entry_s {
33557
+ u_int32_t Export_Needed;
33558
+ evms_sector_t size_in_sectors;
33559
+ u_int32_t Volume_Serial_Number;
33560
+ u_int32_t drive_link_count;
33561
+ os2_drivelink_runtime_entry_t *drive_link;
33562
+ evms_logical_node_t *next_os2lvm_node;
33563
+} os2_volume_runtime_entry_t;
33569
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid0.h evms-2002-03-28/include/linux/evms/evms_raid0.h
33570
--- linux-2002-03-28/include/linux/evms/evms_raid0.h Wed Dec 31 18:00:00 1969
33571
+++ evms-2002-03-28/include/linux/evms/evms_raid0.h Thu Jan 3 13:15:19 2002
33576
+#include <linux/evms/evms_md.h>
33580
+ unsigned long zone_offset; /* Zone offset in md_dev */
33581
+ unsigned long dev_offset; /* Zone offset in real dev */
33582
+ unsigned long size; /* Zone size */
33583
+ int nb_dev; /* # of devices attached to the zone */
33584
+ mdk_rdev_t *dev[MD_SB_DISKS]; /* Devices attached to the zone */
33589
+ struct strip_zone *zone0, *zone1;
33592
+struct raid0_private_data
33594
+ struct raid0_hash *hash_table; /* Dynamically allocated */
33595
+ struct strip_zone *strip_zone; /* This one too */
33596
+ int nr_strip_zones;
33597
+ struct strip_zone *smallest;
33601
+typedef struct raid0_private_data raid0_conf_t;
33603
+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
33606
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid1.h evms-2002-03-28/include/linux/evms/evms_raid1.h
33607
--- linux-2002-03-28/include/linux/evms/evms_raid1.h Wed Dec 31 18:00:00 1969
33608
+++ evms-2002-03-28/include/linux/evms/evms_raid1.h Mon Mar 11 22:58:26 2002
33610
+#ifndef _EVMS_RAID1_H
33611
+#define _EVMS_RAID1_H
33613
+#include <linux/evms/evms_md.h>
33615
+struct mirror_info {
33618
+ evms_logical_node_t *node;
33621
+ int head_position;
33633
+struct raid1_private_data {
33635
+ struct mirror_info mirrors[MD_SB_DISKS];
33638
+ int working_disks;
33640
+ unsigned long next_sect;
33642
+ evms_thread_t *thread, *resync_thread;
33643
+ int resync_mirrors;
33644
+ struct mirror_info *spare;
33645
+ md_spinlock_t device_lock;
33647
+ /* buffer pool */
33648
+ /* buffer_heads that we have pre-allocated have b_pprev -> &freebh
33649
+ * and are linked into a stack using b_next
33650
+ * raid1_bh that are pre-allocated have R1BH_PreAlloc set.
33651
+ * All these variable are protected by device_lock
33653
+ struct buffer_head *freebh;
33654
+ int freebh_cnt; /* how many are on the list */
33655
+ int freebh_blocked;
33656
+ struct raid1_bh *freer1;
33657
+ int freer1_blocked;
33659
+ struct raid1_bh *freebuf; /* each bh_req has a page allocated */
33660
+ md_wait_queue_head_t wait_buffer;
33662
+ /* for use when syncing mirrors: */
33663
+ unsigned long start_active, start_ready,
33664
+ start_pending, start_future;
33665
+ int cnt_done, cnt_active, cnt_ready,
33666
+ cnt_pending, cnt_future;
33669
+ md_wait_queue_head_t wait_done;
33670
+ md_wait_queue_head_t wait_ready;
33671
+ md_spinlock_t segment_lock;
33674
+typedef struct raid1_private_data raid1_conf_t;
33677
+ * this is the only point in the RAID code where we violate
33678
+ * C type safety. mddev->private is an 'opaque' pointer.
33680
+#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
33683
+ * this is our 'private' 'collective' RAID1 buffer head.
33684
+ * it contains information about what kind of IO operations were started
33685
+ * for this RAID1 operation, and about their status:
33688
+/* This structure is used to map a buffer head to a evms logical node */
33689
+typedef struct raid1_node_map_s {
33690
+ evms_logical_node_t *node;
33691
+ struct buffer_head *bh;
33692
+} raid1_node_map_t;
33695
+ atomic_t remaining; /* 'have we finished' count,
33696
+ * used from IRQ handlers
33699
+ unsigned long state;
33701
+ struct buffer_head *master_bh;
33702
+ struct buffer_head *mirror_bh_list;
33703
+ raid1_node_map_t mirror_node_map[MD_SB_DISKS];
33704
+ struct buffer_head bh_req;
33705
+ evms_logical_node_t *node; /* map to evms node (READ only) */
33707
+ struct raid1_bh *next_r1; /* next for retry or in free list */
33709
+/* bits for raid1_bh.state */
33710
+#define R1BH_Uptodate 1
33711
+#define R1BH_SyncPhase 2
33712
+#define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */
33714
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid5.h evms-2002-03-28/include/linux/evms/evms_raid5.h
33715
--- linux-2002-03-28/include/linux/evms/evms_raid5.h Wed Dec 31 18:00:00 1969
33716
+++ evms-2002-03-28/include/linux/evms/evms_raid5.h Mon Mar 11 22:58:36 2002
33721
+#include <linux/evms/evms_md.h>
33722
+#include <linux/evms/evms_xor.h>
33726
+ * Each stripe contains one buffer per disc. Each buffer can be in
33727
+ * one of a number of states determined by bh_state. Changes between
33728
+ * these states happen *almost* exclusively under a per-stripe
33729
+ * spinlock. Some very specific changes can happen in b_end_io, and
33730
+ * these are not protected by the spin lock.
33732
+ * The bh_state bits that are used to represent these states are:
33733
+ * BH_Uptodate, BH_Lock
33735
+ * State Empty == !Uptodate, !Lock
33736
+ * We have no data, and there is no active request
33737
+ * State Want == !Uptodate, Lock
33738
+ * A read request is being submitted for this block
33739
+ * State Dirty == Uptodate, Lock
33740
+ * Some new data is in this buffer, and it is being written out
33741
+ * State Clean == Uptodate, !Lock
33742
+ * We have valid data which is the same as on disc
33744
+ * The possible state transitions are:
33746
+ * Empty -> Want - on read or write to get old data for parity calc
33747
+ * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
33748
+ * Empty -> Clean - on compute_block when computing a block for failed drive
33749
+ * Want -> Empty - on failed read
33750
+ * Want -> Clean - on successful completion of read request
33751
+ * Dirty -> Clean - on successful completion of write request
33752
+ * Dirty -> Clean - on failed write
33753
+ * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
33755
+ * The Want->Empty, Want->Clean, Dirty->Clean, transitions
33756
+ * all happen in b_end_io at interrupt time.
33757
+ * Each sets the Uptodate bit before releasing the Lock bit.
33758
+ * This leaves one multi-stage transition:
33759
+ * Want->Dirty->Clean
33760
+ * This is safe because thinking that a Clean buffer is actually dirty
33761
+ * will at worst delay some action, and the stripe will be scheduled
33762
+ * for attention after the transition is complete.
33764
+ * There is one possibility that is not covered by these states. That
33765
+ * is if one drive has failed and there is a spare being rebuilt. We
33766
+ * can't distinguish between a clean block that has been generated
33767
+ * from parity calculations, and a clean block that has been
33768
+ * successfully written to the spare ( or to parity when resyncing).
33769
+ * To distingush these states we have a stripe bit STRIPE_INSYNC that
33770
+ * is set whenever a write is scheduled to the spare, or to the parity
33771
+ * disc if there is no spare. A sync request clears this bit, and
33772
+ * when we find it set with no buffers locked, we know the sync is
33775
+ * Buffers for the md device that arrive via make_request are attached
33776
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
33777
+ * One list (bh_read) for read requests, one (bh_write) for write.
33778
+ * There should never be more than one buffer on the two lists
33779
+ * together, but we are not guaranteed of that so we allow for more.
33781
+ * If a buffer is on the read list when the associated cache buffer is
33782
+ * Uptodate, the data is copied into the read buffer and it's b_end_io
33783
+ * routine is called. This may happen in the end_request routine only
33784
+ * if the buffer has just successfully been read. end_request should
33785
+ * remove the buffers from the list and then set the Uptodate bit on
33786
+ * the buffer. Other threads may do this only if they first check
33787
+ * that the Uptodate bit is set. Once they have checked that they may
33788
+ * take buffers off the read queue.
33790
+ * When a buffer on the write list is committed for write is it copied
33791
+ * into the cache buffer, which is then marked dirty, and moved onto a
33792
+ * third list, the written list (bh_written). Once both the parity
33793
+ * block and the cached buffer are successfully written, any buffer on
33794
+ * a written list can be returned with b_end_io.
33796
+ * The write list and read list both act as fifos. The read list is
33797
+ * protected by the device_lock. The write and written lists are
33798
+ * protected by the stripe lock. The device_lock, which can be
33799
+ * claimed while the stipe lock is held, is only for list
33800
+ * manipulations and will only be held for a very short time. It can
33801
+ * be claimed from interrupts.
33804
+ * Stripes in the stripe cache can be on one of two lists (or on
33805
+ * neither). The "inactive_list" contains stripes which are not
33806
+ * currently being used for any request. They can freely be reused
33807
+ * for another stripe. The "handle_list" contains stripes that need
33808
+ * to be handled in some way. Both of these are fifo queues. Each
33809
+ * stripe is also (potentially) linked to a hash bucket in the hash
33810
+ * table so that it can be found by sector number. Stripes that are
33811
+ * not hashed must be on the inactive_list, and will normally be at
33812
+ * the front. All stripes start life this way.
33814
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
33816
+ * - stripes on the inactive_list never have their stripe_lock held.
33817
+ * - stripes have a reference counter. If count==0, they are on a list.
33818
+ * - If a stripe might need handling, STRIPE_HANDLE is set.
33819
+ * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
33820
+ * handle_list else inactive_list
33822
+ * This, combined with the fact that STRIPE_HANDLE is only ever
33823
+ * cleared while a stripe has a non-zero count means that if the
33824
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
33825
+ * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
33826
+ * the stripe is on inactive_list.
33828
+ * The possible transitions are:
33829
+ * activate an unhashed/inactive stripe (get_active_stripe())
33830
+ * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
33831
+ * activate a hashed, possibly active stripe (get_active_stripe())
33832
+ * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
33833
+ * attach a request to an active stripe (add_stripe_bh())
33834
+ * lockdev attach-buffer unlockdev
33835
+ * handle a stripe (handle_stripe())
33836
+ * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
33837
+ * release an active stripe (release_stripe())
33838
+ * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
33840
+ * The refcount counts each thread that have activated the stripe,
33841
+ * plus raid5d if it is handling it, plus one for each active request
33842
+ * on a cached buffer.
33844
+struct stripe_head {
33845
+ struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
33846
+ struct list_head lru; /* inactive_list or handle_list */
33847
+ struct raid5_private_data *raid_conf;
33848
+ struct buffer_head *bh_cache[MD_SB_DISKS]; /* buffered copy */
33849
+ struct buffer_head *bh_read[MD_SB_DISKS]; /* read request buffers of the MD device */
33850
+ struct buffer_head *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */
33851
+ struct buffer_head *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
33852
+ struct page *bh_page[MD_SB_DISKS]; /* saved bh_cache[n]->b_page when reading around the cache */
33853
+ evms_logical_node_t *node[MD_SB_DISKS]; /* the target device node */
33854
+ unsigned long sector; /* sector of this row */
33855
+ int size; /* buffers size */
33856
+ int pd_idx; /* parity disk index */
33857
+ unsigned long state; /* state flags */
33858
+ atomic_t count; /* nr of active thread/requests */
33867
+#define RECONSTRUCT_WRITE 1
33868
+#define READ_MODIFY_WRITE 2
33869
+/* not a write method, but a compute_parity mode */
33870
+#define CHECK_PARITY 3
33875
+#define STRIPE_ERROR 1
33876
+#define STRIPE_HANDLE 2
33877
+#define STRIPE_SYNCING 3
33878
+#define STRIPE_INSYNC 4
33879
+#define STRIPE_PREREAD_ACTIVE 5
33880
+#define STRIPE_DELAYED 6
33885
+ * To improve write throughput, we need to delay the handling of some
33886
+ * stripes until there has been a chance that several write requests
33887
+ * for the one stripe have all been collected.
33888
+ * In particular, any write request that would require pre-reading
33889
+ * is put on a "delayed" queue until there are no stripes currently
33890
+ * in a pre-read phase. Further, if the "delayed" queue is empty when
33891
+ * a stripe is put on it then we "plug" the queue and do not process it
33892
+ * until an unplg call is made. (the tq_disk list is run).
33894
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
33895
+ * it to the count of prereading stripes.
33896
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
33897
+ * clear the PREREAD_ACTIVE flag and decrement the count
33898
+ * Whenever the delayed queue is empty and the device is not plugged, we
33899
+ * move any strips from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
33900
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
33901
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
33902
+ * HANDLE gets cleared if stripe_handle leave nothing locked.
33906
+struct disk_info {
33908
+ evms_logical_node_t *node;
33917
+struct raid5_private_data {
33918
+ struct stripe_head **stripe_hashtbl;
33920
+ evms_thread_t *thread, *resync_thread;
33921
+ struct disk_info disks[MD_SB_DISKS];
33922
+ struct disk_info *spare;
33924
+ int chunk_size, level, algorithm;
33925
+ int raid_disks, working_disks, failed_disks;
33926
+ int resync_parity;
33927
+ int max_nr_stripes;
33929
+ struct list_head handle_list; /* stripes needing handling */
33930
+ struct list_head delayed_list; /* stripes that have plugged requests */
33931
+ atomic_t preread_active_stripes; /* stripes with scheduled io */
33933
+ * Free stripes pool
33935
+ atomic_t active_stripes;
33936
+ struct list_head inactive_list;
33937
+ md_wait_queue_head_t wait_for_stripe;
33938
+ int inactive_blocked; /* release of inactive stripes blocked,
33939
+ * waiting for 25% to be free
33941
+ md_spinlock_t device_lock;
33944
+ struct tq_struct plug_tq;
33947
+typedef struct raid5_private_data raid5_conf_t;
33949
+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
33952
+ * Our supported algorithms
33954
+#define ALGORITHM_LEFT_ASYMMETRIC 0
33955
+#define ALGORITHM_RIGHT_ASYMMETRIC 1
33956
+#define ALGORITHM_LEFT_SYMMETRIC 2
33957
+#define ALGORITHM_RIGHT_SYMMETRIC 3
33960
+#define EVMS_MD_RAID5_INIT_IO 1
33962
+typedef struct raid5_ioctl_init_io_s {
33964
+ evms_sector_t lsn;
33965
+ evms_sector_t nr_sects;
33967
+} raid5_ioctl_init_io_t;
33969
diff -Naur linux-2002-03-28/include/linux/evms/evms_snapshot.h evms-2002-03-28/include/linux/evms/evms_snapshot.h
33970
--- linux-2002-03-28/include/linux/evms/evms_snapshot.h Wed Dec 31 18:00:00 1969
33971
+++ evms-2002-03-28/include/linux/evms/evms_snapshot.h Thu Dec 6 18:42:08 2001
33973
+/* -*- linux-c -*- */
33976
+ * Copyright (c) International Business Machines Corp., 2000
33978
+ * This program is free software; you can redistribute it and/or modify
33979
+ * it under the terms of the GNU General Public License as published by
33980
+ * the Free Software Foundation; either version 2 of the License, or
33981
+ * (at your option) any later version.
33983
+ * This program is distributed in the hope that it will be useful,
33984
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
33985
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33986
+ * the GNU General Public License for more details.
33988
+ * You should have received a copy of the GNU General Public License
33989
+ * along with this program; if not, write to the Free Software
33990
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33993
+ * linux/include/linux/evms_snapshot.h
33995
+ * EVMS Snapshot Feature kernel header file
33999
+#ifndef __EVMS_SNAPSHOT_INCLUDED__
34000
+#define __EVMS_SNAPSHOT_INCLUDED__
34002
+#define EVMS_SNAPSHOT_VERSION_MAJOR 2
34003
+#define EVMS_SNAPSHOT_VERSION_MINOR 0
34004
+#define EVMS_SNAPSHOT_VERSION_PATCHLEVEL 0
34006
+#define EVMS_SNAPSHOT_FEATURE_ID 104
34008
+#define EVMS_SNAPSHOT_SIGNATURE 0x536e4170 // SnAp
34009
+#define MAX_HASH_CHAIN_ENTRIES 10
34011
+#define EVMS_SNAPSHOT 0x01 // Status flags
34012
+#define EVMS_SNAPSHOT_ORG 0x02
34013
+#define EVMS_SNAPSHOT_DISABLED 0x04
34014
+#define EVMS_SNAPSHOT_FULL 0x08
34015
+#define EVMS_SNAPSHOT_QUIESCED 0x10
34016
+#define EVMS_SNAPSHOT_WRITEABLE 0x20
34018
+ // option definitions
34019
+#define SNAP_OPTION_ORG_VOLUME_NAME "original" // original volume
34020
+#define SNAP_OPTION_ORG_VOLUME_INDEX 0 // original volume
34021
+#define SNAP_OPTION_SNAPSHOT_NAME "snapshot" // snapshot volume
34022
+#define SNAP_OPTION_SNAPSHOT_INDEX 1 // snapshot volume
34023
+#define SNAP_OPTION_CHUNKSIZE_NAME "chunksize" // chunksize
34024
+#define SNAP_OPTION_CHUNKSIZE_INDEX 2 // chunksize
34025
+#define SNAP_OPTION_WRITEABLE_NAME "writeable" // writeable snapshot
34026
+#define SNAP_OPTION_WRITEABLE_INDEX 3 // writeable snapshot
34028
+#define SNAPSHOT_DEFAULT_CHUNK_SIZE 128 //sectors
34029
+#define SNAPSHOT_MIN_CHUNK_SIZE 16 // 8k
34030
+#define SNAPSHOT_MAX_CHUNK_SIZE 2048 // = 1Meg
34031
+#define SNAPSHOT_CHUNK_BUFFER_SIZE 128 // copy buffer
34033
+#define SNAPSHOT_QUERY_PERCENT_FULL 1 // ioctl internal command to query percent full
34035
+#define SECTOR_SIZE 512
34037
+// description of on disk meta data sector for snapshot feature
34038
+typedef struct _snapshot_metadata {
34039
+/* 0*/ u_int32_t signature;
34040
+/* 4*/ u_int32_t CRC;
34041
+/* 8*/ evms_version_t version; /* structure version */
34042
+/*12*/ u_int32_t flags;
34043
+/*16*/ char original_volume[128];
34044
+/*144*/ u_int64_t original_size;
34045
+/*152*/ u_int64_t lba_of_COW_table;
34046
+/*160*/ u_int64_t lba_of_first_chunk;
34047
+/*168*/ u_int32_t chunk_size; // in sectors
34048
+/*172*/ u_int32_t total_chunks;
34049
+} snapshot_metadata_t;
34054
+// Entries in the snapshot remapping structure
34055
+typedef struct _snapshot_hash_entry {
34056
+ unsigned long long org_chunk;
34057
+ unsigned long long snap_chunk;
34058
+ struct _snapshot_hash_entry * next;
34059
+ struct _snapshot_hash_entry * prev;
34060
+} snapshot_hash_entry_t;
34063
+typedef struct _snapshot_volume {
34064
+ evms_logical_node_t * logical_node; // node below us
34065
+ unsigned long chunk_size; // Sectors
34066
+ unsigned long chunk_shift; // shift value for chunk size
34067
+ unsigned long num_chunks; // in this volume
34068
+ unsigned long next_cow_entry; // Index into current COW table
34069
+ unsigned long long current_cow_sector; // LOGICAL sector of current COW table
34070
+ unsigned long next_free_chunk; // index of next free chunk (not LBA!)
34071
+ u_int64_t cow_table[64]; // Pointer to one sector's worth of COW tables
34072
+ unsigned long hash_table_size; // size of the hash table for the remap
34073
+ unsigned long flags; // status flags
34074
+ snapshot_hash_entry_t ** snapshot_map; // array of remapped chunks
34075
+ struct _snapshot_volume * snapshot_next; // Linked list of volumes snapshotting this original
34076
+ struct _snapshot_volume * snapshot_org; // Pointer to volume being snapshotted
34077
+ struct semaphore snap_semaphore; // Semaphore for locking of snapshots
34078
+ unsigned char * chunk_data_buffer; // Buffer for reading data when doing a copy-on-write
34079
+} snapshot_volume_t;
34082
+typedef struct _snapshot_volume {
34083
+ storage_object_t * object; // our exported object
34084
+ storage_object_t * child_object; // our child object
34085
+ unsigned long chunk_size; // Sectors
34086
+ unsigned long num_chunks; // in this volume
34087
+ unsigned long next_cow_entry; // Index into current COW table
34088
+ unsigned long long current_cow_sector; // LOGICAL sector of current COW table
34089
+ unsigned long next_free_chunk; // index of next free chunk (not LBA!)
34090
+ u_int64_t cow_table[64]; // Pointer to one sector's worth of COW tables
34091
+ unsigned long hash_table_size; // size of the hash table for the remap
34092
+ unsigned long flags; // status flags
34093
+// snapshot_hash_entry_t ** snapshot_map; // array of remapped chunks
34094
+ struct _snapshot_volume * snapshot_next; // Linked list of volumes snapshotting this original
34095
+ struct _snapshot_volume * snapshot_org; // Pointer to volume being snapshotted
34096
+// struct semaphore snap_semaphore; // Semaphore for locking of snapshots
34097
+// unsigned char * chunk_data_buffer; // Buffer for reading data when doing a copy-on-write
34098
+ snapshot_metadata_t meta_data; // copy of metadata if not original
34099
+} snapshot_volume_t;
34104
diff -Naur linux-2002-03-28/include/linux/evms/evms_user.h evms-2002-03-28/include/linux/evms/evms_user.h
34105
--- linux-2002-03-28/include/linux/evms/evms_user.h Wed Dec 31 18:00:00 1969
34106
+++ evms-2002-03-28/include/linux/evms/evms_user.h Wed May 16 13:40:56 2001
34108
+/* -*- linux-c -*- */
34111
+ * Copyright (c) International Business Machines Corp., 2000
34113
+ * This program is free software; you can redistribute it and/or modify
34114
+ * it under the terms of the GNU General Public License as published by
34115
+ * the Free Software Foundation; either version 2 of the License, or
34116
+ * (at your option) any later version.
34118
+ * This program is distributed in the hope that it will be useful,
34119
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
34120
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
34121
+ * the GNU General Public License for more details.
34123
+ * You should have received a copy of the GNU General Public License
34124
+ * along with this program; if not, write to the Free Software
34125
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
34128
+ * linux/include/linux/evms_user.h
34130
+ * EVMS (master) user header file
34134
+#include <linux/evms/evms_common.h>
34135
+#include <linux/evms/evms_ioctl.h>
34136
diff -Naur linux-2002-03-28/include/linux/evms/evms_xor.h evms-2002-03-28/include/linux/evms/evms_xor.h
34137
--- linux-2002-03-28/include/linux/evms/evms_xor.h Wed Dec 31 18:00:00 1969
34138
+++ evms-2002-03-28/include/linux/evms/evms_xor.h Mon Feb 4 09:58:43 2002
34143
+#include <linux/evms/evms_md.h>
34145
+#define MAX_XOR_BLOCKS 5
34147
+extern void evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr);
34149
+struct xor_block_template {
34150
+ struct xor_block_template *next;
34151
+ const char *name;
34153
+ void (*do_2)(unsigned long, unsigned long *, unsigned long *);
34154
+ void (*do_3)(unsigned long, unsigned long *, unsigned long *,
34155
+ unsigned long *);
34156
+ void (*do_4)(unsigned long, unsigned long *, unsigned long *,
34157
+ unsigned long *, unsigned long *);
34158
+ void (*do_5)(unsigned long, unsigned long *, unsigned long *,
34159
+ unsigned long *, unsigned long *, unsigned long *);