1
diff -Naur linux-2002-03-28/drivers/evms/AIXlvm_vge.c evms-2002-03-28/drivers/evms/AIXlvm_vge.c
2
--- linux-2002-03-28/drivers/evms/AIXlvm_vge.c Wed Dec 31 18:00:00 1969
3
+++ evms-2002-03-28/drivers/evms/AIXlvm_vge.c Thu Mar 28 13:53:07 2002
10
+ * Copyright (c) International Business Machines Corp., 2000
12
+ * This program is free software; you can redistribute it and/or modify
13
+ * it under the terms of the GNU General Public License as published by
14
+ * the Free Software Foundation; either version 2 of the License, or
15
+ * (at your option) any later version.
17
+ * This program is distributed in the hope that it will be useful,
18
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
20
+ * the GNU General Public License for more details.
22
+ * You should have received a copy of the GNU General Public License
23
+ * along with this program; if not, write to the Free Software
24
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29
+ * linux/drivers/evms/AIXlvm_vge.c
31
+ * EVMS AIX LVM Volume Group Emulator
37
+#define EVMS_AIX_DEBUG 1
39
+#define AIX_COMMON_SERVICES_MAJOR 0 // Required common services levels for the AIX kernel plugin
40
+#define AIX_COMMON_SERVICES_MINOR 5 // These must be incremented if new function is added to common
41
+#define AIX_COMMON_SERVICES_PATCHLEVEL 0 // services and the AIX kernel plugin uses the new function.
42
+#define AIX_INCREMENT_REQUEST 1
43
+#define AIX_DECREMENT_REQUEST -1
46
+#include <linux/module.h>
47
+#include <linux/kernel.h>
48
+#include <linux/config.h>
50
+#include <linux/genhd.h>
51
+#include <linux/major.h>
52
+#include <linux/string.h>
53
+#include <linux/blk.h>
54
+#include <linux/init.h>
55
+#include <linux/slab.h>
57
+#include <linux/evms/evms_kernel.h>
58
+#include <linux/evms/evms_aix.h>
59
+#include <asm/system.h>
60
+#include <asm/uaccess.h>
62
+#include <linux/sched.h>
63
+#include <linux/smp_lock.h>
64
+#include <linux/locks.h>
65
+#include <linux/delay.h>
66
+#include <linux/reboot.h>
67
+#include <linux/completion.h>
68
+#include <linux/vmalloc.h>
70
+#ifdef EVMS_AIX_DEBUG
71
+static int AIX_volume_group_dump(void);
74
+static aix_volume_group_t * AIXVolumeGroupList=NULL;
75
+static evms_thread_t * AIX_mirror_thread;
76
+static evms_pool_mgmt_t * AIX_BH_list_pool = NULL;
77
+static aix_mirror_bh_t * AIX_retry_list = NULL;
78
+static aix_mirror_bh_t ** AIX_retry_tail = NULL;
79
+static spinlock_t AIX_retry_list_lock = SPIN_LOCK_UNLOCKED;
81
+// Plugin API prototypes
83
+static void AIXiod (void *data);
84
+static int discover_aix(evms_logical_node_t ** evms_logical_disk_head);
85
+static int discover_volume_groups( evms_logical_node_t ** );
86
+static int discover_logical_volumes( void );
87
+static int end_discover_aix(evms_logical_node_t ** evms_logical_disk_head);
88
+static void read_aix(evms_logical_node_t * node, eio_t * eio);
89
+static void write_aix(evms_logical_node_t * node, eio_t * eio);
90
+static int ioctl_aix( evms_logical_node_t * logical_node,
91
+ struct inode * inode,
95
+static int AIX_remap_sector(evms_logical_node_t * node,
96
+ evms_sector_t org_sector, // logical sector to remap
97
+ evms_sector_t size, // size (in sectors) of request to remap
98
+ evms_sector_t * new_sector, // remapped sector
99
+ evms_sector_t * new_size, // new size (in sectors)
100
+ partition_list_entry_t ** partition, // new node for which new_sector is relative
102
+ u_int32_t * offset_in_le);
104
+static int validate_build_volume_group_disk_info(evms_logical_node_t * logical_node,
105
+ AIXlvm_rec_t * AIXlvm);
107
+static int add_VG_data_to_VG_list ( evms_logical_node_t * logical_node,
108
+ aix_volume_group_t * new_group,
110
+static int add_PV_to_volume_group( aix_volume_group_t * group,
111
+ evms_logical_node_t * evms_partition,
113
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t * logical_node,
114
+ AIXlvm_rec_t * AIXlvm);
116
+static int AIX_update_volume_group(aix_volume_group_t * AIXVGLptr,
117
+ evms_logical_node_t * logical_node,
118
+ AIXlvm_rec_t * AIXlvm);
120
+static int AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node);
123
+static int export_volumes( evms_logical_node_t ** evms_logical_disk_head );
124
+static int lvm_cleanup( void );
125
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2);
126
+static int build_pe_maps( aix_volume_group_t * volume_group);
128
+static aix_logical_volume_t * new_logical_volume(lv_entries *AIXlvent,
129
+ aix_volume_group_t *group,
131
+ u_int32_t stripesize);
133
+static int check_log_volume_and_pe_maps( aix_volume_group_t * group );
134
+static int check_volume_groups(void);
135
+static int init_io_aix( evms_logical_node_t * node,
136
+ int io_flag, /* 0=read, 1=write*/
137
+ evms_sector_t sect_nr, /* disk LBA */
138
+ evms_sector_t num_sects, /* # of sectors */
139
+ void * buf_addr ); /* buffer address */
142
+static int delete_logical_volume( aix_logical_volume_t * volume );
143
+static int delete_aix_node( evms_logical_node_t * logical_node );
144
+static int deallocate_volume_group( aix_volume_group_t * group );
146
+static void AIX_handle_read_mirror_drives(struct buffer_head * bh,
149
+static void AIX_handle_write_mirror_drives(struct buffer_head * bh,
152
+static void aix_notify_cache_ctor(void * foo, kmem_cache_t * cachep, unsigned long flags);
154
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t * node,
156
+ uint32_t mirror_copies,
157
+ evms_sector_t org_sector,
160
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t * node,
161
+ evms_logical_node_t * node2,
162
+ evms_logical_node_t * node3,
164
+ uint32_t mirror_copies,
165
+ evms_sector_t new_sector2,
166
+ evms_sector_t new_sector3);
168
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2);
169
+//****************************************************************************************************
171
+ /* END of PROTOTYPES */
173
+#define GET_PHYSICAL_PART_SIZE(v1) (1 << v1)
175
+#define AIX_PVH_DATA_PSN(vgda_psn, pvNum) (vgda_psn + PSN_PPH_OFFSET + ((pvNum -1) * PSN_PVH_INCREMENT))
177
+#define COMPARE_TIMESTAMPS(t1, t2) ( (t1).tv_sec == (t2).tv_sec && \
178
+ (t1).tv_nsec == (t2).tv_nsec )
180
+#define COMPARE_UNIQUE_IDS(id1, id2) ( (id1).word1 == (id2).word1 && \
181
+ (id1).word2 == (id2).word2 && \
182
+ (id1).word3 == (id2).word3 && \
183
+ (id1).word4 == (id2).word4 )
185
+#define AIX_PV_STATE_VALID 0 // Both VGDAs are valid and match.
186
+#define AIX_PV_STATE_FIRST_VGDA 1 // Only the first VGDA is valid.
187
+#define AIX_PV_STATE_SECOND_VGDA 2 // Only the second VGDA is valid.
188
+#define AIX_PV_STATE_EITHER_VGDA -1 // Both VGDAs are valid, but do not match each other.
189
+#define AIX_PV_STATE_INVALID -2 // We're in an invalid state but there's more PVs in this group
192
+#ifndef EVMS_AIX_DEBUG
193
+ #define AIX_VOLUME_GROUP_DUMP()
195
+ #define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \
196
+ AIX_volume_group_dump()
199
+// Global LVM data structures
201
+static evms_plugin_function_table_t AIXlvm_function_table = {
202
+ discover: &discover_aix,
203
+ end_discover: &end_discover_aix,
204
+ delete : &delete_aix_node,
206
+ write : &write_aix,
207
+ init_io : &init_io_aix,
211
+static evms_plugin_header_t plugin_header = {
214
+ EVMS_REGION_MANAGER, // Region Manager class
215
+ 3 ), // Unique ID within VGEs
220
+ }, // Major, Minor, Patchlevel
221
+ required_common_services_version: {
222
+ major : AIX_COMMON_SERVICES_MAJOR,
223
+ minor : AIX_COMMON_SERVICES_MINOR,
224
+ patchlevel : AIX_COMMON_SERVICES_PATCHLEVEL
226
+ function_table : &AIXlvm_function_table // Function table for this plugin
233
+ * Function: remap sector
234
+ * Common function to remap volume lba to partition lba in appropriate PE
236
+static int AIX_remap_sector(evms_logical_node_t * node,
237
+ evms_sector_t org_sector, // logical sector to remap
238
+ evms_sector_t size, // size (in sectors) of request to remap
239
+ evms_sector_t * new_sector, // remapped sector
240
+ evms_sector_t * new_size, // new size (in sectors)
241
+ partition_list_entry_t ** partition, // new node for which new_sector is relative
243
+ u_int32_t * offset_in_le)
245
+ aix_logical_volume_t * volume;
247
+ u_int32_t sectors_per_stripe;
248
+ u_int32_t partition_to_use;
250
+ u_int32_t stripe_in_column;
252
+ u_int32_t org_sector32; // Until striping is 64-bit enabled.
254
+ volume = (aix_logical_volume_t *) node->instance_data;
257
+ LOG_DEBUG("-- %s volume:%p lv:%d size:%Ld Name:%s\n",__FUNCTION__, volume,volume->lv_number,size,volume->name);
258
+ LOG_DEBUG(" node %p node_name [%s] org_sector:%Ld\n",node, node->name, org_sector);
259
+ LOG_DEBUG(" mirror_copies:%d volume->lv_size:%Ld\n",volume->mirror_copies,volume->lv_size);
262
+ org_sector32 = org_sector;
264
+ *(new_size) = size;
266
+ // Check if volume is striped. Reset the size if the request
267
+ // crosses a stripe boundary.
268
+ if ( volume->stripes > 1 ) {
270
+ LOG_DEBUG(" *** STRIPED ***\n");
271
+ LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n",volume->stripe_size, org_sector32, volume->stripes);
274
+ *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
275
+ *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
278
+ LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n",*(le), *(offset_in_le));
281
+ sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE;
282
+ partition_to_use = (org_sector32 / sectors_per_stripe) % volume->stripes;
283
+ stripe_in_column = ((((org_sector32 / volume->stripe_size) / volume->stripes) * volume->stripe_size) + (org_sector32 % sectors_per_stripe));
284
+ column = ((org_sector32 / sectors_per_stripe) / volume->stripes) * sectors_per_stripe;
287
+ LOG_DEBUG("offset_in_le:%d org_sector:%Ld pe_shift:%d stripe_shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift,volume->stripe_size_shift);
289
+ LOG_DEBUG(" org_sector:%d sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n",org_sector32, sectors_per_stripe, partition_to_use,stripe_in_column,column);
290
+ LOG_DEBUG(" offset_in_le + size:%Ld volume->pe_size:%d volume->lv_size:%Ld\n",(*(offset_in_le)+size),volume->pe_size ,volume->lv_size);
293
+ if ( *(offset_in_le) + size > volume->pe_size ) {
294
+ *new_size = volume->pe_size - *(offset_in_le);
295
+ LOG_DEBUG(" new_size %Ld\n",*new_size);
299
+ // Non-striped volume. Just find LE and offset. Reset the size
300
+ // if the request crosses an LE boundary.
303
+ LOG_DEBUG(" *** NON-STRIPED ***\n");
306
+ *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
307
+ *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
312
+ LOG_DEBUG(" offset_in_le:%d org_sector:%Ld shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift);
314
+ if (*(le) >= volume->num_le) {
315
+ LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n",*(le),volume->num_le);
320
+ *(new_sector) = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le);
321
+ *(partition) = volume->le_to_pe_map[*(le)].owning_pv;
324
+ LOG_DEBUG(" new_sector:%Ld\n", *(new_sector));
325
+ LOG_DEBUG(" Owning Part %p\n",*(partition));
326
+ LOG_DEBUG(" End %s\n",__FUNCTION__);
334
+ * Function: read_aix
336
+static void read_aix(evms_logical_node_t * node,
339
+ partition_list_entry_t * partition;
340
+ evms_sector_t org_sector;
341
+ evms_sector_t new_sector;
342
+ evms_sector_t new_size;
343
+ aix_logical_volume_t * volume;
344
+ aix_mirror_bh_t * tmp_bh;
345
+ u_int32_t le, offset_in_le,count;
348
+ volume = (aix_logical_volume_t *) node->instance_data;
350
+ LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
355
+ LOG_DEBUG(" node->total_vsectors:%Lu\n",node->total_vsectors);
356
+ LOG_DEBUG(" rsector:%Lu rsize:%Lu node_flags:%u\n",eio->rsector,eio->rsize,node->flags);
359
+ // Check if I/O goes past end of logical volume.
360
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
361
+ LOG_CRITICAL(" read_aix ERROR %d\n",__LINE__);
362
+ EVMS_IO_ERROR(eio);
367
+ // Logical-to-physical remapping.
368
+ if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) ||
369
+ (!partition || !new_sector)) {
370
+ LOG_CRITICAL(" read_aix bh: ERROR %d\n",__LINE__);
371
+ EVMS_IO_ERROR(eio);
375
+ org_sector = eio->rsector;
376
+ eio->rsector = new_sector;
377
+ eio->rsize = new_size;
380
+ LOG_DEBUG(" read_aix Mirror_Copies:%d\n",volume->mirror_copies);
383
+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
386
+ tmp_bh = AIX_alloc_rbh(node, eio, 1, new_sector, AIX_LV_READ);
389
+ EVMS_IO_ERROR(eio);
393
+ if (volume->le_to_pe_map_mir1) {
394
+ tmp_bh->mir_node1 = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
395
+ tmp_bh->mir_sector1 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
398
+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
399
+ tmp_bh->mir_node2 = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
400
+ tmp_bh->mir_sector2 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
403
+ if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
404
+ EVMS_IO_ERROR(eio);
408
+ R_IO(partition->logical_node, &tmp_bh->eio);
411
+ R_IO(partition->logical_node, eio);
416
+ LOG_DEBUG(" ***** %s ***** returning\n",__FUNCTION__);
423
+ * Function: write_aix
425
+static void write_aix( evms_logical_node_t * node,
428
+ partition_list_entry_t * partition;
429
+ evms_sector_t new_sector, new_sector2 = 0, new_sector3 = 0;
430
+ evms_sector_t org_sector;
431
+ evms_sector_t new_size;
432
+ aix_logical_volume_t * volume;
433
+ aix_mirror_bh_t * tmp_bh;
434
+ evms_logical_node_t * node2 = NULL, *node3 = NULL;
435
+ u_int32_t le, offset_in_le, count;
437
+ volume = (aix_logical_volume_t *) node->instance_data;
440
+ LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
441
+ LOG_DEBUG(" write_aix rsector:%Lu rsize:%Lu\n",eio->rsector,eio->rsize);
442
+ LOG_DEBUG(" write_aix total_sectors:%Lu\n",node->total_vsectors);
445
+ if (volume->lv_access & EVMS_LV_INCOMPLETE) { //No writes allowed on incomplete volumes
446
+ LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n",__LINE__);
447
+ EVMS_IO_ERROR(eio);
452
+ // Check if I/O goes past end of logical volume.
453
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
454
+ LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
455
+ EVMS_IO_ERROR(eio);
459
+ // Logical-to-Physical remapping
460
+ if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) ||
461
+ (!new_sector || !partition)) {
462
+ LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
463
+ EVMS_IO_ERROR(eio);
467
+ org_sector = eio->rsector;
468
+ eio->rsector = new_sector;
469
+ eio->rsize = new_size;
472
+ LOG_DEBUG(" write_aix Mirror_Copies:%d\n", volume->mirror_copies);
476
+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
478
+ if (volume->le_to_pe_map_mir1) {
479
+ new_sector2 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
480
+ node2 = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
483
+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
485
+ new_sector3 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
486
+ node3 = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
489
+ tmp_bh = AIX_alloc_wbh(partition->logical_node, node2, node3, eio, volume->mirror_copies, new_sector2, new_sector3);
492
+ EVMS_IO_ERROR(eio);
495
+ tmp_bh->node = node;
497
+ tmp_bh = tmp_bh->mirror_bh_list;
499
+ if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
500
+ EVMS_IO_ERROR(eio);
501
+ // free memory here
505
+ W_IO(tmp_bh->node, &tmp_bh->eio);
507
+ tmp_bh = tmp_bh->next_r1;
510
+ W_IO(tmp_bh->node, &tmp_bh->eio);
511
+ tmp_bh = tmp_bh->next_r1;
515
+ W_IO(tmp_bh->node, &tmp_bh->eio);
520
+ W_IO(partition->logical_node, eio);
525
+ LOG_DEBUG(" ***** %s returning *****\n",__FUNCTION__);
532
+ * Function: ioctl_aix
535
+static int ioctl_aix( evms_logical_node_t * logical_node,
536
+ struct inode * inode,
537
+ struct file * file,
541
+ aix_logical_volume_t * volume = (aix_logical_volume_t*)(logical_node->instance_data);
544
+ LOG_EXTRA(" Ioctl %u\n",cmd);
551
+ // Fixed geometry for all LVM volumes
552
+ unsigned char heads = 64;
553
+ unsigned char sectors = 32;
555
+ struct hd_geometry *hd = (struct hd_geometry *)arg;
557
+ cylinders = logical_node->total_vsectors;
558
+ cylinders = (cylinders / heads) / sectors;
564
+ if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
565
+ copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
566
+ copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
567
+ copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
573
+ case EVMS_QUIESCE_VOLUME:
576
+ case EVMS_GET_DISK_LIST:
577
+ case EVMS_CHECK_MEDIA_CHANGE:
578
+ case EVMS_REVALIDATE_DISK:
579
+ case EVMS_OPEN_VOLUME:
580
+ case EVMS_CLOSE_VOLUME:
582
+ // These five ioctl all need to be broadcast to all PVs.
583
+ aix_volume_group_t * group = volume->group;
584
+ partition_list_entry_t * partition;
585
+ for ( partition = group->partition_list; partition; partition = partition->next ) {
586
+ rc |= IOCTL(partition->logical_node, inode, file, cmd, arg);
592
+ // Currently the VGE does not send any ioctl's down to the
593
+ // partitions. Which partition would they go to?
602
+ * Function: init_io_aix
605
+static int init_io_aix( evms_logical_node_t * node,
606
+ int io_flag, /* 0=read, 1=write*/
607
+ evms_sector_t sect_nr, /* disk LBA */
608
+ evms_sector_t num_sects, /* # of sectors */
609
+ void * buf_addr ) /* buffer address */
611
+ partition_list_entry_t * partition;
612
+ evms_sector_t new_sector = 0;
613
+ evms_sector_t new_size = 0;
615
+ u_int32_t le, offset;
617
+ LOG_DEBUG(" ************ init_io_aix() num_sects:%Ld node:%p sect_nr:%Ld\n",num_sects, node, sect_nr);
619
+ // Init IO needs to deal with the possibility that a request can come
620
+ // in that spans PEs or stripes. This is possible because there is no
621
+ // limit on num_sects. To fix this, we loop through AIX_remap_sector and
622
+ // INIT_IO until num_sects reaches zero.
625
+ while ( num_sects > 0 ) {
627
+ if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size, &partition, &le, &offset) ||
628
+ (!new_sector || !partition)) {
629
+ LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n",__LINE__);
633
+ LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:%Ld new_size:%Ld\n",__LINE__,partition->logical_node, io_flag, new_sector, new_size);
635
+ rc = INIT_IO(partition->logical_node, io_flag, new_sector, new_size, buf_addr);
636
+ num_sects -= new_size;
637
+ sect_nr += new_size;
638
+ buf_addr = (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
645
+ * Function: AIXlvm_vge_init
648
+int __init AIXlvm_vge_init(void)
650
+ const char * name = "evms_AIXiod";
652
+ LOG_DEBUG(" %s --------\n",__FUNCTION__);
654
+ AIX_mirror_thread = evms_cs_register_thread(AIXiod, NULL, name);
657
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
660
+module_init(AIXlvm_vge_init);
665
+/********** Required Plugin Functions **********/
669
+ * Function: discover_aix
671
+ * This is the entry point into the LVM discovery process.
673
+static int discover_aix(evms_logical_node_t ** evms_logical_disk_head)
675
+ int rc = 0, count = 0;
677
+ LOG_DEBUG("[%s] discover_volume_groups\n",__FUNCTION__);
679
+ rc = discover_volume_groups(evms_logical_disk_head);
682
+ LOG_ERROR("[%s] discover_volume_groups rc=%d\n",__FUNCTION__ ,rc);
685
+ if (AIXVolumeGroupList) {
687
+ LOG_DEBUG("[%s] discover_logical_volumes\n",__FUNCTION__);
689
+ rc = discover_logical_volumes();
692
+ LOG_ERROR("[%s] discover_logical_volumes rc=%d\n",__FUNCTION__ ,rc);
696
+ LOG_DEBUG("[%s] export_volumes\n",__FUNCTION__);
698
+ count = export_volumes(evms_logical_disk_head);
700
+ LOG_DEBUG("[%s] export_volumes count=%d\n",__FUNCTION__ ,count);
708
+static int discover_volume_groups(evms_logical_node_t ** evms_logical_disk_head)
710
+ evms_logical_node_t * logical_node;
711
+ evms_logical_node_t * next_node;
712
+ AIXIPL_REC * AIXpv;
713
+ AIXlvm_rec_t * AIXlvm; // Temp holder for the LVM on disk rec
716
+ LOG_DEBUG(" Begin %s\n", __FUNCTION__);
718
+ if (evms_cs_allocate_memory((void**)&AIXpv, AIX_SECTOR_SIZE)) {
722
+ // We'll create at least one volume entry, if we don't find any AIX volumes we'll clean it up later
724
+ if (evms_cs_allocate_memory((void**)&AIXlvm, sizeof(AIXlvm_rec_t))) {
725
+ evms_cs_deallocate_memory(AIXpv);
730
+ for ( logical_node = *evms_logical_disk_head; logical_node; logical_node = next_node ) {
732
+ // Grab the next list item in case we remove this partition from the global list.
733
+ next_node = logical_node->next;
735
+ // Read the first sector and see if it has a valid AIX PV signature.
737
+ if ( INIT_IO(logical_node, 0, 0, 1, AIXpv) ) {
738
+ // On an I/O error, continue on to the next
739
+ // partition. The group that this partition
740
+ // belongs to will be incomplete, but we still
741
+ // need to discover any other groups.
743
+ LOG_ERROR(" Error reading PV [%p]\n",logical_node);
748
+ if (AIXpv->IPL_record_id == IPLRECID) {
750
+ // This partition is definitely a PV,
751
+ // but is it part of a valid VG?
752
+ LOG_DEBUG(" DVG removing node from list logical_node %p\n", logical_node);
754
+ if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) {
755
+ LOG_ERROR(" Error reading PV [%p]\n",logical_node);
759
+ if (AIXlvm->lvm_id == AIX_LVM_LVMID) {
761
+ if (validate_build_volume_group_disk_info(logical_node, AIXlvm) ) {
762
+ // Again, continue on and we'll
767
+ evms_cs_remove_logical_node_from_list( evms_logical_disk_head, logical_node );
770
+ LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %ld)\n",AIXlvm->lvm_id);
774
+ LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n",logical_node);
778
+ AIX_VOLUME_GROUP_DUMP();
780
+ if (check_volume_groups()) {
784
+ evms_cs_deallocate_memory(AIXpv);
785
+ evms_cs_deallocate_memory(AIXlvm);
792
+ * Function: validate_build_volume_group_disk_info
794
+ * Creates and validates the volume groups found on the disk structures.
797
+static int validate_build_volume_group_disk_info(evms_logical_node_t * logical_node,
798
+ AIXlvm_rec_t * AIXlvm)
801
+ aix_volume_group_t * AIXVGLptr = AIXVolumeGroupList;
803
+ LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num);
805
+ while (AIXVGLptr) {
806
+ if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) {
809
+ AIXVGLptr = AIXVGLptr->next; // There is more than one so walk the list
813
+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
814
+ AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm);
815
+ AIXVGLptr->next = AIXVolumeGroupList;
816
+ AIXVolumeGroupList = AIXVGLptr;
818
+ LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
820
+ if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) {
821
+ LOG_DEBUG(" VBVGDI ERROR on Rediscover AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
827
+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
828
+ LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
829
+ LOG_CRITICAL("Unable to allocate volume group data struct Volume Group Corruption !!\n");
833
+ LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n", AIXVolumeGroupList,__LINE__);
834
+ LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
835
+ LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
837
+ if ( add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num) ) {
845
+ * Function: add_VG_data_to_VG_list
847
+ * Allocate space for a new LVM volume group and all of its sub-fields.
848
+ * Initialize the appropriate fields.
851
+static int add_VG_data_to_VG_list ( evms_logical_node_t * logical_node,
852
+ aix_volume_group_t * new_group,
859
+ // The array of pointer to the logical volumes.
860
+ // Leave this allocation at the max permitted, the lv numbering may not be sequential so you may have gaps
861
+ // in the array allocation i.e. 1,2,3,4,5,6,7,8,11,15,21,33 etc. even though you only have 12 LVs.
863
+ LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n",pvNum, new_group->vgda_psn);
865
+ pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum);
867
+ if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
871
+ LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos);
873
+ if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) {
877
+ LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum);
879
+ if (!new_group->volume_list) {
880
+ if ( evms_cs_allocate_memory((void**)&(new_group->volume_list), LVM_MAXLVS*sizeof(aix_logical_volume_t*)) ) {
881
+ evms_cs_deallocate_memory(AIXpvh);
886
+ new_group->vg_id.word1 = new_group->AIXvgh->vg_id.word1;
887
+ new_group->vg_id.word2 = new_group->AIXvgh->vg_id.word2;
888
+ new_group->vg_id.word3 = new_group->AIXvgh->vg_id.word3;
889
+ new_group->vg_id.word4 = new_group->AIXvgh->vg_id.word4;
890
+ new_group->numpvs = new_group->AIXvgh->numpvs;
891
+ new_group->numlvs = new_group->AIXvgh->numlvs;
892
+ new_group->lv_max = new_group->AIXvgh->maxlvs;
893
+ new_group->pe_size = (GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) / AIX_SECTOR_SIZE);
895
+ new_group->block_size = 0;
896
+ new_group->hard_sect_size = 0;
897
+ new_group->flags |= EVMS_VG_DIRTY;
899
+ evms_cs_deallocate_memory(AIXpvh);
902
+ LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2);
910
+ * Function: add_PV_to_volume_group
912
+ * Create a new partition_list_entry for the specified volume group.
913
+ * Initialize the new partition with the evms node and lvm pv information,
914
+ * and add the new partition to the group's list.
917
+static int add_PV_to_volume_group( aix_volume_group_t * group,
918
+ evms_logical_node_t * evms_partition,
921
+ partition_list_entry_t * new_partition;
923
+ LOG_DEBUG(" APVVG Entering pvNum:%d\n",pvNum);
925
+ group->flags |= EVMS_VG_DIRTY;
927
+ for (new_partition = group->partition_list; new_partition != NULL; new_partition=new_partition->next) {
928
+ if (new_partition->logical_node == evms_partition) {
933
+ if ( evms_cs_allocate_memory((void**)&new_partition, sizeof(partition_list_entry_t)) ) {
937
+ // Add this partition to this group's list.
938
+ new_partition->logical_node = evms_partition;
939
+ new_partition->pv_number = pvNum;
941
+ group->hard_sect_size = evms_partition->hardsector_size;
942
+ group->block_size = evms_partition->block_size;
944
+ // Add this partition to the beginning of its group's list.
945
+ new_partition->next = group->partition_list;
946
+ group->partition_list = new_partition;
947
+ group->partition_count++;
949
+ LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n",group->partition_count, pvNum);
953
+/****************************************************
957
+*****************************************************/
958
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t * logical_node,
959
+ AIXlvm_rec_t * AIXlvm)
961
+ vg_header * AIXvgh, *AIXvgh2;
962
+ vg_trailer * AIXvgt, *AIXvgt2;
963
+ aix_volume_group_t * AIXVGLptr;
967
+ if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
971
+ if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
972
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
976
+ if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
977
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
981
+ if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
982
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
986
+ // First time thru we want to read this in, we may only have one PV in this group, all others
987
+ // may be corrupt, etc. If the info is clean we shouldn't get here.
989
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
990
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
994
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
995
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
999
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
1000
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1004
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
1005
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1009
+ LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1010
+ LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1011
+ LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
1012
+ LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
1015
+ LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",sizeof(aix_volume_group_t));
1016
+ if (evms_cs_allocate_memory((void**)&AIXVGLptr, sizeof(aix_volume_group_t))) {
1017
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1022
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1023
+ AIXVGLptr->flags |= EVMS_VG_DIRTY;
1025
+ LOG_DEBUG("CVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
1027
+ if (evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
1028
+ evms_cs_deallocate_memory(AIXVGLptr);
1029
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1035
+ LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1037
+ if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1038
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
1039
+ if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
1040
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
1041
+ // All timestamps match. Yea!
1042
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1044
+ // Both VGDAs are good, but timestamps are
1045
+ // different. Can't tell yet which one is
1047
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
1050
+ // First VGDA is good, second is bad.
1051
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
1054
+ if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
1055
+ // First VGDA is bad, second is good.
1056
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
1057
+ } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1058
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1060
+ // This should never happen.
1061
+ LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
1062
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1067
+ LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1069
+ switch (AIXVGLptr->CleanVGInfo) {
1070
+ case AIX_PV_STATE_VALID:
1071
+ case AIX_PV_STATE_FIRST_VGDA:
1073
+ LOG_DEBUG("CVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1075
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1077
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1078
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1081
+ case AIX_PV_STATE_SECOND_VGDA:
1082
+ LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1084
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1086
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1087
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1090
+ case AIX_PV_STATE_EITHER_VGDA:
1091
+ LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1092
+ if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
1094
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1096
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1097
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1099
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1100
+ // Not sure where this PV belongs. It thinks it is
1101
+ // supposed to be in two different containers. We will
1102
+ // probably need to put this on a separate, temporary
1103
+ // list, and determine later which container is missing
1109
+ LOG_ERROR("Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
1110
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1116
+ add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1118
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1120
+ LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1124
+/****************************************************
1128
+*****************************************************/
1129
+static int AIX_update_volume_group(aix_volume_group_t * AIXVGLptr,
1130
+ evms_logical_node_t * logical_node,
1131
+ AIXlvm_rec_t * AIXlvm)
1133
+ vg_header * AIXvgh, *AIXvgh2;
1134
+ vg_trailer * AIXvgt, *AIXvgt2;
1138
+ if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
1142
+ if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
1143
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1147
+ if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
1148
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1152
+ if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
1153
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1157
+ // First time thru we want to read this in, we may only have one PV in this group, all others
1158
+ // may be corrupt, etc. If the info is clean we shouldn't get here.
1160
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1161
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1165
+ if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1166
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1170
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
1171
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1175
+ if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
1176
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1180
+ LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1181
+ LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1182
+ LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
1183
+ LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
1186
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1187
+ AIXVGLptr->flags |= EVMS_VG_DIRTY;
1189
+ LOG_DEBUG("UVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
1191
+ if (evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
1192
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1198
+ LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1200
+ if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1201
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
1202
+ if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
1203
+ if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
1204
+ // All timestamps match. Yea!
1205
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1207
+ // Both VGDAs are good, but timestamps are
1208
+ // different. Can't tell yet which one is
1210
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
1213
+ // First VGDA is good, second is bad.
1214
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
1217
+ if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
1218
+ // First VGDA is bad, second is good.
1219
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
1220
+ } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1221
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1223
+ // This should never happen.
1224
+ LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
1225
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1230
+ LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1232
+ switch (AIXVGLptr->CleanVGInfo) {
1233
+ case AIX_PV_STATE_VALID:
1234
+ case AIX_PV_STATE_FIRST_VGDA:
1236
+ LOG_DEBUG("UVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1238
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1240
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1241
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1244
+ case AIX_PV_STATE_SECOND_VGDA:
1245
+ LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1247
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1249
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1250
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1253
+ case AIX_PV_STATE_EITHER_VGDA:
1254
+ LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1255
+ if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
1257
+ AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1259
+ AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1260
+ AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1262
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1263
+ // Not sure where this PV belongs. It thinks it is
1264
+ // supposed to be in two different containers. We will
1265
+ // probably need to put this on a separate, temporary
1266
+ // list, and determine later which container is missing
1272
+ LOG_ERROR("UVG Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
1273
+ AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1279
+ add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1281
+ AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1283
+ LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1287
+/****************************************************
1288
+* Function: check_volume_groups
1290
+* We just want to make sure the volume groups have found
1291
+* all their drives.
1293
+* If not, we'll continue and build what we can
1294
+*****************************************************/
1295
+static int check_volume_groups(void)
1297
+ aix_volume_group_t * group;
1298
+ partition_list_entry_t * partitions;
1302
+ LOG_DEBUG("CHVG Checking volume groups:\n");
1304
+ group = AIXVolumeGroupList;
1307
+ partitions = group->partition_list;
1308
+ while (partitions) {
1310
+ partitions = partitions->next;
1313
+ if (NumPVS != group->numpvs) {
1314
+ group->flags |= AIX_VG_INCOMPLETE;
1315
+ LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n",group->flags);
1316
+ LOG_ERROR("CHVG Found %d PVs should have %d PVs\n",NumPVS, group->numpvs);
1319
+ group = group->next;
1323
+ LOG_DEBUG("CHVG Finished Checking volume groups:\n");
1328
+/************************************************************************
1329
+ * Function: discover_logical_volumes
1331
+ * After all PVs have been claimed and added to the appropriate VG list,
1332
+ * the volumes for each VG must be constructed.
1336
+static int discover_logical_volumes( void )
1339
+ aix_volume_group_t * AIXVGLPtr;
1340
+ aix_logical_volume_t * new_LV;
1341
+ partition_list_entry_t * partition;
1342
+ evms_logical_node_t * node;
1343
+ lv_entries * AIXlvent, *AIXlventHead;
1344
+ int j, lv_found, all_lvs_found, rc;
1345
+ namelist * AIXnamelist;
1346
+ char * NameBuffer;
1348
+ AIXVGLPtr = AIXVolumeGroupList;
1350
+ LOG_DEBUG("DLV Discover Logical volume AIXVGLPtr:%p\n",AIXVGLPtr);
1352
+ if ( evms_cs_allocate_memory((void**)&AIXlventHead, MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE) ) {
1356
+ if ( evms_cs_allocate_memory((void**)&NameBuffer, MAX_SECTORS_NAMELIST * EVMS_VSECTOR_SIZE) ) {
1357
+ evms_cs_deallocate_memory(AIXlventHead);
1361
+ while (AIXVGLPtr) {
1362
+ partition = AIXVGLPtr->partition_list;
1363
+ node = partition->logical_node;
1366
+ LOG_DEBUG("DLV INIT_IO AIXNameList position:%ld\n",((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST));
1368
+ if (INIT_IO(node, 0, ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST, NameBuffer)) {
1372
+ LOG_DEBUG("DLV INIT_IO AIXNameList\n");
1374
+ if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC, MAX_SECTORS_LV_ENTRIES, AIXlventHead)) {
1377
+ AIXlvent = AIXlventHead;
1378
+ AIXnamelist = (namelist *)NameBuffer;
1380
+ LOG_DEBUG("DLV INIT_IO AIXlvent\n");
1381
+ // Search through the LV structs for valid LV entries
1382
+ // We're just going to search until all valid LVs are found
1383
+ // The max. allowable LVs is 256 and we want don't want to
1384
+ // search for 255 if only 8 are defined 1-8 however, there
1385
+ // could be gaps in the LV numbering. i.e 1,2,3,4,5,6,7,8, 27,43, etc.
1387
+ for ( j = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) {
1389
+ LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n",AIXlvent->num_lps, AIXnamelist->name[j], j, AIXlvent->lvname);
1390
+ LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n", AIXlvent->striping_width, GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp), AIXlvent->lv_state);
1391
+ LOG_DEBUG(" DVIG Group:%x.Access:%x\n",(unsigned int)AIXVGLPtr->vg_id.word2,AIXlvent->permissions);
1392
+ LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n", AIXlvent->mirror, AIXlvent->mirror_policy, AIXlvent->mirwrt_consist);
1394
+ // This is the same check we used in "diskedit" and "readdisk"
1395
+ if ( AIXlvent->lv_state != 0 &&
1396
+ AIXlvent->permissions <= 0x10 ) {
1400
+ if (lv_found == AIXVGLPtr->numlvs) {
1401
+ all_lvs_found = TRUE;
1404
+ LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n", lv_found, all_lvs_found);
1406
+ // Create a new logical volume and place it in the appropriate
1407
+ // spot in this VG's volume list. For re-discovery, make sure
1408
+ // this volume does not already exist.
1409
+ if ( !AIXVGLPtr->volume_list[AIXlvent->lvname] ) {
1410
+ new_LV = new_logical_volume( AIXlvent, AIXVGLPtr, AIXnamelist->name[j],GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp));
1414
+ LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n",new_LV->lv_number, AIXVGLPtr->vg_id.word2);
1415
+ AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV;
1417
+ LOG_DEBUG("DVIG Updating Vol Exists\n");
1423
+ // Build the le_to_pe_map for each volume that was discovered above.
1424
+ // This has to be done after all volumes in the group are discovered
1425
+ if ( (rc = build_pe_maps(AIXVGLPtr)) ) {
1429
+ check_log_volume_and_pe_maps( AIXVGLPtr );
1431
+ AIXVGLPtr = AIXVGLPtr->next;
1434
+ evms_cs_deallocate_memory(NameBuffer);
1435
+ evms_cs_deallocate_memory(AIXlventHead);
1440
+ * Function: new_logical_volume
1442
+ * Allocate space for a new LVM logical volume, including space for the
1445
+static aix_logical_volume_t * new_logical_volume(lv_entries *AIXlvent,
1446
+ aix_volume_group_t *volume_group,
1448
+ u_int32_t stripesize)
1450
+ aix_logical_volume_t * new_volume;
1453
+ LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n", AIXlvent->lvname,
1454
+ AIXlvent->num_lps,
1455
+ AIXlvent->num_lps * volume_group->pe_size);
1457
+ // Allocate space for the new logical volume.
1458
+ if ( evms_cs_allocate_memory((void**)&new_volume, sizeof(aix_logical_volume_t)) ) {
1462
+ // Allocate space for the LE to PE mapping table
1463
+ // We add 1 for the allocated le to ease mapping later on, all AIX le are 1 based
1464
+ if ( evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1465
+ delete_logical_volume( new_volume );
1469
+ if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) {
1470
+ if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir1), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1471
+ delete_logical_volume( new_volume );
1476
+ if (AIXlvent->mirror == AIX_MAX_MIRRORS) {
1477
+ if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir2), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1478
+ delete_logical_volume( new_volume );
1484
+ // Initialize the rest of the new volume.
1485
+ new_volume->lv_number = AIXlvent->lvname;
1486
+ new_volume->lv_size = AIXlvent->num_lps * (volume_group->pe_size);
1487
+ new_volume->lv_access = AIXlvent->permissions | EVMS_LV_NEW; // All volumes start new.
1488
+ new_volume->lv_status = AIXlvent->lv_state;
1489
+ //new_volume->lv_minor = MINOR(1);
1490
+ new_volume->mirror_copies = AIXlvent->mirror;
1491
+ new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING;
1492
+ new_volume->stripes = AIXlvent->striping_width;
1493
+ new_volume->stripe_size = stripesize;
1494
+ new_volume->stripe_size_shift = evms_cs_log2(stripesize);
1495
+ new_volume->pe_size = volume_group->pe_size;
1496
+ new_volume->pe_size_shift = evms_cs_log2(volume_group->pe_size);
1497
+ new_volume->num_le = AIXlvent->num_lps;
1498
+ new_volume->new_volume = TRUE;
1499
+ new_volume->group = volume_group;
1501
+ sprintf(new_volume->name, "aix/%s", lv_name);
1503
+ if (!AIX_BH_list_pool && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1504
+ AIX_BH_list_pool = evms_cs_create_pool(sizeof(aix_mirror_bh_t), "EVMS_AIX_BH", aix_notify_cache_ctor, NULL);
1505
+ if (!AIX_BH_list_pool) {
1510
+ LOG_DEBUG("NLV lv_number:%d name:%s lv_size %Ld \n", new_volume->lv_number, new_volume->name, new_volume->lv_size);
1511
+ LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n", new_volume->stripe_size, new_volume->stripe_size_shift);
1513
+ return new_volume;
1516
+ * Function: aix_notify_cache_ctor
1517
+ * this function initializes the b_wait field in the buffer heads
1518
+ * in our private buffer head pool.
1521
+aix_notify_cache_ctor(
1523
+ kmem_cache_t * cachep,
1524
+ unsigned long flags)
1526
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
1527
+ SLAB_CTOR_CONSTRUCTOR) {
1528
+ aix_mirror_bh_t *rbh = (aix_mirror_bh_t *)foo;
1529
+ memset(rbh, 0, sizeof(aix_mirror_bh_t));
1530
+ init_waitqueue_head(&rbh->bh_req.b_wait);
1535
+ * Function: build_pe_maps
1537
+ * After all logical volumes have been discovered, the mappings from
1538
+ * logical extents to physical extents must be constructed. Each PV
1539
+ * contains a map on-disk of its PEs. Each PE map entry contains the
1540
+ * logical volume number and the logical extent number on that volume.
1541
+ * Our internal map is the reverse of this map for each volume, listing
1542
+ * the PV node and sector offset for every logical extent on the volume.
1544
+static int build_pe_maps( aix_volume_group_t * volume_group)
1546
+ partition_list_entry_t * partition;
1547
+ partition_list_entry_t * mirror_partition;
1548
+ pp_entries * AIXppent, *AIXppent_buff;
1549
+ pv_header * AIXpvh;
1551
+ u_int32_t le_number;
1552
+ u_int32_t j, pp_count,pvh_pos;
1553
+ u_int32_t MirrorFound;
1554
+#ifdef EVMS_DEBUG_MIRRORS
1555
+ u_int32_t lv_found, all_lvs_found;
1556
+ u_int32_t mirs = 0;
1559
+ LOG_DEBUG(" *** BPEM ***\n");
1560
+ // For every partition in this VG
1562
+ if (evms_cs_allocate_memory((void**)&AIXppent_buff, (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET))) {
1566
+ if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
1567
+ evms_cs_deallocate_memory(AIXppent_buff);
1571
+ LOG_DEBUG(" BPEM AIXppent_buff:%d \n", (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET));
1573
+ for ( partition = volume_group->partition_list; partition; partition = partition->next ) {
1575
+ LOG_DEBUG(" BPEM partition:%p next:%p\n", partition, partition->next);
1577
+ pvh_pos = AIX_PVH_DATA_PSN(volume_group->vgda_psn, partition->pv_number);
1579
+ LOG_DEBUG(" BPEM pvh_pos:%d\n", pvh_pos);
1581
+ if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) {
1582
+ evms_cs_deallocate_memory(AIXppent_buff);
1583
+ evms_cs_deallocate_memory(AIXpvh);
1587
+ // For every entry in the PE map, calculate the PE's sector offset
1588
+ // and update the correct LV's PE map. LV number of 0 marks an unused PE.
1589
+ // For re-discovery, only compute entries for new volumes.
1591
+ if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH, AIXppent_buff)) {
1592
+ evms_cs_deallocate_memory(AIXppent_buff);
1593
+ evms_cs_deallocate_memory(AIXpvh);
1597
+ AIXppent = AIXppent_buff;
1600
+ pp_count = AIXpvh->pp_count;
1602
+ LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n",
1603
+ volume_group->vg_id.word2,
1607
+ AIXppent->lv_index,
1610
+ for (j = 0; j < pp_count; j++) {
1611
+ if (AIXppent->lv_index && AIXppent->pp_state ) {
1613
+ LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%ld cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n",
1614
+ volume_group->vg_id.word2, j+1, AIXppent->pp_state, volume_group->volume_list[AIXppent->lv_index-1]->name,
1615
+ AIXppent->lv_index,
1616
+ AIXppent->lp_num, AIXppent->copy,
1617
+ AIXppent->fst_alt_vol, AIXppent->fst_alt_part,
1618
+ AIXppent->snd_alt_vol, AIXppent->snd_alt_part);
1620
+ le_number = AIXppent->lp_num -1; // AIX lp's start @ 1, we want a 0 index
1621
+ offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1);
1623
+ LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n",
1626
+ AIXppent->lv_index,
1627
+ volume_group->volume_list[AIXppent->lv_index-1]->name);
1629
+ if (volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map &&
1630
+ le_number <= volume_group->volume_list[AIXppent->lv_index-1]->num_le) {
1631
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].owning_pv = partition;
1632
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].pe_sector_offset = offset;
1636
+ if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies > AIX_DEFAULT_MIRRORING) {
1638
+ LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n", AIXppent->lv_index);
1640
+ for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
1642
+ if (mirror_partition->pv_number == AIXppent->fst_alt_vol) {
1644
+ offset = (((AIXppent->fst_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
1647
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].owning_pv = mirror_partition;
1648
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset;
1650
+ LOG_EXTRA(" PE Map: mirror_partition:%p \n", mirror_partition);
1651
+ LOG_EXTRA(" PE Map: mirror_sector_offet:%d\n", AIXppent->fst_alt_part);
1653
+ MirrorFound = TRUE;
1657
+ if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies == AIX_MAX_MIRRORS) {
1659
+ for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
1661
+ if (mirror_partition->pv_number == AIXppent->snd_alt_vol) {
1663
+ offset = (((AIXppent->snd_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
1665
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv = mirror_partition;
1666
+ volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset;
1668
+ LOG_EXTRA(" PE Map: mirror_partition2:%p \n", mirror_partition);
1669
+ LOG_EXTRA(" PE Map: mirror_sector_offet2:%d\n", AIXppent->snd_alt_part);
1671
+ MirrorFound = TRUE;
1677
+ } // End of if mirroring is enabled
1686
+// LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs);
1688
+#ifdef EVMS_DEBUG_MIRRORS
1689
+ for (mirs = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && mirs < LVM_MAXLVS; mirs++) {
1691
+ if (volume_group->volume_list[mirs] != NULL) {
1692
+ if (volume_group->volume_list[mirs]->lv_status == LV_ACTIVE) {
1696
+ LOG_DEBUG(" PE Map: owning part lv %d -- %p\n", mirs, volume_group->volume_list[mirs]->le_to_pe_map[0].owning_pv);
1697
+ if (volume_group->volume_list[mirs]->mirror_copies > AIX_DEFAULT_MIRRORING) {
1698
+ LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir1[0].owning_pv);
1700
+ if (volume_group->volume_list[mirs]->mirror_copies == AIX_MAX_MIRRORS) {
1701
+ LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir2[0].owning_pv);
1704
+ if (lv_found == volume_group->numlvs) {
1705
+ all_lvs_found = TRUE;
1706
+ LOG_DEBUG(" PE Map: all_lvs_found\n" );
1712
+ evms_cs_deallocate_memory(AIXpvh);
1713
+ evms_cs_deallocate_memory(AIXppent_buff);
1718
+ * Function: check_log_volume_and_pe_maps
1720
+ * Make sure all volumes in this group have valid LE-to-PE maps.
1721
+ * Any volume that doesn't is deleted. This is safe for re-discovery
1722
+ * because only new volumes could have corrupted PE maps.
1724
+static int check_log_volume_and_pe_maps( aix_volume_group_t * group )
1726
+ aix_logical_volume_t * volume;
1727
+ int i, j, lv_found, all_lvs_found;
1729
+ LOG_DEBUG(" check_pe_map.\n");
1731
+ for ( i = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && i < LVM_MAXLVS; i++ ) {
1732
+ if ( ! group->volume_list[i] ) {
1733
+ LOG_DEBUG(" CPEM No Volume %d found \n",i);
1737
+ volume = group->volume_list[i];
1738
+ if ( ! volume->le_to_pe_map ) {
1739
+ LOG_DEBUG(" CPEM Volume %s has no PE map.\n",volume->name);
1740
+ delete_logical_volume(volume);
1744
+ LOG_DEBUG(" CPEM volume %s num_le: %d \n",volume->name, volume->num_le);
1748
+ if (lv_found == group->numlvs) {
1749
+ all_lvs_found = TRUE;
1754
+ for ( j = 0; j < volume->num_le; j++) {
1755
+ if ( ! volume->le_to_pe_map[j].owning_pv ||
1756
+ ! volume->le_to_pe_map[j].pe_sector_offset ) {
1757
+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n",volume->name, j);
1758
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
1761
+ if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1762
+ if ( ! volume->le_to_pe_map_mir1[j].owning_pv ||
1763
+ ! volume->le_to_pe_map_mir1[j].pe_sector_offset ) {
1764
+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n",volume->name, j);
1765
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
1768
+ if (volume->mirror_copies == AIX_MAX_MIRRORS) {
1769
+ if ( ! volume->le_to_pe_map_mir2[j].owning_pv ||
1770
+ ! volume->le_to_pe_map_mir2[j].pe_sector_offset ) {
1771
+ LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n",volume->name, j);
1772
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
1779
+ LOG_EXTRA(" Leaving check_pe_map.\n");
1783
+ * Function: export_volumes
1785
+ * The last thing this VGE must do is take each constructed volume and
1786
+ * place it back on the evms logical partition list.
1788
+static int export_volumes( evms_logical_node_t ** evms_partition_list )
1790
+ aix_volume_group_t * AIXVGLPtr;
1791
+ evms_logical_node_t * new_node;
1792
+ aix_logical_volume_t * volume;
1793
+ int j, lv_found, all_lvs_found;
1796
+ AIXVGLPtr = AIXVolumeGroupList;
1798
+ while (AIXVGLPtr) {
1800
+ if (AIXVGLPtr->flags & EVMS_VG_DIRTY) {
1802
+ LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n",AIXVGLPtr->numpvs,AIXVGLPtr->numlvs);
1804
+ // Export every valid volume in the group. For re-discovery,
1805
+ // make sure we are only exporting "new" volumes.
1807
+ for ( j = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && j < LVM_MAXLVS ; j++ ) {
1808
+ if (AIXVGLPtr->volume_list[j] != NULL ) {
1809
+ if (AIXVGLPtr->volume_list[j]->new_volume == TRUE) {
1811
+ LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n",j, AIXVGLPtr->volume_list[j]);
1812
+ volume = AIXVGLPtr->volume_list[j];
1815
+ if (lv_found == AIXVGLPtr->numlvs) {
1816
+ all_lvs_found = TRUE;
1819
+ // For new volumes, create a new EVMS node and
1820
+ // initialize the appropriate fields.
1821
+ if ( volume->lv_access & EVMS_LV_NEW ) {
1822
+ if ( evms_cs_allocate_logical_node( &new_node ) ) {
1823
+ LOG_DEBUG(" Export Vol Error allocating node !!\n");
1826
+ LOG_DEBUG(" EV Node allocated OK\n");
1829
+ volume->new_volume = 0;
1830
+ volume->volume_node = new_node;
1831
+ volume->lv_access &= (~EVMS_LV_NEW);
1832
+ new_node->hardsector_size = AIXVGLPtr->hard_sect_size;
1833
+ new_node->block_size = AIXVGLPtr->block_size;
1834
+ new_node->plugin = &plugin_header;
1835
+ new_node->instance_data = volume;
1836
+ new_node->total_vsectors = volume->lv_size;
1839
+ LOG_DEBUG(" EV volume->name:[%s]\n",volume->name);
1841
+ strncpy(new_node->name, volume->name, EVMS_VOLUME_NAME_SIZE+1);
1844
+ // Is the volume read-only?
1845
+ if ( !(volume->lv_access & AIX_LV_WRITE) ||
1846
+ volume->lv_access & EVMS_LV_INCOMPLETE ) {
1847
+ new_node->flags |= EVMS_VOLUME_SET_READ_ONLY;
1848
+ LOG_DEBUG(" EV Read Only volume->lv_access:%d\n",volume->lv_access);
1851
+ LOG_DEBUG(" EV Node [%s] allocated previously\n",volume->name);
1854
+ evms_cs_add_logical_node_to_list( evms_partition_list, new_node );
1857
+ LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n", volume, new_node,new_node->name);
1859
+ evms_cs_add_logical_node_to_list( evms_partition_list, AIXVGLPtr->volume_list[j]->volume_node);
1861
+ LOG_DEBUG(" ELV vol_list[%d]%p\n",j, AIXVGLPtr->volume_list[j]);
1864
+ LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j);
1866
+ } // end checking all lvs
1869
+ LOG_DEBUG(" ELV Existing volume -- %d\n",AIXVGLPtr->vg_id.word2);
1872
+ AIXVGLPtr->flags &= ~EVMS_VG_DIRTY;
1873
+ AIXVGLPtr = AIXVGLPtr->next;
1881
+ * Function: delete_logical_volume
1883
+ * This function deletes the in-memory representation of a single LVM
1884
+ * logical volume, including its PE map and any snapshot data. It does
1885
+ * not alter the parent volume group, except to remove this volume from
1886
+ * its volume list.
1888
+static int delete_logical_volume( aix_logical_volume_t * volume )
1890
+ aix_volume_group_t * group = volume->group;
1892
+ LOG_DEBUG(" Deleting volume %s\n",volume->name);
1894
+ // Now free up all the memory. This includes the LE-to-PE map, any
1895
+ // mirror PEs, etc.
1896
+ if ( volume->le_to_pe_map ) {
1897
+ evms_cs_deallocate_memory( volume->le_to_pe_map );
1898
+ volume->le_to_pe_map = NULL;
1901
+ if ( volume->le_to_pe_map_mir1 ) {
1902
+ evms_cs_deallocate_memory( volume->le_to_pe_map_mir1 );
1903
+ volume->le_to_pe_map_mir1 = NULL;
1906
+ if ( volume->le_to_pe_map_mir2 ) {
1907
+ evms_cs_deallocate_memory( volume->le_to_pe_map_mir2 );
1908
+ volume->le_to_pe_map_mir2 = NULL;
1911
+ // Remove this volume from the volume-group's list.
1912
+ if ( group && group->volume_list[volume->lv_number] == volume ) {
1913
+ group->volume_list[volume->lv_number] = NULL;
1917
+ evms_cs_deallocate_memory(volume);
1923
+/* Function: remove_group_from_list
1925
+ * Remove an LVM volume group from the global LVM list.
1927
+static int remove_group_from_list( aix_volume_group_t * group )
1929
+ aix_volume_group_t ** p_group;
1931
+ for ( p_group = &AIXVolumeGroupList; *p_group; p_group = &(*p_group)->next ) {
1932
+ if ( *p_group == group ) {
1933
+ *p_group = (*p_group)->next;
1934
+ group->next = NULL;
1943
+ * Function: delete_aix_node
1945
+ * This function deletes the in-memory representation of an LVM
1946
+ * logical volume. Right now it makes a lot of assumptions about
1947
+ * the data in the group not being corrupted. It would be possible
1948
+ * to put in a lot of consistency checks before deleting everything
1949
+ * to indicate if problems have occurred during the lifetime of the
1950
+ * volume and its volume group.
1952
+static int delete_aix_node( evms_logical_node_t * logical_node )
1954
+ aix_logical_volume_t * volume = (aix_logical_volume_t*)(logical_node->instance_data);
1955
+ aix_volume_group_t * group = volume->group;
1957
+ if ( delete_logical_volume(volume) ) {
1961
+ // If we just removed the last volume from this group, the entire group
1962
+ // can also be deleted.
1963
+ if ( group && group->numlvs == 0) {
1964
+ remove_group_from_list(group);
1965
+ deallocate_volume_group(group);
1968
+ // Free the logical node.
1969
+ evms_cs_deallocate_logical_node(logical_node);
1974
+/* Function: deallocate_volume_group
1976
+ * This function deletes the entire in-memory representation of an LVM
1977
+ * volume group, including all partitions and logical volumes. If this
1978
+ * group is on the VGE's volume group list, it is removed.
1980
+static int deallocate_volume_group( aix_volume_group_t * group )
1982
+ partition_list_entry_t * partition;
1983
+ partition_list_entry_t * next_part;
1986
+ LOG_DEBUG(" Deleting volume group %x\n",group->vg_id.word2);
1989
+ // Delete all partitions from the group's list.
1990
+ for ( partition = group->partition_list; partition; partition = next_part ) {
1992
+ next_part = partition->next;
1994
+ if ( partition->logical_node ) {
1995
+ // Send a delete command down to the partition manager.
1996
+ LOG_DEBUG(" Deleting PV %d from group %x\n",partition->pv_number,group->vg_id.word2);
1997
+ DELETE(partition->logical_node);
1999
+ evms_cs_deallocate_memory(partition);
2003
+ // Delete all logical volumes, and the array of pointers.
2004
+ for ( i = 0; i < LVM_MAXLVS; i++ ) {
2005
+ if ( group->volume_list[i] ) {
2006
+ delete_logical_volume(group->volume_list[i]);
2010
+ evms_cs_deallocate_memory(group);
2014
+/* Function: end_discover_aix
2016
+ * The discovery process at the region-manager level is now iterative,
2017
+ * much like the EVMS feature level. To accomplish this correctly, and
2018
+ * also to accomplish partial volume discovery, a second discover
2019
+ * entry point is needed, so EVMS can tell the region managers that
2020
+ * discovery is over, and to finish up any discovery that is not yet
2021
+ * complete. When this function is called, it should be assumed that
2022
+ * the node list has had nothing new added to it since the last call
2023
+ * of the regular discover function. Therefore, when this function is
2024
+ * called, we do not need to try to discovery any additional volume
2025
+ * groups. We will, however, look for logical volumes once more. This
2026
+ * gives us the ability to export (read-only) volumes that have
2027
+ * partially corrupted LE maps due to missing PVs in their VG.
2029
+static int end_discover_aix(evms_logical_node_t ** evms_logical_disk_head)
2034
+ LOG_DEBUG("Final Discovery:\n");
2037
+ if ( (rc = discover_logical_volumes()) ) {
2041
+ rc = export_volumes(evms_logical_disk_head);
2047
+/****************************************************
2048
+* Function: AIX_alloc_wbh
2050
+* Alloc any buffer heads from the pool and return a linked list
2053
+*****************************************************/
2054
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t * node,
2055
+ evms_logical_node_t * node2,
2056
+ evms_logical_node_t * node3,
2058
+ uint32_t mirror_copies,
2059
+ evms_sector_t new_sector2,
2060
+ evms_sector_t new_sector3)
2063
+ aix_mirror_bh_t * tmp_bh = NULL, *head_bh = NULL;
2066
+ head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2069
+ LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2073
+ head_bh->master_bh = eio->bh;
2074
+ head_bh->mirror_bh_list = NULL;
2075
+ atomic_set(&head_bh->remaining, 0);
2077
+ for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) {
2079
+ tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2081
+ LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2085
+ tmp_bh->next_r1 = head_bh->mirror_bh_list;
2086
+ head_bh->mirror_bh_list = tmp_bh;
2087
+ atomic_inc(&head_bh->remaining);
2089
+ memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
2090
+ init_waitqueue_head(&tmp_bh->bh_req.b_wait);
2091
+// tmp_bh->master_bh = eio->bh;
2092
+// tmp_bh->iteration = AIX_DEFAULT_MIRRORING + i;
2093
+ tmp_bh->eio.rsize = eio->rsize;
2094
+ tmp_bh->eio.bh = &tmp_bh->bh_req;
2098
+ case AIX_DEFAULT_MIRRORING:
2099
+ tmp_bh->node = node;
2100
+ tmp_bh->eio.rsector = eio->rsector;
2103
+ case AIX_FIRST_MIRROR:
2104
+ tmp_bh->node = node2;
2105
+ tmp_bh->eio.rsector = new_sector2;
2108
+ case AIX_MAX_MIRRORS:
2109
+ tmp_bh->node = node3;
2110
+ tmp_bh->eio.rsector = new_sector3;
2114
+ tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives; //setup callback routine
2115
+ tmp_bh->bh_req.b_private = (void*)head_bh;
2122
+/****************************************************
2123
+* Function: AIX_handle_write_mirror_drives
2125
+* Handles a write from a set of mirrored AIX LVs
2129
+*****************************************************/
2130
+static void AIX_handle_write_mirror_drives(struct buffer_head * bh,
2133
+ aix_logical_volume_t * volume;
2134
+ evms_logical_node_t * node;
2135
+ aix_mirror_bh_t * tmp_bh = NULL, * tmp_bh2 = NULL;
2136
+ kdev_t tmp_b_dev = bh->b_dev;
2139
+ tmp_bh = (aix_mirror_bh_t *)bh->b_private;
2140
+ node = tmp_bh->node;
2141
+ volume = (aix_logical_volume_t *) node->instance_data;
2143
+ LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
2147
+ AIX_evms_cs_notify_lv_io_error(node);
2150
+ if (atomic_dec_and_test(&tmp_bh->remaining)) {
2151
+ tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2152
+ tmp_bh2 = tmp_bh->mirror_bh_list;
2153
+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2156
+ tmp_bh = tmp_bh2->next_r1;
2157
+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2);
2161
+ evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
2167
+/****************************************************
2168
+* Function: AIX_alloc_rbh
2170
+* Alloc any buffer heads from the pool and return a linked list
2173
+*****************************************************/
2174
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t * node,
2176
+ uint32_t mirror_copies,
2177
+ evms_sector_t org_sector,
2180
+ aix_mirror_bh_t * tmp_bh = NULL;
2182
+ tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2185
+ LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2189
+ memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
2190
+ tmp_bh->node = node;
2191
+ tmp_bh->master_bh = eio->bh;
2192
+ tmp_bh->iteration = AIX_FIRST_MIRROR;
2193
+ tmp_bh->eio.rsector = eio->rsector;
2194
+ tmp_bh->eio.rsize = eio->rsize;
2195
+ tmp_bh->eio.bh = &tmp_bh->bh_req;
2198
+ tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives; //setup callback routine
2199
+ tmp_bh->bh_req.b_private = (void*)tmp_bh;
2201
+ tmp_bh->cmd = cmd;
2202
+ tmp_bh->next_r1 = NULL;
2203
+ tmp_bh->node = node;
2209
+static void AIX_reschedule_retry (aix_mirror_bh_t *aix_bh)
2211
+ unsigned long flags;
2213
+ spin_lock_irqsave(&AIX_retry_list_lock, flags);
2214
+ if (AIX_retry_list == NULL)
2215
+ AIX_retry_tail = &AIX_retry_list;
2216
+ *AIX_retry_tail = aix_bh;
2217
+ AIX_retry_tail = &aix_bh->next_r1;
2218
+ aix_bh->next_r1 = NULL;
2219
+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2220
+ evms_cs_wakeup_thread(AIX_mirror_thread);
2222
+/****************************************************
2223
+* Function: AIX_handle_read_mirror_drives
2225
+* Handles a read from a set of mirrored AIX LVs
2229
+*****************************************************/
2230
+static void AIX_handle_read_mirror_drives(struct buffer_head * bh,
2233
+ aix_logical_volume_t * volume;
2234
+ evms_logical_node_t * node;
2235
+ aix_mirror_bh_t * tmp_bh;
2236
+ kdev_t tmp_b_dev = bh->b_dev;
2239
+ tmp_bh = (aix_mirror_bh_t *)bh->b_private;
2240
+ volume = (aix_logical_volume_t *) tmp_bh->node->instance_data;
2241
+ node = tmp_bh->node;
2243
+ LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
2245
+ if (!uptodate && tmp_bh->iteration < volume->mirror_copies) {
2246
+ AIX_evms_cs_notify_lv_io_error(node);
2247
+ AIX_reschedule_retry(tmp_bh);
2249
+ tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2250
+ evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2251
+ evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
2259
+/****************************************************
2260
+* This is a temporary function until a common EVMS
2261
+* notification function can be created.
2263
+*****************************************************/
2264
+static int AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node)
2266
+ aix_logical_volume_t * volume;
2268
+ volume = (aix_logical_volume_t *)node->instance_data;
2270
+ LOG_CRITICAL("Notify_ERROR !! node:%p volume->lv_status:%d volume->name:[%s]\n", node, volume->lv_status,volume->name);
2275
+/* Function: lvm_cleanup
2277
+ * This function runs through the entire lvm data structure, removing
2278
+ * all items that are not needed at runtime. Currently, this is just the
2279
+ * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
2280
+ * groups that don't contain any volumes are deleted. All of the other
2281
+ * volume_group, logical_volume and evms_logical_node structures will be
2282
+ * kept around at run-time.
2284
+static int lvm_cleanup( void )
2286
+ aix_volume_group_t * group;
2288
+ group = AIXVolumeGroupList;
2292
+ if (group->AIXvgh) {
2293
+ evms_cs_deallocate_memory(group->AIXvgh);
2294
+ group->AIXvgh = NULL;
2297
+ group = group->next;
2303
+/****************************************************
2304
+* Function: AIX_copy_header_info
2306
+* Copy the disk header info into the volume struct
2307
+* so we can use it later.
2311
+*****************************************************/
2312
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2)
2315
+ LOG_DEBUG("CHI AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
2319
+ AIXvgh->vg_timestamp.tv_sec = AIXvgh2->vg_timestamp.tv_sec;
2320
+ AIXvgh->vg_timestamp.tv_nsec = AIXvgh2->vg_timestamp.tv_nsec;
2321
+ AIXvgh->vg_id.word1 = AIXvgh2->vg_id.word1;
2322
+ AIXvgh->vg_id.word2 = AIXvgh2->vg_id.word2;
2323
+ AIXvgh->vg_id.word3 = AIXvgh2->vg_id.word3;
2324
+ AIXvgh->vg_id.word4 = AIXvgh2->vg_id.word4;
2325
+ AIXvgh->numlvs = AIXvgh2->numlvs;
2326
+ AIXvgh->maxlvs = AIXvgh2->maxlvs;
2327
+ AIXvgh->pp_size = AIXvgh2->pp_size;
2328
+ AIXvgh->numpvs = AIXvgh2->numpvs;
2329
+ AIXvgh->total_vgdas = AIXvgh2->total_vgdas;
2330
+ AIXvgh->vgda_size = AIXvgh2->vgda_size;
2331
+ AIXvgh->bigvg = AIXvgh2->bigvg;
2332
+ AIXvgh->quorum = AIXvgh2->quorum;
2333
+ AIXvgh->auto_varyon = AIXvgh2->auto_varyon;
2334
+ AIXvgh->checksum = AIXvgh2->checksum;
2335
+ AIXvgh->bigda_size = AIXvgh2->bigda_size;
2341
+ LOG_DEBUG("Returning CHI AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
2345
+/****************************************************
2346
+* Function: AIX_free_header
2352
+*****************************************************/
2353
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2)
2357
+ evms_cs_deallocate_memory(AIXvgh);
2362
+ evms_cs_deallocate_memory(AIXvgh2);
2367
+ evms_cs_deallocate_memory(AIXvgt);
2372
+ evms_cs_deallocate_memory(AIXvgt2);
2378
+/****************************************************
2381
+* This is a kernel thread that handles read/write of mirrors
2382
+* This shouldn't ever run on a non-mirrored LV read/write
2385
+*****************************************************/
2386
+static void AIXiod (void *data)
2388
+ aix_mirror_bh_t * r1_bh;
2389
+ evms_logical_node_t * node;
2390
+ unsigned long flags;
2395
+ spin_lock_irqsave(&AIX_retry_list_lock, flags);
2396
+ if (AIX_retry_list == NULL){
2397
+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2400
+ r1_bh = AIX_retry_list;
2401
+ AIX_retry_list = r1_bh->next_r1;
2402
+ spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2403
+ r1_bh->next_r1 = NULL; // for mark
2405
+ switch (r1_bh->cmd) {
2408
+ r1_bh->iteration++;
2409
+ LOG_DEBUG("Report from thread AIXiod READ\n");
2411
+ if (r1_bh->iteration == AIX_FIRST_MIRROR) {
2412
+ node = r1_bh->mir_node1;
2413
+ r1_bh->eio.rsector = r1_bh->mir_sector1;
2415
+ node = r1_bh->mir_node2;
2416
+ r1_bh->eio.rsector = r1_bh->mir_sector2;
2420
+ R_IO(node, &r1_bh->eio);
2425
+ LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n", r1_bh->cmd);
2432
+/****************************************************
2433
+* Function: AIX_volume_group_dump
2435
+* This is for debug purposes and will walk the volume group list
2436
+* and the LVs within the volume groups
2438
+* It can be called at anytime however the output to the display is large
2440
+*****************************************************/
2441
+#ifdef EVMS_AIX_DEBUG
2442
+static int AIX_volume_group_dump(void)
2444
+ aix_volume_group_t * AIXVGLDebugPtr;
2445
+ partition_list_entry_t * DebugPartitionList;
2446
+ aix_logical_volume_t * DebugLVList;
2449
+ AIXVGLDebugPtr = AIXVolumeGroupList;
2451
+ if (!AIXVGLDebugPtr) {
2452
+ LOG_DEBUG("***********************************************\n");
2453
+ LOG_DEBUG("ERROR Nothing built in the list to check !!! \n");
2454
+ LOG_DEBUG("***********************************************\n");
2458
+ LOG_DEBUG("*********************************************** \n");
2459
+ LOG_DEBUG("Begin Volume Group Dump \n");
2460
+ LOG_DEBUG("*********************************************** \n");
2462
+ while (AIXVGLDebugPtr) {
2464
+ LOG_DEBUG("vg_number %x\n",AIXVGLDebugPtr->vg_id.word2 );
2465
+ LOG_DEBUG("numpvs %d\n",AIXVGLDebugPtr->numpvs );
2466
+ LOG_DEBUG("numlvs %d\n",AIXVGLDebugPtr->numlvs );
2467
+ LOG_DEBUG("hard_sect_size %d\n",AIXVGLDebugPtr->hard_sect_size);
2468
+ LOG_DEBUG("block_size %d\n",AIXVGLDebugPtr->block_size );
2469
+ LOG_DEBUG("flags %d\n",AIXVGLDebugPtr->flags );
2470
+ LOG_DEBUG("lv_max %d\n",AIXVGLDebugPtr->lv_max );
2471
+ LOG_DEBUG("pe_size %d\n",AIXVGLDebugPtr->pe_size );
2472
+ LOG_DEBUG("CleanVGInfo %d\n",AIXVGLDebugPtr->CleanVGInfo );
2474
+ DebugPartitionList = AIXVGLDebugPtr->partition_list;
2476
+ LOG_DEBUG("********* Begin Volume Partition Dump ********* \n");
2478
+ if (!DebugPartitionList) {
2479
+ LOG_DEBUG("No partitions to check !! \n");
2483
+ while (DebugPartitionList) {
2484
+ LOG_DEBUG("logical_node %p\n",DebugPartitionList->logical_node );
2485
+ LOG_DEBUG("pv_number %d\n",DebugPartitionList->pv_number );
2486
+ LOG_DEBUG("block_size %d\n",DebugPartitionList->block_size );
2487
+ LOG_DEBUG("hard_sect_size %d\n",DebugPartitionList->hard_sect_size );
2488
+ LOG_DEBUG("-------------------------------------------------------------\n");
2489
+ DebugPartitionList = DebugPartitionList->next;
2492
+ LOG_DEBUG("********* End Volume Partition Dump **********\n");
2494
+ LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n");
2496
+ DebugLVList = AIXVGLDebugPtr->volume_list[0];
2498
+ if (!DebugLVList) {
2499
+ LOG_DEBUG("No logical volumes to check !! \n");
2502
+ for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) {
2504
+ DebugLVList = AIXVGLDebugPtr->volume_list[i];
2506
+ if (DebugLVList) {
2507
+ LOG_DEBUG("volume_list # %d \n", i );
2508
+ LOG_DEBUG("lv_number %d \n", DebugLVList->lv_number );
2509
+ LOG_DEBUG("LV name %s \n", DebugLVList->name );
2510
+ LOG_DEBUG("lv_size %Ld \n", DebugLVList->lv_size );
2511
+ LOG_DEBUG("lv_access %d \n", DebugLVList->lv_access );
2512
+ LOG_DEBUG("lv_status %d \n", DebugLVList->lv_status );
2513
+ LOG_DEBUG("lv_minor %d \n", DebugLVList->lv_minor );
2514
+ LOG_DEBUG("mirror_copies %d \n", DebugLVList->mirror_copies );
2515
+ LOG_DEBUG("mirror_number %d \n", DebugLVList->mirror_number );
2516
+ LOG_DEBUG("stripes %d \n", DebugLVList->stripes );
2517
+ LOG_DEBUG("stripe_size %d \n", DebugLVList->stripe_size );
2518
+ LOG_DEBUG("stripe_size_shift%d \n", DebugLVList->stripe_size_shift);
2519
+ LOG_DEBUG("pe_size %d \n", DebugLVList->pe_size );
2520
+ LOG_DEBUG("pe_size_shift %d \n", DebugLVList->pe_size_shift );
2521
+ LOG_DEBUG("num_le %d \n", DebugLVList->num_le );
2522
+ LOG_DEBUG("new_volume %d \n", DebugLVList->new_volume );
2523
+ LOG_DEBUG("group %p \n", DebugLVList->group );
2529
+ AIXVGLDebugPtr = AIXVGLDebugPtr->next;
2531
+ LOG_DEBUG("********** End Logical Volume Partition Dump **********\n");
2536
+ LOG_DEBUG("***********************************************\n");
2537
+ LOG_DEBUG("End Volume Group Dump \n");
2538
+ LOG_DEBUG("***********************************************\n");
2545
diff -Naur linux-2002-03-28/drivers/evms/Config.in evms-2002-03-28/drivers/evms/Config.in
2546
--- linux-2002-03-28/drivers/evms/Config.in Wed Dec 31 18:00:00 1969
2547
+++ evms-2002-03-28/drivers/evms/Config.in Mon Mar 18 16:54:45 2002
2550
+# Copyright (c) International Business Machines Corp., 2000
2552
+# This program is free software; you can redistribute it and/or modify
2553
+# it under the terms of the GNU General Public License as published by
2554
+# the Free Software Foundation; either version 2 of the License, or
2555
+# (at your option) any later version.
2557
+# This program is distributed in the hope that it will be useful,
2558
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
2559
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
2560
+# the GNU General Public License for more details.
2562
+# You should have received a copy of the GNU General Public License
2563
+# along with this program; if not, write to the Free Software
2564
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2567
+# EVMS driver configuration
2570
+mainmenu_option next_comment
2571
+comment 'Enterprise Volume Management System'
2573
+tristate 'EVMS Kernel Runtime' CONFIG_EVMS
2574
+dep_tristate ' EVMS Local Device Manager Plugin' CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN $CONFIG_EVMS
2575
+dep_tristate ' EVMS DOS Partition Manager Plugin' CONFIG_EVMS_DOS_PARTITION_PLUGIN $CONFIG_EVMS
2576
+dep_tristate ' EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT_PLUGIN $CONFIG_EVMS
2577
+dep_tristate ' EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK_PLUGIN $CONFIG_EVMS
2578
+dep_tristate ' EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR_PLUGIN $CONFIG_EVMS
2579
+dep_tristate ' EVMS Linux LVM Package' CONFIG_EVMS_LVM_PLUGIN $CONFIG_EVMS
2580
+dep_tristate ' EVMS Linux MD Package' CONFIG_EVMS_MD_PLUGIN $CONFIG_EVMS
2581
+dep_tristate ' EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR_PERS $CONFIG_EVMS_MD_PLUGIN
2582
+dep_tristate ' EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0_PERS $CONFIG_EVMS_MD_PLUGIN
2583
+dep_tristate ' EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1_PERS $CONFIG_EVMS_MD_PLUGIN
2584
+dep_tristate ' EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5_PERS $CONFIG_EVMS_MD_PLUGIN
2585
+dep_tristate ' EVMS AIX LVM Package' CONFIG_EVMS_AIX_PLUGIN $CONFIG_EVMS
2586
+dep_tristate ' EVMS OS/2 LVM Package' CONFIG_EVMS_OS2_PLUGIN $CONFIG_EVMS
2587
+dep_tristate ' EVMS Clustering Package' CONFIG_EVMS_ECR_PLUGIN $CONFIG_EVMS
2589
+if [ "$CONFIG_ARCH_S390" = "y" ]; then
2590
+dep_tristate ' EVMS s390 Partition Manager Plugin' CONFIG_EVMS_S390_PART_PLUGIN $CONFIG_EVMS
2593
+if [ "$CONFIG_EVMS" != "n" ]; then
2594
+ choice ' EVMS Debug Level' \
2595
+ "Critical CONFIG_EVMS_INFO_CRITICAL \
2596
+ Serious CONFIG_EVMS_INFO_SERIOUS \
2597
+ Error CONFIG_EVMS_INFO_ERROR \
2598
+ Warning CONFIG_EVMS_INFO_WARNING \
2599
+ Default CONFIG_EVMS_INFO_DEFAULT \
2600
+ Details CONFIG_EVMS_INFO_DETAILS \
2601
+ Debug CONFIG_EVMS_INFO_DEBUG \
2602
+ Extra CONFIG_EVMS_INFO_EXTRA \
2603
+ Entry_Exit CONFIG_EVMS_INFO_ENTRY_EXIT \
2604
+ Everything CONFIG_EVMS_INFO_EVERYTHING" Default
2609
diff -Naur linux-2002-03-28/drivers/evms/Makefile evms-2002-03-28/drivers/evms/Makefile
2610
--- linux-2002-03-28/drivers/evms/Makefile Wed Dec 31 18:00:00 1969
2611
+++ evms-2002-03-28/drivers/evms/Makefile Thu Mar 28 15:13:34 2002
2614
+# Makefile for the kernel EVMS driver and modules.
2616
+# 08 March 2001, Mark Peloquin <peloquin@us.ibm.com>
2619
+O_TARGET := evmsdrvr.o
2621
+export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o md_raid1.o md_raid5.o md_xor.o s390_part.o
2623
+# Link order is important! Plugins must come first, then the EVMS core.
2625
+obj-$(CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN) += ldev_mgr.o
2626
+obj-$(CONFIG_EVMS_DOS_PARTITION_PLUGIN) += dos_part.o
2627
+obj-$(CONFIG_EVMS_MD_PLUGIN) += md_core.o
2628
+obj-$(CONFIG_EVMS_MD_LINEAR_PERS) += md_linear.o
2629
+obj-$(CONFIG_EVMS_MD_RAID0_PERS) += md_raid0.o
2630
+obj-$(CONFIG_EVMS_MD_RAID1_PERS) += md_raid1.o
2631
+obj-$(CONFIG_EVMS_MD_RAID5_PERS) += md_raid5.o md_xor.o
2632
+obj-$(CONFIG_EVMS_LVM_PLUGIN) += lvm_vge.o
2633
+obj-$(CONFIG_EVMS_AIX_PLUGIN) += AIXlvm_vge.o
2634
+obj-$(CONFIG_EVMS_OS2_PLUGIN) += os2lvm_vge.o
2635
+obj-$(CONFIG_EVMS_DRIVELINK_PLUGIN) += evms_drivelink.o
2636
+obj-$(CONFIG_EVMS_BBR_PLUGIN) += evms_bbr.o
2637
+obj-$(CONFIG_EVMS_SNAPSHOT_PLUGIN) += snapshot.o
2638
+obj-$(CONFIG_EVMS_ECR_PLUGIN) += evms_ecr.o
2639
+obj-$(CONFIG_EVMS_S390_PART_PLUGIN) += s390_part.o
2640
+obj-$(CONFIG_EVMS) += evms_passthru.o evms.o
2642
+EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT
2643
+ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y)
2644
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL
2646
+ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y)
2647
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS
2649
+ifeq ($(CONFIG_EVMS_INFO_ERROR),y)
2650
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR
2652
+ifeq ($(CONFIG_EVMS_INFO_WARNING),y)
2653
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING
2655
+ifeq ($(CONFIG_EVMS_INFO_DETAILS),y)
2656
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS
2658
+ifeq ($(CONFIG_EVMS_INFO_DEBUG),y)
2659
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG
2661
+ifeq ($(CONFIG_EVMS_INFO_EXTRA),y)
2662
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA
2664
+ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y)
2665
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT
2667
+ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y)
2668
+ EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING
2671
+include $(TOPDIR)/Rules.make
2673
diff -Naur linux-2002-03-28/drivers/evms/dos_part.c evms-2002-03-28/drivers/evms/dos_part.c
2674
--- linux-2002-03-28/drivers/evms/dos_part.c Wed Dec 31 18:00:00 1969
2675
+++ evms-2002-03-28/drivers/evms/dos_part.c Wed Mar 27 21:24:20 2002
2677
+/* -*- linux-c -*- */
2681
+ * Copyright (c) International Business Machines Corp., 2000
2683
+ * This program is free software; you can redistribute it and/or modify
2684
+ * it under the terms of the GNU General Public License as published by
2685
+ * the Free Software Foundation; either version 2 of the License, or
2686
+ * (at your option) any later version.
2688
+ * This program is distributed in the hope that it will be useful,
2689
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2690
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
2691
+ * the GNU General Public License for more details.
2693
+ * You should have received a copy of the GNU General Public License
2694
+ * along with this program; if not, write to the Free Software
2695
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2700
+ * linux/drivers/evms/dos_part.c
2702
+ * EVMS DOS partition manager
2704
+ * Partial code extracted from
2706
+ * linux/fs/partitions/msdos.c
2710
+#include <linux/config.h>
2711
+#include <linux/module.h>
2712
+#include <linux/kernel.h>
2713
+#include <linux/config.h>
2714
+#include <linux/fs.h>
2715
+#include <linux/genhd.h>
2716
+#include <linux/major.h>
2717
+#include <linux/string.h>
2718
+#include <linux/blk.h>
2719
+#include <linux/init.h>
2720
+#include <linux/iobuf.h> /* for kiobuf stuffs */
2722
+#ifdef CONFIG_BLK_DEV_IDE
2723
+#include <linux/ide.h> /* IDE xlate */
2724
+#endif /* CONFIG_BLK_DEV_IDE */
2726
+#include <linux/evms/evms_kernel.h>
2727
+#include <linux/evms/evms_os2.h>
2729
+#include <asm/system.h>
2730
+#include <asm/uaccess.h>
2732
+/* prefix used in logging messages */
2733
+#define LOG_PREFIX "dos_part: "
2735
+/* #include "msdos.h" */
2736
+#define MSDOS_LABEL_MAGIC 0xAA55
2738
+/* Skeletal MBR/EBR structure useful for our purposes */
2739
+typedef struct mbr_ebr_s {
2740
+ u_int8_t unused1[0x1be];
2741
+ struct partition partitions[4];
2742
+ u_int16_t signature;
2745
+/* Private instance data structure for node we produced */
2746
+typedef struct local_instance_data_s {
2747
+ evms_logical_node_t * source_disk;
2748
+ evms_sector_t start_sect; /* starting LBA */
2749
+ evms_sector_t nr_sects; /* number of sectors */
2750
+ unsigned char type; /* partition type or filesystem format indicator, can be set to 0 */
2751
+} local_instance_data_t;
2753
+/* Structure used to track progress traversing an EBR chain */
2754
+typedef struct extended_part_s {
2755
+ int partition_number;
2756
+ struct partition *extended;
2757
+ u_int64_t start_sect;
2758
+ u_int64_t next_ebr_start;
2762
+/* Global variables */
2763
+static int cur_comp_part_num; /* used to track non-primary
2764
+ * partition numbers
2766
+static int exported_nodes; /* total # of exported segments
2767
+ * produced during this discovery.
2770
+/* External references */
2771
+#if CONFIG_BLK_DEV_MD && CONFIG_AUTODETECT_RAID
2772
+extern void md_autodetect_dev(kdev_t dev);
2776
+static int mbr_ebr_partition_discover(evms_logical_node_t **);
2777
+static int mbr_ebr_partition_delete(evms_logical_node_t *);
2778
+static void mbr_ebr_partition_read(evms_logical_node_t *,
2780
+static void mbr_ebr_partition_write(evms_logical_node_t *,
2782
+static int mbr_ebr_partition_ioctl(evms_logical_node_t *,
2787
+static int mbr_ebr_partition_init_io(evms_logical_node_t *,
2793
+static evms_plugin_function_table_t function_table = {
2794
+ discover: &mbr_ebr_partition_discover,
2795
+ delete : &mbr_ebr_partition_delete,
2796
+ read : &mbr_ebr_partition_read,
2797
+ write : &mbr_ebr_partition_write,
2798
+ init_io : &mbr_ebr_partition_init_io,
2799
+ ioctl : &mbr_ebr_partition_ioctl
2802
+#define EVMS_MSDOS_PARTITION_MANAGER_ID 1
2804
+static evms_plugin_header_t plugin_header = {
2807
+ EVMS_SEGMENT_MANAGER,
2808
+ EVMS_MSDOS_PARTITION_MANAGER_ID),
2814
+ required_common_services_version : {
2819
+ function_table : &function_table
2823
+ * Many architectures don't like unaligned accesses, which is
2824
+ * frequently the case with the nr_sects and start_sect partition
2827
+#include <asm/unaligned.h>
2829
+#define SYS_IND(p) (get_unaligned(&p->sys_ind))
2830
+#define NR_SECTS(p) (u_int64_t)({ __typeof__(p->nr_sects) __a = \
2831
+ get_unaligned(&p->nr_sects); \
2832
+ le32_to_cpu(__a); \
2835
+#define START_SECT(p) (u_int64_t)({ __typeof__(p->start_sect) __a = \
2836
+ get_unaligned(&p->start_sect); \
2837
+ le32_to_cpu(__a); \
2841
+/***************************************************/
2842
+/* List Support - Typedefs, Variables, & Functions */
2843
+/***************************************************/
2847
+typedef struct local_segment_list_node_s {
2848
+ evms_logical_node_t *segment;
2849
+ struct local_segment_list_node_s *next;
2850
+} local_segment_list_node_t;
2852
+typedef struct local_disk_list_node_s {
2853
+ evms_logical_node_t *disk;
2854
+ local_segment_list_node_t *segment_list;
2855
+ struct local_disk_list_node_s *next;
2856
+} local_disk_list_node_t;
2860
+static local_disk_list_node_t *my_disk_list;
2864
+static local_disk_list_node_t **
2866
+ evms_logical_node_t *disk)
2868
+ local_disk_list_node_t **ldln;
2870
+ ldln = &my_disk_list;
2872
+ if ((*ldln)->disk == disk)
2874
+ ldln = &(*ldln)->next;
2879
+static local_segment_list_node_t **
2881
+ local_disk_list_node_t *disk,
2882
+ evms_logical_node_t *segment)
2884
+ local_segment_list_node_t **lsln;
2886
+ lsln = &disk->segment_list;
2888
+ if ((*lsln)->segment == segment)
2890
+ lsln = &(*lsln)->next;
2895
+static evms_logical_node_t *
2896
+find_segment_on_disk(
2897
+ evms_logical_node_t *disk,
2898
+ u_int64_t start_sect,
2899
+ u_int64_t nr_sects)
2901
+ evms_logical_node_t *rc = NULL;
2902
+ local_disk_list_node_t **ldln;
2903
+ local_segment_list_node_t **lsln;
2904
+ local_instance_data_t *lid;
2906
+ ldln = lookup_disk(disk);
2908
+ /* disk found in list */
2909
+ /* attempt to find segment */
2911
+ lsln = &(*ldln)->segment_list;
2913
+ lid = (*lsln)->segment->instance_data;
2914
+ if (lid->start_sect == start_sect)
2915
+ if (lid->nr_sects == nr_sects)
2917
+ lsln = &(*lsln)->next;
2920
+ rc = (*lsln)->segment;
2925
+/* function description: add_segment_to_disk
2927
+ * this function attempts to add a segment to the segment
2928
+ * list of a disk. if the specified disk is not found, it
2929
+ * will be added to the global disk list. this function will
2930
+ * return a pointer to the matching segment in the disk's
2931
+ * segment list. the caller must compare the returned pointer
2932
+ * to the specified segment to see if the
2933
+ * specified segment was already present in the disk's segment
2934
+ * list. if the return pointer matches the specified segment,
2935
+ * then the specified segment was added to the list. if the
2936
+ * return segment pointer to does not match the specified
2937
+ * segment pointer, then the specified segment pointer was
2938
+ * a duplicate and can be thrown away.
2941
+add_segment_to_disk(
2942
+ evms_logical_node_t *disk,
2943
+ evms_logical_node_t *segment)
2946
+ local_disk_list_node_t **ldln, *new_disk;
2947
+ local_segment_list_node_t **lsln, *new_segment;
2949
+ ldln = lookup_disk(disk);
2950
+ if (*ldln == NULL) {
2951
+ /* disk not in list, add disk */
2952
+ rc = evms_cs_allocate_memory((void **)&new_disk,
2953
+ sizeof(*new_disk));
2955
+ new_disk->disk = disk;
2960
+ /* attempt to add segment */
2961
+ lsln = lookup_segment(*ldln, segment);
2962
+ if (*lsln == NULL) {
2963
+ /* segment not in list, add segment */
2964
+ rc = evms_cs_allocate_memory((void **)&new_segment,
2965
+ sizeof(*new_segment));
2967
+ new_segment->segment = segment;
2968
+ *lsln = new_segment;
2977
+remove_segment_from_disk(
2978
+ evms_logical_node_t *disk,
2979
+ evms_logical_node_t *segment,
2980
+ evms_logical_node_t **empty_disk)
2983
+ local_disk_list_node_t **ldln, *tmp_disk_node;
2984
+ local_segment_list_node_t **lsln, *tmp_segment_node;
2986
+ *empty_disk = NULL;
2987
+ ldln = lookup_disk(disk);
2988
+ if (*ldln == NULL) {
2991
+ /* disk found in list */
2992
+ /* attempt to add segment */
2993
+ lsln = lookup_segment(*ldln, segment);
2994
+ if (*lsln == NULL) {
2997
+ tmp_segment_node = *lsln;
2998
+ /* remove segment from list */
2999
+ *lsln = (*lsln)->next;
3000
+ /* free the segment list node */
3001
+ evms_cs_deallocate_memory(tmp_segment_node);
3003
+ if ((*ldln)->segment_list == NULL) {
3004
+ tmp_disk_node = *ldln;
3005
+ *empty_disk = tmp_disk_node->disk;
3006
+ /* remove disk from list */
3007
+ *ldln = (*ldln)->next;
3008
+ /* free the disk list node */
3009
+ evms_cs_deallocate_memory(tmp_disk_node);
3017
+is_extended_partition(struct partition *p)
3019
+ return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
3020
+ SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
3021
+ SYS_IND(p) == LINUX_EXTENDED_PARTITION);
3025
+part_start(struct partition *part, u64 ext_start, u64 ebr_start)
3027
+ u64 pstart = START_SECT(part);
3028
+ pstart += (is_extended_partition(part)) ? ext_start : ebr_start;
3034
+ evms_logical_node_t *node,
3035
+ mbr_ebr_t *mbr_ebr,
3036
+ u_int64_t ext_start,
3037
+ u_int64_t ebr_start)
3039
+ int valid_mbr_ebr, i, j, mbr_flag;
3040
+ struct partition *pi, *pj;
3041
+ u_int64_t pi_start, pi_end, pj_start, pj_end;
3043
+ /* assume an MBR */
3046
+ /* assume its valid */
3047
+ valid_mbr_ebr = TRUE;
3049
+ /* check for valid signature */
3050
+ if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) {
3051
+ LOG_DEBUG("%s: invalid signature on '%s'!\n",
3052
+ __FUNCTION__, node->name);
3053
+ valid_mbr_ebr = FALSE;
3056
+ /* check for an AIX IPL signature */
3057
+ #define IPLRECID 0xc9c2d4c1 /* Value is EBCIDIC 'IBMA' */
3058
+ if ( *(unsigned int *)mbr_ebr == IPLRECID ) {
3059
+ LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n",
3060
+ __FUNCTION__, node->name);
3061
+ valid_mbr_ebr = FALSE;
3065
+ /* check for boot sector fields */
3067
+#if 0 //Remove checking of the first byte
3069
+ /* attempt to make some initial assumptions about
3070
+ * what type of data structure this could be. we
3071
+ * start by checking the 1st byte. we can tell a
3072
+ * few things based on what is or isn't there.
3074
+ if (valid_mbr_ebr == TRUE)
3075
+ switch(*(u_char *)mbr_ebr) {
3076
+ /* check for JMP as 1st instruction
3077
+ * if found, assume (for now), that
3078
+ * this is a boot sector.
3080
+ /* Removed the JMP opcode check because it's not enough to determine
3081
+ * that this sector does not have a valid MBR.
3082
+ * Note: To avoid going thru validation process of partition table,
3083
+ * it's necessary to have a better boot sector check
3084
+ * (eg. JMP opcode && other conditions) */
3087
+ LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__);
3088
+ valid_mbr_ebr = FALSE;
3090
+ /* let this fall thru to pick up the
3091
+ * mbr_flag == FALSE.
3095
+ /* the MBR should contain boot strap
3096
+ * code, so we don't expect the 1st
3097
+ * byte to be a 0x0. If the 1st byte
3098
+ * IS 0x0, its assumed (for now) to
3105
+#endif //Remove checking of the first byte
3107
+ if (valid_mbr_ebr == TRUE) {
3108
+ /* dump the partition table entries in debug mode */
3109
+ LOG_DEBUG("%s: disk relative starts: ext_part(%Ld), ebr(%Ld).\n",
3110
+ __FUNCTION__, ext_start, ebr_start);
3111
+ for (i = 0; i < 4; i++) {
3112
+ pi = &mbr_ebr->partitions[i];
3113
+ LOG_DEBUG("%s: Partition: index(%d), start(%Ld), size(%Ld), sys(0x%x).\n",
3114
+ __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi), SYS_IND(pi));
3116
+ /* check for mbr/ebr partition table validity */
3117
+ for (i = 0; i < 4; i++) {
3118
+ pi = &mbr_ebr->partitions[i];
3119
+ if (NR_SECTS(pi)) {
3120
+ /* check for partition extending past end of node */
3121
+ pi_start = part_start(pi, ext_start, ebr_start);
3122
+ pi_end = pi_start + NR_SECTS(pi) - 1;
3123
+ if ( pi_end >= node->total_vsectors) {
3124
+ LOG_DEBUG("%s: partition(%d) ends(%Ld) beyond the end of the disk(%s,%Ld)!\n",
3125
+ __FUNCTION__, i, pi_end,
3126
+ node->name, node->total_vsectors);
3127
+ valid_mbr_ebr = FALSE;
3129
+ if (valid_mbr_ebr == FALSE) break;
3131
+ /* check for partition overlap */
3132
+ for (j = i + 1; j < 4; j++) {
3133
+ pj = &mbr_ebr->partitions[j];
3134
+ if (NR_SECTS(pj)) {
3135
+ pj_start = part_start(pj, ext_start, ebr_start);
3136
+ pj_end = pj_start + NR_SECTS(pj) - 1;
3137
+ if (pi_start == pj_start) {
3138
+ valid_mbr_ebr = FALSE;
3139
+ } else if (pi_start < pj_start) {
3140
+ if (pi_end >= pj_start)
3141
+ valid_mbr_ebr = FALSE;
3142
+ } else if (pi_start <= pj_end)
3143
+ valid_mbr_ebr = FALSE;
3145
+ if (valid_mbr_ebr == FALSE) {
3146
+ LOG_DEBUG("%s: overlapping partitions(%d,%d) detected on '%s'!\n",
3147
+ __FUNCTION__,i,j, node->name);
3152
+ if (valid_mbr_ebr == FALSE) break;
3156
+ if (valid_mbr_ebr == TRUE) {
3157
+ LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__,
3158
+ (mbr_flag == TRUE) ? 'M' : 'E', node->name);
3160
+ LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n",
3161
+ __FUNCTION__, node->name);
3163
+ return(valid_mbr_ebr);
3167
+ * Function: add_segment
3170
+mbr_ebr_process_segment(
3171
+ evms_logical_node_t **discover_list,
3172
+ evms_logical_node_t *node,
3173
+ u_int64_t start_sect,
3174
+ u_int64_t nr_sects,
3175
+ unsigned char type,
3177
+ char *partition_name)
3179
+ local_instance_data_t *InstData = NULL;
3180
+ evms_logical_node_t *segment;
3183
+ segment = find_segment_on_disk(node, start_sect, nr_sects);
3185
+ LOG_DETAILS("exporting segment '%s'.\n",
3188
+ rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
3190
+ InstData->source_disk = node;
3191
+ InstData->start_sect = start_sect;
3192
+ InstData->nr_sects = nr_sects;
3193
+ InstData->type = type;
3194
+ rc = evms_cs_allocate_logical_node(&segment);
3197
+ segment->plugin = &plugin_header;
3198
+ segment->system_id = (unsigned int)type;
3199
+ segment->total_vsectors = nr_sects;
3200
+ segment->block_size = node->block_size;
3201
+ segment->hardsector_size = node->hardsector_size;
3202
+ segment->instance_data = InstData;
3203
+ segment->flags = node->flags;
3204
+ if (partition_name)
3205
+ strcpy(segment->name, partition_name);
3207
+ strcpy(segment->name, node->name);
3208
+ sprintf(segment->name + strlen(segment->name), "%d", part_num);
3210
+ LOG_DETAILS("creating segment '%s'.\n",
3212
+ rc = add_segment_to_disk(node, segment);
3214
+ LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
3215
+ __FUNCTION__, rc, segment->name);
3218
+ MOD_INC_USE_COUNT;
3223
+ evms_cs_deallocate_memory(InstData);
3225
+ evms_cs_deallocate_logical_node(segment);
3229
+ evms_cs_add_logical_node_to_list(discover_list, segment);
3236
+print_partition_info( char *leading_comment, struct partition *p )
3238
+ LOG_EXTRA("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA(%Lu), sizeLBA(%Lu)\n",
3239
+ leading_comment,p->boot_ind,p->sys_ind,p->cyl,p->head,p->sector,
3240
+ p->end_cyl,p->end_head,p->end_sector,START_SECT(p),NR_SECTS(p));
3243
+#ifdef CONFIG_BSD_DISKLABEL
3244
+#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1
3246
+print_bsd_partition_info( char *leading_comment, struct bsd_partition *p )
3248
+ LOG_EXTRA("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n",
3249
+ leading_comment,p->p_size, p->p_offset, p->p_fsize, p->p_fstype, p->p_frag, p->p_cpg);
3253
+ * bsd_disklabel_partition
3256
+ * - 0 for 0 partition
3257
+ * - (positive) number for number of BSD partitions found
3258
+ * - (negative) error code
3261
+bsd_disklabel_partition(
3262
+ evms_logical_node_t **discover_list,
3263
+ evms_logical_node_t *node,
3264
+ struct partition *bsd)
3266
+ struct bsd_disklabel *l;
3267
+ struct bsd_partition *p;
3268
+ int max_partitions;
3273
+ rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3275
+ rc = INIT_IO(node,
3277
+ START_SECT(bsd) + BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET,
3282
+ l = (struct bsd_disklabel *) data;
3283
+ if (l->d_magic == BSD_DISKMAGIC) {
3285
+ max_partitions = ((SYS_IND(bsd) == OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS
3286
+ : BSD_MAXPARTITIONS);
3287
+ if (l->d_npartitions < max_partitions)
3288
+ max_partitions = l->d_npartitions;
3289
+ for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
3290
+ if (p->p_fstype != BSD_FS_UNUSED) {
3291
+ evmsTRACE2(EVMS_INFO_EXTRA,
3292
+ (print_bsd_partition_info(__FUNCTION__, p)));
3293
+ rc = mbr_ebr_process_segment(
3296
+ (u_int64_t)p->p_offset,
3297
+ (u_int64_t)p->p_size,
3299
+ cur_comp_part_num++,
3309
+ evms_cs_deallocate_memory(data);
3312
+ LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
3317
+#ifdef CONFIG_UNIXWARE_DISKLABEL
3318
+#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29
3321
+ * unixware_partition
3324
+ * - 0 for 0 partition
3325
+ * - (positive) number for number of UNIXWARE partitions found
3326
+ * - (negative) error code
3329
+unixware_partition(
3330
+ evms_logical_node_t **discover_list,
3331
+ evms_logical_node_t *node,
3332
+ struct partition *unixware_part)
3334
+ struct unixware_disklabel *l;
3335
+ struct unixware_slice *p;
3336
+ char *data = NULL;
3340
+ rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3342
+ rc = INIT_IO(node,
3344
+ START_SECT(unixware_part) + UNIXWARE_PART_TABLE_SECTOR_OFFSET,
3348
+ l = (struct unixware_disklabel *)data;
3349
+ if ( le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC &&
3350
+ le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) {
3351
+ p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */
3352
+ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
3353
+ if (p->s_label != UNIXWARE_FS_UNUSED) {
3354
+ rc = mbr_ebr_process_segment(
3359
+ UNIXWARE_PARTITION,
3360
+ cur_comp_part_num++,
3371
+ evms_cs_deallocate_memory(data);
3374
+ LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
3379
+#ifdef CONFIG_SOLARIS_X86_PARTITION
3380
+#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1
3382
+ * solaris_x86_partition
3385
+ * - 0 for 0 partition
3386
+ * - (positive) number for number of solaris partitions found
3387
+ * - (negative) error code
3390
+solaris_x86_partition(
3391
+ evms_logical_node_t **discover_list,
3392
+ evms_logical_node_t *node,
3393
+ struct partition *solaris_x86,
3394
+ int probe_only) /* if TRUE, do not add segments */
3396
+ long offset = START_SECT(solaris_x86);
3397
+ struct solaris_x86_vtoc *v;
3398
+ struct solaris_x86_slice *s;
3400
+ char *data = NULL;
3404
+ rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3406
+ rc = INIT_IO(node,
3408
+ START_SECT(solaris_x86) + SOLARIS_X86_PART_TABLE_SECTOR_OFFSET,
3413
+ v = (struct solaris_x86_vtoc *)data;
3415
+ if (v->v_sanity == SOLARIS_X86_VTOC_SANE) {
3416
+ if (v->v_version != 1) {
3417
+ LOG_WARNING("%s: cannot handle version %d vtoc>\n", __FUNCTION__, v->v_version);
3419
+ for (i=0; i<v->v_nparts; i++) {
3420
+ s = &v->v_slice[i];
3421
+ LOG_EXTRA("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n",
3422
+ i,s->s_tag, s->s_flag, s->s_start, s->s_size, s->s_start + s->s_size -1);
3424
+ if ((s->s_size == 0) || (s->s_tag == 0x05))
3426
+ if (!probe_only) {
3427
+ rc = mbr_ebr_process_segment(
3430
+ (u_int64_t)(s->s_start+offset),
3431
+ (u_int64_t)s->s_size,
3432
+ SOLARIS_X86_PARTITION,
3433
+ cur_comp_part_num++,
3444
+ evms_cs_deallocate_memory(data);
3447
+ LOG_DETAILS("%s: %s (%d) partitions\n",
3448
+ __FUNCTION__, probe_only ? " " : "exported", rc);
3454
+ * os2lvm_partition() looks for DLAT at last sector of the track containing MBR/EBR
3456
+ * Returns: 1 - os2 DLAT was found
3462
+ u_int64_t MBR_EBR_sect,
3463
+ evms_logical_node_t *node,
3464
+ DLA_Table_Sector *dlat)
3466
+ struct hd_geometry geometry;
3468
+ u_int32_t crc_hold;
3470
+ rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long)&geometry);
3472
+ LOG_SERIOUS("%s: ioctl failed(%u) on '%s'\n",
3473
+ __FUNCTION__, rc, node->name);
3474
+ } else if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat)) {
3475
+ if ( (dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1)) &&
3476
+ (dlat->DLA_Signature2 == cpu_to_le32(DLA_TABLE_SIGNATURE2)) ) {
3477
+ crc_hold = le32_to_cpu( dlat->DLA_CRC );
3478
+ dlat->DLA_CRC = 0;
3479
+ if ( evms_cs_calculate_crc( EVMS_INITIAL_CRC, (void *)dlat,
3480
+ node->hardsector_size ) == crc_hold )
3488
+mbr_ebr_process_logical_drive(
3489
+ evms_logical_node_t **discover_list,
3490
+ evms_logical_node_t *node,
3491
+ extended_part_t *ext_info,
3493
+ struct partition *p,
3495
+ DLA_Table_Sector *dlat)
3498
+ char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
3500
+ LOG_EXTRA("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
3501
+ __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
3503
+ if (NR_SECTS(p)) {
3504
+ if (is_extended_partition(p)) {
3505
+ ext_info->next_ebr_start =
3506
+ (u_int64_t)(START_SECT(p) + START_SECT(ext_info->extended));
3507
+ ext_info->done = FALSE; /* not done yet */
3509
+ partition_name = NULL;
3510
+ if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
3511
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == ( ext_info->start_sect + START_SECT(p) ) &&
3512
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
3513
+ dlat->DLA_Array[i].Drive_Letter != '\0' ) {
3514
+ sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
3515
+ partition_name = tmp_buf;
3517
+ evmsTRACE2(EVMS_INFO_EXTRA,
3518
+ (print_partition_info(__FUNCTION__, p)));
3520
+ rc = mbr_ebr_process_segment(
3523
+ ext_info->start_sect + START_SECT(p),
3526
+ cur_comp_part_num++,
3534
+mbr_ebr_process_ebr(
3535
+ evms_logical_node_t **discover_list,
3536
+ evms_logical_node_t *node,
3537
+ extended_part_t *ext_info,
3540
+ int rc = 0, i, os2lvm;
3541
+ struct partition *p;
3542
+ DLA_Table_Sector *dlat = NULL;
3544
+ /* allocate space for the OS2 DLAT info */
3545
+ rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
3547
+ /* read the dlat for this mbr */
3548
+ os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat);
3550
+ /* walk thru the partition table in the mbr
3551
+ * processing each partition record.
3553
+ for (i = 0; i < 4; i++) {
3554
+ p = &ebr->partitions[i];
3555
+ rc = mbr_ebr_process_logical_drive(
3566
+ /* free the space used for OS2 DLAT info */
3568
+ evms_cs_deallocate_memory(dlat);
3574
+mbr_ebr_probe_for_ebr(
3575
+ evms_logical_node_t **discover_list,
3576
+ evms_logical_node_t *node,
3577
+ extended_part_t *ext_info)
3580
+ u_char *sector_buffer = NULL;
3581
+ mbr_ebr_t *ebr = NULL;
3583
+ /* allocate a sector size buffer */
3584
+ rc = evms_cs_allocate_memory((void **)§or_buffer,
3585
+ node->hardsector_size);
3587
+ /* read the location of the mbr sector */
3588
+ rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer);
3591
+ ebr = (mbr_ebr_t *)sector_buffer;
3592
+ if (validate_mbr_ebr(node, ebr,
3593
+ START_SECT(ext_info->extended),
3594
+ ext_info->start_sect) == TRUE)
3595
+ rc = mbr_ebr_process_ebr(
3602
+ if (sector_buffer)
3603
+ evms_cs_deallocate_memory(sector_buffer);
3609
+mbr_ebr_process_extended_partition(
3610
+ evms_logical_node_t **discover_list,
3611
+ evms_logical_node_t *node,
3612
+ struct partition *p)
3615
+ extended_part_t ext_info;
3617
+ memset(&ext_info, 0, sizeof(ext_info));
3618
+ ext_info.done = FALSE;
3619
+ ext_info.extended = p;
3620
+ ext_info.next_ebr_start = START_SECT(p);
3621
+ while (ext_info.done == FALSE) {
3622
+ ext_info.done = TRUE; /* assume done, unless we find another EBR */
3623
+ ext_info.start_sect = ext_info.next_ebr_start;
3624
+ rc = mbr_ebr_probe_for_ebr(
3633
+ * is_non_dos_extended
3635
+ * This function returns TRUE if the partition entry represents a non-DOS
3636
+ * extended partition such as UnixWare, Solaris x86 and BSD
3639
+is_non_dos_extended(
3640
+ evms_logical_node_t **discover_list,
3641
+ evms_logical_node_t *node,
3642
+ struct partition *p)
3644
+ if (NR_SECTS(p)) {
3645
+ #ifdef CONFIG_BSD_DISKLABEL
3646
+ if (SYS_IND(p) == BSD_PARTITION ||
3647
+ SYS_IND(p) == NETBSD_PARTITION ||
3648
+ SYS_IND(p) == OPENBSD_PARTITION)
3652
+ #ifdef CONFIG_UNIXWARE_DISKLABEL
3653
+ if (SYS_IND(p) == UNIXWARE_PARTITION)
3657
+ #ifdef CONFIG_SOLARIS_X86_PARTITION
3658
+ if ( (SYS_IND(p) == SOLARIS_X86_PARTITION) &&
3659
+ (solaris_x86_partition(discover_list, node, p, TRUE) > 0) )
3667
+ * mbr_ebr_process_other_primary_partition
3668
+ * This function processes other (non-DOS) primary partitions such as
3669
+ * UnixWare, Solaris x86 and BSD
3672
+mbr_ebr_process_other_primary_partition(
3673
+ evms_logical_node_t **discover_list,
3674
+ evms_logical_node_t *node,
3675
+ struct partition *p)
3677
+ if (NR_SECTS(p)) {
3678
+ #ifdef CONFIG_BSD_DISKLABEL
3679
+ if (SYS_IND(p) == BSD_PARTITION ||
3680
+ SYS_IND(p) == NETBSD_PARTITION ||
3681
+ SYS_IND(p) == OPENBSD_PARTITION)
3682
+ return bsd_disklabel_partition(discover_list, node, p);
3685
+ #ifdef CONFIG_UNIXWARE_DISKLABEL
3686
+ if (SYS_IND(p) == UNIXWARE_PARTITION)
3687
+ return unixware_partition(discover_list, node, p);
3690
+ #ifdef CONFIG_SOLARIS_X86_PARTITION
3691
+ if (SYS_IND(p) == SOLARIS_X86_PARTITION)
3692
+ return solaris_x86_partition(discover_list, node, p, FALSE);
3699
+mbr_ebr_process_dos_primary_partition(
3700
+ evms_logical_node_t **discover_list,
3701
+ evms_logical_node_t *node,
3703
+ struct partition *p,
3705
+ DLA_Table_Sector *dlat)
3708
+ char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
3710
+ LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
3711
+ __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
3713
+ if (NR_SECTS(p)) {
3715
+ if (is_extended_partition(p))
3716
+ rc = mbr_ebr_process_extended_partition(
3717
+ discover_list,node,p);
3720
+ partition_name = NULL;
3721
+ if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
3722
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == START_SECT(p) &&
3723
+ le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
3724
+ dlat->DLA_Array[i].Drive_Letter != '\0' ) {
3725
+ sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
3726
+ partition_name = tmp_buf;
3728
+ evmsTRACE2(EVMS_INFO_EXTRA,
3729
+ (print_partition_info(__FUNCTION__, p)));
3731
+ rc = mbr_ebr_process_segment(
3745
+mbr_ebr_process_mbr(
3746
+ evms_logical_node_t **discover_list,
3747
+ evms_logical_node_t *node,
3750
+ int rc = 0, i, os2lvm;
3751
+ struct partition *p;
3752
+ DLA_Table_Sector *dlat = NULL;
3754
+ cur_comp_part_num = 5; /* set this value for each disk */
3756
+ /* allocate space for the OS2 DLAT info */
3757
+ rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
3759
+ /* read the dlat for this mbr */
3760
+ os2lvm = os2lvm_partition(0, node, dlat);
3762
+ /* Pass 1: walk thru the partition table in the mbr
3763
+ * processing each partition record.
3765
+ for (i = 0; i < 4; i++) {
3766
+ p = &mbr->partitions[i];
3767
+ if (is_non_dos_extended(discover_list, node, p)) {
3768
+ LOG_DETAILS(" Found and skip a non-dos extended partition.\n");
3772
+ mbr_ebr_process_dos_primary_partition(
3781
+ /* Pass 2: walk thru the partition table in the mbr
3782
+ * processing each partition record for non-DOS extended partitions
3784
+ for (i = 0; i < 4; i++) {
3785
+ p = &mbr->partitions[i];
3786
+ mbr_ebr_process_other_primary_partition(
3794
+ /* free the space used for OS2 DLAT info */
3796
+ evms_cs_deallocate_memory(dlat);
3802
+mbr_ebr_probe_for_mbr(
3803
+ evms_logical_node_t **discover_list,
3804
+ evms_logical_node_t *node)
3807
+ u_char *sector_buffer = NULL;
3808
+ mbr_ebr_t *mbr = NULL;
3810
+ LOG_DEBUG("%s: probing (%s).\n",
3811
+ __FUNCTION__, node->name);
3813
+ /* allocate a sector size buffer */
3814
+ rc = evms_cs_allocate_memory((void **)§or_buffer,
3815
+ node->hardsector_size);
3817
+ /* read the location of the mbr sector */
3818
+ rc = INIT_IO(node, 0, 0, 1, sector_buffer);
3820
+ LOG_ERROR("%s: read error(%d) on '%s'.\n",
3821
+ __FUNCTION__, rc, node->name);
3823
+ mbr = (mbr_ebr_t *)sector_buffer;
3824
+ if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) {
3825
+ /* since it looks like this disk has a
3826
+ * valid MBR, remove the disk node from
3827
+ * the discover list. it may already be
3828
+ * on the global list, or it will be
3829
+ * added to it. in the case of an mbr
3830
+ * with no partitions, it is simply
3831
+ * removed and forgotten. when one or
3832
+ * more partitions are created, the
3833
+ * disk will be examined and handled
3834
+ * properly during the following
3835
+ * rediscover operation.
3837
+ evms_cs_remove_logical_node_from_list(
3838
+ discover_list, node);
3840
+ rc = mbr_ebr_process_mbr(discover_list,node,mbr);
3844
+ if (sector_buffer)
3845
+ evms_cs_deallocate_memory(sector_buffer);
3851
+ * Function: mbr_ebr_partition_discover
3855
+mbr_ebr_partition_discover(evms_logical_node_t **discover_list)
3858
+ evms_logical_node_t *node, *next_node;
3860
+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
3862
+ /* initialize global variable */
3863
+ exported_nodes = 0;
3865
+ /* examine each node on the discover list */
3866
+ next_node = *discover_list;
3867
+ while(next_node) {
3869
+ next_node = node->next;
3870
+ if (node->plugin->id == plugin_header.id)
3871
+ /* don't recurse into our own objects
3874
+ mbr_ebr_probe_for_mbr(discover_list,node);
3877
+ LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
3878
+ __FUNCTION__, exported_nodes, rc);
3879
+ if (exported_nodes)
3880
+ rc = exported_nodes;
3885
+ * Function: mbr_ebr_partition_delete
3889
+mbr_ebr_partition_delete(evms_logical_node_t *segment)
3892
+ local_instance_data_t *LID;
3893
+ evms_logical_node_t *empty_disk = NULL;
3895
+ LOG_DETAILS("deleting segment '%s'.\n",segment->name);
3900
+ LID = segment->instance_data;
3902
+ /* remove the segment from the
3903
+ * disk's segment list
3905
+ rc = remove_segment_from_disk(
3909
+ /* free the local instance data */
3910
+ evms_cs_deallocate_memory(LID);
3912
+ /* free the segment node */
3913
+ evms_cs_deallocate_logical_node(segment);
3914
+ MOD_DEC_USE_COUNT;
3915
+ /* if the last segment on the disk was
3916
+ * deleted, delete the disk node too
3919
+ DELETE(empty_disk);
3925
+ * function: mbr_ebr_partition_io_error
3927
+ * this function was primarily created because the function
3928
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
3929
+ * to be set on inline functions. Since this was an error path
3930
+ * and not mainline, I decided to add a trace statement to help
3931
+ * report on the failing condition.
3935
+mbr_ebr_partition_io_error(
3936
+ evms_logical_node_t *node,
3940
+ LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
3941
+ (io_flag) ? "WRITE" : "READ",
3942
+ node->total_vsectors - 1,
3946
+ EVMS_IO_ERROR(eio);
3950
+ * Function: mbr_ebr_partition_read
3954
+mbr_ebr_partition_read(
3955
+ evms_logical_node_t *partition,
3958
+ local_instance_data_t *LID = partition->instance_data;
3960
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
3961
+ eio->rsector += LID->start_sect;
3962
+ R_IO(LID->source_disk, eio);
3964
+ mbr_ebr_partition_io_error(partition, READ, eio);
3968
+ * Function: mbr_ebr_partition_write
3972
+mbr_ebr_partition_write(
3973
+ evms_logical_node_t *partition,
3976
+ local_instance_data_t *LID = partition->instance_data;
3978
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
3979
+ eio->rsector += LID->start_sect;
3980
+ W_IO(LID->source_disk, eio);
3982
+ mbr_ebr_partition_io_error(partition, WRITE, eio);
3986
+ * Function: mbr_ebr_partition_init_io
3990
+mbr_ebr_partition_init_io(
3991
+ evms_logical_node_t *partition,
3992
+ int io_flag, /* 0=read, 1=write*/
3993
+ evms_sector_t sect_nr, /* disk LBA */
3994
+ evms_sector_t num_sects, /* # of sectors */
3995
+ void *buf_addr) /* buffer address */
3998
+ local_instance_data_t *LID = partition->instance_data;
4000
+ if ((sect_nr + num_sects) <= partition->total_vsectors) {
4001
+ rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
4003
+ LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
4004
+ (io_flag) ? "WRITE" : "READ",
4006
+ (LID->nr_sects - 1),
4007
+ sect_nr, num_sects);
4015
+ * Function: mbr_ebr_partition_ioctl
4019
+mbr_ebr_partition_ioctl (
4020
+ evms_logical_node_t *partition,
4021
+ struct inode *inode,
4022
+ struct file *file,
4024
+ unsigned long arg)
4026
+ local_instance_data_t *LID;
4027
+ struct hd_geometry hd_geo;
4031
+ LID = partition->instance_data;
4037
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
4039
+ if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
4042
+ hd_geo.start = LID->start_sect;
4043
+ if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
4047
+ case EVMS_GET_BMAP:
4049
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
4050
+ bmap->rsector += LID->start_sect;
4051
+ /* intentionally fall thru to
4052
+ * default ioctl down to device
4057
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
4063
+ * Function: dos_part_init
4067
+dos_part_init(void)
4069
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
4073
+dos_part_exit(void)
4075
+ evms_cs_unregister_plugin(&plugin_header);
4078
+module_init(dos_part_init);
4079
+module_exit(dos_part_exit);
4080
+#ifdef MODULE_LICENSE
4081
+MODULE_LICENSE("GPL");
4084
diff -Naur linux-2002-03-28/drivers/evms/evms.c evms-2002-03-28/drivers/evms/evms.c
4085
--- linux-2002-03-28/drivers/evms/evms.c Wed Dec 31 18:00:00 1969
4086
+++ evms-2002-03-28/drivers/evms/evms.c Thu Mar 28 15:43:00 2002
4088
+/* -*- linux-c -*- */
4092
+ * Copyright (c) International Business Machines Corp., 2000
4094
+ * This program is free software; you can redistribute it and/or modify
4095
+ * it under the terms of the GNU General Public License as published by
4096
+ * the Free Software Foundation; either version 2 of the License, or
4097
+ * (at your option) any later version.
4099
+ * This program is distributed in the hope that it will be useful,
4100
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4101
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
4102
+ * the GNU General Public License for more details.
4104
+ * You should have received a copy of the GNU General Public License
4105
+ * along with this program; if not, write to the Free Software
4106
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
4112
+ * linux/drivers/evms/evms.c
4114
+ * EVMS Base and Common Services
4118
+#define DEVICE_NR(device) MINOR(device) /* evms has no partition bits */
4119
+#define DEVICE_NAME "evms" /* name for messaging */
4120
+#define DEVICE_NO_RANDOM /* no entropy to contribute */
4121
+#define DEVICE_OFF(d) /* do nothing */
4123
+#include <linux/config.h>
4124
+#include <linux/module.h>
4125
+#include <linux/errno.h>
4126
+#include <linux/kernel.h>
4127
+#include <linux/init.h>
4128
+#include <linux/fs.h>
4129
+#include <linux/major.h>
4130
+#include <linux/slab.h>
4131
+#include <asm/uaccess.h>
4132
+#include <linux/blk.h> /* must be included by all block drivers */
4133
+#include <linux/blkdev.h>
4134
+#include <linux/blkpg.h>
4135
+#include <linux/iobuf.h>
4136
+#include <linux/genhd.h>
4137
+#include <linux/major.h>
4138
+#include <linux/sched.h>
4139
+#include <linux/version.h>
4140
+#include <linux/swap.h>
4141
+#include <net/checksum.h>
4142
+#include <linux/sysctl.h>
4143
+#include <linux/smp_lock.h>
4144
+#include <linux/evms/evms_kernel.h>
4146
+//#define VFS_PATCH_PRESENT
4148
+/* prefix used in logging messages */
4151
+typedef struct evms_registered_plugin_s {
4152
+ evms_plugin_header_t * plugin;
4153
+ struct evms_registered_plugin_s * next;
4154
+} evms_registered_plugin_t;
4155
+static evms_registered_plugin_t * registered_plugin_head = NULL;
4157
+static evms_list_node_t *evms_global_device_list = NULL;
4158
+static evms_list_node_t *evms_global_feature_node_list = NULL;
4159
+static evms_list_node_t *evms_global_notify_list = NULL;
4161
+int evms_info_level = EVMS_INFO_LEVEL;
4162
+struct proc_dir_entry *evms_proc_dir = NULL;
4163
+EXPORT_SYMBOL(evms_info_level);
4164
+static evms_logical_volume_t * evms_logical_volumes;
4165
+static int evms_volumes = 0;
4166
+/* a few variables to aid in detecting memory leaks.
4167
+ * these variables are always in use, regardless of
4168
+ * the state of EVMS_MEM_DEBUG.
4170
+static atomic_t evms_allocs;
4171
+static atomic_t evms_logical_nodes;
4173
+char *evms_primary_string = "primary";
4174
+EXPORT_SYMBOL(evms_primary_string);
4175
+char *evms_secondary_string = "secondary";
4176
+EXPORT_SYMBOL(evms_secondary_string);
4178
+static evms_version_t evms_svc_version = {
4179
+ major : EVMS_COMMON_SERVICES_MAJOR,
4180
+ minor : EVMS_COMMON_SERVICES_MINOR,
4181
+ patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
4184
+static int evms_discover_volumes(evms_rediscover_t *);
4186
+/* Handles for "private" EVMS object pools */
4187
+static evms_pool_mgmt_t *evms_io_notify_pool;
4189
+/* Handles for "public" EVMS object pools */
4190
+evms_pool_mgmt_t *evms_bh_pool;
4191
+EXPORT_SYMBOL(evms_bh_pool);
4193
+/* Handle for the devfs directory entry */
4194
+devfs_handle_t evms_dir_devfs_handle;
4195
+devfs_handle_t evms_blk_devfs_handle;
4198
+/**********************************************************/
4199
+/* SYSCTL - EVMS folder */
4200
+/**********************************************************/
4202
+#ifdef CONFIG_PROC_FS
4203
+static struct ctl_table_header *evms_table_header;
4204
+static int evms_info_level_min = EVMS_INFO_CRITICAL;
4205
+static int evms_info_level_max = EVMS_INFO_EVERYTHING;
4207
+static ctl_table evms_table[] = {
4208
+ {DEV_EVMS_INFO_LEVEL, "evms_info_level",
4209
+ &evms_info_level, sizeof(int), 0644, NULL,
4210
+ &proc_dointvec_minmax, &sysctl_intvec,
4211
+ NULL, &evms_info_level_min, &evms_info_level_max},
4215
+static ctl_table evms_dir_table[] = {
4216
+ {DEV_EVMS, "evms", NULL, 0, 0555, evms_table},
4220
+static ctl_table dev_dir_table[] = {
4221
+ {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
4226
+/**********************************************************/
4227
+/* START -- exported functions/Common Services */
4228
+/**********************************************************/
4231
+ * Function: evms_cs_get_version
4232
+ * Description: This function returns the current EVMS version
4235
+evms_cs_get_version(int * major, int *minor)
4237
+ *major = EVMS_MAJOR_VERSION;
4238
+ *minor = EVMS_MINOR_VERSION;
4240
+EXPORT_SYMBOL(evms_cs_get_version);
4243
+evms_cs_check_version(
4244
+ evms_version_t *required,
4245
+ evms_version_t *actual)
4249
+ if (required->major != actual->major)
4251
+ else if (required->minor > actual->minor)
4253
+ else if (required->minor == actual->minor)
4254
+ if (required->patchlevel > actual->patchlevel)
4258
+EXPORT_SYMBOL(evms_cs_check_version);
4260
+#ifdef EVMS_MEM_DEBUG
4261
+#define EVMS_MEM_SSIGNATURE 0x4D444D63 //SMEM
4262
+typedef struct memobj_head_s {
4263
+ unsigned int ssignature;
4264
+ struct memobj_head_s *next;
4266
+ struct memobj_tail_s *tail;
4268
+#define EVMS_MEM_ESIGNATURE 0x4D444D44 //EMEM
4269
+typedef struct memobj_tail_s {
4270
+ unsigned int esignature;
4271
+ memobj_head_t *head;
4274
+static memobj_head_t *memobj_head = NULL;
4275
+static spinlock_t mem_debug_lock = SPIN_LOCK_UNLOCKED;
4278
+ * function description: evms_cs_verify_memory_integrity
4280
+ * the count of memory objects in the list
4281
+ * the starting signature (SSIGNATURE) hasn't been overwritten
4282
+ * the ending signature (ESIGNATURE) hasn't been overwritten
4284
+ * op_flag: controls the behaviour when a problem is found
4285
+ * 0 = stop immediately where a problem is found
4286
+ * !0 = don't stop, but report problem(s) exist, via return code
4289
+evms_cs_verify_memory_integrity(int op_flag)
4291
+ int rc = 0, objcount;
4292
+ memobj_head_t *mobj, **ppmobj;
4293
+ memobj_tail_t *mobjtail;
4295
+ /* verify each object in the linked list */
4297
+ spin_lock(&mem_debug_lock);
4298
+ ppmobj = &memobj_head;
4302
+ /* verify starting signature */
4303
+ if (mobj->ssignature != EVMS_MEM_SSIGNATURE) {
4309
+ /* verify ending signature */
4310
+ mobjtail = mobj->tail;
4311
+ if (mobjtail->esignature != EVMS_MEM_ESIGNATURE) {
4317
+ ppmobj = &(*ppmobj)->next;
4319
+ spin_unlock(&mem_debug_lock);
4320
+ /* verify object count */
4321
+ if (objcount != evms_allocs) {
4329
+EXPORT_SYMBOL(evms_cs_verify_memory_integrity);
4333
+ * function: evms_cs_allocate_memory
4335
+ * This function is a wrapper function for the kernel malloc
4336
+ * (kmalloc) function. It provides a consistent method of
4337
+ * allocating kernel memory for all evms code.
4340
+ * This function takes as arguments:
4342
+ * **pp: the address of the pointer which is to contain the
4343
+ * the address of the allocated memory object.
4344
+ * size: the size in bytes of the memory object to be
4348
+ * This function returns:
4350
+ * *pp = NULL, and return set to -ENOMEM when there is
4351
+ * insufficient memory to satisfy the request.
4355
+ * *pp = NULL, and return set to 0 when the specified
4356
+ * size is invalid.
4360
+ * *pp is set to the address of the allocated memory object
4361
+ * and return code is set to 0.
4364
+ * NOTE: Defining EVMS_MEM_DEBUG turns on memory integrity
4365
+ * checking. This wraps each memory object with a
4366
+ * header and trailer. The header and trailer contain
4367
+ * signatures and sizes that are used to verify that
4368
+ * existing memory objects have not been overwritten.
4369
+ * Refer to the evms_cs_verify_memory_integrity
4370
+ * function for more details.
4373
+evms_cs_allocate_memory(void **pp, int size)
4377
+#ifdef EVMS_MEM_DEBUG
4378
+ memobj_head_t *mobj, **ppmobj;
4379
+ memobj_tail_t *mobjtail;
4381
+ /* verify a valid size parameter was specified */
4383
+ /* return NULL on invalid size */
4386
+#ifdef EVMS_MEM_DEBUG
4387
+ size += sizeof(memobj_head_t) + sizeof(memobj_tail_t);
4389
+// *pp = kmalloc(size, GFP_KERNEL);
4390
+ *pp = kmalloc(size, GFP_NOIO);
4394
+#ifdef EVMS_MEM_DEBUG
4395
+ /* adjust variables to caller values */
4396
+ mobj = (memobj_head_t *)*pp;
4397
+ *pp += sizeof(memobj_head_t);
4398
+ size -= sizeof(memobj_head_t) + sizeof(memobj_tail_t);
4400
+ /* setup memobj head */
4401
+ mobj->ssignature = EVMS_MEM_SSIGNATURE;
4402
+ mobj->size = size;
4404
+ /* setup memobj tail */
4405
+ mobjtail = (memobj_tail_t *)(*pp + size);
4406
+ mobjtail->esignature = EVMS_MEM_ESIGNATURE;
4407
+ mobj->tail = mobjtail;
4408
+ mobjtail->head = mobj;
4410
+ /* add mobj to linked list */
4412
+ spin_lock(&mem_debug_lock);
4413
+ ppmobj = &memobj_head;
4414
+ while(*ppmobj > mobj)
4415
+ ppmobj = &(*ppmobj)->next;
4416
+ mobj->next = *ppmobj;
4418
+ spin_unlock(&mem_debug_lock);
4420
+ memset(*pp, 0, size);
4421
+ atomic_inc(&evms_allocs);
4425
+#ifdef EVMS_MEM_DEBUG
4426
+ evms_cs_verify_memory_integrity(0);
4430
+EXPORT_SYMBOL(evms_cs_allocate_memory);
4433
+evms_cs_deallocate_memory(void *p)
4435
+#ifdef EVMS_MEM_DEBUG
4436
+ memobj_head_t *mobj, **ppmobj;
4438
+ evms_cs_verify_memory_integrity(0);
4440
+ /* init ptr to memobj structure */
4441
+ mobj = (memobj_head_t *)(p - sizeof(memobj_head_t));
4443
+ /* find mobj in linked list */
4444
+ spin_lock(&mem_debug_lock);
4445
+ ppmobj = &memobj_head;
4446
+ while(*ppmobj != mobj)
4447
+ ppmobj = &(*ppmobj)->next;
4448
+ *ppmobj = mobj->next;
4449
+ spin_unlock(&mem_debug_lock);
4452
+ atomic_dec(&evms_allocs);
4455
+EXPORT_SYMBOL(evms_cs_deallocate_memory);
4458
+evms_cs_allocate_logical_node(evms_logical_node_t **pp)
4462
+ rc = evms_cs_allocate_memory((void **)pp, sizeof(evms_logical_node_t));
4464
+ atomic_inc(&evms_logical_nodes);
4467
+EXPORT_SYMBOL(evms_cs_allocate_logical_node);
4470
+evms_cs_deallocate_volume_info(evms_logical_node_t *p)
4472
+ if (p->iflags & EVMS_FEATURE_BOTTOM) {
4473
+ evms_cs_remove_item_from_list(
4474
+ &evms_global_feature_node_list, p);
4475
+ evms_cs_deallocate_memory(p->volume_info);
4476
+ p->volume_info = NULL;
4477
+ p->iflags &= ~EVMS_FEATURE_BOTTOM;
4480
+EXPORT_SYMBOL(evms_cs_deallocate_volume_info);
4483
+evms_cs_deallocate_logical_node(evms_logical_node_t *p)
4486
+ LOG_SERIOUS("Deallocating object whose NEXT ptr is not null!!\n");
4488
+ evms_cs_deallocate_volume_info(p);
4489
+ if (p->feature_header) {
4490
+ evms_cs_deallocate_memory(p->feature_header);
4491
+ p->feature_header = NULL;
4493
+ evms_cs_deallocate_memory(p);
4494
+ atomic_dec(&evms_logical_nodes);
4497
+EXPORT_SYMBOL(evms_cs_deallocate_logical_node);
4500
+ * Function: evms_cs_register_plugin
4501
+ * Description: This function is exported so that all plugins can register with EVMS
4504
+evms_cs_register_plugin(evms_plugin_header_t * plugin)
4507
+ evms_registered_plugin_t *reg_record, **pp;
4508
+ evms_version_t *ver;
4510
+ ver = &plugin->required_common_services_version;
4512
+ LOG_EXTRA("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
4513
+ GetPluginOEM(plugin->id),
4514
+ GetPluginType(plugin->id),
4515
+ GetPluginID(plugin->id),
4516
+ plugin->version.major,
4517
+ plugin->version.minor,
4518
+ plugin->version.patchlevel,
4523
+ /* check common services requirements */
4524
+ rc = evms_cs_check_version(ver, &evms_svc_version);
4526
+ LOG_SERIOUS("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n",
4527
+ EVMS_COMMON_SERVICES_MAJOR,
4528
+ EVMS_COMMON_SERVICES_MINOR,
4529
+ EVMS_COMMON_SERVICES_PATCHLEVEL);
4532
+ /* ensure a plugin with this feature id is
4533
+ * not already loaded.
4535
+ for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next) {
4536
+ if ((*pp)->plugin->id == plugin->id) {
4538
+ LOG_ERROR("error(%d) attempting to load another plugin with id(%x).\n",
4544
+ /* ensure the plugin has provided functions for
4545
+ * the mandatory entry points.
4547
+ if (!plugin->function_table->discover) {
4549
+ } else if (!plugin->function_table->init_io) {
4551
+ } else if (!plugin->function_table->ioctl) {
4553
+ } else if (!plugin->function_table->read) {
4555
+ } else if (!plugin->function_table->write) {
4557
+ } else if (!plugin->function_table->delete) {
4562
+ /* allocate a new plugin registration record */
4563
+ rc = evms_cs_allocate_memory((void **)®_record,
4564
+ sizeof(evms_registered_plugin_t));
4567
+ /* store ptr to plugin header in new registration record */
4568
+ reg_record->plugin = plugin;
4570
+ /* terminate the record */
4571
+ reg_record->next = NULL;
4573
+ /* find end of the plugin registration list */
4574
+ for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next);
4575
+ /* add registration record to list */
4578
+ /* increment the usage count */
4579
+ MOD_INC_USE_COUNT;
4584
+EXPORT_SYMBOL(evms_cs_register_plugin);
4587
+ * Function: evms_cs_unregister_plugin
4588
+ * Description: This function is exported so that all plugins can
4589
+ * unregister with EVMS
4592
+evms_cs_unregister_plugin(evms_plugin_header_t * plugin)
4594
+ int rc = 0, found = FALSE;
4595
+ evms_registered_plugin_t **pp;
4596
+ evms_version_t *ver;
4598
+ ver = &plugin->required_common_services_version;
4600
+ LOG_EXTRA("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
4601
+ GetPluginOEM(plugin->id),
4602
+ GetPluginType(plugin->id),
4603
+ GetPluginID(plugin->id),
4604
+ plugin->version.major,
4605
+ plugin->version.minor,
4606
+ plugin->version.patchlevel,
4610
+ /* ensure a plugin with this feature id is
4611
+ * currently loaded.
4613
+ for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next) {
4614
+ if ((*pp)->plugin->id == plugin->id) {
4621
+ LOG_ERROR("error(%d) attempt to unload a non-loaded plugin with id(%x).\n",
4624
+ /* actually unload the plugin now */
4626
+ evms_registered_plugin_t * tmp = *pp;
4628
+ /* remove the plugin record from our
4629
+ * internal plugin list
4631
+ *pp = (*pp)->next;
4632
+ /* deallocate the plugin registration record
4634
+ evms_cs_deallocate_memory(tmp);
4636
+ /* decrement the usage count */
4637
+ MOD_DEC_USE_COUNT;
4641
+EXPORT_SYMBOL(evms_cs_unregister_plugin);
4643
+/* function: evms_cs_add_logical_node_to_list
4645
+ * This functions adds a new logical node to the end of a
4648
+ * NOTE: This function is only expected to be called at
4649
+ * discovery time, which is singled threaded by nature,
4650
+ * and therefore doesn't need to be made SMP safe.
4653
+evms_cs_add_logical_node_to_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
4656
+ evms_logical_node_t **pp = NULL;
4658
+ /* check to make sure node is not already on a list */
4662
+ /* check to make sure node being added is not already in the list */
4663
+ for (pp = list_head; *pp; pp = &(*pp)->next)
4664
+ if (*pp == node) {
4669
+ /* add node to the end of the list */
4675
+EXPORT_SYMBOL(evms_cs_add_logical_node_to_list);
4677
+/* function: evms_cs_remove_logical_node_from_list
4679
+ * This functions removes a new logical node from a node list.
4681
+ * NOTE: This function is only expected to be called at
4682
+ * discovery time, which is singled threaded by nature,
4683
+ * and therefore doesn't need to be made SMP safe.
4686
+evms_cs_remove_logical_node_from_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
4688
+ /* remove this node from the head of the list */
4689
+ int rc = 1; /* assume failure until target node is found */
4690
+ evms_logical_node_t **pp;
4691
+ for (pp = list_head; *pp; pp = &(*pp)->next)
4692
+ if (*pp == node) {
4693
+ *pp = (*pp)->next;
4694
+ node->next = NULL;
4700
+EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list);
4703
+evms_cs_kernel_ioctl(evms_logical_node_t *node, unsigned int cmd, unsigned long arg)
4706
+ struct inode tmp_inode;
4712
+ rc = IOCTL(node, &tmp_inode, NULL, cmd, arg);
4719
+EXPORT_SYMBOL(evms_cs_kernel_ioctl);
4722
+ * function: evms_cs_size_in_vsectors
4724
+ * In EVMS a V(irtual)Sector is 512 bytes in size.
4725
+ * This function computes the number of VSECTORs an specified
4726
+ * item size would require.
4728
+ * NOTE: This function has been coded to work with 64 bit values.
4731
+evms_cs_size_in_vsectors(long long item_size)
4733
+ long long sectors;
4735
+ sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT;
4736
+ if (item_size & (EVMS_VSECTOR_SIZE - 1))
4741
+EXPORT_SYMBOL(evms_cs_size_in_vsectors);
4744
+ * function: evms_cs_log2
4746
+ * this function computes the power of the 2 of specified
4747
+ * value. If the value is 0, a -1 is returned. If the value
4748
+ * is NOT a power of 2, a -2 is return. Otherwise the power
4749
+ * of 2 is returned.
4751
+int evms_cs_log2(long long value)
4759
+ while(!(tmp & 1)) {
4769
+EXPORT_SYMBOL(evms_cs_log2);
4774
+ * build_crc_table()
4778
+ * Description: The functions in this module provide a means of calculating
4779
+ * the 32 bit CRC for a block of data. build_crc_table must
4780
+ * be called to initialize this module. calculate_crc must
4781
+ * NOT be used until after build_crc_table has been called.
4782
+ * Once build_crc_table has been called, calculate_crc can
4783
+ * be used to calculate the crc of the data residing in a
4784
+ * user specified buffer.
4788
+#define CRC_POLYNOMIAL 0xEDB88320L
4790
+static u_int32_t crc_table[256];
4791
+static u_int32_t crc_table_built = FALSE;
4793
+/*********************************************************************/
4795
+/* Function Name: build_crc_table */
4797
+/* Descriptive Name: This module implements the crc function using */
4798
+/* a table driven method. The required table */
4799
+/* must be setup before the calculate_crc */
4800
+/* function can be used. This table only needs */
4801
+/* to be set up once. This function sets up the */
4802
+/* crc table needed by calculate_crc. */
4808
+/* Error Handling: N/A */
4810
+/* Side Effects: The internal crc table is initialized. */
4814
+/*********************************************************************/
4816
+build_crc_table( void )
4818
+ u_int32_t i, j, crc;
4820
+ for (i = 0; i <= 255; i++) {
4822
+ for (j = 8; j > 0; j--) {
4824
+ crc = (crc >> 1) ^ CRC_POLYNOMIAL;
4828
+ crc_table[i] = crc;
4830
+ crc_table_built = TRUE;
4833
+/*********************************************************************/
4835
+/* Function Name: calculate_crc */
4837
+/* Descriptive Name: This function calculates the crc value for */
4838
+/* the data in the buffer specified by Buffer. */
4840
+/* Input: u_int32_t crc : This is the starting crc. If you are */
4841
+/* starting a new crc calculation, then */
4842
+/* this should be set to 0xFFFFFFFF. If */
4843
+/* you are continuing a crc calculation */
4844
+/* (i.e. all of the data did not fit in */
4845
+/* the buffer so you could not calculate */
4846
+/* the crc in a single operation), then */
4847
+/* this is the crc output by the last */
4848
+/* calculate_crc call. */
4850
+/* Output: The crc for the data in the buffer, based upon the value*/
4851
+/* of the input parameter crc. */
4853
+/* Error Handling: None. */
4855
+/* Side Effects: None. */
4859
+/*********************************************************************/
4861
+evms_cs_calculate_crc(u_int32_t crc, void * buffer, u_int32_t buffersize)
4863
+ unsigned char * current_byte;
4864
+ u_int32_t temp1, temp2, i;
4866
+ current_byte = (unsigned char *) buffer;
4867
+ /* Make sure the crc table is available */
4868
+ if (crc_table_built==FALSE) build_crc_table();
4869
+ /* Process each byte in the buffer. */
4870
+ for (i = 0; i < buffersize; i++) {
4871
+ temp1 = (crc >> 8) & 0x00FFFFFF;
4872
+ temp2 = crc_table[(crc ^ (u_int32_t)*current_byte) & (u_int32_t)0xff];
4874
+ crc = temp1 ^ temp2;
4878
+EXPORT_SYMBOL(evms_cs_calculate_crc);
4880
+#define EVMS_ORIGINAL_CALLBACK_FLAG 1<<0
4881
+typedef struct io_notify_s {
4882
+ unsigned int flags;
4884
+ struct buffer_head *bh;
4885
+ u_int64_t rsector;
4887
+ void (*callback_function)(evms_logical_node_t *node,
4888
+ struct buffer_head *bh,
4889
+ int uptodate, int *redrive);
4890
+ struct io_notify_s *next;
4894
+evms_cs_create_pool(
4897
+ void (*ctor)(void*, kmem_cache_t *, unsigned long),
4898
+ void (*dtor)(void*, kmem_cache_t *, unsigned long))
4900
+ evms_pool_mgmt_t *pool;
4902
+ /* create the pool management structure */
4903
+ if (evms_cs_allocate_memory((void **)&pool, sizeof(evms_pool_mgmt_t))) {
4904
+ panic("Cannot create %s fpool mgmt structure", pool_name);
4906
+ /* initialize various field in pool mgmt structure */
4907
+ pool->member_size = objsize;
4908
+ pool->name = pool_name;
4909
+ atomic_set(&pool->waiters, 0);
4910
+ init_waitqueue_head(&pool->wait_queue);
4911
+ /* go create the pool */
4912
+ pool->cachep = kmem_cache_create(
4914
+ pool->member_size,
4916
+ SLAB_HWCACHE_ALIGN,
4919
+ panic("Cannot create %s SLAB cache", pool->name);
4922
+EXPORT_SYMBOL(evms_cs_create_pool);
4925
+evms_cs_allocate_from_pool(evms_pool_mgmt_t *pool, int blockable)
4930
+ objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO);
4931
+ if (objp || !blockable) {
4934
+ /* block and wait for an object to
4935
+ * be returned to the pool
4937
+ atomic_inc(&pool->waiters);
4938
+ wait_event(pool->wait_queue,
4939
+ (!atomic_read(&pool->waiters)));
4944
+EXPORT_SYMBOL(evms_cs_allocate_from_pool);
4947
+evms_cs_deallocate_to_pool(evms_pool_mgmt_t *pool, void *objp)
4949
+ kmem_cache_free(pool->cachep, objp);
4950
+ atomic_set(&pool->waiters,0);
4951
+ if (waitqueue_active(&pool->wait_queue)) {
4952
+ wake_up(&pool->wait_queue);
4955
+EXPORT_SYMBOL(evms_cs_deallocate_to_pool);
4958
+evms_cs_destroy_pool(evms_pool_mgmt_t *pool)
4960
+ kmem_cache_destroy(pool->cachep);
4961
+ evms_cs_deallocate_memory(pool);
4963
+EXPORT_SYMBOL(evms_cs_destroy_pool);
4966
+ * function: evms_end_io
4968
+ * This is a support function for
4969
+ * evms_cs_register_for_end_io_notification.
4970
+ * This function is called during I/O completion on any buffer
4971
+ * head that was registered by a plugin. Control is passed here
4972
+ * and this routine will, thru the use of the I/O notify entry
4973
+ * stored in the b_private field of the buffer head, restore
4974
+ * the b_rsector value the buffer head had at the time of
4975
+ * registration and pass control to the registered callback
4976
+ * address, with pointers to the buffer head and an optional
4977
+ * plugin private data. Upon completion of the callback,
4978
+ * control is returned back here. The io notify list entry
4979
+ * is deleted. This process repeats until this routine
4980
+ * detects that all registered plugins have been called back
4981
+ * and the buffer head's original end_io function has been
4982
+ * called. At this point the DONE flag is set, and we terminate
4983
+ * callback loop and exit.
4985
+ * Plugins may desire to break or interrupt the callback
4986
+ * sequence or chain. This may be useful to redrive I/O or
4987
+ * to wait for other buffer heads to complete before
4988
+ * allowing the original buffer head callback to occur.
4989
+ * To interrupt the callback "chain", a registered
4990
+ * plugin's callback must return with the DONE flag set.
4992
+ * NOTE: If a plugin set the DONE flag, and wishes to redrive
4993
+ * a buffer head, the plugin MUST reregister the buffer head
4994
+ * to receive another callback on this buffer head. Also, the
4995
+ * plugin MUST ensure that the original buffer head end_io
4996
+ * function get called at some point, either by reregistering
4997
+ * this buffer head and receiving another callback, or by
4998
+ * means of buffer head aggregation triggered by the callbacks
4999
+ * of other buffer heads.
5003
+evms_end_io(struct buffer_head *bh, int uptodate)
5005
+ io_notify_t *entry;
5010
+ /* retrieve the io_notify_entry ptr from
5011
+ * the b_private field in the buffer head.
5013
+ entry = (io_notify_t *)bh->b_private;
5015
+ /* restore the b_private value to
5016
+ * the previous b_private value (which
5017
+ * should be a previous io_notify_entry
5018
+ * or the original b_private pointer).
5020
+ bh->b_private = entry->b_private;
5022
+ /* check for original callback for this bh */
5023
+ if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) {
5024
+ /* this is the original for bh */
5026
+ /* turn off flag marking this as the original */
5027
+ entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG;
5029
+ /* decrement volume's requests_in_progress var */
5030
+ atomic_dec(&evms_logical_volumes[MINOR(bh->b_dev)].requests_in_progress);
5032
+ /* restore b_end_io to original value */
5033
+ bh->b_end_io = (void *)entry->callback_function;
5034
+ if (bh->b_end_io) {
5035
+ /* invoke original callback function
5038
+ bh->b_end_io(bh, uptodate);
5042
+ /* this is a plugin callback */
5044
+ /* restore the rsector value to the
5045
+ * value at the time of callback
5048
+ bh->b_rsector = entry->rsector;
5049
+ /* invoke plugin callback function */
5050
+ entry->callback_function(entry->private, bh, uptodate, &done);
5052
+ /* free the io notify entry */
5053
+ evms_cs_deallocate_to_pool(evms_io_notify_pool, entry);
5058
+ * function: evms_cs_register_for_end_io_notification
5060
+ * This function is an evms common service.
5061
+ * This routine allows a (plugin) function to register to
5062
+ * participate in the io completion notification process.
5063
+ * This is useful for plugins which alter data after it
5064
+ * has been read from the disk (i.e. encryption or
5067
+ * This routine also records the rsector value at the time
5068
+ * of registration, so that it can be restored to that value
5069
+ * prior to the callback to a plugin, thus allowing that
5070
+ * plugin to work with the value it had seen during the
5071
+ * initiating I/O request.
5073
+ * This routine also records a private data pointer at the
5074
+ * time of registration, and is returned to the plugin
5075
+ * at callback time. This private data pointer was designed
5076
+ * to contain context/callback/buffer_head specific data, and
5077
+ * frees the plugin from having to store and find associated
5078
+ * data at the time of the callback. This field is not used
5079
+ * by this function and is optional (NULL if unused). It is
5080
+ * recorded and returned as a convenience for the plugins.
5082
+ * DANGER!!! - WILL ROBINSON - DANGER!!!
5083
+ * This routine uses the b_private field in the
5084
+ * buffer_head structure. If any lower level driver uses this
5085
+ * field and do NOT restore it, the I/O callback will fail!!
5087
+ * Any plugins writers requiring a field for private storage
5088
+ * should instead use the private field parameter in this
5089
+ * function to store their private data.
5094
+evms_cs_register_for_end_io_notification(
5096
+ struct buffer_head *bh,
5097
+ void *callback_function)
5100
+ io_notify_t *new_entry;
5104
+ /* allocate a notify entry */
5105
+ new_entry = evms_cs_allocate_from_pool(evms_io_notify_pool, EVMS_BLOCKABLE);
5111
+ /* initialize notify entry */
5112
+ new_entry->private = private;
5113
+ new_entry->bh = bh;
5114
+ new_entry->rsector = bh->b_rsector;
5115
+ new_entry->b_private = bh->b_private;
5116
+ new_entry->flags = 0;
5118
+ /* is this the first callback for this bh? */
5119
+ if (bh->b_end_io != evms_end_io) {
5120
+ /* yes, first callback */
5121
+ new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG;
5122
+ new_entry->callback_function = (void *)bh->b_end_io;
5124
+ /* increment volume's requests_in_progress var */
5125
+ atomic_inc(&evms_logical_volumes[MINOR(bh->b_dev)].requests_in_progress);
5127
+ /* set b_end_io so we get control */
5128
+ bh->b_end_io = evms_end_io;
5130
+ /* no, not first callback */
5131
+ new_entry->callback_function = callback_function;
5134
+ /* set b_private to aid in quick lookup */
5135
+ bh->b_private = new_entry;
5139
+EXPORT_SYMBOL(evms_cs_register_for_end_io_notification);
5141
+/* function description: evms_lookup_item_in_list
5143
+ * this function searches for the specified item in the
5144
+ * specified node list. it returns the address of the
5145
+ * evms_list_node containing the specified item.
5147
+static evms_list_node_t **
5148
+evms_lookup_item_in_list(
5149
+ evms_list_node_t **node_list,
5152
+ evms_list_node_t **list_node;
5154
+ list_node = node_list;
5155
+ while(*list_node) {
5156
+ if ((*list_node)->item == item)
5158
+ list_node = &(*list_node)->next;
5160
+ return(list_node);
5163
+/* function description: evms_add_item_to_list
5165
+ * this function adds an item to the list. the
5166
+ * node for the new item is added to the end
5167
+ * of the list. the list is traversed to find the end.
5168
+ * while the traversal occurs, the list is checked
5169
+ * for the presence of the specified item. if already
5170
+ * present in the list, and error code is returned.
5172
+/* function description: evms_cs_add_item_to_list
5174
+ * this function adds an item to an item list.
5176
+ * RC == 0 is returned for:
5177
+ * a successful add of a new item
5179
+ * RC == 1 is returned when:
5180
+ * the item is already on the list
5182
+ * RC < 0 is returned for an error attempting to add the item.
5185
+evms_cs_add_item_to_list(
5186
+ evms_list_node_t **list,
5190
+ evms_list_node_t **list_node, *new_node;
5192
+ list_node = evms_lookup_item_in_list(list, item);
5193
+ if (*list_node == NULL) {
5194
+ rc = evms_cs_allocate_memory(
5195
+ (void **)&new_node,
5196
+ sizeof(evms_list_node_t));
5198
+ new_node->item = item;
5199
+ *list_node = new_node;
5203
+ LOG_DEBUG("warning: attempt to add duplicate item(%p) to list(%p).\n",
5208
+EXPORT_SYMBOL(evms_cs_add_item_to_list);
5210
+/* function description: evms_remove_item_from_list
5212
+ * this function removes a specified item from the
5213
+ * specified list. if the specified item is not
5214
+ * found in the list, and error is returned.
5217
+evms_cs_remove_item_from_list(
5218
+ evms_list_node_t **list,
5222
+ evms_list_node_t **list_node;
5224
+ /* check to see if item is in the list */
5225
+ list_node = evms_lookup_item_in_list(list, item);
5227
+ /* was the node found in the list? */
5229
+ /* yes, it was found */
5230
+ evms_list_node_t *tmp_node;
5232
+ /* save ptr to node being removed*/
5233
+ tmp_node = *list_node;
5234
+ /* remove it from the global list */
5235
+ *list_node = tmp_node->next;
5236
+ /* delete removed node */
5237
+ evms_cs_deallocate_memory(tmp_node);
5239
+ /* no, it was not found */
5241
+ LOG_ERROR("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n",
5246
+EXPORT_SYMBOL(evms_cs_remove_item_from_list);
5248
+/* function description: evms_cs_register_device
5250
+ * this function adds a device to the EVMS global device list.
5252
+ * RC == 0 is returned for:
5253
+ * a successful add of a new device
5255
+ * RC == 1 is returned when:
5256
+ * the device is already on the list
5258
+ * RC < 0 is returned for an error attempting to add the device.
5261
+evms_cs_register_device(evms_logical_node_t *device)
5263
+ return(evms_cs_add_item_to_list(
5264
+ &evms_global_device_list,
5267
+EXPORT_SYMBOL(evms_cs_register_device);
5269
+/* function description: evms_cs_unregister_device
5271
+ * this function removes a device from the EVMS global device list.
5273
+ * RC == 0 is returned for:
5274
+ * a successful removal of the specified device
5276
+ * RC < 0 is returned for an error attempting to add the device.
5277
+ * -ENODATA is returned if specified device is not found.
5280
+evms_cs_unregister_device(evms_logical_node_t *device)
5282
+ return(evms_cs_remove_item_from_list(
5283
+ &evms_global_device_list,
5286
+EXPORT_SYMBOL(evms_cs_unregister_device);
5288
+static evms_list_node_t *find_first_next_list_node = NULL;
5290
+evms_cs_find_next_device(
5291
+ evms_logical_node_t *in_device,
5292
+ evms_logical_node_t **out_device)
5295
+ evms_list_node_t **list_node;
5297
+ if (in_device == NULL)
5298
+ find_first_next_list_node = evms_global_device_list;
5300
+ list_node = evms_lookup_item_in_list(
5301
+ &evms_global_device_list,
5303
+ find_first_next_list_node = *list_node;
5304
+ if (find_first_next_list_node == NULL)
5307
+ find_first_next_list_node =
5308
+ find_first_next_list_node->next;
5311
+ if (find_first_next_list_node == NULL)
5312
+ *out_device = NULL;
5314
+ *out_device = (evms_logical_node_t *)
5315
+ find_first_next_list_node->item;
5319
+EXPORT_SYMBOL(evms_cs_find_next_device);
5322
+evms_cs_signal_event(int eventid)
5325
+ evms_list_node_t **list_node;
5327
+ /* signal PID(s) of specified event */
5328
+ list_node = &evms_global_notify_list;
5329
+ while(*list_node) {
5330
+ evms_event_t *event;
5332
+ event = (*list_node)->item;
5333
+ if (event->eventid == eventid) {
5334
+ struct task_struct *tsk;
5336
+ tsk = find_task_by_pid(event->pid);
5338
+ struct siginfo siginfo;
5340
+ siginfo.si_signo = event->signo;
5341
+ siginfo.si_errno = 0;
5342
+ siginfo.si_code = 0;
5343
+ rc = send_sig_info(event->signo,
5348
+ * unregister this stale
5349
+ * notification record
5353
+ list_node = &(*list_node)->next;
5356
+EXPORT_SYMBOL(evms_cs_signal_event);
5359
+evms_flush_signals (void)
5361
+ spin_lock(¤t->sigmask_lock);
5362
+ flush_signals(current);
5363
+ spin_unlock(¤t->sigmask_lock);
5367
+evms_init_signals (void)
5369
+ current->exit_signal = SIGCHLD;
5370
+ siginitsetinv(¤t->blocked, sigmask(SIGKILL));
5374
+evms_thread(void * arg)
5376
+ evms_thread_t *thread = arg;
5385
+ sprintf(current->comm, thread->name);
5386
+ evms_init_signals();
5387
+ evms_flush_signals();
5388
+ thread->tsk = current;
5390
+ current->policy = SCHED_OTHER;
5391
+ current->nice = -20;
5394
+ complete(thread->event);
5395
+ while (thread->run) {
5396
+ void (*run)(void *data);
5397
+ DECLARE_WAITQUEUE(wait, current);
5399
+ add_wait_queue(&thread->wqueue, &wait);
5400
+ set_task_state(current, TASK_INTERRUPTIBLE);
5401
+ if (!test_bit(EVMS_THREAD_WAKEUP, &thread->flags)) {
5404
+ current->state = TASK_RUNNING;
5405
+ remove_wait_queue(&thread->wqueue, &wait);
5406
+ clear_bit(EVMS_THREAD_WAKEUP, &thread->flags);
5408
+ run = thread->run;
5410
+ run(thread->data);
5411
+ run_task_queue(&tq_disk);
5413
+ if (signal_pending(current)) {
5414
+ evms_flush_signals();
5417
+ complete(thread->event);
5422
+evms_cs_register_thread (
5423
+ void (*run) (void *),
5427
+ evms_thread_t *thread;
5429
+ struct completion event;
5431
+ if (evms_cs_allocate_memory((void**)&thread,sizeof(evms_thread_t)))
5434
+ memset(thread, 0, sizeof(evms_thread_t));
5435
+ init_waitqueue_head(&thread->wqueue);
5437
+ init_completion(&event);
5438
+ thread->event = &event;
5439
+ thread->run = run;
5440
+ thread->data = data;
5441
+ thread->name = name;
5442
+ ret = kernel_thread(evms_thread, thread, 0);
5444
+ evms_cs_deallocate_memory(thread);
5447
+ wait_for_completion(&event);
5450
+EXPORT_SYMBOL(evms_cs_register_thread);
5453
+evms_cs_unregister_thread (evms_thread_t *thread)
5455
+ struct completion event;
5457
+ init_completion(&event);
5459
+ thread->event = &event;
5460
+ thread->run = NULL;
5461
+ thread->name = NULL;
5462
+ evms_cs_interrupt_thread(thread);
5463
+ wait_for_completion(&event);
5464
+ evms_cs_deallocate_memory(thread);
5466
+EXPORT_SYMBOL(evms_cs_unregister_thread);
5469
+evms_cs_wakeup_thread(evms_thread_t *thread)
5471
+ set_bit(EVMS_THREAD_WAKEUP, &thread->flags);
5472
+ wake_up(&thread->wqueue);
5474
+EXPORT_SYMBOL(evms_cs_wakeup_thread);
5477
+evms_cs_interrupt_thread (evms_thread_t *thread)
5479
+ if (!thread->tsk) {
5480
+ LOG_ERROR("error: attempted to interrupt an invalid thread!\n");
5483
+ send_sig(SIGKILL, thread->tsk, 1);
5485
+EXPORT_SYMBOL(evms_cs_interrupt_thread);
5487
+struct proc_dir_entry *
5488
+evms_cs_get_evms_proc_dir(void)
5490
+#ifdef CONFIG_PROC_FS
5491
+ if (!evms_proc_dir) {
5492
+ evms_proc_dir = create_proc_entry("evms", S_IFDIR, &proc_root);
5495
+ return(evms_proc_dir);
5497
+EXPORT_SYMBOL(evms_cs_get_evms_proc_dir);
5500
+evms_cs_volume_request_in_progress(
5503
+ int *current_count)
5506
+ evms_logical_volume_t *volume;
5508
+ volume = &evms_logical_volumes[MINOR(dev)];
5509
+ if (volume->node) {
5510
+ if (operation > 0) {
5511
+ atomic_inc(&volume->requests_in_progress);
5512
+ } else if (operation < 0) {
5513
+ atomic_dec(&volume->requests_in_progress);
5515
+ if (current_count) {
5516
+ *current_count = atomic_read(&volume->requests_in_progress);
5523
+EXPORT_SYMBOL(evms_cs_volume_request_in_progress);
5525
+/**********************************************************/
5526
+/* END -- exported functions/Common Services */
5527
+/**********************************************************/
5529
+/**********************************************************/
5530
+/* START -- Proc FS Support functions */
5531
+/**********************************************************/
5533
+#ifdef CONFIG_PROC_FS
5535
+evms_info_read_proc(
5544
+ char *info_level_text = NULL;
5546
+ PROCPRINT("Enterprise Volume Management System: Info\n");
5547
+ switch(evms_info_level) {
5549
+ info_level_text = "critical";
5552
+ info_level_text = "serious";
5555
+ info_level_text = "error";
5558
+ info_level_text = "warning";
5561
+ info_level_text = "default";
5564
+ info_level_text = "details";
5567
+ info_level_text = "debug";
5570
+ info_level_text = "extra";
5573
+ info_level_text = "entry exit";
5576
+ info_level_text = "everything";
5579
+ info_level_text = "unknown";
5582
+ PROCPRINT("EVMS info level: %d (%s).\n",
5583
+ evms_info_level, info_level_text);
5585
+ PROCPRINT("EVMS kernel version: %d.%d.%d\n",
5586
+ EVMS_MAJOR_VERSION,
5587
+ EVMS_MINOR_VERSION,
5588
+ EVMS_PATCHLEVEL_VERSION);
5590
+ PROCPRINT("EVMS IOCTL interface version: %d.%d.%d\n",
5591
+ EVMS_IOCTL_INTERFACE_MAJOR,
5592
+ EVMS_IOCTL_INTERFACE_MINOR,
5593
+ EVMS_IOCTL_INTERFACE_PATCHLEVEL);
5595
+ PROCPRINT("EVMS Common Services version: %d.%d.%d\n",
5596
+ EVMS_COMMON_SERVICES_MAJOR,
5597
+ EVMS_COMMON_SERVICES_MINOR,
5598
+ EVMS_COMMON_SERVICES_PATCHLEVEL);
5604
+evms_plugins_read_proc(
5613
+ evms_registered_plugin_t *rp = NULL;
5615
+ PROCPRINT("Enterprise Volume Management System: Plugins\n");
5616
+ /* 0 1 1 2 2 3 3 4 4 5 5 6 6 7*/
5617
+ /* 1 5 0 5 0 5 0 5 0 5 0 5 0 5 0*/
5618
+ PROCPRINT(" ---------Plugin---------- required services\n");
5619
+ PROCPRINT(" ----id---- version version\n\n");
5620
+ for (rp = registered_plugin_head; rp; rp = rp->next) {
5621
+ PROCPRINT(" %x.%x.%x\t %d.%d.%d\t%d.%d.%d\n",
5622
+ GetPluginOEM(rp->plugin->id),
5623
+ GetPluginType(rp->plugin->id),
5624
+ GetPluginID(rp->plugin->id),
5625
+ rp->plugin->version.major,
5626
+ rp->plugin->version.minor,
5627
+ rp->plugin->version.patchlevel,
5628
+ rp->plugin->required_common_services_version.major,
5629
+ rp->plugin->required_common_services_version.minor,
5630
+ rp->plugin->required_common_services_version.patchlevel);
5637
+evms_volumes_read_proc(
5647
+ PROCPRINT("Enterprise Volume Management System: Volumes\n");
5648
+ PROCPRINT("major minor #blocks type flags name\n\n");
5649
+ for (j = 1; j < MAX_EVMS_VOLUMES; j++) {
5650
+ evms_logical_volume_t *volume;
5652
+ volume = &evms_logical_volumes[j];
5653
+ if (volume->node) {
5654
+ PROCPRINT("%5d %7d %16Ld %s %s %s %s%s\n",
5656
+ volume->node->total_vsectors >> 1,
5657
+ (volume->flags & EVMS_VOLUME_FLAG) ? "evms " : "compat",
5658
+ (volume->flags & EVMS_VOLUME_READ_ONLY) ? "ro" : "rw",
5659
+ (volume->flags & EVMS_VOLUME_PARTIAL) ? "p " : " ",
5660
+ EVMS_DEV_NODE_PATH,
5669
+/**********************************************************/
5670
+/* END -- Proc FS Support functions */
5671
+/**********************************************************/
5673
+/**********************************************************/
5674
+/* START -- FOPS functions definitions */
5675
+/**********************************************************/
5677
+/************************************************/
5678
+/* START -- IOCTL commands -- EVMS specific */
5679
+/************************************************/
5682
+evms_ioctl_cmd_get_ioctl_version (void * arg)
5685
+ evms_version_t ver;
5687
+ ver.major = EVMS_IOCTL_INTERFACE_MAJOR;
5688
+ ver.minor = EVMS_IOCTL_INTERFACE_MINOR;
5689
+ ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL;
5691
+ /* copy info to userspace */
5692
+ if (copy_to_user(arg, &ver, sizeof(ver)))
5699
+evms_ioctl_cmd_get_version (void * arg)
5702
+ evms_version_t ver;
5704
+ ver.major = EVMS_MAJOR_VERSION;
5705
+ ver.minor = EVMS_MINOR_VERSION;
5706
+ ver.patchlevel = EVMS_PATCHLEVEL_VERSION;
5708
+ /* copy info to userspace */
5709
+ if (copy_to_user(arg, &ver, sizeof(ver)))
5716
+evms_ioctl_cmd_get_info_level (void * arg)
5720
+ /* copy info to userspace */
5721
+ if (copy_to_user(arg, &evms_info_level, sizeof(evms_info_level)))
5728
+evms_ioctl_cmd_set_info_level (void * arg)
5732
+ /* copy info from userspace */
5733
+ if (copy_from_user(&evms_info_level, arg, sizeof(evms_info_level)))
5739
+/* function: evms_quiesce_volume
5741
+ * this function performs the actual quiesce operation on
5742
+ * a volume in kernel memory.
5744
+ * when quiescing, all new I/Os to a volume are stopped,
5745
+ * causing the calling thread to block. this thread then
5746
+ * waits until all I/Os in progress are completed, before
5747
+ * return control to the caller.
5749
+ * when unquiescing, all new I/Os are allowed to proceed
5750
+ * unencumbered, and all threads waiting (blocked) on this
5751
+ * volume, are woken up and allowed to proceed.
5755
+evms_quiesce_volume(
5756
+ evms_logical_volume_t *volume,
5757
+ struct inode *inode,
5758
+ struct file *file,
5759
+ evms_quiesce_volume_t *qv)
5763
+ LOG_DEBUG("%squiescing %s.\n",
5764
+ ((qv->command) ? "" : "un"), volume->name);
5766
+#ifdef VFS_PATCH_PRESENT
5768
+ /* VFS function call to sync and lock the filesystem */
5769
+ fsync_dev_lockfs(MKDEV(EVMS_MAJOR, qv->minor));
5770
+ volume->vfs_quiesced = TRUE;
5773
+ volume->quiesced = qv->command;
5775
+ /* Command specified was "quiesce". */
5776
+ if (qv->command) {
5777
+ /* After setting the volume to
5778
+ * a quiesced state, there could
5779
+ * be threads (on SMP systems)
5780
+ * that are executing in the
5781
+ * function, evms_handle_request,
5782
+ * between the "wait_event" and the
5783
+ * "atomic_inc" lines. We need to
5784
+ * provide a "delay" sufficient
5785
+ * to allow those threads to
5786
+ * to reach the atomic_inc's
5787
+ * before executing the while loop
5788
+ * below. The "schedule" call should
5792
+ /* wait for outstanding requests
5795
+ while(atomic_read(&volume->requests_in_progress)>0)
5798
+ /* send this command down the stack so lower */
5799
+ /* layers can know about this */
5800
+ rc = IOCTL(volume->node, inode, file,
5801
+ EVMS_QUIESCE_VOLUME, (unsigned long)&qv);
5803
+ /* Command specified was "unquiesce". */
5804
+ if (!qv->command) {
5805
+ /* "wakeup" any I/O requests waiting on
5808
+ if (waitqueue_active(&volume->wait_queue))
5809
+ wake_up(&volume->wait_queue);
5810
+#ifdef VFS_PATCH_PRESENT
5811
+ if (volume->vfs_quiesced) {
5812
+ /* VFS function call to unlock the filesystem */
5813
+ unlockfs(MKDEV(EVMS_MAJOR, qv->minor));
5814
+ volume->vfs_quiesced = FALSE;
5819
+ LOG_ERROR("error(%d) %squiescing %s.\n",
5821
+ ((qv->command) ? "" : "un"),
5827
+/* function: evms_delete_volume
5829
+ * this function performs the actual delete operation on
5830
+ * a volume to purge it from kernel memory. all structures
5831
+ * and memory consumed by this volume will be free as well
5832
+ * as clearing or unregistering any system services or
5833
+ * global data arrays.
5835
+ * NOTE: this function will return -EBUSY on attempts to
5836
+ * delete mounted volumes.
5840
+evms_delete_volume(
5841
+ evms_logical_volume_t *volume,
5842
+ evms_delete_volume_t *dv)
5846
+ /* if this is a "permament" delete */
5847
+ /* check to make sure volume is not mounted */
5848
+ if (dv->command) {
5849
+ if (is_mounted(MKDEV(EVMS_MAJOR, dv->minor))) {
5854
+ /* invoke the delete ioctl at the top of the feature stack */
5856
+ LOG_DETAILS("deleting '%s'.\n",volume->name);
5857
+ rc = DELETE(volume->node);
5860
+ /* the volume has been deleted, do any clean up work
5864
+ devfs_unregister(volume->devfs_handle);
5865
+ if (dv->command) {
5866
+ /* if "permanent" delete, free the name
5867
+ * and NULL the name field.
5869
+ evms_cs_deallocate_memory(volume->name);
5870
+ volume->name = NULL;
5871
+ volume->flags = 0;
5873
+ /* if "soft" delete, leave the name so
5874
+ * we can use it to reassign the same
5875
+ * minor to this volume after a
5878
+ volume->flags = EVMS_VOLUME_SOFT_DELETED;
5880
+ volume->node = NULL;
5881
+ set_device_ro(MKDEV(EVMS_MAJOR,dv->minor),0);
5882
+ blk_size[EVMS_MAJOR][dv->minor] = 0;
5883
+ blksize_size[EVMS_MAJOR][dv->minor] = 0;
5884
+ hardsect_size[EVMS_MAJOR][dv->minor] = 0;
5887
+ LOG_ERROR("error(%d) %s deleting %s.\n",
5889
+ ((dv->command) ? "hard" : "soft"),
5895
+/* function: evms_user_delete_volume
5897
+ * this function, depending on the parameters, performs
5898
+ * a "soft" or a "hard" delete. for a "soft" delete, a
5899
+ * quiesce & delete request is queued up, to be executed
5900
+ * at the beginning of the next rediscovery. for a
5901
+ * "hard" delete, the target volume is quiesced and then
5902
+ * deleted. if there is any errors attempting to delete
5903
+ * the target, then the target is unquiesced. if an
5904
+ * associative volume is specified it is quiesced before
5905
+ * the target volume is quiesced, and is unquiesced
5906
+ * after the attempt to delete the target volume.
5910
+evms_user_delete_volume(
5911
+ evms_logical_volume_t *lvt,
5912
+ struct inode *inode,
5913
+ struct file *file,
5914
+ evms_delete_volume_t *dv)
5918
+ if (!dv->command) {
5919
+ /* "soft delete" requested */
5920
+ lvt->flags |= (EVMS_REQUESTED_QUIESCE |
5921
+ EVMS_REQUESTED_DELETE);
5923
+ lvt->flags |= EVMS_REQUESTED_VFS_QUIESCE;
5926
+ /* "hard delete" requested */
5928
+ evms_quiesce_volume_t qv;
5929
+ evms_logical_volume_t *lva = NULL;
5931
+ if (dv->associative_minor) {
5932
+ /* associative volume specified
5936
+ lva = &evms_logical_volumes[dv->associative_minor];
5937
+ /* quiesce associative volume */
5938
+ qv.command = EVMS_QUIESCE;
5939
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5940
+ qv.minor = dv->associative_minor;
5941
+ rc = evms_quiesce_volume(lva, inode, file, &qv);
5942
+ qa = (rc) ? FALSE : TRUE;
5945
+ /* quiesce target volume */
5946
+ qv.command = EVMS_QUIESCE;
5947
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5948
+ qv.minor = dv->minor;
5949
+ rc = evms_quiesce_volume(lvt, inode, file, &qv);
5952
+ /* delete the target volume */
5953
+ rc = evms_delete_volume(lvt, dv);
5955
+ /* got an error undeleting...
5957
+ * unquiesce the target
5959
+ qv.command = EVMS_UNQUIESCE;
5960
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5961
+ qv.minor = dv->minor;
5962
+ evms_quiesce_volume(lvt, inode, file, &qv);
5965
+ if (dv->associative_minor) {
5966
+ /* associative volume specified
5971
+ /* only unquiesce associative
5972
+ * if we successfully quiesced
5975
+ qv.command = EVMS_UNQUIESCE;
5976
+ qv.do_vfs = EVMS_VFS_DO_NOTHING;
5977
+ qv.minor = dv->associative_minor;
5978
+ evms_quiesce_volume(lva, inode, file, &qv);
5985
+/* function: evms_ioctl_cmd_delete_volume
5987
+ * this function copy user data to/from the kernel, and
5988
+ * validates user parameters. after validation, control
5989
+ * is passed to worker routine evms_user_delete_volume.
5993
+evms_ioctl_cmd_delete_volume(
5994
+ struct inode *inode,
5995
+ struct file *file,
5996
+ unsigned long arg)
5999
+ evms_delete_volume_t tmp, *user_parms;
6000
+ evms_logical_volume_t *volume = NULL;
6002
+ user_parms = (evms_delete_volume_t *)arg;
6003
+ /* copy user's parameters to kernel space */
6004
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6007
+ /* check to make sure associative minor is in use */
6009
+ if (tmp.associative_minor) {
6010
+ volume = &evms_logical_volumes[tmp.associative_minor];
6011
+ if (volume->node == NULL)
6015
+ /* check to make sure target minor is in use */
6017
+ volume = &evms_logical_volumes[tmp.minor];
6018
+ if (volume->node == NULL)
6021
+ rc = evms_user_delete_volume(
6022
+ volume,inode,file,&tmp);
6024
+ /* copy the status value back to the user */
6026
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6032
+/* function: evms_full_rediscover_prep
6034
+ * this function helps to prevent problems when evms is
6035
+ * configured with the base built in statically and some
6036
+ * plugins built as modules.
6038
+ * in these cases, when the initial discovery is done,
6039
+ * only the statically built modules are available for
6040
+ * volume construction. as a result, some volumes that
6041
+ * require the plugins built as modules (which haven't
6042
+ * been loaded), to be fully reconstructed, may come up
6043
+ * as compatibility volumes or partial volumes.
6045
+ * when parts of evms are built as modules, the
6046
+ * evms_rediscovery utility is used, to perform a secondary
6047
+ * rediscover, after all the plugins built as modules
6048
+ * have been loaded, to construct all the volumes
6049
+ * requiring these plugins.
6051
+ * however since some of the volumes, requiring the plugins
6052
+ * built as modules, may have been already exported as
6053
+ * compatibility or partial volumes, we need to purge these
6054
+ * volumes from kernel's memory, so that can be rediscovered
6055
+ * and claimed by the appropriate plugins, and reconstructed
6056
+ * into the correct volumes.
6058
+ * this function purges all compatibility volumes that are
6059
+ * not in use(mounted) and all partial volumes, prior to
6060
+ * doing the secondary rediscover, thus allowing volumes to
6061
+ * rediscovered correctly.
6063
+ * NOTE: again, this is only required in cases when a
6064
+ * combination of plugins are built statically and as
6069
+evms_full_rediscover_prep(struct inode *inode, struct file *file)
6073
+ LOG_DETAILS("%s: started.\n", __FUNCTION__);
6074
+ /* check for acceptable volumes to be deleted */
6075
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6076
+ evms_logical_volume_t *volume = NULL;
6077
+ evms_delete_volume_t dv;
6078
+ int volume_mounted, doit;
6081
+ volume = &evms_logical_volumes[i];
6082
+ if (!volume->node)
6084
+ devp = MKDEV(EVMS_MAJOR,i);
6085
+ volume_mounted = (is_mounted(devp)) ? 1 : 0;
6086
+ /* only proceed on volumes that are:
6089
+ * unmounted compatibility volumes
6092
+ if (volume->flags & EVMS_VOLUME_PARTIAL) {
6093
+ /* do all partial volumes
6096
+ } else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
6097
+ /* check all compatibility volumes
6099
+ if (!volume_mounted && !is_swap_partition(devp)) {
6100
+ /* only do unmounted volumes
6105
+ if (doit == FALSE) {
6108
+ /* delete the volume from memory.
6109
+ * do a 'soft' delete if volume
6110
+ * is mounted, and 'hard' delete
6113
+ * NOTE: the delete operation will
6114
+ * clear the bits in the flags field.
6116
+ dv.command = (volume_mounted) ?
6117
+ EVMS_SOFT_DELETE : EVMS_HARD_DELETE;
6119
+ dv.associative_minor = 0;
6121
+ rc = evms_user_delete_volume(volume,inode,file,&dv);
6123
+ LOG_DETAILS("%s: completed.\n", __FUNCTION__);
6127
+evms_ioctl_cmd_rediscover_volumes(
6128
+ struct inode *inode,
6129
+ struct file *file,
6131
+ unsigned long arg)
6134
+ evms_rediscover_t tmp, *user_parms;
6135
+ unsigned long *array_ptr = NULL, array_size = 0;
6136
+ evms_logical_volume_t *volume = NULL;
6138
+ rc = tmp.drive_count = 0;
6139
+ user_parms = (evms_rediscover_t *)arg;
6140
+ /* copy user's parameters to kernel space */
6141
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6144
+ if (tmp.drive_count == REDISCOVER_ALL_DEVICES) {
6145
+ evms_full_rediscover_prep(inode, file);
6147
+ /* quiesce all queued volumes */
6148
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6149
+ evms_quiesce_volume_t qv;
6151
+ volume = &evms_logical_volumes[i];
6152
+ if (!volume->node) {
6155
+ if (!(volume->flags & EVMS_REQUESTED_QUIESCE)) {
6158
+ qv.command = EVMS_QUIESCE;
6160
+ qv.do_vfs = (volume->flags & EVMS_REQUESTED_VFS_QUIESCE) ?
6161
+ EVMS_VFS_DO : EVMS_VFS_DO_NOTHING,
6163
+ rc = evms_quiesce_volume(volume,inode,file,&qv);
6165
+ /* "soft" delete all queued volumes */
6166
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6167
+ evms_delete_volume_t dv;
6169
+ volume = &evms_logical_volumes[i];
6170
+ if (!volume->node) {
6173
+ if (!(volume->flags & EVMS_REQUESTED_DELETE)) {
6176
+ dv.command = EVMS_SOFT_DELETE;
6178
+ dv.associative_minor = 0;
6180
+ rc = evms_delete_volume(volume, &dv);
6183
+ if (tmp.drive_count &&
6184
+ (tmp.drive_count != REDISCOVER_ALL_DEVICES)) {
6186
+ /* create space for userspace drive array */
6187
+ array_size = sizeof(*tmp.drive_array) * tmp.drive_count;
6188
+ array_ptr = tmp.drive_array;
6189
+ rc = evms_cs_allocate_memory((void **)&tmp.drive_array, array_size);
6192
+ /* copy rediscover drive array to kernel space */
6193
+ if (copy_from_user(tmp.drive_array, array_ptr, array_size))
6198
+ /* perform the rediscovery operation */
6199
+ rc = evms_discover_volumes(&tmp);
6202
+ /* clean up after operation */
6203
+ if (tmp.drive_count &&
6204
+ (tmp.drive_count != REDISCOVER_ALL_DEVICES))
6205
+ evms_cs_deallocate_memory(tmp.drive_array);
6207
+ /* set return code and copy info to userspace */
6209
+ if (copy_to_user(&user_parms->status, &tmp.status, sizeof(tmp.status)))
6215
+static evms_list_node_t *user_disk_ptr;
6217
+evms_ioctl_cmd_get_logical_disk(void * arg)
6220
+ evms_user_disk_t tmp, *user_parms;
6222
+ user_parms = (evms_user_disk_t *)arg;
6223
+ /* copy user's parameters to kernel space */
6224
+ if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6228
+ if (tmp.command == EVMS_FIRST_DISK)
6229
+ user_disk_ptr = evms_global_device_list;
6230
+ else /* tmp.command == EVMS_NEXT_DISK */
6231
+ user_disk_ptr = user_disk_ptr->next;
6233
+ if (user_disk_ptr == NULL)
6234
+ tmp.status = EVMS_DISK_INVALID;
6236
+ tmp.status = EVMS_DISK_VALID;
6237
+ tmp.disk_handle = (unsigned long)user_disk_ptr->item ^ EVMS_HANDLE_KEY;
6239
+ /* copy info to userspace */
6240
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6247
+evms_ioctl_cmd_get_logical_disk_info(void * arg)
6250
+ evms_user_disk_info_t tmp, *user_parms;
6251
+ evms_list_node_t *p;
6253
+ user_parms = (evms_user_disk_info_t *)arg;
6254
+ /* copy user's parameters to kernel space */
6255
+ if (copy_from_user(&tmp.disk_handle, &user_parms->disk_handle, sizeof(tmp.disk_handle)))
6258
+ /* check handle for validity */
6261
+ for (p = evms_global_device_list; p; p = p->next)
6262
+ if (p->item == (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY)) {
6264
+ user_disk_ptr = p;
6269
+ /* populate kernel copy of user's structure with appropriate info */
6271
+ evms_logical_node_t *node = (evms_logical_node_t *)user_disk_ptr->item;
6272
+ tmp.flags = node->flags;
6273
+ strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH);
6274
+ strcat(tmp.disk_name, node->name);
6275
+ tmp.total_sectors = node->total_vsectors;
6276
+ tmp.hardsect_size = node->hardsector_size;
6277
+ tmp.block_size = node->block_size;
6278
+ rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO,
6279
+ (unsigned long)&tmp.geometry);
6282
+ /* set return code and copy info to userspace */
6284
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6290
+#define MAX_IO_SIZE 128
6292
+evms_ioctl_cmd_sector_io(void * arg)
6295
+ evms_sector_t io_size = MAX_IO_SIZE;
6296
+ evms_sector_io_t tmp, *user_parms;
6297
+ evms_logical_node_t *disk_node = NULL;
6298
+ evms_list_node_t *list_node;
6299
+ unsigned char *io_buffer;
6305
+ user_parms = (evms_sector_io_t *)arg;
6306
+ /* copy user's parameters to kernel space */
6307
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6310
+ /* check handle for validity */
6313
+ disk_node = (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY);
6314
+ for (list_node = evms_global_device_list; list_node; list_node = list_node->next)
6315
+ if (list_node->item == disk_node) {
6321
+ /* allocate a io buffer upto 64Kbytes in size */
6322
+ if (tmp.sector_count < MAX_IO_SIZE)
6323
+ io_size = tmp.sector_count;
6325
+ /* allocate buffer large enough to hold a single sector */
6326
+ rc = evms_cs_allocate_memory(
6327
+ (void **)&io_buffer,
6328
+ io_size << EVMS_VSECTOR_SIZE_SHIFT);
6330
+ /* perform io with specified disk */
6332
+ evms_sector_t io_sector_offset, io_remaining;
6333
+ u_int64_t io_bytes;
6334
+ u_char *user_buffer_ptr;
6336
+ io_remaining = tmp.sector_count;
6337
+ io_sector_offset = 0;
6338
+ user_buffer_ptr = tmp.buffer_address;
6339
+ while(io_remaining) {
6340
+ /* compute the io_size for this pass */
6341
+ io_size = (io_remaining >= MAX_IO_SIZE) ?
6342
+ MAX_IO_SIZE : io_remaining;
6344
+ io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
6345
+ /* for writes, copy a sector from user to kernel */
6346
+ if (tmp.io_flag == EVMS_SECTOR_IO_WRITE) {
6347
+ /* copy sector from user data buffer */
6348
+ if (copy_from_user(io_buffer,
6355
+ /* perform IO one sector at a time */
6359
+ io_sector_offset + tmp.starting_sector,
6365
+ if (tmp.io_flag != EVMS_SECTOR_IO_WRITE) {
6366
+ /* copy sector to user data buffer */
6367
+ if (copy_to_user(user_buffer_ptr,
6374
+ user_buffer_ptr += io_bytes;
6375
+ tmp.buffer_address += io_bytes;
6376
+ io_sector_offset += io_size;
6377
+ io_remaining -= io_size;
6381
+ /* if the sector_buffer was allocated, free it */
6383
+ evms_cs_deallocate_memory(io_buffer);
6385
+ /* copy the status value back to the user */
6387
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6394
+static int user_minor;
6396
+evms_ioctl_cmd_get_minor(void * arg)
6399
+ evms_user_minor_t tmp, *user_parms;
6401
+ user_parms = (evms_user_minor_t *)arg;
6402
+ /* copy user's parameters to kernel space */
6403
+ if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6407
+ if (tmp.command == EVMS_FIRST_VOLUME)
6409
+ else /* tmp.command == EVMS_NEXT_VOLUME */
6412
+ tmp.status = EVMS_VOLUME_INVALID;
6413
+ for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) {
6414
+ evms_logical_volume_t *lv;
6416
+ lv = &evms_logical_volumes[user_minor];
6417
+ /* see if any corrupt volumes have been
6418
+ * unmounted. If so, clean up the
6419
+ * evms_logical_volumes array entry, and
6420
+ * don't report the volume to the user.
6422
+ if (lv->flags & EVMS_VOLUME_CORRUPT) {
6423
+ if (!get_super(MKDEV(EVMS_MAJOR,user_minor))) {
6424
+ /* clear logical volume structure
6425
+ * for this volume so it may be
6428
+ LOG_WARNING("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n",
6429
+ ((lv->flags & EVMS_VOLUME_SOFT_DELETED) ?
6430
+ "'soft deleted'" : ""),
6431
+ EVMS_MAJOR, user_minor,
6433
+ LOG_WARNING(" releasing minor(%d) used by volume(%s)!\n",
6434
+ user_minor, lv->name);
6435
+ evms_cs_deallocate_memory(lv->name);
6440
+ if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) {
6441
+ tmp.status = EVMS_VOLUME_VALID;
6442
+ tmp.minor = user_minor;
6447
+ /* copy info to userspace */
6448
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6455
+evms_ioctl_cmd_get_volume_data(void * arg)
6458
+ evms_volume_data_t tmp, *user_parms;
6459
+ evms_logical_volume_t *volume = NULL;
6460
+ evms_logical_node_t *node = NULL;
6462
+ user_parms = (evms_volume_data_t *)arg;
6463
+ /* copy user's parameters to kernel space */
6464
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6468
+ volume = &evms_logical_volumes[tmp.minor];
6469
+ node = volume->node;
6474
+ tmp.flags = volume->flags;
6475
+ strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH);
6476
+ strcat(tmp.volume_name, volume->name);
6479
+ /* copy return code and info to userspace */
6481
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6486
+static evms_registered_plugin_t *ioctl_reg_record;
6488
+evms_ioctl_cmd_get_plugin(void * arg)
6491
+ evms_kernel_plugin_t tmp, *user_parms;
6493
+ user_parms = (evms_kernel_plugin_t *)arg;
6494
+ /* copy user's parameters to kernel space */
6495
+ if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6499
+ /* if the command is not 0, then verify
6500
+ * that ioctl_reg_record is pointing to
6501
+ * current and valid plugin header.
6503
+ if (tmp.command) { /* tmp.command == EVMS_NEXT_PLUGIN */
6504
+ evms_registered_plugin_t *tmp_reg_record;
6505
+ tmp_reg_record = registered_plugin_head;
6506
+ /* search the current plugin list */
6507
+ while(tmp_reg_record) {
6508
+ if (tmp_reg_record == ioctl_reg_record)
6510
+ tmp_reg_record = tmp_reg_record->next;
6512
+ /* if the ioctl_reg_record is not in the
6513
+ * current list, then start at the beginning.
6515
+ if (!tmp_reg_record)
6516
+ tmp.command = EVMS_FIRST_PLUGIN;
6519
+ if (tmp.command == EVMS_FIRST_PLUGIN)
6520
+ /* start at beginning of plugin list */
6521
+ ioctl_reg_record = registered_plugin_head;
6522
+ else /* tmp.command == EVMS_NEXT_PLUGIN */
6523
+ /* continue from current position in list */
6524
+ ioctl_reg_record = ioctl_reg_record->next;
6526
+ tmp.status = EVMS_PLUGIN_INVALID;
6528
+ if (ioctl_reg_record) {
6529
+ tmp.id = ioctl_reg_record->plugin->id;
6530
+ tmp.version = ioctl_reg_record->plugin->version;
6531
+ tmp.status = EVMS_PLUGIN_VALID;
6534
+ /* copy info to userspace */
6535
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6542
+evms_ioctl_cmd_plugin_ioctl(
6543
+ struct inode *inode,
6544
+ struct file *file,
6546
+ unsigned long arg)
6548
+ int rc = 0, found = FALSE;
6549
+ evms_plugin_ioctl_t tmp, *user_parms;
6550
+ evms_registered_plugin_t * p;
6552
+ user_parms = (evms_plugin_ioctl_t *)arg;
6553
+ /* copy user's parameters to kernel space */
6554
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6558
+ /* search for the specified plugin */
6559
+ for (p = registered_plugin_head; p; p = p->next)
6560
+ /* check for the specified feature id */
6561
+ if (p->plugin->id == tmp.feature_id) {
6563
+ /* check that entry point is used */
6564
+ if (p->plugin->function_table->direct_ioctl)
6565
+ rc = DIRECT_IOCTL(p, inode, file, cmd, arg);
6570
+ /* was the specified plugin found? */
6571
+ if (found == FALSE)
6574
+ /* copy the status value back to the user */
6576
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6582
+#define MAX_BUFFER_SIZE 65536
6584
+evms_ioctl_cmd_kernel_partial_csum(void * arg)
6587
+ u_int64_t compute_size = MAX_BUFFER_SIZE;
6588
+ evms_compute_csum_t tmp, *user_parms;
6589
+ unsigned char *buffer = NULL;
6591
+ user_parms = (evms_compute_csum_t *)arg;
6592
+ /* copy user's parameters to kernel space */
6593
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6597
+ /* allocate a io buffer upto 64Kbytes in size */
6598
+ if (tmp.buffer_size < MAX_BUFFER_SIZE)
6599
+ compute_size = tmp.buffer_size;
6601
+ /* allocate buffer large enough to hold a single sector */
6602
+ rc = evms_cs_allocate_memory(
6603
+ (void **)&buffer, compute_size);
6605
+ /* perform io with specified disk */
6607
+ evms_sector_t remaining_bytes;
6608
+ u_char *user_buffer_ptr;
6609
+ unsigned int insum = tmp.insum;
6611
+ remaining_bytes = tmp.buffer_size;
6612
+ user_buffer_ptr = tmp.buffer_address;
6613
+ while(remaining_bytes) {
6614
+ /* compute the compute_size for this pass */
6615
+ compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ?
6616
+ MAX_BUFFER_SIZE : remaining_bytes;
6618
+ /* copy into kernel from user data buffer */
6619
+ if (copy_from_user(buffer, user_buffer_ptr,
6623
+ /* compute the checksum for this pass */
6624
+ tmp.outsum = csum_partial(buffer, tmp.buffer_size,
6626
+ /* set up for another possible pass */
6627
+ insum = tmp.outsum;
6628
+ /* update loop progress variables */
6629
+ user_buffer_ptr += compute_size;
6630
+ tmp.buffer_address += compute_size;
6631
+ remaining_bytes -= compute_size;
6635
+ /* if the sector_buffer was allocated, free it */
6637
+ evms_cs_deallocate_memory(buffer);
6639
+ /* copy the status value back to the user */
6641
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6646
+#undef MAX_BUFFER_SIZE
6649
+evms_ioctl_cmd_get_bmap(
6650
+ struct inode *inode,
6651
+ struct file *file,
6653
+ unsigned long arg)
6656
+ evms_get_bmap_t tmp, *user_parms;
6658
+ user_parms = (evms_get_bmap_t *)arg;
6659
+ /* copy user's parameters to kernel space */
6660
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6663
+ /* pass the ioctl down the volume stack */
6665
+ evms_logical_volume_t *volume;
6667
+ volume = &evms_logical_volumes[MINOR(inode->i_rdev)];
6668
+ rc = IOCTL(volume->node, inode, file, cmd, (unsigned long)&tmp);
6670
+ /* copy the status value back to the user */
6672
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6679
+evms_ioctl_cmd_process_notify_event(unsigned long arg)
6681
+ int rc = 0, found = FALSE;
6682
+ evms_notify_t tmp, *user_parms;
6683
+ evms_list_node_t **list_node = NULL;
6684
+ evms_event_t *event = NULL;
6686
+ user_parms = (evms_notify_t *)arg;
6687
+ /* copy user's parameters to kernel space */
6688
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6691
+ /* check to see if PID has already been registered
6695
+ list_node = &evms_global_notify_list;
6696
+ while(*list_node) {
6697
+ event = (*list_node)->item;
6698
+ if ((event->pid == tmp.eventry.pid) &&
6699
+ (event->eventid == tmp.eventry.eventid)) {
6703
+ list_node = &(*list_node)->next;
6706
+ if (tmp.command) { /* tmp.command == EVMS_REGISTER_EVENT */
6707
+ /* registration code */
6710
+ LOG_ERROR("error(%d) pid(%d) already register to receive signal(%d) on event(%d).\n",
6711
+ rc, tmp.eventry.pid, tmp.eventry.signo, tmp.eventry.eventid);
6713
+ /* register this pid/event type */
6714
+ rc = evms_cs_allocate_memory((void **)&event, sizeof(evms_event_t));
6716
+ LOG_ERROR("error(%d) allocating event structure.\n",
6719
+ event->pid = tmp.eventry.pid;
6720
+ event->eventid = tmp.eventry.eventid;
6721
+ event->signo = tmp.eventry.signo;
6722
+ rc = evms_cs_add_item_to_list(
6723
+ &evms_global_notify_list,
6727
+ } else { /* tmp.command == EVMS_UNREGISTER_EVENT */
6728
+ /* unregistration code */
6731
+ LOG_ERROR("error(%d) attempting to unregister a non-registered pid(%d) on event(%d).\n",
6732
+ rc, tmp.eventry.pid, tmp.eventry.eventid);
6734
+ event = (*list_node)->item;
6735
+ rc = evms_cs_remove_item_from_list(
6736
+ &evms_global_notify_list,
6739
+ evms_cs_deallocate_memory(event);
6743
+ /* copy the status value back to the user */
6745
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6750
+/************************************************/
6751
+/* END -- IOCTL commands -- EVMS specific */
6752
+/************************************************/
6754
+/************************************************/
6755
+/* START -- IOCTL commands -- Volume specific */
6756
+/************************************************/
6758
+/************************************************/
6759
+/* END -- IOCTL commands -- Volume specific */
6760
+/************************************************/
6762
+/************************************************/
6763
+/* START -- IOCTL main */
6764
+/************************************************/
6767
+ * Function: evms_ioctl
6769
+ * This function is the main ioctl entry point for all of evms.
6774
+ struct inode *inode,
6775
+ struct file *file,
6777
+ unsigned long arg)
6779
+ unsigned long minor = 0;
6781
+ evms_logical_node_t *node = NULL;
6783
+ /* check user access */
6784
+ if (!capable(CAP_SYS_ADMIN))
6791
+ /* get the minor */
6792
+ minor = MINOR(inode->i_rdev);
6793
+ LOG_EXTRA("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n",
6795
+ (cmd >> _IOC_DIRSHIFT) & _IOC_DIRMASK,
6796
+ (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
6797
+ (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK,
6798
+ (cmd >> _IOC_NRSHIFT) & _IOC_NRMASK);
6800
+ /* insure this minor points to a valid volume */
6802
+ node = evms_logical_volumes[minor].node;
6808
+ /* process the IOCTL commands */
6811
+ /* process all EVMS specific commands */
6813
+ case EVMS_GET_IOCTL_VERSION:
6814
+ rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
6816
+ case EVMS_GET_VERSION:
6817
+ rc = evms_ioctl_cmd_get_version((void *)arg);
6819
+ case EVMS_GET_INFO_LEVEL:
6820
+ rc = evms_ioctl_cmd_get_info_level((void *)arg);
6822
+ case EVMS_SET_INFO_LEVEL:
6823
+ rc = evms_ioctl_cmd_set_info_level((void *)arg);
6825
+ case EVMS_REDISCOVER_VOLUMES:
6826
+ rc = evms_ioctl_cmd_rediscover_volumes(inode, file, cmd, arg);
6828
+ case EVMS_GET_LOGICAL_DISK:
6829
+ rc = evms_ioctl_cmd_get_logical_disk((void *)arg);
6831
+ case EVMS_GET_LOGICAL_DISK_INFO:
6832
+ rc = evms_ioctl_cmd_get_logical_disk_info((void *)arg);
6834
+ case EVMS_SECTOR_IO:
6835
+ rc = evms_ioctl_cmd_sector_io((void *)arg);
6837
+ case EVMS_GET_MINOR:
6838
+ rc = evms_ioctl_cmd_get_minor((void *)arg);
6840
+ case EVMS_GET_VOLUME_DATA:
6841
+ rc = evms_ioctl_cmd_get_volume_data((void *)arg);
6843
+ case EVMS_DELETE_VOLUME:
6844
+ rc = evms_ioctl_cmd_delete_volume(inode, file, arg);
6846
+ case EVMS_GET_PLUGIN:
6847
+ rc = evms_ioctl_cmd_get_plugin((void *)arg);
6849
+ case EVMS_PLUGIN_IOCTL:
6850
+ rc = evms_ioctl_cmd_plugin_ioctl(inode, file, cmd, arg);
6852
+ case EVMS_COMPUTE_CSUM:
6853
+ rc = evms_ioctl_cmd_kernel_partial_csum((void *)arg);
6855
+ case EVMS_PROCESS_NOTIFY_EVENT:
6856
+ rc = evms_ioctl_cmd_process_notify_event(arg);
6863
+ /* process Volume specific commands */
6865
+ /* pick up standard blk ioctls */
6873
+ rc = blk_ioctl(inode->i_rdev, cmd, arg);
6877
+ /* casting size down to 32-bits until
6878
+ * kernel allows return of 64-bit size
6881
+ long size = node->total_vsectors;
6882
+ if (copy_to_user((long *)arg, &size, sizeof(long)))
6886
+ case BLKGETSIZE64:
6888
+ u64 size_in_bytes = node->total_vsectors << EVMS_VSECTOR_SIZE_SHIFT;
6889
+ if (copy_to_user((u64 *)arg, &size_in_bytes, sizeof(u64)))
6893
+ case EVMS_GET_IOCTL_VERSION:
6894
+ rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
6896
+ case EVMS_GET_BMAP:
6897
+ rc = evms_ioctl_cmd_get_bmap(inode, file, cmd, arg);
6900
+ rc = IOCTL(node, inode, file, cmd, arg);
6908
+/************************************************/
6909
+/* END -- IOCTL main */
6910
+/************************************************/
6912
+/************************************************/
6913
+/* START -- CHECK MEDIA CHANGE */
6914
+/************************************************/
6917
+evms_check_media_change(kdev_t dev)
6920
+ evms_logical_volume_t *volume = NULL;
6922
+ /* check user access */
6923
+ if (!capable(CAP_SYS_ADMIN))
6927
+ /* get the minor */
6928
+ minor = MINOR(dev);
6929
+ /* insure this minor points to a valid volume */
6930
+ volume = &evms_logical_volumes[minor];
6931
+ if (volume->node == NULL) {
6936
+ if (volume->flags & EVMS_DEVICE_REMOVABLE) {
6937
+ /* check for media change */
6938
+ rc = evms_cs_kernel_ioctl(
6940
+ EVMS_CHECK_MEDIA_CHANGE,
6941
+ (unsigned long)NULL);
6943
+ LOG_ERROR("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
6944
+ rc, volume->name);
6951
+/************************************************/
6952
+/* END -- CHECK MEDIA CHANGE */
6953
+/************************************************/
6956
+evms_discover_logical_disks(evms_logical_node_t **);
6959
+evms_check_for_device_changes(
6960
+ struct inode *inode,
6961
+ struct file *file)
6963
+ int rc = 0, something_changed = 0, i;
6964
+ evms_rediscover_t kernel_rd_pckt = {0,0,NULL};
6965
+ evms_list_node_t *disk_list = NULL, *lnode, *next_lnode;
6966
+ evms_logical_node_t *disk, *new_device_list = NULL;
6967
+ evms_logical_volume_t *volume = NULL;
6969
+ /* check for new devices
6971
+ * put all new devices on the disk list so they
6972
+ * will be included in the rediscovery process.
6974
+ evms_discover_logical_disks(&new_device_list);
6975
+ if (new_device_list) {
6976
+ LOG_DETAILS("%s: new devices detected.\n", __FUNCTION__);
6977
+ something_changed++;
6978
+ /* put these new nodes on the disk list */
6979
+ while(new_device_list) {
6980
+ disk = new_device_list;
6981
+ rc = evms_cs_remove_logical_node_from_list(
6982
+ &new_device_list,disk);
6984
+ LOG_ERROR("%s: error(%d) removing device(%s) from list.\n",
6985
+ __FUNCTION__, rc, disk->name);
6987
+ rc = evms_cs_add_item_to_list(
6990
+ LOG_ERROR("%s: error(%d) adding device(%s) from list.\n",
6991
+ __FUNCTION__, rc, disk->name);
6996
+ /* check all devices for changed removable media
6998
+ * scan the global device list and issue check
6999
+ * media change on each removable media device.
7000
+ * put all removable devices that indicate a
7001
+ * media change on the disk list.
7003
+ for (lnode = evms_global_device_list; lnode; lnode = lnode->next) {
7004
+ disk = (evms_logical_node_t *)lnode->item;
7005
+ /* only really check removable media devices */
7006
+ if (disk->flags & EVMS_DEVICE_REMOVABLE) {
7007
+ /* check for media change */
7008
+ rc = evms_cs_kernel_ioctl(
7010
+ EVMS_CHECK_MEDIA_CHANGE,
7011
+ (unsigned long)NULL);
7013
+ LOG_ERROR("%s: error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
7014
+ __FUNCTION__, rc, disk->name);
7015
+ } else if (rc == 1) {
7016
+ something_changed++;
7017
+ rc = evms_cs_add_item_to_list(
7018
+ &disk_list, disk);
7022
+ /* log a statement that we detected changed media.
7025
+ LOG_DETAILS("%s: media change detected.\n", __FUNCTION__);
7028
+ /* check for volumes with removed removable media.
7029
+ * mark the volumes that reside on changed media.
7031
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7032
+ volume = &evms_logical_volumes[i];
7033
+ if (!volume->node)
7035
+ if (!(volume->flags & EVMS_DEVICE_REMOVABLE))
7037
+ if (evms_check_media_change(MKDEV(EVMS_MAJOR,i)) <= 0)
7039
+ /* remember which volumes have changed media */
7040
+ volume->flags |= EVMS_MEDIA_CHANGED;
7041
+ something_changed++;
7044
+ /* check for removed hotplug devices */
7046
+ /* do we have some work to do? */
7047
+ if (something_changed) {
7048
+ /* check for volumes to be deleted */
7049
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7050
+ evms_quiesce_volume_t qv;
7052
+ volume = &evms_logical_volumes[i];
7053
+ if (!volume->node)
7055
+ /* only proceed on volumes with:
7057
+ * hot-unplugged devices,
7058
+ * & partial volumes
7060
+ if (!(volume->flags &
7061
+ (EVMS_MEDIA_CHANGED |
7062
+ EVMS_VOLUME_PARTIAL |
7063
+ EVMS_DEVICE_UNPLUGGED)))
7065
+ /* gather the disk's needing to be
7066
+ * rediscovered to rebuild this
7069
+ * this will locate other disks that
7070
+ * the volume resides on that don't
7071
+ * indicate media change.
7073
+ rc = evms_cs_kernel_ioctl(
7075
+ EVMS_GET_DISK_LIST,
7076
+ (unsigned long)&disk_list);
7078
+ LOG_ERROR("%s: error(%d) retrieving underlying disk list for '%s', skipping ...\n",
7079
+ __FUNCTION__, rc, volume->name);
7082
+ /* quiesce all the changed volumes
7083
+ * prior to being deleted.
7085
+ qv.command = 1; // quiesce
7087
+ qv.status = 0; // reset status
7089
+ rc = evms_quiesce_volume(volume, inode, file, &qv);
7091
+ LOG_ERROR("%s: error(%d) attempting to quiesce '%s%s'.\n",
7093
+ EVMS_DEV_NODE_PATH,
7098
+ /* we need to revalidate all the changed
7099
+ * media. this is accomplished by issuing
7100
+ * the revalidate disk ioctl to each device
7101
+ * with changed media. the device manager
7102
+ * remembers which devices indicated
7103
+ * media changed (set by check media
7104
+ * changed ioctl issued earlier), and will
7105
+ * only issue the revalidate disk ioctl to
7106
+ * those disks one time.
7109
+ * this needs to be done BEFORE deleting
7110
+ * the volumes because deleting the
7111
+ * last segment on disk will cause the
7112
+ * associated disk node to freed, and we
7113
+ * will not be able to issue the
7114
+ * revalidate disk ioctl after that.
7116
+ for (lnode = disk_list; lnode; lnode = lnode->next) {
7117
+ disk = (evms_logical_node_t *)lnode->item;
7118
+ /* only really do removable media devices */
7119
+ if (disk->flags & EVMS_MEDIA_CHANGED) {
7120
+ /* go revalidate the change media */
7121
+ rc = evms_cs_kernel_ioctl(
7123
+ EVMS_REVALIDATE_DISK,
7124
+ (unsigned long)NULL);
7128
+ /* delete all the affected volumes */
7129
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7130
+ evms_delete_volume_t dv;
7132
+ volume = &evms_logical_volumes[i];
7133
+ if (!volume->node)
7135
+ /* only proceed on volumes with:
7137
+ * hot-unplugged devices,
7138
+ * & partial volumes
7140
+ if (!(volume->flags &
7141
+ (EVMS_MEDIA_CHANGED |
7142
+ EVMS_VOLUME_PARTIAL |
7143
+ EVMS_DEVICE_UNPLUGGED)))
7145
+ /* only delete quiesced volumes */
7146
+ if (!volume->quiesced)
7148
+ /* delete the volume from memory.
7149
+ * do a 'soft' delete if volume
7150
+ * is mounted, and 'hard' delete
7153
+ * NOTE: the delete operation will
7154
+ * clear the bits in the flags field.
7156
+ dv.command = (is_mounted(MKDEV(EVMS_MAJOR,i))) ? 0 : 1;
7159
+ rc = evms_delete_volume(volume, &dv);
7162
+ /* at this point all devices indicating
7163
+ * media change that had volumes on them
7164
+ * should be gone. however, we could still
7165
+ * have devices indicating media change
7166
+ * that had no volumes on them in the disk
7167
+ * list. we need to delete these devices
7168
+ * from kernel memory and the global device
7171
+ for (lnode = evms_global_device_list; lnode; lnode = next_lnode) {
7172
+ next_lnode = lnode->next;
7174
+ disk = (evms_logical_node_t *)lnode->item;
7175
+ if (disk->flags & EVMS_MEDIA_CHANGED) {
7176
+ rc = DELETE(disk);
7180
+ /* all the devices that indicated media
7181
+ * change should be gone, both from kernel
7182
+ * memory and global device list. we now
7183
+ * need to remove any references to these
7184
+ * devices from the disk list.
7186
+ * when removable media is installed, it
7187
+ * will get detected in the device manager's
7188
+ * rediscovery as a new device and added to
7189
+ * the discover list.
7191
+ for (lnode = disk_list; lnode; lnode = next_lnode) {
7192
+ evms_list_node_t *glnode;
7193
+ int lnode_still_there;
7195
+ next_lnode = lnode->next;
7197
+ lnode_still_there = FALSE;
7198
+ for (glnode = evms_global_device_list;
7199
+ glnode; glnode = glnode->next) {
7200
+ if (glnode->item == lnode->item) {
7201
+ lnode_still_there = TRUE;
7205
+ if (lnode_still_there == FALSE) {
7206
+ rc = evms_cs_remove_item_from_list(
7210
+ LOG_ERROR("%s: error(%d) attempting to remove item(%p) from disk_list(%p).\n",
7211
+ __FUNCTION__, rc, lnode->item, &disk_list);
7216
+ /* build the in-kernel rediscover packet */
7218
+ /* allocate the space for the drive_array in
7219
+ * the evms_rediscover_t packet. to do this
7220
+ * we need to count the number of disk nodes,
7221
+ * then allocate the necessary space.
7223
+ /* count the disk nodes */
7224
+ for (lnode = disk_list; lnode; lnode = lnode->next)
7225
+ kernel_rd_pckt.drive_count++;
7226
+ /* allocate the space */
7227
+ if (kernel_rd_pckt.drive_count) {
7228
+ rc = evms_cs_allocate_memory(
7229
+ (void **)&kernel_rd_pckt.drive_array,
7230
+ kernel_rd_pckt.drive_count *
7231
+ sizeof(unsigned long));
7233
+ LOG_ERROR("%s: error(%d) allocating rediscover drive array.\n",
7234
+ __FUNCTION__, rc);
7237
+ /* populate the drive array
7239
+ * this also frees the disk_list which is useful
7240
+ * if we had an error allocating the drive array.
7242
+ for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) {
7243
+ next_lnode = lnode->next;
7245
+ /* remove this disk from the disk list */
7246
+ disk = (evms_logical_node_t *)lnode->item;
7247
+ rc = evms_cs_remove_item_from_list(&disk_list, disk);
7249
+ /* add this disk to rediscover
7252
+ kernel_rd_pckt.drive_array[i] =
7253
+ (unsigned long)disk ^ EVMS_HANDLE_KEY;
7256
+ /* perform the rediscovery operation */
7258
+ rc = evms_discover_volumes(&kernel_rd_pckt);
7259
+ if (kernel_rd_pckt.drive_count) {
7260
+ evms_cs_deallocate_memory(
7261
+ kernel_rd_pckt.drive_array);
7264
+ LOG_DETAILS("%s: rediscover completed.\n", __FUNCTION__);
7270
+/************************************************/
7271
+/* START -- REVALIDATE DISK */
7272
+/************************************************/
7275
+evms_revalidate_disk(kdev_t dev)
7278
+ evms_logical_volume_t *volume = NULL;
7280
+ /* check user access */
7281
+ if (!capable(CAP_SYS_ADMIN))
7285
+ /* get the minor */
7286
+ minor = MINOR(dev);
7287
+ /* insure this minor points to a valid volume */
7288
+ volume = &evms_logical_volumes[minor];
7289
+ if (volume->node == NULL) {
7294
+ /* go revalidate the change media */
7295
+ rc = evms_cs_kernel_ioctl(
7297
+ EVMS_REVALIDATE_DISK,
7298
+ (unsigned long)NULL);
7303
+/************************************************/
7304
+/* END -- REVALIDATE DISK */
7305
+/************************************************/
7307
+/************************************************/
7308
+/* START -- OPEN */
7309
+/************************************************/
7312
+evms_open(struct inode * inode, struct file * file)
7314
+ int rc = 0, minor = 0;
7315
+ evms_logical_volume_t *volume = NULL;
7317
+ /* check user access */
7318
+ if (!capable(CAP_SYS_ADMIN))
7324
+ rc = evms_check_for_device_changes(inode, file);
7326
+ /* get the minor */
7327
+ minor = MINOR(inode->i_rdev);
7329
+ /* insure this minor points to a valid volume */
7330
+ volume = &evms_logical_volumes[minor];
7331
+ if (volume->node == NULL) {
7336
+ /* go "open" the volume */
7337
+ if (!rc && minor) {
7338
+ rc = IOCTL(volume->node, inode, file,
7340
+ (unsigned long)NULL);
7342
+ LOG_ERROR("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n",
7343
+ rc, volume->name);
7349
+/************************************************/
7351
+/************************************************/
7353
+/************************************************/
7354
+/* START -- RELEASE */
7355
+/************************************************/
7358
+evms_release(struct inode * inode, struct file * file)
7360
+ int rc = 0, minor = 0;
7361
+ evms_logical_volume_t *volume = NULL;
7363
+ /* check user access */
7364
+ if (!capable(CAP_SYS_ADMIN))
7371
+ /* get the minor */
7372
+ minor = MINOR(inode->i_rdev);
7374
+ /* insure this minor points to a valid volume */
7375
+ volume = &evms_logical_volumes[minor];
7376
+ if (volume->node == NULL) {
7381
+ /* go "close" the volume */
7382
+ if (!rc && minor) {
7383
+ rc = IOCTL(volume->node, inode, file,
7384
+ EVMS_CLOSE_VOLUME,
7385
+ (unsigned long)NULL);
7387
+ LOG_ERROR("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n",
7388
+ rc, volume->name);
7394
+/************************************************/
7395
+/* END -- RELEASE */
7396
+/************************************************/
7398
+struct block_device_operations evms_fops = {
7399
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,14)
7400
+ owner: THIS_MODULE,
7403
+ release: evms_release,
7404
+ ioctl: evms_ioctl,
7405
+ check_media_change: evms_check_media_change,
7406
+ revalidate: evms_revalidate_disk
7409
+/**********************************************************/
7410
+/* END -- FOPS functions definitions */
7411
+/**********************************************************/
7413
+/**********************************************************/
7414
+/* START -- RUNTIME support functions */
7415
+/**********************************************************/
7418
+evms_do_request_fn(request_queue_t *q) {
7419
+ LOG_WARNING("This function should not be called.\n");
7423
+static request_queue_t *
7424
+evms_find_queue(kdev_t dev)
7426
+ request_queue_t *rq = NULL;
7427
+ evms_logical_volume_t *volume;
7429
+ volume = &evms_logical_volumes[MINOR(dev)];
7431
+ rq = &volume->request_queue;
7437
+ * Function: evms_make_request_fn
7441
+evms_make_request_fn(
7442
+ request_queue_t *q,
7444
+ struct buffer_head *bh)
7446
+ evms_logical_volume_t *volume;
7449
+ eio.rsector = bh->b_rsector;
7450
+ eio.rsize = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
7453
+ volume = &evms_logical_volumes[MINOR(bh->b_dev)];
7454
+ wait_event(volume->wait_queue, (!volume->quiesced));
7455
+ if (volume->node) {
7459
+ atomic_inc(&volume->requests_in_progress);
7460
+ R_IO(volume->node, &eio);
7461
+ atomic_dec(&volume->requests_in_progress);
7464
+ atomic_inc(&volume->requests_in_progress);
7465
+ W_IO(volume->node, &eio);
7466
+ atomic_dec(&volume->requests_in_progress);
7469
+ buffer_IO_error(bh);
7473
+ LOG_ERROR("request for unknown logical volume [minor(%d)].\n",
7475
+ buffer_IO_error(bh);
7480
+/**********************************************************/
7481
+/* END -- RUNTIME support functions */
7482
+/**********************************************************/
7484
+/**********************************************************/
7485
+/* START -- INIT/DISCOVERY support functions */
7486
+/**********************************************************/
7489
+ * Function: evms_discover_logical_disks
7490
+ * Description: Construct the logical disk list by calling all registered device managers.
7493
+evms_discover_logical_disks(evms_logical_node_t **disk_list)
7495
+ evms_registered_plugin_t * p;
7496
+ LOG_EXTRA("discovering logical disks...\n");
7497
+ for (p = registered_plugin_head; p; p = p->next) {
7498
+ if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) {
7499
+ DISCOVER(p, disk_list);
7505
+ * Function: evms_discover_logical_partitions
7506
+ * Description: Construct the logical partition list by calling all registered partition managers.
7509
+evms_discover_logical_partitions(evms_logical_node_t **discover_list)
7513
+ evms_registered_plugin_t * p;
7514
+ LOG_EXTRA("discovering logical partitions...\n");
7517
+ for (p = registered_plugin_head; p; p = p->next) {
7518
+ if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER) {
7519
+ rc = DISCOVER(p, discover_list);
7520
+ /* RC > 0 means the plugin
7521
+ * added something to the
7522
+ * discover list. This also
7523
+ * means we must loop thru
7524
+ * these plugins another time.
7525
+ * RC == 0 means nothing was
7526
+ * added to the discover list
7528
+ * RC < 0 means the plugin
7529
+ * encountered some error and
7530
+ * nothing was added to the list.
7531
+ * NOTE: If a plugin has both
7532
+ * added something new to the
7533
+ * discover list and encountered
7534
+ * an error, RC > 0 must be
7541
+ } while (done == FALSE);
7543
+ /* send the end of discovery signal to each
7544
+ * partition manager plugin.
7546
+ for (p = registered_plugin_head; p; p = p->next)
7547
+ if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER)
7548
+ if (p->plugin->function_table->end_discover)
7549
+ rc = END_DISCOVER(p, discover_list);
7553
+ * Function: evms_discover_volume_groups
7554
+ * Description: Find volume groups within the logical partitions list
7557
+evms_discover_volume_groups(evms_logical_node_t **discover_list)
7561
+ evms_registered_plugin_t * p;
7562
+ LOG_EXTRA("discovering logical volume groups...\n");
7565
+ for (p = registered_plugin_head; p; p = p->next) {
7566
+ if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) {
7567
+ rc = DISCOVER(p, discover_list);
7568
+ /* RC > 0 means the plugin
7569
+ * added something to the
7570
+ * discover list. This also
7571
+ * means we must loop thru
7572
+ * these plugins another time.
7573
+ * RC == 0 means nothing was
7574
+ * added to the discover list
7576
+ * RC < 0 means the plugin
7577
+ * encountered some error and
7578
+ * nothing was added to the list.
7579
+ * NOTE: If a plugin has both
7580
+ * added something new to the
7581
+ * discover list and encountered
7582
+ * an error, RC > 0 must be
7589
+ } while (done == FALSE);
7591
+ /* send the end of discovery signal to each volume
7594
+ for (p = registered_plugin_head; p; p = p->next)
7595
+ if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER)
7596
+ if (p->plugin->function_table->end_discover)
7597
+ rc = END_DISCOVER(p, discover_list);
7602
+ * convert all the feature header fields into cpu native format
7603
+ * from the on-disk Little Endian format. From this point forward
7604
+ * all plugins can deal with feature headers natively.
7607
+le_feature_header_to_cpu(evms_feature_header_t *fh)
7609
+ fh->signature = le32_to_cpu(fh->signature);
7610
+ fh->crc = le32_to_cpu(fh->crc);
7611
+ fh->version.major = le32_to_cpu(fh->version.major);
7612
+ fh->version.minor = le32_to_cpu(fh->version.minor);
7613
+ fh->version.patchlevel = le32_to_cpu(fh->version.patchlevel);
7614
+ fh->engine_version.major = le32_to_cpu(fh->engine_version.major);
7615
+ fh->engine_version.minor = le32_to_cpu(fh->engine_version.minor);
7616
+ fh->engine_version.patchlevel = le32_to_cpu(fh->engine_version.patchlevel);
7617
+ fh->flags = le32_to_cpu(fh->flags);
7618
+ fh->feature_id = le32_to_cpu(fh->feature_id);
7619
+ fh->sequence_number = le64_to_cpu(fh->sequence_number);
7620
+ fh->alignment_padding = le64_to_cpu(fh->alignment_padding);
7621
+ fh->feature_data1_start_lsn = le64_to_cpu(fh->feature_data1_start_lsn);
7622
+ fh->feature_data1_size = le64_to_cpu(fh->feature_data1_size);
7623
+ fh->feature_data2_start_lsn = le64_to_cpu(fh->feature_data2_start_lsn);
7624
+ fh->feature_data2_size = le64_to_cpu(fh->feature_data2_size);
7625
+ fh->volume_serial_number = le64_to_cpu(fh->volume_serial_number);
7626
+ fh->volume_system_id = le32_to_cpu(fh->volume_system_id);
7627
+ fh->object_depth = le32_to_cpu(fh->object_depth);
7631
+edef_load_feature_header(evms_logical_node_t *node)
7633
+ int i, rc = 0, rc_array[2] = {0,0};
7634
+ unsigned long size_in_bytes;
7635
+ u_int64_t size_in_sectors, starting_sector = 0;
7636
+ evms_feature_header_t *fh = NULL, *fh1 = NULL, *fh2 = NULL;
7637
+ char *location_name = NULL;
7638
+ evms_version_t version = {
7639
+ EVMS_FEATURE_HEADER_MAJOR,
7640
+ EVMS_FEATURE_HEADER_MINOR,
7641
+ EVMS_FEATURE_HEADER_PATCHLEVEL
7644
+ if (!node->feature_header) {
7645
+ size_in_sectors = evms_cs_size_in_vsectors(sizeof(*fh));
7646
+ size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT;
7647
+ rc = evms_cs_allocate_memory((void **)&fh1,size_in_bytes);
7649
+ rc = evms_cs_allocate_memory((void **)&fh2,size_in_bytes);
7651
+ evms_cs_deallocate_memory(fh1);
7653
+ for (i = 0; i < 2; i++) {
7656
+ node->total_vsectors -
7659
+ location_name = evms_primary_string;
7661
+ starting_sector--;
7663
+ location_name = evms_secondary_string;
7665
+ /* read header into buffer */
7673
+ LOG_ERROR("error(%d) probing for %s feature header(at %Ld) on '%s'.\n",
7681
+ /* validate header signature */
7682
+ if (cpu_to_le32(fh->signature) != EVMS_FEATURE_HEADER_SIGNATURE) {
7687
+ /* validate header CRC */
7688
+ if (fh->crc != EVMS_MAGIC_CRC) {
7689
+ u_int32_t org_crc, final_crc;
7690
+ org_crc = cpu_to_le32(fh->crc);
7692
+ final_crc = evms_cs_calculate_crc(
7695
+ if (final_crc != org_crc) {
7696
+ LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at %Ld) on '%s'.\n",
7697
+ org_crc, final_crc,
7706
+ LOG_WARNING("CRC disabled in %s feature header(at %Ld) on '%s'.\n",
7711
+ /* convert the feature header from the
7712
+ * on-disk format (Little Endian) to
7713
+ * native cpu format.
7715
+ le_feature_header_to_cpu(fh);
7716
+ /* verify the system data version */
7717
+ rc = evms_cs_check_version(
7721
+ LOG_ERROR("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n",
7722
+ fh->version.major,
7723
+ fh->version.minor,
7724
+ fh->version.patchlevel,
7731
+ /* getting same return code for both copies? */
7732
+ if (rc_array[0] == rc_array[1]) {
7734
+ /* if no errors on both copies,
7735
+ * check the sequence numbers.
7736
+ * use the highest sequence number.
7739
+ /* compare sequence numbers */
7740
+ if (fh1->sequence_number == fh2->sequence_number) {
7743
+ LOG_WARNING("%s feature header sequence number(%Ld) mismatches %s feature header sequence number(%Ld) on '%s'!\n",
7744
+ evms_primary_string,
7745
+ fh1->sequence_number,
7746
+ evms_secondary_string,
7747
+ fh2->sequence_number,
7749
+ if (fh1->sequence_number > fh2->sequence_number) {
7751
+ location_name = evms_primary_string;
7752
+ /* indicate bad sequence number of secondary */
7756
+ location_name = evms_secondary_string;
7757
+ /* indicate bad sequence number of primary */
7762
+ /* getting different return codes for each copy */
7764
+ /* either primary or secondary copy is
7765
+ * valid, so use the valid copy.
7767
+ if ((rc_array[0] == 0) ||
7768
+ (rc_array[1] == 0)) {
7769
+ char *warn_name = NULL;
7771
+ /* indicate success */
7773
+ /* set variables based on which copy is valid */
7774
+ if (rc_array[0] == 0) {
7775
+ /* use primary (rear) copy if its good */
7777
+ location_name = evms_primary_string;
7778
+ warn_name = evms_secondary_string;
7780
+ /* use secondary (front) copy if its good */
7782
+ location_name = evms_secondary_string;
7783
+ warn_name = evms_primary_string;
7785
+ /* warn the user about the invalid copy */
7786
+ LOG_WARNING("warning: error(%d) probing/verifying the %s feature header on '%s'.\n",
7787
+ rc_array[0] + rc_array[1],
7791
+ /* both copies had a different error,
7792
+ * and one was a fatal error, so
7793
+ * indicate fatal error.
7795
+ if ((rc_array[0] == -EINVAL) ||
7796
+ (rc_array[1] == -EINVAL)) {
7800
+ /* on error, set fh to NULL */
7801
+ if (rc) fh = NULL;
7803
+ /* deallocate metadata buffers appropriately */
7805
+ evms_cs_deallocate_memory(fh1);
7807
+ evms_cs_deallocate_memory(fh2);
7809
+ /* save validated feature header pointer */
7811
+ node->feature_header = fh;
7812
+ if (rc_array[0] != rc_array[1]) {
7813
+ LOG_DETAILS("using %s feature header on '%s'.\n",
7819
+ /* if no signature found, adjust return code */
7820
+ if (rc == -ENODATA) {
7822
+ LOG_DEBUG("no feature header found on '%s'.\n",
7830
+edef_find_first_features(evms_logical_node_t **discover_list)
7833
+ evms_logical_node_t *node, *tmp_list_head;
7835
+ tmp_list_head = *discover_list;
7836
+ *discover_list = NULL;
7838
+ while(tmp_list_head) {
7839
+ node = tmp_list_head;
7840
+ rc = evms_cs_remove_logical_node_from_list(
7844
+ /* load the feature header if present */
7845
+ rc = edef_load_feature_header(node);
7846
+ /* This node have a feature header ?
7847
+ * it won't be if there is no header to load
7849
+ * there was a fatal error attempting to read it.
7851
+ if (node->feature_header) {
7852
+ /* check for object flag */
7853
+ if (node->feature_header->flags &
7854
+ EVMS_VOLUME_DATA_OBJECT) {
7855
+ LOG_DEFAULT("object detected, deleting '%s'.\n",
7859
+ /* check for stop-data flag */
7860
+ if (node->feature_header->flags &
7861
+ EVMS_VOLUME_DATA_STOP) {
7862
+ LOG_DEFAULT("stop data detected, deleting '%s'.\n",
7866
+ /* register node on global list */
7867
+ evms_list_node_t **evms_node;
7869
+ /* check for duplicate pointers */
7870
+ /* search for node in global list */
7871
+ evms_node = evms_lookup_item_in_list(
7872
+ &evms_global_feature_node_list,
7874
+ /* already present? */
7876
+ /* yes, already present */
7877
+ rc = -ENODATA; /* dont process this node further */
7878
+ LOG_DEFAULT("deleting duplicate reference to '%s'.\n",
7880
+ /* forget this node */
7883
+ /* no, not present.
7884
+ * add it to the list.
7886
+ node->flags |= EVMS_VOLUME_FLAG;
7887
+ node->iflags |= EVMS_FEATURE_BOTTOM;
7888
+ rc = evms_cs_allocate_memory(
7889
+ (void **)&node->volume_info,
7890
+ sizeof(evms_volume_info_t));
7892
+ node->volume_info->volume_serial_number =
7893
+ node->feature_header->volume_serial_number;
7894
+ node->volume_info->volume_system_id =
7895
+ node->feature_header->volume_system_id;
7896
+ strcpy(node->volume_info->volume_name,
7897
+ node->feature_header->volume_name);
7898
+ rc = evms_cs_add_item_to_list(
7899
+ &evms_global_feature_node_list,
7905
+ /* if any errors, delete the node */
7910
+ /* on successful processing of this node
7911
+ * place it back on the discover list.
7913
+ evms_cs_add_logical_node_to_list(
7920
+/* These define describe the node types that can be isolated. */
7921
+#define ISOLATE_ASSOCIATIVE_FEATURES 0
7922
+#define ISOLATE_COMPATIBILITY_VOLUMES 1
7923
+#define ISOLATE_EVMS_VOLUMES 2
7924
+#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER 3
7925
+#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH 4
7927
+edef_isolate_nodes_by_type(
7928
+ unsigned int type,
7929
+ evms_logical_node_t **src_list,
7930
+ evms_logical_node_t **trg_list,
7931
+ u_int32_t compare32,
7932
+ u_int64_t compare64)
7934
+ evms_logical_node_t *node, *next_node;
7935
+ int rc = 0, found_node;
7936
+ evms_feature_header_t *fh = NULL;
7938
+ for (node = *src_list; node; node = next_node) {
7939
+ next_node = node->next;
7941
+ if (node->feature_header)
7942
+ fh = node->feature_header;
7943
+ found_node = FALSE;
7945
+ case ISOLATE_ASSOCIATIVE_FEATURES:
7947
+ if (GetPluginType(fh->feature_id) ==
7948
+ EVMS_ASSOCIATIVE_FEATURE)
7949
+ found_node = TRUE;
7952
+ case ISOLATE_COMPATIBILITY_VOLUMES:
7953
+ if (!(node->flags & EVMS_VOLUME_FLAG))
7954
+ found_node = TRUE;
7956
+ case ISOLATE_EVMS_VOLUMES:
7957
+ if (node->flags & EVMS_VOLUME_FLAG)
7958
+ found_node = TRUE;
7960
+ /* EVMS volumes with same serial # */
7961
+ case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER:
7962
+ if (node->volume_info->volume_serial_number == compare64)
7963
+ found_node = TRUE;
7965
+ case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH:
7967
+ if (fh->object_depth == compare64)
7968
+ if (fh->feature_id == compare32)
7969
+ found_node = TRUE;
7972
+ if (found_node == TRUE) {
7973
+ rc = evms_cs_remove_logical_node_from_list(src_list, node);
7975
+ rc = evms_cs_add_logical_node_to_list(trg_list, node);
7983
+edef_apply_feature(
7984
+ evms_logical_node_t *node,
7985
+ evms_logical_node_t **volume_node_list)
7987
+ evms_registered_plugin_t * p;
7990
+ for (p = registered_plugin_head; p; p = p->next) {
7991
+ if (p->plugin->id ==
7992
+ node->feature_header->feature_id) {
7993
+ rc = DISCOVER(p, volume_node_list);
8001
+edef_get_feature_plugin_header(
8003
+ evms_plugin_header_t **header)
8006
+ evms_registered_plugin_t *p;
8008
+ for (p = registered_plugin_head; p; p = p->next) {
8009
+ if (p->plugin->id == id) {
8010
+ *header = p->plugin;
8016
+ LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n", id);
8021
+typedef struct evms_volume_build_info_s {
8023
+ int feature_header_count;
8024
+ int feature_count;
8025
+ int associative_feature_count;
8026
+ u_int64_t max_depth;
8027
+ evms_plugin_header_t *plugin;
8028
+ evms_logical_node_t *feature_node_list;
8029
+} evms_volume_build_info_t;
8032
+ * edef_evaluate_volume_node_list:
8034
+ * 1) put all nodes from feature list back on volume list
8035
+ * 2) loads the node's feature headers
8036
+ * 3) counts the node list's entries
8037
+ * 4) builds the feature node list
8038
+ * 5) counts the feature headers for associative features
8039
+ * 6) sets feature count to >1 if >1 features to be processed
8042
+edef_evaluate_volume_node_list(
8043
+ evms_logical_node_t **volume_node_list,
8044
+ evms_volume_build_info_t *vbi,
8045
+ int volume_complete)
8048
+ evms_logical_node_t *node;
8051
+ vbi->feature_count =
8052
+ vbi->associative_feature_count =
8053
+ vbi->max_depth = 0;
8054
+ vbi->plugin = NULL;
8056
+ /* put all feature nodes back on the volume list */
8057
+ rc = edef_isolate_nodes_by_type(
8058
+ ISOLATE_EVMS_VOLUMES,
8059
+ &vbi->feature_node_list,
8062
+ if (rc) return(rc);
8064
+ /* load all the feature headers */
8065
+ if (!volume_complete) {
8066
+ for(node = *volume_node_list; node; node = node->next) {
8067
+ rc = edef_load_feature_header(node);
8068
+ if (rc) return(rc);
8072
+ /* find the 1st max depth object:
8073
+ * record the depth
8074
+ * record the plugin
8076
+ for(node = *volume_node_list; node; node = node->next) {
8077
+ evms_plugin_header_t *plugin;
8078
+ evms_feature_header_t *fh = node->feature_header;
8080
+ /* count the nodes */
8081
+ vbi->node_count++;
8083
+ /* no feature header found, continue to next node */
8084
+ if (!fh) continue;
8086
+ /* check the depth */
8087
+ if (fh->object_depth > vbi->max_depth) {
8088
+ /* record new max depth */
8089
+ vbi->max_depth = fh->object_depth;
8090
+ /* find the plugin header for this feature id */
8091
+ rc = edef_get_feature_plugin_header(
8094
+ if (rc) return(rc);
8095
+ /* check for >1 plugins */
8096
+ if (vbi->plugin != plugin) {
8097
+ vbi->feature_count++;
8098
+ vbi->plugin = plugin;
8101
+ /* check for "associative" feature indicator */
8102
+ if (GetPluginType(vbi->plugin->id) ==
8103
+ EVMS_ASSOCIATIVE_FEATURE)
8104
+ vbi->associative_feature_count++;
8106
+ /* build a list of max depth nodes for this feature */
8107
+ if (vbi->max_depth) {
8108
+ rc = edef_isolate_nodes_by_type(
8109
+ ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH,
8111
+ &vbi->feature_node_list,
8114
+ if (rc) return(rc);
8117
+ if (!vbi->feature_node_list)
8124
+/* function: edef_check_feature_conditions
8126
+ * This routine verifies the state of volume based on the features
8127
+ * headers and nodes in the current discovery list. All detected
8128
+ * errors are considered fatal.
8131
+edef_check_feature_conditions(evms_volume_build_info_t *vbi)
8135
+ if (vbi->associative_feature_count) {
8136
+ if (vbi->node_count > 1) {
8137
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8138
+ LOG_ERROR("associative ERROR: > 1 nodes(%d) remaining to be processed!\n",
8140
+ } else if (vbi->max_depth != 1) {
8141
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8142
+ LOG_ERROR("associative ERROR: associative feature found at node depth(%Ld) != 1!\n",
8145
+ rc = -EVMS_ASSOCIATIVE_FEATURE;
8148
+ if (!vbi->max_depth) {
8149
+ if (vbi->node_count > 1) {
8150
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8151
+ LOG_ERROR("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n",
8154
+ } else if (vbi->max_depth == 1) {
8155
+ if (vbi->feature_count > 1) {
8156
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8157
+ LOG_ERROR("max depth 1 ERROR: > 1 features remaining to be processed!\n");
8164
+/* function: edef_apply_features
8166
+ * This routine applies none, one, or more features to an EVMS
8167
+ * volume. The system data structure is first verified and then
8168
+ * features are applied and verified recursively until the
8169
+ * entire volume has been constructed. Fatal errors result in
8170
+ * all nodes in the volume discovery list being deleted.
8173
+edef_apply_features(evms_logical_node_t **volume_node_list)
8175
+ int rc = 1, done, top_feature_applying;
8176
+ evms_volume_build_info_t vbi;
8178
+ vbi.feature_node_list = NULL;
8179
+ rc = edef_evaluate_volume_node_list(
8183
+ /* this loop should ONLY get used when
8184
+ * there are features to process.
8186
+ done = (rc) ? TRUE : FALSE;
8188
+ rc = edef_check_feature_conditions(&vbi);
8190
+ top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE;
8191
+ rc = vbi.plugin->function_table->
8192
+ discover(&vbi.feature_node_list);
8194
+ rc = edef_evaluate_volume_node_list(
8196
+ &vbi, top_feature_applying);
8197
+ if (top_feature_applying == TRUE) {
8198
+ if (vbi.node_count > 1) {
8199
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8200
+ LOG_ERROR("ERROR: detected > 1 node at volume completion!\n");
8204
+ if (!vbi.plugin) {
8205
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8206
+ LOG_ERROR("ERROR: depth(%Ld): expected another feature!\n",
8211
+ } else { /* rc != 0 */
8212
+ rc = -EVMS_VOLUME_FATAL_ERROR;
8217
+ /* put all feature nodes back on the volume list */
8218
+ if (edef_isolate_nodes_by_type(
8219
+ ISOLATE_EVMS_VOLUMES,
8220
+ &vbi.feature_node_list,
8229
+ evms_logical_node_t **node_list,
8230
+ evms_logical_node_t *node,
8236
+ rc = evms_cs_remove_logical_node_from_list(node_list, node);
8238
+ LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n",
8239
+ log_text, return_code,
8240
+ node->volume_info->volume_name,
8242
+ rc = DELETE(node);
8244
+ LOG_ERROR("error(%d) while deleting node(%s)\n",
8248
+ LOG_WARNING("%s error(%d): node gone, assumed deleted by plugin.\n",
8249
+ log_text, return_code);
8250
+ /* plugin must have cleaned up the node.
8251
+ * So just reset the return code and leave.
8260
+edef_process_evms_volumes(
8261
+ evms_logical_node_t **discover_list,
8262
+ evms_logical_node_t **associative_feature_list)
8265
+ evms_logical_node_t *node, *evms_volumes_list, *volume_node_list;
8266
+ u_int64_t volume_sn;
8268
+ /* put all EVMS volumes on their own list */
8269
+ evms_volumes_list = NULL;
8270
+ rc = edef_isolate_nodes_by_type(
8271
+ ISOLATE_EVMS_VOLUMES,
8273
+ &evms_volumes_list,
8276
+ /* apply features to each EVMS volume */
8277
+ /* one volume at a time on each pass */
8278
+ while (evms_volumes_list) {
8279
+ node = evms_volumes_list;
8280
+ /* put all nodes for one EVMS volume on separate list */
8281
+ volume_node_list = NULL;
8282
+ volume_sn = node->volume_info->volume_serial_number;
8283
+ rc = edef_isolate_nodes_by_type(
8284
+ ISOLATE_EVMS_VOLUME_SERIAL_NUMBER,
8285
+ &evms_volumes_list,
8286
+ &volume_node_list,
8289
+ /* go apply all the volume features now */
8290
+ rc = edef_apply_features(&volume_node_list);
8292
+ case 0: /* SUCCESS */
8293
+ /* remove volume just processed */
8294
+ node = volume_node_list;
8295
+ rc = evms_cs_remove_logical_node_from_list(&volume_node_list, node);
8297
+ /* put volume on global list */
8298
+ rc = evms_cs_add_logical_node_to_list(discover_list, node);
8300
+ case -EVMS_ASSOCIATIVE_FEATURE:
8301
+ /* put all "associative" features on their own list */
8302
+ rc = edef_isolate_nodes_by_type(
8303
+ ISOLATE_ASSOCIATIVE_FEATURES,
8304
+ &volume_node_list,
8305
+ associative_feature_list,
8308
+ default:/* FATAL ERROR */
8309
+ /* delete each node remaining in the list */
8310
+ if (volume_node_list) {
8311
+ LOG_ERROR("encountered fatal error building volume '%s'\n",
8312
+ volume_node_list->volume_info->volume_name);
8314
+ while(volume_node_list) {
8315
+ node = volume_node_list;
8317
+ &volume_node_list,
8330
+edef_process_associative_volumes(
8331
+ evms_logical_node_t **associative_feature_list,
8332
+ evms_logical_node_t **discover_list)
8335
+ evms_logical_node_t *node;
8337
+ while (*associative_feature_list) {
8338
+ node = *associative_feature_list;
8339
+ /* remove this node from associative feature list */
8340
+ rc = evms_cs_remove_logical_node_from_list(associative_feature_list, node);
8342
+ /* put volume on global list */
8343
+ rc = evms_cs_add_logical_node_to_list(discover_list, node);
8345
+ rc = edef_load_feature_header(node);
8347
+ rc = edef_apply_feature(node, discover_list);
8350
+ discover_list, node, rc,
8351
+ "Associative feature");
8357
+edef_check_for_incomplete_volumes(
8358
+ evms_logical_node_t **discover_list)
8361
+ evms_logical_node_t *next_node, *node;
8363
+ /* check to see if any incomplete volumes are left around */
8364
+ /* if so, delete them. */
8365
+ /* complete volumes should not have feature_headers */
8366
+ /* hanging off them, if we find any, we know the volume */
8367
+ /* is incomplete. */
8369
+ for (node = *discover_list; node; node = next_node) {
8370
+ next_node = node->next;
8372
+ if (node->feature_header) {
8374
+ discover_list, node, rc,
8375
+ "Unexpected feature header");
8382
+ * Function: evms_discover_evms_features
8383
+ * Description: Find features for nodes on the logical partitions list
8386
+evms_discover_evms_features(evms_logical_node_t **discover_list)
8388
+ evms_logical_node_t *associative_feature_list;
8391
+ LOG_EXTRA("discovering evms volume features...\n");
8393
+ /* initialize "associative" features list */
8394
+ associative_feature_list = NULL;
8396
+ /* find the bottom features */
8397
+ rc = edef_find_first_features(discover_list);
8399
+ /* process EVMS volumes here */
8400
+ rc = edef_process_evms_volumes(discover_list, &associative_feature_list);
8402
+ /* process "associative" features here */
8403
+ rc = edef_process_associative_volumes(
8404
+ &associative_feature_list, discover_list);
8406
+ /* check for incomplete volumes */
8407
+ rc = edef_check_for_incomplete_volumes(discover_list);
8413
+ * function: eelv_assign_volume_minor
8415
+ * This is a support function for evms_export_logical_volumes.
8416
+ * This routine assigns a specific minor number to a volume. It
8417
+ * also performs the remaining steps to make this volume visible
8418
+ * and usable to the kernel.
8422
+eelv_assign_volume_minor(evms_logical_node_t *node, int minor)
8424
+ evms_logical_volume_t *volume;
8427
+ /* initialize the logical_node entry in the volume array */
8428
+ volume = &evms_logical_volumes[minor];
8429
+ volume->node = node;
8430
+ rc = evms_cs_allocate_memory((void **)&volume->name,
8431
+ strlen(EVMS_GET_NODE_NAME(node)) + 1);
8433
+ strcpy(volume->name, EVMS_GET_NODE_NAME(node));
8435
+ /* copy flags from top level node into volume structure */
8436
+ volume->flags = node->flags;
8438
+ /* check for read-only volume */
8439
+ if ( volume->flags & EVMS_VOLUME_READ_ONLY ) {
8440
+ set_device_ro(MKDEV(EVMS_MAJOR, minor),1);
8443
+ /* initialize the global device arrays */
8444
+ blksize_size[EVMS_MAJOR][minor] = node->block_size;
8445
+ hardsect_size[EVMS_MAJOR][minor] = node->hardsector_size;
8446
+ blk_size[EVMS_MAJOR][minor] = (int)(node->total_vsectors >> 1);
8448
+ /* register this volume with devfs */
8449
+ volume->devfs_handle =
8450
+ devfs_register(evms_dir_devfs_handle,
8453
+ EVMS_MAJOR, minor,
8454
+ S_IFBLK | S_IRUGO | S_IWUGO,
8455
+ &evms_fops, NULL);
8459
+ LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n",
8460
+ EVMS_MAJOR, minor,
8461
+ EVMS_DEV_NODE_PATH, volume->name);
8465
+ * function: eelv_check_for_duplicity
8467
+ * This is a support function for evms_export_logical_volumes.
8468
+ * This routine compares the serial number in the top most node
8469
+ * in the volume to the list of currently exported volumes. If
8470
+ * this volume's serial number is found in the list then we know
8471
+ * this volume is a duplicate and it is then deleted.
8475
+eelv_check_for_duplicity(evms_logical_node_t **discover_list)
8477
+ evms_logical_node_t *next_node, *node;
8478
+ evms_logical_volume_t *lv;
8481
+ for (node = *discover_list; node; node = next_node) {
8482
+ next_node = node->next;
8485
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8486
+ lv = &evms_logical_volumes[i];
8487
+ /* only check exported volumes */
8489
+ char *type_ptr = NULL;
8491
+ /* check for duplicate pointer */
8492
+ if (node == lv->node) {
8494
+ type_ptr = "pointer";
8495
+ /* check for duplicate node */
8496
+ } else if (!strcmp(node->name,
8497
+ lv->node->name)) {
8499
+ type_ptr = "node";
8501
+ if (is_dup == TRUE) {
8502
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8503
+ LOG_DEFAULT("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n",
8506
+ EVMS_GET_NODE_NAME(node));
8507
+ /* forget duplicate */
8516
+ * function: eelv_reassign_soft_deleted_volume_minors
8518
+ * This is a support function for evms_export_logical_volumes.
8519
+ * This routine reassigns minor numbers to rediscovered "soft"
8520
+ * deleted volumes.
8524
+eelv_reassign_soft_deleted_volume_minors(evms_logical_node_t **discover_list)
8526
+ evms_logical_node_t *next_node, *node;
8527
+ evms_logical_volume_t *lv;
8528
+ int i, node_removed;
8530
+ for (node = *discover_list; node; node = next_node) {
8531
+ next_node = node->next;
8533
+ node_removed = FALSE;
8534
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8535
+ lv = &evms_logical_volumes[i];
8536
+ /* only check soft deleted volumes:
8537
+ * they have a non-NULL name.
8539
+ if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
8540
+ if (!strcmp(EVMS_GET_NODE_NAME(node),lv->name)) {
8541
+ /* reassign requested minor */
8542
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8543
+ node_removed = TRUE;
8544
+ LOG_DEFAULT("Re");
8545
+ /* free the previously used name */
8546
+ evms_cs_deallocate_memory(lv->name);
8548
+ /* clear the EVMS_VOLUME_SOFT_DELETED flag */
8550
+ eelv_assign_volume_minor(node, i);
8559
+ * function: eelv_assign_evms_volume_minors
8561
+ * This is a support function for evms_export_logical_volumes.
8562
+ * This routine assigns minor numbers to new evms volumes. If
8563
+ * the specified minor is already in use, the requested minor
8564
+ * is set to 0, and will be assigned next available along with
8565
+ * any remaining volumes at the end of evms_export_logical_volumes.
8569
+eelv_assign_evms_volume_minors(evms_logical_node_t **discover_list)
8571
+ evms_logical_node_t *next_node, *node, *lv_node;
8572
+ unsigned int requested_minor, node_removed;
8574
+ for (node = *discover_list; node; node = next_node) {
8575
+ next_node = node->next;
8577
+ node_removed = FALSE;
8578
+ /* only process evms volumes */
8579
+ if (node->flags & EVMS_VOLUME_FLAG) {
8580
+ requested_minor = node->volume_info->volume_system_id;
8581
+ /* is there a requested minor? */
8582
+ if (requested_minor) {
8585
+ /* check range of requested minor */
8586
+ if (requested_minor >= MAX_EVMS_VOLUMES)
8589
+ evms_logical_volume_t *lv;
8590
+ lv = &evms_logical_volumes[requested_minor];
8591
+ lv_node = lv->node;
8592
+ lv_flags = lv->flags;
8594
+ if ( (!lv_node) && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED)) ) {
8595
+ /* assign requested minor */
8596
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8597
+ node_removed = TRUE;
8598
+ eelv_assign_volume_minor(node, requested_minor);
8600
+ LOG_WARNING("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n",
8601
+ node->volume_info->volume_name,
8604
+ * requested minor is already
8605
+ * in use, defer assignment
8608
+ node->volume_info->volume_system_id = 0;
8616
+ * function: eelv_assign_remaining_evms_volume_minors
8618
+ * This is a support function for evms_export_logical_volumes.
8619
+ * This routine assigns minor numbers to new evms volumes that
8620
+ * have no/conflicting minor assignments. This function will
8621
+ * search from high(255) minor values down, for the first available
8622
+ * minor. Searching high to low minimizes the possibility of
8623
+ * conflicting evms volumes causing "compatibility" minor
8624
+ * assignments to shift from expected assignments.
8628
+eelv_assign_remaining_evms_volume_minors(
8629
+ evms_logical_node_t **discover_list)
8631
+ evms_logical_node_t *next_node, *node;
8632
+ int requested_minor, node_removed;
8634
+ for (node = *discover_list; node; node = next_node) {
8635
+ next_node = node->next;
8637
+ node_removed = FALSE;
8638
+ /* only process evms volumes */
8639
+ /* all remaining evms volumes should now
8640
+ * have a minor value of 0, meaning they
8641
+ * had no minor assignment, or their minor
8642
+ * assignment conflicted with an existing
8643
+ * minor assignment.
8645
+ if (node->flags & EVMS_VOLUME_FLAG) {
8646
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8647
+ node_removed = TRUE;
8648
+ /* find next available minor number */
8649
+ for (requested_minor = 255;
8650
+ (evms_logical_volumes[requested_minor].node ||
8651
+ evms_logical_volumes[requested_minor].name) &&
8653
+ requested_minor--);
8654
+ /* check range of assigned minor */
8655
+ if (!requested_minor) {
8656
+ LOG_CRITICAL("no more minor numbers available for evms volumes!!!!\n");
8659
+ /* assign requested minor */
8660
+ eelv_assign_volume_minor(node, requested_minor);
8666
+ * function: eelv_assign_remaining_volume_minors
8668
+ * This is a support function for evms_export_logical_volumes.
8669
+ * This routine assigns minor numbers to all remaining unassigned
8670
+ * volumes. Minor numbers are assigned on an availability
8671
+ * basis. The first free minor number is used in the assignment.
8675
+eelv_assign_remaining_volume_minors(
8676
+ evms_logical_node_t **discover_list)
8678
+ evms_logical_node_t *node;
8681
+ while(*discover_list) {
8682
+ node = *discover_list;
8683
+ evms_cs_remove_logical_node_from_list(discover_list, node);
8685
+ /* find next available minor number */
8687
+ (evms_logical_volumes[minor].node ||
8688
+ evms_logical_volumes[minor].name) &&
8689
+ minor < MAX_EVMS_VOLUMES;
8692
+ if (minor >= MAX_EVMS_VOLUMES) {
8693
+ LOG_CRITICAL("no more minor numbers available for compatibility volumes!!!!\n");
8696
+ /* assign minor */
8697
+ eelv_assign_volume_minor(node, minor);
8702
+ * function: eelv_check_for_unreassign_soft_deleted_volume
8704
+ * This is a support function for evms_export_logical_volumes.
8705
+ * This routine reports any "soft deleted" volumes that were not
8706
+ * found after a rediscovery.
8709
+eelv_check_for_unreassign_soft_deleted_volume(void)
8711
+ evms_logical_volume_t *lv;
8714
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8715
+ lv = &evms_logical_volumes[i];
8716
+ /* only check soft deleted volumes:
8717
+ * they have a NULL node ptr &
8718
+ * they have a non-NULL name.
8720
+ if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
8721
+ if (get_super(MKDEV(EVMS_MAJOR, i)))
8722
+ lv->flags |= EVMS_VOLUME_CORRUPT;
8723
+ LOG_ERROR("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n",
8724
+ ((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"),
8727
+ if (lv->flags & EVMS_VOLUME_CORRUPT) {
8728
+ LOG_ERROR(" flagging volume(%u,%u,%s) as CORRUPT!\n",
8732
+ LOG_ERROR(" releasing minor(%d) used by volume(%s)!\n",
8734
+ /* clear logical volume structure
8735
+ * for this volume so it may be
8738
+ evms_cs_deallocate_memory(lv->name);
8747
+eelv_unquiesce_volumes(void)
8751
+ /* check each volume array entry */
8752
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8753
+ evms_logical_volume_t *volume;
8755
+ volume = &evms_logical_volumes[i];
8756
+ /* is this volume "quiesced" ? */
8757
+ if (volume->quiesced) {
8759
+ if (volume->node) {
8760
+ /* "unquiesce" it */
8761
+ struct inode inode;
8762
+ evms_quiesce_volume_t qv;
8764
+ qv.command = qv.status = 0;
8767
+ rc = evms_quiesce_volume(volume, &inode, NULL, &qv);
8769
+ /* Wake up any waiters */
8771
+ /* clear the flag */
8772
+ volume->quiesced = 0;
8773
+ /* wake up the waiters */
8774
+ if (waitqueue_active(&volume->wait_queue))
8775
+ wake_up(&volume->wait_queue);
8776
+#ifdef VFS_PATCH_PRESENT
8777
+ /* unquiesce VFS if quiesced */
8778
+ if (volume->vfs_quiesced) {
8779
+ /* VFS function call to unlock the filesystem */
8780
+ unlockfs(MKDEV(EVMS_MAJOR, i));
8781
+ volume->vfs_quiesced = FALSE;
8790
+ * Function: evms_export_logical_volumes
8792
+ * This function is called from evms_discover_volumes. It
8793
+ * checks for duplicate volumes, assigns minor values to evms
8794
+ * volumes, and assigns minor values to the remaining volumes.
8795
+ * In addition to assigning minor values to each volume this
8796
+ * function also completes the final steps necessary to allow
8797
+ * the volumes to be used by the operating system.
8800
+evms_export_logical_volumes(evms_logical_node_t **discover_list)
8802
+ LOG_EXTRA("exporting EVMS logical volumes...\n");
8804
+ eelv_check_for_duplicity(discover_list);
8806
+ eelv_reassign_soft_deleted_volume_minors(discover_list);
8808
+ eelv_assign_evms_volume_minors(discover_list);
8810
+ eelv_assign_remaining_evms_volume_minors(discover_list);
8812
+ eelv_assign_remaining_volume_minors(discover_list);
8814
+ eelv_check_for_unreassign_soft_deleted_volume();
8816
+ /* "unquiesce" any "quiesced" volumes */
8817
+ eelv_unquiesce_volumes();
8821
+edv_populate_discover_list(
8822
+ evms_list_node_t *src_list,
8823
+ evms_logical_node_t **trg_list,
8824
+ evms_rediscover_t *discover_parms)
8826
+ int rc = 0, i, move_node, use_all_disks = FALSE;
8827
+ evms_list_node_t *src_node;
8830
+ /* if no discover parameters are specified */
8831
+ /* copy ALL the disk nodes into the */
8832
+ /* discovery list. */
8833
+ if ((discover_parms == NULL) ||
8834
+ (discover_parms->drive_count == REDISCOVER_ALL_DEVICES))
8835
+ use_all_disks = TRUE;
8837
+ /* copy the disk nodes specified in the */
8838
+ /* discover_parms over to a discover list */
8839
+ src_node = src_list;
8841
+ move_node = use_all_disks;
8842
+ if (move_node == FALSE)
8843
+ /* check the rediscovery array */
8844
+ for (i = 0; i < discover_parms->drive_count; i++)
8845
+ if (discover_parms->drive_array[i] == ((unsigned long)src_node->item ^ EVMS_HANDLE_KEY)) {
8849
+ /* check to see if we want this node */
8850
+ if (move_node == TRUE)
8851
+ evms_cs_add_logical_node_to_list(
8853
+ (evms_logical_node_t *)src_node->item);
8854
+ /* advance to next evms_list_node_t */
8855
+ src_node = src_node->next;
8861
+evms_discover_volumes(evms_rediscover_t *discover_parms)
8864
+ evms_logical_node_t *discover_list = NULL;
8866
+ evms_discover_logical_disks(&discover_list);
8867
+ if (evms_global_device_list) {
8868
+ /* move the appropriate disk nodes, based on */
8869
+ /* on the discover parameters, onto the */
8870
+ /* discover list for the partition managers */
8872
+ edv_populate_discover_list(
8873
+ evms_global_device_list,
8874
+ &discover_list, discover_parms);
8876
+ if (discover_list) {
8877
+ evms_discover_logical_partitions(&discover_list);
8879
+ if (discover_list) {
8880
+ evms_discover_volume_groups(&discover_list);
8882
+ if (discover_list) {
8883
+ evms_discover_evms_features(&discover_list);
8885
+ if (discover_list) {
8886
+ evms_export_logical_volumes(&discover_list);
8887
+ evms_cs_signal_event(EVMS_EVENT_END_OF_DISCOVERY);
8893
+ * Function: find_root_fs_dev
8894
+ * If "root=/dev/evms/???" was specified on the kernel command line, and devfs
8895
+ * is not enabled, we need to determine the appropriate minor number for the
8896
+ * specified volume for the root fs.
8898
+static void find_root_fs_dev(void)
8903
+ if ( ! strncmp(root_device_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME)+1) ) {
8904
+ name = &root_device_name[strlen(EVMS_DIR_NAME)+1];
8906
+ for ( i = 1; i <= MAX_EVMS_VOLUMES; i++ ) {
8907
+ if ( evms_logical_volumes[i].name &&
8908
+ ! strncmp(name, evms_logical_volumes[i].name, strlen(evms_logical_volumes[i].name)) ) {
8909
+ ROOT_DEV = MKDEV(EVMS_MAJOR,i);
8917
+ * Function: io_notify_cache_ctor
8918
+ * this function zero-initializes the io_notify_t entries
8919
+ * in our private io_notify pool.
8922
+io_notify_cache_ctor(
8924
+ kmem_cache_t * cachep,
8925
+ unsigned long flags)
8927
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
8928
+ SLAB_CTOR_CONSTRUCTOR)
8930
+ io_notify_t *io_notify = (io_notify_t *)foo;
8931
+ memset(io_notify, 0, sizeof(*io_notify));
8936
+ * Function: bh_cache_ctor
8937
+ * this function initializes the b_wait field in the buffer heads
8938
+ * in our private buffer head pool.
8943
+ kmem_cache_t * cachep,
8944
+ unsigned long flags)
8946
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
8947
+ SLAB_CTOR_CONSTRUCTOR)
8949
+ struct buffer_head *bh = (struct buffer_head *)foo;
8950
+ memset(bh, 0, sizeof(*bh));
8951
+ init_waitqueue_head(&bh->b_wait);
8956
+ * Function: evms_init_module
8957
+ * This function runs once at system initialization.
8960
+evms_init_module (void)
8963
+ int *evms_blocksizes;
8965
+ LOG_DEFAULT("EVMS v%d.%d.%d initializing .... info level(%d).\n",
8966
+ EVMS_MAJOR_VERSION,
8967
+ EVMS_MINOR_VERSION,
8968
+ EVMS_PATCHLEVEL_VERSION,
8971
+ /* initialize memory management counters */
8972
+ atomic_set(&evms_allocs,0);
8973
+ atomic_set(&evms_logical_nodes,0);
8975
+ /* initialize the io_notify_entry pool */
8977
+ evms_io_notify_pool = evms_cs_create_pool(
8978
+ sizeof(io_notify_t),
8980
+ io_notify_cache_ctor,
8983
+ /* initialize the "public" buffer_head pool */
8985
+ evms_bh_pool = evms_cs_create_pool(
8986
+ sizeof(struct buffer_head),
8991
+ /* allocate the logical volume array */
8993
+ rc = evms_cs_allocate_memory(
8994
+ (void **)&evms_logical_volumes,
8995
+ sizeof(evms_logical_volume_t) * MAX_EVMS_VOLUMES);
8997
+ /* initialize the logical volume array entries */
8999
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
9000
+ evms_logical_volume_t *volume;
9002
+ volume = &evms_logical_volumes[i];
9003
+ init_waitqueue_head(&volume->wait_queue);
9005
+ blk_init_queue(&volume->request_queue,
9006
+ evms_do_request_fn);
9007
+ blk_queue_make_request(&volume->request_queue,
9008
+ evms_make_request_fn);
9012
+ /* allocate EVMS' blk_size array */
9014
+ rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9016
+ LOG_CRITICAL("can't allocate memory for EVMS blk_size\n");
9017
+ } else blk_size[EVMS_MAJOR] = evms_blocksizes;
9020
+ /* allocate EVMS' blksize_size array */
9022
+ rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9024
+ LOG_CRITICAL("can't allocate memory for EVMS blksize_size\n");
9025
+ } else blksize_size[EVMS_MAJOR] = evms_blocksizes;
9028
+ /* allocate EVMS' hardsect_size array */
9030
+ rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9032
+ LOG_CRITICAL("can't allocate memory for EVMS hardsect_size\n");
9033
+ } else hardsect_size[EVMS_MAJOR] = evms_blocksizes;
9036
+ /* Register the block device */
9038
+ rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME, &evms_fops);
9040
+ LOG_CRITICAL("error calling devfs_register_blkdev() err=%u\n", rc);
9045
+ /* Register with devfs */
9047
+ evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL);
9048
+ // A NULL return cannot be fatal.
9049
+ // Devfs just might not be running
9050
+ if ( ! evms_dir_devfs_handle ) {
9051
+ LOG_EXTRA("NULL return from devfs_mk_dir() for \"%s\"\n", EVMS_DIR_NAME);
9052
+ LOG_EXTRA("Is devfs enabled?\n");
9055
+ evms_blk_devfs_handle = devfs_register(evms_dir_devfs_handle,
9059
+ S_IFBLK | S_IRUGO | S_IWUGO,
9060
+ &evms_fops, NULL);
9061
+ if ( ! evms_blk_devfs_handle ) {
9062
+ LOG_DETAILS("NULL return from devfs_register() for \"%s\"\n", EVMS_DEV_NAME);
9068
+ read_ahead[EVMS_MAJOR] = 4096;
9070
+ blk_dev[EVMS_MAJOR].queue = evms_find_queue;
9072
+ blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_do_request_fn);
9073
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_make_request_fn);
9075
+#ifdef CONFIG_PROC_FS
9076
+ evms_cs_get_evms_proc_dir();
9077
+ if (evms_proc_dir) {
9078
+ create_proc_read_entry("info", 0, evms_proc_dir, evms_info_read_proc, NULL);
9079
+ create_proc_read_entry("plugins", 0, evms_proc_dir, evms_plugins_read_proc, NULL);
9080
+ create_proc_read_entry("volumes", 0, evms_proc_dir, evms_volumes_read_proc, NULL);
9082
+ evms_table_header = register_sysctl_table(dev_dir_table, 1);
9090
+ * Function: evms_exit_module
9091
+ * This function runs once at module unload.
9094
+evms_exit_module (void)
9098
+ LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n",
9099
+ EVMS_MAJOR_VERSION,
9100
+ EVMS_MINOR_VERSION,
9101
+ EVMS_PATCHLEVEL_VERSION);
9103
+ /* ensure no EVMS volumes exist
9105
+ for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
9106
+ if (evms_logical_volumes[i].node) {
9107
+ LOG_ERROR("volume(%d,%d,%s) still exists.\n",
9109
+ evms_logical_volumes[i].name);
9114
+ LOG_ERROR("unable to unload until no volumes exist!\n");
9117
+ /* ensure no plugins are loaded.
9119
+ evms_registered_plugin_t *p;
9120
+ int found = FALSE;
9122
+ for (p = registered_plugin_head; p; p = p->next) {
9124
+ LOG_ERROR("plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d still loaded.\n",
9125
+ GetPluginOEM(p->plugin->id),
9126
+ GetPluginType(p->plugin->id),
9127
+ GetPluginID(p->plugin->id),
9128
+ p->plugin->version.major,
9129
+ p->plugin->version.minor,
9130
+ p->plugin->version.patchlevel);
9133
+ LOG_ERROR("unable to unload while plugins still loaded!\n");
9137
+ /* unregister with devfs
9139
+ devfs_unregister(evms_dir_devfs_handle);
9140
+ /* clean up the queue for the block device
9142
+ blk_cleanup_queue(blk_get_queue(MKDEV(EVMS_MAJOR,0)));
9143
+ /* unregister block device
9145
+ rc = devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME);
9148
+ /* deallocate device arrays
9150
+ evms_cs_deallocate_memory(blk_size[EVMS_MAJOR]);
9151
+ blk_size[EVMS_MAJOR] = NULL;
9152
+ evms_cs_deallocate_memory(blksize_size[EVMS_MAJOR]);
9153
+ blksize_size[EVMS_MAJOR] = NULL;
9154
+ evms_cs_deallocate_memory(hardsect_size[EVMS_MAJOR]);
9155
+ hardsect_size[EVMS_MAJOR] = NULL;
9156
+ read_ahead[EVMS_MAJOR] = 0;
9157
+ /* deallocate logical volumes array
9159
+ evms_cs_deallocate_memory(evms_logical_volumes);
9160
+ /* destroy buffer head pool
9162
+ evms_cs_destroy_pool(evms_bh_pool);
9163
+ /* destroy io notify pool
9165
+ evms_cs_destroy_pool(evms_io_notify_pool);
9166
+#ifdef CONFIG_PROC_FS
9167
+ if (evms_proc_dir) {
9168
+ remove_proc_entry("volumes", evms_proc_dir);
9169
+ remove_proc_entry("plugins", evms_proc_dir);
9170
+ remove_proc_entry("info", evms_proc_dir);
9171
+ remove_proc_entry("evms", NULL);
9173
+ unregister_sysctl_table(evms_table_header);
9179
+ * Function: evms_init_discover
9180
+ * If EVMS is statically built into the kernel, this function will be called
9181
+ * to perform an initial volume discovery.
9184
+evms_init_discover (void)
9186
+ /* go find volumes */
9187
+ evms_discover_volumes(NULL);
9189
+ /* Check if the root fs is on EVMS */
9190
+ if ( MAJOR(ROOT_DEV) == EVMS_MAJOR ) {
9191
+ find_root_fs_dev();
9199
+ * a placeholder for cluster enablement
9202
+evms_cluster_init(int nodeid, int clusterid)
9207
+EXPORT_SYMBOL(evms_cluster_init);
9210
+ * a placeholder for cluster enablement
9213
+evms_cluster_shutdown(void)
9218
+EXPORT_SYMBOL(evms_cluster_shutdown);
9221
+evms_boot_info_level(char *str)
9223
+ int evms_boot_info_level = (int) simple_strtoul(str, NULL, 10);
9224
+ if (evms_boot_info_level) {
9225
+ evms_info_level = evms_boot_info_level;
9230
+__setup("evms_info_level=", evms_boot_info_level);
9231
+module_init(evms_init_module);
9232
+module_exit(evms_exit_module);
9233
+__initcall(evms_init_discover);
9234
+#ifdef MODULE_LICENSE
9235
+MODULE_LICENSE("GPL");
9238
+/**********************************************************/
9239
+/* END -- INIT/DISCOVERY support functions */
9240
+/**********************************************************/
9241
diff -Naur linux-2002-03-28/drivers/evms/evms_bbr.c evms-2002-03-28/drivers/evms/evms_bbr.c
9242
--- linux-2002-03-28/drivers/evms/evms_bbr.c Wed Dec 31 18:00:00 1969
9243
+++ evms-2002-03-28/drivers/evms/evms_bbr.c Wed Mar 27 19:01:30 2002
9245
+/* -*- linux-c -*- */
9248
+ * Copyright (c) International Business Machines Corp., 2000
9250
+ * This program is free software; you can redistribute it and/or modify
9251
+ * it under the terms of the GNU General Public License as published by
9252
+ * the Free Software Foundation; either version 2 of the License, or
9253
+ * (at your option) any later version.
9255
+ * This program is distributed in the hope that it will be useful,
9256
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9257
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
9258
+ * the GNU General Public License for more details.
9260
+ * You should have received a copy of the GNU General Public License
9261
+ * along with this program; if not, write to the Free Software
9262
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
9265
+/* linux/driver/evms/evms_bbr.c
9267
+ * EVMS - Bad Block Relocation (BBR) Feature Plugin
9269
+ * BBR feature is designed to remap I/O write failures to another safe location on disk.
9270
+ * Note that most disk drives have BBR built into them, this means that our software BBR
9271
+ * will be only activated when all hardware BBR replacement sectors have been used.
9275
+/* #define EVMS_BBR_DEBUG 1 */
9277
+#include <linux/evms/evms_bbr_k.h>
9279
+#define LOG_PREFIX "bbr: "
9281
+static bbr_instance_data_t *bbr_instances = NULL;
9283
+static struct notifier_block bbr_notifier = {
9284
+ notifier_call: bbr_notify_reboot,
9286
+ priority: INT_MAX, /* before any real devices */
9289
+// Data pertaining to the I/O thread.
9290
+static evms_thread_t * bbr_io_thread = NULL;
9291
+static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
9292
+static bbr_bh_t * bbr_io_list = NULL, **bbr_io_list_tail;
9294
+/* plugin function table definition */
9295
+static evms_plugin_function_table_t function_table = {
9296
+ discover : bbr_discover,
9297
+ delete : bbr_delete,
9299
+ write : bbr_write,
9300
+ init_io : bbr_init_io,
9301
+ ioctl : bbr_ioctl,
9302
+ direct_ioctl : bbr_direct_ioctl
9305
+static evms_plugin_header_t plugin_header = {
9309
+ EVMS_BBR_FEATURE_ID),
9310
+ version : { 1,0,0 },
9311
+ required_common_services_version : {
9312
+ EVMS_BBR_COMMON_SERVICES_MAJOR,
9313
+ EVMS_BBR_COMMON_SERVICES_MINOR,
9314
+ EVMS_BBR_COMMON_SERVICES_PATCHLEVEL
9316
+ function_table : &function_table
9321
+ * Function: le_meta_data_to_cpu
9322
+ * convert bbr meta data from on-disk (LE) format to the native cpu endian format.
9324
+void le_meta_data_to_cpu(evms_bbr_metadata_t *md)
9326
+ md->signature = le32_to_cpu(md->signature);
9327
+ md->crc = le32_to_cpu(md->crc);
9328
+ md->block_size = le32_to_cpu(md->block_size);
9329
+ md->flags = le32_to_cpu(md->flags);
9330
+ md->sequence_number = le64_to_cpu(md->sequence_number);
9331
+ md->start_sect_bbr_table = le64_to_cpu(md->start_sect_bbr_table);
9332
+ md->nr_sects_bbr_table = le64_to_cpu(md->nr_sects_bbr_table);
9333
+ md->start_replacement_sect = le64_to_cpu(md->start_replacement_sect);
9334
+ md->nr_replacement_blks = le64_to_cpu(md->nr_replacement_blks);
9338
+ * Function: le_bbr_table_sector_to_cpu
9339
+ * convert bbr meta data from on-disk (LE) format to the native cpu endian format.
9341
+void le_bbr_table_sector_to_cpu(evms_bbr_table_t *p)
9344
+ p->signature = le32_to_cpu(p->signature);
9345
+ p->crc = le32_to_cpu(p->crc);
9346
+ p->sequence_number = le32_to_cpu(p->sequence_number);
9347
+ p->in_use_cnt = le32_to_cpu(p->in_use_cnt);
9348
+ for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9349
+ p->entries[i].bad_sect = le64_to_cpu(p->entries[i].bad_sect);
9350
+ p->entries[i].replacement_sect = le64_to_cpu(p->entries[i].replacement_sect);
9355
+ * Function: cpu_bbr_table_sector_to_le
9356
+ * convert bbr meta data from cpu endian format to on-disk (LE) format
9358
+void cpu_bbr_table_sector_to_le(evms_bbr_table_t *p, evms_bbr_table_t *le)
9361
+ le->signature = cpu_to_le32(p->signature);
9362
+ le->crc = cpu_to_le32(p->crc);
9363
+ le->sequence_number = cpu_to_le32(p->sequence_number);
9364
+ le->in_use_cnt = cpu_to_le32(p->in_use_cnt);
9365
+ for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9366
+ le->entries[i].bad_sect = cpu_to_le64(p->entries[i].bad_sect);
9367
+ le->entries[i].replacement_sect = cpu_to_le64(p->entries[i].replacement_sect);
9373
+static int validate_bbr_table_sector(evms_bbr_table_t *p)
9376
+ int org_crc, final_crc;
9378
+ if (le32_to_cpu(p->signature) != EVMS_BBR_TABLE_SIGNATURE) {
9379
+ LOG_ERROR("BBR_TABLE_SIGNATURE don't match! sector has (0x%08X) expected(0x%08X)\n",
9380
+ le32_to_cpu(p->signature), EVMS_BBR_TABLE_SIGNATURE);
9384
+ org_crc = le32_to_cpu(p->crc);
9386
+ final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p, sizeof(*p));
9387
+ if (final_crc != org_crc) {
9388
+ LOG_ERROR("CRC failed! sector has (0x%08X) calculated(0x%08X)\n",
9389
+ org_crc, final_crc);
9392
+ p->crc = cpu_to_le32(org_crc);
9394
+ LOG_ERROR("bbr table sector has no crc\n");
9399
+ BBR_DEBUG_PRINT_TABLE_SECTOR(p);
9400
+ le_bbr_table_sector_to_cpu(p);
9404
+void update_invalid_bbr_table_sector(
9405
+ evms_logical_node_t *node,
9406
+ evms_bbr_table_t *valid,
9407
+ evms_bbr_table_t *invalid,
9408
+ evms_sector_t LSN)
9411
+ evms_bbr_table_t *tmp_bbr_table;
9413
+ /* Correct the invalid bbr table sector */
9414
+ memcpy(invalid, valid, sizeof(evms_bbr_table_t));
9416
+ /* Allocate memory for I/O */
9417
+ rc = evms_cs_allocate_memory((void**)&tmp_bbr_table,sizeof(evms_bbr_table_t));
9419
+ cpu_bbr_table_sector_to_le(valid, tmp_bbr_table);
9420
+ LOG_WARNING("%s: updating LSN=%Lu\n", __FUNCTION__, LSN);
9421
+ rc = INIT_IO(node, 1, LSN, 1, tmp_bbr_table);
9423
+ LOG_ERROR("Could not update bbr table sector, INIT_IO(rc=%d)\n", rc);
9425
+ evms_cs_deallocate_memory(tmp_bbr_table);
9429
+static u_int32_t validate_bbr_table(
9430
+ evms_bbr_metadata_t *md,
9431
+ evms_bbr_table_t *p)
9433
+ u_int32_t i, nr_sects;
9435
+ nr_sects = md->nr_sects_bbr_table;
9437
+ for (i=0; i<nr_sects; i++, p++) {
9438
+ if (validate_bbr_table_sector(p))
9442
+ if (i != nr_sects) {
9443
+ LOG_SERIOUS("stop validation at sector[%d]\n",i);
9446
+ LOG_DEBUG("processed %d bbr table sectors\n", nr_sects);
9451
+static u_int32_t validate_bbr_tables(
9452
+ evms_logical_node_t *node,
9453
+ evms_bbr_metadata_t *MD1,
9454
+ evms_bbr_metadata_t *MD2,
9455
+ evms_bbr_table_t *p1,
9456
+ evms_bbr_table_t *p2)
9458
+ u_int32_t i, rc1, rc2, nr_sects;
9460
+ nr_sects = MD1->nr_sects_bbr_table;
9461
+ if (nr_sects != MD2->nr_sects_bbr_table) {
9462
+ nr_sects = (MD1->nr_sects_bbr_table < MD2->nr_sects_bbr_table) ?
9463
+ MD1->nr_sects_bbr_table : MD2->nr_sects_bbr_table;
9464
+ LOG_SERIOUS("number of bbr table sectors don't match, use %d",nr_sects);
9467
+ for (i=0; i<nr_sects; i++, p1++, p2++) {
9469
+ if ((rc1 = validate_bbr_table_sector(p1)))
9470
+ LOG_WARNING("%s: MD1 has invalid bbr table sector at (LSN=%Lu)\n",
9471
+ __FUNCTION__, MD1->start_sect_bbr_table + i);
9473
+ if ((rc2 = validate_bbr_table_sector(p2)))
9474
+ LOG_WARNING("%s: MD2 has invalid bbr table sector at (LSN=%Lu)\n",
9475
+ __FUNCTION__, MD2->start_sect_bbr_table + i);
9477
+ /* cannot continue */
9482
+ update_invalid_bbr_table_sector(node, p2, p1,
9483
+ MD1->start_sect_bbr_table + i);
9485
+ update_invalid_bbr_table_sector(node, p1, p2,
9486
+ MD2->start_sect_bbr_table + i);
9488
+ /* skip sequence number check, advance to next bbr table sector */
9493
+ if (p1->sequence_number != p2->sequence_number) {
9494
+ LOG_WARNING("at bbr table sector idx[%d] MD1 sequence_nr=%u <> MD2 sequence_nr_2=%u\n",
9495
+ i, p1->sequence_number, p2->sequence_number);
9496
+ if (p1->sequence_number < p2->sequence_number)
9497
+ update_invalid_bbr_table_sector(node, p2, p1,
9498
+ MD1->start_sect_bbr_table + i);
9500
+ update_invalid_bbr_table_sector(node, p1, p2,
9501
+ MD2->start_sect_bbr_table + i);
9504
+ if (i != nr_sects) {
9505
+ LOG_SERIOUS("stop validation at sector[%d]\n",i);
9508
+ LOG_DEBUG("%s processed %d bbr table sectors\n", __FUNCTION__, nr_sects);
9512
+#ifdef EVMS_BBR_DEBUG
9513
+static void print_meta_data(evms_bbr_metadata_t *md)
9515
+ LOG_DEBUG("META DATA SECTOR\n sig(0x%08X) crc(0x%08X) block_size=%d\n"
9516
+ " start_sect_bbr_table=%Lu, nr_sects_bbr_table=%Lu\n"
9517
+ " start_replacement_sect=%Lu, nr_replacement_blks=%Lu\n",
9521
+ md->start_sect_bbr_table,
9522
+ md->nr_sects_bbr_table,
9523
+ md->start_replacement_sect,
9524
+ md->nr_replacement_blks);
9527
+static void print_bbr_table_sector(evms_bbr_table_t *p)
9530
+ LOG_DEBUG("BBR TABLE SECTOR\n sig(0x%08X) crc(0x%08X) sequence=%d, in_use_cnt=%d\n ENTRIES:",
9531
+ p->signature, p->crc, p->sequence_number, p->in_use_cnt);
9532
+ for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9533
+ LOG_DEBUG(" [%d] bad_sect=%Lu, replacement_sect=%Lu\n",
9534
+ i, p->entries[i].bad_sect, p->entries[i].replacement_sect);
9540
+static int validate_meta_data(evms_bbr_metadata_t *md)
9542
+ int org_crc, final_crc;
9544
+ BBR_DEBUG_PRINT_META_DATA(md);
9546
+ if (le32_to_cpu(md->signature) != EVMS_BBR_SIGNATURE) {
9547
+ LOG_SERIOUS("EVMS_BBR_SIGNATURE don't match, got(0x%08X), expected(0x%08X)\n",
9548
+ le32_to_cpu(md->signature), EVMS_BBR_SIGNATURE);
9553
+ org_crc = le32_to_cpu(md->crc);
9555
+ final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md, sizeof(*md));
9556
+ if (final_crc != org_crc) {
9557
+ LOG_SERIOUS("metadata has crc(0x%08X), calculated(0x%08X)\n",
9558
+ org_crc, final_crc);
9561
+ md->crc = cpu_to_le32(org_crc);
9563
+ LOG_WARNING("metadata has no crc!!!\n");
9566
+ le_meta_data_to_cpu(md);
9571
+ * Function: bbr_load_meta_data
9572
+ * Load and validate bbr meta data
9574
+static int load_meta_data(
9575
+ evms_logical_node_t *node,
9576
+ evms_sector_t LSN,
9577
+ evms_bbr_metadata_t **md,
9578
+ evms_bbr_table_t **bbr_table)
9583
+ *bbr_table = NULL;
9587
+ LOG_WARNING("No meta data\n");
9591
+ rc = evms_cs_allocate_memory((void **)md, sizeof(evms_bbr_metadata_t));
9593
+ int metadata_hdr_size;
9594
+ metadata_hdr_size = evms_cs_size_in_vsectors(sizeof(evms_bbr_metadata_t));
9595
+ rc = INIT_IO(node, 0, LSN, metadata_hdr_size, *md);
9597
+ rc = validate_meta_data(*md);
9599
+ rc = evms_cs_allocate_memory((void**)bbr_table,
9600
+ (*md)->nr_sects_bbr_table * EVMS_VSECTOR_SIZE);
9602
+ /* load BBR table but do not validate here */
9603
+ rc = INIT_IO(node, 0,
9604
+ (*md)->start_sect_bbr_table,
9605
+ (*md)->nr_sects_bbr_table,
9613
+ LOG_ERROR("%s failed rc=%d. Free allocated memory!\n",__FUNCTION__,rc);
9615
+ evms_cs_deallocate_memory(*md);
9620
+ evms_cs_deallocate_memory(*bbr_table);
9621
+ *bbr_table = NULL;
9629
+ * Function: bbr_load_feature_data
9630
+ * Load 2 copies meta data
9633
+static int load_feature_data(
9634
+ evms_logical_node_t *node,
9635
+ bbr_instance_data_t **ID)
9639
+ evms_bbr_metadata_t *md1 = NULL;
9640
+ evms_bbr_metadata_t *md2 = NULL;
9641
+ evms_bbr_table_t *table1 = NULL;
9642
+ evms_bbr_table_t *table2 = NULL;
9643
+ u_int64_t lba_table1 = 0;
9644
+ u_int64_t lba_table2 = 0;
9645
+ u_int32_t nr_sects = 0;
9649
+ /* Loads metadata 1 */
9650
+ rc1 = load_meta_data(node,
9651
+ node->feature_header->feature_data1_start_lsn,
9654
+ /* Loads metadata 2 */
9655
+ rc2 = load_meta_data(node,
9656
+ node->feature_header->feature_data2_start_lsn,
9660
+ if (rc1 && rc2) { /* both copies are bad ?*/
9661
+ rc = -ENODATA; /* cannot continue */
9663
+ if (!rc1 && !rc2) {
9664
+ lba_table1 = md1->start_sect_bbr_table;
9665
+ lba_table2 = md2->start_sect_bbr_table;
9666
+ nr_sects = validate_bbr_tables(node, md1, md2, table1, table2);
9667
+ if (nr_sects == 0) {
9671
+ /* only 1 copy of meta data */
9673
+ lba_table2 = md2->start_sect_bbr_table;
9674
+ /* free meta data 1 */
9675
+ evms_cs_deallocate_memory(table1);
9678
+ evms_cs_deallocate_memory(md1);
9682
+ lba_table1 = md1->start_sect_bbr_table;
9684
+ nr_sects = validate_bbr_table(md1,table1);
9685
+ if (nr_sects == 0) {
9691
+ if (!rc && nr_sects) {
9692
+ rc = evms_cs_allocate_memory((void **)ID, sizeof(bbr_instance_data_t));
9694
+ /* memset(*ID, 0, sizeof(bbr_instance_data_t)); */ /* not needed */
9695
+ (*ID)->source = node;
9696
+ (*ID)->blksize_in_sects = md1->block_size >> EVMS_VSECTOR_SIZE_SHIFT;
9697
+ (*ID)->remap_root = NULL;
9698
+ (*ID)->lba_table1 = lba_table1;
9699
+ (*ID)->lba_table2 = lba_table2;
9700
+ (*ID)->bbr_table = table1; /* use only 1 copy of bbr table */
9701
+ (*ID)->nr_sects_bbr_table = nr_sects;
9702
+ if (nr_sects < md1->nr_sects_bbr_table) {
9703
+ LOG_WARNING(" making bbr node read-only\n");
9704
+ (*ID)->flag |= EVMS_VOLUME_READ_ONLY;
9706
+ (*ID)->nr_replacement_blks = nr_sects * EVMS_BBR_ENTRIES_PER_SECT;
9707
+ (*ID)->start_replacement_sect = md1->start_replacement_sect;
9708
+ atomic_set(&(*ID)->in_use_replacement_blks,0);
9709
+ (*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED;
9710
+ rc = bbr_create_pools(*ID);
9712
+ atomic_set(&(*ID)->in_use_replacement_blks,bbr_table_to_remap_list(*ID));
9717
+ if (!bbr_io_thread) {
9718
+ const char * name1 = "evms_bbr_io";
9719
+ bbr_io_thread = evms_cs_register_thread(bbr_io_handler, NULL, name1);
9720
+ if (!bbr_io_thread) {
9726
+ /* if error, free table1 */
9729
+ evms_cs_deallocate_memory(table1);
9731
+ (*ID)->bbr_table = NULL;
9732
+ bbr_free_instance_data(*ID);
9737
+ /* Will never use md1, md2 and table2 again */
9739
+ evms_cs_deallocate_memory(md1);
9741
+ evms_cs_deallocate_memory(md2);
9743
+ evms_cs_deallocate_memory(table2);
9748
+#ifdef EVMS_BBR_DEBUG
9751
+ * bbr_print_binary_tree
9752
+ * Traverse the tree and print out each node
9754
+void print_binary_tree(bbr_runtime_remap_t *node)
9756
+ if (node == NULL) {
9759
+ LOG_DEFAULT("[%Lu,%Lu]\n",node->remap.bad_sect, node->remap.replacement_sect);
9760
+ print_binary_tree(node->left);
9761
+ print_binary_tree(node->right);
9766
+static void print_remap_list(bbr_instance_data_t *BBRID)
9768
+ if (!BBRID->remap_root)
9770
+ LOG_DEFAULT("%s for %s\n", __FUNCTION__,
9771
+ BBRID->node ? BBRID->node->name : "?");
9772
+ print_binary_tree(BBRID->remap_root);
9777
+#ifdef BBR_USE_RECURSIVE_FUNCTIONS
9780
+ * Recursive function to insert a node into the binary tree
9782
+void bbr_binary_tree_insert(bbr_runtime_remap_t **node, bbr_runtime_remap_t *newnode)
9784
+ if (*node == NULL) {
9785
+ newnode->left = newnode->right = NULL;
9789
+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
9790
+ return bbr_binary_tree_insert(&((*node)->right),newnode);
9792
+ return bbr_binary_tree_insert(&((*node)->left),newnode);
9797
+ * Recursive function to search for a node that contains bad_sect = lsn
9799
+bbr_runtime_remap_t * bbr_binary_search(bbr_runtime_remap_t *node, evms_sector_t lsn)
9801
+ if ((node == NULL) || (node->remap.bad_sect == lsn)) {
9804
+ if (lsn > node->remap.bad_sect)
9805
+ return bbr_binary_search(node->right, lsn);
9807
+ return bbr_binary_search(node->left, lsn);
9812
+ * Recursive function to detroy the binary tree
9814
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *node, bbr_instance_data_t *BBRID)
9817
+ bbr_binary_tree_destroy(node->left, BBRID);
9818
+ bbr_binary_tree_destroy(node->right, BBRID);
9819
+ evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
9826
+ * Insert a node into the binary tree
9828
+void bbr_binary_tree_insert(bbr_runtime_remap_t **root, bbr_runtime_remap_t *newnode)
9830
+ bbr_runtime_remap_t **node = root;
9831
+ while (node && *node) {
9832
+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
9833
+ node = &((*node)->right);
9835
+ node = &((*node)->left);
9838
+ newnode->left = newnode->right = NULL;
9843
+ * Search for a node that contains bad_sect = lsn
9845
+bbr_runtime_remap_t * bbr_binary_search(
9846
+ bbr_runtime_remap_t *root,
9847
+ evms_sector_t lsn)
9849
+ bbr_runtime_remap_t *node = root;
9851
+ if (node->remap.bad_sect == lsn)
9853
+ if (lsn > node->remap.bad_sect)
9854
+ node = node->right;
9856
+ node = node->left;
9862
+ * Destroy the binary tree
9864
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *root, bbr_instance_data_t *BBRID)
9866
+ bbr_runtime_remap_t **link = NULL;
9867
+ bbr_runtime_remap_t *node = root;
9871
+ link = &(node->left);
9872
+ node = node->left;
9875
+ if (node->right) {
9876
+ link = &(node->right);
9877
+ node = node->right;
9881
+ evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
9883
+ if (node == root) /* if root is deleted, it's done. */
9885
+ node = root; /* back to root */
9892
+static void bbr_free_remap(bbr_instance_data_t *BBRID)
9894
+ unsigned long flags;
9895
+ spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
9896
+ bbr_binary_tree_destroy(BBRID->remap_root, BBRID);
9897
+ BBRID->remap_root = NULL;
9898
+ spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9902
+ * bbr_insert_remap_entry
9904
+static int bbr_insert_remap_entry(bbr_instance_data_t *BBRID,
9905
+ evms_bbr_table_entry_t *new_bbr_entry)
9907
+ bbr_runtime_remap_t *newnode = NULL;
9908
+ unsigned long flags;
9911
+ newnode = kmem_cache_alloc (BBRID->remap_pool->cachep, SLAB_ATOMIC);
9914
+ LOG_SERIOUS("could not allocate from remap pool! (rc=%d)\n", rc);
9917
+ newnode->remap.bad_sect = new_bbr_entry->bad_sect;
9918
+ newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
9919
+ spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
9920
+ bbr_binary_tree_insert(&BBRID->remap_root,newnode);
9921
+ spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9926
+ * bbr_table_to_remap_list
9928
+ * The on-disk bbr table is sorted by the replacement sector LBA
9929
+ * In order to improve run time performance, the in memory remap
9930
+ * list must be sorted by the bad sector LBA.
9931
+ * This function is called at discovery time to initialize the remap
9932
+ * list. This function assumes that at least one copy of meta data is valid.
9934
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID)
9936
+ u_int32_t in_use_blks = 0;
9938
+ evms_bbr_table_t *p;
9941
+ for (i=0, p=BBRID->bbr_table; i<BBRID->nr_sects_bbr_table; i++, p++) {
9942
+ if (!p->in_use_cnt)
9944
+ in_use_blks += p->in_use_cnt;
9945
+ for (j=0; j<p->in_use_cnt; j++) {
9946
+ bbr_insert_remap_entry(BBRID, &p->entries[j]);
9951
+ return in_use_blks;
9955
+ * bbr_search_remap_entry
9957
+ * Search remap entry for the specified sector.
9958
+ * If found, return pointer to evms_bbr_table_entry_t.
9959
+ * Otherwise, return NULL.
9961
+static evms_bbr_table_entry_t * bbr_search_remap_entry(bbr_instance_data_t *BBRID, evms_sector_t lsn)
9963
+ bbr_runtime_remap_t *p;
9964
+ unsigned long flags;
9966
+ spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
9967
+ p = bbr_binary_search(BBRID->remap_root, lsn);
9968
+ spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9970
+ return (&p->remap);
9977
+ * if *lsn is in the remap table, return TRUE and modify *lsn
9978
+ * else, return FALSE.
9980
+static inline int bbr_remap(bbr_instance_data_t *BBRID,
9981
+ evms_sector_t *lsn)
9983
+ evms_bbr_table_entry_t *e;
9985
+ if (atomic_read(&BBRID->in_use_replacement_blks) &&
9986
+ !(BBRID->flag & BBR_STOP_REMAP) ) {
9987
+ e = bbr_search_remap_entry(BBRID,*lsn);
9989
+ *lsn = e->replacement_sect;
9990
+ LOG_EXTRA("%s replacement sector(LSN=%Lu)\n", __FUNCTION__, *lsn);
9999
+ * if any of the sectors [lsn, lsn+nr_sects] in the remap table
10001
+ * else, return FALSE.
10003
+static inline int bbr_remap_probe(
10004
+ bbr_instance_data_t *BBRID,
10005
+ evms_sector_t lsn,
10006
+ evms_sector_t nr_sects)
10008
+ evms_sector_t tmp, cnt;
10010
+ if (atomic_read(&BBRID->in_use_replacement_blks) &&
10011
+ !(BBRID->flag & BBR_STOP_REMAP) ) {
10012
+ for (cnt = 0, tmp=lsn;
10014
+ cnt += BBRID->blksize_in_sects, tmp = lsn + cnt) {
10015
+ if (bbr_remap(BBRID,&tmp))
10022
+static int bbr_create_pools(bbr_instance_data_t *BBRID)
10024
+ /* create a memory pool for the remap list */
10025
+ sprintf(BBRID->remap_pool_name, "BBR_REMAP_%p", BBRID);
10026
+ sprintf(BBRID->bh_pool_name, "BBR_BH_%p", BBRID);
10027
+ BBRID->remap_pool = evms_cs_create_pool(
10028
+ sizeof (bbr_runtime_remap_t), BBRID->remap_pool_name, NULL, NULL);
10029
+ BBRID->bbr_bh_pool = evms_cs_create_pool(
10030
+ sizeof(bbr_bh_t), BBRID->bh_pool_name, NULL, NULL);
10032
+ if (!BBRID->remap_pool || !BBRID->bbr_bh_pool) {
10033
+ BBR_BUG(" Could not allocate pools!");
10034
+ bbr_destroy_pools(BBRID);
10040
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID)
10042
+ if (BBRID->bbr_bh_pool)
10043
+ evms_cs_destroy_pool(BBRID->bbr_bh_pool);
10044
+ if (BBRID->remap_pool)
10045
+ evms_cs_destroy_pool(BBRID->remap_pool);
10048
+static int bbr_discover(evms_logical_node_t **discover_list)
10051
+ evms_logical_node_t *node, *next_node;
10052
+ evms_logical_node_t *bbr_node = NULL;
10053
+ bbr_instance_data_t *BBRID;
10055
+ next_node = *discover_list;
10056
+ while(next_node) {
10058
+ node = next_node;
10059
+ next_node = node->next;
10061
+ if ((!node->feature_header) || (node->feature_header->feature_id != plugin_header.id))
10062
+ continue; // probably a node we just put on the list, skip and go to next.
10064
+ rc = load_feature_data(node, &BBRID);
10066
+ /* error loading feature data */
10067
+ /* This node belongs to us, but metadata is invalid,
10068
+ * remove it from the discovery list
10070
+ * clear error code then continue.
10071
+ * Will consider creating a read only BBR node in the future.
10073
+ LOG_SERIOUS(" Error in node (%s) with %Lu sectors.\n",
10074
+ node->name,node->total_vsectors);
10075
+ evms_cs_remove_logical_node_from_list(discover_list, node);
10081
+ rc = evms_cs_allocate_logical_node(&bbr_node);
10085
+ bbr_node->volume_info = node->volume_info;
10086
+ bbr_node->flags |= node->flags;
10087
+ bbr_node->plugin = &plugin_header;
10088
+ strcpy(bbr_node->name, node->feature_header->object_name);
10089
+ bbr_node->hardsector_size = node->hardsector_size;
10090
+ bbr_node->total_vsectors = node->total_vsectors;
10091
+ bbr_node->total_vsectors -= (u_int64_t)(evms_cs_size_in_vsectors(sizeof(evms_feature_header_t)) * 2);
10092
+ bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data1_size;
10093
+ bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data2_size;
10094
+ bbr_node->block_size = node->block_size;
10095
+ bbr_node->instance_data = BBRID;
10096
+ BBRID->total_vsectors = bbr_node->total_vsectors;
10097
+ BBRID->node = bbr_node;
10099
+ /* free the feature header */
10100
+ evms_cs_deallocate_memory(node->feature_header);
10101
+ node->feature_header = NULL;
10102
+ evms_cs_remove_logical_node_from_list(discover_list, node);
10104
+ /* If bad blocks exist, give warning */
10105
+ bad_blocks = atomic_read(&BBRID->in_use_replacement_blks);
10106
+ if (bad_blocks) {
10107
+ BBR_DEBUG_PRINT_REMAP_LIST(BBRID);
10108
+ LOG_WARNING("%s has %d bad blocks\n", BBRID->source->name, bad_blocks);
10109
+ LOG_WARNING("There are %Lu total replacement blocks.\n",
10110
+ BBRID->nr_replacement_blks);
10111
+ LOG_WARNING("There are %Lu remaining replacement blocks.\n",
10112
+ BBRID->nr_replacement_blks - bad_blocks);
10115
+ evms_cs_add_logical_node_to_list(discover_list, bbr_node);
10117
+ MOD_INC_USE_COUNT;
10118
+ bbr_list_add(BBRID);
10120
+ LOG_SERIOUS("could not allocate logical node! rc=%d\n",rc);
10121
+ bbr_free_instance_data(BBRID);
10123
+ } /* end while()*/
10127
+static inline void bbr_list_add(bbr_instance_data_t *BBRID)
10129
+ BBRID->next = bbr_instances;
10130
+ bbr_instances = BBRID;
10133
+static void bbr_list_remove(bbr_instance_data_t *BBRID)
10135
+ bbr_instance_data_t *p;
10140
+ if (BBRID == bbr_instances) {
10141
+ bbr_instances = NULL;
10145
+ p = bbr_instances;
10147
+ if (p->next == BBRID) {
10148
+ p->next = p->next->next;
10155
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name)
10157
+ bbr_instance_data_t *p = bbr_instances;
10160
+ if (!strcmp(p->node->name, object_name))
10167
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID)
10169
+ if (BBRID->remap_root)
10170
+ bbr_free_remap(BBRID);
10171
+ bbr_destroy_pools(BBRID);
10172
+ if (BBRID->bbr_table)
10173
+ evms_cs_deallocate_memory(BBRID->bbr_table);
10174
+ bbr_list_remove(BBRID);
10175
+ evms_cs_deallocate_memory(BBRID);
10178
+static int bbr_delete(evms_logical_node_t *bbr_node)
10180
+ bbr_instance_data_t *BBRID;
10183
+ BBRID = bbr_node->instance_data;
10185
+ rc = DELETE(BBRID->source);
10187
+ /* Now cleanup and go away */
10188
+ bbr_free_instance_data(BBRID);
10189
+ evms_cs_deallocate_logical_node(bbr_node);
10190
+ MOD_DEC_USE_COUNT;
10191
+ if (!bbr_instances) {
10192
+ if (bbr_io_thread) {
10193
+ evms_cs_unregister_thread(bbr_io_thread);
10194
+ bbr_io_thread = NULL;
10201
+static bbr_bh_t * allocate_bbr_bh(bbr_instance_data_t *BBRID, int rw)
10203
+ bbr_bh_t * bbr_bh;
10205
+ bbr_bh = evms_cs_allocate_from_pool(BBRID->bbr_bh_pool, TRUE);
10207
+ memset(bbr_bh, 0, sizeof(bbr_bh_t));
10208
+ bbr_bh->BBRID = BBRID;
10210
+ atomic_set(&bbr_bh->waiters, 0);
10213
+ LOG_WARNING("Could not allocate from BBR BH pool!\n");
10218
+static void free_bbr_bh(bbr_bh_t *bbr_bh)
10220
+ evms_cs_deallocate_to_pool(bbr_bh->BBRID->bbr_bh_pool, bbr_bh);
10224
+/* bbr_io_remap_error
10226
+ * For the requested range, try to write each sector individually. For each
10227
+ * sector that fails, find the next available remap location and write the
10228
+ * data to that new location. Then update the table and write both copies
10229
+ * of the table to disk. Finally, update the in-memory mapping and do any
10230
+ * other necessary bookkeeping.
10232
+static int bbr_io_remap_error( bbr_instance_data_t * BBRID,
10234
+ evms_sector_t starting_lsn,
10235
+ evms_sector_t count,
10238
+ evms_sector_t lsn, new_lsn;
10239
+ evms_bbr_table_t * bbr_table;
10240
+ unsigned long table_sector_index;
10241
+ unsigned long table_sector_offset;
10242
+ unsigned long index;
10245
+ if ( rw == READ ) {
10246
+ // Nothing can be done about read errors.
10250
+ // For each sector in the request.
10251
+ for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
10252
+ rc = INIT_IO(BBRID->source, rw, starting_lsn + lsn, 1, buffer);
10254
+ if ( BBRID->flag & BBR_STOP_REMAP ) {
10255
+ // Can't allow new remaps if the engine told us to stop.
10256
+ LOG_ERROR("object %s: Bad sector (%Lu), but remapping is turned off.\n",
10257
+ BBRID->node->name, starting_lsn + lsn);
10261
+ // Find the next available relocation sector.
10262
+ new_lsn = atomic_read(&BBRID->in_use_replacement_blks);
10263
+ if ( new_lsn >= BBRID->nr_replacement_blks ) {
10264
+ // No more replacement sectors available.
10267
+ new_lsn += BBRID->start_replacement_sect;
10269
+ // Write the data to its new location.
10270
+ LOG_WARNING("object %s: Trying to remap bad sector (%Lu) to sector (%Lu)\n",
10271
+ BBRID->node->name, starting_lsn + lsn, new_lsn);
10272
+ rc = INIT_IO(BBRID->source, rw, new_lsn, 1, buffer);
10274
+ // This replacement sector is bad. Try the next.
10275
+ LOG_ERROR("object %s: Replacement sector (%Lu) is bad. Skipping.\n",
10276
+ BBRID->node->name, new_lsn);
10277
+ atomic_inc(&BBRID->in_use_replacement_blks);
10281
+ // Add this new entry to the on-disk table.
10282
+ table_sector_index = new_lsn - BBRID->start_replacement_sect;
10283
+ table_sector_offset = table_sector_index / EVMS_BBR_ENTRIES_PER_SECT;
10284
+ index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT;
10286
+ bbr_table = &BBRID->bbr_table[table_sector_offset];
10287
+ bbr_table->entries[index].bad_sect = starting_lsn + lsn;
10288
+ bbr_table->entries[index].replacement_sect = new_lsn;
10289
+ bbr_table->in_use_cnt++;
10290
+ bbr_table->sequence_number++;
10291
+ bbr_table->crc = 0;
10292
+ bbr_table->crc = evms_cs_calculate_crc( EVMS_INITIAL_CRC,
10294
+ sizeof(evms_bbr_table_t));
10296
+ // Write the table to disk.
10297
+ cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
10298
+ if ( BBRID->lba_table1 ) {
10299
+ rc = INIT_IO(BBRID->source, WRITE, BBRID->lba_table1 + table_sector_offset, 1, bbr_table);
10301
+ if ( BBRID->lba_table2 ) {
10302
+ rc |= INIT_IO(BBRID->source, WRITE, BBRID->lba_table2 + table_sector_offset, 1, bbr_table);
10304
+ le_bbr_table_sector_to_cpu(bbr_table);
10307
+ // Error writing one of the tables to disk.
10308
+ LOG_ERROR("object %s: Error updating BBR tables on disk.\n",
10309
+ BBRID->node->name);
10313
+ // Insert a new entry in the remapping binary-tree.
10314
+ rc = bbr_insert_remap_entry(BBRID, &bbr_table->entries[index]);
10316
+ LOG_ERROR("object %s: Error adding new entry to remap tree.\n",
10317
+ BBRID->node->name);
10321
+ atomic_inc(&BBRID->in_use_replacement_blks);
10329
+/* bbr_io_process_request
10331
+ * For each sector in this request, check if the sector has already
10332
+ * been remapped. If so, process all previous sectors in the request,
10333
+ * followed by the remapped sector. Then reset the starting lsn and
10334
+ * count, and keep going with the rest of the request as if it were
10335
+ * a whole new request. If any of the INIT_IO's return an error,
10336
+ * call the remapper to relocate the bad sector(s).
10338
+static int bbr_io_process_request( bbr_bh_t * bbr_bh )
10340
+ bbr_instance_data_t * BBRID = bbr_bh->BBRID;
10341
+ evms_sector_t starting_lsn = bbr_bh->eio.rsector;
10342
+ evms_sector_t count = bbr_bh->eio.rsize;
10343
+ evms_sector_t lsn, remapped_lsn;
10344
+ char * buffer = bbr_bh->eio.bh->b_data;
10345
+ int rc = 0, rw = bbr_bh->rw;
10347
+ // For each sector in this request, check if this sector has already
10348
+ // been remapped. If so, process all previous sectors in this request,
10349
+ // followed by the remapped sector. Then reset the starting lsn and
10350
+ // count and keep going with the rest of the request as if it were
10351
+ // a whole new request.
10352
+ for ( lsn = 0; lsn < count && !(BBRID->flag & BBR_STOP_REMAP); lsn++ ) {
10353
+ remapped_lsn = starting_lsn + lsn;
10354
+ rc = bbr_remap(BBRID, &remapped_lsn);
10356
+ // Process all sectors in the request up to this one.
10358
+ rc = INIT_IO(BBRID->source, rw, starting_lsn, lsn, buffer);
10360
+ // If this I/O failed, then one of the
10361
+ // sectors in this request needs to be
10363
+ rc = bbr_io_remap_error(BBRID, rw, starting_lsn, lsn, buffer);
10368
+ buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT);
10371
+ // Process the remapped sector.
10372
+ rc = INIT_IO(BBRID->source, rw, remapped_lsn, 1, buffer);
10374
+ // BUGBUG - Need more processing if this caused an error.
10375
+ // If this I/O failed, then the existing remap
10376
+ // is now bad, and we need to find a new remap.
10377
+ // Can't use bbr_io_remap_error(), because the
10378
+ // existing map entry needs to be changed, not
10379
+ // added again, and the original table entry
10380
+ // also needs to be changed.
10384
+ buffer += EVMS_VSECTOR_SIZE;
10385
+ starting_lsn += (lsn + 1);
10386
+ count -= (lsn + 1);
10391
+ // Check for any remaining sectors after the last split. This could
10392
+ // potentially be the whole request, but that should be a rare case
10393
+ // because requests should only be processed by the thread if we know
10394
+ // an error occurred or they contained one or more remapped sectors.
10396
+ rc = INIT_IO(BBRID->source, rw, starting_lsn, count, buffer);
10398
+ // If this I/O failed, then one of the sectors in this
10399
+ // request needs to be relocated.
10400
+ rc = bbr_io_remap_error(BBRID, rw, starting_lsn, lsn, buffer);
10413
+ * This is the handler for the bbr_io_thread. It continuously loops,
10414
+ * taking I/O requests off its list and processing them. If nothing
10415
+ * is on the list, the thread goes back to sleep until specifically
10418
+ * I/O requests should only be sent to this thread if we know that:
10419
+ * a) the request contains at least one remapped sector.
10421
+ * b) the request caused an error on the normal I/O path.
10422
+ * This function uses synchronous I/O, so sending a request to this
10423
+ * thread that doesn't need special processing will cause severe
10424
+ * performance degredation.
10426
+static void bbr_io_handler( void * void_data )
10428
+ bbr_bh_t * bbr_bh;
10429
+ struct buffer_head * bh;
10430
+ unsigned long flags;
10434
+ // Process bbr_io_list, one entry at a time.
10435
+ spin_lock_irqsave(&bbr_io_list_lock, flags);
10436
+ bbr_bh = bbr_io_list;
10438
+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10439
+ break; // No more items on the list.
10441
+ bbr_io_list = bbr_bh->next;
10442
+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10444
+ rc = bbr_io_process_request(bbr_bh);
10446
+ // Clean up and complete the original I/O.
10447
+ bh = bbr_bh->eio.bh;
10448
+ if (bh->b_end_io) {
10449
+ // A normal request that originated from above EVMS.
10450
+ if ( ! (bbr_bh->flag & BBR_BH_USE_EVMS_CALLBACK) ) {
10451
+ evms_cs_volume_request_in_progress(bh->b_dev, -1, NULL);
10453
+ free_bbr_bh(bbr_bh);
10454
+ bh->b_end_io(bh, rc ? 0 : 1);
10457
+ // A request that originated from bbr_init_io.
10459
+ if ( waitqueue_active(&bh->b_wait) ) {
10460
+ atomic_dec(&bbr_bh->waiters);
10461
+ wake_up(&bh->b_wait);
10468
+/* bbr_schedule_io
10470
+ * Place the specified bbr_bh on the thread's processing list.
10472
+static void bbr_schedule_io( bbr_bh_t * bbr_bh )
10474
+ unsigned long flags;
10476
+ spin_lock_irqsave(&bbr_io_list_lock, flags);
10477
+ if (bbr_io_list == NULL)
10478
+ bbr_io_list_tail = &bbr_io_list;
10479
+ *bbr_io_list_tail = bbr_bh;
10480
+ bbr_io_list_tail = &bbr_bh->next;
10481
+ bbr_bh->next = NULL;
10482
+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10483
+ evms_cs_wakeup_thread(bbr_io_thread);
10489
+ * If there are any remapped sectors on this object, send this request over
10490
+ * to the thread for processing. Otherwise send it down the stack normally.
10492
+static void bbr_read( evms_logical_node_t * bbr_node,
10495
+ bbr_instance_data_t * BBRID = bbr_node->instance_data;
10496
+ bbr_bh_t * bbr_bh;
10498
+ if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors ) {
10499
+ if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10500
+ BBRID->flag & BBR_STOP_REMAP ||
10501
+ ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
10502
+ R_IO(BBRID->source, eio);
10505
+ bbr_bh = allocate_bbr_bh(BBRID, READ);
10507
+ bbr_bh->eio = *eio;
10508
+ evms_cs_volume_request_in_progress(bbr_bh->eio.bh->b_dev, +1, NULL);
10509
+ bbr_schedule_io(bbr_bh);
10512
+ // Can't get memory to track the I/O.
10513
+ EVMS_IO_ERROR(eio);
10518
+ // Request is off the end of the object.
10519
+ EVMS_IO_ERROR(eio);
10524
+/* bbr_write_callback
10526
+ * This is the callback for normal write requests. Check for an error
10527
+ * during the I/O, and send to the thread for processing if necessary.
10529
+static void bbr_write_callback( bbr_bh_t * bbr_bh,
10530
+ struct buffer_head * bh,
10534
+ if ( ! uptodate &&
10535
+ ! (bbr_bh->BBRID->flag & BBR_STOP_REMAP) ) {
10536
+ LOG_ERROR("object %s: Write failure on sector (%Lu). Scheduling for retry.\n",
10537
+ bbr_bh->BBRID->node->name, bbr_bh->eio.rsector);
10538
+ bbr_schedule_io(bbr_bh);
10542
+ free_bbr_bh(bbr_bh);
10549
+ * If there are any remapped sectors on this object, send the request over
10550
+ * to the thread for processing. Otherwise, register for callback
10551
+ * notification, and send the request down normally.
10553
+static void bbr_write(evms_logical_node_t *bbr_node, eio_t *eio)
10555
+ bbr_instance_data_t * BBRID = bbr_node->instance_data;
10556
+ bbr_bh_t * bbr_bh;
10558
+ if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors &&
10559
+ ! (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
10560
+ bbr_bh = allocate_bbr_bh(BBRID, WRITE);
10562
+ bbr_bh->eio = *eio;
10564
+ if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10565
+ BBRID->flag & BBR_STOP_REMAP ||
10566
+ ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
10567
+ bbr_bh->flag |= BBR_BH_USE_EVMS_CALLBACK;
10568
+ evms_cs_register_for_end_io_notification(bbr_bh, eio->bh, bbr_write_callback);
10569
+ W_IO(BBRID->source, eio);
10572
+ evms_cs_volume_request_in_progress(eio->bh->b_dev, +1, NULL);
10573
+ bbr_schedule_io(bbr_bh);
10577
+ // Can't get memory to track the I/O.
10578
+ EVMS_IO_ERROR(eio);
10582
+ // Request is off the end of the object, or this
10583
+ // is a read-only object.
10584
+ EVMS_IO_ERROR(eio);
10589
+/********************************************************/
10590
+/* Required Plugin Function Table Entry Point: */
10591
+/* Init_io function */
10592
+/********************************************************/
10595
+static int bbr_init_io_schedule_io( bbr_instance_data_t * BBRID,
10597
+ evms_sector_t lsn,
10598
+ evms_sector_t count,
10601
+ bbr_bh_t * bbr_bh;
10602
+ struct buffer_head * bh;
10605
+ if ( rw == WRITE ) {
10606
+ LOG_ERROR("object %s: init_io write failure (sector %Lu: count %Lu). Scheduling for retry.\n",
10607
+ BBRID->node->name, lsn, count);
10608
+ bbr_bh = allocate_bbr_bh(BBRID,rw);
10610
+ bbr_bh->eio.rsector = lsn;
10611
+ bbr_bh->eio.rsize = count;
10613
+ bh = evms_cs_allocate_from_pool(evms_bh_pool, TRUE);
10615
+ bbr_bh->eio.bh = bh;
10617
+ memset(bh, 0, sizeof(*bh));
10618
+ init_waitqueue_head(&bh->b_wait);
10619
+ bh->b_data = buffer;
10620
+ bh->b_end_io = NULL;
10622
+ atomic_inc(&bbr_bh->waiters);
10623
+ bbr_schedule_io(bbr_bh);
10624
+ wait_event(bh->b_wait, (atomic_read(&bbr_bh->waiters) == 0));
10628
+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
10631
+ // Couldn't get buffer head.
10635
+ free_bbr_bh(bbr_bh);
10638
+ // Couldn't get bbr_bh.
10643
+ // Nothing can be done about read failures.
10650
+static int bbr_init_io( evms_logical_node_t * bbr_node,
10652
+ evms_sector_t start_lsn,
10653
+ evms_sector_t count,
10656
+ bbr_instance_data_t * BBRID;
10657
+ evms_sector_t lsn;
10660
+ if ( start_lsn + count <= bbr_node->total_vsectors ) {
10661
+ BBRID = bbr_node->instance_data;
10663
+ if ( io_flag == WRITE && (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
10664
+ // Can't write to a read-only object.
10668
+ if ( BBRID->flag & BBR_STOP_REMAP ) {
10669
+ // Can't remap at all.
10670
+ rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
10672
+ else if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10673
+ ! bbr_remap_probe(BBRID, start_lsn, count) ) {
10674
+ // Normal case (no existing remaps)
10675
+ rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
10677
+ // Init_io error. Send request over to
10678
+ // thread for further processing.
10679
+ rc = bbr_init_io_schedule_io(BBRID, io_flag, start_lsn, count, buffer);
10683
+ // At least one sector in this request needs to
10684
+ // be remapped. Test and send each one down
10686
+ for ( lsn = start_lsn; lsn < start_lsn + count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
10687
+ bbr_remap(BBRID, &lsn);
10688
+ rc = INIT_IO(BBRID->source, io_flag, lsn, 1, buffer);
10690
+ // Init_io error. Send request
10691
+ // to thread for processing.
10692
+ rc = bbr_init_io_schedule_io(BBRID, io_flag, lsn, 1, buffer);
10702
+ // Request is off the end of the object.
10710
+/********************************************************/
10711
+/* Required Plugin Function Table Entry Point: */
10712
+/* IOCTL function */
10713
+/********************************************************/
10715
+static int bbr_direct_ioctl_sector_io( bbr_instance_data_t * BBRID,
10716
+ evms_notify_bbr_t * ioctl_arg )
10718
+ char * buffer, *user_buffer;
10719
+ evms_sector_t lsn;
10722
+ if ( evms_cs_allocate_memory((void**)&buffer, EVMS_VSECTOR_SIZE) ) {
10726
+ user_buffer = (char*)ioctl_arg->buffer;
10728
+ for ( lsn = 0; lsn < ioctl_arg->nr_sect; lsn++, user_buffer += EVMS_VSECTOR_SIZE ) {
10729
+ if ( ioctl_arg->rw == WRITE ) {
10730
+ if ( copy_from_user(buffer, user_buffer, EVMS_VSECTOR_SIZE) ) {
10736
+ rc = bbr_init_io(BBRID->node, ioctl_arg->rw, ioctl_arg->start_sect + lsn, 1, buffer);
10741
+ if ( ioctl_arg->rw == READ ) {
10742
+ if ( copy_to_user(user_buffer, buffer, EVMS_VSECTOR_SIZE) ) {
10749
+ evms_cs_deallocate_memory(buffer);
10753
+static int bbr_direct_ioctl (
10754
+ struct inode *inode,
10755
+ struct file *file,
10756
+ unsigned int cmd,
10757
+ unsigned long arg)
10760
+ bbr_instance_data_t *BBRID;
10761
+ evms_plugin_ioctl_t argument;
10762
+ evms_notify_bbr_t ioctl_arg, *usr_ioctl_arg;
10764
+ if ( copy_from_user(&argument, (evms_plugin_ioctl_t *)arg, sizeof(argument)) ) {
10768
+ if ( argument.feature_id != plugin_header.id ) {
10772
+ usr_ioctl_arg = (evms_notify_bbr_t*)argument.feature_ioctl_data;
10773
+ if ( copy_from_user(&ioctl_arg, usr_ioctl_arg, sizeof(ioctl_arg)) ) {
10777
+ BBRID = bbr_find_instance_data(ioctl_arg.object_name);
10783
+ switch(argument.feature_command) {
10785
+ case BBR_STOP_REMAP_CMD:
10786
+ BBRID->flag |= BBR_STOP_REMAP;
10789
+ case BBR_GET_INFO_CMD:
10790
+ ioctl_arg.count = atomic_read(&BBRID->in_use_replacement_blks);
10791
+ if ( copy_to_user(&usr_ioctl_arg->count,
10792
+ &ioctl_arg.count,
10793
+ sizeof(usr_ioctl_arg->count)) ) {
10798
+ case BBR_SECTOR_IO_CMD:
10799
+ rc = bbr_direct_ioctl_sector_io(BBRID, &ioctl_arg);
10808
+ argument.status = rc;
10809
+ copy_to_user((evms_plugin_ioctl_t*)arg, &argument, sizeof(argument));
10813
+static int bbr_ioctl (evms_logical_node_t *bbr_node,
10814
+ struct inode *inode,
10815
+ struct file *file,
10816
+ unsigned int cmd,
10817
+ unsigned long arg)
10819
+ bbr_instance_data_t *BBRID;
10823
+ BBRID = bbr_node->instance_data;
10827
+ case EVMS_PLUGIN_IOCTL:
10828
+ rc = bbr_direct_ioctl(inode,file,cmd,arg);
10830
+ case EVMS_GET_BMAP:
10832
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
10834
+ bbr_remap(BBRID, &bmap->rsector);
10839
+ rc = IOCTL(BBRID->source, inode, file, cmd, arg);
10844
+int bbr_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
10846
+ if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
10848
+ LOG_DEFAULT("%s unregister BBR threads\n", __FUNCTION__);
10849
+ if (bbr_io_thread)
10850
+ evms_cs_unregister_thread(bbr_io_thread);
10851
+ mdelay(1000*1); /* delay some */
10853
+ return NOTIFY_DONE;
10856
+static int __init bbr_init(void)
10858
+ /* Register for reboot notification */
10859
+ register_reboot_notifier(&bbr_notifier);
10861
+ return evms_cs_register_plugin(&plugin_header);
10864
+static void __exit bbr_exit(void)
10866
+ evms_cs_unregister_plugin(&plugin_header);
10870
+module_init(bbr_init);
10871
+module_exit(bbr_exit);
10872
+#ifdef MODULE_LICENSE
10873
+MODULE_LICENSE("GPL");
10876
diff -Naur linux-2002-03-28/drivers/evms/evms_drivelink.c evms-2002-03-28/drivers/evms/evms_drivelink.c
10877
--- linux-2002-03-28/drivers/evms/evms_drivelink.c Wed Dec 31 18:00:00 1969
10878
+++ evms-2002-03-28/drivers/evms/evms_drivelink.c Wed Mar 27 15:51:36 2002
10880
+/* -*- linux-c -*- */
10885
+ * Copyright (c) International Business Machines Corp., 2000
10887
+ * This program is free software; you can redistribute it and/or modify
10888
+ * it under the terms of the GNU General Public License as published by
10889
+ * the Free Software Foundation; either version 2 of the License, or
10890
+ * (at your option) any later version.
10892
+ * This program is distributed in the hope that it will be useful,
10893
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10894
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
10895
+ * the GNU General Public License for more details.
10897
+ * You should have received a copy of the GNU General Public License
10898
+ * along with this program; if not, write to the Free Software
10899
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
10904
+ * linux/drivers/evms/drvlink.c
10907
+ * EVMS Drive Linking Feature.
10909
+ * This feature provides the ability to link multiple storage objects
10910
+ * together as a single virtual storage object.
10914
+#include <linux/module.h>
10915
+#include <linux/kernel.h>
10916
+#include <linux/config.h>
10917
+#include <linux/genhd.h>
10918
+#include <linux/blk.h>
10919
+#include <linux/evms/evms_kernel.h>
10920
+#include <linux/evms/evms_drivelink.h>
10921
+#include <asm/uaccess.h>
10923
+#define LOG_PREFIX "drivelink: "
10925
+/* prototypes for mandatory plugin interface functions */
10926
+static int drivelink_discover(evms_logical_node_t **);
10927
+static int drivelink_delete(evms_logical_node_t *);
10928
+static void drivelink_read(evms_logical_node_t *, eio_t *);
10929
+static void drivelink_write(evms_logical_node_t *, eio_t *);
10930
+static int drivelink_ioctl(evms_logical_node_t *,
10935
+static int drivelink_init_io(evms_logical_node_t *,
10941
+/* plugin function table definition */
10942
+static evms_plugin_function_table_t function_table = {
10943
+ discover: &drivelink_discover,
10944
+ delete : &drivelink_delete,
10945
+ read : &drivelink_read,
10946
+ write : &drivelink_write,
10947
+ init_io : &drivelink_init_io,
10948
+ ioctl : &drivelink_ioctl
10951
+/* plugin header definition */
10952
+static evms_plugin_header_t plugin_header = {
10953
+ id : SetPluginID(
10955
+ EVMS_FEATURE, //FEATURE class
10956
+ EVMS_DRIVELINK_FEATURE_ID), // unique id for feature
10958
+ major : EVMS_DRIVELINK_VERSION_MAJOR,
10959
+ minor : EVMS_DRIVELINK_VERSION_MINOR,
10960
+ patchlevel : EVMS_DRIVELINK_VERSION_PATCHLEVEL
10962
+ required_common_services_version : {
10967
+ function_table : &function_table // function table for this plugin
10970
+/********************************************************/
10971
+/* Required Plugin Function Table Entry Point: */
10972
+/* Discover function & Support routines */
10973
+/********************************************************/
10978
+ * convert feature data from on-disk (Little Endian) format
10979
+ * to the native cpu endian format.
10982
+le_feature_data_to_cpu(evms_drivelink_metadata_t *DLMD)
10986
+ DLMD->signature = le32_to_cpu(DLMD->signature);
10987
+ DLMD->crc = le32_to_cpu(DLMD->crc);
10988
+ DLMD->version.major = le32_to_cpu(DLMD->version.major);
10989
+ DLMD->version.minor = le32_to_cpu(DLMD->version.minor);
10990
+ DLMD->version.patchlevel = le32_to_cpu(DLMD->version.patchlevel);
10991
+ DLMD->flags = le32_to_cpu(DLMD->flags);
10992
+ DLMD->sequence_number = le64_to_cpu(DLMD->sequence_number);
10993
+ DLMD->child_serial_number = le64_to_cpu(DLMD->child_serial_number);
10994
+ DLMD->parent_serial_number = le64_to_cpu(DLMD->parent_serial_number);
10995
+ DLMD->child_count = le64_to_cpu(DLMD->child_count);
10996
+ for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) {
10997
+ evms_dl_ordering_table_entry_t *child_entry;
10999
+ child_entry = &DLMD->ordering_table[i];
11000
+ child_entry->child_serial_number =
11001
+ le64_to_cpu(child_entry->child_serial_number);
11002
+ child_entry->child_vsize =
11003
+ le64_to_cpu(child_entry->child_vsize);
11008
+load_feature_data(
11009
+ evms_logical_node_t *node,
11010
+ evms_drivelink_metadata_t **DLMD)
11012
+ int i, rc = 0, rc_array[2] = {0,0}, size_in_bytes;
11013
+ u_int64_t real_metadata_size, feature_data_size;
11014
+ u_int64_t starting_sector;
11015
+ evms_drivelink_metadata_t *cur_DLMD, *DLMD1, *DLMD2;
11016
+ char *location_name;
11018
+ /* verify the feature metadata size from the */
11019
+ /* feature header agrees with the real size */
11020
+ /* of the current metadata structure. */
11021
+ real_metadata_size = evms_cs_size_in_vsectors(sizeof(**DLMD));
11023
+ /* allocate a buffer large enough to hold all */
11024
+ /* sectors containing the feature's metadata */
11025
+ size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE;
11026
+ rc = evms_cs_allocate_memory((void **)&DLMD1, size_in_bytes);
11028
+ rc = evms_cs_allocate_memory((void **)&DLMD2, size_in_bytes);
11029
+ if (rc) evms_cs_deallocate_memory(DLMD1);
11032
+ for (i = 0; i < 2; i++) {
11034
+ starting_sector = node->feature_header->feature_data1_start_lsn;
11035
+ feature_data_size = node->feature_header->feature_data1_size;
11036
+ cur_DLMD = DLMD1;
11037
+ location_name = evms_primary_string;
11039
+ starting_sector = node->feature_header->feature_data2_start_lsn;
11040
+ feature_data_size = node->feature_header->feature_data2_size;
11041
+ cur_DLMD = DLMD2;
11042
+ location_name = evms_secondary_string;
11044
+ /* check that real metadata size matches the */
11045
+ /* feature data size */
11046
+ if (real_metadata_size != feature_data_size) {
11047
+ LOG_ERROR("%s feature data size(%Lu bytes) doesn't match expected size(%Lu bytes).\n",
11049
+ feature_data_size << EVMS_VSECTOR_SIZE_SHIFT,
11050
+ real_metadata_size << EVMS_VSECTOR_SIZE_SHIFT);
11052
+ rc_array[i] = rc;
11055
+ /* load the node's feature data */
11056
+ rc = INIT_IO(node,
11059
+ feature_data_size,
11062
+ LOG_ERROR("error(%d) probing for %s feature data at sector(%Ld) on '%s'.\n",
11067
+ rc_array[i] = rc;
11070
+ /* check for valid metadata signature */
11071
+ if (le32_to_cpu(cur_DLMD->signature) != EVMS_DRIVELINK_SIGNATURE) {
11073
+ LOG_SERIOUS("error(%d) invalid signature in %s feature data on '%s'\n",
11077
+ rc_array[i] = rc;
11080
+ /* validate feature data CRC */
11081
+ if (cur_DLMD->crc != EVMS_MAGIC_CRC) {
11082
+ int org_crc, final_crc;
11083
+ org_crc = le32_to_cpu(cur_DLMD->crc);
11084
+ cur_DLMD->crc = 0;
11085
+ final_crc = evms_cs_calculate_crc(
11086
+ EVMS_INITIAL_CRC,
11087
+ cur_DLMD, sizeof(*cur_DLMD));
11088
+ if (final_crc != org_crc) {
11089
+ LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n",
11090
+ org_crc, final_crc,
11094
+ rc_array[i] = rc;
11098
+ LOG_WARNING("CRC disabled in %s feature data on '%s'.\n",
11102
+ /* convert feature data from on-disk
11103
+ * format (Little Endian) to native
11104
+ * cpu endian format.
11106
+ le_feature_data_to_cpu(cur_DLMD);
11107
+ /* check for valid structure version */
11108
+ rc = evms_cs_check_version(
11109
+ &plugin_header.version,
11110
+ &cur_DLMD->version);
11112
+ LOG_SERIOUS("error(%d) obsolete version(%d,%d,%d) detected in %s feature data on '%s'\n",
11114
+ cur_DLMD->version.major,
11115
+ cur_DLMD->version.minor,
11116
+ cur_DLMD->version.patchlevel,
11119
+ rc_array[i] = rc;
11122
+ /* getting same return code for both copies? */
11123
+ if (rc_array[0] == rc_array[1]) {
11124
+ rc = rc_array[0];
11125
+ /* if no errors on both copies,
11126
+ * check the sequence numbers.
11127
+ * use the highest sequence number.
11130
+ /* compare sequence numbers */
11131
+ if (DLMD1->sequence_number == DLMD2->sequence_number) {
11132
+ cur_DLMD = DLMD1;
11134
+ LOG_WARNING("sequence number mismatches between front(%Ld) and rear(%Ld) feature data copies on node(%s)!\n",
11135
+ DLMD2->sequence_number,
11136
+ DLMD1->sequence_number,
11138
+ if (DLMD1->sequence_number > DLMD2->sequence_number)
11139
+ cur_DLMD = DLMD1;
11141
+ cur_DLMD = DLMD2;
11142
+ LOG_WARNING("using %s feature data copy!\n",
11143
+ (cur_DLMD == DLMD1) ?
11144
+ evms_primary_string :
11145
+ evms_secondary_string);
11148
+ /* getting different return codes for each copy */
11149
+ } else if (rc_array[0] == 0) {
11150
+ /* use 1st (rear) copy if its good */
11152
+ cur_DLMD = DLMD1;
11153
+ } else if (rc_array[1] == 0) {
11154
+ /* use 2nd (front) copy if its good */
11156
+ cur_DLMD = DLMD2;
11157
+ } else if ((rc_array[0] == -EINVAL) ||
11158
+ (rc_array[1] == -EINVAL)) {
11159
+ /* fail if either give a fatal error */
11164
+ /* deallocate metadata buffers appropriately */
11165
+ if (rc || (cur_DLMD == DLMD1))
11166
+ evms_cs_deallocate_memory(DLMD2);
11167
+ if (rc || (cur_DLMD == DLMD2))
11168
+ evms_cs_deallocate_memory(DLMD1);
11170
+ /* save validated feature header pointer */
11172
+ *DLMD = cur_DLMD;
11178
+find_parent_node_for_child_node(
11179
+ evms_logical_node_t *child_node,
11180
+ evms_drivelink_metadata_t *DLMD,
11181
+ evms_logical_node_t **parent_node,
11182
+ evms_drivelink_runtime_data_t **drivelink_instance_data,
11183
+ evms_logical_node_t **discover_list)
11185
+ int rc = 0, parent_found = FALSE;
11186
+ evms_logical_node_t *parent = NULL;
11187
+ evms_drivelink_runtime_data_t *DLID = NULL;
11189
+ /* find the parent node for this child */
11190
+ for (parent = *discover_list; parent; parent = parent->next) {
11191
+ /* only parent nodes will have null feature headers */
11192
+ if (!parent->feature_header) {
11193
+ DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11194
+ if (DLID->parent_serial_number == DLMD->parent_serial_number) {
11195
+ parent_found = TRUE;
11200
+ /* if no parent node found, create it */
11201
+ if (parent_found == FALSE) {
11202
+ rc = evms_cs_allocate_logical_node(&parent);
11204
+ /* transpose info from child to parent */
11205
+ parent->flags |= child_node->flags;
11206
+ strcpy(parent->name, child_node->feature_header->object_name);
11207
+ /* copy evms system data to parent */
11208
+ parent->volume_info = child_node->volume_info;
11209
+ /* initialize the plugin id field */
11210
+ parent->plugin = &plugin_header;
11211
+ /* allocate parent's instance data */
11212
+ rc = evms_cs_allocate_memory(
11213
+ (void **)&parent->instance_data,
11217
+ /* initialize some instance data fields */
11218
+ DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11219
+ DLID->parent_serial_number = DLMD->parent_serial_number;
11220
+ DLID->child_count = DLMD->child_count;
11221
+ /* allocate the child table */
11222
+ rc = evms_cs_allocate_memory(
11223
+ (void **)&DLID->child_table,
11224
+ sizeof(evms_drivelink_runtime_entry_t) *
11225
+ DLID->child_count);
11228
+ /* add the parent node to the discover list */
11229
+ rc = evms_cs_add_logical_node_to_list(discover_list, parent);
11230
+ MOD_INC_USE_COUNT;
11232
+ /* if any errors encountered, try to clean up */
11234
+ LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n",
11235
+ rc, child_node->name);
11244
+ *drivelink_instance_data = DLID;
11245
+ *parent_node = parent;
11251
+compute_child_index(
11252
+ evms_logical_node_t *node,
11253
+ evms_drivelink_metadata_t *DLMD)
11255
+ int i, position = -1;
11257
+ for(i = 0; i < DLMD->child_count; i++) {
11258
+ if (DLMD->ordering_table[i].child_serial_number ==
11259
+ DLMD->child_serial_number) {
11264
+ if (position == -1) {
11265
+ LOG_SERIOUS("%s: child not found from '%s'\n",
11266
+ __FUNCTION__, node->name);
11268
+ return(position);
11272
+process_child_nodes(evms_logical_node_t **discover_list)
11274
+ int rc = 0, index = -1;
11275
+ evms_logical_node_t *node, *next_node, *parent;
11276
+ evms_drivelink_metadata_t *DLMD;
11277
+ evms_drivelink_runtime_data_t *DLID;
11278
+ evms_drivelink_runtime_entry_t *child_entry = NULL;
11280
+ for (node = *discover_list; node; node = next_node) {
11281
+ next_node = node->next;
11282
+ if ( (!node->feature_header) ||
11283
+ (node->feature_header->feature_id != plugin_header.id) ) {
11287
+ rc = evms_cs_remove_logical_node_from_list(discover_list, node);
11289
+ /* we need to load the feature data to */
11290
+ /* find the parent's serial number this */
11291
+ /* child node belongs to. */
11293
+ rc = load_feature_data(node,&DLMD);
11295
+ /* find the parent node for this child */
11297
+ rc = find_parent_node_for_child_node(
11298
+ node, DLMD, &parent, &DLID, discover_list);
11301
+ /* determine position of child in drive link object */
11302
+ index = compute_child_index(node, DLMD);
11307
+ /* check for multiple child index requests */
11308
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[index];
11309
+ /* check to see if this child index is
11310
+ * already in use.
11312
+ if (child_entry->child_node) {
11313
+ LOG_SERIOUS("attempt to put '%s' in child index(%d). Already occupied by '%s'.\n",
11314
+ node->name, index, child_entry->child_node->name);
11319
+ /* fill in child info in parent */
11321
+ /* check the sector size for this node */
11322
+ if (node->hardsector_size > parent->hardsector_size)
11323
+ parent->hardsector_size = node->hardsector_size;
11324
+ /* check the block size for this node */
11325
+ if (node->block_size > parent->block_size)
11326
+ parent->block_size = node->block_size;
11327
+ /* set the child node */
11328
+ child_entry->child_node = node;
11329
+ /* set the metadata for this node */
11330
+ child_entry->child_metadata = DLMD;
11333
+ /* on error, clean up accordingly */
11336
+ evms_cs_deallocate_memory(DLMD);
11337
+ LOG_SERIOUS("%s: rc(%d) from '%s'\n",
11338
+ __FUNCTION__, rc, node->name);
11339
+ LOG_SERIOUS("deleting child node '%s'.\n",
11341
+ rc = DELETE(node);
11343
+ LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
11349
+ /* errors are handled internal to this function */
11350
+ /* by deleting the failed node. This will get */
11351
+ /* picked up by finalize_parent_nodes as a */
11352
+ /* missing child node */
11356
+#define TEST_CHILD_PRESENCE 0
11357
+#define TEST_CHILD_COUNT 1
11358
+#define TEST_CHILD_PARENTS_SERIAL_NUM 2
11359
+#define TEST_CHILD_POSITION 3
11360
+#define TEST_CHILD_METADATA 4
11363
+test_parent_node(evms_logical_node_t *node)
11366
+ evms_drivelink_runtime_data_t *DLID;
11367
+ evms_drivelink_runtime_entry_t *child_entry;
11369
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11370
+ for(i = 0; i < DLID->child_count; i++) {
11371
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11373
+ /* insure each child entry is filled */
11374
+ if (!child_entry->child_node) {
11376
+ EVMS_VOLUME_SET_READ_ONLY |
11377
+ EVMS_VOLUME_PARTIAL;
11378
+ LOG_ERROR("%s: missing child(%d).\n",__FUNCTION__,i);
11380
+ /* insure child count is the same */
11381
+ /* in each child's metadata */
11382
+ if (child_entry->child_metadata->child_count !=
11383
+ DLID->child_count) {
11384
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11385
+ LOG_ERROR("%s: child count wrong for node '%s'\n",
11386
+ __FUNCTION__, node->name);
11388
+ /* insure parent serial number is */
11389
+ /* the same in each child's metadata */
11390
+ if (child_entry->child_metadata->parent_serial_number !=
11391
+ DLID->parent_serial_number) {
11392
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11393
+ LOG_ERROR("%s: incorrect [is(%Ld), should be(%Ld)] child serial number for node '%s'\n",
11395
+ child_entry->child_metadata->parent_serial_number,
11396
+ DLID->parent_serial_number,
11399
+ /* insure each is in the correct entry */
11400
+ if (child_entry->child_metadata->ordering_table[i].child_serial_number !=
11401
+ child_entry->child_metadata->child_serial_number) {
11402
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11403
+ LOG_ERROR("%s: child reports different index for node '%s'\n",
11404
+ __FUNCTION__, node->name);
11406
+ evms_drivelink_runtime_entry_t *other_child_entry;
11408
+ /* compare the children's metadata */
11410
+ /* look for another present child to
11411
+ * compare against.
11413
+ other_child_entry = NULL;
11414
+ for (j = 0; j < DLID->child_count; j++) {
11415
+ /* skip comparing to ourselves */
11419
+ /* is this child is present? */
11420
+ if (DLID->child_table[j].child_node) {
11421
+ /* yes, use it */
11422
+ other_child_entry = &DLID->child_table[j];
11426
+ /* if we can't find another valid
11427
+ * child node's metadata to compare
11428
+ * against, just skip this test.
11430
+ if (!other_child_entry) {
11434
+ other_child_entry->child_metadata->ordering_table,
11435
+ child_entry->child_metadata->ordering_table,
11436
+ sizeof(child_entry->child_metadata->ordering_table));
11438
+ rc = -EVMS_FEATURE_FATAL_ERROR;
11439
+ LOG_ERROR("%s: mismatching child metadata for nodes '%s' and '%s'\n",
11440
+ __FUNCTION__, DLID->child_table[i-1].child_node->name,
11441
+ child_entry->child_node->name);
11444
+ /* stop if fatal error encountered */
11445
+ if (rc == -EVMS_FEATURE_FATAL_ERROR) {
11453
+ * function: perform_final_adjustments
11455
+ * This function does the following:
11456
+ * sets the vsize (in vsectors) field in each child node
11457
+ * sets the voffset (in vsectors) field in each child node
11458
+ * frees each child node's metadata
11459
+ * sets the parent's total size field
11462
+perform_final_adjustments(evms_logical_node_t *node)
11465
+ evms_drivelink_runtime_data_t *DLID;
11466
+ evms_drivelink_runtime_entry_t *child_entry = NULL;
11467
+ evms_drivelink_metadata_t *ref_data = NULL;
11469
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11470
+ /* find a valid copy of the ordering table.
11471
+ * since all the ordering tables are the same
11472
+ * we can just pick one to use for all the
11473
+ * child computations.
11475
+ for(i = 0; i < DLID->child_count; i++) {
11476
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11477
+ if (child_entry->child_node) {
11478
+ ref_data = child_entry->child_metadata;
11482
+ /* if we got this far, there should
11483
+ * always be at least one valid child.
11485
+ if (!ref_data) BUG();
11486
+ /* compute the parent's usable size,
11487
+ * and construct the table used to
11488
+ * remap parent I/Os to child I/Os */
11489
+ for(i = 0; i < DLID->child_count; i++) {
11490
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11491
+ /* set the LBA count for this child node */
11492
+ child_entry->vsize = ref_data->ordering_table[i].child_vsize;
11493
+ /* set the start LBA value for this child node */
11494
+ child_entry->voffset = node->total_vsectors;
11495
+ /* keep a running total of size in sectors */
11496
+ node->total_vsectors += child_entry->vsize;
11497
+ /* free the metadata for this child node */
11498
+ if (ref_data != child_entry->child_metadata) {
11499
+ evms_cs_deallocate_memory(child_entry->child_metadata);
11501
+ child_entry->child_metadata = NULL;
11502
+ /* free the feature header for this child node */
11503
+ if (child_entry->child_node) {
11504
+ evms_cs_deallocate_memory(child_entry->child_node->feature_header);
11505
+ child_entry->child_node->feature_header = NULL;
11508
+ /* free the reference data */
11509
+ evms_cs_deallocate_memory(ref_data);
11513
+finalize_parent_nodes(evms_logical_node_t **discover_list)
11516
+ evms_logical_node_t *node, *next_node;
11518
+ for (node = *discover_list; node; node = next_node) {
11519
+ next_node = node->next;
11520
+ /* only check parent nodes */
11521
+ if (!node->feature_header) {
11522
+ /* valid the children of this parent */
11523
+ rc = test_parent_node(node);
11525
+ /* compute parent size and
11526
+ * child remap table.
11528
+ perform_final_adjustments(node);
11530
+ /* fatal error encountered.
11531
+ * cleanup from this node and
11532
+ * delete it from memory.
11534
+ evms_cs_remove_logical_node_from_list(discover_list, node);
11535
+ rc2 = DELETE(node);
11537
+ LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
11538
+ rc2, node->name);
11547
+ * Function: discover drive linked storage objects
11551
+drivelink_discover(evms_logical_node_t **discover_list)
11555
+ rc = process_child_nodes(discover_list);
11557
+ rc = finalize_parent_nodes(discover_list);
11563
+/********************************************************/
11564
+/* Required Plugin Function Table Entry Point: */
11565
+/* Delete function */
11566
+/********************************************************/
11569
+ * Function: drivelink_delete
11573
+drivelink_delete(evms_logical_node_t * node)
11576
+ evms_drivelink_runtime_data_t *DLID;
11577
+ evms_drivelink_runtime_entry_t *child_entry;
11579
+ LOG_DETAILS("deleting '%s'.\n", node->name);
11581
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11583
+ for (i = 0; i < DLID->child_count; i++) {
11584
+ child_entry = &DLID->child_table[i];
11585
+ /* delete the child node */
11586
+ if (child_entry->child_node) {
11587
+ rc = DELETE(child_entry->child_node);
11589
+ child_entry->child_node = NULL;
11591
+ /* delete the child's metadata */
11592
+ if (child_entry->child_metadata) {
11593
+ evms_cs_deallocate_memory(child_entry->child_metadata);
11594
+ child_entry->child_metadata = NULL;
11598
+ /* delete the child table */
11599
+ if (DLID->child_table) {
11600
+ evms_cs_deallocate_memory(DLID->child_table);
11601
+ DLID->child_table = NULL;
11603
+ /* delete the instance data */
11604
+ evms_cs_deallocate_memory(DLID);
11605
+ node->instance_data = NULL;
11609
+ evms_cs_deallocate_logical_node(node);
11610
+ MOD_DEC_USE_COUNT;
11616
+/********************************************************/
11617
+/* Required Plugin Function Table Entry Point: */
11618
+/* Read function & Support routines */
11619
+/********************************************************/
11622
+ * function: which_child
11624
+ * This function find the child node a parent rsector maps to.
11625
+ * It then adjusts the rsector value to be child relative and
11626
+ * optionally computes the max # of sectors that can be access
11627
+ * from this starting point on the child. The child node, the
11628
+ * child relative rsector and max io size are returned to the
11632
+static evms_logical_node_t *
11634
+ evms_logical_node_t *parent,
11635
+ evms_sector_t *rsector,
11636
+ evms_sector_t *max_io_sects)
11639
+ evms_logical_node_t *child = NULL;
11640
+ evms_drivelink_runtime_data_t *DLID;
11641
+ evms_drivelink_runtime_entry_t *child_entry = NULL;
11643
+ DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11644
+ for (i = 0; i < DLID->child_count; i++) {
11645
+ child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11647
+ if (*rsector >= child_entry->vsize) {
11648
+ *rsector -= child_entry->vsize;
11650
+ /* get the child node */
11651
+ child = child_entry->child_node;
11652
+ /* compute the sector count if requested */
11653
+ if (max_io_sects)
11654
+ /* this is only used for INIT I/O
11655
+ * to return the largest sector
11656
+ * count size for this child based
11657
+ * on first sector in the I/O.
11660
+ child_entry->vsize - *rsector;
11668
+ * function: drivelink_io_error
11670
+ * this function was primarily created because the function
11671
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
11672
+ * to be set on inline functions. Since this was an error path
11673
+ * and not mainline, I decided to add a trace statement to help
11674
+ * report on the failing condition.
11678
+drivelink_io_error(
11679
+ evms_logical_node_t *node,
11683
+ LOG_SERIOUS("sector remap error %sING on (%s), rsector(%Ld).\n",
11684
+ (io_flag) ? "WRIT" : "READ",
11688
+ EVMS_IO_ERROR(eio);
11692
+ * Function: drivelink_read
11695
+drivelink_read(evms_logical_node_t *node, eio_t *eio)
11697
+ evms_logical_node_t *child;
11699
+ child = which_child(node, &eio->rsector, NULL);
11701
+ R_IO(child, eio);
11703
+ drivelink_io_error(node, READ, eio);
11707
+/********************************************************/
11708
+/* Required Plugin Function Table Entry Point: */
11709
+/* Read function & Support routines */
11710
+/********************************************************/
11713
+ * Function: drivelink_write
11717
+drivelink_write(evms_logical_node_t *node, eio_t *eio)
11719
+ evms_logical_node_t *child;
11721
+ child = which_child(node, &eio->rsector, NULL);
11723
+ W_IO(child, eio);
11725
+ drivelink_io_error(node, WRITE, eio);
11729
+/********************************************************/
11730
+/* Required Plugin Function Table Entry Point: */
11731
+/* Init I/O function */
11732
+/********************************************************/
11735
+ * function: init_io
11737
+ * This function must determine which child or children a
11738
+ * specified I/O request must be passed to. Also if, when,
11739
+ * and how a request must be broken up.
11743
+drivelink_init_io(
11744
+ evms_logical_node_t * node,
11745
+ int io_flag, /* 0=read, 1=write*/
11746
+ evms_sector_t sect_nr, /* disk LBA */
11747
+ evms_sector_t num_sects, /* # of sectors */
11748
+ void * buf_addr ) /* buffer address */
11755
+ evms_sector_t starting_sector, remaining_sectors;
11757
+ evms_drivelink_runtime_data_t *DLID;
11759
+ if ( (sect_nr + num_sects) > node->total_vsectors) {
11760
+ LOG_SERIOUS("attempted out of bound(%Ld) %s on '%s' at sector(%Ld), count(%Ld).\n",
11761
+ node->total_vsectors,
11762
+ (io_flag) ? "WRITE" : "READ",
11764
+ sect_nr, num_sects);
11767
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11768
+ /* make working copies of input parameters */
11769
+ starting_sector = sect_nr;
11770
+ remaining_sectors = num_sects;
11771
+ io_buf = buf_addr;
11772
+ /* loop until all I/O is performed */
11773
+ while(remaining_sectors) {
11774
+ evms_sector_t io_start, io_size;
11775
+ evms_logical_node_t *child;
11777
+ /* compute the child relative io_start
11778
+ * and max io_size.
11780
+ io_start = starting_sector;
11781
+ child = which_child(node, &io_start, &io_size);
11782
+ /* adjust io_size based on
11783
+ * original remaining sectors
11786
+ if (io_size > remaining_sectors)
11787
+ io_size = remaining_sectors;
11789
+ rc = INIT_IO(child,
11795
+ /* if partial volume, return 0's
11796
+ * for missing children.
11798
+ if (io_flag == READ) {
11799
+ memset(io_buf, 0, io_size << EVMS_VSECTOR_SIZE_SHIFT);
11803
+ /* adjust working copies */
11804
+ starting_sector += io_size;
11805
+ remaining_sectors -= io_size;
11806
+ io_buf += io_size <<
11807
+ EVMS_VSECTOR_SIZE_SHIFT;
11817
+/********************************************************/
11818
+/* Required Plugin Function Table Entry Point: */
11819
+/* IOCTL function & Support routines */
11820
+/********************************************************/
11823
+drivelink_ioctl_cmd_plugin_ioctl(
11824
+ evms_logical_node_t *node,
11825
+ struct inode *inode, struct file *file,
11826
+ unsigned long cmd, unsigned long arg)
11829
+ evms_drivelink_runtime_data_t *DLID;
11830
+ evms_plugin_ioctl_t tmp, *user_parms;
11832
+ user_parms = (evms_plugin_ioctl_t *)arg;
11833
+ /* copy user's parameters to kernel space */
11834
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
11838
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11839
+ /* is this cmd targetted at this feature ? */
11840
+ if (tmp.feature_id == node->plugin->id) {
11841
+ switch(tmp.feature_command) {
11845
+ } else { /* broadcast this cmd to all children */
11846
+ for (i = 0; i < DLID->child_count; i++) {
11847
+ rc = IOCTL(DLID->child_table[i].child_node,
11848
+ inode, file, cmd, arg);
11852
+ /* copy info to userspace */
11853
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
11860
+drivelink_ioctl_cmd_broadcast(
11861
+ evms_logical_node_t *node,
11862
+ struct inode *inode, struct file *file,
11863
+ unsigned long cmd, unsigned long arg)
11866
+ evms_drivelink_runtime_data_t *DLID;
11868
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11869
+ /* broadcast this cmd to all children */
11870
+ for (i = 0; i < DLID->child_count; i++) {
11871
+ evms_logical_node_t *child_node;
11873
+ child_node = DLID->child_table[i].child_node;
11874
+ if (child_node) {
11875
+ rc |= IOCTL(child_node, inode, file, cmd, arg);
11882
+ * Function: drivelink_ioctl
11887
+ evms_logical_node_t * node,
11888
+ struct inode * inode,
11889
+ struct file * file,
11890
+ unsigned int cmd,
11891
+ unsigned long arg)
11894
+ evms_drivelink_runtime_data_t *DLID = NULL;
11895
+ struct hd_geometry hdgeo;
11897
+ if ( (!node) || (!inode) )
11901
+ DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11903
+ case HDIO_GETGEO:
11904
+ hdgeo.heads = 255;
11905
+ hdgeo.sectors = 63;
11906
+ hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
11907
+ hdgeo.heads / hdgeo.sectors;
11909
+ if (copy_to_user((int *)arg,
11914
+ case EVMS_QUIESCE_VOLUME:
11915
+ case EVMS_GET_DISK_LIST:
11916
+ case EVMS_CHECK_MEDIA_CHANGE:
11917
+ case EVMS_REVALIDATE_DISK:
11918
+ case EVMS_OPEN_VOLUME:
11919
+ case EVMS_CLOSE_VOLUME:
11920
+ rc = drivelink_ioctl_cmd_broadcast(
11921
+ node, inode, file, cmd, arg);
11923
+ case EVMS_PLUGIN_IOCTL:
11924
+ rc = drivelink_ioctl_cmd_plugin_ioctl(
11925
+ node, inode, file, cmd, arg);
11927
+ case EVMS_GET_BMAP:
11929
+ evms_get_bmap_t *bmap;
11930
+ evms_sector_t io_start, io_size;
11931
+ evms_logical_node_t *child;
11933
+ bmap = (evms_get_bmap_t *)arg;
11934
+ io_start = bmap->rsector;
11935
+ child = which_child(node, &io_start, &io_size);
11937
+ if (node->block_size !=
11938
+ child->block_size) {
11939
+ bmap->status = -EPERM;
11941
+ bmap->rsector = io_start;
11942
+ rc = IOCTL(child,
11960
+/********************************************************/
11961
+/* Required Module Entry Point: */
11962
+/* drivelink_init */
11963
+/********************************************************/
11966
+ * Function: drivelink_init
11970
+drivelink_init(void)
11972
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
11976
+drivelink_exit(void)
11978
+ evms_cs_unregister_plugin(&plugin_header);
11981
+module_init(drivelink_init);
11982
+module_exit(drivelink_exit);
11983
+#ifdef MODULE_LICENSE
11984
+MODULE_LICENSE("GPL");
11987
diff -Naur linux-2002-03-28/drivers/evms/evms_ecr.c evms-2002-03-28/drivers/evms/evms_ecr.c
11988
--- linux-2002-03-28/drivers/evms/evms_ecr.c Wed Dec 31 18:00:00 1969
11989
+++ evms-2002-03-28/drivers/evms/evms_ecr.c Wed Mar 6 16:01:37 2002
11991
+/* -*- linux-c -*- */
11994
+ * Copyright (c) International Business Machines Corp., 2000
11996
+ * This program is free software; you can redistribute it and/or modify
11997
+ * it under the terms of the GNU General Public License as published by
11998
+ * the Free Software Foundation; either version 2 of the License, or
11999
+ * (at your option) any later version.
12001
+ * This program is distributed in the hope that it will be useful,
12002
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12003
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12004
+ * the GNU General Public License for more details.
12006
+ * You should have received a copy of the GNU General Public License
12007
+ * along with this program; if not, write to the Free Software
12008
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12011
+/* linux/driver/evms/evms_ecr.c
12013
+ * EVMS - Cluster enablement (ECR) module
12018
+#include <linux/kernel.h>
12019
+#include <linux/module.h>
12020
+#include <linux/init.h>
12021
+#include <linux/types.h>
12022
+#include <linux/evms/evms_ecr.h>
12024
+#define LOG_PREFIX "ecr: "
12030
+ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table,
12031
+ ecr_cred_t * cred, size_t size, ecr_instance_t *instance)
12041
+ * ecr_group_leave
12043
+void ecr_group_leave(ecr_group_t group)
12054
+int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message,
12055
+ size_t size, ecr_instance_t *instance,
12056
+ void callback(int ret, ecr_instance_t *instance))
12065
+ * ecr_group_send_wait
12067
+int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message,
12068
+ size_t size, int *ret)
12078
+ * ecr_group_broadcast
12080
+int ecr_group_broadcast(ecr_group_t group, void *message, size_t size,
12081
+ ecr_instance_t *instance,
12082
+ void callback(u_char ret, ecr_instance_t *instance))
12091
+ * ecr_group_broadcast_wait
12093
+int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size,
12104
+ * ecr_group_atomic_execute
12106
+int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size,
12107
+ ecr_instance_t *instance,
12108
+ void callback(ecr_instance_t *instance))
12117
+ * ecr_group_atomic_execute_wait
12119
+int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size)
12128
+ * ecr_group_success_response
12130
+void ecr_group_success_response(ecr_message_t *handle)
12140
+ * ecr_group_failure_response
12142
+void ecr_group_failure_response(ecr_message_t *handle, int ret)
12151
+ * ecr_lock_create
12153
+ecr_lock_t ecr_lock_create(char *lockname)
12162
+int ecr_lock(ecr_lock_t lock, u_int64_t start, u_int64_t length,
12163
+ ecr_lock_mode_t mode, u_char flag)
12174
+int ecr_unlock(ecr_lock_t lock, u_int64_t start, u_int64_t length)
12181
+/********************************************************/
12182
+/* Required Module Entry Point: */
12184
+/********************************************************/
12186
+static int __init ecr_init(void)
12192
+static void __exit ecr_exit(void)
12197
+module_init(ecr_init);
12198
+module_exit(ecr_exit);
12199
+#ifdef MODULE_LICENSE
12200
+MODULE_LICENSE("GPL");
12203
diff -Naur linux-2002-03-28/drivers/evms/evms_passthru.c evms-2002-03-28/drivers/evms/evms_passthru.c
12204
--- linux-2002-03-28/drivers/evms/evms_passthru.c Wed Dec 31 18:00:00 1969
12205
+++ evms-2002-03-28/drivers/evms/evms_passthru.c Mon Mar 18 17:39:22 2002
12207
+/* -*- linux-c -*- */
12212
+ * Copyright (c) International Business Machines Corp., 2000
12214
+ * This program is free software; you can redistribute it and/or modify
12215
+ * it under the terms of the GNU General Public License as published by
12216
+ * the Free Software Foundation; either version 2 of the License, or
12217
+ * (at your option) any later version.
12219
+ * This program is distributed in the hope that it will be useful,
12220
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12221
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12222
+ * the GNU General Public License for more details.
12224
+ * You should have received a copy of the GNU General Public License
12225
+ * along with this program; if not, write to the Free Software
12226
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12231
+ * linux/drivers/evms/evms_passthru.c
12233
+ * EVMS System Data Manager
12238
+#include <linux/module.h>
12239
+#include <linux/kernel.h>
12240
+#include <linux/config.h>
12241
+#include <linux/genhd.h>
12242
+#include <linux/major.h>
12243
+#include <linux/string.h>
12244
+#include <linux/blk.h>
12245
+#include <linux/init.h>
12246
+#include <linux/slab.h>
12247
+#include <linux/evms/evms_kernel.h>
12248
+#include <asm/system.h>
12250
+#define EVMS_PASSTHRU_ID 0
12251
+#define LOG_PREFIX "passthru: "
12253
+static int passthru_mgr_discover(evms_logical_node_t **);
12254
+static int passthru_mgr_delete(evms_logical_node_t *);
12255
+static void passthru_mgr_read(evms_logical_node_t *,
12257
+static void passthru_mgr_write(evms_logical_node_t *,
12259
+static int passthru_mgr_ioctl(evms_logical_node_t *,
12264
+static int passthru_mgr_init_io(evms_logical_node_t *,
12270
+static evms_plugin_function_table_t function_table = {
12271
+ discover: &passthru_mgr_discover,
12272
+ delete : &passthru_mgr_delete,
12273
+ read : &passthru_mgr_read,
12274
+ write : &passthru_mgr_write,
12275
+ init_io : &passthru_mgr_init_io,
12276
+ ioctl : &passthru_mgr_ioctl
12279
+static evms_plugin_header_t plugin_header = {
12280
+ id : SetPluginID(
12283
+ EVMS_PASSTHRU_ID),
12289
+ required_common_services_version : {
12294
+ function_table : &function_table // function table for this plugin
12297
+/*******************************/
12298
+/* discovery support functions */
12299
+/*******************************/
12302
+process_passthru_data(evms_logical_node_t **pp)
12304
+ int rc, size_in_sectors;
12305
+ evms_logical_node_t *node, *new_node;
12309
+ size_in_sectors = evms_cs_size_in_vsectors(
12310
+ sizeof(evms_feature_header_t));
12312
+ /* allocate "parent" node */
12313
+ rc = evms_cs_allocate_logical_node(&new_node);
12315
+ /* initialize "parent" node */
12316
+ new_node->instance_data = node;
12317
+ new_node->flags = node->flags;
12318
+ new_node->plugin = &plugin_header;
12319
+ new_node->system_id = node->system_id;
12320
+ new_node->block_size = node->block_size;
12321
+ new_node->hardsector_size = node->hardsector_size;
12322
+ new_node->total_vsectors = node->total_vsectors;
12323
+ new_node->total_vsectors -=
12324
+ (size_in_sectors << 1) +
12325
+ node->feature_header->alignment_padding;
12326
+ new_node->volume_info = node->volume_info;
12327
+ strcpy(new_node->name, node->name);
12328
+ if (strlen(node->feature_header->object_name))
12329
+ strcat(new_node->name, node->feature_header->object_name);
12331
+ strcat(new_node->name, "_Passthru");
12333
+ /* return "parent" node to caller */
12336
+ MOD_INC_USE_COUNT;
12338
+ LOG_DETAILS("feature header found on '%s', created '%s'.\n",
12339
+ node->name, new_node->name);
12340
+ /* we're done with the passthru feature headers
12341
+ * so lets delete them now.
12343
+ evms_cs_deallocate_memory(node->feature_header);
12344
+ node->feature_header = NULL;
12346
+ /* on any fatal error, delete the node */
12347
+ int rc2 = DELETE(node);
12349
+ LOG_DEFAULT("error(%d) attempting to delete node(%p,%s).\n",
12350
+ rc2, node, node->name);
12356
+/********** Required Plugin Functions **********/
12360
+ * Function: passthru_mgr_discover
12364
+passthru_mgr_discover(evms_logical_node_t **discover_list)
12367
+ evms_logical_node_t *node, *tmp_list_head;
12369
+ tmp_list_head = *discover_list;
12370
+ *discover_list = NULL;
12372
+ while(tmp_list_head) {
12373
+ node = tmp_list_head;
12374
+ rc = evms_cs_remove_logical_node_from_list(&tmp_list_head, node);
12376
+ rc = process_passthru_data(&node);
12379
+ rc = evms_cs_add_logical_node_to_list(discover_list, node);
12385
+ * Function: passthru_mgr_delete
12389
+passthru_mgr_delete(evms_logical_node_t * node)
12392
+ evms_logical_node_t *p;
12394
+ LOG_DETAILS("deleting '%s'.\n", node->name);
12396
+ p = node->instance_data;
12399
+ evms_cs_deallocate_logical_node(node);
12400
+ MOD_DEC_USE_COUNT;
12406
+ * function: passthru_io_error
12408
+ * this function was primarily created because the function
12409
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
12410
+ * to be set on inline functions. Since this was an error path
12411
+ * and not mainline, I decided to add a trace statement to help
12412
+ * report on the failing condition.
12416
+passthru_io_error(
12417
+ evms_logical_node_t *node,
12421
+ LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
12422
+ (io_flag) ? "WRITE" : "READ",
12423
+ node->total_vsectors - 1,
12427
+ EVMS_IO_ERROR(eio);
12431
+ * Function: passthru_mgr_read
12434
+passthru_mgr_read(
12435
+ evms_logical_node_t *node,
12438
+ if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
12439
+ R_IO(((evms_logical_node_t*)(node->instance_data)),
12442
+ passthru_io_error(node, READ, eio);
12446
+ * Function: passthru_mgr_write
12450
+passthru_mgr_write(
12451
+ evms_logical_node_t *node,
12454
+ if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
12455
+ W_IO(((evms_logical_node_t*)(node->instance_data)),
12458
+ passthru_io_error(node, WRITE, eio);
12462
+ * Function: passthru_mgr_ioctl
12466
+passthru_mgr_ioctl(
12467
+ evms_logical_node_t * node,
12468
+ struct inode * inode,
12469
+ struct file * file,
12470
+ unsigned int cmd,
12471
+ unsigned long arg)
12475
+ if ((!node) || (!inode))
12478
+ rc = IOCTL(((evms_logical_node_t*)(node->instance_data)), inode, file, cmd, arg);
12484
+passthru_mgr_init_io(
12485
+ evms_logical_node_t * node,
12486
+ int io_flag, /* 0=read, 1=write*/
12487
+ evms_sector_t sect_nr, /* disk LBA */
12488
+ evms_sector_t num_sects, /* # of sectors */
12489
+ void * buf_addr ) /* buffer address */
12492
+ if ((sect_nr + num_sects) <= node->total_vsectors) {
12493
+ rc = INIT_IO(((evms_logical_node_t*)(node->instance_data)),
12494
+ io_flag, sect_nr, num_sects, buf_addr);
12503
+ * Function: passthru_init
12507
+evms_passthru_manager_init(void)
12509
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
12513
+evms_passthru_manager_exit(void)
12515
+ evms_cs_unregister_plugin(&plugin_header);
12518
+module_init(evms_passthru_manager_init);
12519
+module_exit(evms_passthru_manager_exit);
12520
+#ifdef MODULE_LICENSE
12521
+MODULE_LICENSE("GPL");
12524
diff -Naur linux-2002-03-28/drivers/evms/ldev_mgr.c evms-2002-03-28/drivers/evms/ldev_mgr.c
12525
--- linux-2002-03-28/drivers/evms/ldev_mgr.c Wed Dec 31 18:00:00 1969
12526
+++ evms-2002-03-28/drivers/evms/ldev_mgr.c Wed Mar 27 16:25:55 2002
12528
+/* -*- linux-c -*- */
12531
+ * Copyright (c) International Business Machines Corp., 2000
12533
+ * This program is free software; you can redistribute it and/or modify
12534
+ * it under the terms of the GNU General Public License as published by
12535
+ * the Free Software Foundation; either version 2 of the License, or
12536
+ * (at your option) any later version.
12538
+ * This program is distributed in the hope that it will be useful,
12539
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12540
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12541
+ * the GNU General Public License for more details.
12543
+ * You should have received a copy of the GNU General Public License
12544
+ * along with this program; if not, write to the Free Software
12545
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12548
+/* linux/driver/evms/ldev_mgr.c
12550
+ * EVMS - Local Device (Hard Drive) Manager
12552
+ * This plugin walks the gendisk list and creates logical disk structures for each
12553
+ * local ide or scsi device.
12557
+#include <linux/config.h>
12558
+#include <linux/module.h>
12559
+#include <linux/errno.h>
12560
+#include <linux/kernel.h>
12561
+#include <linux/fs.h>
12562
+#include <linux/major.h>
12563
+#include <linux/slab.h>
12564
+#include <asm/uaccess.h>
12565
+#include <linux/blk.h> /* must be included by all block drivers */
12566
+#include <linux/genhd.h>
12567
+#include <linux/ide.h>
12568
+#include "../scsi/scsi.h"
12569
+#include "../scsi/sd.h"
12570
+#include <linux/init.h>
12571
+#include <linux/evms/evms_kernel.h>
12573
+#define LOG_PREFIX "ldev_mgr: "
12575
+#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
12577
+/* local instance data structure definition */
12578
+typedef struct ldev_mgr_instance_data_s {
12580
+ struct gendisk *gd;
12581
+ int media_changed;
12582
+} ldev_mgr_instance_data_t;
12584
+/* prototypes for mandatory plugin interface functions */
12585
+static int discover_disks(evms_logical_node_t **);
12586
+static int ldev_mgr_delete(evms_logical_node_t *);
12587
+static void ldev_mgr_read(evms_logical_node_t *, eio_t *);
12588
+static void ldev_mgr_write(evms_logical_node_t *, eio_t *);
12589
+static int ldev_mgr_ioctl(evms_logical_node_t *,
12594
+static int ldev_init_io(evms_logical_node_t *,
12600
+/* plugin function table definition */
12601
+static evms_plugin_function_table_t function_table = {
12602
+ discover : &discover_disks,
12603
+ delete : &ldev_mgr_delete,
12604
+ read : &ldev_mgr_read,
12605
+ write : &ldev_mgr_write,
12606
+ init_io : &ldev_init_io,
12607
+ ioctl : &ldev_mgr_ioctl
12610
+/* plugin header definition */
12611
+static evms_plugin_header_t plugin_header = {
12612
+ id : SetPluginID(
12614
+ EVMS_DEVICE_MANAGER,
12615
+ EVMS_LOCAL_DEVICE_MANAGER_ID),
12621
+ required_common_services_version : {
12626
+ function_table : &function_table
12629
+#define TYPE_NONE 0
12630
+#define TYPE_GENERIC 1
12631
+#define TYPE_IDE 2
12632
+#define TYPE_SCSI 3
12634
+#define INDEX_ALPHA 0
12635
+#define INDEX_NUMERIC 1
12637
+/********************************************************/
12638
+/* Required Plugin Function Table Entry Point: */
12639
+/* Discover function & Support routines */
12640
+/********************************************************/
12642
+#define MAX_NAME_BASE_SIZE 10
12643
+#define MAX_NAME_MODIFIER_SIZE 4
12644
+typedef struct blk_device_info_s {
12645
+ char devnode_name_base[MAX_NAME_BASE_SIZE];
12647
+ char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE];
12649
+ int devnode_name_index;
12650
+ int devnode_name_type;
12652
+} blk_device_info_t;
12654
+static blk_device_info_t *blk_dev_info = NULL;
12656
+#define BLK_DEV_INFO(a,b,c,d,e) \
12657
+ strncpy(blk_dev_info[a].devnode_name_base, b, MAX_NAME_BASE_SIZE); \
12658
+ blk_dev_info[a].null1 = 0; \
12659
+ strncpy(blk_dev_info[a].devnode_name_modifier, c, MAX_NAME_MODIFIER_SIZE); \
12660
+ blk_dev_info[a].null2 = 0; \
12661
+ blk_dev_info[a].devnode_name_index = 0; \
12662
+ blk_dev_info[a].device_type = d; \
12663
+ blk_dev_info[a].devnode_name_type = e;
12666
+init_blk_dev_info( blk_device_info_t *blk_dev_info )
12668
+ BLK_DEV_INFO( IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA );
12669
+ BLK_DEV_INFO( IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA );
12670
+ BLK_DEV_INFO( IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA );
12671
+ BLK_DEV_INFO( IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA );
12672
+ BLK_DEV_INFO( IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA );
12673
+ BLK_DEV_INFO( IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA );
12674
+ BLK_DEV_INFO( IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA );
12675
+ BLK_DEV_INFO( IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA );
12676
+ BLK_DEV_INFO( IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA );
12677
+ BLK_DEV_INFO( IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA );
12679
+ BLK_DEV_INFO( SCSI_DISK0_MAJOR, "sd", "a", TYPE_SCSI, INDEX_ALPHA );
12680
+ BLK_DEV_INFO( SCSI_DISK1_MAJOR, "sd", "q", TYPE_SCSI, INDEX_ALPHA );
12681
+ BLK_DEV_INFO( SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA );
12682
+ BLK_DEV_INFO( SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA );
12683
+ BLK_DEV_INFO( SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA );
12684
+ BLK_DEV_INFO( SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA );
12685
+ BLK_DEV_INFO( SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA );
12686
+ BLK_DEV_INFO( SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA );
12688
+// BLK_DEV_INFO( MD_MAJOR, "md", "0", TYPE_GENERIC, INDEX_NUMERIC );
12690
+ BLK_DEV_INFO( XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA );
12692
+ BLK_DEV_INFO( CYCLADES_MAJOR, "double", "0", TYPE_GENERIC, INDEX_NUMERIC );
12694
+ BLK_DEV_INFO( MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA );
12696
+ BLK_DEV_INFO( ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA );
12698
+ BLK_DEV_INFO( PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA );
12700
+ BLK_DEV_INFO( 40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA );
12701
+ BLK_DEV_INFO( 43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC );
12702
+ BLK_DEV_INFO( 44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA );
12703
+ BLK_DEV_INFO( 45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA );
12704
+ BLK_DEV_INFO( 47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC );
12706
+ BLK_DEV_INFO( DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12707
+ BLK_DEV_INFO( DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12708
+ BLK_DEV_INFO( DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12709
+ BLK_DEV_INFO( DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12710
+ BLK_DEV_INFO( DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12711
+ BLK_DEV_INFO( DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12712
+ BLK_DEV_INFO( DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12713
+ BLK_DEV_INFO( DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12715
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR, "ida/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12716
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12717
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12718
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12719
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12720
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12721
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12722
+ BLK_DEV_INFO( COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12724
+ BLK_DEV_INFO( I2O_MAJOR + 0, "i2o/hd", "a", TYPE_GENERIC, INDEX_ALPHA );
12725
+ BLK_DEV_INFO( I2O_MAJOR + 1, "i2o/hd", "q", TYPE_GENERIC, INDEX_ALPHA );
12726
+ BLK_DEV_INFO( I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA );
12727
+ BLK_DEV_INFO( I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA );
12728
+ BLK_DEV_INFO( I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA );
12729
+ BLK_DEV_INFO( I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA );
12730
+ BLK_DEV_INFO( I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA );
12731
+ BLK_DEV_INFO( I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA );
12733
+ BLK_DEV_INFO( 92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12734
+ BLK_DEV_INFO( 93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA );
12736
+ BLK_DEV_INFO( DASD_MAJOR, "dasd", "a", TYPE_GENERIC, INDEX_ALPHA );
12737
+ BLK_DEV_INFO( MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA );
12739
+ BLK_DEV_INFO( 96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12740
+ BLK_DEV_INFO( 97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12742
+ BLK_DEV_INFO( UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12744
+ BLK_DEV_INFO( JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC );
12746
+ BLK_DEV_INFO( 101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC );
12748
+ BLK_DEV_INFO( 104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12749
+ BLK_DEV_INFO( 105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12750
+ BLK_DEV_INFO( 106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12751
+ BLK_DEV_INFO( 107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12752
+ BLK_DEV_INFO( 108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12753
+ BLK_DEV_INFO( 108, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12754
+ BLK_DEV_INFO( 110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12755
+ BLK_DEV_INFO( 111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12757
+ BLK_DEV_INFO( RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC );
12759
+ BLK_DEV_INFO( VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC );
12760
+ BLK_DEV_INFO( VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC );
12764
+is_in_device_list(
12765
+ struct gendisk *gd,
12766
+ int major, int minor)
12768
+ int found, done, rc;
12769
+ evms_logical_node_t *device = NULL;
12770
+ ldev_mgr_instance_data_t *LID;
12772
+ done = found = FALSE;
12773
+ while(done == FALSE) {
12774
+ rc = evms_cs_find_next_device(device, &device);
12775
+ if (rc || !device)
12778
+ LID = device->instance_data;
12779
+ if (LID->gd == gd)
12780
+ if (MAJOR(LID->dev) == major)
12781
+ if (MINOR(LID->dev) == minor)
12782
+ done = found = TRUE;
12789
+build_devnode_name(char *name_buf, int major)
12791
+ char buf[11], *modifier, *buf_ptr;
12793
+ blk_device_info_t *bdi;
12795
+ bdi = &blk_dev_info[major];
12797
+ /* convert the base name modifier to an integer */
12798
+ modifier = bdi->devnode_name_modifier;
12800
+ while (*modifier) {
12801
+ if (bdi->devnode_name_type == INDEX_ALPHA) {
12803
+ int_mod += *modifier - 'a';
12806
+ int_mod += *modifier - '0';
12810
+ /* add in device_index_value */
12811
+ int_mod += bdi->devnode_name_index;
12812
+ bdi->devnode_name_index++;
12814
+ /* convert integer modifier back to ALPHA/NUMERIC chars */
12815
+ memset(buf, 0, sizeof(buf));
12816
+ /* fill the buffer from the rear to front with the
12817
+ * ascii version of the modifier, leaving space for
12818
+ * NULL terminator at the end.
12820
+ buf_ptr = &buf[sizeof(buf) - 2];
12822
+ if (bdi->devnode_name_type == INDEX_ALPHA) {
12823
+ *buf_ptr = (int_mod % 26) + 'a';
12826
+ *buf_ptr = (int_mod % 10) + '0';
12830
+ } while (int_mod);
12832
+ /* find beginning of modifier in buffer */
12834
+ while (!*modifier)
12837
+ /* build the final device devnode name */
12838
+ sprintf(name_buf, "%s%s",
12839
+ bdi->devnode_name_base,
12843
+#define DEVICE_KNOWN 1234
12844
+#define DEVICE_UNINITIALIZED 1235
12845
+#define DEVICE_MEDIA_NOT_PRESENT 1236
12847
+create_logical_disk(
12848
+ evms_logical_node_t **disk_list,
12849
+ struct gendisk *gd,
12850
+ int device_index)
12852
+ int rc = 0, major, minor;
12853
+ evms_logical_node_t *new_disk;
12854
+ ldev_mgr_instance_data_t *InstData;
12855
+ char device_name[EVMS_VOLUME_NAME_SIZE + 1];
12857
+ major = gd->major;
12858
+ minor = device_index << gd->minor_shift;
12860
+ /* skip uninitialized devices */
12861
+ if (!blk_size[major])
12862
+ rc = DEVICE_UNINITIALIZED;
12863
+ else if (!blk_size[major][minor])
12864
+ rc = DEVICE_UNINITIALIZED;
12866
+ /* construct the devnode name for this device */
12867
+ build_devnode_name(device_name, major);
12869
+ /* skip devices we already know about */
12870
+ if (is_in_device_list(gd, major, minor) == TRUE)
12871
+ rc = DEVICE_KNOWN;
12873
+ /* allocate the new node & it's instance data */
12875
+ rc = evms_cs_allocate_logical_node(&new_disk);
12877
+ rc = evms_cs_allocate_memory((void **)&InstData,sizeof(ldev_mgr_instance_data_t));
12879
+ evms_cs_deallocate_logical_node(new_disk);
12881
+ /* initialize the new node */
12883
+ struct hd_geometry dev_geo;
12884
+ new_disk->plugin = &plugin_header;
12886
+ /* initialize the instance data */
12887
+ new_disk->instance_data = InstData;
12888
+ InstData->dev = MKDEV(major, minor);
12889
+ InstData->gd = gd;
12891
+ /* determine hardsector size */
12892
+ new_disk->hardsector_size = 512;
12893
+ if (hardsect_size[major])
12894
+ new_disk->hardsector_size = hardsect_size[major][minor];
12896
+ /* determine block size */
12897
+ new_disk->block_size = 1024;
12898
+ if (blksize_size[major])
12899
+ new_disk->block_size = blksize_size[major][minor];
12901
+ /* determine the device size in sectors */
12902
+ new_disk->total_vsectors = blk_size[major][minor] << 1;
12903
+ /* check the size based on the device geometry
12904
+ * and use this if its larger than the blk_size
12905
+ * info. because of odd(non-even) geometry, the
12906
+ * total sector count could be an odd number,
12907
+ * and we need to insure we truly reflect the
12908
+ * maximum size of the device.
12910
+ rc = evms_cs_kernel_ioctl(
12913
+ (unsigned long)&dev_geo);
12915
+ LOG_ERROR("error(%d) retrieving geometry for '%s'.\n",
12916
+ rc, device_name);
12920
+ dev_size = dev_geo.cylinders;
12921
+ dev_size *= (u64)dev_geo.heads;
12922
+ dev_size *= (u64)dev_geo.sectors;
12924
+ /* convert device size to 512 byte units */
12925
+ dev_size <<= evms_cs_log2(new_disk->hardsector_size) - 9;
12927
+ if (dev_size > new_disk->total_vsectors) {
12928
+ new_disk->total_vsectors = dev_size;
12930
+ LOG_DEBUG("blk_size(%Lu), geometry size(%Lu) in 512 byte units.\n",
12931
+ (u64)blk_size[major][minor] << 1,
12935
+ /* remember removable devices */
12937
+ if (gd->flags[device_index] & GENHD_FL_REMOVABLE)
12938
+ new_disk->flags |= EVMS_DEVICE_REMOVABLE;
12940
+ /* save the devnode name for this device */
12941
+ strcpy(new_disk->name, device_name);
12943
+ /* register this device with evms */
12944
+ evms_cs_register_device(new_disk);
12945
+ MOD_INC_USE_COUNT;
12947
+ /* append this record the linked list */
12948
+ evms_cs_add_logical_node_to_list(disk_list, new_disk);
12949
+ LOG_DETAILS("added logical disk(%s) for physical disk(%u,%u,%s), size(%Lu) in 512 byte units\n",
12953
+ new_disk->total_vsectors);
12956
+ /* reset the "benign" error codes for the caller */
12958
+ case DEVICE_UNINITIALIZED:
12959
+ case DEVICE_KNOWN:
12960
+ case DEVICE_MEDIA_NOT_PRESENT:
12967
+create_logical_generic_disks(
12968
+ evms_logical_node_t **disk_list,
12969
+ struct gendisk *gd)
12973
+ /* This is a generic device */
12976
+ LOG_DEBUG("major name = %s\n", gd->major_name);
12977
+ LOG_DEBUG("number of real devices = %i\n", gd->nr_real);
12978
+ for ( i = 0; i < gd->nr_real; i++ ) {
12979
+ LOG_DEBUG("device %d:\n", i);
12980
+ rc = create_logical_disk(disk_list, gd, i);
12987
+create_logical_ide_disks(
12988
+ evms_logical_node_t **disk_list,
12989
+ struct gendisk *gd)
12992
+ ide_hwif_t * ide_hwif;
12993
+ ide_drive_t * drive;
12995
+ /* This is an IDE device */
12996
+ LOG_DEBUG("found IDE major : %i - searching for disks\n",
12999
+ ide_hwif = gd->real_devices; /* IDE internal data */
13000
+ for (i = 0; i < MAX_DRIVES; i++) {
13001
+ drive = &(ide_hwif->drives[i]);
13002
+ if (drive->present && (drive->media == ide_disk)) {
13003
+ /* force the name index value on ide drives */
13004
+ blk_dev_info[gd->major].devnode_name_index = i;
13005
+ rc = create_logical_disk(disk_list, gd, i);
13013
+create_logical_scsi_disks(
13014
+ evms_logical_node_t **disk_list,
13015
+ struct gendisk *gd)
13018
+ Scsi_Disk *SDisks;
13019
+ Scsi_Device *SDev;
13021
+ /* This is an SCSI device */
13022
+ LOG_DEBUG("found SCSI major : %i - searching for disks\n",gd->major);
13023
+ LOG_DEBUG("scsi: major name = %s\n",gd->major_name);
13024
+ LOG_DEBUG("scsi: number of real devices = %i\n",gd->nr_real);
13025
+ SDisks = gd->real_devices; /* SCSI internal data */
13026
+ for ( i = 0; i < gd->nr_real; i++ ) {
13027
+ SDev = SDisks[i].device;
13028
+ LOG_DEBUG("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n",
13029
+ SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity);
13030
+ rc = create_logical_disk(disk_list, gd, i);
13037
+create_logical_disks(struct gendisk *gd,
13038
+ void * p_disk_list)
13041
+ evms_logical_node_t **disk_list = p_disk_list;
13043
+ /* create logical disks from all IDE & SCSI devices */
13044
+ switch(blk_dev_info[gd->major].device_type) {
13046
+ rc = create_logical_ide_disks(disk_list, gd);
13049
+ rc = create_logical_scsi_disks(disk_list, gd);
13051
+ case TYPE_GENERIC:
13052
+ rc = create_logical_generic_disks(disk_list, gd);
13055
+ LOG_DEBUG("unrecognized device major : %i\n",gd->major);
13063
+discover_disks(evms_logical_node_t **disk_list)
13067
+ LOG_ENTRY_EXIT("%s Entry\n", __FUNCTION__);
13069
+ if (blk_dev_info == NULL) {
13070
+ /* allocate space for device info array */
13071
+ rc = evms_cs_allocate_memory(
13072
+ (void **)&blk_dev_info,
13073
+ sizeof(blk_device_info_t) * (MAX_BLKDEV + 1));
13075
+ /* initialize device info array */
13076
+ init_blk_dev_info(blk_dev_info);
13079
+ /* create logical disks from the raw devices */
13080
+ rc = walk_gendisk(create_logical_disks, disk_list);
13082
+ /* free blk_dev_info table and null the ptr to it */
13083
+ evms_cs_deallocate_memory(blk_dev_info);
13084
+ blk_dev_info = NULL;
13086
+ LOG_ENTRY_EXIT("%s Exit\n", __FUNCTION__);
13090
+/********************************************************/
13091
+/* Required Plugin Function Table Entry Point: */
13092
+/* Delete function */
13093
+/********************************************************/
13096
+ldev_mgr_delete(evms_logical_node_t *disk)
13098
+ ldev_mgr_instance_data_t *LID;
13100
+ /* reset any evms volume related info from
13101
+ * the device node, because we can't predict
13102
+ * how this node will be used in the future.
13105
+ /* removed the feature header if its been used
13107
+ if (disk->feature_header) {
13108
+ evms_cs_deallocate_memory(disk->feature_header);
13109
+ disk->feature_header = NULL;
13111
+ /* remove the volume_info structure and flag
13112
+ * if this has been used directly by an evms
13115
+ evms_cs_deallocate_volume_info(disk);
13116
+ /* reset the flags field to the appropriate state
13118
+ disk->flags &= ~EVMS_VOLUME_FLAG;
13120
+ /* disk nodes only get deleted when:
13121
+ * 1) there are no references to the disk node
13123
+ * 2) the device is removable
13124
+ * 3) the device reported a media change
13126
+ * All three of these conditions must be true
13127
+ * before the disk node can be deleted.
13128
+ * evms_check_for_device_changes should set
13129
+ * and ensure these conditions before issuing
13132
+ * Newly installed removable media will be
13133
+ * picked up in this modules discover code.
13135
+ if (disk->flags & EVMS_MEDIA_CHANGED) {
13136
+ LOG_DETAILS("deleting '%s'.\n",disk->name);
13138
+ evms_cs_unregister_device(disk);
13139
+ MOD_DEC_USE_COUNT;
13140
+ LID = disk->instance_data;
13142
+ evms_cs_deallocate_memory(LID);
13144
+ evms_cs_deallocate_logical_node(disk);
13149
+/********************************************************/
13150
+/* Required Plugin Function Table Entry Point: */
13151
+/* Read function */
13152
+/********************************************************/
13155
+ * function: ldev_mgr_io_error
13157
+ * this function was primarily created because the function
13158
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
13159
+ * to be set on inline functions. Since this was an error path
13160
+ * and not mainline, I decided to add a trace statement to help
13161
+ * report on the failing condition.
13165
+ldev_mgr_io_error(
13166
+ evms_logical_node_t *disk,
13171
+ if (rc == -EOVERFLOW) {
13172
+ LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
13173
+ (io_flag) ? "WRITE" : "READ",
13174
+ disk->total_vsectors - 1,
13177
+ } else if (rc == -ENXIO) {
13178
+ LOG_SERIOUS("attempt to access a non-existent device(%s).\n",
13182
+ EVMS_IO_ERROR(eio);
13186
+ldev_mgr_read(evms_logical_node_t *disk, eio_t *eio)
13189
+ request_queue_t *q;
13190
+ ldev_mgr_instance_data_t *InstData;
13192
+ InstData = disk->instance_data;
13193
+ if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
13194
+ eio->bh->b_rsector = eio->rsector;
13195
+ eio->bh->b_size = eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
13196
+ eio->bh->b_rdev = InstData->dev;
13197
+ q = blk_get_queue(InstData->dev);
13199
+ q->make_request_fn(q, READ, eio->bh);
13203
+ disk->flags |= EVMS_VOLUME_CORRUPT |
13204
+ EVMS_VOLUME_GENDISK_GONE;
13210
+ ldev_mgr_io_error(disk, READ, eio, rc);
13214
+/********************************************************/
13215
+/* Required Plugin Function Table Entry Point: */
13216
+/* Write function */
13217
+/********************************************************/
13220
+ldev_mgr_write(evms_logical_node_t *disk, eio_t *eio)
13223
+ request_queue_t *q;
13224
+ ldev_mgr_instance_data_t *InstData;
13226
+ InstData = disk->instance_data;
13227
+ if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
13228
+ eio->bh->b_rsector = eio->rsector;
13229
+ eio->bh->b_size = eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
13230
+ eio->bh->b_rdev = InstData->dev;
13231
+ q = blk_get_queue(InstData->dev);
13233
+ q->make_request_fn(q, WRITE, eio->bh);
13237
+ disk->flags |= EVMS_VOLUME_CORRUPT |
13238
+ EVMS_VOLUME_GENDISK_GONE;
13244
+ ldev_mgr_io_error(disk, WRITE, eio, rc);
13248
+/********************************************************/
13249
+/* Required Plugin Function Table Entry Point: */
13250
+/* Init_io function & Support routines */
13251
+/********************************************************/
13254
+ * function: allocate_bh
13256
+ * This function obtains a buffer head from the private
13257
+ * buffer head pool (pre-allocated at EVMS initial
13258
+ * discovery time).
13260
+ * NOTE: All access to the buffer head pool are protected
13261
+ * by a private spinlock.
13264
+static inline struct buffer_head *
13267
+ struct buffer_head *bh =
13268
+ evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
13270
+ init_waitqueue_head(&bh->b_wait);
13276
+ * function: deallocate_bh
13278
+ * This function returns a buffer head to the private
13279
+ * buffer head pool (pre-allocated at EVMS initial
13280
+ * discovery time).
13282
+ * NOTE: All access to the buffer head pool are protected
13283
+ * by a private spinlock.
13286
+static inline void
13287
+deallocate_bh(struct buffer_head *bh)
13289
+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
13292
+/* this is the buffer head control block structure definition */
13293
+typedef struct bh_cb_s {
13295
+ atomic_t blks_allocated;
13296
+ wait_queue_head_t cb_wait;
13300
+ * function: __wait_on_bh_cb
13302
+ * This is a worker function to wait_on_bh_cb.
13303
+ * This function waits for a set of private buffer heads
13304
+ * associated to the specified buffer head control block
13305
+ * to return from I/O completion. On completion of the
13306
+ * last buffer head, the calling function is awakened
13307
+ * and continues running.
13309
+ * This is the worker function to the function wait_on_bh_cb.
13313
+__wait_on_bh_cb(bh_cb_t *bh_cb)
13315
+ struct task_struct *tsk = current;
13316
+ DECLARE_WAITQUEUE(wait, tsk);
13318
+ add_wait_queue(&bh_cb->cb_wait, &wait);
13320
+ run_task_queue(&tq_disk);
13321
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
13322
+ if (!atomic_read(&bh_cb->blks_allocated))
13325
+ } while (atomic_read(&bh_cb->blks_allocated));
13326
+ tsk->state = TASK_RUNNING;
13327
+ remove_wait_queue(&bh_cb->cb_wait, &wait);
13331
+ * function: wait_on_bh_cb
13333
+ * This function waits for a set of private buffer heads
13334
+ * associated to the specified buffer head control block
13335
+ * to return from I/O completion. On completion of the
13336
+ * last buffer head, the calling function is awakened
13337
+ * and continues running.
13341
+wait_on_bh_cb(bh_cb_t *bh_cb)
13343
+ if (atomic_read(&bh_cb->blks_allocated))
13344
+ __wait_on_bh_cb(bh_cb);
13346
+ /* if we ended up with no buffer heads on
13347
+ * this pass, lets wait a until a few buffer
13348
+ * heads have been freed and try again. This
13349
+ * should provide a reasonable delay.
13355
+ * function: end_bh_cb_io
13357
+ * This is the I/O completion function that is called for
13358
+ * each private buffer head obtained from the buffer head
13359
+ * pool. Control is return thru this routine so we can track
13360
+ * all outstanding requests to know when to awaken the caller,
13361
+ * and to regain control after all I/Os have been performed.
13365
+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
13367
+ bh_cb_t *bh_cb = (bh_cb_t *)bh->b_private;
13369
+ /* record that errors occurred */
13371
+ bh_cb->rc = -EIO;
13373
+ mark_buffer_uptodate(bh, uptodate);
13374
+ unlock_buffer(bh);
13376
+ deallocate_bh(bh);
13377
+ atomic_dec(&bh_cb->blks_allocated);
13378
+ if (!atomic_read(&bh_cb->blks_allocated))
13379
+ if (waitqueue_active(&bh_cb->cb_wait))
13380
+ wake_up(&bh_cb->cb_wait);
13384
+ * function: ldev_partial_sector_init_io
13386
+ * This function is a support function for ldev_init_io,
13387
+ * which handles the cases of performing I/O to only a part
13388
+ * of non-standard sized hardsector. This function is not
13389
+ * designed to be called directly, but via ldev_init_io.
13393
+ldev_partial_sector_init_io(
13394
+ evms_logical_node_t *node,
13397
+ u_int64_t next_lsn,
13398
+ u_int64_t sector_lsn,
13399
+ u_int64_t io_size,
13401
+ unsigned char **sector_buf )
13404
+ ldev_mgr_instance_data_t *InstData = node->instance_data;
13405
+ kdev_t dev = InstData->dev;
13406
+ struct buffer_head *bh;
13408
+ if (*sector_buf == NULL) {
13409
+ /* allocate buffer for incoming sector */
13410
+ rc = evms_cs_allocate_memory((void **)sector_buf,
13411
+ node->hardsector_size);
13412
+ if (rc) return(rc);
13414
+ /* allocate a buffer head from the pool */
13415
+ while((bh = allocate_bh()) == NULL)
13416
+ /* yielding the cpu is playing it
13417
+ * safe. it might be wiser to just
13418
+ * spin. requires more thought.
13422
+ /* set up the buffer head for this sector */
13423
+ bh->b_end_io = end_bh_cb_io_sync;
13424
+ bh->b_size = node->hardsector_size;
13425
+ bh->b_rdev = dev;
13426
+ bh->b_rsector = next_lsn - sector_lsn;
13427
+ bh->b_data = *sector_buf;
13428
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13430
+ set_bit(BH_Dirty, &bh->b_state);
13431
+ set_bit(BH_Lock, &bh->b_state);
13432
+ set_bit(BH_Req, &bh->b_state);
13433
+ set_bit(BH_Mapped, &bh->b_state);
13434
+ bh->b_private = (void *)bh_cb;
13435
+ atomic_inc(&bh_cb->blks_allocated);
13437
+ /* drive the buffer head down */
13438
+ /* to the device */
13439
+ generic_make_request(READ, bh);
13441
+ /* wait for all bh's I/O's to end */
13442
+ wait_on_bh_cb(bh_cb);
13444
+ /* copy data to/from user */
13445
+ if (io_flag != WRITE)
13448
+ *sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
13449
+ io_size << EVMS_VSECTOR_SIZE_SHIFT);
13452
+ memcpy(*sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
13454
+ io_size << EVMS_VSECTOR_SIZE_SHIFT);
13456
+ /* allocate a buffer head from the pool */
13457
+ while((bh = allocate_bh()) == NULL)
13458
+ /* yielding the cpu is playing it
13459
+ * safe. it might be wiser to just
13460
+ * spin. requires more thought.
13464
+ /* set up the buffer head for this sector */
13465
+ bh->b_end_io = end_bh_cb_io_sync;
13466
+ bh->b_size = node->hardsector_size;
13467
+ bh->b_rdev = dev;
13468
+ bh->b_rsector = next_lsn - sector_lsn;
13469
+ bh->b_data = *sector_buf;
13470
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13472
+ set_bit(BH_Dirty, &bh->b_state);
13473
+ set_bit(BH_Lock, &bh->b_state);
13474
+ set_bit(BH_Req, &bh->b_state);
13475
+ set_bit(BH_Mapped, &bh->b_state);
13476
+ bh->b_private = (void *)bh_cb;
13477
+ atomic_inc(&bh_cb->blks_allocated);
13479
+ /* drive the buffer head down */
13480
+ /* to the device */
13481
+ generic_make_request(WRITE, bh);
13483
+ /* wait for all bh's I/O's to end */
13484
+ wait_on_bh_cb(bh_cb);
13490
+ * function: ldev_init_io
13492
+ * This function provides support for synchronous I/O
13493
+ * operations to the underlying devices. These I/O
13494
+ * operations are NOT buffered in any way including the
13495
+ * operating system's buffer cache.
13497
+ * This function can work with any hardsector size that
13498
+ * is a power of 2.
13500
+ * node : logical node of the target logical disk
13501
+ * io_flag : 0 = read, 1 = write, 2 = read-a-head
13502
+ * starting_lsn : the 0-based (disk relative) logical
13503
+ * : (512 byte) sector number (lsn)
13504
+ * num_lsns : the total number of lsns in this I/O
13505
+ * bufptr : address of the memory to read/write the data
13510
+ evms_logical_node_t *node,
13512
+ u_int64_t starting_lsn,
13513
+ u_int64_t num_lsns,
13516
+ int rc = 0, lsns_per_hardsector, lsns_per_blocksize;
13517
+ unchar *sector_buf = NULL, *cur_bufptr;
13518
+ u_int64_t next_lsn, remaining_lsns, sector_lsn;
13519
+ ldev_mgr_instance_data_t *InstData = node->instance_data;
13520
+ kdev_t dev = InstData->dev;
13523
+ LOG_EVERYTHING("%s Entry: Disk(%u,%u), ioflag(%u), start_lsn(%Lu), num_lsns(%Lu), bufptr(0x%p)\n",
13524
+ __FUNCTION__, MAJOR(dev), MINOR(dev), io_flag, starting_lsn, num_lsns, bufptr);
13526
+ /* check for valid device */
13527
+ if (!blk_size[MAJOR(dev)][MINOR(dev)]) {
13528
+ node->flags |= EVMS_VOLUME_CORRUPT |
13529
+ EVMS_VOLUME_GENDISK_GONE;
13532
+ /* check for 0 length request */
13533
+ if ( num_lsns == 0 ) {
13534
+ LOG_ERROR("%s: error requesting 0 sectors.\n", __FUNCTION__);
13537
+ /* check for out of bound request */
13538
+ if ( (starting_lsn + num_lsns) > node->total_vsectors) {
13539
+ LOG_ERROR("%s: attempted %s beyond logical disk boundary(%Lu LSNs), requesting LSN(%Lu), total LSNs(%Lu).\n",
13540
+ __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
13541
+ node->total_vsectors,
13542
+ starting_lsn, num_lsns);
13545
+ /* check for invalid io_flag value */
13546
+ switch( io_flag ) {
13547
+ case READ: /* read... */
13548
+ case WRITE: /* write... */
13549
+ case READA: /* reada... */
13555
+ /* compute some per device info once up-front */
13556
+ lsns_per_hardsector = node->hardsector_size / EVMS_VSECTOR_SIZE;
13557
+ lsns_per_blocksize = node->block_size / EVMS_VSECTOR_SIZE;
13559
+ /* initialize the buffer head control block */
13560
+ memset(&bh_cb, 0, sizeof(bh_cb_t));
13561
+ init_waitqueue_head(&bh_cb.cb_wait);
13563
+ /* only update the local copy of variables */
13564
+ cur_bufptr = bufptr;
13565
+ next_lsn = starting_lsn;
13566
+ remaining_lsns = num_lsns;
13568
+ /* check for a mid-sector starting offset
13570
+ * if found, perform I/O on part of that
13573
+ sector_lsn = next_lsn & (lsns_per_hardsector - 1);
13574
+ if (sector_lsn) {
13575
+ u_int64_t io_size;
13577
+ /* determine bytes in IO to this sector */
13578
+ io_size = lsns_per_hardsector - sector_lsn;
13579
+ if (io_size > remaining_lsns)
13580
+ io_size = remaining_lsns;
13582
+ /* perform the partial sector io */
13583
+ rc = ldev_partial_sector_init_io(
13584
+ node,io_flag, &bh_cb,
13586
+ sector_lsn, io_size,
13587
+ cur_bufptr, §or_buf);
13590
+ /* update progress in local variables */
13591
+ cur_bufptr += io_size << EVMS_VSECTOR_SIZE_SHIFT;
13592
+ next_lsn += io_size;
13593
+ remaining_lsns -= io_size;
13597
+ /* continue if no errors found */
13599
+ /* perform I/O on all the complete sectors
13600
+ * in this request.
13602
+ * loop until there are no more complete sectors
13605
+ while(remaining_lsns >= lsns_per_hardsector) {
13606
+ /* this inner loop attempts to drive as many
13607
+ * bytes (in sector size multiples) down to
13608
+ * the device as possible using the available
13609
+ * buffer heads in the pool.
13611
+ while(remaining_lsns >= lsns_per_hardsector) {
13612
+ struct buffer_head *bh;
13614
+ /* allocate a buffer head from the pool */
13615
+ bh = allocate_bh();
13616
+ if (bh == NULL) break;
13618
+ /* set up the buffer head for this I/O */
13619
+ bh->b_end_io = end_bh_cb_io_sync;
13621
+ (remaining_lsns >= lsns_per_blocksize) ?
13622
+ node->block_size :
13623
+ node->hardsector_size;
13624
+ bh->b_data = cur_bufptr;
13625
+ bh->b_rdev = dev;
13626
+ bh->b_rsector = next_lsn;
13627
+ bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13629
+ set_bit(BH_Dirty, &bh->b_state);
13630
+ set_bit(BH_Lock, &bh->b_state);
13631
+ set_bit(BH_Req, &bh->b_state);
13632
+ set_bit(BH_Mapped, &bh->b_state);
13633
+ bh->b_private = (void *)&bh_cb;
13634
+ atomic_inc(&bh_cb.blks_allocated);
13636
+ /* drive the buffer head down */
13637
+ /* to the device */
13638
+ generic_make_request(io_flag, bh);
13640
+ /* update progress in local variables */
13641
+ cur_bufptr += bh->b_size;
13642
+ next_lsn += bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
13643
+ remaining_lsns -= bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
13645
+ /* wait for all bh's I/O's to end */
13646
+ wait_on_bh_cb(&bh_cb);
13650
+ /* continue if no errors found */
13652
+ /* check for a mid-sector ending offset
13654
+ * if found, perform I/O on part of that
13657
+ if (remaining_lsns)
13658
+ /* perform the partial sector io */
13659
+ rc = ldev_partial_sector_init_io(
13660
+ node, io_flag, &bh_cb,
13662
+ 0, remaining_lsns,
13663
+ cur_bufptr, §or_buf);
13665
+ /* free the sector buffer if it was allocated */
13667
+ evms_cs_deallocate_memory(sector_buf);
13669
+ /* coalesce return codes */
13672
+ LOG_EVERYTHING("%s Exit: rc(%u)\n", __FUNCTION__, rc);
13677
+/********************************************************/
13678
+/* Required Plugin Function Table Entry Point: */
13679
+/* IOCTL function & Support routines */
13680
+/********************************************************/
13684
+ evms_logical_node_t * disk,
13685
+ struct inode * inode,
13686
+ struct file * file,
13687
+ unsigned int cmd,
13688
+ unsigned long arg)
13691
+ ldev_mgr_instance_data_t *InstData = disk->instance_data;
13694
+ if (!inode || !disk)
13697
+ save_dev = inode->i_rdev;
13698
+ inode->i_rdev = InstData->dev;
13700
+ case EVMS_QUIESCE_VOLUME:
13701
+ case EVMS_PLUGIN_IOCTL:
13704
+ case EVMS_GET_BMAP:
13706
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
13707
+ bmap->dev = InstData->dev;
13708
+ bmap->status = 0;
13711
+ case EVMS_OPEN_VOLUME:
13712
+ rc = InstData->gd->fops->open(inode, file);
13714
+ case EVMS_CLOSE_VOLUME:
13715
+ rc = InstData->gd->fops->release(inode, file);
13717
+ case EVMS_CHECK_MEDIA_CHANGE:
13718
+ /* once we detect that media changed
13719
+ * is 'set', don't send any more ioctls
13720
+ * down to the device, until the
13721
+ * media change has been 'reset' by a
13722
+ * revalidate disk ioctl. when already
13723
+ * 'set', just return a 1 w/o actually
13724
+ * performing another ioctl call to the
13727
+ if (InstData->media_changed == TRUE) {
13731
+ rc = InstData->gd->fops->check_media_change(InstData->dev);
13733
+ InstData->media_changed = TRUE;
13734
+ disk->flags |= EVMS_MEDIA_CHANGED;
13737
+ case EVMS_REVALIDATE_DISK:
13738
+ /* don't actually send this ioctl down
13739
+ * to the device, until we know that
13740
+ * previous check media change ioctl
13743
+ * when we do actually send the ioctl
13744
+ * down, reset the local media_changed
13747
+ if (InstData->media_changed == FALSE)
13749
+ rc = InstData->gd->fops->revalidate(InstData->dev);
13750
+ InstData->media_changed = FALSE;
13752
+ case EVMS_GET_DISK_LIST:
13753
+ rc = evms_cs_add_item_to_list(
13754
+ (evms_list_node_t **)arg,
13760
+ rc = InstData->gd->fops->ioctl(inode, file, cmd, arg);
13763
+ inode->i_rdev = save_dev;
13768
+/********************************************************/
13769
+/* Required Module Entry Point: */
13770
+/* ldev_mgr_init */
13771
+/********************************************************/
13774
+ldev_mgr_init(void)
13776
+ return evms_cs_register_plugin(&plugin_header);
13779
+static void __exit
13780
+ldev_mgr_exit(void)
13782
+ evms_cs_unregister_plugin(&plugin_header);
13785
+module_init(ldev_mgr_init);
13786
+module_exit(ldev_mgr_exit);
13787
+#ifdef MODULE_LICENSE
13788
+MODULE_LICENSE("GPL");
13790
diff -Naur linux-2002-03-28/drivers/evms/lvm_vge.c evms-2002-03-28/drivers/evms/lvm_vge.c
13791
--- linux-2002-03-28/drivers/evms/lvm_vge.c Wed Dec 31 18:00:00 1969
13792
+++ evms-2002-03-28/drivers/evms/lvm_vge.c Thu Mar 28 10:20:25 2002
13794
+/* -*- linux-c -*- */
13797
+ * Copyright (c) International Business Machines Corp., 2000
13799
+ * This program is free software; you can redistribute it and/or modify
13800
+ * it under the terms of the GNU General Public License as published by
13801
+ * the Free Software Foundation; either version 2 of the License, or
13802
+ * (at your option) any later version.
13804
+ * This program is distributed in the hope that it will be useful,
13805
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13806
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13807
+ * the GNU General Public License for more details.
13809
+ * You should have received a copy of the GNU General Public License
13810
+ * along with this program; if not, write to the Free Software
13811
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
13814
+ * linux/drivers/evms/lvm_vge.c
13816
+ * EVMS Linux LVM Region Manager
13819
+#include <linux/module.h>
13820
+#include <linux/kernel.h>
13821
+#include <linux/config.h>
13822
+#include <linux/genhd.h>
13823
+#include <linux/major.h>
13824
+#include <linux/string.h>
13825
+#include <linux/blk.h>
13826
+#include <linux/init.h>
13827
+#include <linux/slab.h>
13828
+#include <linux/vmalloc.h>
13829
+#include <linux/evms/evms_kernel.h>
13830
+#include <linux/evms/evms_lvm.h>
13831
+#include <asm/system.h>
13832
+#include <asm/uaccess.h>
13834
+#define LOG_PREFIX "lvm: "
13836
+// Plugin API prototypes
13837
+static int lvm_discover( evms_logical_node_t ** evms_node_list );
13838
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list );
13839
+static int lvm_delete_node( evms_logical_node_t * logical_node );
13840
+static void lvm_read( evms_logical_node_t * node,
13842
+static void lvm_write( evms_logical_node_t * node,
13844
+static int lvm_init_io( evms_logical_node_t * node,
13846
+ evms_sector_t sect_nr,
13847
+ evms_sector_t num_sects,
13848
+ void * buf_addr );
13849
+static int lvm_ioctl( evms_logical_node_t * logical_node,
13850
+ struct inode * inode,
13851
+ struct file * file,
13852
+ unsigned int cmd,
13853
+ unsigned long arg);
13854
+static int lvm_direct_ioctl( struct inode * inode,
13855
+ struct file * file,
13856
+ unsigned int cmd,
13857
+ unsigned long args );
13859
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t org_sector,
13860
+ evms_sector_t snap_sector );
13863
+// Global LVM data structures
13864
+static evms_plugin_function_table_t lvm_function_table = {
13865
+ discover : lvm_discover,
13866
+ end_discover : lvm_discover_end,
13867
+ delete : lvm_delete_node,
13869
+ write : lvm_write,
13870
+ init_io : lvm_init_io,
13871
+ ioctl : lvm_ioctl,
13872
+ direct_ioctl : lvm_direct_ioctl
13875
+static evms_plugin_header_t lvm_plugin_header = {
13876
+ id : SetPluginID(
13878
+ EVMS_REGION_MANAGER,
13881
+ major : EVMS_LVM_VERSION_MAJOR,
13882
+ minor : EVMS_LVM_VERSION_MINOR,
13883
+ patchlevel : EVMS_LVM_VERSION_PATCH
13885
+ required_common_services_version: {
13890
+ function_table : &lvm_function_table
13893
+static lvm_volume_group_t * lvm_group_list = NULL;
13894
+static struct proc_dir_entry * lvm_proc = NULL;
13898
+/********** Miscellaneous Functions **********/
13902
+/* Function: remap sector
13904
+ * Common function to remap LV lba to PV lba in appropriate PE. This
13905
+ * function needs to deal with requests that span PEs and/or stripes. If
13906
+ * this occurs, the request will simply be chopped off at the boundary of
13907
+ * the first PE/stripe. It is up to the calling function to loop
13908
+ * accordingly to finish the full remapping. This function is now partially
13909
+ * 64-bit enabled. The striping section contains code that currently cannot
13910
+ * eliminate at least one mod operation on 64 bit values.
13912
+static int remap_sector(evms_logical_node_t * node,
13913
+ evms_sector_t org_sector, // logical sector to remap
13914
+ evms_sector_t size, // size (in sectors) of request to remap
13915
+ evms_sector_t * new_sector, // remapped sector
13916
+ evms_sector_t * new_size, // new size (in sectors)
13917
+ evms_sector_t * pe_start_sector,// starting sector of pe - needed for snapshotting
13918
+ lvm_physical_volume_t ** pv_entry ) // new node for which new_sector is relative
13920
+ lvm_logical_volume_t * volume = node->instance_data;
13921
+ le_table_entry_t * le_entry;
13923
+ u_int32_t offset_in_le;
13925
+ u_int32_t sectors_per_column;
13926
+ u_int32_t column;
13927
+ u_int32_t sector_in_column;
13928
+ u_int32_t stripe_in_column;
13929
+ u_int32_t le_in_column;
13930
+ u_int32_t columns;
13931
+ u_int32_t offset_in_stripe;
13932
+ u_int32_t stripe_in_le;
13933
+ u_int32_t org_sector32; // Needed for striping - not 64-bit enabled
13935
+ *new_size = size;
13937
+ // Check if volume is striped. Reset the size if the request
13938
+ // crosses a stripe boundary. Striping in LVM is not 64-bit
13940
+ if ( volume->stripes > 1 ) {
13941
+ org_sector32 = org_sector;
13942
+ sectors_per_column = volume->stripes * volume->pe_size;
13943
+ column = org_sector32 / sectors_per_column;
13944
+ sector_in_column = org_sector32 % sectors_per_column;
13945
+ stripe_in_column = sector_in_column / volume->stripe_size;
13946
+ le_in_column = stripe_in_column % volume->stripes;
13947
+ columns = volume->num_le / volume->stripes;
13948
+ le = column + (columns * le_in_column);
13950
+ offset_in_stripe = org_sector32 % volume->stripe_size;
13951
+ stripe_in_le = stripe_in_column / volume->stripes;
13952
+ offset_in_le = offset_in_stripe + stripe_in_le * volume->stripe_size;
13954
+ if ( offset_in_stripe + size > volume->stripe_size ) {
13955
+ *new_size = volume->stripe_size - offset_in_stripe;
13958
+ // Non-striped volume. Just find LE and offset. Reset the size if
13959
+ // the request crosses an LE boundary. This path is 64-bit safe.
13961
+ le = org_sector >> volume->pe_size_shift;
13962
+ offset_in_le = org_sector & (volume->pe_size - 1);
13964
+ if ( offset_in_le + size > volume->pe_size ) {
13965
+ *new_size = volume->pe_size - offset_in_le;
13969
+ le_entry = &volume->le_map[le];
13970
+ *pe_start_sector = le_entry->pe_sector_offset;
13971
+ *new_sector = le_entry->pe_sector_offset + offset_in_le;
13972
+ *pv_entry = le_entry->owning_pv;
13978
+/* Function: add_group_to_list
13980
+ * Add an LVM volume group to the global LVM list. This inserts at
13981
+ * the start of the list, since order isn't particularly important.
13983
+ * So, it appears that order is important. :) Now inserting at the
13984
+ * end of the list instead of the beginning.
13986
+static int add_group_to_list( lvm_volume_group_t * group )
13988
+ lvm_volume_group_t ** p_group;
13990
+ for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
13994
+ *p_group = group;
13995
+ group->next_group = NULL;
14001
+/* Function: remove_group_from_list
14003
+ * Remove an LVM volume group from the global LVM list.
14005
+static int remove_group_from_list( lvm_volume_group_t * group )
14007
+ lvm_volume_group_t ** p_group;
14009
+ for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
14010
+ if ( *p_group == group ) {
14011
+ *p_group = (*p_group)->next_group;
14012
+ group->next_group = NULL;
14021
+/* Function: find_group_by_uuid
14023
+ * Use the vg_uuid to find the desired volume group.
14025
+static int find_group_by_uuid( unsigned char * vg_uuid,
14026
+ lvm_volume_group_t ** group)
14028
+ lvm_volume_group_t * gp;
14030
+ for ( gp = lvm_group_list; gp; gp = gp->next_group ) {
14031
+ if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) {
14041
+/* Function: find_pv_by_number
14043
+ * Search the PV list of the specified volume group, looking for the
14044
+ * specified PV number. If found, return a pointer to that PV.
14046
+static lvm_physical_volume_t * find_pv_by_number(u_int32_t pv_number,
14047
+ lvm_volume_group_t * group )
14049
+ lvm_physical_volume_t * pv_entry;
14051
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
14052
+ if ( pv_entry->pv_number == pv_number ) {
14060
+/* Function: translate_lv_name
14062
+ * In LVM, volumes have names based on their dev-node, which follow the
14063
+ * pattern /dev/group_name/volume_name. In EVMS, the same volume needs
14064
+ * to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from
14065
+ * the lv_disk_t needs to be translated before copying to the associated
14066
+ * node. evms_node_name must point to a NAME_LEN sized buffer.
14068
+static int translate_lv_name( char * lvm_lv_name, char * evms_node_name )
14072
+ memset(evms_node_name, 0, NAME_LEN);
14074
+ // Make sure the string starts with /dev/, and skip over it.
14075
+ ptr = strstr(lvm_lv_name, DEV_DIRECTORY);
14076
+ if ( ptr != lvm_lv_name ) {
14077
+ LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name);
14080
+ ptr = &ptr[strlen(DEV_DIRECTORY)];
14082
+ // ptr now points to "group_name/volume_name".
14083
+ // Use this to create the name for the EVMS node.
14084
+ strcpy(evms_node_name, LVM_DEV_DIRECTORY);
14085
+ strncat(evms_node_name, ptr, NAME_LEN-strlen(evms_node_name)-1);
14091
+/* Function: check_pv_for_lv
14093
+ * Run through all LE maps of all LVs in this group, and make sure the
14094
+ * specified PV is not being pointed to by any LEs.
14096
+static int check_pv_for_lv( lvm_physical_volume_t * pv_entry,
14097
+ lvm_volume_group_t * group )
14099
+ lvm_logical_volume_t * volume;
14102
+ for ( i = 1; i <= MAX_LV; i++ ) {
14103
+ if ( (volume = group->volume_list[i]) ) {
14104
+ for ( j = 0; j < volume->num_le; j++ ) {
14105
+ if ( volume->le_map[j].owning_pv == pv_entry ) {
14116
+/********** Metadata I/O Functions **********/
14119
+/* Function: endian_convert_pv
14121
+ * Endian-neutral conversion for PV structures.
14123
+static inline void endian_convert_pv( pv_disk_t * pv )
14125
+ pv->version = le16_to_cpu(pv->version);
14126
+ pv->pv_on_disk.base = le32_to_cpu(pv->pv_on_disk.base);
14127
+ pv->pv_on_disk.size = le32_to_cpu(pv->pv_on_disk.size);
14128
+ pv->vg_on_disk.base = le32_to_cpu(pv->vg_on_disk.base);
14129
+ pv->vg_on_disk.size = le32_to_cpu(pv->vg_on_disk.size);
14130
+ pv->pv_uuidlist_on_disk.base = le32_to_cpu(pv->pv_uuidlist_on_disk.base);
14131
+ pv->pv_uuidlist_on_disk.size = le32_to_cpu(pv->pv_uuidlist_on_disk.size);
14132
+ pv->lv_on_disk.base = le32_to_cpu(pv->lv_on_disk.base);
14133
+ pv->lv_on_disk.size = le32_to_cpu(pv->lv_on_disk.size);
14134
+ pv->pe_on_disk.base = le32_to_cpu(pv->pe_on_disk.base);
14135
+ pv->pe_on_disk.size = le32_to_cpu(pv->pe_on_disk.size);
14136
+ pv->pv_major = le32_to_cpu(pv->pv_major);
14137
+ pv->pv_number = le32_to_cpu(pv->pv_number);
14138
+ pv->pv_status = le32_to_cpu(pv->pv_status);
14139
+ pv->pv_allocatable = le32_to_cpu(pv->pv_allocatable);
14140
+ pv->pv_size = le32_to_cpu(pv->pv_size);
14141
+ pv->lv_cur = le32_to_cpu(pv->lv_cur);
14142
+ pv->pe_size = le32_to_cpu(pv->pe_size);
14143
+ pv->pe_total = le32_to_cpu(pv->pe_total);
14144
+ pv->pe_allocated = le32_to_cpu(pv->pe_allocated);
14145
+ pv->pe_start = le32_to_cpu(pv->pe_start);
14149
+/* Function: read_pv
14151
+ * Read in the PV structure from the specified node. If it contains a
14152
+ * valid PV signature, allocate a new pv_disk_t and copy the data.
14154
+static int read_pv( evms_logical_node_t * node,
14155
+ pv_disk_t ** pv )
14157
+ pv_disk_t * pv_buffer;
14161
+ // Buffer for reading the PV metadata.
14162
+ pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO);
14163
+ if ( ! pv_buffer ) {
14164
+ LOG_CRITICAL("Memory error creating buffer to read PV metadata for node %s\n", node->name);
14168
+ // Read the first two sectors.
14169
+ if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE),
14170
+ evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer) ) {
14171
+ LOG_SERIOUS("Error reading PV metadata from node %s\n", node->name);
14172
+ kfree(pv_buffer);
14176
+ // Endian-neutral conversion of PV metadata.
14177
+ endian_convert_pv(pv_buffer);
14179
+ // Check for an LVM signature and make sure the sizes match.
14180
+ // Versions 1 and 2 are both valid now. Thanks LVM! :)
14181
+ if ( ! ( pv_buffer->id[0] == 'H' &&
14182
+ pv_buffer->id[1] == 'M' &&
14183
+ (pv_buffer->version == 1 || pv_buffer->version == 2) &&
14184
+ pv_buffer->pv_size == node->total_vsectors ) ) {
14185
+ LOG_EXTRA("Node %s is not an LVM PV\n", node->name);
14186
+ kfree(pv_buffer);
14190
+ // This is a valid PV. Allocate a new pv_disk_t.
14191
+ *pv = kmalloc(sizeof(pv_disk_t), GFP_NOIO);
14193
+ LOG_CRITICAL("Memory error creating new PV for node %s\n", node->name);
14194
+ kfree(pv_buffer);
14198
+ // Copy the metadata.
14199
+ memcpy(*pv, pv_buffer, sizeof(pv_disk_t));
14200
+ kfree(pv_buffer);
14205
+/* Function: endian_convert_vg
14207
+ * Endian-neutral conversion for VG structures
14209
+static inline void endian_convert_vg( vg_disk_t * vg )
14211
+ vg->vg_number = le32_to_cpu(vg->vg_number);
14212
+ vg->vg_access = le32_to_cpu(vg->vg_access);
14213
+ vg->vg_status = le32_to_cpu(vg->vg_status);
14214
+ vg->lv_max = le32_to_cpu(vg->lv_max);
14215
+ vg->lv_cur = le32_to_cpu(vg->lv_cur);
14216
+ vg->lv_open = le32_to_cpu(vg->lv_open);
14217
+ vg->pv_max = le32_to_cpu(vg->pv_max);
14218
+ vg->pv_cur = le32_to_cpu(vg->pv_cur);
14219
+ vg->pv_act = le32_to_cpu(vg->pv_act);
14220
+ vg->dummy = le32_to_cpu(vg->dummy);
14221
+ vg->vgda = le32_to_cpu(vg->vgda);
14222
+ vg->pe_size = le32_to_cpu(vg->pe_size);
14223
+ vg->pe_total = le32_to_cpu(vg->pe_total);
14224
+ vg->pe_allocated= le32_to_cpu(vg->pe_allocated);
14225
+ vg->pvg_total = le32_to_cpu(vg->pvg_total);
14229
+/* Function: read_vg
14231
+ * Read in the VG structure from the specified node. Allocate a new
14232
+ * vg_disk_t and copy the data.
14234
+static int read_vg( evms_logical_node_t * node,
14236
+ vg_disk_t ** vg )
14238
+ vg_disk_t * vg_buffer;
14239
+ unsigned long vg_sectors;
14241
+ // Allocate a buffer to read the VG metadata.
14242
+ vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size);
14243
+ vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
14244
+ if ( ! vg_buffer ) {
14245
+ LOG_CRITICAL("Memory error creating buffer to read VG metadata from node %s\n", node->name);
14249
+ // Read the VG metadata.
14250
+ if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base), vg_sectors, vg_buffer) ) {
14251
+ LOG_SERIOUS("Error reading VG metadata from node %s\n", node->name);
14252
+ kfree(vg_buffer);
14256
+ // Endian-neutral conversion of VG metadata.
14257
+ endian_convert_vg(vg_buffer);
14259
+ // Allocate a new vg_disk_t
14260
+ *vg = kmalloc(sizeof(vg_disk_t), GFP_NOIO);
14262
+ LOG_CRITICAL("Memory error creating new VG structure for node %s\n", node->name);
14263
+ kfree(vg_buffer);
14267
+ // Copy the metadata.
14268
+ memcpy(*vg, vg_buffer, sizeof(vg_disk_t));
14269
+ kfree(vg_buffer);
14274
+/* Function: read_uuid_list
14276
+static int read_uuid_list( evms_logical_node_t * node,
14278
+ lvm_volume_group_t * group )
14280
+ evms_sector_t start_sector;
14281
+ unsigned long total_sectors;
14282
+ unsigned char * uuid_buffer;
14283
+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14284
+ unsigned long uuid_list_size;
14287
+ if ( group->uuid_list ) {
14288
+ LOG_EXTRA("Already read PV UUIDs for group %s\n", group->vg_name);
14292
+ start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base);
14293
+ total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size);
14294
+ uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14296
+ // Allocate memory for the UUID array for this group.
14297
+ group->uuid_list = vmalloc(uuid_list_size);
14298
+ if ( ! group->uuid_list ) {
14299
+ LOG_CRITICAL("Memory error creating UUID list for group %s\n", group->vg_name);
14302
+ memset(group->uuid_list, 0, uuid_list_size);
14304
+ // Allocate a buffer to perform the I/Os.
14305
+ uuid_buffer = kmalloc(buffer_size, GFP_NOIO);
14306
+ if ( ! uuid_buffer ) {
14307
+ LOG_CRITICAL("Memory error creating I/O buffer for UUID list in group %s\n", group->vg_name);
14308
+ vfree(group->uuid_list);
14309
+ group->uuid_list = NULL;
14313
+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14314
+ if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, uuid_buffer) ) {
14315
+ LOG_SERIOUS("Error reading PV UUID list from node %s\n", node->name);
14316
+ kfree(uuid_buffer);
14317
+ vfree(group->uuid_list);
14318
+ group->uuid_list = NULL;
14322
+ // Copy the I/O buffer into the UUID array.
14323
+ memcpy(&(group->uuid_list[i*EVMS_VSECTOR_SIZE]), uuid_buffer, buffer_size);
14326
+ // Clear out the unused portion at the end of the uuid_list
14327
+ memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0, uuid_list_size - pv->pv_uuidlist_on_disk.size);
14329
+ kfree(uuid_buffer);
14334
+/* Function: endian_convert_lv
14336
+ * Endian-neutral conversion for LV structures
14338
+static inline void endian_convert_lv( lv_disk_t * lv )
14340
+ lv->lv_access = le32_to_cpu(lv->lv_access);
14341
+ lv->lv_status = le32_to_cpu(lv->lv_status);
14342
+ lv->lv_open = le32_to_cpu(lv->lv_open);
14343
+ lv->lv_dev = le32_to_cpu(lv->lv_dev);
14344
+ lv->lv_number = le32_to_cpu(lv->lv_number);
14345
+ lv->lv_mirror_copies = le32_to_cpu(lv->lv_mirror_copies);
14346
+ lv->lv_recovery = le32_to_cpu(lv->lv_recovery);
14347
+ lv->lv_schedule = le32_to_cpu(lv->lv_schedule);
14348
+ lv->lv_size = le32_to_cpu(lv->lv_size);
14349
+ lv->lv_snapshot_minor = le32_to_cpu(lv->lv_snapshot_minor);
14350
+ lv->lv_chunk_size = le16_to_cpu(lv->lv_chunk_size);
14351
+ lv->dummy = le16_to_cpu(lv->dummy);
14352
+ lv->lv_allocated_le = le32_to_cpu(lv->lv_allocated_le);
14353
+ lv->lv_stripes = le32_to_cpu(lv->lv_stripes);
14354
+ lv->lv_stripesize = le32_to_cpu(lv->lv_stripesize);
14355
+ lv->lv_badblock = le32_to_cpu(lv->lv_badblock);
14356
+ lv->lv_allocation = le32_to_cpu(lv->lv_allocation);
14357
+ lv->lv_io_timeout = le32_to_cpu(lv->lv_io_timeout);
14358
+ lv->lv_read_ahead = le32_to_cpu(lv->lv_read_ahead);
14361
+static inline void endian_convert_lvs( lvm_volume_group_t * group )
14364
+ for ( i = 0; i < group->vg->lv_max; i++ ) {
14365
+ endian_convert_lv(&(group->lv_array[i]));
14370
+/* Function: read_lv
14372
+ * Read in the LV structures for the specified group. Do the read from
14373
+ * the first PV in the group. If that one fails, keep trying on the
14374
+ * remaining PVs until one works. This function will allocate a buffer
14375
+ * for the group to read in the structures.
14377
+static int read_lv( lvm_volume_group_t * group )
14379
+ lvm_physical_volume_t * pv_entry = group->pv_list;
14380
+ unsigned char * lv_buffer = NULL;
14381
+ evms_sector_t start_sector;
14382
+ unsigned long total_sectors;
14383
+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14384
+ unsigned long lv_array_size;
14387
+ if ( group->lv_array ) {
14391
+ if ( ! pv_entry ) {
14392
+ LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n", group->vg_name);
14396
+ // Allocate a buffer to do the actual I/Os.
14397
+ lv_buffer = kmalloc(buffer_size, GFP_NOIO);
14398
+ if ( ! lv_buffer ) {
14399
+ LOG_CRITICAL("Memory error creating I/O buffer for LV structs for Group %s\n", group->vg_name);
14403
+ // Read in the LV structures 4k at a time. If one PV returns errors,
14404
+ // start over with the next PV in the group.
14405
+ while (rc && pv_entry) {
14406
+ start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base);
14407
+ total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size);
14408
+ lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14410
+ // Allocate the buffer for this group to hold the entire LV array.
14411
+ if ( group->lv_array ) {
14412
+ vfree(group->lv_array);
14413
+ group->lv_array = NULL;
14415
+ group->lv_array = vmalloc(lv_array_size);
14416
+ if ( ! group->lv_array ) {
14417
+ LOG_CRITICAL("Memory error creating lv_array buffer for Group %s\n", group->vg_name);
14418
+ kfree(lv_buffer);
14421
+ memset(group->lv_array, 0, lv_array_size);
14423
+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14424
+ rc = INIT_IO(pv_entry->logical_node, 0, start_sector + i, IO_BUFFER_SECTORS, lv_buffer);
14426
+ LOG_SERIOUS("Error reading LV metadata from node %s in Group %s\n",
14427
+ pv_entry->logical_node->name, group->vg_name);
14429
+ // Try the next PV if the current one caused any errors.
14430
+ pv_entry = pv_entry->next;
14434
+ // Copy the I/O buffer into the lv_array
14435
+ memcpy(&(((char*)(group->lv_array))[i*EVMS_VSECTOR_SIZE]), lv_buffer, buffer_size);
14440
+ LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n", group->vg_name);
14441
+ kfree(lv_buffer);
14442
+ vfree(group->lv_array);
14443
+ group->lv_array = NULL;
14447
+ // Clear out the unused portion at the end of the lv_array.
14448
+ memset(&(((char*)(group->lv_array))[pv_entry->pv->lv_on_disk.size]), 0, lv_array_size - pv_entry->pv->lv_on_disk.size);
14450
+ // Endian-neutral conversion of the LV metadata.
14451
+ endian_convert_lvs(group);
14453
+ kfree(lv_buffer);
14458
+/* Function: endian_convert_pe_map
14460
+ * Endian-neutral conversion for PE structures
14462
+static inline void endian_convert_pe_map( lvm_physical_volume_t * pv_entry )
14465
+ for ( i = 0; i < pv_entry->pv->pe_total; i++ ) {
14466
+ pv_entry->pe_map[i].lv_num = le16_to_cpu(pv_entry->pe_map[i].lv_num);
14467
+ pv_entry->pe_map[i].le_num = le16_to_cpu(pv_entry->pe_map[i].le_num);
14472
+/* Function: read_pe_map
14474
+ * Read in the PE map for the specified PV. This function will allocate a
14475
+ * buffer to read in the data.
14477
+static int read_pe_map( lvm_physical_volume_t * pv_entry )
14479
+ evms_logical_node_t * node = pv_entry->logical_node;
14480
+ pv_disk_t * pv = pv_entry->pv;
14481
+ unsigned char * pe_buffer;
14482
+ evms_sector_t start_sector;
14483
+ unsigned long total_sectors;
14484
+ unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14485
+ unsigned long pe_map_size;
14488
+ if ( pv_entry->pe_map ) {
14492
+ start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base);
14493
+ total_sectors = evms_cs_size_in_vsectors(pv->pe_total * sizeof(pe_disk_t));
14494
+ pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14496
+ // Allocate a buffer to hold the PE map for this PV.
14497
+ //pv_entry->pe_map = vmalloc(total_sectors << EVMS_VSECTOR_SIZE_SHIFT);
14498
+ pv_entry->pe_map = vmalloc(pe_map_size);
14499
+ if ( ! pv_entry->pe_map ) {
14500
+ LOG_CRITICAL("Memory error creating PE map for node %s\n", node->name);
14503
+ memset(pv_entry->pe_map, 0, pe_map_size);
14505
+ // Allocate a buffer for performing the I/O.
14506
+ pe_buffer = kmalloc(buffer_size, GFP_NOIO);
14507
+ if ( ! pe_buffer ) {
14508
+ LOG_CRITICAL("Memory error creating I/O buffer for PE maps for node %s\n", node->name);
14509
+ vfree(pv_entry->pe_map);
14510
+ pv_entry->pe_map = NULL;
14514
+ for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14515
+ if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, pe_buffer) ) {
14516
+ LOG_SERIOUS("Error reading PE maps from node %s.\n", node->name);
14517
+ kfree(pe_buffer);
14518
+ vfree(pv_entry->pe_map);
14519
+ pv_entry->pe_map = NULL;
14522
+ // Copy the data to the actual PE map.
14523
+ memcpy(&(((char*)(pv_entry->pe_map))[i*EVMS_VSECTOR_SIZE]), pe_buffer, buffer_size);
14526
+ // Clear out the unused portion at the end of the PE map.
14527
+ memset(&(((char*)(pv_entry->pe_map))[total_sectors*EVMS_VSECTOR_SIZE]), 0, pe_map_size - total_sectors*EVMS_VSECTOR_SIZE);
14529
+ // Endian-neutral conversion of the PE metadata.
14530
+ endian_convert_pe_map(pv_entry);
14532
+ kfree(pe_buffer);
14538
+/********** Snapshot Manipulation Functions **********/
14541
+/* Function: snapshot_check_quiesce_original
14543
+ * For this snapshot LV, check that both it and its original are quiesced.
14545
+static int snapshot_check_quiesce_original( lvm_logical_volume_t * snap_volume )
14547
+ lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
14549
+ if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) {
14553
+ if ( org_volume &&
14554
+ ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
14562
+/* Function: snapshot_check_quiesce_all
14564
+ * Go through the list of all snapshots for an original volume, and make
14565
+ * sure everyone is in a quiesced state.
14567
+static int snapshot_check_quiesce_all( lvm_logical_volume_t * org_volume )
14569
+ lvm_logical_volume_t * snap;
14571
+ if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
14575
+ for ( snap = org_volume->snapshot_next; snap; snap = snap->snapshot_next ) {
14576
+ if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) {
14585
+/* Function: invalidate_snapshot_volume
14587
+ * In the event a snapshot volume becomes full or corrupted, its metadata
14588
+ * must be altered in order to prevent it from being used again. Write some
14589
+ * invalid data into the first entry of the COW table. If this volume is
14590
+ * not fully deleted by the user/engine, this invalid COW entry will be
14591
+ * detected by build_snapshot_maps(), and will cause the volume to be
14592
+ * deleted before being exported to EVMS during discover. This is obviously
14593
+ * a hack, but it is the same hack currently used by LVM. We're just trying
14594
+ * to be compatible. :)
14596
+static int invalidate_snapshot_volume( lvm_logical_volume_t * snap_volume )
14598
+ evms_logical_node_t tmp_node;
14600
+ tmp_node.instance_data = snap_volume;
14601
+ tmp_node.total_vsectors = snap_volume->lv_size;
14603
+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
14604
+ LOG_WARNING("Volume %s is not a snapshot. Cannot invalidate\n", snap_volume->name);
14608
+ LOG_WARNING("Invalidating full/corrupted snapshot volume %s\n", snap_volume->name);
14609
+ LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n");
14611
+ if ( snap_volume->cow_table ) {
14612
+ snap_volume->cow_table[0].pv_org_rsector = cpu_to_le64(((evms_sector_t)1));
14613
+ if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) {
14614
+ LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
14618
+ LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
14621
+ snap_volume->lv_status &= ~LV_ACTIVE;
14627
+/* Function: remove_snapshot_from_chain
14629
+ * Remove a snapshot volume from its original's chain of snapshots. This
14630
+ * does not delete the snapshot volume. At runtime, we cannot delete
14631
+ * volumes at the region-manager level, because EVMS may have this volume
14632
+ * exported, and there is no way to notify EVMS of the deletion. It will
14633
+ * eventually need to be deleted in the engine, which will then tell the
14634
+ * EVMS kernel services to delete the volume in the kernel.
14636
+static int remove_snapshot_from_chain( lvm_logical_volume_t * snap_volume )
14638
+ lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
14639
+ lvm_logical_volume_t ** p_volume;
14641
+ if ( org_volume ) {
14642
+ for ( p_volume = &org_volume->snapshot_next; *p_volume; p_volume = &(*p_volume)->snapshot_next ) {
14643
+ if ( *p_volume == snap_volume ) {
14644
+ *p_volume = snap_volume->snapshot_next;
14650
+ snap_volume->snapshot_org = NULL;
14651
+ snap_volume->snapshot_next = NULL;
14656
+/* Function: snapshot_hash
14658
+ * The snapshot hash tables are NEVER going to have 4 billion entries, so
14659
+ * we can safely cast the org_sector to 32 bits and just mod it by the
14660
+ * hash table size.
14662
+static u_int32_t snapshot_hash( evms_sector_t org_sector,
14663
+ lvm_logical_volume_t * snap_volume )
14665
+ return( ((u_int32_t)org_sector) % snap_volume->hash_table_size);
14669
+/* Function: snapshot_search_hash_chain
14671
+ * Search the hash chain that is anchored at the specified head pointer.
14672
+ * If the sector number is found, the result pointer is set to that entry
14673
+ * in the chain, and a 1 is returned. If the sector is not found, the
14674
+ * result pointer is set to the previous entry and 0 is returned. If the
14675
+ * result pointer is NULL, this means either the list is empty, or the
14676
+ * specified sector should become the first list item.
14678
+static int snapshot_search_hash_chain( evms_sector_t org_sector,
14679
+ snapshot_map_entry_t * head,
14680
+ snapshot_map_entry_t ** result )
14682
+ snapshot_map_entry_t * curr = head;
14683
+ snapshot_map_entry_t * prev = head;
14684
+ while ( curr && curr->org_sector < org_sector ) {
14686
+ curr = curr->next;
14689
+ // Either an empty chain or went off the end of the chain.
14693
+ else if ( curr->org_sector != org_sector ) {
14694
+ *result = curr->prev;
14698
+ // Found the desired sector.
14705
+/* Function: insert_snapshot_map_entry
14707
+ * Insert a new entry into a snapshot hash chain, immediately following the
14708
+ * specified entry. This function should not be used to add an entry into
14709
+ * an empty list, or as the first entry in an existing list. For that case,
14710
+ * use insert_snapshot_map_entry_at_head().
14712
+static int insert_snapshot_map_entry( snapshot_map_entry_t * entry,
14713
+ snapshot_map_entry_t * base )
14715
+ entry->next = base->next;
14716
+ entry->prev = base;
14717
+ base->next = entry;
14718
+ if ( entry->next ) {
14719
+ entry->next->prev = entry;
14725
+/* Function: insert_snapshot_map_entry_at_head
14727
+ * Insert a new entry into a snapshot chain as the first entry.
14729
+static int insert_snapshot_map_entry_at_head( snapshot_map_entry_t * entry,
14730
+ snapshot_map_entry_t ** head )
14732
+ entry->next = *head;
14733
+ entry->prev = NULL;
14735
+ if ( entry->next ) {
14736
+ entry->next->prev = entry;
14742
+/* Function: add_cow_entry_to_snapshot_map
14744
+ * Convert a cow table entry (from the on-disk data) into an appropriate
14745
+ * entry for the snapshot map. Insert this new entry into the appropriate
14746
+ * map for the specified volume.
14748
+ * The cow_entry passed into this function must have already been
14749
+ * endian-converted from disk-order to cpu-order.
14751
+static int add_cow_entry_to_snapshot_map(lv_COW_table_disk_t * cow_entry,
14752
+ lvm_logical_volume_t * volume )
14754
+ snapshot_map_entry_t * new_entry;
14755
+ snapshot_map_entry_t ** hash_table;
14756
+ snapshot_map_entry_t * chain_head;
14757
+ snapshot_map_entry_t * target_entry;
14758
+ u_int32_t hash_value;
14760
+ if ( cow_entry->pv_org_number == 0 ) {
14763
+ new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector, cow_entry->pv_snap_rsector);
14764
+ if ( ! new_entry ) {
14767
+ new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number, volume->group);
14768
+ if ( ! new_entry->snap_pv ) {
14772
+ hash_value = snapshot_hash(new_entry->org_sector, volume);
14773
+ hash_table = volume->snapshot_map[cow_entry->pv_org_number];
14774
+ chain_head = hash_table[hash_value];
14775
+ if ( snapshot_search_hash_chain(new_entry->org_sector, chain_head, &target_entry) ) {
14776
+ // In general, we should not find this entry in the snapshot
14777
+ // map already. However, it could happen on a re-discover, but
14778
+ // the build_snapshot_maps function should weed out those cases.
14779
+ // In either event, we can simply ignore duplicates.
14780
+ LOG_WARNING("Detected a duplicate snapshot map entry\n");
14781
+ LOG_WARNING("Snap PV %Ld:%Ld, Org PV %Ld:%Ld\n", cow_entry->pv_snap_number, cow_entry->pv_snap_rsector,
14782
+ cow_entry->pv_org_number, cow_entry->pv_org_rsector);
14783
+ kfree(new_entry);
14786
+ if ( target_entry ) {
14787
+ insert_snapshot_map_entry(new_entry, target_entry);
14790
+ insert_snapshot_map_entry_at_head(new_entry, &hash_table[hash_value]);
14798
+/* Function: snapshot_remap_sector
14800
+ * Perform a sector remap on a snapshot volume. This should be called from
14801
+ * the I/O read path, after the LE-to-PE translation has already been
14802
+ * performed. First, determine the base sector of the chunk containing the
14803
+ * specified sector, and save the remainder. Then, perform a search through
14804
+ * the snapshot map for the specified volume. If a match is found, change
14805
+ * the PV and sector numbers to the new values. If no match is found, leave
14806
+ * the values alone, meaning the read should proceed down the original
14809
+static void snapshot_remap_sector( lvm_logical_volume_t * snap_volume,
14810
+ evms_sector_t pe_start_sector,
14811
+ evms_sector_t * sector,
14812
+ lvm_physical_volume_t ** pv_entry )
14814
+ snapshot_map_entry_t ** hash_table;
14815
+ snapshot_map_entry_t * chain_head;
14816
+ snapshot_map_entry_t * result;
14817
+ u_int32_t hash_value;
14818
+ evms_sector_t chunk_sector;
14819
+ evms_sector_t remainder;
14821
+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
14825
+ chunk_sector = ((*sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
14826
+ remainder = *sector - chunk_sector;
14827
+ hash_value = snapshot_hash(chunk_sector, snap_volume);
14828
+ hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number];
14829
+ chain_head = hash_table[hash_value];
14831
+ if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) {
14832
+ *pv_entry = result->snap_pv;
14833
+ *sector = result->snap_sector + remainder;
14838
+/* Function: snapshot_read_write_chunk
14840
+ * This function takes care of reading one chunk of data from the
14841
+ * original, and writing it to the snapshot. Since the original now has
14842
+ * a fixed sized buffer for this data, we may have to loop to get the
14843
+ * whole chunk copied.
14845
+static int snapshot_read_write_chunk( lvm_logical_volume_t * org_volume,
14846
+ lvm_physical_volume_t * org_pv,
14847
+ evms_sector_t chunk_sector,
14848
+ lvm_logical_volume_t * snap_volume,
14849
+ lvm_physical_volume_t ** snap_pv,
14850
+ evms_sector_t * snap_sector )
14852
+ u_int32_t io_size = snap_volume->chunk_size;
14853
+ evms_sector_t snap_pe_start_sector;
14854
+ evms_sector_t size;
14855
+ int i, iterations = 1;
14857
+ if ( org_volume->chunk_size < snap_volume->chunk_size ) {
14858
+ iterations = snap_volume->chunk_size / org_volume->chunk_size;
14859
+ io_size = org_volume->chunk_size;
14862
+ remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1, snap_sector, &size, &snap_pe_start_sector, snap_pv);
14864
+ // Check for an incomplete volume
14865
+ if ( ! *snap_sector || ! *snap_pv ) {
14866
+ invalidate_snapshot_volume(snap_volume);
14870
+ for ( i = 0; i < iterations; i++ ) {
14872
+ // Read the chunk from the original volume. This is a physical
14873
+ // read, not logical. Thus, stripe boundary considerations are
14874
+ // unnecessary. Also, chunks are always aligned with PEs, so PE
14875
+ // boundary considerations are unnecessary.
14876
+ if ( INIT_IO(org_pv->logical_node, 0, chunk_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
14880
+ // Write this chunk to the snapshot volume. This does duplicate
14881
+ // the local init_io code, but we need to have the remapped
14882
+ // sector later on, so this is slightly more efficient. Snapshot
14883
+ // volumes cannot be striped, so there is no need to consider
14884
+ // stripe-boundary conditions. And just like the read in the
14885
+ // previous line, chunks are always aligned with PEs, so we
14886
+ // don't have to consider PE-boundary conditions.
14887
+ if ( INIT_IO((*snap_pv)->logical_node, 1, *snap_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
14888
+ // An error writing the chunk to the snapshot is the
14889
+ // same situation as the snapshot being full.
14890
+ invalidate_snapshot_volume(snap_volume);
14899
+/* Function: snapshot_copy_data
14901
+ * On a write to a snapshotted volume, check all snapshots to see if the
14902
+ * specified chunk has already been remapped. If it has not, read the
14903
+ * original data from the volume, write the data to the next available
14904
+ * chunk on the snapshot, update the COW table, write the COW table to
14905
+ * the snapshot, and insert a new entry into the snapshot map.
14907
+ * Now converted to copy data to a single snapshot. The looping is left
14908
+ * up to lvm_write.
14910
+static int snapshot_copy_data( lvm_logical_volume_t * org_volume,
14911
+ lvm_logical_volume_t * snap_volume,
14912
+ evms_sector_t pe_start_sector,
14913
+ evms_sector_t org_sector,
14914
+ lvm_physical_volume_t * org_pv )
14916
+ lvm_physical_volume_t * snap_pv;
14917
+ snapshot_map_entry_t ** hash_table;
14918
+ snapshot_map_entry_t * chain_head;
14919
+ snapshot_map_entry_t * target_entry;
14920
+ snapshot_map_entry_t * new_map_entry;
14921
+ u_int32_t hash_value;
14922
+ evms_sector_t chunk_sector;
14923
+ evms_sector_t snap_sector;
14926
+ // Lock out this snapshot while we are remapping.
14927
+ down(&snap_volume->snap_semaphore);
14929
+ // Make sure the snapshot has not been deactivated.
14930
+ if ( ! (snap_volume->lv_status & LV_ACTIVE) ) {
14931
+ up(&snap_volume->snap_semaphore);
14935
+ // Search the hash table to see if this sector has already been
14936
+ // remapped on this snapshot.
14937
+ chunk_sector = ((org_sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
14938
+ hash_value = snapshot_hash(chunk_sector, snap_volume);
14939
+ hash_table = snap_volume->snapshot_map[org_pv->pv_number];
14940
+ chain_head = hash_table[hash_value];
14941
+ if ( snapshot_search_hash_chain(chunk_sector, chain_head, &target_entry) ) {
14942
+ // Chunk is already remapped.
14943
+ up(&snap_volume->snap_semaphore);
14947
+ // Is there room on the snapshot to remap this chunk?
14948
+ if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) {
14949
+ // At this point, the snapshot is full. Any further
14950
+ // writes to the original will cause the snapshot to
14951
+ // become "corrupt" because they can't be remapped.
14952
+ // Take this snapshot permanently offline.
14953
+ invalidate_snapshot_volume(snap_volume);
14954
+ up(&snap_volume->snap_semaphore);
14958
+ rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector, snap_volume, &snap_pv, &snap_sector);
14960
+ up(&snap_volume->snap_semaphore);
14963
+ else if ( rc < 0 ) {
14964
+ up(&snap_volume->snap_semaphore);
14968
+ // Fill in the appropriate COW table entry and write that
14969
+ // metadata sector back to the snapshot volume. Since we are
14970
+ // only writing one sector, there are no boundary conditions.
14971
+ // Must endian-convert each entry as it is added.
14972
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number = cpu_to_le64((evms_sector_t)(org_pv->pv_number));
14973
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector = cpu_to_le64(chunk_sector);
14974
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number = cpu_to_le64((evms_sector_t)(snap_pv->pv_number));
14975
+ snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector = cpu_to_le64(snap_sector);
14976
+ if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
14977
+ // The data was written to the snapshot, but
14978
+ // writing the metadata failed.
14979
+ invalidate_snapshot_volume(snap_volume);
14980
+ up(&snap_volume->snap_semaphore);
14983
+ snap_volume->next_cow_entry++;
14984
+ if ( snap_volume->next_cow_entry >= (EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)) ) {
14985
+ snap_volume->next_cow_entry = 0;
14986
+ snap_volume->current_cow_sector++;
14987
+ memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
14988
+ if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
14989
+ // Can't clear out the next sector of metadata.
14990
+ invalidate_snapshot_volume(snap_volume);
14991
+ up(&snap_volume->snap_semaphore);
14995
+ snap_volume->next_free_chunk += snap_volume->chunk_size;
14997
+ // Create a new snapshot map entry and add it in the appropriate
14998
+ // place in the map.
14999
+ if ( ! (new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector)) ) {
15000
+ invalidate_snapshot_volume(snap_volume);
15001
+ up(&snap_volume->snap_semaphore);
15004
+ new_map_entry->snap_pv = snap_pv;
15005
+ if ( target_entry ) {
15006
+ insert_snapshot_map_entry(new_map_entry, target_entry);
15009
+ insert_snapshot_map_entry_at_head(new_map_entry, &(hash_table[hash_value]));
15012
+ up(&snap_volume->snap_semaphore);
15017
+/* Function: get_snapshot_stats
15019
+static int get_snapshot_stats( lvm_snapshot_stat_ioctl_t * snap_stats )
15021
+ lvm_logical_volume_t * volume;
15022
+ lvm_volume_group_t * group;
15024
+ // Make sure the parameters are in range.
15025
+ if ( snap_stats->lv_number < 1 ||
15026
+ snap_stats->lv_number > MAX_LV ) {
15030
+ // Make sure the specified group and volume exist, and that
15031
+ // this is a snapshot volume.
15032
+ find_group_by_uuid(snap_stats->vg_uuid, &group);
15034
+ ! (volume = group->volume_list[snap_stats->lv_number]) ||
15035
+ ! (volume->lv_access & LV_SNAPSHOT) ) {
15039
+ // Return the starting LBA of the next available chunk.
15040
+ snap_stats->next_free_chunk = volume->next_free_chunk;
15041
+ snap_stats->lv_status = volume->lv_status;
15047
+/********** Memory Allocation/Deallocation Functions **********/
15051
+/* Function: deallocate_physical_volume
15053
+ * Free the memory used by this physical volume. Do not delete the EVMS
15054
+ * node in this function, since this could be called during an error
15055
+ * path when we want to save the logical node.
15057
+static int deallocate_physical_volume( lvm_physical_volume_t * pv_entry )
15059
+ if ( pv_entry->pv ) {
15060
+ kfree(pv_entry->pv);
15061
+ pv_entry->pv = NULL;
15064
+ if ( pv_entry->pe_map ) {
15065
+ vfree(pv_entry->pe_map);
15066
+ pv_entry->pe_map = NULL;
15074
+/* Function: allocate_physical_volume
15076
+ * Create a new lvm_physical_volume_t for the specified volume group.
15077
+ * Initialize the new PV with the evms node and lvm pv information.
15079
+static lvm_physical_volume_t * allocate_physical_volume(evms_logical_node_t * node,
15082
+ lvm_physical_volume_t * new_pv;
15084
+ new_pv = kmalloc(sizeof(lvm_physical_volume_t), GFP_NOIO);
15085
+ if ( ! new_pv ) {
15086
+ LOG_CRITICAL("Memory error creating physical volume for node %s.\n", node->name);
15091
+ // Initialize the PV
15092
+ memset(new_pv, 0, sizeof(lvm_physical_volume_t));
15093
+ new_pv->logical_node = node;
15095
+ new_pv->pv_number = pv->pv_number;
15101
+/* Function: allocate_snapshot_map_entry
15103
+ * Allocate memory for a new entry in the snapshot map and fill in the
15104
+ * sector values. The PV pointer is not filled in here, but can easily
15105
+ * be found by using the find_pv_by_number function.
15107
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t org_sector,
15108
+ evms_sector_t snap_sector )
15110
+ snapshot_map_entry_t * new_entry;
15112
+ new_entry = kmalloc(sizeof(snapshot_map_entry_t), GFP_NOIO);
15113
+ if ( ! new_entry ) {
15116
+ memset(new_entry, 0, sizeof(snapshot_map_entry_t));
15117
+ new_entry->org_sector = org_sector;
15118
+ new_entry->snap_sector = snap_sector;
15119
+ return new_entry;
15123
+/* Function: deallocate_snapshot_map
15125
+ * This function will delete one hash table, which is part of the whole
15126
+ * snapshot remapping structure. Each hash table is an array of pointers
15127
+ * to linked lists of snapshot_map_entry_t's.
15129
+static int deallocate_snapshot_map( snapshot_map_entry_t ** table, u_int32_t table_size )
15131
+ snapshot_map_entry_t * entry;
15132
+ snapshot_map_entry_t * next;
15136
+ for ( i = 0; i < table_size; i++ ) {
15137
+ for ( entry = table[i]; entry; entry = next ) {
15138
+ next = entry->next;
15148
+/* Function: deallocate_logical_volume
15150
+ * Delete the in-memory representation of a single LVM logical volume,
15151
+ * including its PE map and any snapshot data. Do not alter the parent
15152
+ * volume group, except to remove this volume from its volume list.
15154
+static int deallocate_logical_volume( lvm_logical_volume_t * volume )
15156
+ lvm_volume_group_t * group = volume->group;
15157
+ lvm_logical_volume_t * org_volume;
15158
+ lvm_logical_volume_t * snap_volume;
15161
+ // If this volume is a snapshot, remove it from the linked list of
15162
+ // volumes that are snapshotting the original. First, the original
15163
+ // volume must be quiesced.
15164
+ if ( volume->lv_access & LV_SNAPSHOT ) {
15165
+ org_volume = volume->snapshot_org;
15167
+ if ( snapshot_check_quiesce_original(volume) ) {
15171
+ remove_snapshot_from_chain(volume);
15173
+ // If the snapshot that was just removed was the last/only
15174
+ // volume snapshotting the original, then mark the original
15175
+ // as no longer being snapshotted.
15176
+ if ( org_volume && ! org_volume->snapshot_next ) {
15177
+ org_volume->lv_access &= ~LV_SNAPSHOT_ORG;
15181
+ // If this volume is a snapshot original, all of its snapshots must also
15182
+ // be deleted. However, Those deletions need to be taken care of by the
15183
+ // engine. So just check that they have all been quiesced before
15184
+ // removing the original.
15185
+ else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15186
+ if ( snapshot_check_quiesce_all(volume) ) {
15190
+ // In case there are any snapshots remaining, we must clear out
15191
+ // their pointers to this original to prevent errors when those
15192
+ // snapshots are accessed or deleted.
15193
+ for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
15194
+ snap_volume->snapshot_org = NULL;
15198
+ LOG_DEBUG("Deleting volume %s\n", volume->name);
15200
+ // Free all the memory. This includes the LE-to-PE map, any snapshot
15201
+ // hash tables, the COW table, and chunk data buffer.
15202
+ if ( volume->le_map ) {
15203
+ vfree(volume->le_map);
15204
+ volume->le_map = NULL;
15206
+ if ( volume->snapshot_map ) {
15207
+ for ( i = 1; i <= group->pv_count; i++ ) {
15208
+ deallocate_snapshot_map(volume->snapshot_map[i], volume->hash_table_size);
15210
+ kfree(volume->snapshot_map);
15211
+ volume->snapshot_map = NULL;
15213
+ if ( volume->cow_table ) {
15214
+ kfree(volume->cow_table);
15215
+ volume->cow_table = NULL;
15217
+ if ( volume->chunk_data_buffer ) {
15218
+ kfree(volume->chunk_data_buffer);
15219
+ volume->chunk_data_buffer = NULL;
15222
+ // Remove this volume from the volume-group's list.
15223
+ if ( group && group->volume_list[volume->lv_number] == volume ) {
15224
+ group->volume_list[volume->lv_number] = NULL;
15225
+ group->volume_count--;
15234
+/* Function: allocate_logical_volume
15236
+ * Allocate space for a new LVM logical volume, including space for the
15237
+ * LE-to-PE map and any necessary snapshot data.
15239
+static lvm_logical_volume_t * allocate_logical_volume( lv_disk_t * lv,
15240
+ lvm_volume_group_t * group )
15242
+ lvm_logical_volume_t * new_volume;
15243
+ u_int32_t table_entries_per_chunk;
15244
+ u_int32_t table_chunks;
15247
+ // Allocate space for the new logical volume.
15248
+ new_volume = kmalloc(sizeof(lvm_logical_volume_t), GFP_NOIO);
15249
+ if ( ! new_volume ) {
15250
+ LOG_CRITICAL("Memory error creating new logical volume %s\n", lv->lv_name);
15253
+ memset(new_volume, 0, sizeof(lvm_logical_volume_t));
15255
+ // Allocate space for the LE to PE mapping table
15256
+ new_volume->le_map = vmalloc(lv->lv_allocated_le*sizeof(le_table_entry_t));
15257
+ if ( ! new_volume->le_map ) {
15258
+ LOG_CRITICAL("Memory error creating LE map for logical volume %s\n", lv->lv_name);
15259
+ kfree(new_volume);
15262
+ memset(new_volume->le_map, 0, lv->lv_allocated_le*sizeof(le_table_entry_t));
15264
+ // Initialize the rest of the new volume.
15265
+ new_volume->lv_number = lv->lv_number + 1; // Need the +1 to match the PE Map entries on the PV
15266
+ new_volume->lv_size = lv->lv_size;
15267
+ new_volume->lv_access = lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED; // All volumes start new and quiesced.
15268
+ new_volume->lv_status = lv->lv_status | LV_ACTIVE; // All LVs start as active.
15269
+ new_volume->lv_minor = MINOR(lv->lv_dev);
15270
+ new_volume->stripes = lv->lv_stripes;
15271
+ new_volume->stripe_size = lv->lv_stripesize;
15272
+ new_volume->stripe_size_shift = evms_cs_log2(lv->lv_stripesize);
15273
+ new_volume->pe_size = group->vg->pe_size;
15274
+ new_volume->pe_size_shift = evms_cs_log2(group->vg->pe_size);
15275
+ new_volume->num_le = lv->lv_allocated_le;
15276
+ new_volume->group = group;
15277
+ // Different naming scheme for EVMS nodes.
15278
+ if ( translate_lv_name(lv->lv_name, new_volume->name) ) {
15279
+ deallocate_logical_volume(new_volume);
15283
+ // If the volume is a snapshot, initialize the remaining data, and
15284
+ // allocate space for the remapping structures, and one sector's worth
15285
+ // of COW tables.
15286
+ if ( new_volume->lv_access & LV_SNAPSHOT ) {
15287
+ new_volume->chunk_size = lv->lv_chunk_size;
15288
+ new_volume->num_chunks = lv->lv_size / lv->lv_chunk_size;
15289
+ new_volume->snap_org_minor = lv->lv_snapshot_minor;
15290
+ new_volume->next_cow_entry = 0;
15291
+ new_volume->current_cow_sector = 0;
15292
+ table_entries_per_chunk = (new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT) / sizeof(lv_COW_table_disk_t);
15293
+ table_chunks = (new_volume->num_chunks + table_entries_per_chunk - 1) / table_entries_per_chunk;
15294
+ new_volume->next_free_chunk = table_chunks * new_volume->chunk_size;
15295
+ new_volume->hash_table_size = (lv->lv_size / lv->lv_chunk_size / MAX_HASH_CHAIN_ENTRIES) + 1;
15297
+ new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
15298
+ if ( ! new_volume->cow_table ) {
15299
+ LOG_CRITICAL("Memory error creating COW table for logical volume %s\n", lv->lv_name);
15300
+ deallocate_logical_volume(new_volume);
15303
+ memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
15305
+ new_volume->snapshot_map = kmalloc((group->pv_count+1) * sizeof(snapshot_map_entry_t**), GFP_NOIO);
15306
+ if ( ! new_volume->snapshot_map ) {
15307
+ LOG_CRITICAL("Memory error creating snapshot map for logical volume %s\n", lv->lv_name);
15308
+ deallocate_logical_volume(new_volume);
15312
+ new_volume->snapshot_map[0] = NULL;
15313
+ for ( i = 1; i <= group->pv_count; i++ ) {
15314
+ new_volume->snapshot_map[i] = vmalloc(new_volume->hash_table_size * sizeof(snapshot_map_entry_t*));
15315
+ if ( ! new_volume->snapshot_map[i] ) {
15316
+ LOG_CRITICAL("Memory error creating snapshot sub-map for logical volume %s\n", lv->lv_name);
15317
+ deallocate_logical_volume(new_volume);
15320
+ memset(new_volume->snapshot_map[i], 0, new_volume->hash_table_size*sizeof(snapshot_map_entry_t*));
15322
+ init_MUTEX(&new_volume->snap_semaphore);
15325
+ // If the volume is a snapshot original, allocate space to use for
15326
+ // copying snapshot chunks. This will now be a fixed size instead of
15327
+ // being based on the chunk size of the snapshots.
15328
+ else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) {
15329
+ new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE;
15330
+ new_volume->chunk_data_buffer = kmalloc(new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
15331
+ if ( ! new_volume->chunk_data_buffer ) {
15332
+ LOG_SERIOUS("Memory error creating snapshot chunk buffer for logical volume %s\n", lv->lv_name);
15333
+ deallocate_logical_volume(new_volume);
15336
+ memset(new_volume->chunk_data_buffer, 0, new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT);
15339
+ return new_volume;
15343
+/* Function: deallocate_volume_group
15345
+ * Delete the entire in-memory representation of an LVM volume group,
15346
+ * including all PVs and logical volumes. If this group is on LVM's
15347
+ * volume group list, remove it.
15349
+static int deallocate_volume_group( lvm_volume_group_t * group )
15351
+ lvm_physical_volume_t * pv_entry;
15352
+ lvm_physical_volume_t * next_pv;
15355
+ LOG_DEBUG("Deleting volume group %s\n", group->vg_name);
15357
+ // Remove the group from the global list.
15358
+ remove_group_from_list(group);
15360
+ // Delete the LV metadata array.
15361
+ if ( group->lv_array ) {
15362
+ vfree(group->lv_array);
15363
+ group->lv_array = NULL;
15366
+ // Delete the PV UUID list
15367
+ if ( group->uuid_list ) {
15368
+ vfree(group->uuid_list);
15369
+ group->uuid_list = NULL;
15372
+ // Delete all logical volumes.
15373
+ for ( i = 1; i <= MAX_LV; i++ ) {
15374
+ if ( group->volume_list[i] ) {
15375
+ deallocate_logical_volume(group->volume_list[i]);
15376
+ group->volume_list[i] = NULL;
15380
+ // Delete all PVs from the group's list.
15381
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
15382
+ next_pv = pv_entry->next;
15383
+ if ( pv_entry->logical_node ) {
15384
+ // Send a delete command down to the partition manager.
15385
+ LOG_DEBUG("Deleting PV %s from group %s\n", pv_entry->logical_node->name, group->vg_name);
15386
+ DELETE(pv_entry->logical_node);
15387
+ pv_entry->logical_node = NULL;
15389
+ deallocate_physical_volume(pv_entry);
15392
+ // Delete the VG metadata.
15393
+ if ( group->vg ) {
15394
+ kfree(group->vg);
15395
+ group->vg = NULL;
15404
+/* Function: allocate_volume_group
15406
+ * Allocate space for a new LVM volume group and all of its sub-fields.
15407
+ * Initialize the appropriate fields.
15408
+ * vg parameter should already have an allocate/initialized vg_disk_t.
15410
+static lvm_volume_group_t * allocate_volume_group( vg_disk_t * vg,
15411
+ unsigned char * vg_name )
15413
+ lvm_volume_group_t * new_group;
15415
+ // The volume group itself.
15416
+ new_group = kmalloc(sizeof(lvm_volume_group_t), GFP_NOIO);
15417
+ if ( ! new_group ) {
15422
+ // Initialize the new group.
15423
+ memset(new_group, 0, sizeof(lvm_volume_group_t));
15424
+ memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN);
15425
+ strncpy(new_group->vg_name, vg_name, NAME_LEN-1);
15426
+ new_group->vg = vg;
15427
+ new_group->hard_sect_size = 512; // Default value
15428
+ new_group->block_size = 1024; // Default value
15429
+ new_group->flags = EVMS_VG_DIRTY;
15431
+ LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name);
15433
+ return new_group;
15437
+/* Function: remove_pv_from_group
15439
+ * In the engine, when a PV is removed from a group (on a vgreduce), that
15440
+ * same PV must be removed from that group in the kernel. Otherwise, when
15441
+ * the rediscover occurs, that PV will still appear in the group, and
15442
+ * will cause segfaults when we try to read metadata from it.
15444
+static int remove_pv_from_group(int pv_number,
15445
+ unsigned char * vg_uuid )
15447
+ lvm_volume_group_t * group;
15448
+ lvm_physical_volume_t * pv_entry;
15449
+ lvm_physical_volume_t ** p_pv_entry;
15452
+ // Make sure the numbers are in range.
15453
+ if ( pv_number < 0 || pv_number > MAX_PV ) {
15457
+ // Make sure the group exists.
15458
+ find_group_by_uuid(vg_uuid, &group);
15463
+ // Make sure the PV is in this group.
15464
+ pv_entry = find_pv_by_number(pv_number, group);
15465
+ if ( ! pv_entry ) {
15466
+ LOG_WARNING("Did not find PV %d in group %s\n", pv_number, group->vg_name);
15470
+ // Make sure the PV is not in use by any volumes
15471
+ if ( check_pv_for_lv(pv_entry, group) ) {
15472
+ LOG_SERIOUS("PV %d in group %s still contains LVs\n", pv_number, group->vg_name);
15476
+ // Take this PV out of the group's list.
15477
+ for ( p_pv_entry = &group->pv_list; *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) {
15478
+ if ( *p_pv_entry == pv_entry ) {
15479
+ *p_pv_entry = (*p_pv_entry)->next;
15480
+ pv_entry->next = NULL;
15485
+ group->pv_count--;
15487
+ // There is no way that this PV was the last from this group, so the
15488
+ // group never needs to be deleted at this point. The only way this
15489
+ // group will exist in the kernel is if there are volumes exported from
15490
+ // it. If this was the last PV, then those volumes must be on that PV,
15491
+ // and it wouldn't be allowed to be removed from the group (above).
15493
+ // Free up the memory for this PV. Just drop the node.
15494
+ deallocate_physical_volume(pv_entry);
15496
+ LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name);
15502
+/********** Consistency Checking Functions **********/
15505
+/* Function: clear_le_entries_for_missing_pv
15507
+static void clear_le_entries_for_missing_pv( lvm_volume_group_t * group,
15508
+ lvm_physical_volume_t * pv_entry )
15510
+ lvm_logical_volume_t * volume;
15513
+ for ( i = 1; i <= MAX_LV; i++ ) {
15514
+ if ( group->volume_list[i] ) {
15515
+ volume = group->volume_list[i];
15516
+ for ( j = 0; j < volume->num_le; j++ ) {
15517
+ if ( volume->le_map[j].owning_pv == pv_entry ) {
15518
+ volume->le_map[j].owning_pv = NULL;
15519
+ volume->le_map[j].pe_sector_offset = 0;
15527
+/* Function: check_volume_groups
15529
+ * This function performs some simple consistency checks on all dirty
15530
+ * volume groups. Any groups that have no PVs are deleted. If any metadata
15531
+ * structures (PV or VG) are missing, they are read in from disk.
15533
+static int check_volume_groups( void )
15535
+ lvm_volume_group_t * group;
15536
+ lvm_volume_group_t * next_group;
15537
+ lvm_physical_volume_t * pv_entry;
15538
+ lvm_physical_volume_t * next_pv;
15541
+ for ( group = lvm_group_list; group; group = next_group) {
15542
+ next_group = group->next_group;
15544
+ LOG_DEBUG("Checking Group %s\n", group->vg_name);
15546
+ // If a group has no PVs, it can be safely deleted,
15547
+ // because we can't find any volumes on it.
15548
+ if ( ! group->pv_count ) {
15549
+ LOG_WARNING("No PVs found for Group %s.\n", group->vg_name);
15550
+ if ( ! group->volume_count ) {
15551
+ deallocate_volume_group(group);
15556
+ // Make sure all metadata for the PVs is present. On a
15557
+ // rediscover, it may be missing, because we delete it at the
15558
+ // end of discovery. If any is missing, read it in from disk.
15559
+ // This is only necessary in the kernel. It can't happen in
15561
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
15562
+ next_pv = pv_entry->next;
15563
+ if ( ! pv_entry->pv ) {
15564
+ LOG_DEBUG("Re-reading PV metadata for node %s\n", pv_entry->logical_node->name);
15565
+ rc = read_pv(pv_entry->logical_node, &pv_entry->pv);
15567
+ // What happens if we can't re-read the
15568
+ // PV metadata? This PV must be removed
15569
+ // from the group. Need to also clear
15570
+ // all LE entries in all LVs that are
15571
+ // pointing to this PV before it can be
15572
+ // removed from the list.
15573
+ LOG_SERIOUS("PV metadata is missing or cannot be read from node %s\n", pv_entry->logical_node->name);
15574
+ clear_le_entries_for_missing_pv(group, pv_entry);
15575
+ remove_pv_from_group(pv_entry->pv_number, group->vg_uuid);
15578
+ pv_entry->pv_number = pv_entry->pv->pv_number;
15580
+ // Check for a "stale" PV. This case should be
15581
+ // already be covered, as long as the Engine is
15582
+ // calling the PV_REMOVE ioctl when it does a
15583
+ // vgreduce or a pvremove. If this is the last
15584
+ // PV in the group, the group will be deleted.
15585
+ if ( ! pv_entry->pv_number ) {
15586
+ remove_pv_from_group(0, group->vg_uuid);
15591
+ if ( ! pv_entry->pe_map ) {
15592
+ LOG_DEBUG("Re-reading PE maps for node %s\n", pv_entry->logical_node->name);
15593
+ rc = read_pe_map(pv_entry);
15595
+ LOG_WARNING("Error reading PE maps for node %s\n", pv_entry->logical_node->name);
15596
+ LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
15601
+ // Make sure the metadata for the VG is present. If it's
15602
+ // missing, read it in from the first PV in the VG.
15603
+ if ( ! group->vg && group->pv_count ) {
15604
+ LOG_DEBUG("Re-reading VG metadata for Group %s\n", group->vg_name);
15605
+ pv_entry = group->pv_list;
15606
+ rc = read_vg(pv_entry->logical_node, pv_entry->pv, &group->vg);
15608
+ // What happens if we can't re-read the
15609
+ // VG metadata? It's definitely bad
15610
+ // news. Should we delete the VG?
15615
+ // Display a warning if the number of PVs found for the group
15616
+ // doesn't match the number of PVs recorded for the VG.
15617
+ if ( group->vg && group->pv_count != group->vg->pv_cur ) {
15618
+ LOG_WARNING("Group %s is incomplete.\n", group->vg_name);
15619
+ LOG_WARNING(" Only %d of %d PVs found.\n", group->pv_count, group->vg->pv_cur);
15620
+ LOG_WARNING(" Volumes in this group may be incomplete.\n");
15628
+/* Function: check_le_maps
15630
+ * Make sure all volumes in this group have valid LE-to-PE maps. Any
15631
+ * volume that doesn't is marked as incomplete. This is safe for
15632
+ * re-discovery because only new volumes could have corrupted LE maps.
15634
+static int check_le_maps( lvm_volume_group_t * group )
15636
+ lvm_logical_volume_t * volume;
15639
+ for ( i = 1; i <= MAX_LV; i++ ) {
15640
+ volume = group->volume_list[i];
15641
+ if ( ! volume ) {
15645
+ if ( ! volume->le_map ) {
15646
+ // No point in keeping the volume around if it has
15647
+ // no LE map at all.
15648
+ LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
15649
+ deallocate_logical_volume(volume);
15653
+ // If any entries in the LE map are missing, mark this volume
15654
+ // as incomplete.
15655
+ for ( j = 0, count = 0; j < volume->num_le; j++ ) {
15656
+ if ( ! volume->le_map[j].owning_pv ||
15657
+ ! volume->le_map[j].pe_sector_offset ) {
15662
+ LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
15663
+ LOG_SERIOUS(" Missing %d out of %d LEs.\n", count, volume->num_le);
15664
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
15671
+/* Function: check_snapshot_map
15673
+ * For snapshot volumes, make sure the snapshot map is intact, and that
15674
+ * any existing entries in the map are in the correct order and there
15675
+ * are no duplicate entries.
15677
+static int check_snapshot_map( lvm_logical_volume_t * snap_volume )
15679
+ snapshot_map_entry_t ** table;
15680
+ snapshot_map_entry_t * curr;
15683
+ if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
15686
+ if ( ! snap_volume->snapshot_map ) {
15687
+ snap_volume->lv_access |= EVMS_LV_INVALID;
15690
+ for ( i = 1; i <= snap_volume->group->pv_count; i++ ) {
15691
+ if ( ! snap_volume->snapshot_map[i] ) {
15692
+ snap_volume->lv_access |= EVMS_LV_INVALID;
15695
+ table = snap_volume->snapshot_map[i];
15696
+ for ( j = 0; j < snap_volume->hash_table_size; j++ ) {
15697
+ for ( curr = table[j]; curr; curr = curr->next ) {
15698
+ if ( curr->next && curr->org_sector >= curr->next->org_sector ) {
15699
+ snap_volume->lv_access |= EVMS_LV_INVALID;
15709
+/* Function: check_logical_volumes
15711
+ * Perform a consistency check on all of the logical volumes that have been
15712
+ * discovered. Any volume that has any inconsistencies will be marked as
15713
+ * incomplete or invalid, depending on the severity of the problem. At the
15714
+ * end, all invalid volumes are deleted. If the deleted_incompletes
15715
+ * parameter is set, those will also be deleted.
15717
+static int check_logical_volumes( int final_discovery )
15719
+ lvm_volume_group_t * group;
15720
+ lvm_logical_volume_t * volume;
15721
+ lvm_logical_volume_t * snap;
15722
+ lvm_logical_volume_t * next;
15726
+ // Check every valid, dirty volume group
15727
+ for ( group = lvm_group_list; group; group = group->next_group ) {
15728
+ if ( ! (group->flags & EVMS_VG_DIRTY) ) {
15732
+ // Check every valid volume in this group
15733
+ for ( i = 1; i <= MAX_LV; i++ ) {
15734
+ volume = group->volume_list[i];
15735
+ if ( ! volume ) {
15739
+ LOG_DEBUG("Checking logical volume %s\n", volume->name);
15741
+ if ( ! volume->group ) {
15742
+ volume->group = group;
15745
+ // All LE-map entries must have valid values. The I/O
15746
+ // paths now detect missing LE entries.
15747
+ if ( volume->le_map ) {
15748
+ for ( j = 0, count = 0; j < volume->num_le; j++ ) {
15749
+ if ( ! volume->le_map[j].owning_pv ||
15750
+ ! volume->le_map[j].pe_sector_offset ) {
15755
+ LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
15756
+ LOG_SERIOUS(" Missing %d out of %d LEs.\n", count, volume->num_le);
15757
+ volume->lv_access |= EVMS_LV_INCOMPLETE;
15760
+ // In case this volume was previously
15761
+ // marked incomplete.
15762
+ volume->lv_access &= ~EVMS_LV_INCOMPLETE;
15766
+ // This should only ever happen due to
15767
+ // memory corruption.
15768
+ LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
15769
+ volume->lv_access |= EVMS_LV_INVALID;
15772
+ // For a snapshot original, check all snapshots in the
15773
+ // chain, to make sure they point back to the original.
15774
+ // Also, make sure there is memory for the chunk buffer.
15775
+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15776
+ for ( snap = volume->snapshot_next, count = 0; snap; snap = snap->snapshot_next, count++ ) {
15777
+ if ( snap->snapshot_org != volume ) {
15778
+ LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", volume->name);
15779
+ snap->snapshot_org = NULL;
15780
+ snap->lv_access |= EVMS_LV_INVALID;
15784
+ LOG_WARNING("No snapshots found for volume %s\n", volume->name);
15785
+ if ( final_discovery ) {
15786
+ volume->lv_access &= ~LV_SNAPSHOT_ORG;
15789
+ else if ( ! volume->chunk_data_buffer ) {
15790
+ volume->lv_access |= EVMS_LV_INVALID;
15794
+ // For a snapshot volume, make sure it points back to
15795
+ // its original. Also make sure there is memory for the
15796
+ // cow table, and that any existing snapshot entries in
15797
+ // the snapshot map are correctly ordered.
15798
+ else if ( volume->lv_access & LV_SNAPSHOT ) {
15799
+ // Is there a COW table?
15800
+ if ( ! volume->cow_table ) {
15801
+ LOG_SERIOUS("Snapshot volume %s has no COW table\n", volume->name);
15802
+ volume->lv_access |= EVMS_LV_INVALID;
15804
+ // Is the snapshot map in order?
15805
+ if ( check_snapshot_map(volume) ) {
15806
+ LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n", volume->name);
15807
+ volume->lv_access |= EVMS_LV_INVALID;
15809
+ // Is there an original volume? This is only
15810
+ // a real problem during final discovery.
15811
+ if ( ! volume->snapshot_org ) {
15812
+ LOG_SERIOUS("Snapshot volume %s not pointing at an original\n", volume->name);
15813
+ if ( final_discovery ) {
15814
+ volume->lv_access |= EVMS_LV_INVALID;
15817
+ // Is the original the correct one?
15818
+ else if ( volume->snap_org_minor != volume->snapshot_org->lv_minor ) {
15819
+ LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", volume->name);
15820
+ volume->lv_access |= EVMS_LV_INVALID;
15824
+ // Delete any invalid volumes from use. Delete
15825
+ // incomplete volumes as well if this is not final
15826
+ // discovery. If a snapshot original is bad, delete all
15827
+ // of its snapshots.
15828
+ if ( volume->lv_access & EVMS_LV_INVALID ||
15829
+ (!final_discovery &&
15830
+ (volume->lv_access & EVMS_LV_INCOMPLETE) &&
15831
+ (volume->lv_access & EVMS_LV_NEW) ) ) {
15832
+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15833
+ for ( snap = volume->snapshot_next; snap; snap = next ) {
15834
+ next = snap->snapshot_next;
15835
+ snap->snapshot_next = NULL;
15836
+ snap->snapshot_org = NULL;
15837
+ invalidate_snapshot_volume(snap);
15838
+ deallocate_logical_volume(snap);
15840
+ volume->snapshot_next = NULL;
15842
+ else if ( volume->lv_access & LV_SNAPSHOT ) {
15843
+ invalidate_snapshot_volume(volume);
15845
+ deallocate_logical_volume(volume);
15855
+/********** Volume Group Discovery Functions **********/
15859
+/* Function: find_group_for_pv
15861
+ * This is a discover-time function. It reads the VG metadata info for the
15862
+ * specified node, and locates the appropriate group that owns that
15863
+ * node. If that group does not already exist, it is created and
15866
+static int find_group_for_pv( evms_logical_node_t * node,
15868
+ lvm_volume_group_t ** group )
15875
+ // Check for an unassigned PV.
15876
+ if ( pv->vg_name[0] == 0 ) {
15880
+ // Read the VG on-disk info for this PV. If this succeeds, it
15881
+ // allocates a new VG metadata structure.
15882
+ rc = read_vg(node, pv, &vg);
15887
+ // Use the UUID from the VG metadata to determine if this group
15888
+ // has already been discovered and constructed.
15889
+ find_group_by_uuid(vg->vg_uuid, group);
15891
+ if ( ! *group ) {
15892
+ // Create a new group entry and add to the global list.
15893
+ *group = allocate_volume_group(vg, pv->vg_name);
15894
+ if ( ! *group ) {
15897
+ add_group_to_list(*group);
15899
+ else if ( ! (*group)->vg ) {
15900
+ // On a rediscover, the VG metadata for an existing group might
15901
+ // be missing. Fill it in if necessary. This check is also not
15902
+ // necessary in the engine, since the metadata is never deleted.
15903
+// Should we re-copy vg_name? (vg_uuid can not be allowed to change).
15904
+// Or should vg_name changes be done through direct ioctl only?
15905
+ (*group)->vg = vg;
15911
+ // Read in the UUID list for this group, if it isn't present.
15912
+ rc = read_uuid_list(node, pv, *group);
15914
+ LOG_WARNING("Error reading UUID list for group %s.\n", (*group)->vg_name);
15915
+ LOG_WARNING("May not be able to verify PV UUIDs for group %s\n", (*group)->vg_name);
15918
+ // In the kernel, any time we even see a PV for a group, that group
15919
+ // must be marked dirty so its volumes will be re-exported.
15920
+ (*group)->flags |= EVMS_VG_DIRTY;
15926
+/* Function: check_for_duplicate_pv
15928
+ * Search the list of PVs in the specified volume group. If the
15929
+ * specified node already exists in the list, we can discard it.
15931
+static int check_for_duplicate_pv( evms_logical_node_t * node,
15933
+ lvm_volume_group_t * group )
15935
+ lvm_physical_volume_t * pv_entry;
15937
+ // For re-discovery, we need to search all existing PVs in this VG to
15938
+ // make sure we didn't get a duplicate from the plugin below us. The
15939
+ // plugins below us should be re-exporting the same node on
15940
+ // re-discovery, instead of creating a new node to represent the same
15941
+ // objects, so just check the memory location.
15942
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
15943
+ if ( pv_entry->logical_node == node ) {
15945
+ // We found a duplicate. Just ignore the duplicate.
15946
+ LOG_DEBUG("PV %s is already in Group %s.\n", node->name, group->vg_name);
15948
+ // Even if the node was a duplicate, we may need to
15949
+ // fill in the pv entry for this partition, since we
15950
+ // always delete those at the end of discovery.
15951
+ if ( ! pv_entry->pv ) {
15952
+ pv_entry->pv = pv;
15953
+ pv_entry->pv_number = pv->pv_number;
15963
+ // No duplicate was found.
15968
+/* Function: verify_pv_uuid
15970
+ * Verify that the specified PV belongs in the specified group by
15971
+ * searching for the PV's UUID in the group's list.
15973
+static int verify_pv_uuid( lvm_physical_volume_t * pv_entry,
15974
+ lvm_volume_group_t * group )
15978
+ // Obviously the UUID list must be present in order to search.
15979
+ if ( ! group->uuid_list ) {
15980
+ LOG_WARNING("UUID list is missing from group %s.\n", group->vg_name);
15981
+ LOG_WARNING("Cannot verify UUID for PV %s\n", pv_entry->logical_node->name);
15985
+ // Start with the UUID entry for this PV's number
15986
+ if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[(pv_entry->pv_number-1)*NAME_LEN]), UUID_LEN) ) {
15990
+ // If it wasn't found there, then search the entire group's list.
15991
+ for ( i = 0; i < group->vg->pv_cur; i++ ) {
15992
+ if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[i*NAME_LEN]), UUID_LEN) ) {
15993
+ // Found the UUID.
15994
+ LOG_WARNING("Detected UUID mismatch for PV %s!\n", pv_entry->logical_node->name);
15995
+ LOG_WARNING("PV %s is recorded as being at index %d,\n", pv_entry->logical_node->name, pv_entry->pv_number);
15996
+ LOG_WARNING(" but Group %s has it recorded at index %d.\n", group->vg_name, i+1);
15997
+ LOG_WARNING("Run the EVMS Engine to correct the problem.\n");
15998
+ LOG_WARNING("If you have any snapshot regions in group %s\n", group->vg_name);
15999
+ LOG_WARNING(" it is recommended that you delete them immediately!\n");
16004
+ LOG_SERIOUS("Could not find UUID for PV %s in group %s\n", pv_entry->logical_node->name, group->vg_name);
16009
+/* Function: add_pv_to_group
16011
+ * Adds the physical volume to the appropriate volume group. The PV
16012
+ * passed into this function MUST be part of a valid VG.
16014
+static int add_pv_to_group( lvm_physical_volume_t * pv_entry,
16015
+ lvm_volume_group_t * group )
16019
+ // Make sure this PV's UUID is listed in the group.
16020
+ rc = verify_pv_uuid(pv_entry, group);
16022
+ LOG_SERIOUS("PV %s does not belong in group %s!\n", pv_entry->logical_node->name, group->vg_name);
16026
+ // Add this PV to the beginning of its group's list.
16027
+ pv_entry->next = group->pv_list;
16028
+ group->pv_list = pv_entry;
16029
+ group->pv_count++;
16031
+ // Update the group's block and hardsector sizes as appropriate.
16032
+ group->block_size = max(pv_entry->logical_node->block_size, group->block_size);
16033
+ group->hard_sect_size = max(pv_entry->logical_node->hardsector_size, group->hard_sect_size);
16035
+ // Check for the Partial or Removable flag on the PV.
16036
+ if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) {
16037
+ group->flags |= EVMS_VG_PARTIAL_PVS;
16039
+ if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) {
16040
+ group->flags |= EVMS_VG_REMOVABLE_PVS;
16043
+ LOG_DETAILS("PV %s added to Group %s\n", pv_entry->logical_node->name, group->vg_name);
16049
+/* Function: discover_volume_groups
16051
+ * Examine the list of logical nodes. Any node that contains a valid PV
16052
+ * structure is consumed and added to the appropriate volume group. PVs
16053
+ * which do not belong to any group are deleted. Everything else is left
16054
+ * on the discovery list.
16056
+static int discover_volume_groups( evms_logical_node_t ** evms_node_list )
16058
+ evms_logical_node_t * node;
16059
+ evms_logical_node_t * next_node;
16061
+ lvm_volume_group_t * group;
16062
+ lvm_physical_volume_t * pv_entry;
16065
+ LOG_EXTRA("Searching for PVs in the node list.\n");
16067
+ // Run through the discovery list
16068
+ for ( node = *evms_node_list; node; node = next_node ) {
16069
+ // Save the next node. We may remove this one from the list.
16070
+ next_node = node->next;
16072
+ // Read the PV metadata. This will also create a new pv_disk_t
16073
+ // if it finds the correct LVM signatures.
16074
+ rc = read_pv(node, &pv);
16076
+ // This node is not an LVM PV, or an error occurred.
16077
+ // Just leave the node on the discovery list.
16081
+ rc = find_group_for_pv(node, pv, &group);
16083
+ // Error getting the group for this PV.
16089
+ // This node is an unassigned PV.
16090
+ LOG_DETAILS("PV %s is unassigned.\n", node->name);
16095
+ rc = check_for_duplicate_pv(node, pv, group);
16097
+ // This node is already in the group. This check is also
16098
+ // only in the kernel because the engine has no notion
16099
+ // of rediscover, and thus can never get a duplicate.
16100
+ evms_cs_remove_logical_node_from_list(evms_node_list, node);
16104
+ // Allocate a PV entry for this node.
16105
+ pv_entry = allocate_physical_volume(node, pv);
16106
+ if ( ! pv_entry ) {
16110
+ // Add this PV to the appropriate volume group.
16111
+ rc = add_pv_to_group(pv_entry, group);
16113
+ deallocate_physical_volume(pv_entry);
16117
+ rc = read_pe_map(pv_entry);
16119
+ LOG_WARNING("Error reading PE maps for node %s\n", node->name);
16120
+ LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
16123
+ evms_cs_remove_logical_node_from_list(evms_node_list, node);
16126
+ LOG_EXTRA("Group discovery complete.\n");
16132
+/********** Logical Volume Discovery Functions **********/
16136
+/* Function: build_le_maps
16138
+ * After all logical volumes have been discovered, the mappings from
16139
+ * logical extents to physical extents must be constructed. Each PV
16140
+ * contains a map on-disk of its PEs. Each PE map entry contains the
16141
+ * logical volume number and the logical extent number on that volume.
16142
+ * Our internal map is the reverse of this map for each volume, listing
16143
+ * the PV node and sector offset for every logical extent on the volume.
16145
+static int build_le_maps( lvm_volume_group_t * group )
16147
+ lvm_logical_volume_t ** volume_list = group->volume_list;
16148
+ lvm_physical_volume_t * pv_entry;
16149
+ evms_logical_node_t * node;
16151
+ pe_disk_t * pe_map;
16152
+ evms_sector_t offset;
16153
+ u_int32_t lv_number;
16154
+ u_int32_t le_number;
16155
+ u_int32_t first_pe_sector;
16158
+ LOG_DEBUG("Building LE maps for new volumes in group %s.\n", group->vg_name);
16160
+ // For every PV in this VG
16161
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16162
+ node = pv_entry->logical_node;
16163
+ pv = pv_entry->pv;
16164
+ pe_map = pv_entry->pe_map;
16166
+ // Version 1 metadata uses pe_on_disk.base + .size to find start
16167
+ // of first PE. Version 2 uses pe_start.
16168
+ if ( pv->version == 1 ) {
16169
+ first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
16172
+ first_pe_sector = pv->pe_start;
16173
+ if ( ! first_pe_sector ) {
16174
+ first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
16178
+ // For every entry in the PE map, calculate the PE's sector offset
16179
+ // and update the correct LV's PE map. LV number of 0 marks an unused PE.
16180
+ // For re-discovery, only compute entries for new volumes. If a PV
16181
+ // is read-only, all LVs on that PV will also be read-only.
16182
+ for ( i = 0; i < pv->pe_total; i++ ) {
16183
+ lv_number = pe_map[i].lv_num;
16184
+ if ( lv_number &&
16185
+ volume_list[lv_number] &&
16186
+ volume_list[lv_number]->lv_access & (EVMS_LV_NEW|EVMS_LV_INCOMPLETE) ) {
16187
+ le_number = pe_map[i].le_num;
16188
+ offset = i * pv->pe_size + first_pe_sector;
16189
+ volume_list[lv_number]->le_map[le_number].owning_pv = pv_entry;
16190
+ volume_list[lv_number]->le_map[le_number].pe_sector_offset = offset;
16191
+ if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) {
16192
+ volume_list[lv_number]->lv_access &= ~LV_WRITE;
16202
+/* Function: build_snapshot_maps
16204
+ * For every volume in this group that is a snapshot, read all of the
16205
+ * existing entries in the COW table, and build up the snapshot mapping
16206
+ * structures accordingly.
16208
+ * For reference, the COW tables attached to the snapshot volumes will
16209
+ * always be in disk-order (little-endian), so that it can always be
16210
+ * immediately written to disk. Therefore, endian conversions are necessary
16211
+ * any time the COW table is accessed. This function will make a local
16212
+ * copy of each COW table sector, and convert the local copy before
16213
+ * building the snapshot maps.
16215
+static int build_snapshot_maps( lvm_volume_group_t * group )
16217
+ lvm_logical_volume_t * volume;
16218
+ evms_logical_node_t tmp_node;
16219
+ lv_COW_table_disk_t cow_table[EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)];
16220
+ unsigned long max_entries = EVMS_VSECTOR_SIZE / sizeof(lv_COW_table_disk_t);
16221
+ int i, j, rc = 0;
16223
+ // Check every volume in the group to see if it is a snapshot. Also
16224
+ // check to make sure it is a new volume in the case of re-discovery.
16225
+ for ( i = 1; i <= MAX_LV; i++ ) {
16227
+ // The volume must exist, must be new, and must be a snapshot
16228
+ volume = group->volume_list[i];
16230
+ ! (volume->lv_access & EVMS_LV_NEW) ||
16231
+ ! (volume->lv_access & LV_SNAPSHOT) ) {
16235
+ // Set up a temporary EVMS node
16236
+ tmp_node.instance_data = volume;
16239
+ LOG_DEBUG("Building snapshot map for volume %s\n", volume->name);
16242
+ // Read in one sector's worth of COW tables.
16243
+ if ( lvm_init_io(&tmp_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
16244
+ invalidate_snapshot_volume(volume);
16245
+ deallocate_logical_volume(volume);
16249
+ // Endian-conversion of this COW table to a local table.
16250
+ for ( j = 0; j < max_entries; j++ ) {
16251
+ cow_table[j].pv_org_number = le64_to_cpu(volume->cow_table[j].pv_org_number);
16252
+ cow_table[j].pv_org_rsector = le64_to_cpu(volume->cow_table[j].pv_org_rsector);
16253
+ cow_table[j].pv_snap_number = le64_to_cpu(volume->cow_table[j].pv_snap_number);
16254
+ cow_table[j].pv_snap_rsector = le64_to_cpu(volume->cow_table[j].pv_snap_rsector);
16258
+ // Translate every valid COW table entry into
16259
+ // a snapshot map entry.
16260
+ for ( volume->next_cow_entry = 0;
16261
+ volume->next_cow_entry < max_entries &&
16262
+ cow_table[volume->next_cow_entry].pv_org_number;
16263
+ volume->next_cow_entry++ ) {
16264
+ // org_rsector must be a valid sector number,
16265
+ // i.e. it can't be within a PVs metadata. This
16266
+ // is how we detect invalidated snapshots.
16267
+ if ( (cow_table[volume->next_cow_entry].pv_org_rsector < 10) ||
16268
+ (cow_table[volume->next_cow_entry].pv_org_number > group->pv_count) ||
16269
+ (add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]),volume)) ) {
16270
+ // This volume either has an invalid COW entry,
16271
+ // or had an error adding that COW entry to the
16272
+ // snapshot map. This snapshot is done.
16273
+ invalidate_snapshot_volume(volume);
16274
+ deallocate_logical_volume(volume);
16278
+ volume->next_free_chunk += volume->chunk_size;
16280
+ // Move on to the next sector if necessary.
16281
+ if ( !rc && volume->next_cow_entry == max_entries ) {
16282
+ volume->current_cow_sector++;
16294
+/* Function: link_snapshot_volumes
16296
+ * This function examines the list of logical volumes in this group and
16297
+ * sets up the necessary pointers to link snapshots and their originals.
16298
+ * A singly-linked list is created starting with the original volume. Also,
16299
+ * all snapshot volumes point directly back to their original. This
16300
+ * function should not be run until all volumes have been discovered.
16301
+ * In the case of re-discovery, all of these links/lists get rebuilt as if
16302
+ * they were not already there. Currently this should not pose a problem.
16304
+static int link_snapshot_volumes( lvm_volume_group_t * group )
16306
+ lvm_logical_volume_t * org_volume;
16307
+ lvm_logical_volume_t * snap_volume;
16308
+ u_int32_t org_minor;
16309
+ u_int32_t buffer_size = 0;
16312
+ for ( i = 1; i <= MAX_LV; i++ ) {
16314
+ // Only process snapshot-originals
16315
+ org_volume = group->volume_list[i];
16316
+ if ( ! org_volume ||
16317
+ ! (org_volume->lv_access & LV_SNAPSHOT_ORG) ) {
16321
+ // For snapshot-originals, look for all other volumes that
16322
+ // claim to be snapshotting it. For each one that is found,
16323
+ // insert it at the start of the original's list of snapshots.
16324
+ org_minor = org_volume->lv_minor;
16325
+ org_volume->snapshot_next = NULL; // This is necessary for rediscovery to work properly.
16326
+ // Could get circular snapshot lists otherwise.
16327
+ for ( j = 1; j <= MAX_LV; j++ ) {
16328
+ snap_volume = group->volume_list[j];
16329
+ if ( snap_volume &&
16330
+ snap_volume->lv_access & LV_SNAPSHOT &&
16331
+ (snap_volume->snap_org_minor == org_minor) ) {
16332
+ snap_volume->snapshot_org = org_volume;
16333
+ snap_volume->snapshot_next = org_volume->snapshot_next;
16334
+ org_volume->snapshot_next = snap_volume;
16335
+ if ( snap_volume->chunk_size > buffer_size ) {
16336
+ buffer_size = snap_volume->chunk_size;
16338
+ LOG_DEBUG("Linking snapshot (%s) to original (%s)\n", snap_volume->name, org_volume->name);
16342
+ // If no snapshots were found for a volume that claims to be
16343
+ // under snapshot, mark the group dirty. If this is final
16344
+ // discovery, the original will have the snapshot flag turned
16345
+ // off in check_logical_volumes().
16346
+ if ( ! org_volume->snapshot_next ) {
16347
+ LOG_WARNING("No snapshots found for original (%s)\n", org_volume->name);
16348
+ group->flags |= EVMS_VG_DIRTY;
16355
+/* Function: discover_volumes_in_group
16358
+static int discover_volumes_in_group( lvm_volume_group_t * group )
16360
+ lv_disk_t * lv_array = group->lv_array;
16361
+ lvm_logical_volume_t * new_volume;
16364
+ // Search through the LV structs for valid LV entries
16365
+ for ( i = 0; i < group->vg->lv_max; i++ ) {
16367
+ // Only discover valid, active volumes
16368
+ if ( ! lv_array[i].lv_name[0] ||
16369
+ lv_array[i].lv_number >= MAX_LV ) {
16373
+ // Make sure this volume isn't already in the list.
16374
+ if ( group->volume_list[lv_array[i].lv_number+1] ) {
16378
+ // Create a new logical volume and place it in the appropriate
16379
+ // spot in this VG's volume list.
16380
+ new_volume = allocate_logical_volume(&(lv_array[i]), group);
16381
+ if ( ! new_volume ) {
16382
+ // This volume will be missing, but other
16383
+ // volumes in this group can still be built.
16384
+ LOG_CRITICAL("Memory error creating LV %s in Group %s\n", lv_array[i].lv_name, group->vg_name);
16388
+ group->volume_list[new_volume->lv_number] = new_volume;
16389
+ group->volume_count++;
16390
+ group->flags |= EVMS_VG_DIRTY;
16392
+ LOG_DEBUG("Discovered volume %s in group %s.\n", new_volume->name, group->vg_name);
16399
+/* Function: discover_logical_volumes
16401
+ * After all PVs have been claimed and added to the appropriate VG list,
16402
+ * the volumes for each VG must be constructed. For each group, read all
16403
+ * the LV structs off the first PV in the list. Search this list of
16404
+ * structs for valid LVs. For each valid LV, create a new volume and add
16405
+ * it to the group.
16407
+static int discover_logical_volumes( void )
16409
+ lvm_volume_group_t * group;
16412
+ // Look for volumes in each valid VG entry. We even need to check ones
16413
+ // that aren't dirty - We could have deleted an incomplete volume on
16414
+ // the previous pass, and need to rediscover it in case this is final
16415
+ // discovery and we now want to export it.
16416
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16418
+ if ( ! group->vg ) {
16422
+ LOG_DEBUG("Searching for volumes in group %s\n", group->vg_name);
16424
+ // Read in the LV array from disk if necessary.
16425
+ rc = read_lv(group);
16427
+ LOG_WARNING("Unable to read LV metadata for group %s\n", group->vg_name);
16428
+ LOG_WARNING("No regions can be discovered for group %s\n", group->vg_name);
16432
+ // Assemble each volume in the group.
16433
+ discover_volumes_in_group(group);
16435
+ // Build the LE map for each LV discovered in this group. This
16436
+ // must be done after all LVS in the group are discovered.
16437
+ build_le_maps(group);
16438
+ check_le_maps(group);
16440
+ // Set up all of the initial snapshot maps. Only the kernel
16441
+ // keeps track of the snapshot maps.
16442
+ build_snapshot_maps(group);
16444
+ // Set up the pointers to link snapshot volumes
16445
+ // with their originals.
16446
+ link_snapshot_volumes(group);
16453
+/* Function: export_volumes
16455
+ * The last thing the plugin must do is take each newly constructed volume
16456
+ * and place it on the evms logical node list. A zero return-code from
16457
+ * this function means nothing new was added to the list, and a positive
16458
+ * return code means that many new items were added to the list.
16460
+static int export_volumes( evms_logical_node_t ** evms_node_list )
16462
+ lvm_volume_group_t * group;
16463
+ evms_logical_node_t * new_node;
16464
+ lvm_logical_volume_t * volume;
16468
+ LOG_EXTRA("Exporting volumes\n");
16470
+ // For every valid, dirty volume group
16471
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16472
+ if ( ! (group->flags & EVMS_VG_DIRTY) ) {
16476
+ // Export every valid volume in the group. For re-discovery,
16477
+ // we re-export the same logical node.
16478
+ for ( i = 1; i <= MAX_LV; i++ ) {
16479
+ volume = group->volume_list[i];
16480
+ if ( ! volume ) {
16484
+ // For new volumes, create a new EVMS node and
16485
+ // initialize the appropriate fields.
16486
+ if ( volume->lv_access & EVMS_LV_NEW ) {
16487
+ if ( evms_cs_allocate_logical_node(&new_node) ) {
16491
+ volume->volume_node = new_node;
16492
+ volume->lv_access &= (~EVMS_LV_QUIESCED & ~EVMS_LV_NEW);
16493
+ new_node->hardsector_size = group->hard_sect_size;
16494
+ new_node->block_size = group->block_size;
16495
+ new_node->plugin = &lvm_plugin_header;
16496
+ new_node->instance_data = volume;
16497
+ memcpy(new_node->name, volume->name, NAME_LEN);
16499
+ // Snapshot volumes should report the size of their original
16500
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16501
+ new_node->total_vsectors = volume->snapshot_org->lv_size;
16504
+ new_node->total_vsectors = volume->lv_size;
16507
+ // Is the volume read-only?
16508
+ if ( ! (volume->lv_access & LV_WRITE) ) {
16509
+ new_node->flags |= EVMS_VOLUME_READ_ONLY;
16510
+ LOG_DEBUG("LVM volume %s is read-only\n", volume->name);
16513
+ // Is the volume incomplete?
16514
+ if ( volume->lv_access & EVMS_LV_INCOMPLETE ) {
16515
+ new_node->flags |= (EVMS_VOLUME_READ_ONLY | EVMS_VOLUME_PARTIAL);
16516
+ LOG_DEBUG("LVM volume %s is incomplete\n", volume->name);
16519
+ // Does the volume group contain any partial or
16520
+ // removable PVs?
16521
+ if ( group->flags & EVMS_VG_PARTIAL_PVS ) {
16522
+ new_node->flags |= EVMS_VOLUME_PARTIAL;
16524
+ if ( group->flags & EVMS_VG_REMOVABLE_PVS ) {
16525
+ new_node->flags |= EVMS_DEVICE_REMOVABLE;
16528
+ MOD_INC_USE_COUNT;
16531
+ // Export the node. The add_to_list will catch it if
16532
+ // we try to add the same node to the list twice.
16533
+ if ( ! evms_cs_add_logical_node_to_list(evms_node_list, volume->volume_node) ) {
16534
+ LOG_DETAILS("Exporting LVM volume %s\n", volume->name);
16539
+ // The group is clean now.
16540
+ group->flags &= ~EVMS_VG_DIRTY;
16547
+/* Function: lvm_cleanup
16549
+ * This function runs through the entire lvm data structure, removing
16550
+ * all items that are not needed at runtime. Currently, this is just the
16551
+ * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
16552
+ * groups that don't contain any volumes are deleted. All of the other
16553
+ * volume_group, logical_volume and evms_logical_node structures will be
16554
+ * kept around at run-time.
16556
+static int lvm_cleanup( void )
16558
+ lvm_volume_group_t * group;
16559
+ lvm_volume_group_t * next_group;
16560
+ lvm_physical_volume_t * pv_entry;
16562
+ for ( group = lvm_group_list; group; group = next_group ) {
16563
+ next_group = group->next_group;
16565
+ // Delete groups with no volumes.
16566
+ if ( ! group->volume_count ) {
16567
+ LOG_WARNING("Group %s contains no logical volumes. Deleting.\n", group->vg_name);
16568
+ remove_group_from_list(group);
16569
+ deallocate_volume_group(group);
16570
+ // Need to go back to the start of the list,
16571
+ // just to be safe. :)
16572
+ next_group = lvm_group_list;
16576
+ // Delete data structures that aren't used at runtime.
16577
+ if ( group->vg ) {
16578
+ kfree(group->vg);
16579
+ group->vg = NULL;
16581
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16582
+ if ( pv_entry->pv ) {
16583
+ kfree(pv_entry->pv);
16584
+ pv_entry->pv = NULL;
16586
+ if ( pv_entry->pe_map ) {
16587
+ vfree(pv_entry->pe_map);
16588
+ pv_entry->pe_map = NULL;
16591
+ if ( group->lv_array ) {
16592
+ vfree(group->lv_array);
16593
+ group->lv_array = NULL;
16595
+ if ( group->uuid_list ) {
16596
+ vfree(group->uuid_list);
16597
+ group->uuid_list = NULL;
16604
+/* Function: lvm_get_bmap
16606
+ * Support for the BMAP ioctl used by LILO to translate filesystem blocks
16607
+ * to disk blocks to map kernel images for boot time.
16609
+static int lvm_get_bmap(evms_logical_node_t * node,
16610
+ evms_get_bmap_t * bmap,
16611
+ evms_logical_node_t ** pv_node )
16613
+ lvm_logical_volume_t * volume = node->instance_data;
16614
+ lvm_physical_volume_t * pv_entry;
16615
+ evms_sector_t new_sector = 0;
16616
+ evms_sector_t new_size = 0;
16617
+ evms_sector_t pe_start_sector;
16620
+ // No kernel images allowed on snapshot LVs.
16621
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16626
+ if ( bmap->rsector >= volume->lv_size ) {
16630
+ rc = remap_sector(node, bmap->rsector, 1, &new_sector, &new_size, &pe_start_sector, &pv_entry);
16632
+ if (rc || !pv_entry || !new_sector) {
16636
+ bmap->rsector = new_sector;
16637
+ *pv_node = pv_entry->logical_node;
16643
+/* Function: lvm_global_proc_read
16645
+ * A callback function for the lvm-global proc-fs entry. This will print
16646
+ * general info about all LVM VGs, PVs, and LVs.
16648
+static int lvm_global_proc_read(char * page,
16655
+ lvm_volume_group_t * group;
16656
+ lvm_physical_volume_t * pv_entry;
16657
+ lvm_logical_volume_t * volume;
16658
+ lvm_logical_volume_t * snap;
16665
+ PROCPRINT("Enterprise Volume Management System: LVM Plugin\n");
16666
+ PROCPRINT("Plugin ID: %x.%x.%x\n",
16667
+ GetPluginOEM(lvm_plugin_header.id),
16668
+ GetPluginType(lvm_plugin_header.id),
16669
+ GetPluginID(lvm_plugin_header.id));
16670
+ PROCPRINT("Plugin Version: %d.%d.%d\n",
16671
+ lvm_plugin_header.version.major,
16672
+ lvm_plugin_header.version.minor,
16673
+ lvm_plugin_header.version.patchlevel);
16674
+ PROCPRINT("Required EVMS Services Version: %d.%d.%d\n",
16675
+ lvm_plugin_header.required_common_services_version.major,
16676
+ lvm_plugin_header.required_common_services_version.minor,
16677
+ lvm_plugin_header.required_common_services_version.patchlevel);
16679
+ // Count all existing items.
16680
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16681
+ lvs += group->volume_count;
16682
+ pvs += group->pv_count;
16687
+ PROCPRINT("Total: %d VGs %d PVs %d LVs\n", vgs, pvs, lvs);
16689
+ // Print out specifics about each VG.
16690
+ for ( group = lvm_group_list; group; group = group->next_group ) {
16692
+ PROCPRINT("VG: %s [%d PV, %d LV]\n",
16693
+ group->vg_name, group->pv_count, group->volume_count);
16694
+ PROCPRINT("PVs:\n");
16695
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16696
+ if ( pv_entry->logical_node ) {
16697
+ PROCPRINT("\t%s\t%10Ld KB\n",
16698
+ pv_entry->logical_node->name,
16699
+ pv_entry->logical_node->total_vsectors / 2);
16702
+ PROCPRINT("LVs:\n");
16703
+ for ( i = 1; i <= MAX_LV; i++ ) {
16704
+ if ( group->volume_list[i] ) {
16705
+ volume = group->volume_list[i];
16706
+ PROCPRINT("\t%s\t%10Ld KB / %5d LEs",
16708
+ volume->lv_size / 2,
16710
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16711
+ PROCPRINT("\tSnapshot of : ");
16712
+ if ( volume->snapshot_org ) {
16713
+ PROCPRINT("%s : ", volume->snapshot_org->name);
16716
+ PROCPRINT("(unknown) : ");
16718
+ PROCPRINT("%ld%% full : ", (long)(volume->next_free_chunk) * 100 / (long)(volume->lv_size));
16719
+ if ( volume->lv_status & LV_ACTIVE ) {
16720
+ PROCPRINT("active");
16723
+ PROCPRINT("disabled");
16726
+ else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
16727
+ PROCPRINT("\tSnapshotted by : ");
16728
+ for ( snap = volume->snapshot_next; snap; snap = snap->snapshot_next ) {
16729
+ PROCPRINT("%s ", snap->name);
16741
+/********** Required EVMS Plugin Functions **********/
16744
+/* Function: lvm_discover
16746
+ * This is the entry point into the LVM discovery process. It is a three
16747
+ * phase process. First, the list of nodes are examined for PVs, and the
16748
+ * appropriate volume groups are created. Then each volume group is
16749
+ * examined to find all available logical volumes. Finally, each LVM
16750
+ * logical volume has a new EVMS node created for it, and added to the
16753
+static int lvm_discover( evms_logical_node_t ** evms_node_list )
16757
+ LOG_EXTRA("Beginning discovery.\n");
16759
+ discover_volume_groups(evms_node_list);
16761
+ check_volume_groups();
16763
+ discover_logical_volumes();
16765
+ check_logical_volumes(0);
16767
+ rc = export_volumes(evms_node_list);
16769
+ LOG_EXTRA("Discovery complete.\n");
16774
+/* Function: lvm_discover_end
16776
+ * The discovery process at the region-manager level is now iterative,
16777
+ * much like the EVMS feature level. This allows the ability to stack
16778
+ * LVM on top of MD, or vice-versa. To accomplish this correctly, and
16779
+ * also to accomplish partial volume discovery, a second discover
16780
+ * entry point is needed, so EVMS can tell the region managers that
16781
+ * discovery is over, and to finish up any discovery that is not yet
16782
+ * complete. When this function is called, it should be assumed that
16783
+ * the node list has had nothing new added to it since the last call
16784
+ * of the regular discover function. Therefore, when this function is
16785
+ * called, we do not need to try to discovery any additional volume
16786
+ * groups. We will, however, look for logical volumes once more. This
16787
+ * gives us the ability to export (read-only) volumes that have
16788
+ * partially corrupted LE maps due to missing PVs in their VG.
16790
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list )
16794
+ LOG_EXTRA("Beginning final discovery\n");
16796
+ discover_volume_groups(evms_node_list);
16798
+ check_volume_groups();
16800
+ discover_logical_volumes();
16802
+ check_logical_volumes(1);
16804
+ rc = export_volumes(evms_node_list);
16808
+ LOG_EXTRA("Final discovery complete.\n");
16813
+/* Function: lvm_delete_node
16815
+ * This function deletes the in-memory representation of an LVM
16816
+ * logical volume.
16818
+static int lvm_delete_node( evms_logical_node_t * logical_node )
16820
+ lvm_logical_volume_t * volume = logical_node->instance_data;
16821
+ lvm_volume_group_t * group = volume->group;
16823
+ LOG_DEBUG("Deleting LVM node %s\n", logical_node->name);
16825
+ if ( deallocate_logical_volume(volume) ) {
16829
+ // If we just removed the last volume from this group, the entire group
16830
+ // must also be deleted.
16831
+ if ( group && group->volume_count == 0 ) {
16832
+ remove_group_from_list(group);
16833
+ deallocate_volume_group(group);
16836
+ // Free the logical node.
16837
+ evms_cs_deallocate_logical_node(logical_node);
16839
+ MOD_DEC_USE_COUNT;
16845
+/* Function: lvm_read
16847
+static void lvm_read( evms_logical_node_t * node,
16850
+ lvm_logical_volume_t * volume = node->instance_data;
16851
+ lvm_physical_volume_t * pv_entry;
16852
+ evms_sector_t pe_start_sector;
16853
+ evms_sector_t new_sector;
16854
+ evms_sector_t new_size;
16856
+ // Make sure the volume is active and readable
16857
+ if ( ! (volume->lv_access & LV_READ && volume->lv_status & LV_ACTIVE) ) {
16858
+ EVMS_IO_ERROR(eio);
16862
+ // If this volume is a snapshot, lock the volume, and do
16863
+ // the LE-PE translation on its original volume.
16864
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16865
+ down( &volume->snap_semaphore );
16866
+ if ( ! volume->snapshot_org ) {
16867
+ EVMS_IO_ERROR(eio);
16868
+ up( &volume->snap_semaphore );
16871
+ node = volume->snapshot_org->volume_node;
16874
+ // Check if I/O goes past end of logical volume. Must use the
16875
+ // node, not the volume, so snapshots will work correctly.
16876
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
16877
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16878
+ up( &volume->snap_semaphore );
16880
+ EVMS_IO_ERROR(eio);
16884
+ // Logical-to-Physical remapping. Check for incomplete volumes.
16885
+ if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
16886
+ ! pe_start_sector || ! pv_entry ) {
16887
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16888
+ up( &volume->snap_semaphore );
16890
+ EVMS_IO_ERROR(eio);
16894
+ // For snapshot volumes, check if this sector's chunk has been
16895
+ // remapped. If it has, new_sector and pv_entry will be changed
16896
+ // accordingly. If not, they remain the same.
16897
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16898
+ snapshot_remap_sector(volume, pe_start_sector , &new_sector, &pv_entry);
16901
+ eio->rsector = new_sector;
16902
+ eio->rsize = new_size;
16903
+ R_IO(pv_entry->logical_node, eio);
16905
+ // Unlock the snapshot
16906
+ if ( volume->lv_access & LV_SNAPSHOT ) {
16907
+ up( &volume->snap_semaphore );
16912
+/* Function: lvm_write
16914
+static void lvm_write( evms_logical_node_t * node,
16917
+ lvm_logical_volume_t * volume = node->instance_data;
16918
+ lvm_logical_volume_t * snap_volume;
16919
+ lvm_physical_volume_t * pv_entry;
16920
+ evms_sector_t pe_start_sector;
16921
+ evms_sector_t new_sector;
16922
+ evms_sector_t new_size;
16924
+ // Make sure the volume is active and writable
16925
+ if ( ! (volume->lv_access & LV_WRITE && volume->lv_status & LV_ACTIVE) ) {
16926
+ EVMS_IO_ERROR(eio);
16930
+ // Check if I/O goes past end of logical volume.
16931
+ if ( eio->rsector + eio->rsize > node->total_vsectors ) {
16932
+ EVMS_IO_ERROR(eio);
16936
+ // Logical-to-Physical remapping. Check for incomplete volumes.
16937
+ if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
16938
+ ! pe_start_sector || ! pv_entry ) {
16939
+ EVMS_IO_ERROR(eio);
16943
+ // Copy-on-write for snapshotting
16944
+ if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
16945
+ // Originals can be snapshotted multiple times
16946
+ for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
16947
+ if ( snapshot_copy_data(volume, snap_volume, pe_start_sector, new_sector, pv_entry) ) {
16948
+ EVMS_IO_ERROR(eio);
16954
+ eio->rsector = new_sector;
16955
+ eio->rsize = new_size;
16956
+ W_IO(pv_entry->logical_node, eio);
16960
+/* Function: lvm_init_io
16962
+ * Init_io on a snapshot volume treats it like a regular volume.
16964
+static int lvm_init_io( evms_logical_node_t * node,
16965
+ int io_flag, // 0=read, 1=write, 4=LVM-internal-write
16966
+ evms_sector_t sect_nr, // node LBA
16967
+ evms_sector_t num_sects, // # of sectors
16968
+ void * buf_addr ) // buffer address
16970
+ lvm_physical_volume_t * pv_entry;
16971
+ lvm_logical_volume_t * volume = node->instance_data;
16972
+ evms_sector_t pe_start_sector;
16973
+ evms_sector_t new_sector;
16974
+ evms_sector_t new_size;
16977
+ // Only allow internal writes to snapshots (io_flag==4). Disallow
16978
+ // writes to snapshot originals.
16979
+ if ( io_flag == 1 &&
16980
+ volume->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG) ) {
16983
+ // The node for a snapshot reports the size of the original. If a
16984
+ // request comes in in that range, just return.
16985
+ else if ( volume->lv_access & LV_SNAPSHOT &&
16986
+ sect_nr >= volume->lv_size &&
16987
+ sect_nr < node->total_vsectors ) {
16988
+ if ( io_flag == 0 ) {
16989
+ memset( buf_addr, 0, num_sects << EVMS_VSECTOR_SIZE_SHIFT );
16993
+ // Regular range check.
16994
+ else if ( sect_nr + num_sects > volume->lv_size ) {
16998
+ if ( io_flag == 4 ) {
17002
+ // Init IO needs to deal with the possibility of a request that spans
17003
+ // PEs or stripes. This is possible because there is no limit on
17004
+ // num_sects. To handle this, we loop through remap_sector and
17005
+ // INIT_IO until num_sects reaches zero.
17006
+ while ( num_sects ) {
17007
+ if ( remap_sector(node, sect_nr, num_sects, &new_sector, &new_size, &pe_start_sector, &pv_entry) ) {
17010
+ // If the volume is incomplete, clear the buffer (on a read).
17011
+ if ( !pe_start_sector || !pv_entry ) {
17012
+ if ( io_flag == 0 ) {
17013
+ memset(buf_addr, 0, new_size << EVMS_VSECTOR_SIZE_SHIFT);
17017
+ rc = INIT_IO(pv_entry->logical_node, io_flag, new_sector, new_size, buf_addr);
17019
+ num_sects -= new_size;
17020
+ sect_nr += new_size;
17021
+ buf_addr = (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
17028
+/* Function: lvm_ioctl
17030
+static int lvm_ioctl( evms_logical_node_t * logical_node,
17031
+ struct inode * inode,
17032
+ struct file * file,
17033
+ unsigned int cmd,
17034
+ unsigned long arg)
17036
+ lvm_logical_volume_t * volume = logical_node->instance_data;
17039
+ LOG_ENTRY_EXIT("--lvm: Ioctl %d\n",cmd);
17043
+ case HDIO_GETGEO:
17045
+ // Fixed geometry for all LVM volumes
17046
+ unsigned char heads = 64;
17047
+ unsigned char sectors = 32;
17049
+ struct hd_geometry *hd = (struct hd_geometry *)arg;
17051
+ cylinders = logical_node->total_vsectors;
17052
+ cylinders = (cylinders / heads) / sectors;
17054
+ if (hd == NULL) {
17058
+ if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
17059
+ copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
17060
+ copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
17061
+ copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
17067
+ case LV_SET_ACCESS:
17068
+ // Set access flags of a logical volume
17069
+ // If we decide to make a volume read-only, how do we
17070
+ // tell the EVMS level?
17072
+ if (!capable(CAP_SYS_ADMIN)) return -EACCES;
17073
+ lv_ptr->lv_access = (ulong) arg;
17074
+ if ( lv_ptr->lv_access & LV_WRITE)
17075
+ set_device_ro(lv_ptr->lv_dev, 0);
17077
+ set_device_ro(lv_ptr->lv_dev, 1);
17082
+ case LV_SET_STATUS:
17083
+ // Set status flags of a logical volume
17085
+ if (!capable(CAP_SYS_ADMIN)) return -EACCES;
17086
+ if (!((ulong) arg & LV_ACTIVE) && lv_ptr->lv_open > 1)
17088
+ lv_ptr->lv_status = (ulong) arg;
17093
+ case EVMS_QUIESCE_VOLUME:
17095
+ evms_quiesce_volume_t * tmp = (evms_quiesce_volume_t*)arg;
17096
+ if ( tmp->command ) { // Quiesce
17097
+ volume->lv_access |= EVMS_LV_QUIESCED;
17099
+ else { // Un-quiesce
17100
+ volume->lv_access &= ~EVMS_LV_QUIESCED;
17105
+ case EVMS_GET_BMAP:
17107
+ evms_get_bmap_t * bmap = (evms_get_bmap_t*)arg;
17108
+ evms_logical_node_t * pv_node;
17110
+ rc = lvm_get_bmap(logical_node, bmap, &pv_node);
17112
+ rc = IOCTL(pv_node, inode, file, cmd, (unsigned long)bmap);
17117
+ case EVMS_GET_DISK_LIST:
17118
+ case EVMS_CHECK_MEDIA_CHANGE:
17119
+ case EVMS_REVALIDATE_DISK:
17120
+ case EVMS_OPEN_VOLUME:
17121
+ case EVMS_CLOSE_VOLUME:
17123
+ // These five ioctl all need to be broadcast to all PVs.
17124
+ lvm_volume_group_t * group = volume->group;
17125
+ lvm_physical_volume_t * pv_entry;
17126
+ for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
17127
+ rc |= IOCTL(pv_entry->logical_node, inode, file, cmd, arg);
17133
+ // Currently LVM does not send any ioctl's down to the
17134
+ // PVs. Which PV would they go to? What would we do with
17135
+ // the return codes?
17143
+/* Function: lvm_direct_ioctl
17145
+ * This function provides a method for user-space to communicate directly
17146
+ * with a plugin in the kernel.
17148
+static int lvm_direct_ioctl( struct inode * inode,
17149
+ struct file * file,
17150
+ unsigned int cmd,
17151
+ unsigned long args )
17153
+ evms_plugin_ioctl_t argument;
17156
+ // Copy user's parameters to kernel space
17157
+ if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) ) {
17161
+ // Make sure this is supposed to be our ioctl.
17162
+ if ( argument.feature_id != lvm_plugin_header.id ) {
17166
+ switch(argument.feature_command) {
17168
+ case EVMS_LVM_PV_REMOVE_IOCTL:
17170
+ lvm_pv_remove_ioctl_t pv_remove;
17171
+ if ( copy_from_user(&pv_remove, (lvm_pv_remove_ioctl_t*)argument.feature_ioctl_data, sizeof(pv_remove)) ) {
17175
+ rc = remove_pv_from_group(pv_remove.pv_number, pv_remove.vg_uuid);
17179
+ case EVMS_LVM_SNAPSHOT_STAT_IOCTL:
17181
+ lvm_snapshot_stat_ioctl_t snap_stats;
17182
+ if ( copy_from_user(&snap_stats, (lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, sizeof(snap_stats)) ) {
17186
+ rc = get_snapshot_stats(&snap_stats);
17187
+ if ( copy_to_user((lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, &snap_stats, sizeof(snap_stats)) ) {
17199
+ argument.status = rc;
17200
+ copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
17205
+/* Function: lvm_vge_init
17207
+int __init lvm_vge_init(void)
17209
+ struct proc_dir_entry * pde;
17211
+ lvm_group_list = NULL;
17214
+ // Register the global proc-fs entries.
17215
+ pde = evms_cs_get_evms_proc_dir();
17217
+ lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde);
17218
+ if ( lvm_proc ) {
17219
+ create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG, lvm_proc, lvm_global_proc_read, NULL);
17223
+ // Register this plugin with EVMS.
17224
+ return evms_cs_register_plugin(&lvm_plugin_header);
17228
+/* Function: lvm_vge_exit
17230
+void __exit lvm_vge_exit(void)
17232
+ lvm_volume_group_t * group;
17233
+ lvm_volume_group_t * next_group;
17234
+ struct proc_dir_entry * pde;
17237
+ // If LVM is called for module_exit, that means the reference
17238
+ // count must be zero, which means there should be no volumes,
17239
+ // and thus no volume groups. But, check anyway and delete
17240
+ // any volumes and groups that are still hanging around.
17241
+ if ( lvm_group_list ) {
17242
+ LOG_SERIOUS("Called for module_exit, but group list is not empty!\n");
17244
+ for ( group = lvm_group_list; group; group = next_group ) {
17245
+ next_group = group->next_group;
17247
+ LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n", group->vg_name);
17249
+ for ( i = 1; i <= MAX_LV; i++ ) {
17250
+ if ( group->volume_list[i] ) {
17251
+ lvm_delete_node(group->volume_list[i]->volume_node);
17256
+ // Unregister the proc-fs entries.
17257
+ pde = evms_cs_get_evms_proc_dir();
17259
+ remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc);
17260
+ remove_proc_entry(LVM_PROC_NAME, pde);
17263
+ // Unregister this plugin from EVMS.
17264
+ evms_cs_unregister_plugin(&lvm_plugin_header);
17268
+module_init(lvm_vge_init);
17269
+module_exit(lvm_vge_exit);
17270
+#ifdef MODULE_LICENSE
17271
+MODULE_LICENSE("GPL");
17274
diff -Naur linux-2002-03-28/drivers/evms/md_core.c evms-2002-03-28/drivers/evms/md_core.c
17275
--- linux-2002-03-28/drivers/evms/md_core.c Wed Dec 31 18:00:00 1969
17276
+++ evms-2002-03-28/drivers/evms/md_core.c Thu Mar 28 08:37:22 2002
17279
+ * Copyright (c) International Business Machines Corp., 2000
17281
+ * This program is free software; you can redistribute it and/or modify
17282
+ * it under the terms of the GNU General Public License as published by
17283
+ * the Free Software Foundation; either version 2 of the License, or
17284
+ * (at your option) any later version.
17286
+ * This program is distributed in the hope that it will be useful,
17287
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17288
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17289
+ * the GNU General Public License for more details.
17291
+ * You should have received a copy of the GNU General Public License
17292
+ * along with this program; if not, write to the Free Software
17293
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17296
+ * linux/drivers/evms/md_core.c
17298
+ * EVMS Linux MD Region Manager
17303
+#include <linux/module.h>
17304
+#include <linux/kmod.h>
17305
+#include <linux/kernel.h>
17306
+#include <linux/config.h>
17307
+#include <linux/genhd.h>
17308
+#include <linux/major.h>
17309
+#include <linux/string.h>
17310
+#include <linux/blk.h>
17311
+#include <linux/init.h>
17312
+#include <linux/slab.h>
17313
+#include <linux/vmalloc.h>
17314
+#include <linux/evms/evms_kernel.h>
17315
+#include <linux/evms/evms_md.h>
17316
+#include <linux/sysctl.h>
17317
+#include <asm/system.h>
17318
+#include <asm/uaccess.h>
17320
+#define LOG_PREFIX "md core: "
17323
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
17324
+ * is 100 KB/sec, so the extra system load does not show up that much.
17325
+ * Increase it if you want to have more _guaranteed_ speed. Note that
17326
+ * the RAID driver will use the maximum available bandwith if the IO
17327
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
17328
+ * speed limit - in case reconstruction slows down your system despite
17329
+ * idle IO detection.
17331
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
17334
+static MD_LIST_HEAD(all_raid_disks);
17335
+static MD_LIST_HEAD(pending_raid_disks);
17337
+static int sysctl_speed_limit_min = 100;
17338
+static int sysctl_speed_limit_max = 100000;
17341
+static mdk_personality_t *pers[MAX_PERSONALITY];
17343
+static int md_blocksizes[MAX_MD_DEVS];
17344
+static int md_hardsect_sizes[MAX_MD_DEVS];
17345
+int evms_md_size[MAX_MD_DEVS];
17346
+static evms_thread_t *evms_md_recovery_thread;
17349
+ * Enables to iterate over all existing md arrays
17351
+static MD_LIST_HEAD(all_mddevs);
17354
+ * The mapping between kdev and mddev is not necessary a simple
17355
+ * one! Eg. HSM uses several sub-devices to implement Logical
17356
+ * Volumes. All these sub-devices map to the same mddev.
17358
+dev_mapping_t evms_mddev_map[MAX_MD_DEVS];
17361
+static md_spinlock_t activate_spare_list_lock = MD_SPIN_LOCK_UNLOCKED;
17362
+static evms_md_activate_spare_t *evms_activate_spare_list = NULL, **evms_activate_spare_tail;
17364
+/* Support functions for discovery */
17365
+static int evms_md_import_device (evms_logical_node_t **discover_list,
17366
+ evms_logical_node_t *node,
17368
+static void evms_md_autostart_arrays(evms_logical_node_t **discover_list);
17369
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list,
17370
+ kdev_t countdev);
17371
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list,
17373
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
17374
+ mddev_t *mddev, uint flags);
17375
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev);
17376
+static int evms_md_analyze_sbs (mddev_t * mddev);
17377
+static mddev_t * alloc_mddev (kdev_t dev);
17378
+static void free_mddev(mddev_t * mddev);
17379
+static int do_md_run (mddev_t * mddev);
17380
+static int do_md_stop (mddev_t * mddev, int ro);
17382
+static void kick_rdev_from_array (mdk_rdev_t * rdev);
17383
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev);
17384
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb);
17386
+/* Plugin API prototypes */
17387
+static int md_discover( evms_logical_node_t ** discover_list );
17388
+static int md_end_discover( evms_logical_node_t ** discover_list );
17389
+static int md_delete( evms_logical_node_t * node);
17390
+static void md_read( evms_logical_node_t * node,
17392
+static void md_write( evms_logical_node_t * node,
17394
+static int md_init_io( evms_logical_node_t * node,
17396
+ evms_sector_t sect_nr,
17397
+ evms_sector_t num_sects,
17398
+ void * buf_addr );
17399
+static int md_ioctl( evms_logical_node_t * node,
17400
+ struct inode * inode,
17401
+ struct file * file,
17402
+ unsigned int cmd,
17403
+ unsigned long arg);
17404
+static int md_ioctl_cmd_broadcast(
17405
+ evms_logical_node_t *node,
17406
+ struct inode *inode,
17407
+ struct file *file,
17408
+ unsigned long cmd,
17409
+ unsigned long arg);
17411
+static int md_direct_ioctl(
17412
+ struct inode * inode,
17413
+ struct file * file,
17414
+ unsigned int cmd,
17415
+ unsigned long arg);
17417
+/* global MD data structures */
17418
+static evms_plugin_function_table_t md_function_table = {
17419
+ discover : &md_discover,
17420
+ end_discover : &md_end_discover,
17421
+ delete : &md_delete,
17423
+ write : &md_write,
17424
+ init_io : &md_init_io,
17425
+ ioctl : &md_ioctl,
17426
+ direct_ioctl : &md_direct_ioctl
17429
+static evms_plugin_header_t md_plugin_header = {
17430
+ id : SetPluginID(
17432
+ EVMS_REGION_MANAGER,
17435
+ major : MD_MAJOR_VERSION,
17436
+ minor : MD_MINOR_VERSION,
17437
+ patchlevel : MD_PATCHLEVEL_VERSION
17439
+ required_common_services_version: {
17440
+ major : EVMS_MD_COMMON_SERVICES_MAJOR,
17441
+ minor : EVMS_MD_COMMON_SERVICES_MINOR,
17442
+ patchlevel : EVMS_MD_COMMON_SERVICES_PATCHLEVEL
17444
+ function_table : &md_function_table
17447
+/* local instance data structure definition */
17448
+typedef struct md_instance_data_s {
17450
+} md_instance_data_t;
17452
+/* global variables */
17453
+static int exported_nodes; /* total # of exported devices
17454
+ * produced during this discovery.
17456
+static evms_logical_node_t **cur_discover_list = NULL;
17458
+/**********************************************************/
17459
+/* SYSCTL - EVMS/RAID folder */
17460
+/**********************************************************/
17462
+#ifdef CONFIG_PROC_FS
17463
+static struct ctl_table_header *md_table_header;
17465
+static ctl_table md_table[] = {
17466
+ {DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min",
17467
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
17468
+ {DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max",
17469
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
17473
+static ctl_table md_dir_table[] = {
17474
+ {DEV_EVMS_MD, "md", NULL, 0, 0555, md_table},
17478
+static ctl_table evms_dir_table[] = {
17479
+ {DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table},
17483
+static ctl_table dev_dir_table[] = {
17484
+ {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
17488
+/********** Required EVMS Plugin Functions **********/
17491
+ * Function: md_discover
17492
+ * We should only export complete MD device nodes
17494
+static int md_discover( evms_logical_node_t ** discover_list )
17496
+ LOG_ENTRY_EXIT("md_discover() ENTRY\n");
17498
+ /* initialize global variable */
17499
+ exported_nodes = 0;
17500
+ cur_discover_list = discover_list;
17501
+ evms_md_autostart_arrays(discover_list);
17503
+ LOG_ENTRY_EXIT("md_discover() EXIT (exported nodes: %d)\n", exported_nodes);
17504
+ cur_discover_list = NULL;
17505
+ return(exported_nodes);
17510
+ * Function: md_discover_end
17512
+static int md_end_discover( evms_logical_node_t ** discover_list )
17516
+ struct md_list_head *tmp;
17517
+ int done = FALSE;
17519
+ rc = md_discover(discover_list);
17523
+ ITERATE_MDDEV(mddev,tmp){
17524
+ if (!mddev->nr_raid_disks) {
17525
+ free_mddev(mddev);
17529
+ if (mddev->flag & EVMS_MD_INCOMPLETE) {
17530
+ LOG_DETAILS("trying to run incomplete array md%d\n", mdidx(mddev));
17531
+ evms_md_autorun_array(discover_list,mddev);
17543
+ * Function: md_delete_node
17545
+static int md_delete( evms_logical_node_t * node)
17547
+ md_instance_data_t *MDID;
17550
+ MDID = node->instance_data;
17551
+ mddev = MDID->mddev;
17553
+ LOG_DEFAULT("md_delete() name=%s\n", evms_md_partition_name(node));
17555
+ do_md_stop(mddev,0);
17557
+ evms_cs_deallocate_memory(MDID);
17558
+ evms_cs_deallocate_logical_node(node);
17564
+ * Function: md_read
17566
+static void md_read( evms_logical_node_t * node,
17569
+ md_instance_data_t *MDID;
17572
+ MDID = node->instance_data;
17573
+ mddev = MDID->mddev;
17574
+ if ((eio->rsector + eio->rsize) > node->total_vsectors)
17575
+ EVMS_IO_ERROR(eio);
17577
+ if (mddev && mddev->pers)
17578
+ mddev->pers->make_request(mddev, READ, eio);
17584
+ * Function: md_write
17586
+static void md_write( evms_logical_node_t * node,
17589
+ md_instance_data_t *MDID;
17592
+ MDID = node->instance_data;
17593
+ mddev = MDID->mddev;
17594
+ if ((eio->rsector + eio->rsize) > node->total_vsectors)
17595
+ EVMS_IO_ERROR(eio);
17597
+ if (mddev && mddev->pers)
17598
+ mddev->pers->make_request(mddev, WRITE, eio);
17604
+ * Function: md_init_io
17606
+static int md_init_io( evms_logical_node_t * node,
17608
+ evms_sector_t sect_nr,
17609
+ evms_sector_t num_sects, /* # of sectors */
17610
+ void * buf_addr ) /* buffer address */
17612
+ md_instance_data_t *MDID;
17616
+ MDID = node->instance_data;
17617
+ mddev = MDID->mddev;
17618
+ if (sect_nr + num_sects > node->total_vsectors) {
17619
+ LOG_ERROR(" md_init_io() attempt to %s beyond MD device(%s) boundary(%Lu) with sect_nr(%Lu) and num_sects(%Lu)\n",
17620
+ rw ? "WRITE" : "READ", evms_md_partition_name(node),node->total_vsectors,sect_nr,num_sects);
17623
+ if (!rc && mddev && mddev->pers)
17624
+ rc = mddev->pers->init_io(mddev, rw, sect_nr, num_sects, buf_addr);
17632
+ * Function: md_ioctl
17634
+static int md_ioctl(
17635
+ evms_logical_node_t * node,
17636
+ struct inode * inode,
17637
+ struct file * file,
17638
+ unsigned int cmd,
17639
+ unsigned long arg)
17641
+ md_instance_data_t * MDID = node->instance_data;
17645
+ if ((!inode) || (!MDID) )
17651
+ * We have a problem here : there is no easy way to give a CHS
17652
+ * virtual geometry. We currently pretend that we have a 2 heads
17653
+ * 4 sectors (with a BIG number of cylinders...). This drives
17654
+ * dosfs just mad... ;-)
17657
+ case HDIO_GETGEO:
17659
+ struct hd_geometry hdgeo;
17661
+ hdgeo.sectors = 4;
17662
+ hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
17663
+ hdgeo.heads / hdgeo.sectors;
17665
+ if (copy_to_user((int *)arg,
17671
+ case EVMS_QUIESCE_VOLUME:
17672
+ case EVMS_GET_DISK_LIST:
17673
+ case EVMS_CHECK_MEDIA_CHANGE:
17674
+ case EVMS_REVALIDATE_DISK:
17675
+ case EVMS_OPEN_VOLUME:
17676
+ case EVMS_CLOSE_VOLUME:
17677
+ rc = md_ioctl_cmd_broadcast(
17678
+ node, inode, file, cmd, arg);
17680
+ case EVMS_PLUGIN_IOCTL:
17681
+ rc = md_direct_ioctl(
17682
+ inode, file, cmd, arg);
17685
+ mddev = MDID->mddev;
17686
+ if (mddev == NULL) {
17688
+ } else if (mddev->pers->evms_ioctl == NULL) {
17691
+ rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg);
17698
+static int md_ioctl_cmd_broadcast(
17699
+ evms_logical_node_t *node,
17700
+ struct inode *inode,
17701
+ struct file *file,
17702
+ unsigned long cmd,
17703
+ unsigned long arg)
17706
+ md_instance_data_t *MDID;
17708
+ struct md_list_head *tmp;
17709
+ mdk_rdev_t *rdev;
17711
+ MDID = node->instance_data;
17712
+ mddev = MDID->mddev;
17714
+ /* broadcast this cmd to all children */
17715
+ ITERATE_RDEV(mddev,rdev,tmp) {
17716
+ if (!rdev->mddev) {
17720
+ if (!rdev->virtual_spare) {
17721
+ rc |= IOCTL(rdev->node, inode, file, cmd, arg);
17728
+static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev)
17730
+ mdk_rdev_t *rdev;
17731
+ mdp_disk_t *disk = NULL;
17734
+ if (evms_md_find_rdev(mddev,dev))
17737
+ LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__);
17738
+ if( evms_cs_allocate_memory((void**)&rdev, sizeof(*rdev)))
17741
+ memset(rdev, 0, sizeof(*rdev));
17743
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
17744
+ disk = mddev->sb->disks + i;
17745
+ if (!disk->major && !disk->minor)
17747
+ if (disk_removed(disk))
17750
+ if (i == MD_SB_DISKS) {
17751
+ LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev));
17752
+ evms_cs_deallocate_memory(rdev);
17756
+ if (disk_removed(disk)) {
17760
+ if (disk->number != i) {
17762
+ evms_cs_deallocate_memory(rdev);
17766
+ disk->number = i;
17769
+ disk->raid_disk = disk->number;
17770
+ disk->major = MAJOR(dev);
17771
+ disk->minor = MINOR(dev);
17773
+ mark_disk_spare(disk);
17775
+ rdev->mddev = mddev;
17777
+ rdev->desc_nr = disk->number;
17778
+ rdev->virtual_spare = 1;
17780
+ /* bind rdev to mddev array */
17781
+ md_list_add(&rdev->all, &all_raid_disks);
17782
+ md_list_add(&rdev->same_set, &mddev->disks);
17783
+ MD_INIT_LIST_HEAD(&rdev->pending);
17785
+ mddev->sb->nr_disks++;
17786
+ mddev->sb->spare_disks++;
17787
+ mddev->sb->working_disks++;
17790
+ mddev->sb_dirty = 1;
17792
+ evms_md_update_sb(mddev);
17797
+static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev)
17799
+ mdk_rdev_t *rdev = NULL;
17800
+ mdp_disk_t *disk;
17803
+ disk = evms_md_find_disk(mddev,dev);
17807
+ rdev = evms_md_find_rdev(mddev,dev);
17809
+ if (rdev && !rdev->faulty) {
17811
+ * The disk is active in the array,
17812
+ * must ask the personality to do it
17814
+ if (mddev->pers && mddev->pers->diskop) {
17815
+ /* Assume spare, try to remove it first. */
17816
+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
17818
+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
17824
+ remove_descriptor(disk,mddev->sb);
17826
+ kick_rdev_from_array(rdev);
17827
+ mddev->sb_dirty = 1;
17828
+ evms_md_update_sb(mddev);
17834
+static int evms_md_activate_spare(mddev_t *mddev, kdev_t dev)
17836
+ mdk_rdev_t *rdev = NULL;
17837
+ evms_md_activate_spare_t activate_spare;
17838
+ unsigned long flags;
17841
+ rdev = evms_md_find_rdev(mddev,dev);
17843
+ if (mddev->recovery_running) {
17846
+ activate_spare.mddev = mddev;
17847
+ activate_spare.spare = &mddev->sb->disks[rdev->sb->this_disk.number];
17848
+ md_spin_lock_irqsave(&activate_spare_list_lock, flags);
17849
+ if (evms_activate_spare_list == NULL)
17850
+ evms_activate_spare_tail = &evms_activate_spare_list;
17851
+ *evms_activate_spare_tail = &activate_spare;
17852
+ evms_activate_spare_tail = &activate_spare.next;
17853
+ activate_spare.next = NULL;
17854
+ md_spin_unlock_irqrestore(&activate_spare_list_lock, flags);
17856
+ mddev->sb->raid_disks++;
17857
+ evms_md_recover_arrays();
17865
+static int evms_md_deactivate_disk(mddev_t *mddev, kdev_t dev)
17867
+ mdk_rdev_t *rdev = NULL;
17868
+ mdp_disk_t *disk;
17871
+ disk = evms_md_find_disk(mddev,dev);
17872
+ rdev = evms_md_find_rdev(mddev,dev);
17873
+ if (!disk || !rdev || rdev->faulty)
17876
+ /* Make sure it's not a spare */
17877
+ if (disk_spare(disk))
17880
+ * The disk is active in the array,
17881
+ * must ask the personality to do it
17883
+ if (mddev->pers && mddev->pers->diskop) {
17884
+ rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_DEACTIVATE_DISK);
17886
+ mark_disk_spare(disk);
17887
+ mddev->sb->active_disks--;
17888
+ mddev->sb->raid_disks--;
17889
+ mddev->sb->spare_disks++;
17890
+ mddev->sb_dirty = 1;
17891
+ evms_md_update_sb(mddev);
17901
+ * Function: md_direct_ioctl
17903
+ * This function provides a method for user-space to communicate directly
17904
+ * with a plugin in the kernel.
17906
+static int md_direct_ioctl(
17907
+ struct inode * inode,
17908
+ struct file * file,
17909
+ unsigned int cmd,
17910
+ unsigned long args )
17912
+ evms_plugin_ioctl_t argument;
17914
+ mddev_t *mddev = NULL;
17915
+ evms_md_ioctl_t ioctl_arg;
17916
+ evms_md_kdev_t device;
17917
+ evms_md_array_info_t array_info, *usr_array_info;
17920
+ // Copy user's parameters to kernel space
17921
+ if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) )
17924
+ // Make sure this is supposed to be our ioctl.
17925
+ if ( argument.feature_id != md_plugin_header.id )
17928
+ // Copy user's md ioclt parmeters to kernel space
17929
+ if ( copy_from_user(&ioctl_arg,
17930
+ (evms_md_ioctl_t*)argument.feature_ioctl_data,
17931
+ sizeof(ioctl_arg)) )
17934
+ if (ioctl_arg.mddev_idx < MAX_MD_DEVS) {
17935
+ md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx);
17936
+ mddev = kdev_to_mddev(md_kdev);
17937
+ if (mddev == NULL)
17944
+ switch(argument.feature_command) {
17945
+ case EVMS_MD_PERS_IOCTL_CMD:
17946
+ if (mddev->pers->md_pers_ioctl == NULL)
17948
+ rc = mddev->pers->md_pers_ioctl(mddev,
17951
+ copy_to_user((evms_md_ioctl_t*)argument.feature_ioctl_data,
17953
+ sizeof(ioctl_arg));
17956
+ case EVMS_MD_ADD:
17957
+ if ( copy_from_user(&device,
17958
+ (evms_md_kdev_t*)ioctl_arg.arg,
17959
+ sizeof(device)) )
17962
+ rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor));
17965
+ case EVMS_MD_REMOVE:
17966
+ if ( copy_from_user(&device,
17967
+ (evms_md_kdev_t*)ioctl_arg.arg,
17968
+ sizeof(device)) )
17971
+ rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor));
17974
+ case EVMS_MD_ACTIVATE:
17975
+ if ( copy_from_user(&device,
17976
+ (evms_md_kdev_t*)ioctl_arg.arg,
17977
+ sizeof(device)) )
17980
+ rc = evms_md_activate_spare(mddev,MKDEV(device.major, device.minor));
17983
+ case EVMS_MD_DEACTIVATE:
17984
+ if ( copy_from_user(&device,
17985
+ (evms_md_kdev_t*)ioctl_arg.arg,
17986
+ sizeof(device)) )
17989
+ rc = evms_md_deactivate_disk(mddev,MKDEV(device.major, device.minor));
17992
+ case EVMS_MD_GET_ARRAY_INFO:
17994
+ usr_array_info = (evms_md_array_info_t*)ioctl_arg.arg;
17995
+ if ( copy_from_user(&array_info, usr_array_info,
17996
+ sizeof(array_info)) )
17999
+ array_info.state = 0;
18000
+ if (mddev->curr_resync)
18001
+ array_info.state |= EVMS_MD_ARRAY_SYNCING;
18002
+ copy_to_user(&usr_array_info->state, &array_info.state,
18003
+ sizeof(usr_array_info->state));
18004
+ if (copy_to_user(array_info.sb, mddev->sb,
18005
+ sizeof(mdp_super_t)))
18015
+ argument.status = rc;
18016
+ copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
18023
+void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
18025
+ unsigned int minor = MINOR(dev);
18027
+ if (MAJOR(dev) != MD_MAJOR) {
18031
+ if (evms_mddev_map[minor].mddev != NULL) {
18035
+ evms_mddev_map[minor].mddev = mddev;
18036
+ evms_mddev_map[minor].data = data;
18039
+void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev)
18041
+ unsigned int minor = MINOR(dev);
18043
+ if (MAJOR(dev) != MD_MAJOR) {
18047
+ if (evms_mddev_map[minor].mddev != mddev) {
18051
+ evms_mddev_map[minor].mddev = NULL;
18052
+ evms_mddev_map[minor].data = NULL;
18055
+static mddev_t * alloc_mddev (kdev_t dev)
18059
+ if (MAJOR(dev) != MD_MAJOR) {
18063
+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
18067
+ memset(mddev, 0, sizeof(*mddev));
18069
+ mddev->__minor = MINOR(dev);
18070
+ init_MUTEX(&mddev->reconfig_sem);
18071
+ init_MUTEX(&mddev->recovery_sem);
18072
+ init_MUTEX(&mddev->resync_sem);
18073
+ MD_INIT_LIST_HEAD(&mddev->disks);
18074
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
18075
+ atomic_set(&mddev->active, 0);
18078
+ * The 'base' mddev is the one with data NULL.
18079
+ * personalities can create additional mddevs
18082
+ evms_md_add_mddev_mapping(mddev, dev, 0);
18083
+ md_list_add(&mddev->all_mddevs, &all_mddevs);
18085
+ MOD_INC_USE_COUNT;
18090
+mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr)
18092
+ mdk_rdev_t * rdev;
18093
+ struct md_list_head *tmp;
18095
+ ITERATE_RDEV(mddev,rdev,tmp) {
18096
+ if (rdev->desc_nr == nr)
18103
+mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev)
18105
+ struct md_list_head *tmp;
18106
+ mdk_rdev_t *rdev;
18108
+ ITERATE_RDEV(mddev,rdev,tmp) {
18109
+ if (rdev->dev == dev)
18115
+mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, evms_logical_node_t * node)
18117
+ struct md_list_head *tmp;
18118
+ mdk_rdev_t *rdev;
18120
+ ITERATE_RDEV(mddev,rdev,tmp) {
18121
+ if (rdev->node == node)
18127
+static MD_LIST_HEAD(device_names);
18129
+static char * org_partition_name (kdev_t dev)
18131
+ struct gendisk *hd;
18132
+ static char nomem [] = "<nomem>";
18133
+ dev_name_t *dname;
18134
+ struct md_list_head *tmp = device_names.next;
18136
+ while (tmp != &device_names) {
18137
+ dname = md_list_entry(tmp, dev_name_t, list);
18138
+ if (dname->dev == dev)
18139
+ return dname->name;
18143
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
18148
+ * ok, add this new device name to the list
18150
+ hd = get_gendisk (dev);
18151
+ dname->name = NULL;
18153
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
18154
+ if (!dname->name) {
18155
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
18156
+ dname->name = dname->namebuf;
18159
+ dname->dev = dev;
18160
+ MD_INIT_LIST_HEAD(&dname->list);
18161
+ md_list_add(&dname->list, &device_names);
18163
+ return dname->name;
18167
+#define EVMS_MD_NULL_PARTITION_NAME "<EVMS_NODE_NO_NAME>"
18168
+char * evms_md_partition_name (evms_logical_node_t *node)
18170
+ if (node && node->name)
18171
+ return node->name;
18173
+ return EVMS_MD_NULL_PARTITION_NAME;
18176
+static char * get_partition_name (mdk_rdev_t *rdev)
18179
+ return evms_md_partition_name(rdev->node);
18181
+ return org_partition_name(rdev->dev);
18185
+ * Function: evms_md_calc_dev_sboffset
18186
+ * return the LSN for md super block.
18188
+static u_int64_t evms_md_calc_dev_sboffset (evms_logical_node_t *node,mddev_t *mddev, int persistent)
18190
+ u_int64_t size = 0;
18192
+ size = node->total_vsectors;
18193
+ if (persistent) {
18194
+ size = MD_NEW_SIZE_SECTORS(size);
18196
+ return size; /* size in sectors */
18200
+ * Function: evms_md_calc_dev_size
18201
+ * return data size (in blocks) for an "extended" device.
18203
+static unsigned long evms_md_calc_dev_size (evms_logical_node_t *node,
18207
+ unsigned long size;
18208
+ u_int64_t size_in_sectors;
18210
+ size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent);
18211
+ size = size_in_sectors >> 1;
18212
+ if (!mddev->sb) {
18216
+ if (mddev->sb->chunk_size)
18217
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
18221
+static unsigned int zoned_raid_size (mddev_t *mddev)
18223
+ unsigned int mask;
18224
+ mdk_rdev_t * rdev;
18225
+ struct md_list_head *tmp;
18227
+ if (!mddev->sb) {
18232
+ * do size and offset calculations.
18234
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
18236
+ ITERATE_RDEV(mddev,rdev,tmp) {
18237
+ rdev->size &= mask;
18238
+ evms_md_size[mdidx(mddev)] += rdev->size;
18244
+ * We check wether all devices are numbered from 0 to nb_dev-1. The
18245
+ * order is guaranteed even after device name changes.
18247
+ * Some personalities (raid0, linear) use this. Personalities that
18248
+ * provide data have to be able to deal with loss of individual
18249
+ * disks, so they do their checking themselves.
18251
+int evms_md_check_ordering (mddev_t *mddev)
18254
+ mdk_rdev_t *rdev;
18255
+ struct md_list_head *tmp;
18258
+ * First, all devices must be fully functional
18260
+ ITERATE_RDEV(mddev,rdev,tmp) {
18261
+ if (rdev->faulty) {
18262
+ LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n",
18263
+ mdidx(mddev), get_partition_name(rdev));
18269
+ ITERATE_RDEV(mddev,rdev,tmp) {
18272
+ if (c != mddev->nb_dev) {
18276
+ if (mddev->nb_dev != mddev->sb->raid_disks) {
18277
+ LOG_ERROR("[md%d] array needs %d disks, has %d, aborting.\n",
18278
+ mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
18282
+ * Now the numbering check
18284
+ for (i = 0; i < mddev->nb_dev; i++) {
18286
+ ITERATE_RDEV(mddev,rdev,tmp) {
18287
+ if (rdev->desc_nr == i)
18291
+ LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i);
18295
+ LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i);
18304
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
18306
+ if (disk_active(disk)) {
18307
+ sb->working_disks--;
18309
+ if (disk_spare(disk)) {
18310
+ sb->spare_disks--;
18311
+ sb->working_disks--;
18313
+ sb->failed_disks--;
18317
+ disk->major = disk->minor = 0;
18318
+ mark_disk_removed(disk);
18321
+#define BAD_MAGIC \
18322
+"invalid raid superblock magic on %s\n"
18324
+#define BAD_MINOR \
18325
+"%s: invalid raid minor (%x)\n"
18328
+"disabled device %s, could not read superblock.\n"
18330
+#define BAD_CSUM \
18331
+"invalid superblock checksum on %s\n"
18334
+static int alloc_array_sb (mddev_t * mddev)
18341
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
18342
+ if (!mddev->sb) {
18343
+ LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
18346
+ md_clear_page(mddev->sb);
18350
+static int alloc_disk_sb (mdk_rdev_t * rdev)
18355
+ rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
18357
+ LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
18360
+ md_clear_page(rdev->sb);
18366
+ * Function: free_disk_sb
18369
+static void free_disk_sb (mdk_rdev_t * rdev)
18372
+ free_page((unsigned long) rdev->sb);
18374
+ rdev->sb_offset = 0;
18377
+ if (!rdev->virtual_spare && !rdev->faulty)
18383
+ * Function: evms_md_read_disk_sb
18384
+ * Read the MD superblock.
18386
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev)
18389
+ evms_logical_node_t *node = rdev->node;
18390
+ u_int64_t sb_offset_in_sectors;
18396
+ if (node->total_vsectors <= MD_RESERVED_SECTORS) {
18397
+ LOG_DETAILS("%s is too small, total_vsectors(%Lu)\n",
18398
+ evms_md_partition_name(node), node->total_vsectors);
18403
+ * Calculate the position of the superblock,
18404
+ * it's at the end of the disk
18406
+ sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1);
18407
+ rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1);
18408
+ LOG_DEBUG("(read) %s's sb offset(%Lu) total_vsectors(%Lu)\n",
18409
+ evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors);
18412
+ * Read superblock
18414
+ rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb);
18417
+ LOG_DEBUG(" [events: %x]\n", rdev->sb->events_lo);
18419
+ LOG_ERROR(NO_SB, evms_md_partition_name(node));
18424
+static unsigned int calc_sb_csum (mdp_super_t * sb)
18426
+ unsigned int disk_csum, csum;
18428
+ disk_csum = sb->sb_csum;
18430
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
18431
+ sb->sb_csum = disk_csum;
18438
+ * Check one RAID superblock for generic plausibility
18441
+static int check_disk_sb (mdk_rdev_t * rdev)
18444
+ int ret = -EINVAL;
18452
+ if (sb->md_magic != MD_SB_MAGIC) {
18453
+ LOG_DEBUG(BAD_MAGIC, get_partition_name(rdev));
18457
+ if (sb->md_minor >= MAX_MD_DEVS) {
18458
+ LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor);
18461
+ if (calc_sb_csum(sb) != sb->sb_csum) {
18462
+ LOG_ERROR(BAD_CSUM, get_partition_name(rdev));
18470
+static kdev_t dev_unit(kdev_t dev)
18472
+ unsigned int mask;
18473
+ struct gendisk *hd = get_gendisk(dev);
18477
+ mask = ~((1 << hd->minor_shift) - 1);
18479
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
18482
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
18484
+ struct md_list_head *tmp;
18485
+ mdk_rdev_t *rdev;
18487
+ ITERATE_RDEV(mddev,rdev,tmp)
18488
+ if (dev_unit(rdev->dev) == dev_unit(dev))
18494
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
18496
+ struct md_list_head *tmp;
18497
+ mdk_rdev_t *rdev;
18499
+ ITERATE_RDEV(mddev1,rdev,tmp)
18500
+ if (match_dev_unit(mddev2, rdev->dev))
18507
+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
18509
+ mdk_rdev_t *same_pdev;
18511
+ if (rdev->mddev) {
18516
+ same_pdev = match_dev_unit(mddev, rdev->dev);
18518
+ LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. True\n"
18519
+ " protection against single-disk failure might be compromised.\n",
18520
+ mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev));
18522
+ md_list_add(&rdev->same_set, &mddev->disks);
18523
+ rdev->mddev = mddev;
18525
+ if (rdev->sb && disk_active(&rdev->sb->this_disk))
18526
+ mddev->nr_raid_disks++;
18527
+ LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
18530
+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
18532
+ if (!rdev->mddev) {
18536
+ md_list_del(&rdev->same_set);
18537
+ MD_INIT_LIST_HEAD(&rdev->same_set);
18538
+ rdev->mddev->nb_dev--;
18539
+ if (rdev->sb && disk_active(&rdev->sb->this_disk))
18540
+ rdev->mddev->nr_raid_disks--;
18541
+ LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
18542
+ rdev->mddev = NULL;
18547
+ * Function: evms_md_export_rdev
18548
+ * EVMS MD version of export_rdev()
18549
+ * Discard this MD "extended" device
18551
+static void evms_md_export_rdev (mdk_rdev_t * rdev)
18553
+ LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev));
18556
+ free_disk_sb(rdev);
18557
+ md_list_del(&rdev->all);
18558
+ MD_INIT_LIST_HEAD(&rdev->all);
18559
+ if (rdev->pending.next != &rdev->pending) {
18560
+ LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev));
18561
+ md_list_del(&rdev->pending);
18562
+ MD_INIT_LIST_HEAD(&rdev->pending);
18564
+ if (rdev->node) {
18565
+ LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev));
18566
+ if (cur_discover_list) {
18567
+ LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__,
18568
+ get_partition_name(rdev));
18569
+ evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node);
18571
+ DELETE(rdev->node);
18572
+ rdev->node = NULL;
18575
+ rdev->faulty = 0;
18580
+static void kick_rdev_from_array (mdk_rdev_t * rdev)
18582
+ LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev));
18583
+ unbind_rdev_from_array(rdev);
18584
+ evms_md_export_rdev(rdev);
18587
+static void export_array (mddev_t *mddev)
18589
+ struct md_list_head *tmp;
18590
+ mdk_rdev_t *rdev;
18591
+ mdp_super_t *sb = mddev->sb;
18593
+ LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev));
18595
+ mddev->sb = NULL;
18596
+ free_page((unsigned long) sb);
18599
+ LOG_DEBUG("%s: removing all extended devices belong to md%d\n",__FUNCTION__,mdidx(mddev));
18600
+ ITERATE_RDEV(mddev,rdev,tmp) {
18601
+ if (!rdev->mddev) {
18605
+ kick_rdev_from_array(rdev);
18607
+ if (mddev->nb_dev)
18611
+static void free_mddev (mddev_t *mddev)
18618
+ export_array(mddev);
18619
+ evms_md_size[mdidx(mddev)] = 0;
18623
+ * Make sure nobody else is using this mddev
18624
+ * (careful, we rely on the global kernel lock here)
18626
+ while (md_atomic_read(&mddev->resync_sem.count) != 1)
18628
+ while (md_atomic_read(&mddev->recovery_sem.count) != 1)
18631
+ evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
18632
+ md_list_del(&mddev->all_mddevs);
18633
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
18635
+ MOD_DEC_USE_COUNT;
18639
+static void print_desc(mdp_disk_t *desc)
18641
+ printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number,
18642
+ desc->raid_disk,desc->state);
18645
+static void print_sb(mdp_super_t *sb)
18649
+ printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
18650
+ sb->major_version, sb->minor_version, sb->patch_version,
18651
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
18653
+ printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
18654
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
18655
+ sb->layout, sb->chunk_size);
18656
+ printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n",
18657
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
18658
+ sb->failed_disks, sb->spare_disks,
18659
+ sb->sb_csum, sb->events_lo);
18661
+ for (i = 0; i < MD_SB_DISKS; i++) {
18662
+ mdp_disk_t *desc;
18664
+ desc = sb->disks + i;
18665
+ if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
18666
+ printk(" D %2d: ", i);
18667
+ print_desc(desc);
18670
+ printk(" THIS: ");
18671
+ print_desc(&sb->this_disk);
18675
+static void print_rdev(mdk_rdev_t *rdev)
18677
+ printk("rdev %s: SZ:%08ld F:%d DN:%d ",
18678
+ get_partition_name(rdev),
18679
+ rdev->size, rdev->faulty, rdev->desc_nr);
18681
+ printk("rdev superblock:\n");
18682
+ print_sb(rdev->sb);
18684
+ printk("no rdev superblock!\n");
18687
+void evms_md_print_devices (void)
18689
+ struct md_list_head *tmp, *tmp2;
18690
+ mdk_rdev_t *rdev;
18694
+ printk(": **********************************\n");
18695
+ printk(": * <COMPLETE RAID STATE PRINTOUT> *\n");
18696
+ printk(": **********************************\n");
18697
+ ITERATE_MDDEV(mddev,tmp) {
18698
+ printk("md%d: ", mdidx(mddev));
18700
+ ITERATE_RDEV(mddev,rdev,tmp2)
18701
+ printk("<%s>", get_partition_name(rdev));
18704
+ printk(" array superblock:\n");
18705
+ print_sb(mddev->sb);
18707
+ printk(" no array superblock.\n");
18709
+ ITERATE_RDEV(mddev,rdev,tmp2)
18710
+ print_rdev(rdev);
18712
+ printk(": **********************************\n");
18716
+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
18719
+ mdp_super_t *tmp1, *tmp2;
18721
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
18722
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
18724
+ if (!tmp1 || !tmp2) {
18726
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
18734
+ * nr_disks is not constant
18736
+ tmp1->nr_disks = 0;
18737
+ tmp2->nr_disks = 0;
18739
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
18753
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
18755
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
18756
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
18757
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
18758
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
18766
+ * Function: evms_md_find_rdev_all
18767
+ * EVMS MD version of find_rdev_all() above
18768
+ * Search entire all_raid_disks for "node"
18769
+ * Return the MD "extended" device if found.
18771
+static mdk_rdev_t * evms_md_find_rdev_all (evms_logical_node_t *node)
18773
+ struct md_list_head *tmp;
18774
+ mdk_rdev_t *rdev;
18776
+ tmp = all_raid_disks.next;
18777
+ while (tmp != &all_raid_disks) {
18778
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
18779
+ if (rdev->node == node)
18788
+ * Function: evms_md_write_disk_sb
18789
+ * EVMS MD version of write_disk_sb
18791
+static int evms_md_write_disk_sb(mdk_rdev_t * rdev)
18793
+ unsigned long size;
18794
+ u_int64_t sb_offset_in_sectors;
18800
+ if (rdev->faulty) {
18804
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
18809
+ sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1);
18810
+ if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) {
18811
+ LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n",
18812
+ get_partition_name(rdev),
18814
+ (unsigned long)(sb_offset_in_sectors >> 1));
18818
+ * If the disk went offline meanwhile and it's just a spare, then
18819
+ * its size has changed to zero silently, and the MD code does
18820
+ * not yet know that it's faulty.
18822
+ size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1);
18823
+ if (size != rdev->size) {
18824
+ LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n",
18825
+ get_partition_name(rdev), rdev->size, size);
18829
+ LOG_DETAILS("(write) %s's sb offset: %Lu\n",get_partition_name(rdev), sb_offset_in_sectors);
18831
+ INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb);
18837
+static int evms_md_sync_sbs(mddev_t * mddev)
18839
+ mdk_rdev_t *rdev;
18840
+ struct md_list_head *tmp;
18841
+ mdp_disk_t * disk;
18843
+ ITERATE_RDEV(mddev,rdev,tmp) {
18844
+ if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
18847
+ /* copy everything from the master */
18848
+ *rdev->sb = *mddev->sb;
18850
+ /* this_disk is unique, copy it from the master */
18851
+// rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr];
18852
+ // use the SB disk array since if update occurred on normal shutdown
18853
+ // the rdevs may be out of date.
18854
+ disk = evms_md_find_disk(mddev, rdev->dev);
18856
+ rdev->sb->this_disk = *disk;
18859
+ rdev->sb->sb_csum = calc_sb_csum(rdev->sb);
18864
+int evms_md_update_sb_sync(mddev_t * mddev)
18866
+ mdk_rdev_t *rdev;
18867
+ struct md_list_head *tmp;
18869
+ ITERATE_RDEV(mddev,rdev,tmp) {
18870
+ if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
18873
+ /* found first good device, so read the new SB */
18874
+ if (!evms_md_read_disk_sb(rdev)){
18875
+ /* this_disk is unique, copy it from the master */
18876
+ if (rdev->sb->md_magic == MD_SB_MAGIC) {
18877
+ *mddev->sb = *rdev->sb;
18878
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
18879
+ evms_md_update_sb(mddev);
18888
+int evms_md_update_sb(mddev_t * mddev)
18890
+ int err, count = 100;
18891
+ struct md_list_head *tmp;
18892
+ mdk_rdev_t *rdev;
18896
+ mddev->sb->utime = CURRENT_TIME;
18897
+ if ((++mddev->sb->events_lo)==0)
18898
+ ++mddev->sb->events_hi;
18900
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
18902
+ * oops, this 64-bit counter should never wrap.
18903
+ * Either we are in around ~1 trillion A.C., assuming
18904
+ * 1 reboot per second, or we have a bug:
18907
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
18909
+ evms_md_sync_sbs(mddev);
18912
+ * do not write anything to disk if using
18913
+ * nonpersistent superblocks
18915
+ if (mddev->sb->not_persistent)
18918
+ LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev));
18921
+ ITERATE_RDEV(mddev,rdev,tmp) {
18922
+ if (!rdev->virtual_spare && !rdev->faulty && !rdev->alias_device) {
18923
+ LOG_DETAILS(" %s [events: %x]",
18924
+ get_partition_name(rdev),
18925
+ rdev->sb->events_lo);
18926
+ err += evms_md_write_disk_sb(rdev);
18928
+ if (rdev->faulty)
18929
+ LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev));
18930
+ if (rdev->alias_device)
18931
+ LOG_DETAILS(" skipping alias %s\n", get_partition_name(rdev));
18932
+ if (rdev->virtual_spare)
18933
+ LOG_DETAILS(" skipping virtual spare.\n");
18938
+ LOG_WARNING("errors occurred during superblock update, repeating\n");
18941
+ LOG_ERROR("excessive errors occurred during superblock update, exiting\n");
18947
+ * Function: evms_md_import_device
18948
+ * Insure that node is not yet imported.
18949
+ * Read and validate the MD super block on this device
18950
+ * Add to the global MD "extended" devices list (all_raid_disks)
18953
+static int evms_md_import_device (evms_logical_node_t **discover_list,
18954
+ evms_logical_node_t *node,
18958
+ mdk_rdev_t *rdev;
18960
+ LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node));
18962
+ if (evms_md_find_rdev_all(node)) {
18963
+ LOG_DEBUG("%s exists\n", evms_md_partition_name(node));
18967
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
18969
+ LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node));
18972
+ memset(rdev, 0, sizeof(*rdev));
18974
+ if ((err = alloc_disk_sb(rdev)))
18977
+ rdev->node = node; /* set this for evms_md_read_disk_sb() */
18979
+ rdev->desc_nr = -1;
18980
+ rdev->faulty = 0;
18982
+ if (!node->total_vsectors) {
18983
+ LOG_ERROR("%s has zero size, marking faulty!\n", evms_md_partition_name(node));
18989
+ if ((err = evms_md_read_disk_sb(rdev))) {
18990
+ LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node));
18993
+ if ((err = check_disk_sb(rdev))) {
18994
+ LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node));
18997
+ if (rdev->sb->level != -4) {
18998
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
18999
+ rdev->sb->this_disk.minor);
19000
+ rdev->desc_nr = rdev->sb->this_disk.number;
19002
+ rdev->old_dev = MKDEV(0, 0);
19003
+ rdev->desc_nr = -1;
19005
+ rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
19006
+ LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr);
19008
+ md_list_add(&rdev->all, &all_raid_disks);
19009
+ MD_INIT_LIST_HEAD(&rdev->pending);
19011
+ if (rdev->faulty && rdev->sb)
19012
+ free_disk_sb(rdev);
19018
+ free_disk_sb(rdev);
19027
+ * Function: evms_md_analyze_sbs
19028
+ * EVMS MD version of analyze_sbs()
19030
+static int evms_md_analyze_sbs (mddev_t * mddev)
19032
+ int out_of_date = 0, i;
19033
+ struct md_list_head *tmp, *tmp2;
19034
+ mdk_rdev_t *rdev, *rdev2, *freshest;
19037
+ LOG_ENTRY_EXIT("Analyzing all superblocks...\n");
19039
+ * Verify the RAID superblock on each real device
19041
+ ITERATE_RDEV(mddev,rdev,tmp) {
19042
+ if (rdev->faulty) {
19050
+ if (check_disk_sb(rdev))
19055
+ * The superblock constant part has to be the same
19056
+ * for all disks in the array.
19060
+ ITERATE_RDEV(mddev,rdev,tmp) {
19065
+ if (!sb_equal(sb, rdev->sb)) {
19066
+ LOG_WARNING("kick out %s\n",get_partition_name(rdev));
19067
+ kick_rdev_from_array(rdev);
19073
+ * OK, we have all disks and the array is ready to run. Let's
19074
+ * find the freshest superblock, that one will be the superblock
19075
+ * that represents the whole array.
19078
+ if (alloc_array_sb(mddev))
19083
+ ITERATE_RDEV(mddev,rdev,tmp) {
19086
+ * if the checksum is invalid, use the superblock
19087
+ * only as a last resort. (decrease it's age by
19090
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
19091
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
19092
+ if ((rdev->sb->events_lo--)==0)
19093
+ rdev->sb->events_hi--;
19095
+ LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo);
19102
+ * Find the newest superblock version
19104
+ ev1 = md_event(rdev->sb);
19105
+ ev2 = md_event(freshest->sb);
19106
+ if (ev1 != ev2) {
19112
+ if (out_of_date) {
19113
+ LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest));
19115
+ memcpy (sb, freshest->sb, sizeof(*sb));
19118
+ * at this point we have picked the 'best' superblock
19119
+ * from all available superblocks.
19120
+ * now we validate this superblock and kick out possibly
19123
+ ITERATE_RDEV(mddev,rdev,tmp) {
19125
+ * Kick all non-fresh devices
19128
+ ev1 = md_event(rdev->sb);
19129
+ ev2 = md_event(sb);
19132
+ LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev));
19133
+ kick_rdev_from_array(rdev);
19136
+ LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev));
19142
+ * Remove unavailable and faulty devices ...
19144
+ * note that if an array becomes completely unrunnable due to
19145
+ * missing devices, we do not write the superblock back, so the
19146
+ * administrator has a chance to fix things up. The removal thus
19147
+ * only happens if it's nonfatal to the contents of the array.
19149
+ for (i = 0; i < MD_SB_DISKS; i++) {
19151
+ mdp_disk_t *desc;
19153
+ desc = sb->disks + i;
19156
+ * We kick faulty devices/descriptors immediately.
19158
+ * Note: multipath devices are a special case. Since we
19159
+ * were able to read the superblock on the path, we don't
19160
+ * care if it was previously marked as faulty, it's up now
19163
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
19165
+ ITERATE_RDEV(mddev,rdev,tmp) {
19166
+ if (rdev->desc_nr != desc->number)
19168
+ LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev));
19169
+ kick_rdev_from_array(rdev);
19174
+ LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n",
19175
+ __FUNCTION__ ,mdidx(mddev), desc->number);
19178
+ * Don't call remove_descriptor(),
19179
+ * let the administrator remove it from the user-land */
19180
+ /* remove_descriptor(desc, sb); */
19182
+ } else if (disk_faulty(desc)) {
19184
+ * multipath entry marked as faulty, unfaulty it
19188
+ dev = MKDEV(desc->major, desc->minor);
19190
+ rdev = evms_md_find_rdev(mddev, dev);
19192
+ mark_disk_spare(desc);
19194
+ LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n",
19195
+ __FUNCTION__ ,mdidx(mddev), desc->number);
19197
+ * Don't call remove_descriptor(),
19198
+ * let the administrator remove it from the user-land */
19199
+ /* remove_descriptor(desc, sb); */
19204
+ * Is this device present in the rdev ring?
19207
+ ITERATE_RDEV(mddev,rdev,tmp) {
19209
+ * Multi-path IO special-case: since we have no
19210
+ * this_disk descriptor at auto-detect time,
19211
+ * we cannot check rdev->number.
19212
+ * We can check the device though.
19214
+ if ((sb->level == -4) && (rdev->dev ==
19215
+ MKDEV(desc->major,desc->minor))) {
19219
+ if (rdev->desc_nr == desc->number) {
19227
+ LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n",
19228
+ mdidx(mddev), desc->number);
19230
+ * Don't call remove_descriptor(),
19231
+ * let the administrator remove it from the user-land */
19232
+ /* remove_descriptor(desc, sb); */
19236
+ * Kick all rdevs that are not in the
19237
+ * descriptor array:
19239
+ ITERATE_RDEV(mddev,rdev,tmp) {
19240
+ if (rdev->desc_nr == -1)
19241
+ kick_rdev_from_array(rdev);
19245
+ * Do a final reality check.
19247
+ if (mddev->sb->level != -4) {
19248
+ ITERATE_RDEV(mddev,rdev,tmp) {
19249
+ if (rdev->desc_nr == -1) {
19254
+ * is the desc_nr unique?
19256
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
19257
+ if ((rdev2 != rdev) &&
19258
+ (rdev2->desc_nr == rdev->desc_nr)) {
19266
+#define OLD_VERSION KERN_ALERT \
19267
+"md%d: unsupported raid array version %d.%d.%d\n"
19269
+#define NOT_CLEAN_IGNORE KERN_ERR \
19270
+"md%d: raid array is not clean -- starting background reconstruction\n"
19273
+ * Check if we can support this RAID array
19275
+ if (sb->major_version != MD_MAJOR_VERSION ||
19276
+ sb->minor_version > MD_MINOR_VERSION) {
19278
+ LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n",
19280
+ sb->major_version,
19281
+ sb->minor_version,
19282
+ sb->patch_version);
19286
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
19287
+ (sb->level == 4) || (sb->level == 5)))
19288
+ LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n",
19289
+ mdidx(mddev), sb->level);
19291
+ LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n");
19294
+ LOG_WARNING("ABORT analyze_sbs()!!!\n");
19299
+static int device_size_calculation (mddev_t * mddev)
19301
+ int data_disks = 0, persistent;
19302
+ //unsigned int readahead;
19303
+ mdp_super_t *sb = mddev->sb;
19304
+ struct md_list_head *tmp;
19305
+ mdk_rdev_t *rdev;
19308
+ * Do device size calculation. Bail out if too small.
19309
+ * (we have to do this after having validated chunk_size,
19310
+ * because device size has to be modulo chunk_size)
19312
+ persistent = !mddev->sb->not_persistent;
19313
+ ITERATE_RDEV(mddev,rdev,tmp) {
19314
+ if (rdev->faulty)
19316
+ if (rdev->size) {
19320
+ rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent);
19321
+ if (rdev->size < sb->chunk_size / 1024) {
19322
+ LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n",
19323
+ get_partition_name(rdev), rdev->size, sb->chunk_size / 1024);
19328
+ switch (sb->level) {
19339
+ zoned_raid_size(mddev);
19343
+ zoned_raid_size(mddev);
19344
+ data_disks = sb->raid_disks;
19351
+ data_disks = sb->raid_disks-1;
19354
+ LOG_ERROR("[md%d] unkown level %d\n", mdidx(mddev), sb->level);
19357
+ if (!evms_md_size[mdidx(mddev)])
19358
+ evms_md_size[mdidx(mddev)] = sb->size * data_disks;
19366
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
19367
+"too big chunk_size: %d > %d\n"
19369
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
19370
+"too small chunk_size: %d < %ld\n"
19372
+#define BAD_CHUNKSIZE KERN_ERR \
19373
+"no chunksize specified, see 'man raidtab'\n"
19375
+static int do_md_run (mddev_t * mddev)
19379
+ struct md_list_head *tmp;
19380
+ mdk_rdev_t *rdev;
19383
+ if (!mddev->nb_dev) {
19392
+ * Resize disks to align partitions size on a given
19395
+ evms_md_size[mdidx(mddev)] = 0;
19398
+ * Analyze all RAID superblock(s)
19400
+ if (evms_md_analyze_sbs(mddev)) {
19405
+ chunk_size = mddev->sb->chunk_size;
19406
+ pnum = level_to_pers(mddev->sb->level);
19408
+ mddev->param.chunk_size = chunk_size;
19409
+ mddev->param.personality = pnum;
19411
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
19412
+ if (!chunk_size) {
19414
+ * 'default chunksize' in the old md code used to
19415
+ * be PAGE_SIZE, baaad.
19416
+ * we abort here to be on the safe side. We dont
19417
+ * want to continue the bad practice.
19419
+ printk(BAD_CHUNKSIZE);
19422
+ if (chunk_size > MAX_CHUNK_SIZE) {
19423
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
19427
+ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
19429
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
19433
+ if (chunk_size < PAGE_SIZE) {
19434
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
19439
+ printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
19441
+ if (pnum >= MAX_PERSONALITY) {
19447
+#ifdef CONFIG_KMOD
19448
+ char module_name[80];
19449
+ sprintf (module_name, "md-personality-%d", pnum);
19450
+ request_module (module_name);
19454
+ printk(KERN_ERR "personality %d is not loaded!\n",
19459
+ if (device_size_calculation(mddev))
19463
+ * Drop all container device buffers, from now on
19464
+ * the only valid external interface is through the md
19466
+ * Also find largest hardsector size
19468
+ md_hardsect_sizes[mdidx(mddev)] = 512;
19469
+ ITERATE_RDEV(mddev,rdev,tmp) {
19470
+ if (rdev->faulty)
19472
+ invalidate_device(rdev->dev, 1);
19473
+/* if (get_hardsect_size(rdev->dev)
19474
+ > md_hardsect_sizes[mdidx(mddev)])
19475
+ md_hardsect_sizes[mdidx(mddev)] =
19476
+ get_hardsect_size(rdev->dev); */
19477
+ if (rdev->node->hardsector_size > md_hardsect_sizes[mdidx(mddev)]) {
19478
+ md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size;
19482
+ md_blocksizes[mdidx(mddev)] = 1024;
19483
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
19484
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
19486
+ mddev->pers = pers[pnum];
19488
+ err = mddev->pers->run(mddev);
19490
+ printk("pers->run() failed ...\n");
19491
+ mddev->pers = NULL;
19494
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
19496
+ evms_md_update_sb(mddev);
19498
+ mddev->flag &= ~EVMS_MD_INCOMPLETE; /* Clear incomplete flag */
19503
+#undef TOO_BIG_CHUNKSIZE
19504
+#undef BAD_CHUNKSIZE
19507
+#define OUT(x) do { err = (x); goto out; } while (0)
19510
+#define STILL_MOUNTED KERN_WARNING \
19511
+"md%d still mounted.\n"
19512
+#define STILL_IN_USE \
19513
+"md%d still in use.\n"
19515
+static int do_md_stop (mddev_t * mddev, int ro)
19517
+ int err = 0, resync_interrupted = 0;
19518
+ kdev_t dev = mddev_to_kdev(mddev);
19520
+ if (atomic_read(&mddev->active)>1) {
19521
+ printk(STILL_IN_USE, mdidx(mddev));
19525
+ if (mddev->pers) {
19527
+ * It is safe to call stop here, it only frees private
19528
+ * data. Also, it tells us if a device is unstoppable
19529
+ * (eg. resyncing is in progress)
19531
+ if (mddev->pers->stop_resync)
19532
+ if (mddev->pers->stop_resync(mddev))
19533
+ resync_interrupted = 1;
19535
+ if (mddev->recovery_running)
19536
+ evms_cs_interrupt_thread(evms_md_recovery_thread);
19539
+ * This synchronizes with signal delivery to the
19540
+ * resync or reconstruction thread. It also nicely
19541
+ * hangs the process if some reconstruction has not
19544
+ down(&mddev->recovery_sem);
19545
+ up(&mddev->recovery_sem);
19547
+ invalidate_device(dev, 1);
19555
+ set_device_ro(dev, 0);
19556
+ if (mddev->pers->stop(mddev)) {
19558
+ set_device_ro(dev, 1);
19566
+ * mark it clean only if there was no resync
19569
+ if (!mddev->recovery_running && !resync_interrupted) {
19570
+ printk("marking sb clean...\n");
19571
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
19573
+ evms_md_update_sb_sync(mddev);
19576
+ set_device_ro(dev, 1);
19580
+ * Free resources if final stop
19583
+ printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
19584
+ free_mddev(mddev);
19587
+ printk (KERN_INFO
19588
+ "md%d switched to read-only mode.\n", mdidx(mddev));
19594
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list, mddev_t *mddev)
19596
+ mdk_rdev_t *rdev;
19597
+ struct md_list_head *tmp;
19601
+ if (mddev->disks.prev == &mddev->disks) {
19606
+ LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) );
19608
+ ITERATE_RDEV(mddev,rdev,tmp) {
19609
+ LOG_DETAILS(" <%s>\n", get_partition_name(rdev));
19612
+ err = do_md_run (mddev);
19615
+ * remove all nodes consumed by this md device from the discover list
19617
+ ITERATE_RDEV(mddev,rdev,tmp) {
19618
+ LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev));
19619
+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19620
+ flags |= rdev->node->flags;
19622
+ err = evms_md_create_logical_node(discover_list,mddev,flags);
19624
+ exported_nodes++;
19627
+ LOG_WARNING("%s: cannot run array md%d\n",__FUNCTION__,mdidx(mddev));
19628
+ mddev->sb_dirty = 0;
19629
+ do_md_stop (mddev, 0);
19634
+ * lets try to run arrays based on all disks that have arrived
19635
+ * until now. (those are in the ->pending list)
19637
+ * the method: pick the first pending disk, collect all disks with
19638
+ * the same UUID, remove all from the pending list and put them into
19639
+ * the 'same_array' list. Then order this list based on superblock
19640
+ * update time (freshest comes first), kick out 'old' disks and
19641
+ * compare superblocks. If everything's fine then run it.
19643
+ * If "unit" is allocated, then bump its reference count
19645
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list, kdev_t countdev)
19647
+ struct md_list_head candidates;
19648
+ struct md_list_head *tmp;
19649
+ mdk_rdev_t *rdev0, *rdev;
19654
+ LOG_DETAILS("autorun ...\n");
19655
+ while (pending_raid_disks.next != &pending_raid_disks) {
19656
+ rdev0 = md_list_entry(pending_raid_disks.next,
19657
+ mdk_rdev_t, pending);
19658
+ LOG_DETAILS("considering %s ...\n",get_partition_name(rdev0));
19659
+ MD_INIT_LIST_HEAD(&candidates);
19660
+ ITERATE_RDEV_PENDING(rdev,tmp) {
19661
+ if (uuid_equal(rdev0, rdev)) {
19662
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
19663
+ LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\
19664
+ get_partition_name(rdev),get_partition_name(rdev0));
19667
+ LOG_DETAILS(" adding %s ...\n", get_partition_name(rdev));
19668
+ md_list_del(&rdev->pending);
19669
+ md_list_add(&rdev->pending, &candidates);
19674
+ * now we have a set of devices, with all of them having
19675
+ * mostly sane superblocks. It's time to allocate the
19678
+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
19679
+ mddev = kdev_to_mddev(md_kdev);
19680
+ if (mddev && (!(mddev->flag & EVMS_MD_INCOMPLETE))) {
19681
+ LOG_DETAILS("md%d already running, cannot run %s\n",
19682
+ mdidx(mddev), get_partition_name(rdev0));
19684
+ * This is EVMS re-discovery!
19685
+ * Remove all nodes consumed by this md device from the discover list
19687
+ ITERATE_RDEV(mddev,rdev,tmp)
19688
+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19689
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
19690
+ evms_md_export_rdev(rdev);
19695
+ mddev = alloc_mddev(md_kdev);
19696
+ if (mddev == NULL) {
19697
+ LOG_ERROR("cannot allocate memory for md drive.\n");
19700
+ LOG_DETAILS("created md%d\n", mdidx(mddev));
19702
+ LOG_DETAILS("found INCOMPLETE md%d\n", mdidx(mddev));
19705
+ if (md_kdev == countdev)
19706
+ atomic_inc(&mddev->active);
19708
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
19709
+ bind_rdev_to_array(rdev, mddev);
19710
+ md_list_del(&rdev->pending);
19711
+ MD_INIT_LIST_HEAD(&rdev->pending);
19714
+ if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) ||
19715
+ (mddev->nb_dev == rdev0->sb->nr_disks)) {
19716
+ evms_md_autorun_array(discover_list,mddev);
19718
+ mddev->flag |= EVMS_MD_INCOMPLETE;
19719
+ LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n",
19720
+ mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks);
19721
+ ITERATE_RDEV(mddev,rdev,tmp) {
19722
+ evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19726
+ LOG_DETAILS("... autorun DONE.\n");
19729
+void evms_md_recover_arrays(void)
19731
+ if (!evms_md_recovery_thread) {
19735
+ evms_cs_wakeup_thread(evms_md_recovery_thread);
19738
+int evms_md_error(
19740
+ evms_logical_node_t *node)
19742
+ mdk_rdev_t * rrdev;
19744
+ LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n",
19745
+ mdidx(mddev), node->name,
19746
+ __builtin_return_address(0),__builtin_return_address(1),
19747
+ __builtin_return_address(2),__builtin_return_address(3));
19753
+ rrdev = evms_md_find_rdev_from_node(mddev, node);
19754
+ if (!rrdev || rrdev->faulty)
19756
+ if (!mddev->pers->error_handler
19757
+ || mddev->pers->error_handler(mddev,node) <= 0) {
19758
+ free_disk_sb(rrdev);
19759
+ rrdev->faulty = 1;
19763
+ * if recovery was running, stop it now.
19765
+ if (mddev->pers->stop_resync)
19766
+ mddev->pers->stop_resync(mddev);
19767
+ if (mddev->recovery_running)
19768
+ evms_cs_interrupt_thread(evms_md_recovery_thread);
19769
+ evms_md_recover_arrays();
19774
+int evms_register_md_personality (int pnum, mdk_personality_t *p)
19776
+ if (pnum >= MAX_PERSONALITY) {
19781
+ if (pers[pnum]) {
19787
+ LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum);
19791
+int evms_unregister_md_personality (int pnum)
19793
+ if (pnum >= MAX_PERSONALITY) {
19798
+ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
19799
+ pers[pnum] = NULL;
19803
+mdp_disk_t *evms_md_get_spare(mddev_t *mddev)
19805
+ mdp_super_t *sb = mddev->sb;
19806
+ mdp_disk_t *disk;
19807
+ mdk_rdev_t *rdev;
19808
+// struct md_list_head *tmp;
19811
+ for (i = 0, j = 0; j < mddev->nb_dev; i++) {
19812
+ rdev = evms_md_find_rdev_nr(mddev, i);
19813
+ if (rdev == NULL)
19816
+ if (rdev->faulty)
19819
+ if (!rdev->virtual_spare)
19823
+ disk = &sb->disks[rdev->desc_nr];
19824
+ if (disk_faulty(disk)) {
19828
+ if (disk_active(disk))
19835
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev)
19837
+ mdp_super_t *sb = mddev->sb;
19838
+ mdp_disk_t *disk;
19841
+ for (i=0; i < MD_SB_DISKS; i++) {
19842
+ disk = &sb->disks[i];
19843
+ if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev)))
19849
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
19850
+void evms_md_sync_acct(
19852
+ unsigned long nr_sectors)
19854
+ unsigned int major = MAJOR(dev);
19855
+ unsigned int index;
19857
+ index = disk_index(dev);
19858
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
19861
+ sync_io[major][index] += nr_sectors;
19864
+static int is_mddev_idle(mddev_t *mddev)
19866
+ mdk_rdev_t * rdev;
19867
+ struct md_list_head *tmp;
19869
+ unsigned long curr_events;
19872
+ ITERATE_RDEV(mddev,rdev,tmp) {
19873
+ int major = MAJOR(rdev->dev);
19874
+ int idx = disk_index(rdev->dev);
19876
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
19879
+ curr_events = kstat.dk_drive_rblk[major][idx] +
19880
+ kstat.dk_drive_wblk[major][idx] ;
19881
+ curr_events -= sync_io[major][idx];
19882
+ if ((curr_events - rdev->last_events) > 32) {
19883
+ rdev->last_events = curr_events;
19890
+MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait);
19892
+void evms_md_done_sync(mddev_t *mddev, int blocks, int ok)
19894
+ /* another "blocks" (512byte) blocks have been synced */
19895
+ atomic_sub(blocks, &mddev->recovery_active);
19896
+ wake_up(&mddev->recovery_wait);
19898
+ // stop recovery, signal do_sync ....
19902
+#define SYNC_MARKS 10
19903
+#define SYNC_MARK_STEP (3*HZ)
19904
+int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
19907
+ unsigned int max_sectors, currspeed,
19908
+ j, window, err, serialize;
19909
+ unsigned long mark[SYNC_MARKS];
19910
+ unsigned long mark_cnt[SYNC_MARKS];
19912
+ struct md_list_head *tmp;
19913
+ unsigned long last_check;
19916
+ err = down_interruptible(&mddev->resync_sem);
19922
+ ITERATE_MDDEV(mddev2,tmp) {
19923
+ if (mddev2 == mddev)
19925
+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
19926
+ LOG_DEFAULT("delaying resync of md%d until md%d "
19927
+ "has finished resync (they share one or more physical units)\n",
19928
+ mdidx(mddev), mdidx(mddev2));
19934
+ interruptible_sleep_on(&evms_resync_wait);
19935
+ if (md_signal_pending(current)) {
19936
+ md_flush_signals();
19943
+ mddev->curr_resync = 1;
19945
+ max_sectors = mddev->sb->size<<1;
19947
+ LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev));
19948
+ LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
19949
+ sysctl_speed_limit_min);
19950
+ LOG_DEFAULT("using maximum available idle IO bandwith "
19951
+ "(but not more than %d KB/sec) for reconstruction.\n",
19952
+ sysctl_speed_limit_max);
19955
+ * Resync has low priority.
19957
+ current->nice = 19;
19959
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
19960
+ for (m = 0; m < SYNC_MARKS; m++) {
19961
+ mark[m] = jiffies;
19965
+ mddev->resync_mark = mark[last_mark];
19966
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
19969
+ * Tune reconstruction:
19971
+ window = MAX_READAHEAD*(PAGE_SIZE/512);
19972
+ LOG_DEFAULT("using %dk window, over a total of %d blocks.\n",
19973
+ window/2,max_sectors/2);
19975
+ atomic_set(&mddev->recovery_active, 0);
19976
+ init_waitqueue_head(&mddev->recovery_wait);
19978
+ for (j = 0; j < max_sectors;) {
19981
+ sectors = mddev->pers->sync_request(mddev, j);
19983
+ if (sectors < 0) {
19987
+ atomic_add(sectors, &mddev->recovery_active);
19989
+ mddev->curr_resync = j;
19991
+ if (last_check + window > j)
19996
+ run_task_queue(&tq_disk);
19999
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
20001
+ int next = (last_mark+1) % SYNC_MARKS;
20003
+ mddev->resync_mark = mark[next];
20004
+ mddev->resync_mark_cnt = mark_cnt[next];
20005
+ mark[next] = jiffies;
20006
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
20007
+ last_mark = next;
20011
+ if (md_signal_pending(current)) {
20013
+ * got a signal, exit.
20015
+ mddev->curr_resync = 0;
20016
+ LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n");
20017
+ md_flush_signals();
20023
+ * this loop exits only if either when we are slower than
20024
+ * the 'hard' speed limit, or the system was IO-idle for
20026
+ * the system might be non-idle CPU-wise, but we only care
20027
+ * about not overloading the IO subsystem. (things like an
20028
+ * e2fsck being done on the RAID array should execute fast)
20030
+ if (md_need_resched(current))
20033
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
20035
+ if (currspeed > sysctl_speed_limit_min) {
20036
+ current->nice = 19;
20038
+ if ((currspeed > sysctl_speed_limit_max) ||
20039
+ !is_mddev_idle(mddev)) {
20040
+ current->state = TASK_INTERRUPTIBLE;
20041
+ md_schedule_timeout(HZ/4);
20045
+ current->nice = -20;
20047
+ LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev));
20050
+ * this also signals 'finished resyncing' to md_stop
20053
+ wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
20054
+ up(&mddev->resync_sem);
20056
+ mddev->curr_resync = 0;
20057
+ wake_up(&evms_resync_wait);
20064
+ * This is a kernel thread which syncs a spare disk with the active array
20066
+ * the amount of foolproofing might seem to be a tad excessive, but an
20067
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
20068
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
20069
+ * i'm a bit nervous ;)
20071
+void evms_md_do_recovery(void *data)
20076
+ mdp_disk_t *spare;
20077
+ struct md_list_head *tmp;
20078
+ unsigned long flags;
20079
+ evms_md_activate_spare_t *activate_spare;
20081
+ LOG_DEFAULT("recovery thread got woken up ...\n");
20083
+ ITERATE_MDDEV(mddev,tmp) {
20088
+ if (mddev->recovery_running)
20090
+ if (sb->active_disks == sb->raid_disks)
20092
+ if (!sb->spare_disks) {
20093
+ LOG_ERROR(" [md%d] no spare disk to reconstruct array! "
20094
+ "-- continuing in degraded mode\n", mdidx(mddev));
20099
+ activate_spare = NULL;
20101
+ spin_lock_irqsave(&activate_spare_list_lock, flags);
20102
+ activate_spare = evms_activate_spare_list;
20103
+ if (activate_spare && (activate_spare->mddev == mddev)) {
20104
+ spare = activate_spare->spare;
20105
+ evms_activate_spare_list = activate_spare->next;
20107
+ spin_unlock_irqrestore(&activate_spare_list_lock, flags);
20111
+ * now here we get the spare and resync it.
20113
+ spare = evms_md_get_spare(mddev);
20118
+ LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n",
20119
+ mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
20120
+ if (!mddev->pers->diskop)
20123
+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
20126
+ down(&mddev->recovery_sem);
20127
+ mddev->recovery_running = 1;
20128
+ err = evms_md_do_sync(mddev, spare);
20129
+ if (err == -EIO) {
20130
+ LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n",
20131
+ mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
20132
+ if (!disk_faulty(spare)) {
20133
+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
20134
+ mark_disk_faulty(spare);
20135
+ mark_disk_nonsync(spare);
20136
+ mark_disk_inactive(spare);
20137
+ sb->spare_disks--;
20138
+ sb->working_disks--;
20139
+ sb->failed_disks++;
20142
+ if (disk_faulty(spare))
20143
+ mddev->pers->diskop(mddev, &spare,
20144
+ DISKOP_SPARE_INACTIVE);
20145
+ if (err == -EINTR || err == -ENOMEM) {
20147
+ * Recovery got interrupted, or ran out of mem ...
20148
+ * signal back that we have finished using the array.
20150
+ mddev->pers->diskop(mddev, &spare,
20151
+ DISKOP_SPARE_INACTIVE);
20152
+ up(&mddev->recovery_sem);
20153
+ mddev->recovery_running = 0;
20156
+ mddev->recovery_running = 0;
20157
+ up(&mddev->recovery_sem);
20159
+ if (!disk_faulty(spare)) {
20161
+ * the SPARE_ACTIVE diskop possibly changes the
20164
+ if (activate_spare)
20165
+ mddev->pers->diskop(mddev, &spare, DISKOP_HOT_SPARE_ACTIVE);
20167
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
20168
+ mark_disk_sync(spare);
20169
+ mark_disk_active(spare);
20170
+ sb->active_disks++;
20171
+ sb->spare_disks--;
20173
+ mddev->sb_dirty = 1;
20174
+ evms_md_update_sb(mddev);
20177
+ LOG_DEFAULT("recovery thread finished ...\n");
20181
+int evms_md_notify_reboot(struct notifier_block *this,
20182
+ unsigned long code, void *x)
20184
+ struct md_list_head *tmp;
20187
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
20188
+ || (code == MD_SYS_POWER_OFF)) {
20190
+ LOG_DEFAULT("stopping all md devices.\n");
20192
+ ITERATE_MDDEV(mddev,tmp)
20193
+ do_md_stop (mddev, 1);
20195
+ * certain more exotic SCSI devices are known to be
20196
+ * volatile wrt too early system reboots. While the
20197
+ * right place to handle this issue is the given
20198
+ * driver, we do want to have a safe RAID driver ...
20200
+ md_mdelay(1000*1);
20202
+ return NOTIFY_DONE;
20205
+static struct notifier_block md_notifier = {
20206
+ notifier_call: evms_md_notify_reboot,
20208
+ priority: INT_MAX, /* before any real devices */
20214
+ * Function: evms_md_create_logical_node
20216
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
20217
+ mddev_t *mddev, uint flags)
20220
+ md_instance_data_t *MDID = NULL;
20221
+ evms_logical_node_t *newnode = NULL;
20223
+ rc = evms_cs_allocate_logical_node(&newnode);
20225
+ rc = evms_cs_allocate_memory((void**)&MDID,sizeof(*MDID));
20228
+ memset(newnode,0,sizeof(*MDID));
20229
+ newnode->plugin = &md_plugin_header;
20230
+ newnode->total_vsectors = (u_int64_t)evms_md_size[mdidx(mddev)] * 2;
20231
+ newnode->block_size = md_blocksizes[mdidx(mddev)];
20232
+ newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)];
20233
+ sprintf(newnode->name,"md/md%d",mdidx(mddev));
20234
+ MDID->mddev = mddev;
20235
+ newnode->instance_data = MDID;
20236
+ newnode->flags = flags;
20239
+ rc = evms_cs_add_logical_node_to_list(discover_list, newnode);
20241
+ LOG_ERROR("could not add md node %s\n",newnode->name);
20243
+ LOG_DETAILS("added our md node %s to discover list (total_vsectors=%Lu, blk_size=%d, sector_size=%d)\n",
20244
+ newnode->name, newnode->total_vsectors, newnode->block_size, newnode->hardsector_size);
20249
+ mddev->node = newnode;
20252
+ evms_cs_deallocate_memory(MDID);
20254
+ evms_cs_deallocate_logical_node(newnode);
20260
+ * Function: evms_md_autostart_arrays
20261
+ * Discover MD "extended" devices
20262
+ * Add MD "extended" devices to pending list for further processing
20264
+static void evms_md_autostart_arrays (evms_logical_node_t **discover_list)
20266
+ evms_logical_node_t *node, *next_node;
20267
+ mdk_rdev_t *rdev;
20270
+ LOG_ENTRY_EXIT(":autostart_arrays() ENTRY\n");
20272
+ /* examine each node on the discover list */
20273
+ next_node = *discover_list;
20274
+ while(next_node) {
20275
+ node = next_node;
20276
+ next_node = node->next;
20278
+ rc = evms_md_import_device(discover_list, node,1);
20279
+ if (rc && (rc != -EEXIST)) {
20280
+ LOG_EXTRA("autostart_arrrays() Not %s!\n",evms_md_partition_name(node));
20287
+ rdev = evms_md_find_rdev_all(node);
20289
+ LOG_ERROR("find_rdev_all() failed\n");
20292
+ if (rdev->faulty) {
20298
+ md_list_add(&rdev->pending, &pending_raid_disks);
20299
+ } else if (rc == -EEXIST) {
20300
+ evms_logical_node_t *md_node;
20302
+ * Must be in a re-discovery process here.
20303
+ * Find the EVMS MD node that this rdev is a member of
20305
+ if (rdev->mddev) {
20306
+ md_node = rdev->mddev->node;
20308
+ rc = evms_cs_add_logical_node_to_list(discover_list,md_node);
20311
+ exported_nodes++;
20312
+ LOG_DETAILS("Added MD node (%s) to discover list\n",
20315
+ case 1: /* already on the list */
20316
+ case 2: /* already on the list */
20319
+ LOG_WARNING("could not add md node (%s), rc=%d\n",
20320
+ md_node->name, rc);
20323
+ LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n",
20324
+ rdev->mddev->__minor);
20327
+ LOG_ERROR("This device [%s] does not belong to any array!\n",
20328
+ get_partition_name(rdev));
20329
+ evms_md_export_rdev(rdev);
20331
+ evms_cs_remove_logical_node_from_list(discover_list,node);
20335
+ evms_md_autorun_devices(discover_list, -1);
20336
+ LOG_DETAILS("EVMD MD:autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes);
20339
+#ifdef CONFIG_PROC_FS
20340
+static int status_resync(char * page, mddev_t * mddev)
20343
+ unsigned long max_blocks, resync, res, dt, db, rt;
20345
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
20346
+ max_blocks = mddev->sb->size;
20349
+ * Should not happen.
20351
+ if (!max_blocks) {
20355
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
20357
+ int i, x = res/50, y = 20-x;
20359
+ for (i = 0; i < x; i++)
20361
+ sz += sprintf(page + sz, ">");
20362
+ for (i = 0; i < y; i++)
20366
+ if (!mddev->recovery_running)
20370
+ PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)",
20371
+ res/10, res % 10, resync, max_blocks);
20376
+ PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)",
20377
+ res/10, res % 10, resync, max_blocks);
20380
+ * We do not want to overflow, so the order of operands and
20381
+ * the * 100 / 100 trick are important. We do a +1 to be
20382
+ * safe against division by zero. We only estimate anyway.
20384
+ * dt: time from mark until now
20385
+ * db: blocks written from mark until now
20386
+ * rt: remaining time
20388
+ dt = ((jiffies - mddev->resync_mark) / HZ);
20390
+ db = resync - (mddev->resync_mark_cnt/2);
20391
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
20393
+ PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6);
20395
+ PROCPRINT(" speed=%ldK/sec", db/dt);
20400
+static int evms_md_status_read_proc(char *page, char **start, off_t off,
20401
+ int count, int *eof, void *data)
20403
+ int sz = 0, j, size;
20404
+ struct md_list_head *tmp, *tmp2;
20405
+ mdk_rdev_t *rdev;
20408
+ PROCPRINT("Enterprise Volume Management System: MD Status\n");
20409
+ PROCPRINT("Personalities : ");
20410
+ for (j = 0; j < MAX_PERSONALITY; j++)
20412
+ PROCPRINT("[%s] ", pers[j]->name);
20417
+ ITERATE_MDDEV(mddev,tmp) {
20418
+ PROCPRINT("md%d : %sactive", mdidx(mddev),
20419
+ mddev->pers ? "" : "in");
20420
+ if (mddev->pers) {
20422
+ PROCPRINT(" (read-only)");
20423
+ PROCPRINT(" %s", mddev->pers->name);
20427
+ ITERATE_RDEV(mddev,rdev,tmp2) {
20428
+ PROCPRINT(" %s[%d]",
20429
+ rdev->node->name, rdev->desc_nr);
20430
+ if (rdev->faulty) {
20431
+ PROCPRINT("(F)");
20434
+ size += rdev->size;
20437
+ if (mddev->nb_dev) {
20439
+ PROCPRINT("\n %Ld blocks",
20440
+ mddev->node->total_vsectors >> 1);
20442
+ PROCPRINT("\n %d blocks", size);
20445
+ if (!mddev->pers) {
20450
+ sz += mddev->pers->status (page+sz, mddev);
20452
+ PROCPRINT("\n ");
20453
+ if (mddev->curr_resync) {
20454
+ sz += status_resync (page+sz, mddev);
20456
+ if (atomic_read(&mddev->resync_sem.count) != 1)
20457
+ PROCPRINT(" resync=DELAYED");
20467
+/* Function: md_core_init
20469
+int __init md_core_init(void)
20471
+ static char * name = "evms_mdrecoveryd";
20472
+#ifdef CONFIG_PROC_FS
20473
+ struct proc_dir_entry *evms_proc_dir;
20476
+ // Increment the use count, so it never goes to zero.
20477
+ // This is necessary for now because we don't have code
20478
+ // to shut down the MD threads. When that is written,
20479
+ // this line should be removed.
20480
+ MOD_INC_USE_COUNT;
20482
+#ifdef CONFIG_PROC_FS
20483
+ evms_proc_dir = evms_cs_get_evms_proc_dir();
20484
+ if (evms_proc_dir) {
20485
+ create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL);
20487
+ md_table_header = register_sysctl_table(dev_dir_table, 1);
20490
+ /* Create MD recovery thread */
20491
+ evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name);
20492
+ if (!evms_md_recovery_thread)
20493
+ LOG_SERIOUS("%s: evms_cs_recovery_thread failed\n", __FUNCTION__);
20495
+ /* Register for reboot notification */
20496
+ md_register_reboot_notifier(&md_notifier);
20498
+ return evms_cs_register_plugin(&md_plugin_header);
20501
+static void __exit md_core_exit(void)
20503
+#ifdef CONFIG_PROC_FS
20504
+ struct proc_dir_entry *evms_proc_dir;
20506
+ evms_proc_dir = evms_cs_get_evms_proc_dir();
20507
+ if (evms_proc_dir) {
20508
+ remove_proc_entry("mdstat", evms_proc_dir);
20510
+ unregister_sysctl_table(md_table_header);
20512
+ evms_cs_unregister_plugin(&md_plugin_header);
20515
+module_init(md_core_init);
20516
+module_exit(md_core_exit);
20517
+#ifdef MODULE_LICENSE
20518
+MODULE_LICENSE("GPL");
20522
+ * In order to have the coexistence of this EVMS plugin and the orginal MD
20523
+ * module, the symbols exported by this plugin are prefixed with "evms_"
20526
+MD_EXPORT_SYMBOL(evms_md_size);
20527
+MD_EXPORT_SYMBOL(evms_register_md_personality);
20528
+MD_EXPORT_SYMBOL(evms_unregister_md_personality);
20529
+ /* Export the following function for use with rdev->node in evms_md_k.h */
20530
+MD_EXPORT_SYMBOL(evms_md_partition_name);
20531
+ /* Export the following function for use with disks[] in md_p.h */
20532
+//MD_EXPORT_SYMBOL(get_partition_name);
20533
+MD_EXPORT_SYMBOL(evms_md_error);
20534
+MD_EXPORT_SYMBOL(evms_md_update_sb);
20535
+MD_EXPORT_SYMBOL(evms_md_find_rdev_nr);
20536
+MD_EXPORT_SYMBOL(evms_md_print_devices);
20537
+MD_EXPORT_SYMBOL(evms_mddev_map);
20538
+MD_EXPORT_SYMBOL(evms_md_check_ordering);
20539
+MD_EXPORT_SYMBOL(evms_md_do_sync);
20540
+MD_EXPORT_SYMBOL(evms_md_sync_acct);
20541
+MD_EXPORT_SYMBOL(evms_md_done_sync);
20542
+MD_EXPORT_SYMBOL(evms_md_recover_arrays);
20543
+MD_EXPORT_SYMBOL(evms_md_get_spare);
20545
diff -Naur linux-2002-03-28/drivers/evms/md_linear.c evms-2002-03-28/drivers/evms/md_linear.c
20546
--- linux-2002-03-28/drivers/evms/md_linear.c Wed Dec 31 18:00:00 1969
20547
+++ evms-2002-03-28/drivers/evms/md_linear.c Thu Mar 28 16:28:59 2002
20550
+ linear.c : Multiple Devices driver for Linux
20551
+ Copyright (C) 1994-96 Marc ZYNGIER
20552
+ <zyngier@ufr-info-p7.ibp.fr> or
20553
+ <maz@gloups.fdn.fr>
20555
+ Linear mode management functions.
20557
+ This program is free software; you can redistribute it and/or modify
20558
+ it under the terms of the GNU General Public License as published by
20559
+ the Free Software Foundation; either version 2, or (at your option)
20560
+ any later version.
20562
+ You should have received a copy of the GNU General Public License
20563
+ (for example /usr/src/linux/COPYING); if not, write to the Free
20564
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20567
+#include <linux/module.h>
20568
+#include <linux/evms/evms_md.h>
20569
+#include <linux/evms/evms_linear.h>
20570
+#include <linux/slab.h>
20573
+#define MAJOR_NR MD_MAJOR
20575
+#define MD_PERSONALITY
20577
+#define LOG_PREFIX "md linear: "
20578
+static int linear_run (mddev_t *mddev)
20580
+ linear_conf_t *conf;
20581
+ struct linear_hash *table;
20582
+ mdk_rdev_t *rdev;
20583
+ int size, i, j, nb_zone;
20584
+ unsigned int curr_offset;
20586
+ MOD_INC_USE_COUNT;
20588
+ conf = kmalloc (sizeof (*conf), GFP_KERNEL);
20591
+ mddev->private = conf;
20593
+ if (evms_md_check_ordering(mddev)) {
20594
+ printk("linear: disks are not ordered, aborting!\n");
20599
+ * Find the smallest device.
20602
+ conf->smallest = NULL;
20604
+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
20605
+ dev_info_t *disk = conf->disks + j;
20606
+ disk->node = rdev->node;
20607
+ LOG_DETAILS(__FUNCTION__" is taking %s, total_vsectors=%Lu\n",
20608
+ disk->node->name,disk->node->total_vsectors);
20609
+ disk->dev = rdev->dev;
20610
+ disk->size = rdev->size;
20611
+ disk->offset = curr_offset;
20613
+ curr_offset += disk->size;
20615
+ if (!conf->smallest || (disk->size < conf->smallest->size))
20616
+ conf->smallest = disk;
20619
+ nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size +
20620
+ ((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
20622
+ conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
20624
+ if (!conf->hash_table)
20628
+ * Here we generate the linear hash table
20630
+ table = conf->hash_table;
20633
+ for (j = 0; j < mddev->nb_dev; j++) {
20634
+ dev_info_t *disk = conf->disks + j;
20637
+ table[-1].dev1 = disk;
20639
+ size += disk->size;
20642
+ table->dev0 = disk;
20643
+ table->dev1 = NULL;
20644
+ size -= conf->smallest->size;
20648
+ if (table-conf->hash_table != nb_zone)
20650
+ LOG_DETAILS(__FUNCTION__" EXIT nr_zones=%d, smallest=%lu\n",
20651
+ conf->nr_zones,conf->smallest->size);
20657
+ MOD_DEC_USE_COUNT;
20661
+static int linear_stop (mddev_t *mddev)
20663
+ linear_conf_t *conf = mddev_to_conf(mddev);
20665
+ kfree(conf->hash_table);
20668
+ MOD_DEC_USE_COUNT;
20674
+ * Function: linear_map
20676
+static int linear_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN)
20678
+ linear_conf_t *conf = mddev_to_conf(mddev);
20679
+ struct linear_hash *hash;
20680
+ dev_info_t *tmp_dev;
20683
+ block = (long)(*LSN >> 1);
20684
+ hash = conf->hash_table + (block / conf->smallest->size);
20685
+ if (block >= (hash->dev0->size + hash->dev0->offset)) {
20686
+ if (!hash->dev1) {
20687
+ LOG_ERROR(__FUNCTION__ " hash->dev1==NULL for block %ld\n",block);
20690
+ tmp_dev = hash->dev1;
20692
+ tmp_dev = hash->dev0;
20694
+ if (block >= (tmp_dev->size + tmp_dev->offset)
20695
+ || block < tmp_dev->offset) {
20696
+ LOG_ERROR(__FUNCTION__" Block %ld out of bounds on node %s size %ld offset %ld\n",
20698
+ tmp_dev->node->name,
20700
+ tmp_dev->offset);
20703
+ *LSN -= (evms_sector_t)(tmp_dev->offset << 1);
20704
+ *node = tmp_dev->node;
20708
+static int linear_init_io(mddev_t *mddev,
20710
+ evms_sector_t LSN,
20711
+ evms_sector_t nr_sects,
20715
+ evms_logical_node_t *node;
20717
+ LOG_ENTRY_EXIT(__FUNCTION__" LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
20718
+ rc = linear_map(mddev, &node, &LSN);
20720
+ rc = INIT_IO(node, rw, LSN, nr_sects, data);
20724
+static int linear_make_request (mddev_t *mddev,
20728
+ evms_logical_node_t *node;
20731
+ rc = linear_map(mddev, &node, &eio->rsector);
20734
+ if (rw == READ) {
20739
+ return 1; /* success */
20741
+ LOG_ERROR(__FUNCTION__ " FAILED %s node(%s) rsector(%Lu)\n",
20742
+ (rw == READ) ? "READ" : "WRITE",node->name,eio->rsector);
20744
+ EVMS_IO_ERROR(eio);
20749
+static int linear_status (char *page, mddev_t *mddev)
20756
+ linear_conf_t *conf = mddev_to_conf(mddev);
20758
+ sz += sprintf(page+sz, " ");
20759
+ for (j = 0; j < conf->nr_zones; j++)
20761
+ sz += sprintf(page+sz, "[%s",
20762
+ partition_name(conf->hash_table[j].dev0->dev));
20764
+ if (conf->hash_table[j].dev1)
20765
+ sz += sprintf(page+sz, "/%s] ",
20766
+ partition_name(conf->hash_table[j].dev1->dev));
20768
+ sz += sprintf(page+sz, "] ");
20770
+ sz += sprintf(page+sz, "\n");
20772
+ sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
20776
+static int linear_evms_ioctl (
20778
+ struct inode * inode,
20779
+ struct file * file,
20780
+ unsigned int cmd,
20781
+ unsigned long arg)
20784
+ evms_logical_node_t *node;
20787
+ case EVMS_GET_BMAP:
20789
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
20790
+ rc = linear_map(mddev,&node, &bmap->rsector);
20793
+ rc = IOCTL(node, inode, file, cmd, arg);
20806
+static mdk_personality_t linear_personality=
20808
+ name: "evms_linear",
20809
+ init_io: linear_init_io,
20810
+ make_request: linear_make_request,
20812
+ stop: linear_stop,
20813
+ status: linear_status,
20814
+ evms_ioctl: linear_evms_ioctl
20817
+static int md__init linear_init (void)
20819
+ return evms_register_md_personality (LINEAR, &linear_personality);
20822
+static void linear_exit (void)
20824
+ evms_unregister_md_personality (LINEAR);
20828
+module_init(linear_init);
20829
+module_exit(linear_exit);
20830
+#ifdef MODULE_LICENSE
20831
+MODULE_LICENSE("GPL");
20833
diff -Naur linux-2002-03-28/drivers/evms/md_raid0.c evms-2002-03-28/drivers/evms/md_raid0.c
20834
--- linux-2002-03-28/drivers/evms/md_raid0.c Wed Dec 31 18:00:00 1969
20835
+++ evms-2002-03-28/drivers/evms/md_raid0.c Thu Mar 28 16:28:46 2002
20838
+ raid0.c : Multiple Devices driver for Linux
20839
+ Copyright (C) 1994-96 Marc ZYNGIER
20840
+ <zyngier@ufr-info-p7.ibp.fr> or
20841
+ <maz@gloups.fdn.fr>
20842
+ Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
20845
+ RAID-0 management functions.
20847
+ This program is free software; you can redistribute it and/or modify
20848
+ it under the terms of the GNU General Public License as published by
20849
+ the Free Software Foundation; either version 2, or (at your option)
20850
+ any later version.
20852
+ You should have received a copy of the GNU General Public License
20853
+ (for example /usr/src/linux/COPYING); if not, write to the Free
20854
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20857
+#include <linux/module.h>
20858
+#include <linux/evms/evms_raid0.h>
20860
+#define MAJOR_NR MD_MAJOR
20862
+#define MD_PERSONALITY
20864
+#define LOG_PREFIX "md raid0: "
20866
+static int create_strip_zones (mddev_t *mddev)
20868
+ int i, c, j, j1, j2;
20869
+ unsigned long current_offset, curr_zone_offset;
20870
+ raid0_conf_t *conf = mddev_to_conf(mddev);
20871
+ mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
20874
+ * The number of 'same size groups'
20876
+ conf->nr_strip_zones = 0;
20878
+ ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
20879
+ LOG_DETAILS(" looking at %s\n", evms_md_partition_name(rdev1->node));
20881
+ ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
20882
+ LOG_DETAILS(" comparing %s(%ld) with %s(%ld)\n",
20883
+ evms_md_partition_name(rdev1->node), rdev1->size,
20884
+ evms_md_partition_name(rdev2->node), rdev2->size);
20885
+ if (rdev2 == rdev1) {
20886
+ LOG_DETAILS(" END\n");
20889
+ if (rdev2->size == rdev1->size)
20892
+ * Not unique, dont count it as a new
20895
+ LOG_DETAILS(" EQUAL\n");
20899
+ LOG_DETAILS(" NOT EQUAL\n");
20902
+ LOG_DETAILS(" ==> UNIQUE\n");
20903
+ conf->nr_strip_zones++;
20904
+ LOG_DETAILS(" %d zones\n",conf->nr_strip_zones);
20907
+ LOG_DETAILS(" FINAL %d zones\n",conf->nr_strip_zones);
20909
+ conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
20910
+ conf->nr_strip_zones);
20911
+ if (!conf->strip_zone)
20915
+ conf->smallest = NULL;
20916
+ current_offset = 0;
20917
+ curr_zone_offset = 0;
20919
+ for (i = 0; i < conf->nr_strip_zones; i++)
20921
+ struct strip_zone *zone = conf->strip_zone + i;
20923
+ LOG_DETAILS(" zone %d\n", i);
20924
+ zone->dev_offset = current_offset;
20928
+ ITERATE_RDEV_ORDERED(mddev,rdev,j) {
20930
+ LOG_DETAILS(" checking %s ...",evms_md_partition_name(rdev->node));
20931
+ if (rdev->size > current_offset)
20933
+ LOG_DETAILS(" contained as device %d\n", c);
20934
+ zone->dev[c] = rdev;
20936
+ if (!smallest || (rdev->size <smallest->size)) {
20938
+ LOG_DETAILS(" (%ld) is smallest!.\n", rdev->size);
20941
+ LOG_DETAILS(" nope.\n");
20944
+ zone->nb_dev = c;
20945
+ zone->size = (smallest->size - current_offset) * c;
20946
+ LOG_DETAILS(" zone->nb_dev: %d, size: %ld\n",
20947
+ zone->nb_dev,zone->size);
20949
+ if (!conf->smallest || (zone->size < conf->smallest->size))
20950
+ conf->smallest = zone;
20952
+ zone->zone_offset = curr_zone_offset;
20953
+ curr_zone_offset += zone->size;
20955
+ current_offset = smallest->size;
20956
+ LOG_DETAILS(" current zone offset: %ld\n",current_offset);
20958
+ LOG_DETAILS(" done.\n");
20962
+static int raid0_run (mddev_t *mddev)
20964
+ unsigned long cur=0, i=0, size, zone0_size, nb_zone;
20965
+ raid0_conf_t *conf;
20967
+ MOD_INC_USE_COUNT;
20969
+ conf = vmalloc(sizeof (raid0_conf_t));
20972
+ mddev->private = (void *)conf;
20974
+ if (evms_md_check_ordering(mddev)) {
20975
+ LOG_ERROR("disks are not ordered, aborting!\n");
20976
+ goto out_free_conf;
20979
+ if (create_strip_zones (mddev))
20980
+ goto out_free_conf;
20982
+ LOG_DETAILS("evms_md_size is %d blocks.\n", evms_md_size[mdidx(mddev)]);
20983
+ LOG_DETAILS("conf->smallest->size is %ld blocks.\n", conf->smallest->size);
20984
+ nb_zone = evms_md_size[mdidx(mddev)]/conf->smallest->size +
20985
+ (evms_md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
20986
+ LOG_DETAILS("nb_zone is %ld.\n", nb_zone);
20987
+ conf->nr_zones = nb_zone;
20989
+ LOG_DETAILS("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash));
20991
+ conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
20992
+ if (!conf->hash_table)
20993
+ goto out_free_zone_conf;
20994
+ size = conf->strip_zone[cur].size;
20997
+ while (cur < conf->nr_strip_zones) {
20998
+ conf->hash_table[i].zone0 = conf->strip_zone + cur;
21001
+ * If we completely fill the slot
21003
+ if (size >= conf->smallest->size) {
21004
+ conf->hash_table[i++].zone1 = NULL;
21005
+ size -= conf->smallest->size;
21008
+ if (++cur == conf->nr_strip_zones)
21010
+ size = conf->strip_zone[cur].size;
21014
+ if (++cur == conf->nr_strip_zones) {
21016
+ * Last dev, set unit1 as NULL
21018
+ conf->hash_table[i].zone1=NULL;
21023
+ * Here we use a 2nd dev to fill the slot
21025
+ zone0_size = size;
21026
+ size = conf->strip_zone[cur].size;
21027
+ conf->hash_table[i++].zone1 = conf->strip_zone + cur;
21028
+ size -= (conf->smallest->size - zone0_size);
21032
+out_free_zone_conf:
21033
+ vfree(conf->strip_zone);
21034
+ conf->strip_zone = NULL;
21038
+ mddev->private = NULL;
21040
+ MOD_DEC_USE_COUNT;
21044
+static int raid0_stop (mddev_t *mddev)
21046
+ raid0_conf_t *conf = mddev_to_conf(mddev);
21048
+ vfree (conf->hash_table);
21049
+ conf->hash_table = NULL;
21050
+ vfree (conf->strip_zone);
21051
+ conf->strip_zone = NULL;
21053
+ mddev->private = NULL;
21055
+ MOD_DEC_USE_COUNT;
21061
+ * Function: raid0_map
21063
+ * Return 0 for success, else error
21065
+ * Comment from original code:
21067
+ * FIXME - We assume some things here :
21068
+ * - requested buffers NEVER bigger than chunk size,
21069
+ * - requested buffers NEVER cross stripes limits.
21070
+ * Of course, those facts may not be valid anymore (and surely won't...)
21071
+ * Hey guys, there's some work out there ;-)
21074
+static inline int raid0_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN, evms_sector_t size)
21076
+ unsigned int sect_in_chunk, chunksize_bits, chunk_size;
21077
+ raid0_conf_t *conf = mddev_to_conf(mddev);
21078
+ struct raid0_hash *hash;
21079
+ struct strip_zone *zone;
21080
+ mdk_rdev_t *tmp_dev;
21081
+ unsigned long chunk, block, rsect;
21082
+ unsigned long b_rsector;
21083
+ unsigned int b_size;
21085
+ b_rsector = (unsigned long)*LSN;
21086
+ b_size = (unsigned int)size;
21088
+ chunk_size = mddev->param.chunk_size >> 10;
21089
+ chunksize_bits = ffz(~chunk_size);
21090
+ block = b_rsector >> 1;
21091
+ hash = conf->hash_table + block / conf->smallest->size;
21093
+ /* Sanity check */
21094
+ if (chunk_size < (block % chunk_size) + (b_size >> 10))
21100
+ if (!hash->zone0)
21103
+ if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
21104
+ if (!hash->zone1)
21106
+ zone = hash->zone1;
21108
+ zone = hash->zone0;
21110
+ sect_in_chunk = b_rsector & ((chunk_size<<1) -1);
21111
+ chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
21112
+ tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
21113
+ rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1)
21117
+ * The new BH_Lock semantics in ll_rw_blk.c guarantee that this
21118
+ * is the only IO operation happening on this bh.
21120
+ *LSN = (evms_sector_t)rsect;
21121
+ *node = tmp_dev->node;
21125
+ LOG_ERROR(__FUNCTION__ " bug: can't convert block across chunks or bigger than %dk %ld %d\n",
21126
+ chunk_size, b_rsector, b_size >> 10);
21129
+ LOG_ERROR(__FUNCTION__ " bug: hash==NULL for block %ld\n",block);
21132
+ LOG_ERROR(__FUNCTION__ " bug: hash->zone0==NULL for block %ld\n", block);
21135
+ LOG_ERROR(__FUNCTION__ " bug: hash->zone1==NULL for block %ld\n",block);
21141
+ * Function: raid0_init_io
21143
+static int raid0_init_io(
21146
+ evms_sector_t LSN,
21147
+ evms_sector_t nr_sects,
21151
+ evms_logical_node_t *node;
21153
+ LOG_ENTRY_EXIT(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
21154
+ rc = raid0_map(mddev, &node, &LSN, nr_sects);
21156
+ rc = INIT_IO(node, rw, LSN, nr_sects, data);
21160
+static int raid0_make_request (
21165
+ evms_logical_node_t *node;
21168
+ rc = raid0_map(mddev, &node, &eio->rsector, eio->rsize);
21170
+ if (rw == READ) {
21175
+ return 1; /* success */
21177
+ LOG_ERROR(__FUNCTION__ " FAILED %s node(%s) rsector(%Lu)\n",
21178
+ (rw == READ) ? "READ" : "WRITE",node->name,eio->rsector);
21180
+ EVMS_IO_ERROR(eio);
21186
+static int raid0_status (char *page, mddev_t *mddev)
21192
+ raid0_conf_t *conf = mddev_to_conf(mddev);
21194
+ sz += sprintf(page + sz, " ");
21195
+ for (j = 0; j < conf->nr_zones; j++) {
21196
+ sz += sprintf(page + sz, "[z%d",
21197
+ conf->hash_table[j].zone0 - conf->strip_zone);
21198
+ if (conf->hash_table[j].zone1)
21199
+ sz += sprintf(page+sz, "/z%d] ",
21200
+ conf->hash_table[j].zone1 - conf->strip_zone);
21202
+ sz += sprintf(page+sz, "] ");
21205
+ sz += sprintf(page + sz, "\n");
21207
+ for (j = 0; j < conf->nr_strip_zones; j++) {
21208
+ sz += sprintf(page + sz, " z%d=[", j);
21209
+ for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
21210
+ sz += sprintf (page+sz, "%s/", partition_name(
21211
+ conf->strip_zone[j].dev[k]->dev));
21213
+ sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
21214
+ conf->strip_zone[j].zone_offset,
21215
+ conf->strip_zone[j].dev_offset,
21216
+ conf->strip_zone[j].size);
21219
+ sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
21223
+static int raid0_evms_ioctl (
21225
+ struct inode * inode,
21226
+ struct file * file,
21227
+ unsigned int cmd,
21228
+ unsigned long arg)
21231
+ evms_logical_node_t *node;
21234
+ case EVMS_GET_BMAP:
21236
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
21237
+ rc = raid0_map(mddev,&node, &bmap->rsector, mddev->node->block_size);
21240
+ rc = IOCTL(node, inode, file, cmd, arg);
21253
+static mdk_personality_t raid0_personality=
21255
+ name: "evms_raid0",
21256
+ init_io: raid0_init_io,
21257
+ make_request: raid0_make_request,
21259
+ stop: raid0_stop,
21260
+ status: raid0_status,
21261
+ evms_ioctl: raid0_evms_ioctl
21264
+static int md__init raid0_init (void)
21266
+ return evms_register_md_personality (RAID0, &raid0_personality);
21269
+static void raid0_exit (void)
21271
+ evms_unregister_md_personality (RAID0);
21274
+module_init(raid0_init);
21275
+module_exit(raid0_exit);
21276
+#ifdef MODULE_LICENSE
21277
+MODULE_LICENSE("GPL");
21279
diff -Naur linux-2002-03-28/drivers/evms/md_raid1.c evms-2002-03-28/drivers/evms/md_raid1.c
21280
--- linux-2002-03-28/drivers/evms/md_raid1.c Wed Dec 31 18:00:00 1969
21281
+++ evms-2002-03-28/drivers/evms/md_raid1.c Wed Mar 27 09:07:59 2002
21284
+ * md_raid1.c : Multiple Devices driver for Linux
21286
+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
21288
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
21290
+ * RAID-1 management functions.
21292
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
21294
+ * Fixes to reconstruction by Jakob �stergaard" <jakob@ostenfeld.dk>
21295
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
21297
+ * 'md_raid1.c' is an EVMS version of linux/drivers/md/raid1.c modified
21298
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
21300
+ * This program is free software; you can redistribute it and/or modify
21301
+ * it under the terms of the GNU General Public License as published by
21302
+ * the Free Software Foundation; either version 2, or (at your option)
21303
+ * any later version.
21305
+ * You should have received a copy of the GNU General Public License
21306
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
21307
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21310
+#include <linux/module.h>
21311
+#include <linux/slab.h>
21312
+#include <linux/evms/evms_raid1.h>
21313
+#include <asm/atomic.h>
21315
+#define MAJOR_NR MD_MAJOR
21317
+#define MD_PERSONALITY
21319
+#define MAX_WORK_PER_DISK 128
21321
+#define NR_RESERVED_BUFS 32
21323
+#define LOG_PREFIX "md raid1: "
21325
+ * The following can be used to debug the driver
21327
+#define RAID1_DEBUG 0
21330
+#define PRINTK(x...) LOG_DEFAULT(x)
21332
+#define __inline__
21334
+#define PRINTK(x...) do { } while (0)
21338
+static mdk_personality_t raid1_personality;
21339
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
21340
+struct raid1_bh *evms_raid1_retry_list = NULL, **evms_raid1_retry_tail;
21342
+static inline void add_node_mapping(
21343
+ struct raid1_bh *r1_bh,
21344
+ evms_logical_node_t *node,
21345
+ struct buffer_head *bh)
21348
+ for (i=0; i<MD_SB_DISKS; i++) {
21349
+ if (!r1_bh->mirror_node_map[i].node) {
21350
+ r1_bh->mirror_node_map[i].node = node;
21351
+ r1_bh->mirror_node_map[i].bh = bh;
21355
+ LOG_ERROR(__FUNCTION__" Cannot create mapping for %s\n",node->name);
21358
+static inline evms_logical_node_t * bh_to_node(
21359
+ struct raid1_bh *r1_bh,
21360
+ struct buffer_head *bh)
21363
+ for (i=0; i<MD_SB_DISKS; i++) {
21364
+ if (r1_bh->mirror_node_map[i].bh == bh) {
21365
+ return r1_bh->mirror_node_map[i].node;
21368
+ LOG_ERROR(__FUNCTION__" Cannot find mapping for bh(%p)\n",bh);
21372
+static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
21374
+ /* return a linked list of "cnt" struct buffer_heads.
21375
+ * don't take any off the free list unless we know we can
21376
+ * get all we need, otherwise we could deadlock
21378
+ struct buffer_head *bh=NULL;
21381
+ struct buffer_head *t;
21382
+ md_spin_lock_irq(&conf->device_lock);
21383
+ if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
21385
+ t = conf->freebh;
21386
+ conf->freebh = t->b_next;
21390
+ conf->freebh_cnt--;
21393
+ md_spin_unlock_irq(&conf->device_lock);
21396
+ t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
21402
+ PRINTK("raid1: waiting for %d bh\n", cnt);
21403
+ conf->freebh_blocked = 1;
21404
+ wait_disk_event(conf->wait_buffer,
21405
+ !conf->freebh_blocked ||
21406
+ conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
21407
+ conf->freebh_blocked = 0;
21413
+static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
21415
+ unsigned long flags;
21416
+ spin_lock_irqsave(&conf->device_lock, flags);
21418
+ struct buffer_head *t = bh;
21420
+ if (t->b_pprev == NULL)
21421
+ kmem_cache_free(bh_cachep, t);
21423
+ t->b_next= conf->freebh;
21424
+ conf->freebh = t;
21425
+ conf->freebh_cnt++;
21428
+ spin_unlock_irqrestore(&conf->device_lock, flags);
21429
+ wake_up(&conf->wait_buffer);
21432
+static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
21434
+ /* allocate cnt buffer_heads, possibly less if kmalloc fails */
21437
+ while (i < cnt) {
21438
+ struct buffer_head *bh;
21439
+ bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
21442
+ md_spin_lock_irq(&conf->device_lock);
21443
+ bh->b_pprev = &conf->freebh;
21444
+ bh->b_next = conf->freebh;
21445
+ conf->freebh = bh;
21446
+ conf->freebh_cnt++;
21447
+ md_spin_unlock_irq(&conf->device_lock);
21454
+static void raid1_shrink_bh(raid1_conf_t *conf)
21456
+ /* discard all buffer_heads */
21458
+ md_spin_lock_irq(&conf->device_lock);
21459
+ while (conf->freebh) {
21460
+ struct buffer_head *bh = conf->freebh;
21461
+ conf->freebh = bh->b_next;
21462
+ kmem_cache_free(bh_cachep, bh);
21463
+ conf->freebh_cnt--;
21465
+ md_spin_unlock_irq(&conf->device_lock);
21469
+static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
21471
+ struct raid1_bh *r1_bh = NULL;
21474
+ md_spin_lock_irq(&conf->device_lock);
21475
+ if (!conf->freer1_blocked && conf->freer1) {
21476
+ r1_bh = conf->freer1;
21477
+ conf->freer1 = r1_bh->next_r1;
21478
+ conf->freer1_cnt--;
21479
+ r1_bh->next_r1 = NULL;
21480
+ r1_bh->state = (1 << R1BH_PreAlloc);
21481
+ r1_bh->bh_req.b_state = 0;
21482
+ memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
21484
+ md_spin_unlock_irq(&conf->device_lock);
21487
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
21489
+ memset(r1_bh, 0, sizeof(*r1_bh));
21492
+ conf->freer1_blocked = 1;
21493
+ wait_disk_event(conf->wait_buffer,
21494
+ !conf->freer1_blocked ||
21495
+ conf->freer1_cnt > NR_RESERVED_BUFS/2
21497
+ conf->freer1_blocked = 0;
21501
+static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
21503
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
21504
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
21506
+ r1_bh->mirror_bh_list = NULL;
21508
+ if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
21509
+ unsigned long flags;
21510
+ spin_lock_irqsave(&conf->device_lock, flags);
21511
+ r1_bh->next_r1 = conf->freer1;
21512
+ conf->freer1 = r1_bh;
21513
+ conf->freer1_cnt++;
21514
+ spin_unlock_irqrestore(&conf->device_lock, flags);
21515
+ /* don't need to wakeup wait_buffer because
21516
+ * raid1_free_bh below will do that
21521
+ raid1_free_bh(conf, bh);
21524
+static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
21528
+ while (i < cnt) {
21529
+ struct raid1_bh *r1_bh;
21530
+ r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
21533
+ memset(r1_bh, 0, sizeof(*r1_bh));
21534
+ set_bit(R1BH_PreAlloc, &r1_bh->state);
21535
+ r1_bh->mddev = conf->mddev;
21537
+ raid1_free_r1bh(r1_bh);
21543
+static void raid1_shrink_r1bh(raid1_conf_t *conf)
21545
+ md_spin_lock_irq(&conf->device_lock);
21546
+ while (conf->freer1) {
21547
+ struct raid1_bh *r1_bh = conf->freer1;
21548
+ conf->freer1 = r1_bh->next_r1;
21549
+ conf->freer1_cnt--;
21552
+ md_spin_unlock_irq(&conf->device_lock);
21557
+static inline void raid1_free_buf(struct raid1_bh *r1_bh)
21559
+ unsigned long flags;
21560
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
21561
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
21562
+ r1_bh->mirror_bh_list = NULL;
21564
+ spin_lock_irqsave(&conf->device_lock, flags);
21565
+ r1_bh->next_r1 = conf->freebuf;
21566
+ conf->freebuf = r1_bh;
21567
+ spin_unlock_irqrestore(&conf->device_lock, flags);
21568
+ raid1_free_bh(conf, bh);
21571
+static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
21573
+ struct raid1_bh *r1_bh;
21575
+ md_spin_lock_irq(&conf->device_lock);
21576
+ wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
21577
+ r1_bh = conf->freebuf;
21578
+ conf->freebuf = r1_bh->next_r1;
21579
+ r1_bh->next_r1= NULL;
21580
+ md_spin_unlock_irq(&conf->device_lock);
21581
+ memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
21585
+static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
21589
+ md_spin_lock_irq(&conf->device_lock);
21590
+ while (i < cnt) {
21591
+ struct raid1_bh *r1_bh;
21592
+ struct page *page;
21594
+ page = alloc_page(GFP_KERNEL);
21598
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
21600
+ __free_page(page);
21603
+ memset(r1_bh, 0, sizeof(*r1_bh));
21604
+ r1_bh->bh_req.b_page = page;
21605
+ r1_bh->bh_req.b_data = page_address(page);
21606
+ r1_bh->next_r1 = conf->freebuf;
21607
+ conf->freebuf = r1_bh;
21610
+ md_spin_unlock_irq(&conf->device_lock);
21614
+static void raid1_shrink_buffers (raid1_conf_t *conf)
21616
+ md_spin_lock_irq(&conf->device_lock);
21617
+ while (conf->freebuf) {
21618
+ struct raid1_bh *r1_bh = conf->freebuf;
21619
+ conf->freebuf = r1_bh->next_r1;
21620
+ __free_page(r1_bh->bh_req.b_page);
21623
+ md_spin_unlock_irq(&conf->device_lock);
21628
+ * EVMS raid1 version of raid1_map()
21630
+static int evms_raid1_map (mddev_t *mddev, evms_logical_node_t **node)
21632
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21636
+ * Later we do read balancing on the read side
21637
+ * now we use the first available disk.
21640
+ for (i = 0; i < MD_SB_DISKS; i++) {
21641
+ if (conf->mirrors[i].operational) {
21642
+ *node = conf->mirrors[i].node;
21647
+ LOG_ERROR("huh, no more operational devices?\n");
21652
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
21654
+ unsigned long flags;
21655
+ mddev_t *mddev = r1_bh->mddev;
21656
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21658
+ md_spin_lock_irqsave(&retry_list_lock, flags);
21659
+ if (evms_raid1_retry_list == NULL)
21660
+ evms_raid1_retry_tail = &evms_raid1_retry_list;
21661
+ *evms_raid1_retry_tail = r1_bh;
21662
+ evms_raid1_retry_tail = &r1_bh->next_r1;
21663
+ r1_bh->next_r1 = NULL;
21664
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
21665
+ evms_cs_wakeup_thread(conf->thread);
21669
+static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
21671
+ unsigned long flags;
21672
+ spin_lock_irqsave(&conf->segment_lock, flags);
21673
+ if (sector < conf->start_active)
21674
+ conf->cnt_done--;
21675
+ else if (sector >= conf->start_future && conf->phase == phase)
21676
+ conf->cnt_future--;
21677
+ else if (!--conf->cnt_pending)
21678
+ wake_up(&conf->wait_ready);
21680
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
21683
+static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
21685
+ unsigned long flags;
21686
+ spin_lock_irqsave(&conf->segment_lock, flags);
21687
+ if (sector >= conf->start_ready)
21688
+ --conf->cnt_ready;
21689
+ else if (sector >= conf->start_active) {
21690
+ if (!--conf->cnt_active) {
21691
+ conf->start_active = conf->start_ready;
21692
+ wake_up(&conf->wait_done);
21695
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
21699
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
21700
+ * operation and are ready to return a success/failure code to the buffer
21703
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
21705
+ struct buffer_head *bh = r1_bh->master_bh;
21706
+ unsigned long rsector = (unsigned long)r1_bh->eio.rsector;
21708
+ //io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
21709
+ io_request_done(rsector, mddev_to_conf(r1_bh->mddev),
21710
+ test_bit(R1BH_SyncPhase, &r1_bh->state));
21712
+ bh->b_end_io(bh, uptodate);
21713
+ raid1_free_r1bh(r1_bh);
21716
+void evms_raid1_end_request (struct buffer_head *bh, int uptodate)
21718
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
21721
+ * this branch is our 'one mirror IO has finished' event handler:
21726
+ evms_md_error (r1_bh->mddev, r1_bh->node);
21727
+ else { /* WRITE */
21728
+ evms_logical_node_t *node;
21729
+ node = bh_to_node(r1_bh,bh);
21731
+ evms_md_error (r1_bh->mddev, node);
21735
+ * Set R1BH_Uptodate in our master buffer_head, so that
21736
+ * we will return a good error code for to the higher
21737
+ * levels even if IO on some other mirrored buffer fails.
21739
+ * The 'master' represents the complex operation to
21740
+ * user-side. So if something waits for IO, then it will
21741
+ * wait for the 'master' buffer_head.
21743
+ set_bit (R1BH_Uptodate, &r1_bh->state);
21746
+ * We split up the read and write side, imho they are
21747
+ * conceptually different.
21750
+ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
21752
+ * we have only one buffer_head on the read side
21756
+ raid1_end_bh_io(r1_bh, uptodate);
21760
+ * oops, read error:
21762
+ LOG_ERROR("rescheduling block %lu\n", bh->b_blocknr);
21763
+ raid1_reschedule_retry(r1_bh);
21770
+ * Let's see if all mirrored write operations have finished
21774
+ if (atomic_dec_and_test(&r1_bh->remaining))
21775
+ raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
21779
+ * This routine returns the disk from which the requested read should
21780
+ * be done. It bookkeeps the last read position for every disk
21781
+ * in array and when new read requests come, the disk which last
21782
+ * position is nearest to the request, is chosen.
21784
+ * TODO: now if there are 2 mirrors in the same 2 devices, performance
21785
+ * degrades dramatically because position is mirror, not device based.
21786
+ * This should be changed to be device based. Also atomic sequential
21787
+ * reads should be somehow balanced.
21790
+//static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
21791
+static int raid1_read_balance (raid1_conf_t *conf, eio_t *eio)
21793
+ int new_disk = conf->last_used;
21794
+ //const int sectors = bh->b_size >> 9;
21795
+ const int sectors = (int)eio->rsize;
21796
+ //const unsigned long this_sector = bh->b_rsector;
21797
+ const unsigned long this_sector = (unsigned long)eio->rsector;
21798
+ int disk = new_disk;
21799
+ unsigned long new_distance;
21800
+ unsigned long current_distance;
21803
+ * Check if it is sane at all to balance
21806
+ if (conf->resync_mirrors)
21810
+ /* make sure that disk is operational */
21811
+ while( !conf->mirrors[new_disk].operational) {
21812
+ if (new_disk <= 0) new_disk = conf->raid_disks;
21814
+ if (new_disk == disk) {
21816
+ * This means no working disk was found
21817
+ * Nothing much to do, lets not change anything
21818
+ * and hope for the best...
21821
+ new_disk = conf->last_used;
21827
+ /* now disk == new_disk == starting point for search */
21830
+ * Don't touch anything for sequential reads.
21833
+ if (this_sector == conf->mirrors[new_disk].head_position)
21837
+ * If reads have been done only on a single disk
21838
+ * for a time, lets give another disk a change.
21839
+ * This is for kicking those idling disks so that
21840
+ * they would find work near some hotspot.
21843
+ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
21844
+ conf->sect_count = 0;
21848
+ new_disk = conf->raid_disks;
21850
+ if (new_disk == disk)
21852
+ } while ((conf->mirrors[new_disk].write_only) ||
21853
+ (!conf->mirrors[new_disk].operational));
21858
+ current_distance = abs(this_sector -
21859
+ conf->mirrors[disk].head_position);
21861
+ /* Find the disk which is closest */
21865
+ disk = conf->raid_disks;
21868
+ if ((conf->mirrors[disk].write_only) ||
21869
+ (!conf->mirrors[disk].operational))
21872
+ new_distance = abs(this_sector -
21873
+ conf->mirrors[disk].head_position);
21875
+ if (new_distance < current_distance) {
21876
+ conf->sect_count = 0;
21877
+ current_distance = new_distance;
21880
+ } while (disk != conf->last_used);
21883
+ conf->mirrors[new_disk].head_position = this_sector + sectors;
21885
+ conf->last_used = new_disk;
21886
+ conf->sect_count += sectors;
21892
+static int raid1_init_io(mddev_t *mddev,
21894
+ evms_sector_t LSN,
21895
+ evms_sector_t nr_sects,
21899
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21900
+ struct mirror_info *mirror;
21902
+ LOG_EXTRA(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
21904
+ if (rw == READ) {
21906
+ * read balancing logic:
21909
+ eio.rsector = LSN;
21910
+ eio.rsize = nr_sects;
21911
+ mirror = conf->mirrors + raid1_read_balance(conf, &eio);
21913
+ return INIT_IO(mirror->node, rw, LSN, nr_sects, data);
21916
+ int saved_rc = 0;
21917
+ for (i=0; i< MD_SB_DISKS; i++) {
21918
+ if (!conf->mirrors[i].operational)
21920
+ rc = INIT_IO(conf->mirrors[i].node, rw, LSN, nr_sects, data);
21922
+ LOG_ERROR(__FUNCTION__ " WRITE failed on %s, rc=%d\n",
21923
+ conf->mirrors[i].node->name, rc);
21934
+static int raid1_make_request (mddev_t *mddev,
21938
+ struct buffer_head *bh = eio->bh;
21939
+ unsigned long rsector = (unsigned long)eio->rsector;
21940
+ raid1_conf_t *conf = mddev_to_conf(mddev);
21941
+ struct buffer_head *bh_req;
21942
+ struct raid1_bh * r1_bh;
21943
+ int disks = MD_SB_DISKS;
21944
+ struct buffer_head *bhl;
21945
+ int i, sum_bhs = 0;
21946
+ struct mirror_info *mirror;
21948
+ if (!buffer_locked(bh))
21952
+ * make_request() can abort the operation when READA is being
21953
+ * used and no empty request is available.
21955
+ * Currently, just replace the command with READ/WRITE.
21960
+ r1_bh = raid1_alloc_r1bh (conf);
21962
+ spin_lock_irq(&conf->segment_lock);
21963
+ wait_event_lock_irq(conf->wait_done,
21964
+ rsector < conf->start_active ||
21965
+ rsector >= conf->start_future,
21966
+ conf->segment_lock);
21967
+ if (rsector < conf->start_active)
21968
+ conf->cnt_done++;
21970
+ conf->cnt_future++;
21972
+ set_bit(R1BH_SyncPhase, &r1_bh->state);
21974
+ spin_unlock_irq(&conf->segment_lock);
21977
+ * i think the read and write branch should be separated completely,
21978
+ * since we want to do read balancing on the read side for example.
21979
+ * Alternative implementations? :) --mingo
21982
+ r1_bh->master_bh = bh;
21983
+ r1_bh->mddev = mddev;
21986
+ if (rw == READ) {
21988
+ * read balancing logic:
21990
+ //mirror = conf->mirrors + raid1_read_balance(conf, bh);
21991
+ mirror = conf->mirrors + raid1_read_balance(conf, eio);
21993
+ bh_req = &r1_bh->bh_req;
21994
+ memcpy(bh_req, bh, sizeof(*bh));
21995
+ bh_req->b_blocknr = rsector;
21996
+ bh_req->b_dev = mirror->dev;
21997
+ bh_req->b_rdev = mirror->dev;
21998
+ /* bh_req->b_rsector = bh->n_rsector; */
21999
+ bh_req->b_end_io = evms_raid1_end_request;
22000
+ bh_req->b_private = r1_bh;
22001
+ //generic_make_request (rw, bh_req);
22002
+ eio->bh = bh_req;
22003
+ r1_bh->node = mirror->node;
22004
+ r1_bh->eio = *eio;
22005
+ R_IO(mirror->node, eio);
22013
+ bhl = raid1_alloc_bh(conf, conf->raid_disks);
22014
+ r1_bh->node = NULL;
22015
+ r1_bh->eio = *eio;
22016
+ for (i = 0; i < disks; i++) {
22017
+ struct buffer_head *mbh;
22018
+ if (!conf->mirrors[i].operational)
22022
+ * We should use a private pool (size depending on NR_REQUEST),
22023
+ * to avoid writes filling up the memory with bhs
22025
+ * Such pools are much faster than kmalloc anyways (so we waste
22026
+ * almost nothing by not using the master bh when writing and
22027
+ * win alot of cleanness) but for now we are cool enough. --mingo
22029
+ * It's safe to sleep here, buffer heads cannot be used in a shared
22030
+ * manner in the write branch. Look how we lock the buffer at the
22031
+ * beginning of this function to grok the difference ;)
22034
+ if (mbh == NULL) {
22038
+ bhl = mbh->b_next;
22039
+ mbh->b_next = NULL;
22040
+ mbh->b_this_page = (struct buffer_head *)1;
22043
+ * prepare mirrored mbh (fields ordered for max mem throughput):
22045
+ mbh->b_blocknr = rsector;
22046
+ mbh->b_dev = conf->mirrors[i].dev;
22047
+ mbh->b_rdev = conf->mirrors[i].dev;
22048
+ mbh->b_rsector = rsector;
22049
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
22050
+ (1<<BH_Mapped) | (1<<BH_Lock);
22052
+ atomic_set(&mbh->b_count, 1);
22053
+ mbh->b_size = bh->b_size;
22054
+ mbh->b_page = bh->b_page;
22055
+ mbh->b_data = bh->b_data;
22056
+ mbh->b_list = BUF_LOCKED;
22057
+ mbh->b_end_io = evms_raid1_end_request;
22058
+ //mbh->b_private = r1_bh;
22059
+ mbh->b_private = conf->mirrors[i].node;
22061
+ mbh->b_next = r1_bh->mirror_bh_list;
22062
+ r1_bh->mirror_bh_list = mbh;
22065
+ if (bhl) raid1_free_bh(conf,bhl);
22067
+ /* Gag - all mirrors non-operational.. */
22068
+ raid1_end_bh_io(r1_bh, 0);
22071
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
22074
+ * We have to be a bit careful about the semaphore above, thats
22075
+ * why we start the requests separately. Since kmalloc() could
22076
+ * fail, sleep and make_request() can sleep too, this is the
22077
+ * safer solution. Imagine, end_request decreasing the semaphore
22078
+ * before we could have set it up ... We could play tricks with
22079
+ * the semaphore (presetting it and correcting at the end if
22080
+ * sum_bhs is not 'n' but we have to do end_request by hand if
22081
+ * all requests finish until we had a chance to set up the
22082
+ * semaphore correctly ... lots of races).
22084
+ bh = r1_bh->mirror_bh_list;
22086
+ evms_logical_node_t *node;
22088
+ struct buffer_head *bh2 = bh;
22091
+ node = (evms_logical_node_t *)bh2->b_private;
22092
+ bh2->b_private = r1_bh;
22093
+ this_eio = r1_bh->eio;
22094
+ this_eio.bh = bh2;
22095
+ add_node_mapping(r1_bh, node, bh2);
22096
+ W_IO(node, &this_eio);
22097
+ //generic_make_request(rw, bh2);
22103
+static int raid1_status (char *page, mddev_t *mddev)
22105
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22108
+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
22109
+ conf->working_disks);
22110
+ for (i = 0; i < conf->raid_disks; i++)
22111
+ sz += sprintf (page+sz, "%s",
22112
+ conf->mirrors[i].operational ? "U" : "_");
22113
+ sz += sprintf (page+sz, "]");
22117
+#define LAST_DISK KERN_ALERT \
22118
+"EVMS raid1: only one disk left and IO error.\n"
22120
+#define NO_SPARE_DISK KERN_ALERT \
22121
+"EVMS raid1: no spare disk left, degrading mirror level by one.\n"
22123
+#define DISK_FAILED KERN_ALERT \
22124
+"EVMS raid1: Disk failure on %s, disabling device. \n" \
22125
+" Operation continuing on %d devices\n"
22127
+#define START_SYNCING KERN_ALERT \
22128
+"EVMS raid1: start syncing spare disk.\n"
22130
+#define ALREADY_SYNCING KERN_INFO \
22131
+"EVMS raid1: syncing already in progress.\n"
22133
+static void mark_disk_bad (mddev_t *mddev, int failed)
22135
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22136
+ struct mirror_info *mirror = conf->mirrors+failed;
22137
+ mdp_super_t *sb = mddev->sb;
22139
+ mirror->operational = 0;
22140
+ mark_disk_faulty(sb->disks+mirror->number);
22141
+ mark_disk_nonsync(sb->disks+mirror->number);
22142
+ mark_disk_inactive(sb->disks+mirror->number);
22143
+ if (!mirror->write_only)
22144
+ sb->active_disks--;
22145
+ sb->working_disks--;
22146
+ sb->failed_disks++;
22147
+ mddev->sb_dirty = 1;
22148
+ evms_cs_wakeup_thread(conf->thread);
22149
+ if (!mirror->write_only)
22150
+ conf->working_disks--;
22151
+ LOG_SERIOUS(DISK_FAILED, evms_md_partition_name(mirror->node),conf->working_disks);
22154
+static int raid1_error (
22156
+ evms_logical_node_t *node)
22158
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22159
+ struct mirror_info * mirrors = conf->mirrors;
22160
+ int disks = MD_SB_DISKS;
22163
+ /* Find the drive.
22164
+ * If it is not operational, then we have already marked it as dead
22165
+ * else if it is the last working disks, ignore the error, let the
22166
+ * next level up know.
22167
+ * else mark the drive as failed
22170
+ for (i = 0; i < disks; i++)
22171
+ if (mirrors[i].node==node && mirrors[i].operational)
22176
+ if (i < conf->raid_disks && conf->working_disks == 1) {
22177
+ /* Don't fail the drive, act as though we were just a
22178
+ * normal single drive
22183
+ mark_disk_bad(mddev, i);
22188
+#undef NO_SPARE_DISK
22189
+#undef DISK_FAILED
22190
+#undef START_SYNCING
22193
+static void print_raid1_conf (raid1_conf_t *conf)
22196
+ struct mirror_info *tmp;
22198
+ LOG_DEFAULT("RAID1 conf printout:\n");
22200
+ LOG_DEFAULT("(conf==NULL)\n");
22203
+ LOG_DEFAULT(" --- wd:%d rd:%d nd:%d\n",
22204
+ conf->working_disks,conf->raid_disks, conf->nr_disks);
22206
+ for (i = 0; i < conf->nr_disks; i++) {
22207
+ tmp = conf->mirrors + i;
22208
+ LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
22209
+ i, tmp->spare,tmp->operational,
22210
+ tmp->number,tmp->raid_disk,tmp->used_slot,
22211
+ evms_md_partition_name(tmp->node));
22215
+static void close_sync(raid1_conf_t *conf)
22217
+ mddev_t *mddev = conf->mddev;
22218
+ /* If reconstruction was interrupted, we need to close the "active" and "pending"
22220
+ * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
22222
+ /* this is really needed when recovery stops too... */
22223
+ spin_lock_irq(&conf->segment_lock);
22224
+ conf->start_active = conf->start_pending;
22225
+ conf->start_ready = conf->start_pending;
22226
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
22227
+ conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
22228
+ conf->start_future = mddev->sb->size+1;
22229
+ conf->cnt_pending = conf->cnt_future;
22230
+ conf->cnt_future = 0;
22231
+ conf->phase = conf->phase ^1;
22232
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
22233
+ conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
22235
+ conf->cnt_future = conf->cnt_done;;
22236
+ conf->cnt_done = 0;
22237
+ spin_unlock_irq(&conf->segment_lock);
22238
+ wake_up(&conf->wait_done);
22241
+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
22244
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
22245
+ raid1_conf_t *conf = mddev->private;
22246
+ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk;
22247
+ mdp_super_t *sb = mddev->sb;
22248
+ mdp_disk_t *failed_desc, *spare_desc;
22249
+ mdk_rdev_t *spare_rdev, *failed_rdev;
22251
+ print_raid1_conf(conf);
22252
+ md_spin_lock_irq(&conf->device_lock);
22254
+ * find the disk ...
22258
+ case DISKOP_SPARE_ACTIVE:
22261
+ * Find the failed disk within the RAID1 configuration ...
22262
+ * (this can only be in the first conf->working_disks part)
22264
+ for (i = 0; i < conf->raid_disks; i++) {
22265
+ tmp = conf->mirrors + i;
22266
+ if ((!tmp->operational && !tmp->spare) ||
22267
+ !tmp->used_slot) {
22273
+ * When we activate a spare disk we _must_ have a disk in
22274
+ * the lower (active) part of the array to replace.
22276
+/* if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
22281
+ */ /* fall through */
22283
+ case DISKOP_HOT_SPARE_ACTIVE:
22284
+ case DISKOP_SPARE_WRITE:
22285
+ case DISKOP_SPARE_INACTIVE:
22288
+ * Find the spare disk ... (can only be in the 'high'
22289
+ * area of the array)
22290
+ ##### Actually it can be sooner now that we have improved MD #####
22291
+ This support required for expanding number of active mirrors.
22293
+ for (i = 0; i < MD_SB_DISKS; i++) {
22294
+ tmp = conf->mirrors + i;
22295
+ if (tmp->spare && tmp->number == (*d)->number) {
22300
+ if (spare_disk == -1) {
22307
+ case DISKOP_HOT_REMOVE_SPARE:
22309
+ for (i = 0; i < MD_SB_DISKS; i++) {
22310
+ tmp = conf->mirrors + i;
22311
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
22312
+ if (tmp->operational) {
22315
+ } else if (!tmp->spare){
22320
+ removed_disk = i;
22324
+ if (removed_disk == -1) {
22331
+ case DISKOP_HOT_REMOVE_DISK:
22332
+ if (conf->working_disks <= 1) {
22336
+ for (i = 0; i < MD_SB_DISKS; i++) {
22337
+ tmp = conf->mirrors + i;
22338
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
22339
+ removed_disk = i;
22343
+ if (removed_disk == -1) {
22350
+ case DISKOP_HOT_ADD_DISK:
22358
+ * Switch the spare disk to write-only mode:
22360
+ case DISKOP_SPARE_WRITE:
22361
+ sdisk = conf->mirrors + spare_disk;
22362
+ sdisk->operational = 1;
22363
+ sdisk->write_only = 1;
22366
+ * Deactivate a spare disk:
22368
+ case DISKOP_SPARE_INACTIVE:
22369
+ close_sync(conf);
22370
+ sdisk = conf->mirrors + spare_disk;
22371
+ sdisk->operational = 0;
22372
+ sdisk->write_only = 0;
22375
+ * Activate (mark read-write) the (now sync) spare disk,
22376
+ * which means we switch it's 'raid position' (->raid_disk)
22377
+ * with the failed disk. (only the first 'conf->nr_disks'
22378
+ * slots are used for 'real' disks and we must preserve this
22381
+ case DISKOP_SPARE_ACTIVE:
22382
+ close_sync(conf);
22383
+ sdisk = conf->mirrors + spare_disk;
22384
+ if (failed_disk < 0) {
22385
+ // preset failed disk to itself if no failed disk.
22386
+ failed_disk = spare_disk;
22387
+ // try to find spare earlier in array
22388
+ for (i = conf->raid_disks; i < spare_disk; i++) {
22389
+ tmp = conf->mirrors + i;
22390
+ if ((tmp->spare) || !tmp->used_slot) {
22396
+ fdisk = conf->mirrors + failed_disk;
22398
+ spare_desc = &sb->disks[sdisk->number];
22399
+ failed_desc = &sb->disks[fdisk->number];
22401
+ if (spare_desc != *d) {
22407
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
22413
+ if (sdisk->raid_disk != spare_disk) {
22419
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
22425
+ if (fdisk->raid_disk != failed_disk) {
22432
+ * do the switch finally
22434
+ spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
22435
+ failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
22437
+ /* There must be a spare_rdev, but there may not be a
22438
+ * failed_rdev. That slot might be empty...
22440
+ spare_rdev->desc_nr = failed_desc->number;
22442
+ failed_rdev->desc_nr = spare_desc->number;
22444
+ xchg_values(*spare_desc, *failed_desc);
22445
+ xchg_values(*fdisk, *sdisk);
22448
+ * (careful, 'failed' and 'spare' are switched from now on)
22450
+ * we want to preserve linear numbering and we want to
22451
+ * give the proper raid_disk number to the now activated
22452
+ * disk. (this means we switch back these values)
22455
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
22456
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
22457
+ xchg_values(spare_desc->number, failed_desc->number);
22458
+ xchg_values(sdisk->number, fdisk->number);
22460
+ *d = failed_desc;
22462
+ if (sdisk->dev == MKDEV(0,0))
22463
+ sdisk->used_slot = 0;
22465
+ * this really activates the spare.
22467
+ fdisk->spare = 0;
22468
+ fdisk->write_only = 0;
22471
+ * if we activate a spare, we definitely replace a
22472
+ * non-operational disk slot in the 'low' area of
22473
+ * the disk array.
22476
+ conf->working_disks++;
22480
+ /* Activate a spare disk without a failed disk */
22481
+ case DISKOP_HOT_SPARE_ACTIVE:
22482
+ sdisk = conf->mirrors + spare_disk;
22483
+ sdisk->spare = 0;
22484
+ sdisk->write_only = 0;
22485
+ conf->working_disks++;
22486
+ conf->raid_disks++;
22487
+ if (raid1_grow_bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS)
22488
+ LOG_WARNING("%s: Cannot grow BH pool\n", __FUNCTION__);
22491
+ case DISKOP_HOT_REMOVE_SPARE:
22492
+ rdisk = conf->mirrors + removed_disk;
22494
+ if (removed_disk < conf->raid_disks) {
22500
+ LOG_WARNING("%s: removing spare %s, [md%d] nr_disks=%d\n",
22501
+ __FUNCTION__, evms_md_partition_name(rdisk->node),
22502
+ conf->mddev->__minor, conf->nr_disks-1);
22504
+ rdisk->dev = MKDEV(0,0);
22505
+ rdisk->node = NULL;
22506
+ rdisk->used_slot = 0;
22507
+ conf->nr_disks--;
22510
+ case DISKOP_HOT_REMOVE_DISK:
22511
+ rdisk = conf->mirrors + removed_disk;
22513
+ LOG_WARNING("%s: removing active disk %s, [md%d] nr_disks=%d\n",
22514
+ __FUNCTION__, evms_md_partition_name(rdisk->node),
22515
+ conf->mddev->__minor, conf->nr_disks-1);
22517
+ rdisk->dev = MKDEV(0,0);
22518
+ rdisk->node = NULL;
22519
+ rdisk->used_slot = 0;
22520
+ rdisk->operational = 0;
22521
+ conf->working_disks--;
22522
+ conf->nr_disks--;
22523
+ sb->raid_disks--; //decrement raid disks. md_core now increments
22524
+ //when activating new spare, don't assume add spare here
22532
+ md_spin_unlock_irq(&conf->device_lock);
22533
+ if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
22534
+ /* should move to "END_REBUILD" when such exists */
22535
+ raid1_shrink_buffers(conf);
22537
+ print_raid1_conf(conf);
22542
+#define IO_ERROR KERN_ALERT \
22543
+"EVMS raid1: %s: unrecoverable I/O read error for block %lu\n"
22545
+#define REDIRECT_SECTOR KERN_ERR \
22546
+"EVMS raid1: %s: redirecting sector %lu to another mirror\n"
22549
+ * This is a kernel thread which:
22551
+ * 1. Retries failed read operations on working mirrors.
22552
+ * 2. Updates the raid superblock when problems encounter.
22553
+ * 3. Performs writes following reads for array syncronising.
22555
+static void end_sync_write(struct buffer_head *bh, int uptodate);
22556
+static void end_sync_read(struct buffer_head *bh, int uptodate);
22558
+static void raid1d (void *data)
22560
+ struct raid1_bh *r1_bh;
22561
+ struct buffer_head *bh;
22562
+ unsigned long flags;
22564
+#ifdef ORG_RAID1_CODE
22569
+ md_spin_lock_irqsave(&retry_list_lock, flags);
22570
+ r1_bh = evms_raid1_retry_list;
22573
+ evms_raid1_retry_list = r1_bh->next_r1;
22574
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
22576
+ mddev = r1_bh->mddev;
22577
+ if (mddev->sb_dirty) {
22578
+ LOG_DEFAULT("EVMS raid1: dirty sb detected, updating.\n");
22579
+ mddev->sb_dirty = 0;
22580
+ evms_md_update_sb(mddev);
22582
+ bh = &r1_bh->bh_req;
22583
+ switch(r1_bh->cmd) {
22585
+ /* have to allocate lots of bh structures and
22586
+ * schedule writes
22588
+ if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
22589
+ int i, sum_bhs = 0;
22590
+ int disks = MD_SB_DISKS;
22591
+ struct buffer_head *bhl, *mbh;
22592
+ raid1_conf_t *conf;
22594
+ conf = mddev_to_conf(mddev);
22595
+ bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
22596
+ for (i = 0; i < disks ; i++) {
22597
+ if (!conf->mirrors[i].operational)
22599
+ if (i==conf->last_used)
22600
+ /* we read from here, no need to write */
22602
+ if (i < conf->raid_disks
22603
+ && !conf->resync_mirrors
22604
+ && !conf->mirrors[i].write_only)
22605
+ /* don't need to write this,
22606
+ * we are just rebuilding */
22613
+ bhl = mbh->b_next;
22614
+ mbh->b_this_page = (struct buffer_head *)1;
22618
+ * prepare mirrored bh (fields ordered for max mem throughput):
22620
+ mbh->b_blocknr = bh->b_blocknr;
22621
+ mbh->b_dev = conf->mirrors[i].dev;
22622
+ mbh->b_rdev = conf->mirrors[i].dev;
22623
+ mbh->b_rsector = bh->b_blocknr;
22624
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
22625
+ (1<<BH_Mapped) | (1<<BH_Lock);
22626
+ atomic_set(&mbh->b_count, 1);
22627
+ mbh->b_size = bh->b_size;
22628
+ mbh->b_page = bh->b_page;
22629
+ mbh->b_data = bh->b_data;
22630
+ mbh->b_list = BUF_LOCKED;
22631
+ mbh->b_end_io = end_sync_write;
22632
+ //mbh->b_private = r1_bh;
22633
+ mbh->b_private = conf->mirrors[i].node;
22635
+ mbh->b_next = r1_bh->mirror_bh_list;
22636
+ r1_bh->mirror_bh_list = mbh;
22640
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
22641
+ if (bhl) raid1_free_bh(conf, bhl);
22642
+ mbh = r1_bh->mirror_bh_list;
22645
+ /* nowhere to write this too... I guess we
22648
+ sync_request_done(bh->b_blocknr, conf);
22649
+ evms_md_done_sync(mddev, bh->b_size>>9, 0);
22650
+ raid1_free_buf(r1_bh);
22653
+ evms_logical_node_t *node;
22655
+ struct buffer_head *bh1 = mbh;
22657
+ mbh = mbh->b_next;
22658
+ node = (evms_logical_node_t *)bh1->b_private;
22659
+ bh1->b_private = r1_bh;
22660
+ eio = r1_bh->eio;
22662
+ add_node_mapping(r1_bh, node, bh1);
22663
+ W_IO(node, &eio);
22664
+ evms_md_sync_acct(bh1->b_dev, bh1->b_size/512);
22667
+ /* There is no point trying a read-for-reconstruct
22668
+ * as reconstruct is about to be aborted
22671
+ LOG_ERROR(IO_ERROR, evms_md_partition_name(r1_bh->node), bh->b_blocknr);
22672
+ evms_md_done_sync(mddev, bh->b_size>>9, 0);
22679
+ evms_logical_node_t *node, *new_node;
22681
+ node = r1_bh->node;
22682
+ evms_raid1_map(mddev,&new_node);
22683
+ if (new_node == node) {
22684
+ LOG_ERROR(" unrecoverable read error on %s at LBA(%Lu)\n",
22685
+ node->name, r1_bh->eio.rsector);
22686
+ raid1_end_bh_io(r1_bh, 0);
22688
+ /* retry I/O on new device */
22690
+ eio = r1_bh->eio;
22691
+ R_IO(new_node, &eio);
22697
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
22700
+#undef REDIRECT_SECTOR
22703
+ * Private kernel thread to reconstruct mirrors after an unclean
22706
+static void raid1syncd (void *data)
22708
+ raid1_conf_t *conf = data;
22709
+ mddev_t *mddev = conf->mddev;
22711
+ if (!conf->resync_mirrors)
22713
+ if (conf->resync_mirrors == 2)
22715
+ down(&mddev->recovery_sem);
22716
+ if (!evms_md_do_sync(mddev, NULL)) {
22718
+ * Only if everything went Ok.
22720
+ conf->resync_mirrors = 0;
22723
+ close_sync(conf);
22725
+ up(&mddev->recovery_sem);
22726
+ raid1_shrink_buffers(conf);
22730
+ * perform a "sync" on one "block"
22732
+ * We need to make sure that no normal I/O request - particularly write
22733
+ * requests - conflict with active sync requests.
22734
+ * This is achieved by conceptually dividing the device space into a
22735
+ * number of sections:
22736
+ * DONE: 0 .. a-1 These blocks are in-sync
22737
+ * ACTIVE: a.. b-1 These blocks may have active sync requests, but
22738
+ * no normal IO requests
22739
+ * READY: b .. c-1 These blocks have no normal IO requests - sync
22740
+ * request may be happening
22741
+ * PENDING: c .. d-1 These blocks may have IO requests, but no new
22742
+ * ones will be added
22743
+ * FUTURE: d .. end These blocks are not to be considered yet. IO may
22744
+ * be happening, but not sync
22747
+ * phase which flips (0 or 1) each time d moves and
22749
+ * z = active io requests in FUTURE since d moved - marked with
22751
+ * y = active io requests in FUTURE before d moved, or PENDING -
22752
+ * marked with previous phase
22753
+ * x = active sync requests in READY
22754
+ * w = active sync requests in ACTIVE
22755
+ * v = active io requests in DONE
22757
+ * Normally, a=b=c=d=0 and z= active io requests
22758
+ * or a=b=c=d=END and v= active io requests
22759
+ * Allowed changes to a,b,c,d:
22760
+ * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
22762
+ * C: b=c, w+=x, x=0
22764
+ * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
22766
+ * At start of sync we apply A.
22767
+ * When y reaches 0, we apply B then A then being sync requests
22768
+ * When sync point reaches c-1, we wait for y==0, and W==0, and
22769
+ * then apply apply B then A then D then C.
22770
+ * Finally, we apply E
22772
+ * The sync request simply issues a "read" against a working drive
22773
+ * This is marked so that on completion the raid1d thread is woken to
22774
+ * issue suitable write requests
22777
+static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
22779
+ raid1_conf_t *conf = mddev_to_conf(mddev);
22780
+ struct mirror_info *mirror;
22781
+ struct raid1_bh *r1_bh;
22782
+ struct buffer_head *bh;
22788
+ spin_lock_irq(&conf->segment_lock);
22789
+ if (!sector_nr) {
22790
+ /* initialize ...*/
22792
+ conf->start_active = 0;
22793
+ conf->start_ready = 0;
22794
+ conf->start_pending = 0;
22795
+ conf->start_future = 0;
22797
+ /* we want enough buffers to hold twice the window of 128*/
22798
+ buffs = 128 *2 / (PAGE_SIZE>>9);
22799
+ buffs = raid1_grow_buffers(conf, buffs);
22803
+ conf->window = buffs*(PAGE_SIZE>>9)/2;
22804
+ conf->cnt_future += conf->cnt_done+conf->cnt_pending;
22805
+ conf->cnt_done = conf->cnt_pending = 0;
22806
+ if (conf->cnt_ready || conf->cnt_active)
22809
+ while (sector_nr >= conf->start_pending) {
22810
+ PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
22811
+ sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
22812
+ conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
22813
+ wait_event_lock_irq(conf->wait_done,
22814
+ !conf->cnt_active,
22815
+ conf->segment_lock);
22816
+ wait_event_lock_irq(conf->wait_ready,
22817
+ !conf->cnt_pending,
22818
+ conf->segment_lock);
22819
+ conf->start_active = conf->start_ready;
22820
+ conf->start_ready = conf->start_pending;
22821
+ conf->start_pending = conf->start_future;
22822
+ conf->start_future = conf->start_future+conf->window;
22823
+ // Note: falling off the end is not a problem
22824
+ conf->phase = conf->phase ^1;
22825
+ conf->cnt_active = conf->cnt_ready;
22826
+ conf->cnt_ready = 0;
22827
+ conf->cnt_pending = conf->cnt_future;
22828
+ conf->cnt_future = 0;
22829
+ wake_up(&conf->wait_done);
22831
+ conf->cnt_ready++;
22832
+ spin_unlock_irq(&conf->segment_lock);
22835
+ /* If reconstructing, and >1 working disc,
22836
+ * could dedicate one to rebuild and others to
22837
+ * service read requests ..
22839
+ disk = conf->last_used;
22840
+ /* make sure disk is operational */
22841
+ while (!conf->mirrors[disk].operational) {
22842
+ if (disk <= 0) disk = conf->raid_disks;
22844
+ if (disk == conf->last_used)
22847
+ conf->last_used = disk;
22849
+ mirror = conf->mirrors+conf->last_used;
22851
+ r1_bh = raid1_alloc_buf (conf);
22852
+ r1_bh->master_bh = NULL;
22853
+ r1_bh->mddev = mddev;
22854
+ r1_bh->cmd = SPECIAL;
22855
+ bh = &r1_bh->bh_req;
22857
+ block_nr = sector_nr;
22859
+ while (!(block_nr & 1) && bsize < PAGE_SIZE
22860
+ && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
22864
+ bh->b_size = bsize;
22865
+ bh->b_list = BUF_LOCKED;
22866
+ bh->b_dev = mirror->dev;
22867
+ bh->b_rdev = mirror->dev;
22868
+ bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
22873
+ if (bh->b_data != page_address(bh->b_page))
22875
+ bh->b_end_io = end_sync_read;
22876
+ bh->b_private = r1_bh;
22877
+ bh->b_blocknr = sector_nr;
22878
+ bh->b_rsector = sector_nr;
22879
+ r1_bh->node = mirror->node;
22880
+ r1_bh->eio.bh = bh;
22881
+ r1_bh->eio.rsector = bh->b_rsector;
22882
+ r1_bh->eio.rsize = bh->b_size/512;
22883
+ eio = r1_bh->eio;
22884
+ init_waitqueue_head(&bh->b_wait);
22886
+ R_IO(mirror->node,&eio);
22887
+ evms_md_sync_acct(bh->b_dev, bh->b_size/512);
22889
+ return (bsize >> 9);
22892
+ raid1_shrink_buffers(conf);
22893
+ spin_unlock_irq(&conf->segment_lock);
22897
+static void end_sync_read(struct buffer_head *bh, int uptodate)
22899
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
22901
+ /* we have read a block, now it needs to be re-written,
22902
+ * or re-read if the read failed.
22903
+ * We don't do much here, just schedule handling by raid1d
22907
+ evms_md_error (r1_bh->mddev, r1_bh->node);
22910
+ set_bit(R1BH_Uptodate, &r1_bh->state);
22911
+ raid1_reschedule_retry(r1_bh);
22914
+static void end_sync_write(struct buffer_head *bh, int uptodate)
22916
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
22919
+ evms_logical_node_t *node;
22920
+ node = bh_to_node(r1_bh,bh);
22922
+ evms_md_error (r1_bh->mddev, node);
22924
+ if (atomic_dec_and_test(&r1_bh->remaining)) {
22925
+ mddev_t *mddev = r1_bh->mddev;
22926
+ unsigned long sect = bh->b_blocknr;
22927
+ int size = bh->b_size;
22928
+ raid1_free_buf(r1_bh);
22929
+ sync_request_done(sect, mddev_to_conf(mddev));
22930
+ evms_md_done_sync(mddev,size>>9, uptodate);
22934
+#define INVALID_LEVEL KERN_WARNING \
22935
+"EVMS raid1: md%d: raid level not set to mirroring (%d)\n"
22937
+#define NO_SB KERN_ERR \
22938
+"EVMS raid1: disabled mirror %s (couldn't access raid superblock)\n"
22940
+#define ERRORS KERN_ERR \
22941
+"EVMS raid1: disabled mirror %s (errors detected)\n"
22943
+#define NOT_IN_SYNC KERN_ERR \
22944
+"EVMS raid1: disabled mirror %s (not in sync)\n"
22946
+#define INCONSISTENT KERN_ERR \
22947
+"EVMS raid1: disabled mirror %s (inconsistent descriptor)\n"
22949
+#define ALREADY_RUNNING KERN_ERR \
22950
+"EVMS raid1: disabled mirror %s (mirror %d already operational)\n"
22952
+#define OPERATIONAL KERN_INFO \
22953
+"EVMS raid1: device %s operational as mirror %d\n"
22955
+#define MEM_ERROR KERN_ERR \
22956
+"EVMS raid1: couldn't allocate memory for md%d\n"
22958
+#define SPARE KERN_INFO \
22959
+"EVMS raid1: spare disk %s\n"
22961
+#define NONE_OPERATIONAL KERN_ERR \
22962
+"EVMS raid1: no operational mirrors for md%d\n"
22964
+#define ARRAY_IS_ACTIVE KERN_INFO \
22965
+"EVMS raid1: raid set md%d active with %d out of %d mirrors\n"
22967
+#define THREAD_ERROR KERN_ERR \
22968
+"EVMS raid1: couldn't allocate thread for md%d\n"
22970
+#define START_RESYNC KERN_WARNING \
22971
+"EVMS raid1: raid set md%d not clean; reconstructing mirrors\n"
22973
+static int raid1_run (mddev_t *mddev)
22975
+ raid1_conf_t *conf;
22976
+ int i, j, disk_idx;
22977
+ struct mirror_info *disk;
22978
+ mdp_super_t *sb = mddev->sb;
22979
+ mdp_disk_t *descriptor;
22980
+ mdk_rdev_t *rdev;
22981
+ struct md_list_head *tmp;
22982
+ int start_recovery = 0;
22984
+ MOD_INC_USE_COUNT;
22986
+ LOG_EXTRA(__FUNCTION__" ENTRY\n");
22987
+ if (sb->level != 1) {
22988
+ LOG_ERROR(INVALID_LEVEL, mdidx(mddev), sb->level);
22992
+ * copy the already verified devices into our private RAID1
22993
+ * bookkeeping area. [whatever we allocate in raid1_run(),
22994
+ * should be freed in raid1_stop()]
22997
+ conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
22998
+ mddev->private = conf;
23000
+ LOG_ERROR(MEM_ERROR, mdidx(mddev));
23003
+ memset(conf, 0, sizeof(*conf));
23005
+ ITERATE_RDEV(mddev,rdev,tmp) {
23006
+ if (rdev->faulty) {
23007
+ LOG_ERROR(ERRORS, evms_md_partition_name(rdev->node));
23014
+ if (rdev->desc_nr == -1) {
23018
+ descriptor = &sb->disks[rdev->desc_nr];
23019
+ disk_idx = descriptor->raid_disk;
23020
+ disk = conf->mirrors + disk_idx;
23022
+ if (disk_faulty(descriptor)) {
23023
+ disk->number = descriptor->number;
23024
+ disk->raid_disk = disk_idx;
23025
+ disk->node = rdev->node;
23026
+ disk->dev = rdev->dev;
23027
+ disk->sect_limit = MAX_WORK_PER_DISK;
23028
+ disk->operational = 0;
23029
+ disk->write_only = 0;
23031
+ disk->used_slot = 1;
23032
+ disk->head_position = 0;
23035
+ if (disk_active(descriptor)) {
23036
+ if (!disk_sync(descriptor)) {
23037
+ LOG_ERROR(NOT_IN_SYNC, evms_md_partition_name(rdev->node));
23040
+ if ((descriptor->number > MD_SB_DISKS) ||
23041
+ (disk_idx > sb->raid_disks)) {
23043
+ LOG_ERROR(INCONSISTENT,evms_md_partition_name(rdev->node));
23046
+ if (disk->operational) {
23047
+ LOG_ERROR(ALREADY_RUNNING, evms_md_partition_name(rdev->node), disk_idx);
23050
+ LOG_DEFAULT(OPERATIONAL, evms_md_partition_name(rdev->node), disk_idx);
23051
+ disk->number = descriptor->number;
23052
+ disk->raid_disk = disk_idx;
23053
+ disk->node = rdev->node;
23054
+ disk->dev = rdev->dev;
23055
+ disk->sect_limit = MAX_WORK_PER_DISK;
23056
+ disk->operational = 1;
23057
+ disk->write_only = 0;
23059
+ disk->used_slot = 1;
23060
+ disk->head_position = 0;
23061
+ conf->working_disks++;
23064
+ * Must be a spare disk ..
23066
+ LOG_DEFAULT(SPARE, evms_md_partition_name(rdev->node));
23067
+ disk->number = descriptor->number;
23068
+ disk->raid_disk = disk_idx;
23069
+ disk->node = rdev->node;
23070
+ disk->dev = rdev->dev;
23071
+ disk->sect_limit = MAX_WORK_PER_DISK;
23072
+ disk->operational = 0;
23073
+ disk->write_only = 0;
23075
+ disk->used_slot = 1;
23076
+ disk->head_position = 0;
23079
+ conf->raid_disks = sb->raid_disks;
23080
+ conf->nr_disks = sb->nr_disks;
23081
+ conf->mddev = mddev;
23082
+ conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
23084
+ conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
23085
+ init_waitqueue_head(&conf->wait_buffer);
23086
+ init_waitqueue_head(&conf->wait_done);
23087
+ init_waitqueue_head(&conf->wait_ready);
23089
+ if (!conf->working_disks) {
23090
+ LOG_ERROR(NONE_OPERATIONAL, mdidx(mddev));
23091
+ goto out_free_conf;
23095
+ /* pre-allocate some buffer_head structures.
23096
+ * As a minimum, 1 r1bh and raid_disks buffer_heads
23097
+ * would probably get us by in tight memory situations,
23098
+ * but a few more is probably a good idea.
23099
+ * For now, try NR_RESERVED_BUFS r1bh and
23100
+ * NR_RESERVED_BUFS*raid_disks bufferheads
23101
+ * This will allow at least NR_RESERVED_BUFS concurrent
23102
+ * reads or writes even if kmalloc starts failing
23104
+ if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
23105
+ raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
23106
+ < NR_RESERVED_BUFS*conf->raid_disks) {
23107
+ LOG_ERROR(MEM_ERROR, mdidx(mddev));
23108
+ goto out_free_conf;
23111
+ for (i = 0; i < MD_SB_DISKS; i++) {
23113
+ descriptor = sb->disks+i;
23114
+ disk_idx = descriptor->raid_disk;
23115
+ disk = conf->mirrors + disk_idx;
23117
+ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
23118
+ !disk->used_slot) {
23120
+ disk->number = descriptor->number;
23121
+ disk->raid_disk = disk_idx;
23122
+ disk->dev = MKDEV(0,0);
23124
+ disk->operational = 0;
23125
+ disk->write_only = 0;
23127
+ disk->used_slot = 1;
23128
+ disk->head_position = 0;
23133
+ * find the first working one and use it as a starting point
23134
+ * to read balancing.
23136
+ for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
23138
+ conf->last_used = j;
23141
+ if (conf->working_disks != sb->raid_disks) {
23142
+ LOG_SERIOUS(" md%d, not all disks are operational -- trying to recover array\n",
23144
+ start_recovery = 1;
23148
+ const char * name = "evms_raid1d";
23150
+ conf->thread = evms_cs_register_thread(raid1d, conf, name);
23151
+ if (!conf->thread) {
23152
+ LOG_ERROR(THREAD_ERROR, mdidx(mddev));
23153
+ goto out_free_conf;
23157
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
23158
+ (conf->working_disks > 1)) {
23159
+ const char * name = "evms_raid1syncd";
23161
+ conf->resync_thread = evms_cs_register_thread(raid1syncd, conf,name);
23162
+ if (!conf->resync_thread) {
23163
+ LOG_ERROR(THREAD_ERROR, mdidx(mddev));
23164
+ goto out_free_conf;
23167
+ LOG_WARNING(START_RESYNC, mdidx(mddev));
23168
+ conf->resync_mirrors = 1;
23169
+ evms_cs_wakeup_thread(conf->resync_thread);
23173
+ * Regenerate the "device is in sync with the raid set" bit for
23176
+ for (i = 0; i < MD_SB_DISKS; i++) {
23177
+ mark_disk_nonsync(sb->disks+i);
23178
+ for (j = 0; j < sb->raid_disks; j++) {
23179
+ if (!conf->mirrors[j].operational)
23181
+ if (sb->disks[i].number == conf->mirrors[j].number)
23182
+ mark_disk_sync(sb->disks+i);
23185
+ sb->active_disks = conf->working_disks;
23187
+ if (start_recovery)
23188
+ evms_md_recover_arrays();
23191
+ LOG_DEFAULT(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
23193
+ * Ok, everything is just fine now
23198
+ raid1_shrink_r1bh(conf);
23199
+ raid1_shrink_bh(conf);
23200
+ raid1_shrink_buffers(conf);
23202
+ mddev->private = NULL;
23204
+ MOD_DEC_USE_COUNT;
23208
+#undef INVALID_LEVEL
23211
+#undef NOT_IN_SYNC
23212
+#undef INCONSISTENT
23213
+#undef ALREADY_RUNNING
23214
+#undef OPERATIONAL
23216
+#undef NONE_OPERATIONAL
23217
+#undef ARRAY_IS_ACTIVE
23219
+static int raid1_stop_resync (mddev_t *mddev)
23221
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23223
+ LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
23224
+ if (conf->resync_thread) {
23225
+ if (conf->resync_mirrors) {
23226
+ conf->resync_mirrors = 2;
23227
+ evms_cs_interrupt_thread(conf->resync_thread);
23228
+ LOG_WARNING(" mirror resync was not fully finished, restarting next time.\n");
23236
+static int raid1_restart_resync (mddev_t *mddev)
23238
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23240
+ LOG_DEFAULT(__FUNCTION__" ENTRY\n");
23241
+ if (conf->resync_mirrors) {
23242
+ if (!conf->resync_thread) {
23246
+ conf->resync_mirrors = 1;
23247
+ evms_cs_wakeup_thread(conf->resync_thread);
23253
+static int raid1_stop (mddev_t *mddev)
23255
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23257
+ LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
23258
+ evms_cs_unregister_thread(conf->thread);
23259
+ if (conf->resync_thread)
23260
+ evms_cs_unregister_thread(conf->resync_thread);
23261
+ raid1_shrink_r1bh(conf);
23262
+ raid1_shrink_bh(conf);
23263
+ raid1_shrink_buffers(conf);
23265
+ mddev->private = NULL;
23266
+ MOD_DEC_USE_COUNT;
23270
+static int raid1_evms_ioctl (
23272
+ struct inode * inode,
23273
+ struct file * file,
23274
+ unsigned int cmd,
23275
+ unsigned long arg)
23278
+ evms_logical_node_t *node = NULL;
23279
+ raid1_conf_t *conf = mddev_to_conf(mddev);
23282
+ case EVMS_GET_BMAP:
23284
+ for (i = 0; i < MD_SB_DISKS; i++) {
23285
+ if (conf->mirrors[i].operational) {
23286
+ node = conf->mirrors[i].node;
23292
+ rc = IOCTL(node, inode, file, cmd, arg);
23305
+static mdk_personality_t raid1_personality=
23307
+ name: "evms_raid1",
23308
+ init_io: raid1_init_io,
23309
+ make_request: raid1_make_request,
23311
+ stop: raid1_stop,
23312
+ status: raid1_status,
23313
+ error_handler: raid1_error,
23314
+ diskop: raid1_diskop,
23315
+ stop_resync: raid1_stop_resync,
23316
+ restart_resync: raid1_restart_resync,
23317
+ sync_request: raid1_sync_request,
23318
+ evms_ioctl: raid1_evms_ioctl
23321
+static int md__init raid1_init (void)
23323
+ return evms_register_md_personality (RAID1, &raid1_personality);
23326
+static void raid1_exit (void)
23328
+ evms_unregister_md_personality (RAID1);
23331
+module_init(raid1_init);
23332
+module_exit(raid1_exit);
23333
+#ifdef MODULE_LICENSE
23334
+MODULE_LICENSE("GPL");
23336
diff -Naur linux-2002-03-28/drivers/evms/md_raid5.c evms-2002-03-28/drivers/evms/md_raid5.c
23337
--- linux-2002-03-28/drivers/evms/md_raid5.c Wed Dec 31 18:00:00 1969
23338
+++ evms-2002-03-28/drivers/evms/md_raid5.c Thu Mar 28 16:28:37 2002
23341
+ * md_raid5.c : Multiple Devices driver for Linux
23342
+ * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
23343
+ * Copyright (C) 1999, 2000 Ingo Molnar
23345
+ * RAID-5 management functions.
23347
+ * 'md_raid5.c' is an EVMS version of linux/drivers/md/raid5.c modified
23348
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
23350
+ * This program is free software; you can redistribute it and/or modify
23351
+ * it under the terms of the GNU General Public License as published by
23352
+ * the Free Software Foundation; either version 2, or (at your option)
23353
+ * any later version.
23355
+ * You should have received a copy of the GNU General Public License
23356
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
23357
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23361
+#include <linux/config.h>
23362
+#include <linux/module.h>
23363
+#include <linux/locks.h>
23364
+#include <linux/slab.h>
23365
+#include <linux/evms/evms_raid5.h>
23366
+#include <asm/bitops.h>
23367
+#include <asm/atomic.h>
23369
+#define LOG_PREFIX "md raid5: "
23371
+static mdk_personality_t raid5_personality;
23377
+#define NR_STRIPES 256
23378
+#define IO_THRESHOLD 1
23379
+#define HASH_PAGES 1
23380
+#define HASH_PAGES_ORDER 0
23381
+#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
23382
+#define HASH_MASK (NR_HASH - 1)
23383
+#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
23386
+ * The following can be used to debug the driver
23388
+#define RAID5_DEBUG 0
23389
+#define RAID5_PARANOIA 1
23390
+#if RAID5_PARANOIA && CONFIG_SMP
23391
+# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
23393
+# define CHECK_DEVLOCK()
23397
+static void print_raid5_conf (raid5_conf_t *conf);
23399
+static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
23401
+ if (atomic_dec_and_test(&sh->count)) {
23402
+ if (!list_empty(&sh->lru))
23404
+ if (atomic_read(&conf->active_stripes)==0)
23406
+ if (test_bit(STRIPE_HANDLE, &sh->state)) {
23407
+ if (test_bit(STRIPE_DELAYED, &sh->state))
23408
+ list_add_tail(&sh->lru, &conf->delayed_list);
23410
+ list_add_tail(&sh->lru, &conf->handle_list);
23411
+ evms_cs_wakeup_thread(conf->thread);
23413
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
23414
+ atomic_dec(&conf->preread_active_stripes);
23415
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
23416
+ evms_cs_wakeup_thread(conf->thread);
23418
+ list_add_tail(&sh->lru, &conf->inactive_list);
23419
+ atomic_dec(&conf->active_stripes);
23420
+ if (!conf->inactive_blocked ||
23421
+ atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
23422
+ wake_up(&conf->wait_for_stripe);
23426
+static void release_stripe(struct stripe_head *sh)
23428
+ raid5_conf_t *conf = sh->raid_conf;
23429
+ unsigned long flags;
23431
+ spin_lock_irqsave(&conf->device_lock, flags);
23432
+ __release_stripe(conf, sh);
23433
+ spin_unlock_irqrestore(&conf->device_lock, flags);
23436
+static void remove_hash(struct stripe_head *sh)
23438
+ LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
23440
+ if (sh->hash_pprev) {
23441
+ if (sh->hash_next)
23442
+ sh->hash_next->hash_pprev = sh->hash_pprev;
23443
+ *sh->hash_pprev = sh->hash_next;
23444
+ sh->hash_pprev = NULL;
23448
+static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
23450
+ struct stripe_head **shp = &stripe_hash(conf, sh->sector);
23452
+ LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
23455
+ if ((sh->hash_next = *shp) != NULL)
23456
+ (*shp)->hash_pprev = &sh->hash_next;
23458
+ sh->hash_pprev = shp;
23462
+/* find an idle stripe, make sure it is unhashed, and return it. */
23463
+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
23465
+ struct stripe_head *sh = NULL;
23466
+ struct list_head *first;
23469
+ if (list_empty(&conf->inactive_list))
23471
+ first = conf->inactive_list.next;
23472
+ sh = list_entry(first, struct stripe_head, lru);
23473
+ list_del_init(first);
23475
+ atomic_inc(&conf->active_stripes);
23480
+static void shrink_buffers(struct stripe_head *sh, int num)
23482
+ struct buffer_head *bh;
23485
+ for (i=0; i<num ; i++) {
23486
+ bh = sh->bh_cache[i];
23489
+ sh->bh_cache[i] = NULL;
23490
+ free_page((unsigned long) bh->b_data);
23495
+static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
23497
+ struct buffer_head *bh;
23500
+ for (i=0; i<num; i++) {
23501
+ struct page *page;
23502
+ bh = kmalloc(sizeof(struct buffer_head), priority);
23505
+ memset(bh, 0, sizeof (struct buffer_head));
23506
+ init_waitqueue_head(&bh->b_wait);
23507
+ if ((page = alloc_page(priority)))
23508
+ bh->b_data = page_address(page);
23513
+ atomic_set(&bh->b_count, 0);
23514
+ bh->b_page = page;
23515
+ sh->bh_cache[i] = bh;
23521
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
23523
+static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
23525
+ raid5_conf_t *conf = sh->raid_conf;
23526
+ int disks = conf->raid_disks, i;
23528
+ if (atomic_read(&sh->count) != 0)
23530
+ if (test_bit(STRIPE_HANDLE, &sh->state))
23534
+ LOG_EXTRA("init_stripe called, stripe %lu\n", sh->sector);
23538
+ sh->sector = sector;
23539
+ sh->size = conf->buffer_size;
23542
+ for (i=disks; i--; ) {
23543
+ if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
23544
+ buffer_locked(sh->bh_cache[i])) {
23545
+ LOG_ERROR("sector=%lx i=%d %p %p %p %d\n",
23546
+ sh->sector, i, sh->bh_read[i],
23547
+ sh->bh_write[i], sh->bh_written[i],
23548
+ buffer_locked(sh->bh_cache[i]));
23551
+ clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
23552
+ raid5_build_block(sh, i);
23554
+ insert_hash(conf, sh);
23557
+/* the buffer size has changed, so unhash all stripes
23558
+ * as active stripes complete, they will go onto inactive list
23560
+static void shrink_stripe_cache(raid5_conf_t *conf)
23564
+ if (atomic_read(&conf->active_stripes))
23566
+ for (i=0; i < NR_HASH; i++) {
23567
+ struct stripe_head *sh;
23568
+ while ((sh = conf->stripe_hashtbl[i]))
23573
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
23575
+ struct stripe_head *sh;
23578
+ LOG_DEBUG("%s: sector %lu\n", __FUNCTION__, sector);
23579
+ for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
23580
+ if (sh->sector == sector)
23582
+ LOG_DEBUG("%s: %lu not in cache\n", __FUNCTION__, sector);
23586
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
23588
+ struct stripe_head *sh;
23591
+ md_spin_lock_irq(&conf->device_lock);
23594
+ if (conf->buffer_size == 0 ||
23595
+ (size && size != conf->buffer_size)) {
23596
+ /* either the size is being changed (buffer_size==0) or
23597
+ * we need to change it.
23598
+ * If size==0, we can proceed as soon as buffer_size gets set.
23599
+ * If size>0, we can proceed when active_stripes reaches 0, or
23600
+ * when someone else sets the buffer_size to size.
23601
+ * If someone sets the buffer size to something else, we will need to
23602
+ * assert that we want to change it again
23605
+ wait_event_lock_irq(conf->wait_for_stripe,
23606
+ conf->buffer_size,
23607
+ conf->device_lock);
23609
+ while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
23610
+ conf->buffer_size = 0;
23611
+ wait_event_lock_irq(conf->wait_for_stripe,
23612
+ atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
23613
+ conf->device_lock);
23616
+ if (conf->buffer_size != size) {
23617
+ shrink_stripe_cache(conf);
23618
+ if (size==0) BUG();
23619
+ conf->buffer_size = size;
23624
+ sector -= sector & ((conf->buffer_size>>9)-1);
23626
+ sh = __find_stripe(conf, sector);
23628
+ if (!conf->inactive_blocked)
23629
+ sh = get_free_stripe(conf);
23630
+ if (noblock && sh == NULL)
23633
+ conf->inactive_blocked = 1;
23634
+ wait_event_lock_irq(conf->wait_for_stripe,
23635
+ !list_empty(&conf->inactive_list) &&
23636
+ (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
23637
+ || !conf->inactive_blocked),
23638
+ conf->device_lock);
23639
+ conf->inactive_blocked = 0;
23641
+ init_stripe(sh, sector);
23643
+ if (atomic_read(&sh->count)) {
23644
+ if (!list_empty(&sh->lru))
23647
+ if (!test_bit(STRIPE_HANDLE, &sh->state))
23648
+ atomic_inc(&conf->active_stripes);
23649
+ if (list_empty(&sh->lru))
23651
+ list_del_init(&sh->lru);
23654
+ } while (sh == NULL);
23657
+ atomic_inc(&sh->count);
23659
+ md_spin_unlock_irq(&conf->device_lock);
23663
+static int grow_stripes(raid5_conf_t *conf, int num, int priority)
23665
+ struct stripe_head *sh;
23668
+ sh = kmalloc(sizeof(struct stripe_head), priority);
23671
+ memset(sh, 0, sizeof(*sh));
23672
+ sh->raid_conf = conf;
23673
+ sh->lock = SPIN_LOCK_UNLOCKED;
23675
+ if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
23676
+ shrink_buffers(sh, conf->raid_disks);
23680
+ /* we just created an active stripe so... */
23681
+ atomic_set(&sh->count, 1);
23682
+ atomic_inc(&conf->active_stripes);
23683
+ INIT_LIST_HEAD(&sh->lru);
23684
+ release_stripe(sh);
23689
+static void shrink_stripes(raid5_conf_t *conf, int num)
23691
+ struct stripe_head *sh;
23694
+ spin_lock_irq(&conf->device_lock);
23695
+ sh = get_free_stripe(conf);
23696
+ spin_unlock_irq(&conf->device_lock);
23699
+ if (atomic_read(&sh->count))
23701
+ shrink_buffers(sh, conf->raid_disks);
23703
+ atomic_dec(&conf->active_stripes);
23708
+static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
23710
+ struct stripe_head *sh = bh->b_private;
23711
+ raid5_conf_t *conf = sh->raid_conf;
23712
+ int disks = conf->raid_disks, i;
23713
+ unsigned long flags;
23715
+ for (i=0 ; i<disks; i++)
23716
+ if (bh == sh->bh_cache[i])
23719
+ if (i == disks) {
23725
+ struct buffer_head *buffer;
23726
+ spin_lock_irqsave(&conf->device_lock, flags);
23727
+ /* we can return a buffer if we bypassed the cache or
23728
+ * if the top buffer is not in highmem. If there are
23729
+ * multiple buffers, leave the extra work to
23732
+ buffer = sh->bh_read[i];
23734
+ (!PageHighMem(buffer->b_page)
23735
+ || buffer->b_page == bh->b_page )
23737
+ sh->bh_read[i] = buffer->b_reqnext;
23738
+ buffer->b_reqnext = NULL;
23741
+ spin_unlock_irqrestore(&conf->device_lock, flags);
23742
+ if (sh->bh_page[i]==NULL)
23743
+ set_bit(BH_Uptodate, &bh->b_state);
23745
+ if (buffer->b_page != bh->b_page)
23746
+ memcpy(buffer->b_data, bh->b_data, bh->b_size);
23747
+ buffer->b_end_io(buffer, 1);
23752
+ evms_md_error(conf->mddev, sh->node[i]);
23754
+ LOG_WARNING("NODE was not set, skipping evms_md_error()\n");
23755
+ clear_bit(BH_Uptodate, &bh->b_state);
23757
+ /* must restore b_page before unlocking buffer... */
23758
+ if (sh->bh_page[i]) {
23759
+ bh->b_page = sh->bh_page[i];
23760
+ bh->b_data = page_address(bh->b_page);
23761
+ sh->bh_page[i] = NULL;
23762
+ clear_bit(BH_Uptodate, &bh->b_state);
23764
+ clear_bit(BH_Lock, &bh->b_state);
23765
+ set_bit(STRIPE_HANDLE, &sh->state);
23766
+ release_stripe(sh);
23767
+ if (sh->node[i]) {
23768
+ sh->node[i] = NULL;
23770
+ LOG_WARNING(" evms node was not set.\n");
23775
+static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
23777
+ struct stripe_head *sh = bh->b_private;
23778
+ raid5_conf_t *conf = sh->raid_conf;
23779
+ int disks = conf->raid_disks, i;
23780
+ unsigned long flags;
23782
+ for (i=0 ; i<disks; i++)
23783
+ if (bh == sh->bh_cache[i])
23786
+ if (i == disks) {
23791
+ md_spin_lock_irqsave(&conf->device_lock, flags);
23795
+ evms_md_error(conf->mddev, sh->node[i]);
23797
+ LOG_WARNING(" NODE was not set, skipping evms_md_error()\n");
23799
+ clear_bit(BH_Lock, &bh->b_state);
23800
+ set_bit(STRIPE_HANDLE, &sh->state);
23801
+ __release_stripe(conf, sh);
23802
+ md_spin_unlock_irqrestore(&conf->device_lock, flags);
23803
+ if (sh->node[i]) {
23804
+ sh->node[i] = NULL;
23806
+ LOG_WARNING(" evms node was not set.\n");
23812
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
23814
+ raid5_conf_t *conf = sh->raid_conf;
23815
+ struct buffer_head *bh = sh->bh_cache[i];
23816
+ unsigned long block = sh->sector / (sh->size >> 9);
23818
+ init_buffer(bh, raid5_end_read_request, sh);
23819
+ bh->b_dev = conf->disks[i].dev;
23820
+ bh->b_blocknr = block;
23822
+ bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
23823
+ bh->b_size = sh->size;
23824
+ bh->b_list = BUF_LOCKED;
23828
+static int raid5_error (
23830
+ evms_logical_node_t *node)
23832
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
23833
+ mdp_super_t *sb = mddev->sb;
23834
+ struct disk_info *disk;
23837
+ LOG_WARNING("%s: called\n", __FUNCTION__);
23839
+ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
23840
+ if (disk->node == node) {
23841
+ if (disk->operational) {
23842
+ disk->operational = 0;
23843
+ mark_disk_faulty(sb->disks+disk->number);
23844
+ mark_disk_nonsync(sb->disks+disk->number);
23845
+ mark_disk_inactive(sb->disks+disk->number);
23846
+ sb->active_disks--;
23847
+ sb->working_disks--;
23848
+ sb->failed_disks++;
23849
+ mddev->sb_dirty = 1;
23850
+ conf->working_disks--;
23851
+ conf->failed_disks++;
23852
+ evms_cs_wakeup_thread(conf->thread);
23853
+ LOG_WARNING("Disk failure on %s, disabling device."
23854
+ " Operation continuing on %d devices\n",
23855
+ evms_md_partition_name (disk->node), conf->working_disks);
23861
+ * handle errors in spares (during reconstruction)
23863
+ if (conf->spare) {
23864
+ disk = conf->spare;
23865
+ if (disk->node == node) {
23866
+ LOG_WARNING("EVMS RAID5: Disk failure on spare %s\n",
23867
+ evms_md_partition_name (disk->node));
23868
+ if (!conf->spare->operational) {
23869
+ /* probably a SET_DISK_FAULTY ioctl */
23872
+ disk->operational = 0;
23873
+ disk->write_only = 0;
23874
+ conf->spare = NULL;
23875
+ mark_disk_faulty(sb->disks+disk->number);
23876
+ mark_disk_nonsync(sb->disks+disk->number);
23877
+ mark_disk_inactive(sb->disks+disk->number);
23878
+ sb->spare_disks--;
23879
+ sb->working_disks--;
23880
+ sb->failed_disks++;
23882
+ mddev->sb_dirty = 1;
23883
+ evms_cs_wakeup_thread(conf->thread);
23893
+ * Input: a 'big' sector number,
23894
+ * Output: index of the data and parity disk, and the sector # in them.
23896
+static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
23897
+ unsigned int data_disks, unsigned int * dd_idx,
23898
+ unsigned int * pd_idx, raid5_conf_t *conf)
23900
+ unsigned long stripe;
23901
+ unsigned long chunk_number;
23902
+ unsigned int chunk_offset;
23903
+ unsigned long new_sector;
23904
+ int sectors_per_chunk = conf->chunk_size >> 9;
23906
+ /* First compute the information on this sector */
23909
+ * Compute the chunk number and the sector offset inside the chunk
23911
+ chunk_number = r_sector / sectors_per_chunk;
23912
+ chunk_offset = r_sector % sectors_per_chunk;
23915
+ * Compute the stripe number
23917
+ stripe = chunk_number / data_disks;
23920
+ * Compute the data disk and parity disk indexes inside the stripe
23922
+ *dd_idx = chunk_number % data_disks;
23925
+ * Select the parity disk based on the user selected algorithm.
23927
+ if (conf->level == 4)
23928
+ *pd_idx = data_disks;
23929
+ else switch (conf->algorithm) {
23930
+ case ALGORITHM_LEFT_ASYMMETRIC:
23931
+ *pd_idx = data_disks - stripe % raid_disks;
23932
+ if (*dd_idx >= *pd_idx)
23935
+ case ALGORITHM_RIGHT_ASYMMETRIC:
23936
+ *pd_idx = stripe % raid_disks;
23937
+ if (*dd_idx >= *pd_idx)
23940
+ case ALGORITHM_LEFT_SYMMETRIC:
23941
+ *pd_idx = data_disks - stripe % raid_disks;
23942
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
23944
+ case ALGORITHM_RIGHT_SYMMETRIC:
23945
+ *pd_idx = stripe % raid_disks;
23946
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
23949
+ LOG_ERROR(" unsupported algorithm %d\n", conf->algorithm);
23953
+ * Finally, compute the new sector number
23955
+ new_sector = stripe * sectors_per_chunk + chunk_offset;
23956
+ return new_sector;
23959
+#define check_xor() do { \
23960
+ if (count == MAX_XOR_BLOCKS) { \
23961
+ evms_md_xor_block(count, bh_ptr); \
23967
+static void compute_block(struct stripe_head *sh, int dd_idx)
23969
+ raid5_conf_t *conf = sh->raid_conf;
23970
+ int i, count, disks = conf->raid_disks;
23971
+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
23973
+ memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
23974
+ bh_ptr[0] = sh->bh_cache[dd_idx];
23976
+ for (i = disks ; i--; ) {
23979
+ bh = sh->bh_cache[i];
23980
+ if (buffer_uptodate(bh))
23981
+ bh_ptr[count++] = bh;
23983
+ LOG_ERROR("%s: %d, stripe %lu, %d not present\n",
23984
+ __FUNCTION__, dd_idx, sh->sector, i);
23989
+ evms_md_xor_block(count, bh_ptr);
23990
+ set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
23993
+static void compute_parity(struct stripe_head *sh, int method)
23995
+ raid5_conf_t *conf = sh->raid_conf;
23996
+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
23997
+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
23998
+ struct buffer_head *chosen[MD_SB_DISKS];
24000
+ memset(chosen, 0, sizeof(chosen));
24003
+ bh_ptr[0] = sh->bh_cache[pd_idx];
24005
+ case READ_MODIFY_WRITE:
24006
+ if (!buffer_uptodate(sh->bh_cache[pd_idx]))
24008
+ for (i=disks ; i-- ;) {
24011
+ if (sh->bh_write[i] &&
24012
+ buffer_uptodate(sh->bh_cache[i])) {
24013
+ bh_ptr[count++] = sh->bh_cache[i];
24014
+ chosen[i] = sh->bh_write[i];
24015
+ sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
24016
+ chosen[i]->b_reqnext = sh->bh_written[i];
24017
+ sh->bh_written[i] = chosen[i];
24022
+ case RECONSTRUCT_WRITE:
24023
+ memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
24024
+ for (i= disks; i-- ;)
24025
+ if (i!=pd_idx && sh->bh_write[i]) {
24026
+ chosen[i] = sh->bh_write[i];
24027
+ sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
24028
+ chosen[i]->b_reqnext = sh->bh_written[i];
24029
+ sh->bh_written[i] = chosen[i];
24032
+ case CHECK_PARITY:
24036
+ evms_md_xor_block(count, bh_ptr);
24040
+ for (i = disks; i--;)
24042
+ struct buffer_head *bh = sh->bh_cache[i];
24044
+ bdata = bh_kmap(chosen[i]);
24045
+ memcpy(bh->b_data,
24047
+ bh_kunmap(chosen[i]);
24048
+ set_bit(BH_Lock, &bh->b_state);
24049
+ mark_buffer_uptodate(bh, 1);
24053
+ case RECONSTRUCT_WRITE:
24054
+ case CHECK_PARITY:
24055
+ for (i=disks; i--;)
24056
+ if (i != pd_idx) {
24057
+ bh_ptr[count++] = sh->bh_cache[i];
24061
+ case READ_MODIFY_WRITE:
24062
+ for (i = disks; i--;)
24064
+ bh_ptr[count++] = sh->bh_cache[i];
24069
+ evms_md_xor_block(count, bh_ptr);
24071
+ if (method != CHECK_PARITY) {
24072
+ mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
24073
+ set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
24075
+ mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
24078
+static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
24080
+ struct buffer_head **bhp;
24081
+ raid5_conf_t *conf = sh->raid_conf;
24083
+ spin_lock(&sh->lock);
24084
+ spin_lock_irq(&conf->device_lock);
24085
+ bh->b_reqnext = NULL;
24087
+ bhp = &sh->bh_read[dd_idx];
24089
+ bhp = &sh->bh_write[dd_idx];
24091
+ LOG_DEFAULT("EVMS RAID5: multiple %d requests for sector %ld\n", rw, sh->sector);
24092
+ bhp = & (*bhp)->b_reqnext;
24095
+ spin_unlock_irq(&conf->device_lock);
24096
+ spin_unlock(&sh->lock);
24105
+ * handle_stripe - do things to a stripe.
24107
+ * We lock the stripe and then examine the state of various bits
24108
+ * to see what needs to be done.
24109
+ * Possible results:
24110
+ * return some read request which now have data
24111
+ * return some write requests which are safely on disc
24112
+ * schedule a read on some buffers
24113
+ * schedule a write of some buffers
24114
+ * return confirmation of parity correctness
24116
+ * Parity calculations are done inside the stripe lock
24117
+ * buffers are taken off read_list or write_list, and bh_cache buffers
24118
+ * get BH_Lock set before the stripe lock is released.
24122
+static void handle_stripe(struct stripe_head *sh)
24124
+ raid5_conf_t *conf = sh->raid_conf;
24125
+ int disks = conf->raid_disks;
24126
+ struct buffer_head *return_ok= NULL, *return_fail = NULL;
24127
+ int action[MD_SB_DISKS];
24130
+ int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
24131
+ int failed_num=0;
24132
+ struct buffer_head *bh;
24134
+ memset(action, 0, sizeof(action));
24136
+ spin_lock(&sh->lock);
24137
+ clear_bit(STRIPE_HANDLE, &sh->state);
24138
+ clear_bit(STRIPE_DELAYED, &sh->state);
24140
+ syncing = test_bit(STRIPE_SYNCING, &sh->state);
24141
+ /* Now to look around and see what can be done */
24143
+ for (i=disks; i--; ) {
24144
+ bh = sh->bh_cache[i];
24145
+ /* maybe we can reply to a read */
24146
+ if (buffer_uptodate(bh) && sh->bh_read[i]) {
24147
+ struct buffer_head *rbh, *rbh2;
24148
+ spin_lock_irq(&conf->device_lock);
24149
+ rbh = sh->bh_read[i];
24150
+ sh->bh_read[i] = NULL;
24151
+ spin_unlock_irq(&conf->device_lock);
24154
+ bdata = bh_kmap(rbh);
24155
+ memcpy(bdata, bh->b_data, bh->b_size);
24157
+ rbh2 = rbh->b_reqnext;
24158
+ rbh->b_reqnext = return_ok;
24164
+ /* now count some things */
24165
+ if (buffer_locked(bh)) locked++;
24166
+ if (buffer_uptodate(bh)) uptodate++;
24169
+ if (sh->bh_read[i]) to_read++;
24170
+ if (sh->bh_write[i]) to_write++;
24171
+ if (sh->bh_written[i]) written++;
24172
+ if (!conf->disks[i].operational) {
24177
+ /* check if the array has lost two devices and, if so, some requests might
24178
+ * need to be failed
24180
+ if (failed > 1 && to_read+to_write) {
24181
+ for (i=disks; i--; ) {
24182
+ /* fail all writes first */
24183
+ if (sh->bh_write[i]) to_write--;
24184
+ while ((bh = sh->bh_write[i])) {
24185
+ sh->bh_write[i] = bh->b_reqnext;
24186
+ bh->b_reqnext = return_fail;
24187
+ return_fail = bh;
24189
+ /* fail any reads if this device is non-operational */
24190
+ if (!conf->disks[i].operational) {
24191
+ spin_lock_irq(&conf->device_lock);
24192
+ if (sh->bh_read[i]) to_read--;
24193
+ while ((bh = sh->bh_read[i])) {
24194
+ sh->bh_read[i] = bh->b_reqnext;
24195
+ bh->b_reqnext = return_fail;
24196
+ return_fail = bh;
24198
+ spin_unlock_irq(&conf->device_lock);
24202
+ if (failed > 1 && syncing) {
24203
+ evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
24204
+ clear_bit(STRIPE_SYNCING, &sh->state);
24208
+ /* might be able to return some write requests if the parity block
24209
+ * is safe, or on a failed drive
24211
+ bh = sh->bh_cache[sh->pd_idx];
24213
+ ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
24214
+ || (failed == 1 && failed_num == sh->pd_idx))
24216
+ /* any written block on a uptodate or failed drive can be returned */
24217
+ for (i=disks; i--; )
24218
+ if (sh->bh_written[i]) {
24219
+ bh = sh->bh_cache[i];
24220
+ if (!conf->disks[sh->pd_idx].operational ||
24221
+ (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
24222
+ /* maybe we can return some write requests */
24223
+ struct buffer_head *wbh, *wbh2;
24224
+ wbh = sh->bh_written[i];
24225
+ sh->bh_written[i] = NULL;
24227
+ wbh2 = wbh->b_reqnext;
24228
+ wbh->b_reqnext = return_ok;
24236
+ /* Now we might consider reading some blocks, either to check/generate
24237
+ * parity, or to satisfy requests
24239
+ if (to_read || (syncing && (uptodate+failed < disks))) {
24240
+ for (i=disks; i--;) {
24241
+ bh = sh->bh_cache[i];
24242
+ if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
24243
+ (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
24244
+ /* we would like to get this block, possibly
24245
+ * by computing it, but we might not be able to
24247
+ if (uptodate == disks-1) {
24248
+ compute_block(sh, i);
24250
+ } else if (conf->disks[i].operational) {
24251
+ set_bit(BH_Lock, &bh->b_state);
24252
+ action[i] = READ+1;
24253
+ /* if I am just reading this block and we don't have
24254
+ a failed drive, or any pending writes then sidestep the cache */
24255
+ if (sh->bh_page[i]) BUG();
24256
+ if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
24257
+ ! syncing && !failed && !to_write) {
24258
+ sh->bh_page[i] = sh->bh_cache[i]->b_page;
24259
+ sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
24260
+ sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
24264
+ evms_md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
24268
+ set_bit(STRIPE_HANDLE, &sh->state);
24271
+ /* now to consider writing and what else, if anything should be read */
24273
+ int rmw=0, rcw=0;
24274
+ for (i=disks ; i--;) {
24275
+ /* would I have to read this buffer for read_modify_write */
24276
+ bh = sh->bh_cache[i];
24277
+ if ((sh->bh_write[i] || i == sh->pd_idx) &&
24278
+ (!buffer_locked(bh) || sh->bh_page[i]) &&
24279
+ !buffer_uptodate(bh)) {
24280
+ if (conf->disks[i].operational
24281
+/* && !(conf->resync_parity && i == sh->pd_idx) */
24284
+ else rmw += 2*disks; /* cannot read it */
24286
+ /* Would I have to read this buffer for reconstruct_write */
24287
+ if (!sh->bh_write[i] && i != sh->pd_idx &&
24288
+ (!buffer_locked(bh) || sh->bh_page[i]) &&
24289
+ !buffer_uptodate(bh)) {
24290
+ if (conf->disks[i].operational) rcw++;
24291
+ else rcw += 2*disks;
24294
+ set_bit(STRIPE_HANDLE, &sh->state);
24295
+ if (rmw < rcw && rmw > 0)
24296
+ /* prefer read-modify-write, but need to get some data */
24297
+ for (i=disks; i--;) {
24298
+ bh = sh->bh_cache[i];
24299
+ if ((sh->bh_write[i] || i == sh->pd_idx) &&
24300
+ !buffer_locked(bh) && !buffer_uptodate(bh) &&
24301
+ conf->disks[i].operational) {
24302
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24304
+ set_bit(BH_Lock, &bh->b_state);
24305
+ action[i] = READ+1;
24308
+ set_bit(STRIPE_DELAYED, &sh->state);
24309
+ set_bit(STRIPE_HANDLE, &sh->state);
24313
+ if (rcw <= rmw && rcw > 0)
24314
+ /* want reconstruct write, but need to get some data */
24315
+ for (i=disks; i--;) {
24316
+ bh = sh->bh_cache[i];
24317
+ if (!sh->bh_write[i] && i != sh->pd_idx &&
24318
+ !buffer_locked(bh) && !buffer_uptodate(bh) &&
24319
+ conf->disks[i].operational) {
24320
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24322
+ set_bit(BH_Lock, &bh->b_state);
24323
+ action[i] = READ+1;
24326
+ set_bit(STRIPE_DELAYED, &sh->state);
24327
+ set_bit(STRIPE_HANDLE, &sh->state);
24331
+ /* now if nothing is locked, and if we have enough data, we can start a write request */
24332
+ if (locked == 0 && (rcw == 0 ||rmw == 0)) {
24333
+ compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
24334
+ /* now every locked buffer is ready to be written */
24335
+ for (i=disks; i--;)
24336
+ if (buffer_locked(sh->bh_cache[i])) {
24338
+ action[i] = WRITE+1;
24339
+ if (!conf->disks[i].operational
24340
+ || (i==sh->pd_idx && failed == 0))
24341
+ set_bit(STRIPE_INSYNC, &sh->state);
24343
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
24344
+ atomic_dec(&conf->preread_active_stripes);
24345
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
24346
+ evms_cs_wakeup_thread(conf->thread);
24351
+ /* maybe we need to check and possibly fix the parity for this stripe
24352
+ * Any reads will already have been scheduled, so we just see if enough data
24355
+ if (syncing && locked == 0 &&
24356
+ !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
24357
+ set_bit(STRIPE_HANDLE, &sh->state);
24358
+ if (failed == 0) {
24359
+ if (uptodate != disks)
24361
+ compute_parity(sh, CHECK_PARITY);
24363
+ bh = sh->bh_cache[sh->pd_idx];
24364
+ if ((*(u32*)bh->b_data) == 0 &&
24365
+ !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
24366
+ /* parity is correct (on disc, not in buffer any more) */
24367
+ set_bit(STRIPE_INSYNC, &sh->state);
24370
+ if (!test_bit(STRIPE_INSYNC, &sh->state)) {
24371
+ struct disk_info *spare;
24373
+ failed_num = sh->pd_idx;
24374
+ /* should be able to compute the missing block and write it to spare */
24375
+ if (!buffer_uptodate(sh->bh_cache[failed_num])) {
24376
+ if (uptodate+1 != disks)
24378
+ compute_block(sh, failed_num);
24381
+ if (uptodate != disks)
24383
+ bh = sh->bh_cache[failed_num];
24384
+ set_bit(BH_Lock, &bh->b_state);
24385
+ action[failed_num] = WRITE+1;
24387
+ set_bit(STRIPE_INSYNC, &sh->state);
24388
+ if (conf->disks[failed_num].operational)
24389
+ evms_md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
24390
+ else if ((spare=conf->spare))
24391
+ evms_md_sync_acct(spare->dev, bh->b_size>>9);
24395
+ if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
24396
+ evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
24397
+ clear_bit(STRIPE_SYNCING, &sh->state);
24401
+ spin_unlock(&sh->lock);
24403
+ while ((bh=return_ok)) {
24404
+ return_ok = bh->b_reqnext;
24405
+ bh->b_reqnext = NULL;
24406
+ bh->b_end_io(bh, 1);
24408
+ while ((bh=return_fail)) {
24409
+ return_fail = bh->b_reqnext;
24410
+ bh->b_reqnext = NULL;
24411
+ bh->b_end_io(bh, 0);
24413
+ for (i=disks; i-- ;)
24415
+ struct buffer_head *bh = sh->bh_cache[i];
24416
+ struct disk_info *spare = conf->spare;
24417
+ evms_logical_node_t *node = NULL;
24420
+ if (action[i] == READ+1)
24421
+ bh->b_end_io = raid5_end_read_request;
24423
+ bh->b_end_io = raid5_end_write_request;
24424
+ if (conf->disks[i].operational) {
24425
+ bh->b_dev = conf->disks[i].dev;
24426
+ node = conf->disks[i].node;
24427
+ } else if (spare && action[i] == WRITE+1) {
24428
+ bh->b_dev = spare->dev;
24429
+ node = spare->node;
24432
+ atomic_inc(&sh->count);
24433
+ bh->b_rdev = bh->b_dev;
24434
+ bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
24436
+ eio.rsector = bh->b_rsector;
24437
+ eio.rsize = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24438
+ sh->node[i] = node;
24439
+ if (action[i] == READ+1)
24440
+ R_IO(node, &eio);
24442
+ W_IO(node, &eio);
24444
+ clear_bit(BH_Lock, &bh->b_state);
24445
+ set_bit(STRIPE_HANDLE, &sh->state);
24450
+static inline void raid5_activate_delayed(raid5_conf_t *conf)
24452
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
24453
+ while (!list_empty(&conf->delayed_list)) {
24454
+ struct list_head *l = conf->delayed_list.next;
24455
+ struct stripe_head *sh;
24456
+ sh = list_entry(l, struct stripe_head, lru);
24457
+ list_del_init(l);
24458
+ clear_bit(STRIPE_DELAYED, &sh->state);
24459
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24460
+ atomic_inc(&conf->preread_active_stripes);
24461
+ list_add_tail(&sh->lru, &conf->handle_list);
24465
+static void raid5_unplug_device(void *data)
24467
+ raid5_conf_t *conf = (raid5_conf_t *)data;
24468
+ unsigned long flags;
24470
+ spin_lock_irqsave(&conf->device_lock, flags);
24472
+ raid5_activate_delayed(conf);
24474
+ conf->plugged = 0;
24475
+ evms_cs_wakeup_thread(conf->thread);
24477
+ spin_unlock_irqrestore(&conf->device_lock, flags);
24480
+static inline void raid5_plug_device(raid5_conf_t *conf)
24482
+ spin_lock_irq(&conf->device_lock);
24483
+ if (list_empty(&conf->delayed_list))
24484
+ if (!conf->plugged) {
24485
+ conf->plugged = 1;
24486
+ queue_task(&conf->plug_tq, &tq_disk);
24488
+ spin_unlock_irq(&conf->device_lock);
24492
+static int raid5_make_request (mddev_t *mddev,
24496
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24497
+ const unsigned int raid_disks = conf->raid_disks;
24498
+ const unsigned int data_disks = raid_disks - 1;
24499
+ unsigned int dd_idx, pd_idx;
24500
+ unsigned long new_sector;
24501
+ int read_ahead = 0;
24502
+ struct buffer_head *bh = eio->bh;
24504
+ struct stripe_head *sh;
24506
+ /* Note: Need to add 64-bit support in the future */
24507
+ bh->b_size = (unsigned short)eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
24508
+ bh->b_rsector = (unsigned long)eio->rsector;
24509
+ if (rw == READA) {
24514
+ new_sector = raid5_compute_sector(bh->b_rsector,
24515
+ raid_disks, data_disks, &dd_idx, &pd_idx, conf);
24517
+ sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
24519
+ sh->pd_idx = pd_idx;
24521
+ add_stripe_bh(sh, bh, dd_idx, rw);
24523
+ raid5_plug_device(conf);
24524
+ handle_stripe(sh);
24525
+ release_stripe(sh);
24527
+ bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
24532
+ * function: allocate_bh
24534
+ * This function obtains a buffer head from the private
24535
+ * buffer head pool (pre-allocated at EVMS initial
24536
+ * discovery time).
24538
+ * NOTE: All access to the buffer head pool are protected
24539
+ * by a private spinlock.
24542
+static inline struct buffer_head *
24545
+ struct buffer_head *bh =
24546
+ evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
24548
+ init_waitqueue_head(&bh->b_wait);
24554
+ * function: deallocate_bh
24556
+ * This function returns a buffer head to the private
24557
+ * buffer head pool (pre-allocated at EVMS initial
24558
+ * discovery time).
24560
+ * NOTE: All access to the buffer head pool are protected
24561
+ * by a private spinlock.
24564
+static inline void
24565
+deallocate_bh(struct buffer_head *bh)
24567
+ evms_cs_deallocate_to_pool(evms_bh_pool, bh);
24570
+/* this is the buffer head control block structure definition */
24571
+typedef struct bh_cb_s {
24573
+ atomic_t blks_allocated;
24574
+ wait_queue_head_t cb_wait;
24578
+ * function: __wait_on_bh_cb
24580
+ * This is a worker function to wait_on_bh_cb.
24581
+ * This function waits for a set of private buffer heads
24582
+ * associated to the specified buffer head control block
24583
+ * to return from I/O completion. On completion of the
24584
+ * last buffer head, the calling function is awakened
24585
+ * and continues running.
24587
+ * This is the worker function to the function wait_on_bh_cb.
24591
+__wait_on_bh_cb(bh_cb_t *bh_cb)
24593
+ struct task_struct *tsk = current;
24594
+ DECLARE_WAITQUEUE(wait, tsk);
24596
+ add_wait_queue(&bh_cb->cb_wait, &wait);
24598
+ run_task_queue(&tq_disk);
24599
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
24600
+ if (!atomic_read(&bh_cb->blks_allocated))
24603
+ } while (atomic_read(&bh_cb->blks_allocated));
24604
+ tsk->state = TASK_RUNNING;
24605
+ remove_wait_queue(&bh_cb->cb_wait, &wait);
24609
+ * function: wait_on_bh_cb
24611
+ * This function waits for a set of private buffer heads
24612
+ * associated to the specified buffer head control block
24613
+ * to return from I/O completion. On completion of the
24614
+ * last buffer head, the calling function is awakened
24615
+ * and continues running.
24619
+wait_on_bh_cb(bh_cb_t *bh_cb)
24621
+ if (atomic_read(&bh_cb->blks_allocated))
24622
+ __wait_on_bh_cb(bh_cb);
24624
+ /* if we ended up with no buffer heads on
24625
+ * this pass, lets wait a until a few buffer
24626
+ * heads have been freed and try again. This
24627
+ * should provide a reasonable delay.
24633
+ * function: end_bh_cb_io
24635
+ * This is the I/O completion function that is called for
24636
+ * each private buffer head obtained from the buffer head
24637
+ * pool. Control is return thru this routine so we can track
24638
+ * all outstanding requests to know when to awaken the caller,
24639
+ * and to regain control after all I/Os have been performed.
24643
+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
24645
+ bh_cb_t *bh_cb = (bh_cb_t *)bh->b_private;
24647
+ /* record that errors occurred */
24649
+ bh_cb->rc = -EIO;
24651
+ mark_buffer_uptodate(bh, uptodate);
24652
+ unlock_buffer(bh);
24654
+ deallocate_bh(bh);
24655
+ atomic_dec(&bh_cb->blks_allocated);
24656
+ if (!atomic_read(&bh_cb->blks_allocated))
24657
+ if (waitqueue_active(&bh_cb->cb_wait))
24658
+ wake_up(&bh_cb->cb_wait);
24662
+ * function: md_raid5_internal_partial_sector_io
24664
+ * This function is a support function for md_raid5_internal_io,
24665
+ * which handles the cases of performing I/O to only a part
24666
+ * of sector. This function is not designed to be called
24667
+ * directly, other than by md_raid5_internal_io.
24671
+md_raid5_internal_partial_sector_io(
24675
+ u_int64_t next_offset,
24676
+ u_int64_t sector_offset,
24677
+ u_int64_t io_size,
24679
+ unsigned char **sector_buf )
24682
+ struct buffer_head *bh;
24684
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24686
+ if (*sector_buf == NULL)
24687
+ /* allocate buffer for incoming sector */
24688
+ rc = evms_cs_allocate_memory((void **)sector_buf,
24689
+ conf->buffer_size);
24691
+ /* allocate a buffer head from the pool */
24692
+ while((bh = allocate_bh()) == NULL)
24693
+ /* yielding the cpu is playing it
24694
+ * safe. it might be wiser to just
24695
+ * spin. requires more thought.
24699
+ /* set up the buffer head for this sector */
24700
+ bh->b_end_io = end_bh_cb_io_sync;
24701
+ bh->b_size = conf->buffer_size;
24703
+ bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
24704
+ bh->b_data = *sector_buf;
24705
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24707
+ set_bit(BH_Dirty, &bh->b_state);
24708
+ set_bit(BH_Lock, &bh->b_state);
24709
+ set_bit(BH_Req, &bh->b_state);
24710
+ set_bit(BH_Mapped, &bh->b_state);
24711
+ bh->b_private = (void *)bh_cb;
24712
+ atomic_inc(&bh_cb->blks_allocated);
24714
+ /* drive the buffer head down */
24715
+ /* to the device */
24717
+ eio.rsector = bh->b_rsector;
24718
+ eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24719
+ raid5_make_request(mddev, READ, &eio);
24721
+ /* wait for all bh's I/O's to end */
24722
+ wait_on_bh_cb(bh_cb);
24724
+ /* copy data to/from user */
24725
+ if (io_flag != WRITE)
24728
+ *sector_buf + sector_offset,
24732
+ memcpy(*sector_buf + sector_offset,
24736
+ /* allocate a buffer head from the pool */
24737
+ while((bh = allocate_bh()) == NULL)
24738
+ /* yielding the cpu is playing it
24739
+ * safe. it might be wiser to just
24740
+ * spin. requires more thought.
24744
+ /* set up the buffer head for this sector */
24745
+ bh->b_end_io = end_bh_cb_io_sync;
24746
+ bh->b_size = conf->buffer_size;
24748
+ bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
24749
+ bh->b_data = *sector_buf;
24750
+ bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24752
+ set_bit(BH_Dirty, &bh->b_state);
24753
+ set_bit(BH_Lock, &bh->b_state);
24754
+ set_bit(BH_Req, &bh->b_state);
24755
+ set_bit(BH_Mapped, &bh->b_state);
24756
+ bh->b_private = (void *)bh_cb;
24757
+ atomic_inc(&bh_cb->blks_allocated);
24759
+ /* drive the buffer head down */
24760
+ /* to the device */
24762
+ eio.rsector = bh->b_rsector;
24763
+ eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24764
+ raid5_make_request(mddev, WRITE, &eio);
24766
+ /* wait for all bh's I/O's to end */
24767
+ wait_on_bh_cb(bh_cb);
24774
+ * function: md_raid5_internal_io
24776
+ * This function provides support for synchronous I/O
24777
+ * operations to the underlying devices. These I/O
24778
+ * operations are NOT buffered in any way including the
24779
+ * operating system's buffer cache.
24781
+ * This function can work with any hardsector size that
24782
+ * is a power of 2.
24784
+ * node : logical node of the target logical disk
24785
+ * io_flag : 0 = read, 1 = write, 2 = read-a-head
24786
+ * starting_offset: the 0-based (disk relative) byte offset
24787
+ * num_bytes : the total number of bytes in this I/O
24788
+ * bufptr : address of the memory to read/write the data
24792
+md_raid5_internal_io(
24795
+ u_int64_t starting_offset,
24796
+ u_int64_t num_bytes,
24800
+ u_int64_t next_offset, remaining_bytes;
24801
+ char *cur_bufptr;
24803
+ unsigned char *sector_buf = NULL;
24804
+ evms_logical_node_t *node = mddev->node;
24805
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24807
+ LOG_EVERYTHING("%s: node(%s), ioflag(%u), start_offset(%Lu), num_bytes(%Lu), bufptr(0x%p)\n",
24808
+ __FUNCTION__, node->name, io_flag, starting_offset, num_bytes, bufptr);
24810
+ /* check for 0 length request */
24811
+ if ( num_bytes == 0 ) {
24812
+ LOG_ERROR("%s: error requesting 0 bytes.\n", __FUNCTION__);
24815
+ /* check for out of bound request */
24817
+ u64 node_total_bytes =
24818
+ node->total_vsectors <<
24819
+ EVMS_VSECTOR_SIZE_SHIFT;
24820
+ if ( (starting_offset + num_bytes) > node_total_bytes) {
24821
+ LOG_ERROR("%s: attempted %s beyond boundary(%Lu bytes), requesting offset(%Lu), length(%Lu).\n",
24822
+ __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
24823
+ node_total_bytes, starting_offset, num_bytes);
24827
+ /* check for invalid io_flag value */
24829
+ switch( io_flag ) {
24830
+ case READ: /* read... */
24831
+ case WRITE: /* write... */
24832
+ case READA: /* reada... */
24839
+ /* initialize the buffer head control block */
24840
+ memset(&bh_cb, 0, sizeof(bh_cb_t));
24841
+ init_waitqueue_head(&bh_cb.cb_wait);
24843
+ /* only update the local copy of variables */
24844
+ cur_bufptr = bufptr;
24845
+ next_offset = starting_offset;
24846
+ remaining_bytes = num_bytes;
24848
+ /* continue if no errors found */
24850
+ u_int64_t sector_offset;
24852
+ /* check for a mid-sector starting offset
24854
+ * if found, perform I/O on part of that
24857
+ sector_offset = next_offset & (conf->buffer_size - 1);
24858
+ if (sector_offset) {
24859
+ u_int64_t io_size;
24861
+ /* determine bytes in IO to this sector */
24862
+ io_size = conf->buffer_size - sector_offset;
24863
+ if (io_size > remaining_bytes)
24864
+ io_size = remaining_bytes;
24866
+ /* perform the partial sector io */
24867
+ rc = md_raid5_internal_partial_sector_io(
24868
+ mddev,io_flag,&bh_cb,
24870
+ sector_offset, io_size,
24871
+ cur_bufptr, §or_buf);
24874
+ /* update progress in local variables */
24875
+ cur_bufptr += io_size;
24876
+ next_offset += io_size;
24877
+ remaining_bytes -= io_size;
24882
+ /* continue if no errors found */
24884
+ /* perform I/O on all the complete sectors
24885
+ * in this request.
24887
+ * loop until there are no more complete sectors
24890
+ while(remaining_bytes >= conf->buffer_size) {
24891
+ /* this inner loop attempts to drive as many
24892
+ * bytes (in sector size multiples) down to
24893
+ * the device as possible using the available
24894
+ * buffer heads in the pool.
24896
+ while(remaining_bytes >= conf->buffer_size) {
24897
+ struct buffer_head *bh;
24900
+ /* allocate a buffer head from the pool */
24901
+ bh = allocate_bh();
24902
+ if (bh == NULL) break;
24904
+ /* set up the buffer head for this I/O */
24905
+ bh->b_end_io = end_bh_cb_io_sync;
24906
+ bh->b_size = conf->buffer_size;
24907
+ bh->b_data = cur_bufptr;
24909
+ bh->b_rsector = next_offset >> EVMS_VSECTOR_SIZE_SHIFT;
24910
+ bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24912
+ set_bit(BH_Dirty, &bh->b_state);
24913
+ set_bit(BH_Lock, &bh->b_state);
24914
+ set_bit(BH_Req, &bh->b_state);
24915
+ set_bit(BH_Mapped, &bh->b_state);
24916
+ bh->b_private = (void *)&bh_cb;
24917
+ atomic_inc(&bh_cb.blks_allocated);
24919
+ /* drive the buffer head down */
24920
+ /* to the device */
24922
+ eio.rsector = bh->b_rsector;
24923
+ eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24924
+ raid5_make_request(mddev, io_flag, &eio);
24926
+ /* update progress in local variables */
24927
+ cur_bufptr += bh->b_size;
24928
+ next_offset += bh->b_size;
24929
+ remaining_bytes -= bh->b_size;
24931
+ /* wait for all bh's I/O's to end */
24932
+ wait_on_bh_cb(&bh_cb);
24936
+ /* continue if no errors found */
24938
+ /* check for a mid-sector ending offset
24940
+ * if found, perform I/O on part of that
24943
+ if (remaining_bytes)
24944
+ /* perform the partial sector io */
24945
+ rc = md_raid5_internal_partial_sector_io(
24946
+ mddev, io_flag, &bh_cb,
24948
+ 0, remaining_bytes,
24949
+ cur_bufptr, §or_buf);
24951
+ /* free the sector buffer if it was allocated */
24953
+ evms_cs_deallocate_memory(sector_buf);
24955
+ /* coalesce return codes */
24958
+ LOG_EVERYTHING("%s: rc(%u)\n", __FUNCTION__, rc);
24966
+ evms_sector_t startingLSN,
24967
+ evms_sector_t numLSNs,
24971
+ u_int64_t starting_offset, num_bytes;
24973
+ starting_offset = startingLSN;
24974
+ starting_offset <<= EVMS_VSECTOR_SIZE_SHIFT;
24975
+ num_bytes = numLSNs;
24976
+ num_bytes <<= EVMS_VSECTOR_SIZE_SHIFT;
24977
+ rc = md_raid5_internal_io(mddev,io_flag,starting_offset,
24978
+ num_bytes, bufptr);
24982
+static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
24984
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24985
+ struct stripe_head *sh;
24986
+ int sectors_per_chunk = conf->chunk_size >> 9;
24987
+ unsigned long stripe = sector_nr/sectors_per_chunk;
24988
+ int chunk_offset = sector_nr % sectors_per_chunk;
24989
+ int dd_idx, pd_idx;
24990
+ unsigned long first_sector;
24991
+ int raid_disks = conf->raid_disks;
24992
+ int data_disks = raid_disks-1;
24996
+ sh = get_active_stripe(conf, sector_nr, 0, 0);
24997
+ bufsize = sh->size;
24998
+ redone = sector_nr - sh->sector;
24999
+ first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
25000
+ + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
25001
+ sh->pd_idx = pd_idx;
25002
+ spin_lock(&sh->lock);
25003
+ set_bit(STRIPE_SYNCING, &sh->state);
25004
+ clear_bit(STRIPE_INSYNC, &sh->state);
25005
+ sh->sync_redone = redone;
25006
+ spin_unlock(&sh->lock);
25008
+ handle_stripe(sh);
25009
+ release_stripe(sh);
25011
+ return (bufsize>>9)-redone;
25015
+ * This is our raid5 kernel thread.
25017
+ * We scan the hash table for stripes which can be handled now.
25018
+ * During the scan, completed stripes are saved for us by the interrupt
25019
+ * handler, so that they will not have to wait for our next wakeup.
25021
+static void raid5d (void *data)
25023
+ struct stripe_head *sh;
25024
+ raid5_conf_t *conf = data;
25025
+ mddev_t *mddev = conf->mddev;
25028
+ LOG_ENTRY_EXIT("+++ raid5d active\n");
25032
+ if (mddev->sb_dirty) {
25033
+ mddev->sb_dirty = 0;
25034
+ evms_md_update_sb(mddev);
25036
+ md_spin_lock_irq(&conf->device_lock);
25038
+ struct list_head *first;
25040
+ if (list_empty(&conf->handle_list) &&
25041
+ atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
25042
+ !conf->plugged &&
25043
+ !list_empty(&conf->delayed_list))
25044
+ raid5_activate_delayed(conf);
25046
+ if (list_empty(&conf->handle_list))
25049
+ first = conf->handle_list.next;
25050
+ sh = list_entry(first, struct stripe_head, lru);
25052
+ list_del_init(first);
25053
+ atomic_inc(&sh->count);
25054
+ if (atomic_read(&sh->count)!= 1)
25056
+ md_spin_unlock_irq(&conf->device_lock);
25059
+ handle_stripe(sh);
25060
+ release_stripe(sh);
25062
+ md_spin_lock_irq(&conf->device_lock);
25064
+ LOG_DEBUG("%d stripes handled\n", handled);
25066
+ md_spin_unlock_irq(&conf->device_lock);
25068
+ LOG_ENTRY_EXIT("+++ raid5d inactive\n");
25072
+ * Private kernel thread for parity reconstruction after an unclean
25073
+ * shutdown. Reconstruction on spare drives in case of a failed drive
25074
+ * is done by the generic mdsyncd.
25076
+static void raid5syncd (void *data)
25078
+ raid5_conf_t *conf = data;
25079
+ mddev_t *mddev = conf->mddev;
25081
+ if (!conf->resync_parity)
25083
+ if (conf->resync_parity == 2)
25085
+ down(&mddev->recovery_sem);
25086
+ if (evms_md_do_sync(mddev,NULL)) {
25087
+ up(&mddev->recovery_sem);
25088
+ LOG_WARNING("resync aborted!\n");
25091
+ conf->resync_parity = 0;
25092
+ up(&mddev->recovery_sem);
25093
+ LOG_DEFAULT("resync finished.\n");
25096
+static int raid5_run (mddev_t *mddev)
25098
+ raid5_conf_t *conf;
25099
+ int i, j, raid_disk, memory;
25100
+ mdp_super_t *sb = mddev->sb;
25101
+ mdp_disk_t *desc;
25102
+ mdk_rdev_t *rdev;
25103
+ struct disk_info *disk;
25104
+ struct md_list_head *tmp;
25105
+ int start_recovery = 0;
25107
+ MOD_INC_USE_COUNT;
25109
+ if (sb->level != 5 && sb->level != 4) {
25110
+ LOG_ERROR("%s: [md%d] raid level not set to 4/5 (%d)\n",
25111
+ __FUNCTION__, mdidx(mddev), sb->level);
25112
+ MOD_DEC_USE_COUNT;
25116
+ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
25117
+ if ((conf = mddev->private) == NULL)
25119
+ memset (conf, 0, sizeof (*conf));
25120
+ conf->mddev = mddev;
25122
+ if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
25124
+ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
25126
+ conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
25127
+ md_init_waitqueue_head(&conf->wait_for_stripe);
25128
+ INIT_LIST_HEAD(&conf->handle_list);
25129
+ INIT_LIST_HEAD(&conf->delayed_list);
25130
+ INIT_LIST_HEAD(&conf->inactive_list);
25131
+ atomic_set(&conf->active_stripes, 0);
25132
+ atomic_set(&conf->preread_active_stripes, 0);
25133
+ conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
25135
+ conf->plugged = 0;
25136
+ conf->plug_tq.sync = 0;
25137
+ conf->plug_tq.routine = &raid5_unplug_device;
25138
+ conf->plug_tq.data = conf;
25140
+ ITERATE_RDEV(mddev,rdev,tmp) {
25142
+ * This is important -- we are using the descriptor on
25143
+ * the disk only to get a pointer to the descriptor on
25144
+ * the main superblock, which might be more recent.
25146
+ desc = sb->disks + rdev->desc_nr;
25147
+ raid_disk = desc->raid_disk;
25148
+ disk = conf->disks + raid_disk;
25150
+ if (disk_faulty(desc)) {
25151
+ LOG_ERROR("%s: disabled device %s (errors detected)\n",
25152
+ __FUNCTION__, evms_md_partition_name(rdev->node));
25153
+ if (!rdev->faulty) {
25157
+ disk->number = desc->number;
25158
+ disk->raid_disk = raid_disk;
25159
+ disk->dev = rdev->dev;
25160
+ disk->node = rdev->node;
25162
+ disk->operational = 0;
25163
+ disk->write_only = 0;
25165
+ disk->used_slot = 1;
25168
+ if (disk_active(desc)) {
25169
+ if (!disk_sync(desc)) {
25170
+ LOG_ERROR("%s: disabled device %s (not in sync)\n",
25171
+ __FUNCTION__, evms_md_partition_name(rdev->node));
25175
+ if (raid_disk > sb->raid_disks) {
25176
+ LOG_ERROR("%s: disabled device %s (inconsistent descriptor)\n",
25177
+ __FUNCTION__, evms_md_partition_name(rdev->node));
25180
+ if (disk->operational) {
25181
+ LOG_ERROR("%s: disabled device %s (device %d already operational)\n",
25182
+ __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
25185
+ LOG_DEFAULT("%s: device %s operational as raid disk %d\n",
25186
+ __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
25188
+ disk->number = desc->number;
25189
+ disk->raid_disk = raid_disk;
25190
+ disk->dev = rdev->dev;
25191
+ disk->node = rdev->node;
25192
+ disk->operational = 1;
25193
+ disk->used_slot = 1;
25195
+ conf->working_disks++;
25198
+ * Must be a spare disk ..
25200
+ LOG_DEFAULT(" spare disk %s\n", evms_md_partition_name(rdev->node));
25201
+ disk->number = desc->number;
25202
+ disk->raid_disk = raid_disk;
25203
+ disk->dev = rdev->dev;
25204
+ disk->node = rdev->node;
25206
+ disk->operational = 0;
25207
+ disk->write_only = 0;
25209
+ disk->used_slot = 1;
25213
+ for (i = 0; i < MD_SB_DISKS; i++) {
25214
+ desc = sb->disks + i;
25215
+ raid_disk = desc->raid_disk;
25216
+ disk = conf->disks + raid_disk;
25218
+ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
25219
+ !conf->disks[raid_disk].used_slot) {
25221
+ disk->number = desc->number;
25222
+ disk->raid_disk = raid_disk;
25223
+ disk->dev = MKDEV(0,0);
25224
+ disk->node = NULL;
25226
+ disk->operational = 0;
25227
+ disk->write_only = 0;
25229
+ disk->used_slot = 1;
25233
+ conf->raid_disks = sb->raid_disks;
25235
+ * faied_disks: 0 for a fully functional array, 1 for a degraded array.
25237
+ conf->failed_disks = conf->raid_disks - conf->working_disks;
25238
+ conf->mddev = mddev;
25239
+ conf->chunk_size = sb->chunk_size;
25240
+ conf->level = sb->level;
25241
+ conf->algorithm = sb->layout;
25242
+ conf->max_nr_stripes = NR_STRIPES;
25245
+ * If chunk_size is validated in md_core.c, why do it again?
25246
+ * And the check in md_core is:
25247
+ * chunk_size has to be a power of 2 and multiples of PAGE_SIZE
25250
+ if (!conf->chunk_size ||
25251
+ ( (1 << ffz(~conf->chunk_size)) != conf->chunk_size) ||
25252
+ (conf->chunk_size < PAGE_SIZE)) {
25253
+ LOG_ERROR("%s: invalid chunk size %d for md%d\n", __FUNCTION__, conf->chunk_size, mdidx(mddev));
25256
+ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
25257
+ LOG_ERROR(" unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
25260
+ if (conf->failed_disks > 1) {
25261
+ LOG_ERROR(" not enough operational devices for md%d (%d/%d failed)\n",
25262
+ mdidx(mddev), conf->failed_disks, conf->raid_disks);
25266
+ if (conf->working_disks != sb->raid_disks) {
25267
+ LOG_WARNING(" md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
25268
+ start_recovery = 1;
25272
+ const char * name = "evms_raid5d";
25274
+ conf->thread = evms_cs_register_thread(raid5d, conf, name);
25275
+ if (!conf->thread) {
25276
+ LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
25281
+ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
25282
+ conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
25283
+ if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
25284
+ LOG_ERROR("%s: couldn't allocate %dkB for buffers\n", __FUNCTION__, memory);
25285
+ shrink_stripes(conf, conf->max_nr_stripes);
25288
+ LOG_DETAILS("%s: allocated %dkB for md%d\n", __FUNCTION__, memory, mdidx(mddev));
25291
+ * Regenerate the "device is in sync with the raid set" bit for
25294
+ for (i = 0; i < MD_SB_DISKS ; i++) {
25295
+ mark_disk_nonsync(sb->disks + i);
25296
+ for (j = 0; j < sb->raid_disks; j++) {
25297
+ if (!conf->disks[j].operational)
25299
+ if (sb->disks[i].number == conf->disks[j].number)
25300
+ mark_disk_sync(sb->disks + i);
25303
+ sb->active_disks = conf->working_disks;
25305
+ if (sb->active_disks == sb->raid_disks) {
25306
+ LOG_DETAILS("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
25307
+ __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
25309
+ LOG_WARNING("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
25310
+ __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
25313
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
25314
+ const char * name = "evms_raid5syncd";
25316
+ conf->resync_thread = evms_cs_register_thread(raid5syncd, conf,name);
25317
+ if (!conf->resync_thread) {
25318
+ LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
25322
+ LOG_WARNING("%s: raid set md%d not clean; reconstructing parity\n", __FUNCTION__, mdidx(mddev));
25323
+ conf->resync_parity = 1;
25324
+ evms_cs_wakeup_thread(conf->resync_thread);
25327
+ print_raid5_conf(conf);
25328
+ if (start_recovery)
25329
+ evms_md_recover_arrays();
25330
+ print_raid5_conf(conf);
25332
+ /* Ok, everything is just fine now */
25336
+ print_raid5_conf(conf);
25337
+ if (conf->stripe_hashtbl)
25338
+ free_pages((unsigned long) conf->stripe_hashtbl,
25339
+ HASH_PAGES_ORDER);
25342
+ mddev->private = NULL;
25343
+ LOG_WARNING("%s: failed to run raid set md%d\n", __FUNCTION__, mdidx(mddev));
25344
+ MOD_DEC_USE_COUNT;
25348
+static int raid5_stop_resync (mddev_t *mddev)
25350
+ raid5_conf_t *conf = mddev_to_conf(mddev);
25351
+ evms_thread_t *thread;
25353
+ if (conf == NULL) {
25357
+ thread = conf->resync_thread;
25360
+ if (conf->resync_parity) {
25361
+ conf->resync_parity = 2;
25362
+ evms_cs_interrupt_thread(thread);
25363
+ LOG_WARNING("%s: parity resync was not fully finished, restarting next time.\n", __FUNCTION__);
25371
+static int raid5_restart_resync (mddev_t *mddev)
25373
+ raid5_conf_t *conf = mddev_to_conf(mddev);
25375
+ if (conf->resync_parity) {
25376
+ if (!conf->resync_thread) {
25380
+ LOG_DEFAULT("%s: waking up raid5resync.\n", __FUNCTION__);
25381
+ conf->resync_parity = 1;
25382
+ evms_cs_wakeup_thread(conf->resync_thread);
25385
+ LOG_DEFAULT("%s: no restart-resync needed.\n", __FUNCTION__);
25390
+static int raid5_stop (mddev_t *mddev)
25392
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25394
+ if (conf != NULL) {
25395
+ if (conf->resync_thread)
25396
+ evms_cs_unregister_thread(conf->resync_thread);
25397
+ evms_cs_unregister_thread(conf->thread);
25398
+ shrink_stripes(conf, conf->max_nr_stripes);
25399
+ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
25401
+ mddev->private = NULL;
25403
+ MOD_DEC_USE_COUNT;
25408
+static void print_sh (struct stripe_head *sh)
25412
+ LOG_DEFAULT("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
25413
+ LOG_DEFAULT("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
25414
+ LOG_DEFAULT("sh %lu, ", sh->sector);
25415
+ for (i = 0; i < MD_SB_DISKS; i++) {
25416
+ if (sh->bh_cache[i])
25417
+ LOG_DEFAULT("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
25419
+ LOG_DEFAULT("\n");
25422
+static void printall (raid5_conf_t *conf)
25424
+ struct stripe_head *sh;
25427
+ md_spin_lock_irq(&conf->device_lock);
25428
+ for (i = 0; i < NR_HASH; i++) {
25429
+ sh = conf->stripe_hashtbl[i];
25430
+ for (; sh; sh = sh->hash_next) {
25431
+ if (sh->raid_conf != conf)
25436
+ md_spin_unlock_irq(&conf->device_lock);
25440
+static int raid5_status (char *page, mddev_t *mddev)
25442
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25443
+ mdp_super_t *sb = mddev->sb;
25446
+ sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
25447
+ sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
25448
+ for (i = 0; i < conf->raid_disks; i++)
25449
+ sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
25450
+ sz += sprintf (page+sz, "]");
25453
+ sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
25459
+static void print_raid5_conf (raid5_conf_t *conf)
25462
+ struct disk_info *tmp;
25464
+ LOG_DEFAULT("RAID5 conf printout:\n");
25466
+ LOG_DEFAULT("(conf==NULL)\n");
25469
+ LOG_DEFAULT(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
25470
+ conf->working_disks, conf->failed_disks);
25473
+ for (i = 0; i < MD_SB_DISKS; i++) {
25475
+ for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
25477
+ tmp = conf->disks + i;
25478
+ LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
25479
+ i, tmp->spare,tmp->operational,
25480
+ tmp->number,tmp->raid_disk,tmp->used_slot,
25481
+ evms_md_partition_name(tmp->node));
25485
+static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
25488
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
25489
+ raid5_conf_t *conf = mddev->private;
25490
+ struct disk_info *tmp, *sdisk, *fdisk, *rdisk;
25491
+ mdp_super_t *sb = mddev->sb;
25492
+ mdp_disk_t *failed_desc, *spare_desc;
25493
+ mdk_rdev_t *spare_rdev, *failed_rdev;
25495
+ print_raid5_conf(conf);
25496
+ md_spin_lock_irq(&conf->device_lock);
25498
+ * find the disk ...
25502
+ case DISKOP_SPARE_ACTIVE:
25505
+ * Find the failed disk within the RAID5 configuration ...
25506
+ * (this can only be in the first conf->raid_disks part)
25508
+ for (i = 0; i < conf->raid_disks; i++) {
25509
+ tmp = conf->disks + i;
25510
+ if ((!tmp->operational && !tmp->spare) ||
25511
+ !tmp->used_slot) {
25517
+ * When we activate a spare disk we _must_ have a disk in
25518
+ * the lower (active) part of the array to replace.
25520
+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
25525
+ /* fall through */
25527
+ case DISKOP_SPARE_WRITE:
25528
+ case DISKOP_SPARE_INACTIVE:
25531
+ * Find the spare disk ... (can only be in the 'high'
25532
+ * area of the array)
25534
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
25535
+ tmp = conf->disks + i;
25536
+ if (tmp->spare && tmp->number == (*d)->number) {
25541
+ if (spare_disk == -1) {
25548
+ case DISKOP_HOT_REMOVE_SPARE:
25550
+ for (i = 0; i < MD_SB_DISKS; i++) {
25551
+ tmp = conf->disks + i;
25552
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
25553
+ if (tmp->operational) {
25556
+ } else if (!tmp->spare) {
25561
+ removed_disk = i;
25565
+ if (removed_disk == -1) {
25572
+ case DISKOP_HOT_REMOVE_DISK:
25573
+ for (i = 0; i < MD_SB_DISKS; i++) {
25574
+ tmp = conf->disks + i;
25575
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
25576
+ if (i < conf->raid_disks) {
25577
+ if (conf->working_disks != conf->raid_disks) {
25579
+ * Can't remove a disk from an
25580
+ * array that is running in
25586
+ if (sb->spare_disks == 0) {
25588
+ * Must have a spare ready
25589
+ * before removing an active
25596
+ removed_disk = i;
25600
+ if (removed_disk == -1) {
25607
+ case DISKOP_HOT_ADD_DISK:
25615
+ * Switch the spare disk to write-only mode:
25617
+ case DISKOP_SPARE_WRITE:
25618
+ if (conf->spare) {
25623
+ sdisk = conf->disks + spare_disk;
25624
+ sdisk->operational = 1;
25625
+ sdisk->write_only = 1;
25626
+ conf->spare = sdisk;
25629
+ * Deactivate a spare disk:
25631
+ case DISKOP_SPARE_INACTIVE:
25632
+ sdisk = conf->disks + spare_disk;
25633
+ sdisk->operational = 0;
25634
+ sdisk->write_only = 0;
25636
+ * Was the spare being resynced?
25638
+ if (conf->spare == sdisk)
25639
+ conf->spare = NULL;
25642
+ * Activate (mark read-write) the (now sync) spare disk,
25643
+ * which means we switch it's 'raid position' (->raid_disk)
25644
+ * with the failed disk. (only the first 'conf->raid_disks'
25645
+ * slots are used for 'real' disks and we must preserve this
25648
+ case DISKOP_SPARE_ACTIVE:
25649
+ if (!conf->spare) {
25654
+ sdisk = conf->disks + spare_disk;
25655
+ fdisk = conf->disks + failed_disk;
25657
+ spare_desc = &sb->disks[sdisk->number];
25658
+ failed_desc = &sb->disks[fdisk->number];
25660
+ if (spare_desc != *d) {
25666
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
25672
+ if (sdisk->raid_disk != spare_disk) {
25678
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
25684
+ if (fdisk->raid_disk != failed_disk) {
25691
+ * do the switch finally
25693
+ spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
25694
+ failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
25696
+ /* There must be a spare_rdev, but there may not be a
25697
+ * failed_rdev. That slot might be empty...
25699
+ spare_rdev->desc_nr = failed_desc->number;
25701
+ failed_rdev->desc_nr = spare_desc->number;
25703
+ xchg_values(*spare_desc, *failed_desc);
25704
+ xchg_values(*fdisk, *sdisk);
25707
+ * (careful, 'failed' and 'spare' are switched from now on)
25709
+ * we want to preserve linear numbering and we want to
25710
+ * give the proper raid_disk number to the now activated
25711
+ * disk. (this means we switch back these values)
25714
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
25715
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
25716
+ xchg_values(spare_desc->number, failed_desc->number);
25717
+ xchg_values(sdisk->number, fdisk->number);
25719
+ *d = failed_desc;
25721
+ //if (sdisk->dev == MKDEV(0,0))
25722
+ if (sdisk->node == NULL)
25723
+ sdisk->used_slot = 0;
25726
+ * this really activates the spare.
25728
+ fdisk->spare = 0;
25729
+ fdisk->write_only = 0;
25732
+ * if we activate a spare, we definitely replace a
25733
+ * non-operational disk slot in the 'low' area of
25734
+ * the disk array.
25736
+ conf->failed_disks--;
25737
+ conf->working_disks++;
25738
+ conf->spare = NULL;
25742
+ case DISKOP_HOT_REMOVE_SPARE:
25743
+ rdisk = conf->disks + removed_disk;
25745
+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
25750
+ if (conf->spare != NULL) {
25751
+ if (conf->spare->number == removed_disk) {
25752
+ conf->spare = NULL;
25756
+ rdisk->dev = MKDEV(0,0);
25757
+ rdisk->node = NULL;
25758
+ rdisk->used_slot = 0;
25762
+ case DISKOP_HOT_REMOVE_DISK:
25763
+ rdisk = conf->disks + removed_disk;
25764
+ if (rdisk->operational) {
25765
+ /* We're removing a running disk in the array. */
25766
+ conf->working_disks--;
25767
+ conf->failed_disks++;
25769
+ rdisk->dev = MKDEV(0,0);
25770
+ rdisk->node = NULL;
25771
+ rdisk->used_slot = 0;
25772
+ rdisk->operational = 0;
25781
+ md_spin_unlock_irq(&conf->device_lock);
25782
+ print_raid5_conf(conf);
25786
+static int raid5_bmap(mddev_t *mddev, evms_sector_t *rsector, evms_logical_node_t **node)
25788
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25789
+ const unsigned int raid_disks = conf->raid_disks;
25790
+ const unsigned int data_disks = raid_disks - 1;
25791
+ unsigned int dd_idx, pd_idx;
25793
+ *rsector = (evms_sector_t)raid5_compute_sector((unsigned long)*rsector,
25799
+ *node = conf->disks[dd_idx].node;
25800
+ return 0; /* always successful */
25803
+static int raid5_evms_ioctl (
25805
+ struct inode * inode,
25806
+ struct file * file,
25807
+ unsigned int cmd,
25808
+ unsigned long arg)
25811
+ evms_logical_node_t *node;
25814
+ case EVMS_GET_BMAP:
25816
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
25817
+ rc = raid5_bmap(mddev,&bmap->rsector,&node);
25820
+ rc = IOCTL(node, inode, file, cmd, arg);
25833
+static int raid5_pers_ioctl(mddev_t *mddev, int cmd, void * args){
25836
+ raid5_ioctl_init_io_t init_io_args;
25839
+ LOG_DETAILS("%s: cmd == %d.\n", __FUNCTION__, cmd);
25841
+ case EVMS_MD_RAID5_INIT_IO:
25843
+ if (copy_from_user(&init_io_args, (raid5_ioctl_init_io_t*)args, sizeof(init_io_args)) ) {
25847
+ rc = evms_cs_allocate_memory(&data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT);
25852
+ if (copy_from_user(data, init_io_args.data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT)) {
25853
+ evms_cs_deallocate_memory(data);
25857
+ rc = raid5_init_io(mddev, init_io_args.rw,
25858
+ init_io_args.lsn, init_io_args.nr_sects,data);
25860
+ copy_to_user(init_io_args.data, data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT);
25861
+ evms_cs_deallocate_memory(data);
25863
+ copy_to_user((raid5_ioctl_init_io_t*)args, &init_io_args, sizeof(init_io_args));
25874
+static mdk_personality_t raid5_personality=
25876
+ name: "evms_raid5",
25877
+ init_io: raid5_init_io,
25878
+ make_request: raid5_make_request,
25880
+ stop: raid5_stop,
25881
+ status: raid5_status,
25882
+ error_handler: raid5_error,
25883
+ diskop: raid5_diskop,
25884
+ stop_resync: raid5_stop_resync,
25885
+ restart_resync: raid5_restart_resync,
25886
+ sync_request: raid5_sync_request,
25887
+ evms_ioctl: raid5_evms_ioctl,
25888
+ md_pers_ioctl: raid5_pers_ioctl
25891
+static int md__init raid5_init (void)
25893
+ return evms_register_md_personality (RAID5, &raid5_personality);
25896
+static void raid5_exit (void)
25898
+ evms_unregister_md_personality (RAID5);
25901
+module_init(raid5_init);
25902
+module_exit(raid5_exit);
25903
+#ifdef MODULE_LICENSE
25904
+MODULE_LICENSE("GPL");
25906
diff -Naur linux-2002-03-28/drivers/evms/md_xor.c evms-2002-03-28/drivers/evms/md_xor.c
25907
--- linux-2002-03-28/drivers/evms/md_xor.c Wed Dec 31 18:00:00 1969
25908
+++ evms-2002-03-28/drivers/evms/md_xor.c Fri Mar 1 11:50:58 2002
25911
+ * md_xor.c : Multiple Devices driver for Linux
25913
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
25914
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
25916
+ * Dispatch optimized RAID-5 checksumming functions.
25918
+ * 'md_xor.c' is an EVMS version of linux/drivers/md/xor.c modified
25919
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
25921
+ * This program is free software; you can redistribute it and/or modify
25922
+ * it under the terms of the GNU General Public License as published by
25923
+ * the Free Software Foundation; either version 2, or (at your option)
25924
+ * any later version.
25926
+ * You should have received a copy of the GNU General Public License
25927
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
25928
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25931
+#define BH_TRACE 0
25932
+#include <linux/module.h>
25933
+#include <linux/evms/evms_md.h>
25934
+#include <linux/evms/evms_xor.h>
25935
+#include <asm/xor.h>
25937
+#define LOG_PREFIX "md raid5: "
25938
+/* The xor routines to use. */
25939
+static struct xor_block_template *active_template;
25942
+evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr)
25944
+ unsigned long *p0, *p1, *p2, *p3, *p4;
25945
+ unsigned long bytes = bh_ptr[0]->b_size;
25947
+ p0 = (unsigned long *) bh_ptr[0]->b_data;
25948
+ p1 = (unsigned long *) bh_ptr[1]->b_data;
25949
+ if (count == 2) {
25950
+ active_template->do_2(bytes, p0, p1);
25954
+ p2 = (unsigned long *) bh_ptr[2]->b_data;
25955
+ if (count == 3) {
25956
+ active_template->do_3(bytes, p0, p1, p2);
25960
+ p3 = (unsigned long *) bh_ptr[3]->b_data;
25961
+ if (count == 4) {
25962
+ active_template->do_4(bytes, p0, p1, p2, p3);
25966
+ p4 = (unsigned long *) bh_ptr[4]->b_data;
25967
+ active_template->do_5(bytes, p0, p1, p2, p3, p4);
25970
+/* Set of all registered templates. */
25971
+static struct xor_block_template *template_list;
25973
+#define BENCH_SIZE (PAGE_SIZE)
25976
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
25979
+ unsigned long now;
25980
+ int i, count, max;
25982
+ tmpl->next = template_list;
25983
+ template_list = tmpl;
25986
+ * Count the number of XORs done during a whole jiffy, and use
25987
+ * this to calculate the speed of checksumming. We use a 2-page
25988
+ * allocation to have guaranteed color L1-cache layout.
25991
+ for (i = 0; i < 5; i++) {
25994
+ while (jiffies == now) {
25996
+ tmpl->do_2(BENCH_SIZE, b1, b2);
26005
+ speed = max * (HZ * BENCH_SIZE / 1024);
26006
+ tmpl->speed = speed;
26008
+ LOG_DEFAULT(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
26009
+ speed / 1000, speed % 1000);
26013
+calibrate_xor_block(void)
26016
+ struct xor_block_template *f, *fastest;
26018
+ b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
26020
+ LOG_ERROR("Yikes! No memory available.\n");
26023
+ b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
26025
+ LOG_DEFAULT("measuring checksumming speed\n");
26028
+#define xor_speed(templ) do_xor_speed((templ), b1, b2)
26030
+ XOR_TRY_TEMPLATES;
26034
+ free_pages((unsigned long)b1, 2);
26036
+ fastest = template_list;
26037
+ for (f = fastest; f; f = f->next)
26038
+ if (f->speed > fastest->speed)
26041
+#ifdef XOR_SELECT_TEMPLATE
26042
+ fastest = XOR_SELECT_TEMPLATE(fastest);
26045
+ active_template = fastest;
26046
+ LOG_DEFAULT("using function: %s (%d.%03d MB/sec)\n",
26047
+ fastest->name, fastest->speed / 1000, fastest->speed % 1000);
26052
+MD_EXPORT_SYMBOL(evms_md_xor_block);
26054
+#ifdef MODULE_LICENSE
26055
+MODULE_LICENSE("GPL");
26058
+module_init(calibrate_xor_block);
26059
diff -Naur linux-2002-03-28/drivers/evms/os2lvm_vge.c evms-2002-03-28/drivers/evms/os2lvm_vge.c
26060
--- linux-2002-03-28/drivers/evms/os2lvm_vge.c Wed Dec 31 18:00:00 1969
26061
+++ evms-2002-03-28/drivers/evms/os2lvm_vge.c Thu Mar 28 12:50:56 2002
26065
+ * Copyright (c) International Business Machines Corp., 2001
26067
+ * This program is free software; you can redistribute it and/or modify
26068
+ * it under the terms of the GNU General Public License as published by
26069
+ * the Free Software Foundation; either version 2 of the License, or
26070
+ * (at your option) any later version.
26072
+ * This program is distributed in the hope that it will be useful,
26073
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26074
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
26075
+ * the GNU General Public License for more details.
26077
+ * You should have received a copy of the GNU General Public License
26078
+ * along with this program; if not, write to the Free Software
26079
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26084
+ * linux/drivers/evms/os2lvm_vge.c
26086
+ * EVMS OS/2 LVM Emulator
26088
+ * This Volume Group Emulator will take the type 0x35 partitions created by
26089
+ * OS/2 versions 4.5 and later and build them into volumes. It emulates
26090
+ * the Drive Linking and Bad Block Relocation features and therefore
26091
+ * provides binary compatibility with the OS/2 version. Of course, if
26092
+ * you select to mkfs a file system OS/2 doesn't support, you're on your
26095
+ * Since OS/2 LVM volumes can only exist on DOS-style partitioned disks,
26096
+ * this VGE has a dependency on dospart.c to report a list of the
26097
+ * candidate partitions. This module will then take the appropriate partitions
26098
+ * from the list and use them to build the OS/2-style volumes.
26100
+ * Change Activity:
26102
+ * 7/01/2001 John Stiles getting started.
26103
+ * 9/14/2001 John Stiles original version.
26104
+ * 11/01/2001 John Stiles new naming scheme.
26105
+ * 11/21/2001 John Stiles i/o path changes.
26108
+#define EVMS_DEBUG 1
26109
+#define EVMS_OS2_DEBUG 1
26111
+#include <linux/module.h>
26112
+#include <linux/kernel.h>
26113
+#include <linux/config.h>
26114
+#include <linux/genhd.h>
26115
+#include <linux/major.h>
26116
+#include <linux/string.h>
26117
+#include <linux/blk.h>
26118
+#include <linux/init.h>
26119
+#include <linux/evms/evms_kernel.h>
26120
+#include <linux/evms/evms_os2.h>
26121
+#include <asm/uaccess.h>
26123
+#define LOG_PREFIX "os2lvm: "
26125
+// Global Structure and Type definitions
26126
+typedef struct BBR_IO_Transfer_Record_s{
26127
+ int Write_Flag; /* 0 = read, 1 = write */
26128
+ os2_drivelink_runtime_entry_t * Partition_Data;
26130
+ struct BBR_IO_Transfer_Record_s * Next;
26131
+} BBR_IO_Transfer_Record_t;
26133
+typedef struct DL_IO_Tracking_Record_s{ /* structure used to track IO requests that must be broken into two pieces due to drive linking */
26134
+ unsigned int IO_In_Progress;
26136
+ eio_t Original; /* Original IO */
26137
+ eio_t Link1; /* First child. */
26138
+ os2_drivelink_runtime_entry_t * Link1_Partition_Data;
26139
+ BBR_IO_Transfer_Record_t * Link1_Transfer_Record;
26140
+ int Link1_BBR_Attempted;
26141
+ eio_t Link2; /* Second child */
26142
+ os2_drivelink_runtime_entry_t * Link2_Partition_Data;
26143
+ BBR_IO_Transfer_Record_t * Link2_Transfer_Record;
26144
+ int Link2_BBR_Attempted;
26145
+} DL_IO_Tracking_Record_t;
26147
+// Prototypes for local VGE functions
26148
+static int discover_os2lvm_partitions( evms_logical_node_t ** );
26149
+static evms_logical_node_t * find_os2_volume( u_int32_t );
26150
+static int add_os2link( os2_drivelink_runtime_entry_t *, evms_logical_node_t * );
26151
+static os2_drivelink_runtime_entry_t * find_link_data( os2_drivelink_runtime_entry_t **, u_int32_t );
26152
+static int find_drive_link( evms_logical_node_t *, os2_drivelink_runtime_entry_t **, evms_sector_t *, evms_sector_t * );
26153
+static int validate_signaturesector( evms_logical_node_t *, LVM_Signature_Sector *, u_int32_t );
26154
+static int validate_drivelinksector( void *, int, u_int32_t);
26155
+static int validate_bbrtablesector( void *, int, u_int32_t );
26156
+static u_int32_t check_for_os2_bbr_relocations( char * );
26157
+static int check_os2_volumes( evms_logical_node_t ** );
26158
+static int OS2_ioctl_cmd_broadcast(
26159
+ evms_logical_node_t *node,
26160
+ struct inode *inode, struct file *file,
26161
+ unsigned long cmd, unsigned long arg);
26162
+static int os2_ioctl_cmd_plugin_ioctl(
26163
+ evms_logical_node_t *node,
26164
+ struct inode *inode, struct file *file,
26165
+ unsigned long cmd, unsigned long arg);
26166
+static void BBR_Worker( void *);
26167
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
26168
+ struct buffer_head * bh,
26171
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record);
26172
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate);
26173
+static int Sector_Is_Remapped(os2_drivelink_runtime_entry_t * io_dlentry,
26174
+ evms_sector_t Source_Sector,
26175
+ evms_sector_t * Replacement_Sector);
26176
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t * io_dlentry,
26177
+ evms_sector_t Source_Sector,
26178
+ int Replacement_Sector_Is_Bad);
26179
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t * io_dlentry,
26180
+ evms_sector_t starting_lsn,
26181
+ unsigned int count,
26183
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child);
26186
+// Prototypes for local memory allocation/deallocation functions
26187
+static os2_drivelink_runtime_entry_t * new_os2_drive_link( LVM_Signature_Sector *, evms_logical_node_t * );
26188
+static char * new_os2_link_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t * );
26189
+static char * new_os2_bbr_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t * );
26190
+static evms_logical_node_t * new_os2volume( u_int32_t, char * );
26191
+static int delete_os2lvm_volume( evms_logical_node_t * );
26192
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t *, int );
26195
+// Prototypes for Function Table interface
26196
+static int discover_os2lvm( evms_logical_node_t ** );
26197
+static int delete_os2lvm( evms_logical_node_t * );
26198
+static void read_os2lvm( evms_logical_node_t *, eio_t * );
26199
+static void write_os2lvm( evms_logical_node_t *, eio_t * );
26200
+static int init_io_os2lvm( evms_logical_node_t *, int, evms_sector_t, evms_sector_t, void * );
26201
+static int ioctl_os2lvm( evms_logical_node_t *, struct inode *, struct file *, unsigned int, unsigned long );
26202
+static int do_os2_bbr_io( os2_drivelink_runtime_entry_t *, int, evms_sector_t, evms_sector_t, void * );
26205
+// Global data structures
26206
+static evms_logical_node_t * os2lvm_nodes = NULL;
26207
+static evms_thread_t * BBR_Worker_Thread = NULL;
26208
+static spinlock_t BBR_Queue_Lock = SPIN_LOCK_UNLOCKED;
26209
+static const char * BBR_Worker_Name = "evms_os2_bbr_io";
26210
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Head = NULL;
26211
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Tail = NULL;
26212
+static evms_pool_mgmt_t * BBR_Transfer_Pool = NULL;
26213
+static char * BBR_Transfer_Pool_Name = "OS-2 Transfer Pool";
26214
+static char * DL_Tracking_Pool_Name = "OS-2 Tracking Pool";
26215
+static evms_pool_mgmt_t * DL_Tracking_Pool = NULL;
26218
+// Required plug-in Function Table definition
26219
+static evms_plugin_function_table_t function_table = {
26220
+ discover: &discover_os2lvm,
26221
+ delete : &delete_os2lvm,
26222
+ read : &read_os2lvm,
26223
+ write : &write_os2lvm,
26224
+ init_io : &init_io_os2lvm,
26225
+ ioctl : &ioctl_os2lvm
26229
+// Required plug-in Header definition
26230
+static evms_plugin_header_t plugin_header = {
26231
+ id : SetPluginID(
26233
+ EVMS_REGION_MANAGER, // Region Manger class
26234
+ 2 ), // Unique ID within VGEs
26240
+ required_common_services_version: {
26241
+ major : EVMS_COMMON_SERVICES_MAJOR,
26242
+ minor : EVMS_COMMON_SERVICES_MINOR,
26243
+ patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
26245
+ function_table : &function_table // Function table for this plugin
26249
+// Required Plugin Functions
26253
+ * Function: discover_os2lvm
26255
+ * This is the entry point into the discovery process.
26257
+static int discover_os2lvm( evms_logical_node_t ** evms_partition_list )
26261
+ if ( ! BBR_Transfer_Pool ) {
26262
+ BBR_Transfer_Pool = evms_cs_create_pool( sizeof(BBR_IO_Transfer_Record_t), BBR_Transfer_Pool_Name, NULL, NULL);
26263
+ if ( ! BBR_Transfer_Pool ) {
26268
+ if ( ! DL_Tracking_Pool ) {
26269
+ DL_Tracking_Pool = evms_cs_create_pool( sizeof(DL_IO_Tracking_Record_t), DL_Tracking_Pool_Name, NULL, NULL);
26270
+ if ( ! DL_Tracking_Pool ) {
26275
+ rc = discover_os2lvm_partitions( evms_partition_list );
26278
+ rc = check_os2_volumes( evms_partition_list );
26286
+ * Function: delete_os2lvm
26288
+ * This is the entry point for deleting a node.
26290
+static int delete_os2lvm( evms_logical_node_t * logical_node )
26292
+ LOG_EXTRA("Deleting volume: %s\n", logical_node->name );
26294
+ return delete_os2lvm_volume( logical_node );
26299
+ * Function: read_os2lvm
26301
+static void read_os2lvm( evms_logical_node_t * node,
26305
+ evms_sector_t sector_count;
26306
+ struct buffer_head * Link1 = NULL;
26307
+ struct buffer_head * Link2 = NULL;
26308
+ DL_IO_Tracking_Record_t * Tracking_Record = NULL;
26309
+ os2_drivelink_runtime_entry_t * cur_dlentry = NULL;
26310
+ BBR_IO_Transfer_Record_t * Transfer_Record;
26312
+ sector_count = eio->rsize;
26313
+ rc = find_drive_link( node, &cur_dlentry, &eio->rsector, §or_count );
26316
+ if ( cur_dlentry->bbr_is_active ) {
26317
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26318
+ /* Transfer the IO to the BBR Worker Thread. */
26319
+ Transfer_Record->Write_Flag = 0;
26320
+ Transfer_Record->Partition_Data = cur_dlentry;
26321
+ Transfer_Record->eio = *eio;
26322
+ Transfer_Record->Next = NULL;
26323
+ BBR_Transfer_IO(Transfer_Record);
26326
+ R_IO( cur_dlentry->link_partition, eio );
26329
+ /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
26330
+ Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1); /* Block until we get a tracking record. */
26331
+ Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26332
+ Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26334
+ /* Initialize the tracking record so we can associate the two new I/Os with the original. */
26335
+ Tracking_Record->IO_In_Progress = 2;
26336
+ Tracking_Record->Up_To_Date = 0;
26337
+ Tracking_Record->Original = *eio;
26339
+ /* Create the I/O to the first link. */
26340
+ Clone_Bufferhead(eio->bh,Link1);
26341
+ Link1->b_private = Tracking_Record;
26342
+ Link1->b_end_io = OS2_DL_Callback;
26343
+ Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26344
+ Tracking_Record->Link1.rsector = eio->rsector;
26345
+ Tracking_Record->Link1.rsize = sector_count;
26346
+ Tracking_Record->Link1.bh = Link1;
26347
+ Tracking_Record->Link1_Partition_Data = cur_dlentry;
26348
+ Tracking_Record->Link1_BBR_Attempted = 0;
26349
+ Tracking_Record->Link1_Transfer_Record = NULL;
26351
+ /* Create the I/O to the second link */
26352
+ Clone_Bufferhead(eio->bh,Link2);
26353
+ Link2->b_private = Tracking_Record;
26354
+ Link2->b_end_io = OS2_DL_Callback;
26355
+ Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26356
+ Tracking_Record->Link2.bh = Link2;
26357
+ Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
26358
+ Link2->b_rsector = 0;
26359
+ Tracking_Record->Link2.rsector = 0;
26360
+ Tracking_Record->Link2.rsize = eio->rsize - sector_count;
26361
+ Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
26362
+ Tracking_Record->Link2_BBR_Attempted = 0;
26363
+ Tracking_Record->Link2_Transfer_Record = NULL;
26365
+ /* Process the I/O to the first link. */
26366
+ if ( cur_dlentry->bbr_is_active ) {
26367
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26368
+ /* Transfer the IO to the BBR Worker Thread. */
26369
+ Transfer_Record->Write_Flag = 0;
26370
+ Transfer_Record->Partition_Data = cur_dlentry;
26371
+ Transfer_Record->eio = Tracking_Record->Link1;
26372
+ Transfer_Record->Next = NULL;
26373
+ BBR_Transfer_IO(Transfer_Record);
26376
+ R_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
26378
+ /* Process the I/O to the second link. */
26379
+ cur_dlentry = cur_dlentry->next;
26380
+ if ( cur_dlentry->bbr_is_active ) {
26381
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26382
+ /* Transfer the IO to the BBR Worker Thread. */
26383
+ Transfer_Record->Write_Flag = 0;
26384
+ Transfer_Record->Partition_Data = cur_dlentry;
26385
+ Transfer_Record->eio = Tracking_Record->Link2;
26386
+ Transfer_Record->Next = NULL;
26387
+ BBR_Transfer_IO(Transfer_Record);
26390
+ R_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
26394
+ LOG_SERIOUS("READ error, request exceeds volume size.\n" );
26395
+ EVMS_IO_ERROR(eio);
26402
+ * Function: write_os2lvm
26404
+static void write_os2lvm( evms_logical_node_t * node,
26408
+ evms_sector_t sector_count;
26409
+ struct buffer_head * Link1 = NULL;
26410
+ struct buffer_head * Link2 = NULL;
26411
+ DL_IO_Tracking_Record_t * Tracking_Record = NULL;
26412
+ os2_drivelink_runtime_entry_t * cur_dlentry = NULL;
26413
+ BBR_IO_Transfer_Record_t * Transfer_Record;
26415
+ sector_count = eio->rsize;
26416
+ rc = find_drive_link( node, &cur_dlentry, &eio->rsector, §or_count );
26419
+ /* Set up a Transfer Record. If there are Bad Blocks on the partition that this I/O is
26420
+ directed to, then we will need the Transfer Record to put the I/O in the queue for the
26421
+ BBR Worker Thread. If there are no bad blocks, then we will need the Transfer Record
26422
+ for the OS2_BBR_Write_Callback function. This function expects the Transfer Record to
26423
+ be pre-allocated and available because it is running on an interrupt thread and should
26424
+ not do memory allocation. If there is an error during the write, then the
26425
+ OS2_BBR_Write_Callback function will use the Transfer Record to transfer the I/O
26426
+ to the BBR worker thread for further processing. If there are no errors during the I/O,
26427
+ then the OS2_BBR_Write_Callback will deallocate the Transfer Record. */
26428
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26429
+ Transfer_Record->Write_Flag = 1;
26430
+ Transfer_Record->Partition_Data = cur_dlentry;
26431
+ Transfer_Record->eio = *eio;
26432
+ Transfer_Record->Next = NULL;
26433
+ if ( cur_dlentry->bbr_is_active ) {
26434
+ /* Transfer the IO to the BBR Worker Thread. */
26435
+ BBR_Transfer_IO(Transfer_Record);
26438
+ evms_cs_register_for_end_io_notification(Transfer_Record,eio->bh,OS2_BBR_Write_Callback);
26439
+ W_IO( cur_dlentry->link_partition, eio );
26443
+ /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
26444
+ Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1); /* Block until we get a tracking record. */
26445
+ Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26446
+ Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26448
+ /* Initialize the tracking record so we can associate the two new I/Os with the original. */
26449
+ Tracking_Record->IO_In_Progress = 2;
26450
+ Tracking_Record->Up_To_Date = 0;
26451
+ Tracking_Record->Original = *eio;
26453
+ /* Create the I/O to the first link. */
26454
+ Clone_Bufferhead(eio->bh,Link1);
26455
+ Link1->b_private = Tracking_Record;
26456
+ Link1->b_end_io = OS2_DL_Callback;
26457
+ Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26458
+ Tracking_Record->Link1.rsector = eio->rsector;
26459
+ Tracking_Record->Link1.rsize = sector_count;
26460
+ Tracking_Record->Link1.bh = Link1;
26461
+ Tracking_Record->Link1_Partition_Data = cur_dlentry;
26463
+ /* Create the I/O to the second link */
26464
+ Clone_Bufferhead(eio->bh,Link2);
26465
+ Link2->b_private = Tracking_Record;
26466
+ Link2->b_end_io = OS2_DL_Callback;
26467
+ Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26468
+ Tracking_Record->Link2.bh = Link2;
26469
+ Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
26470
+ Link2->b_rsector = 0;
26471
+ Tracking_Record->Link2.rsector = 0;
26472
+ Tracking_Record->Link2.rsize = eio->rsize - sector_count;
26473
+ Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
26475
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26476
+ Transfer_Record->Write_Flag = 1;
26477
+ Transfer_Record->Partition_Data = cur_dlentry;
26478
+ Transfer_Record->eio = Tracking_Record->Link1;
26479
+ Transfer_Record->Next = NULL;
26480
+ Tracking_Record->Link1_Transfer_Record = Transfer_Record;
26481
+ /* Process the I/O to the first link. */
26482
+ if ( cur_dlentry->bbr_is_active ) {
26483
+ /* Transfer the IO to the BBR Worker Thread. */
26484
+ Tracking_Record->Link1_BBR_Attempted = 1;
26485
+ BBR_Transfer_IO(Transfer_Record);
26488
+ Tracking_Record->Link1_BBR_Attempted = 0;
26489
+ W_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
26492
+ /* Process the I/O to the second link. */
26493
+ cur_dlentry = cur_dlentry->next;
26494
+ Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1); /* Block until we get a transfer record. */
26495
+ Transfer_Record->Write_Flag = 1;
26496
+ Transfer_Record->Partition_Data = cur_dlentry;
26497
+ Transfer_Record->eio = Tracking_Record->Link2;
26498
+ Transfer_Record->Next = NULL;
26499
+ Tracking_Record->Link2_Transfer_Record= Transfer_Record;
26500
+ if ( cur_dlentry->bbr_is_active ) {
26501
+ /* Transfer the IO to the BBR Worker Thread. */
26502
+ Tracking_Record->Link2_BBR_Attempted = 1;
26503
+ BBR_Transfer_IO(Transfer_Record);
26506
+ Tracking_Record->Link2_BBR_Attempted = 0;
26507
+ W_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
26512
+ LOG_SERIOUS("WRITE error, request exceeds volume size.\n" );
26513
+ EVMS_IO_ERROR(eio);
26519
+static int os2_ioctl_cmd_plugin_ioctl( evms_logical_node_t *node,
26520
+ struct inode *inode,
26521
+ struct file *file,
26522
+ unsigned long cmd,
26523
+ unsigned long arg)
26526
+ os2_volume_runtime_entry_t * Node_Data;
26527
+ os2_drivelink_runtime_entry_t * curlink, * nextlink;
26528
+ evms_plugin_ioctl_t tmp, *user_parms;
26530
+ user_parms = (evms_plugin_ioctl_t *)arg;
26531
+ /* copy user's parameters to kernel space */
26532
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
26536
+ Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
26537
+ /* is this cmd targetted at this feature ? */
26538
+ if (tmp.feature_id == node->plugin->id) {
26539
+ switch(tmp.feature_command) {
26543
+ } else { /* broadcast this cmd to all children */
26544
+ curlink = Node_Data->drive_link;
26546
+ /* broadcast this cmd to all children */
26547
+ while ( curlink ) {
26548
+ nextlink = curlink->next;
26550
+ rc = IOCTL(curlink->link_partition,inode,file,cmd,arg);
26555
+ curlink = nextlink;
26559
+ /* copy info to userspace */
26560
+ if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
26567
+static int OS2_ioctl_cmd_broadcast( evms_logical_node_t *node,
26568
+ struct inode *inode,
26569
+ struct file *file,
26570
+ unsigned long cmd,
26571
+ unsigned long arg)
26574
+ os2_volume_runtime_entry_t * Node_Data;
26575
+ os2_drivelink_runtime_entry_t * curlink, * nextlink;
26577
+ Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
26578
+ curlink = Node_Data->drive_link;
26580
+ /* broadcast this cmd to all children */
26581
+ while ( curlink ) {
26582
+ nextlink = curlink->next;
26584
+ rc |= IOCTL(curlink->link_partition,inode,file,cmd,arg);
26586
+ curlink = nextlink;
26594
+ * Function: ioctl_os2lvm
26596
+static int ioctl_os2lvm( evms_logical_node_t * logical_node,
26597
+ struct inode * inode,
26598
+ struct file * file,
26599
+ unsigned int cmd,
26600
+ unsigned long arg )
26603
+ evms_sector_t Sectors_Per_Cylinder;
26604
+ evms_sector_t Total_Sectors;
26605
+ evms_logical_node_t * partition_node;
26607
+ partition_node = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link->link_partition;
26612
+ LOG_EVERYTHING("Ioctl %d\n", cmd );
26615
+ case HDIO_GETGEO:
26617
+ // Return fake geometry
26618
+ struct hd_geometry *hd = ( struct hd_geometry * )arg;
26620
+ unsigned char heads = 255;
26621
+ unsigned char sectors = OS2LVM_SYNTHETIC_SECTORS_PER_TRACK;
26624
+ /* OS/2 always created a fake geometry using the maximum cylinder size. */
26625
+ Sectors_Per_Cylinder = heads * sectors;
26626
+ for ( cylinders = 0, Total_Sectors = 0; Total_Sectors < ( ( os2_volume_runtime_entry_t * )logical_node->instance_data )->size_in_sectors; cylinders++ )
26627
+ Total_Sectors += Sectors_Per_Cylinder;
26631
+ if ( copy_to_user(( short * )( &hd->cylinders ), &cylinders, sizeof( cylinders )) ||
26632
+ copy_to_user(( char * )( &hd->heads ), &heads, sizeof( heads )) ||
26633
+ copy_to_user(( char * )( &hd->sectors ), §ors, sizeof( sectors )) ||
26634
+ copy_to_user(( long * )( &hd->start ), &start, sizeof( start )) ) {
26640
+ case EVMS_GET_BMAP:
26641
+ // No kernel images allowed on OS/2 volumes right now.
26645
+ case EVMS_QUIESCE_VOLUME:
26646
+ case EVMS_GET_DISK_LIST:
26647
+ case EVMS_CHECK_MEDIA_CHANGE:
26648
+ case EVMS_REVALIDATE_DISK:
26649
+ case EVMS_OPEN_VOLUME:
26650
+ case EVMS_CLOSE_VOLUME:
26651
+ rc = OS2_ioctl_cmd_broadcast(logical_node, inode, file, cmd, arg);
26653
+ case EVMS_PLUGIN_IOCTL:
26654
+ rc = os2_ioctl_cmd_plugin_ioctl( logical_node, inode, file, cmd, arg);
26666
+ * Function: init_io_os2lvm
26668
+static int init_io_os2lvm( evms_logical_node_t * node,
26669
+ int io_flag, /* 0=read, 1=write */
26670
+ evms_sector_t sect_nr, /* disk LBA */
26671
+ evms_sector_t num_sects, /* # of sectors */
26672
+ void * buf_addr ) /* buffer address */
26675
+ evms_sector_t sector_count;
26676
+ evms_logical_node_t * partition_node;
26677
+ os2_drivelink_runtime_entry_t * cur_dlentry = NULL;
26679
+ sector_count = num_sects;
26680
+ rc = find_drive_link( node, &cur_dlentry, §_nr, §or_count );
26683
+ partition_node = cur_dlentry->link_partition;
26684
+ if ( cur_dlentry->bbr_is_active )
26685
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
26687
+ rc = INIT_IO( partition_node, io_flag, sect_nr, num_sects, buf_addr );
26688
+ if ( rc && io_flag ) {
26689
+ cur_dlentry->bbr_is_active = 1;
26690
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
26695
+ partition_node = cur_dlentry->link_partition;
26696
+ if ( cur_dlentry->bbr_is_active )
26697
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
26699
+ rc = INIT_IO( partition_node, io_flag, sect_nr, sector_count, buf_addr );
26700
+ if ( rc && io_flag) {
26701
+ cur_dlentry->bbr_is_active = 1;
26702
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
26707
+ cur_dlentry = cur_dlentry->next;
26708
+ partition_node = cur_dlentry->link_partition;
26709
+ num_sects -= sector_count;
26710
+ buf_addr += sector_count << OS2_SECTOR_SHIFT;
26712
+ if ( cur_dlentry->bbr_is_active )
26713
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
26715
+ rc = INIT_IO( partition_node, io_flag, 0, num_sects, buf_addr );
26716
+ if ( rc && io_flag ) {
26717
+ cur_dlentry->bbr_is_active = 1;
26718
+ rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
26725
+ LOG_SERIOUS("INITIO error, request exceeds volume size.\n" );
26734
+ * Function: do_os2_bbr_io
26736
+ * Check the Bad Block Relocation list for relocated sectors. If any are found,
26737
+ * this function will do the i/o directly.
26738
+ * Return values: 0 == i/o done, 1 == unable to complete i/o
26740
+static int do_os2_bbr_io( os2_drivelink_runtime_entry_t * io_dlentry,
26741
+ int rw, /* 0=read, 1=write */
26742
+ evms_sector_t starting_lsn, /* disk LBA */
26743
+ evms_sector_t count, /* # of sectors */
26744
+ void * buffer ) /* buffer address */
26746
+ evms_sector_t lsn, remapped_lsn;
26749
+ // For each sector in this request, check if this sector has already
26750
+ // been remapped. If so, process all previous sectors in this request,
26751
+ // followed by the remapped sector. Then reset the starting lsn and
26752
+ // count and keep going with the rest of the request as if it were
26753
+ // a whole new request.
26754
+ for ( lsn = 0; lsn < count; lsn++ ) {
26755
+ remapped_lsn = starting_lsn + lsn;
26756
+ rc = Sector_Is_Remapped(io_dlentry,remapped_lsn, &remapped_lsn);
26758
+ // Process all sectors in the request up to this one.
26760
+ rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, lsn, buffer);
26762
+ /* If this is a read, then we are done. */
26767
+ /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
26768
+ if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, lsn, buffer) ) {
26769
+ /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
26773
+ buffer += (lsn * OS2_BYTES_PER_SECTOR);
26776
+ // Process the remapped sector.
26777
+ rc = INIT_IO(io_dlentry->link_partition, rw, remapped_lsn, 1, buffer);
26779
+ /* If this is a read, then we are done. */
26784
+ /* Get the original sector that was remapped. */
26785
+ remapped_lsn = starting_lsn + lsn;
26787
+ /* Invalidate the current remapping. */
26788
+ Invalidate_Mapping(io_dlentry,remapped_lsn,1);
26790
+ /* Try to remap the bad sector to another replacement sector. */
26791
+ if ( !Create_New_BBR_Table_Entry(io_dlentry, remapped_lsn, 1, buffer) ) {
26792
+ /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
26798
+ buffer += OS2_BYTES_PER_SECTOR;
26800
+ starting_lsn += (lsn + 1);
26801
+ count -= (lsn + 1);
26807
+ /* Are there any sectors left to process? */
26808
+ if ( count > 0 ) {
26809
+ rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, count, buffer);
26811
+ /* If this is a read, then we are done. */
26816
+ /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
26817
+ if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, count, buffer) ) {
26818
+ /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
26831
+ * Function: os2lvm_vge_init
26833
+int __init os2lvm_vge_init( void )
26835
+ /* Should I be allocating the pools and BBR Worker Thread here? */
26836
+ return evms_cs_register_plugin( &plugin_header );/* register with EVMS*/
26839
+void __exit os2lvm_vge_exit( void )
26841
+ /* BUGBUG - Is there where I need to kill the BBR Worker Thread and free any memory I am still holding? */
26843
+ evms_cs_unregister_plugin(&plugin_header);
26846
+module_init(os2lvm_vge_init);
26847
+module_exit(os2lvm_vge_exit);
26848
+#ifdef MODULE_LICENSE
26849
+MODULE_LICENSE("GPL");
26855
+// Local VGE Functions
26859
+ * Function: discover_os2lvm_partitions
26861
+ * Examine the list of logical partitions. Any type 0x35 partition that contains
26862
+ * a valid OS/2 signature sector is consumed and added to the appropriate logical
26865
+static int discover_os2lvm_partitions( evms_logical_node_t ** evms_partition_list )
26867
+ evms_logical_node_t * evms_partition;
26868
+ evms_logical_node_t * next_partition;
26869
+ evms_logical_node_t * new_volume;
26870
+ evms_sector_t sectornum = 0;
26871
+ u_int32_t volumeserial;
26873
+ char * volumename;
26874
+ char driveletter[8];
26875
+ LVM_Signature_Sector * sigsector;
26876
+ os2_drivelink_runtime_entry_t * new_dlentry;
26878
+ LOG_ENTRY_EXIT("Discovering OS/2 Logical Volumes\n" );
26879
+ if ( evms_cs_allocate_memory(( void** )&sigsect, OS2_BYTES_PER_SECTOR ) ) {
26880
+ LOG_SERIOUS("Could not allocate Signature sector data\n" );
26884
+ for ( evms_partition = *evms_partition_list; evms_partition; evms_partition = next_partition ) {
26885
+ // Save the next node. We may remove this one from the list.
26886
+ next_partition = evms_partition->next;
26888
+ // The node must not have the OS/2 vge id.
26889
+ if ( evms_partition->plugin->id == plugin_header.id ) {
26893
+ LOG_EXTRA("Examining partition serial %s\n", evms_partition->name );
26895
+ // Have to go to the last accessible sector of the partition and
26896
+ // read it in. It should be the LVM Signature Sector.
26897
+ sectornum = evms_partition->total_vsectors - 1;
26898
+ if ( INIT_IO( evms_partition, 0, sectornum, 1, sigsect ) ) {
26899
+ // On an I/O error, continue on to the next partition.
26900
+ // This means that the volume it belongs to will be incomplete
26901
+ // and later deleted in the completeness check.
26902
+ LOG_SERIOUS("I/O error on Signature sector read\n" );
26905
+ sigsector = ( LVM_Signature_Sector * )sigsect;
26907
+ // Validate the Signature Sector
26908
+ if ( validate_signaturesector( evms_partition, sigsector, OS2_BYTES_PER_SECTOR )) {
26909
+ LOG_EXTRA("Signature sector is not valid\n" );
26912
+// Bugbug - At this point, we have validated an OS/2 LVM Signature Sector. However, if the partition
26913
+// is not marked as a type 0x35, then this Signature Sector may be erroneous. The problem here is that
26914
+// there is currently no way to find out if this partition was marked as a type 0x35. Also, if we
26915
+// should reject this partition due to some problem with the drive linking or BBR metadata, should we
26916
+// leave the partition in the evms partition list or not? If the partition was marked as a type 0x35
26917
+// and the Signature Sector was valid, then I would say that we should remove it from the evms partition
26918
+// partition list. If the partition is not marked as a type 0x35 but the Signature Sector is valid, then
26919
+// we could have a stray Signature Sector, in which case the partition should remain in the evms partition
26920
+// list. The OS/2 LVM Signature Sector does have additional information that could be used to resolve
26921
+// this issue, such as the starting LBA of the partition that the Signature Sector belongs to, but
26922
+// we can not get the starting LBA of the partition to compare against. If we leave the partition in
26923
+// the evms partition list when we should not, then an extraneous compatibility volume could result.
26924
+ // Build the Metadata for this partition
26925
+ if ( !( new_dlentry = new_os2_drive_link( sigsector, evms_partition )) ) {
26929
+ // Search for the parent Volume for this partition
26930
+ volumeserial = sigsector->Volume_Serial_Number;
26931
+ if ( !( new_volume = find_os2_volume( volumeserial )) ) {
26933
+ // If not found, allocate a new Volume
26934
+ LOG_EVERYTHING("Parent not found, allocate new.\n" );
26935
+ if ( sigsector->Drive_Letter != '\0' ) {
26936
+ driveletter[0] = sigsector->Drive_Letter;
26937
+ driveletter[1] = '\0';
26938
+ volumename = driveletter;
26941
+ volumename = sigsector->Volume_Name;
26943
+ if ( !( new_volume = new_os2volume( volumeserial, volumename )) ) {
26944
+ delete_os2_drive_link( new_dlentry, 0 );
26945
+ new_dlentry = NULL;
26950
+ // Now remove the partition from the List
26951
+ evms_cs_remove_logical_node_from_list( evms_partition_list, evms_partition );
26953
+ if ( (( os2_volume_runtime_entry_t * )new_volume->instance_data )->complete ) {
26954
+ // Volume is complete, delete this duplicate
26955
+ delete_os2_drive_link( new_dlentry, 0 );
26956
+ LOG_EVERYTHING("Deleting duplicate node.\n" );
26957
+ (( os2_volume_runtime_entry_t * )new_volume->instance_data )->Export_Needed = 1; //We must export this volume again!
26959
+ else /* Add this partition to its parent Volume */
26960
+ add_os2link( new_dlentry, new_volume );
26964
+ evms_cs_deallocate_memory(( void* )sigsect );
26965
+ LOG_ENTRY_EXIT("Finished Discovering OS/2 Logical Volumes\n" );
26972
+ * Function: find_os2_volume
26974
+ * Search for the OS/2 volume that matches the volume serial.
26976
+static evms_logical_node_t * find_os2_volume( u_int32_t volumeserial )
26978
+ os2_volume_runtime_entry_t * cur_volume;
26979
+ evms_logical_node_t * cur_node;
26981
+ cur_node = os2lvm_nodes;
26983
+ while ( cur_node ) {
26984
+ cur_volume = ( os2_volume_runtime_entry_t * )cur_node->instance_data;
26985
+ if ( cur_volume->Volume_Serial_Number == volumeserial ) {
26986
+ LOG_EVERYTHING("%s: found volser match.\n", __FUNCTION__ );
26989
+ LOG_EVERYTHING("%s: volser does not match.\n", __FUNCTION__ );
26990
+ cur_node = cur_volume->next_os2lvm_node;
26998
+ * Function: add_os2link
27000
+ * Add the Drive Link metadata to the parent OS/2 volume.
27002
+static int add_os2link( os2_drivelink_runtime_entry_t * newlink,
27003
+ evms_logical_node_t * parent_volume )
27005
+ os2_volume_runtime_entry_t * parent_metadata = ( os2_volume_runtime_entry_t * )parent_volume->instance_data;
27006
+ os2_drivelink_runtime_entry_t * curlink = parent_metadata->drive_link, * nextlink;
27009
+ nextlink = curlink->next;
27010
+ while ( nextlink ) {
27011
+ curlink = nextlink;
27012
+ nextlink = curlink->next;
27014
+ curlink->next = newlink;
27017
+ parent_metadata->drive_link = newlink;
27019
+ parent_metadata->drive_link_count++;
27020
+ parent_metadata->size_in_sectors += newlink->sector_count;
27021
+ parent_volume->total_vsectors += newlink->sector_count;
27027
+ * Function: find_link_data
27029
+ * Find the Drive Link metadata that matches the partition serial number.
27030
+ * Remove it from the link_list passed in.
27032
+static os2_drivelink_runtime_entry_t * find_link_data( os2_drivelink_runtime_entry_t ** link_list,
27033
+ u_int32_t partitionser )
27035
+ os2_drivelink_runtime_entry_t * curlink = *link_list, * prevlink = NULL;
27037
+ while ( curlink ) {
27038
+ if ( curlink->Partition_Serial_Number == partitionser ) {
27039
+ if ( prevlink ) {
27040
+ prevlink->next = curlink->next;
27043
+ *link_list = curlink->next;
27045
+ curlink->next = NULL;
27048
+ prevlink = curlink;
27049
+ curlink = prevlink->next;
27057
+ * Function: find_drive_link
27059
+ * Walk the linked list of drive links to find the proper
27060
+ * target partition. Returns the metadata associated with
27061
+ * the drive link.
27062
+ * Return values: 1 == data contained in 1 partition, 2 == data crosses 2 partitions,
27063
+ * 0 == target partition not found
27065
+static int find_drive_link( evms_logical_node_t * node,
27066
+ os2_drivelink_runtime_entry_t ** dlentry,
27067
+ evms_sector_t * sector,
27068
+ evms_sector_t * num_sectors )
27070
+ evms_sector_t last_link_sector, cur_last_sector;
27071
+ os2_drivelink_runtime_entry_t * curlink = (( os2_volume_runtime_entry_t * )node->instance_data )->drive_link, * nextlink;
27073
+ while ( curlink ) {
27074
+ nextlink = curlink->next;
27075
+ last_link_sector = curlink->start_sector + curlink->sector_count;
27076
+ if ( *sector < last_link_sector ) {
27077
+ *dlentry = curlink;
27078
+ cur_last_sector = *sector + *num_sectors;
27079
+ *sector -= curlink->start_sector;
27080
+ LOG_EVERYTHING("I/O start_RBA == %Ld , sector_count == %Ld\n", *sector, *num_sectors );
27081
+ if ( cur_last_sector <= last_link_sector )
27084
+ if ( (*dlentry)->next )
27085
+ *num_sectors -= cur_last_sector - last_link_sector;
27092
+ curlink = nextlink;
27100
+// Allocation/Deallocation Functions
27104
+ * Function: new_os2_drive_link
27106
+ * Allocate space for a new OS/2 drive link structure.
27107
+ * Initialize the appropriate fields.
27108
+ * Note: since the BBR info applies to each link, the BBR structures
27109
+ * are also initialized here.
27111
+static os2_drivelink_runtime_entry_t * new_os2_drive_link( LVM_Signature_Sector * signature_sector,
27112
+ evms_logical_node_t * evms_partition )
27115
+ u_int32_t feature, feature_size, sectoroffset;
27116
+ os2_drivelink_runtime_entry_t * new_dlentry;
27118
+ if ( evms_cs_allocate_memory(( void** )&new_dlentry, sizeof( os2_drivelink_runtime_entry_t )) ) {
27119
+ LOG_SERIOUS("Could not allocate drivelink metadata\n" );
27122
+ new_dlentry->sector_count = signature_sector->Partition_Size_To_Report_To_User;
27123
+ new_dlentry->Partition_Serial_Number = signature_sector->Partition_Serial_Number;
27124
+ new_dlentry->bbr_is_active = 0; // initialize to not active
27125
+ new_dlentry->link_partition = evms_partition;
27126
+ init_MUTEX( &(new_dlentry->BBR_Table_Lock) );
27128
+ sectoroffset = signature_sector->Partition_Start;
27129
+ LOG_EVERYTHING("Partition Start is at LBA %i\n", sectoroffset );
27130
+ for ( i = 0 ; i < OS2LVM_MAX_FEATURES_PER_VOLUME ; i++ ) {
27131
+ feature = signature_sector->LVM_Feature_Array[i].Feature_ID;
27133
+ feature_size = signature_sector->LVM_Feature_Array[i].Feature_Data_Size;
27134
+ LOG_EVERYTHING("Entry %d in Feature Table is valid,\n", i+1 );
27135
+ LOG_EVERYTHING("Feature Data size is %i sectors.\n", feature_size );
27136
+ if ( feature == DRIVE_LINKING_FEATURE_ID ) {
27137
+ if ( !new_dlentry->link_data ) {
27138
+ new_dlentry->Drive_Link_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data - sectoroffset;
27139
+ new_dlentry->Drive_Link_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data - sectoroffset;
27140
+ new_dlentry->link_data = new_os2_link_data( new_dlentry->Drive_Link_Data_Copy1, new_dlentry->Drive_Link_Data_Copy2, feature_size, evms_partition );
27141
+ if ( new_dlentry->link_data == NULL) {
27142
+ delete_os2_drive_link(new_dlentry,0);
27143
+ new_dlentry = NULL;
27147
+ LOG_WARNING("os2lvm_vge: Drive Linking Feature encountered twice in the same Feature Array!\n");
27148
+ delete_os2_drive_link(new_dlentry,0);
27149
+ new_dlentry = NULL;
27152
+ else if ( feature == BBR_FEATURE_ID ) {
27153
+ if ( !new_dlentry->bbr_data ) {
27154
+ new_dlentry->BBR_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data;
27155
+ new_dlentry->BBR_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data;
27156
+ new_dlentry->BBR_Feature_Size = feature_size;
27157
+ new_dlentry->bbr_data = new_os2_bbr_data( new_dlentry->BBR_Data_Copy1, new_dlentry->BBR_Data_Copy2, feature_size, evms_partition );
27158
+ if ( new_dlentry->bbr_data == NULL) {
27159
+ delete_os2_drive_link(new_dlentry,0);
27160
+ new_dlentry = NULL;
27162
+ else if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
27163
+ new_dlentry->bbr_is_active = check_for_os2_bbr_relocations( new_dlentry->bbr_data );
27167
+ LOG_WARNING("os2lvm_vge: BBR Feature encountered twice in the same Feature Array!\n");
27168
+ delete_os2_drive_link(new_dlentry,0);
27169
+ new_dlentry = NULL;
27173
+ LOG_WARNING("os2lvm_vge: Unknown Feature entry %d found.\n", feature );
27174
+ delete_os2_drive_link(new_dlentry,0);
27175
+ new_dlentry = NULL;
27178
+ if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
27179
+ LOG_EVERYTHING("Feature is active.\n" );
27184
+ if ( new_dlentry &&
27185
+ ( ( ! new_dlentry->bbr_data ) || ( ! new_dlentry->link_data ) )
27187
+ LOG_WARNING("os2lvm_vge: Incomplete Feature Data found.\n" );
27188
+ delete_os2_drive_link(new_dlentry,0);
27189
+ new_dlentry = NULL;
27191
+ return new_dlentry;
27196
+ * Function: new_os2_link_data
27198
+ * Allocate space for OS/2 drive link information.
27199
+ * Read in and validate the information from disk.
27200
+ * Note: assumes 512 byte sectors.
27202
+static char * new_os2_link_data( u_int32_t linksector1,
27203
+ u_int32_t linksector2,
27204
+ u_int32_t linknumsectors,
27205
+ evms_logical_node_t * link_partition )
27207
+ char * new_data1; /* Buffer used to hold the primary copy of the drive linking data. */
27208
+ char * new_data2; /* Buffer used to hold the secondary copy of the drive linking data. */
27209
+ char * p1; /* Used to access individual sectors of data within new_data1. */
27210
+ char * p2; /* Used to access individual sectors of data within new_data2. */
27211
+ int memsize = linknumsectors * OS2_BYTES_PER_SECTOR;
27212
+ u_int32_t i, seq1, seq2;
27214
+ /* Allocate Memory for the buffers to hold the drive linking data. */
27215
+ LOG_EVERYTHING("Drive Linking Feature entry found.\n" );
27216
+ if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
27217
+ LOG_SERIOUS("Could not allocate Primary Link data\n" );
27220
+ if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
27221
+ LOG_SERIOUS("Could not allocate Secondary Link data\n" );
27222
+ evms_cs_deallocate_memory(( void* )new_data1 );
27226
+ LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", linksector1 );
27227
+ LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", linksector2 );
27229
+ /* Read the drive linking data into memory. */
27230
+ if ( INIT_IO( link_partition, 0, linksector1, linknumsectors, new_data1 ) ) {
27231
+ LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
27236
+ /* Set up access to the buffer. Extract the Master Sequence Number from the buffer. */
27238
+ seq1 = (( LVM_Link_Table_First_Sector * )p1 )->Sequence_Number;
27241
+ if ( INIT_IO( link_partition, 0, linksector2, linknumsectors, new_data2 ) ) {
27242
+ LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
27247
+ /* Set up access to the second buffer. Extract its copy of the Master Sequence Number. */
27249
+ seq2 = (( LVM_Link_Table_Sector * )p2 )->Sequence_Number;
27252
+ /* Validate both copies of the drive linking data one sector at a time. */
27253
+ for ( i = 0; i < linknumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
27254
+ if ( (seq1 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p1, i, seq1 )) {
27255
+ LOG_SERIOUS("The primary copy of the drive link data is invalid! Sector %i is not valid\n", i );
27259
+ if ( (seq2 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p2, i, seq2 )) {
27260
+ LOG_SERIOUS("The secondary copy of the drive link data is invalid! Sector %i is not valid\n", i );
27266
+ LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
27267
+ LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
27269
+ /* Choose which copy of the drive linking data to use. If both sequence numbers are 0, then both copies
27270
+ of the drive linking data are bad. If both are equal and non-zero, then both copies are good and it
27271
+ really doesn't matter which one you choose. Otherwise, choose the copy with the highest sequence number. */
27272
+ if ( seq2 > seq1 ) {
27273
+ evms_cs_deallocate_memory(( void* )new_data1 );
27274
+ return new_data2;
27277
+ evms_cs_deallocate_memory(( void* )new_data2 );
27279
+ evms_cs_deallocate_memory(( void* )new_data1 );
27280
+ new_data1 = NULL;
27283
+ return new_data1;
27288
+ * Function: new_os2_bbr_data
27290
+ * Allocate space for OS/2 bad block relocation information.
27291
+ * Read in and validate the information from disk.
27292
+ * Note: assumes 512 byte sectors.
27294
+static char * new_os2_bbr_data( u_int32_t bbrsector1,
27295
+ u_int32_t bbrsector2,
27296
+ u_int32_t bbrnumsectors,
27297
+ evms_logical_node_t * bbr_partition )
27299
+ char * new_data1; /* Buffer to hold the primary copy of the BBR data. */
27300
+ char * new_data2; /* Buffer to hold the secondary copy of the BBR data. */
27301
+ char * p1; /* Used to examine the individual sectors of BBR data within new_data1. */
27302
+ char * p2; /* Used to examine the individual sectors of BBR data within new_data2. */
27303
+ int memsize = bbrnumsectors * OS2_BYTES_PER_SECTOR;
27304
+ u_int32_t i, seq1, seq2;
27306
+ LOG_EVERYTHING("BBR Feature entry found.\n" );
27308
+ /* Allocate memory for the buffers. */
27309
+ if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
27310
+ LOG_SERIOUS("Could not allocate Primary BBR data\n" );
27313
+ if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
27314
+ LOG_SERIOUS("Could not allocate Secondary BBR data\n" );
27315
+ evms_cs_deallocate_memory(( void* )new_data1 );
27319
+ LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", bbrsector1 );
27320
+ LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", bbrsector2 );
27322
+ /* Read in both copies of the BBR data. */
27323
+ if ( INIT_IO( bbr_partition, 0, bbrsector1, bbrnumsectors, new_data1 ) ) {
27324
+ LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
27329
+ /* Establish access to the first sector of the BBR data. Extract the Master Sequence Number
27330
+ for this copy of the BBR data. */
27332
+ seq1 = (( LVM_BBR_Table_First_Sector * )p1 )->Sequence_Number;
27335
+ if ( INIT_IO( bbr_partition, 0, bbrsector2, bbrnumsectors, new_data2 ) ) {
27336
+ LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
27341
+ /* Establish access to the first sector of the second copy of the BBR data. Extract the
27342
+ Master Sequence Number for this copy of the BBR data. */
27344
+ seq2 = (( LVM_BBR_Table_Sector * )p2 )->Sequence_Number;
27347
+ /* Validate both copies of the BBR Data, one sector at a time. */
27348
+ for ( i = 0; i < bbrnumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
27349
+ if ( (seq1 > 0) && validate_bbrtablesector( p1, i, seq1 )) {
27350
+ LOG_SERIOUS("The primary BBR data is invalid! Sector %i is not valid\n", i );
27354
+ if ( (seq2 > 0) && validate_bbrtablesector( p2, i, seq2 )) {
27355
+ LOG_SERIOUS("The secondary BBR data is invalid! Sector %i is not valid\n", i );
27361
+ LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
27362
+ LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
27364
+ /* Choose which copy of the BBR Data to use based upon the sequence number. If both sequence numbers
27365
+ are 0, then there is no valid BBR data. If both are non-zero and equal, then it really doesn't
27366
+ matter which copy is used. Otherwise, choose the copy with the highest sequence number. */
27367
+ if ( seq2 > seq1 ) {
27368
+ evms_cs_deallocate_memory(( void* )new_data1 );
27369
+ return new_data2;
27372
+ evms_cs_deallocate_memory(( void* )new_data2 );
27374
+ evms_cs_deallocate_memory(( void* )new_data1 );
27375
+ new_data1 = NULL;
27378
+ return new_data1;
27383
+ * Function: new_os2volume
27385
+ * Allocate space for a new OS/2 logical volume.
27386
+ * Initialize the appropriate fields.
27388
+static evms_logical_node_t * new_os2volume( u_int32_t volumeserial,
27389
+ char * volume_name )
27391
+ evms_logical_node_t * new_node;
27392
+ os2_volume_runtime_entry_t * cur_volume;
27394
+ if ( evms_cs_allocate_logical_node( &new_node ) ) {
27395
+ LOG_SERIOUS("Could not allocate new volume\n" );
27398
+ if ( evms_cs_allocate_memory( &new_node->instance_data, sizeof( os2_volume_runtime_entry_t )) ) {
27399
+ LOG_SERIOUS("Could not allocate volume metadata\n" );
27400
+ evms_cs_deallocate_logical_node( new_node );
27403
+ new_node->plugin = &plugin_header;
27404
+ new_node->system_id = LVM_PARTITION_INDICATOR;
27405
+ sprintf( new_node->name, "os2/%s", volume_name );
27406
+ cur_volume = ( os2_volume_runtime_entry_t * )new_node->instance_data;
27407
+ cur_volume->Volume_Serial_Number = volumeserial;
27408
+ cur_volume->Export_Needed = 1;
27410
+ if ( os2lvm_nodes == NULL )
27411
+ os2lvm_nodes = new_node;
27413
+ // This is the first node discovered. Start the BBR thread.
27414
+ if ( ! BBR_Worker_Thread ) {
27415
+ BBR_Worker_Thread = evms_cs_register_thread(BBR_Worker, NULL, BBR_Worker_Name);
27416
+ if ( ! BBR_Worker_Thread ) {
27417
+ evms_cs_deallocate_memory(new_node->instance_data);
27418
+ evms_cs_deallocate_logical_node(new_node);
27419
+ os2lvm_nodes = NULL;
27424
+ cur_volume = ( os2_volume_runtime_entry_t * )os2lvm_nodes->instance_data;
27425
+ while ( cur_volume->next_os2lvm_node )
27426
+ cur_volume = ( os2_volume_runtime_entry_t * )cur_volume->next_os2lvm_node->instance_data;
27427
+ cur_volume->next_os2lvm_node = new_node;
27430
+ MOD_INC_USE_COUNT;
27437
+ * Function: delete_os2lvm_volume
27439
+ * This function deletes the in-memory representation of an OS/2
27440
+ * logical volume.
27442
+static int delete_os2lvm_volume( evms_logical_node_t * logical_node )
27444
+ os2_drivelink_runtime_entry_t * curdrvlink = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link, * nextdrvlink;
27445
+ os2_volume_runtime_entry_t * cur_volume, * next_volume;
27447
+ while ( curdrvlink ) {
27448
+ nextdrvlink = curdrvlink->next;
27449
+ delete_os2_drive_link( curdrvlink, 1 );
27450
+ curdrvlink = nextdrvlink;
27453
+ cur_volume = ( os2_volume_runtime_entry_t * )os2lvm_nodes->instance_data;
27454
+ if ( os2lvm_nodes == logical_node )
27455
+ os2lvm_nodes = cur_volume->next_os2lvm_node;
27457
+ while ( cur_volume->next_os2lvm_node ) {
27458
+ next_volume = ( os2_volume_runtime_entry_t * )cur_volume->next_os2lvm_node->instance_data;
27459
+ if ( cur_volume->next_os2lvm_node == logical_node ) {
27460
+ cur_volume->next_os2lvm_node = next_volume->next_os2lvm_node;
27466
+ if ( os2lvm_nodes == NULL ) {
27467
+ // Just deleted the last os2 node. Stop the BBR thread.
27468
+ if ( BBR_Worker_Thread ) {
27469
+ evms_cs_unregister_thread(BBR_Worker_Thread);
27470
+ BBR_Worker_Thread = NULL;
27474
+ evms_cs_deallocate_memory( logical_node->instance_data );
27475
+ evms_cs_deallocate_logical_node( logical_node );
27477
+ MOD_DEC_USE_COUNT;
27484
+ * Function: delete_os2_drive_link
27486
+ * This function deletes the drive link runtime structure and any
27487
+ * other structures it points to.
27489
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t * drive_link,
27490
+ int delete_link_partition )
27492
+ if ( drive_link->link_data )
27493
+ evms_cs_deallocate_memory( drive_link->link_data );
27494
+ if ( drive_link->bbr_data )
27495
+ evms_cs_deallocate_memory( drive_link->bbr_data );
27496
+ if ( delete_link_partition )
27497
+ DELETE( drive_link->link_partition );
27498
+ evms_cs_deallocate_memory( drive_link );
27505
+// Consistency Checking Functions
27509
+ * Function: validate_signaturesector
27511
+ * This function checks the OS/2 LVM Signature Sector
27513
+static int validate_signaturesector(evms_logical_node_t * evms_partition,
27514
+ LVM_Signature_Sector * signature_sector,
27515
+ u_int32_t sectorsize )
27517
+ u_int32_t crc_hold, crc_new;
27519
+ /* In order for a signature sector to be considered valid, its signature and CRC must
27520
+ be correct. Also, OS/2 stores the starting LBA of the partition and the size of
27521
+ the partition that this signature sector corresponds to. These should be checked
27522
+ as well. However, since the starting LBA of the partition that this belongs to is
27523
+ not available to us as part of an evms_logical_node_t, we can only check the size
27524
+ of the partition against what is stored in the signature sector. */
27526
+ /* The signature used is in two parts. Test the first part. */
27527
+ if ( signature_sector->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE ) {
27528
+ LOG_EVERYTHING("Primary LVM Signature failed.\n" );
27532
+ /* Test the second part of the signature. */
27533
+ if ( signature_sector->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE ) {
27534
+ LOG_EVERYTHING("Secondary LVM Signature failed.\n" );
27538
+ /* Calculate the CRC and compare it against the stored CRC. */
27539
+ crc_hold = signature_sector->Signature_Sector_CRC;
27540
+ signature_sector->Signature_Sector_CRC = 0;
27541
+ crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, ( void * )signature_sector, sectorsize );
27542
+ if ( crc_hold != crc_new ) {
27543
+ LOG_EVERYTHING("Signature sector crc failed.\n" );
27544
+ LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27548
+ // The partition size must == that found in the Signature Sector
27549
+ if ( evms_partition->total_vsectors != signature_sector->Partition_Sector_Count ) {
27550
+ LOG_EXTRA("Partition size is not valid\n" );
27559
+ * Function: validate_drivelinksector
27561
+ * This function checks the OS/2 LVM Drivelink Feature Sector
27563
+static int validate_drivelinksector( void * Sector_To_Validate,
27564
+ int Sector_Index,
27565
+ u_int32_t Master_Sequence_Number )
27567
+ u_int32_t crc_hold, crc_new;
27568
+ LVM_Link_Table_First_Sector * First_Sector = (LVM_Link_Table_First_Sector * ) Sector_To_Validate;
27569
+ LVM_Link_Table_Sector * Link_Sector = (LVM_Link_Table_Sector * ) Sector_To_Validate;
27571
+ /* The OS/2 drive linking data covers several sectors. The format of the first sector is slightly
27572
+ different from the following sectors because it contains additional information about how many
27573
+ drive links are actually in use. The following sectors just contain portions of the drive link
27574
+ table. Each sector of OS/2 drive linking data contains a signature, crc, and sequence number
27575
+ which must be validated. */
27577
+ if ( Sector_Index == 0 ) {
27579
+ /* Link Table Master Signature Check */
27580
+ if ( LINK_TABLE_MASTER_SIGNATURE != First_Sector->Link_Table_Signature ) {
27581
+ LOG_EVERYTHING("Link Table Master Signature Test failed.\n" );
27585
+ /* We will NOT check the sequence number here as the first sector of drive link data is the
27586
+ source of the Master_Sequence_Number which was passed in to us. */
27588
+ /* Set up for the CRC Check */
27589
+ crc_hold = First_Sector->Link_Table_CRC;
27590
+ First_Sector->Link_Table_CRC = 0;
27593
+ /* Link Table Internal Signature Check */
27594
+ if ( LINK_TABLE_SIGNATURE != Link_Sector->Link_Table_Signature ) {
27595
+ LOG_EVERYTHING("Link Table Internal Signature Test failed.\n" );
27599
+ /* Check the sequence number. */
27600
+ if ( Master_Sequence_Number != Link_Sector->Sequence_Number ) {
27601
+ LOG_EVERYTHING("Link Table Internal Sequence Number Test failed.\n" );
27605
+ /* Set up for the CRC Check */
27606
+ crc_hold = Link_Sector->Link_Table_CRC;
27607
+ Link_Sector->Link_Table_CRC = 0;
27610
+ crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
27611
+ if ( crc_hold != crc_new ) {
27612
+ LOG_EVERYTHING("Link Table crc failed.\n" );
27613
+ LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27622
+ * Function: validate_bbrtablesector
27624
+ * This function checks the OS/2 LVM Bad Block Relocation Feature Sector
27626
+static int validate_bbrtablesector( void * Sector_To_Validate,
27627
+ int Sector_Index,
27628
+ u_int32_t Master_Sequence_Number )
27630
+ u_int32_t crc_hold, crc_new;
27631
+ LVM_BBR_Table_First_Sector * First_Sector = (LVM_BBR_Table_First_Sector * ) Sector_To_Validate;
27632
+ LVM_BBR_Table_Sector * BBR_Sector = (LVM_BBR_Table_Sector * ) Sector_To_Validate;
27634
+ /* The OS/2 bad block relocation (BBR) data covers several sectors. The format of the first sector
27635
+ is different from the following sectors because it contains additional information about how many
27636
+ relocations are actually in use and the size and location of the block of replacement sectors.
27637
+ The following sectors just contain portions of the BBR remap table. Each sector of OS/2 BBR data
27638
+ contains a signature, crc, and sequence number which must be validated. */
27640
+ if ( Sector_Index == 0 ) {
27642
+ /* BBR Table Master Signature Check */
27643
+ if ( BBR_TABLE_MASTER_SIGNATURE != First_Sector->Signature ) {
27644
+ LOG_EVERYTHING("BBR Table Master Signature Test failed.\n" );
27648
+ /* We will NOT check the sequence number here as the first sector of BBR data is the
27649
+ source of the Master_Sequence_Number which was passed in to us. */
27651
+ /* Set up for the CRC Check */
27652
+ crc_hold = First_Sector->CRC;
27653
+ First_Sector->CRC = 0;
27657
+ /* BBR Table Internal Signature Check */
27658
+ if ( BBR_TABLE_SIGNATURE != BBR_Sector->Signature ) {
27659
+ LOG_EVERYTHING("BBR Table Internal Signature Test failed.\n" );
27663
+ /* Check the sequence number. */
27664
+ if ( Master_Sequence_Number != BBR_Sector->Sequence_Number ) {
27665
+ LOG_EVERYTHING("BBR Table Internal Sequence Number Test failed.\n" );
27669
+ /* Set up for the CRC Check */
27670
+ crc_hold = BBR_Sector->CRC;
27671
+ BBR_Sector->CRC = 0;
27674
+ crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
27675
+ if ( crc_hold != crc_new ) {
27676
+ LOG_EVERYTHING("BBRTable crc failed.\n" );
27677
+ LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27686
+ * Function: check_for_os2_bbr_relocations
27688
+ * This function checks the OS/2 LVM Bad Block Relocation Tables
27689
+ * for any active relocation sectors. The bbr table is reformatted in memory
27690
+ * to make searches faster.
27691
+ * Return values: 0 == no active relocations, 1 == contains active relocations
27693
+static u_int32_t check_for_os2_bbr_relocations( char * bbr_data_ptr )
27695
+ LVM_BBR_Feature * feature_data = ( LVM_BBR_Feature * )bbr_data_ptr;
27697
+ if ( feature_data->control.Table_Entries_In_Use ) {
27698
+ LOG_EVERYTHING("There are %d active relocations.\n", feature_data->control.Table_Entries_In_Use );
27707
+ * Function: check_os2_volumes
27709
+ * This function performs a consistency check on all existing OS/2
27710
+ * Logical Volumes. The list of constituent partitions ( links )
27711
+ * is checked and ordered according to the Link Table. If any link
27712
+ * is missing or inconsistent, the entire volume will be deleted.
27714
+static int check_os2_volumes( evms_logical_node_t ** node_list )
27716
+ os2_volume_runtime_entry_t * cur_volume;
27717
+ os2_volume_runtime_entry_t * previous_volume;
27718
+ evms_logical_node_t * cur_node;
27719
+ evms_logical_node_t * previous_node = NULL;
27720
+ os2_drivelink_runtime_entry_t * link_list, * link_hold;
27721
+ LVM_Link_Table_First_Sector * psector1;
27723
+ u_int32_t numlinks, countlinks, linkser;
27724
+ u_int32_t Master_Sequence_Number; /* Used to check whether or not all of the copies of Drive Linking data match. */
27725
+ evms_sector_t partition_offset;
27728
+ LOG_ENTRY_EXIT("Checking OS/2 Logical Volumes\n" );
27730
+ cur_node = os2lvm_nodes;
27732
+ while ( cur_node ) {
27733
+ cur_volume = ( os2_volume_runtime_entry_t * )cur_node->instance_data;
27734
+ link_list = NULL;
27735
+ if ( !cur_volume->complete ) { /* need to verify this one */
27736
+ cur_volume->complete = 1;
27737
+ LOG_EVERYTHING("Checking volume %s\n", cur_node->name );
27739
+ // Reset fields for sort operation
27740
+ cur_volume->size_in_sectors = 0;
27741
+ numlinks = cur_volume->drive_link_count;
27742
+ cur_volume->drive_link_count = 0;
27743
+ cur_node->total_vsectors = 0;
27744
+ link_list = cur_volume->drive_link;
27745
+ cur_volume->drive_link = NULL;
27747
+ // Access the link data to order the drive links
27748
+ psector1 = ( LVM_Link_Table_First_Sector * )link_list->link_data;
27749
+ Master_Sequence_Number = psector1->Sequence_Number;
27751
+ if ( numlinks != psector1->Links_In_Use ) {
27752
+ LOG_SERIOUS("Link Count mismatch vol=%i, table=%i\n", numlinks, psector1->Links_In_Use );
27753
+ cur_volume->complete = 0;
27757
+ if ( numlinks > LINKS_IN_FIRST_SECTOR ) {
27758
+ countlinks = LINKS_IN_FIRST_SECTOR;
27759
+ numlinks -= LINKS_IN_FIRST_SECTOR;
27762
+ countlinks = numlinks;
27768
+ partition_offset = 0;
27769
+ for ( i = 0; (i < countlinks) && (cur_volume->complete == 1); i++ ) {
27770
+ linkser = psector1->Link_Table[i].Partition_Serial_Number;
27771
+ if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
27772
+ // Add this partition to its parent Volume
27773
+ add_os2link( link_hold, cur_node );
27774
+ LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
27775
+ partition_offset, link_hold->sector_count );
27776
+ link_hold->start_sector = partition_offset;
27777
+ partition_offset += link_hold->sector_count;
27780
+ LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
27781
+ cur_volume->complete = 0;
27786
+ sect_ptr = ( char * )psector1;
27788
+ while ( numlinks && (cur_volume->complete == 1) ) {
27789
+ if ( numlinks > LINKS_IN_NEXT_SECTOR ) {
27790
+ countlinks = LINKS_IN_NEXT_SECTOR;
27791
+ numlinks -= LINKS_IN_NEXT_SECTOR;
27794
+ countlinks = numlinks;
27797
+ sect_ptr += OS2_BYTES_PER_SECTOR;
27798
+ if ( Master_Sequence_Number != (( LVM_Link_Table_Sector * )sect_ptr )->Sequence_Number ) {
27799
+ cur_volume->complete = 0;
27800
+ LOG_SERIOUS("Bad Sequence Number for Drive Linking Metadata!\n");
27803
+ for ( i = 0; i < countlinks; i++ ) {
27804
+ linkser = (( LVM_Link_Table_Sector * )sect_ptr )->Link_Table[i].Partition_Serial_Number;
27805
+ if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
27806
+ // Add this partition to its parent Volume
27807
+ add_os2link( link_hold, cur_node );
27808
+ LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
27809
+ partition_offset, link_hold->sector_count );
27810
+ link_hold->start_sector = partition_offset;
27811
+ partition_offset += link_hold->sector_count;
27814
+ LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
27815
+ cur_volume->complete = 0;
27823
+ /* If the volume is complete we can export it for use. */
27824
+ if ( cur_volume->complete && (link_list == NULL) ) {
27826
+ // Link new volume into the node list
27827
+ if ( cur_volume->Export_Needed &&
27828
+ ( !evms_cs_add_logical_node_to_list( node_list, cur_node ) )
27831
+ cur_volume->Export_Needed = 0;
27834
+ previous_node = cur_node;
27835
+ cur_node = cur_volume->next_os2lvm_node;
27838
+ /* Remove the volume from os2lvm_nodes list and delete it. */
27839
+ if ( previous_node != NULL ) {
27841
+ previous_volume = ( os2_volume_runtime_entry_t * )previous_node->instance_data;
27842
+ previous_volume->next_os2lvm_node = cur_volume->next_os2lvm_node;
27843
+ cur_volume->next_os2lvm_node = NULL;
27845
+ delete_os2lvm_volume(cur_node);
27847
+ cur_node = previous_volume->next_os2lvm_node;
27850
+ previous_node = cur_volume->next_os2lvm_node;
27851
+ delete_os2lvm_volume(cur_node);
27852
+ cur_node = previous_node;
27853
+ previous_node = NULL;
27854
+ os2lvm_nodes = cur_node;
27857
+ /* If any items remain in link_list, delete those as well. */
27858
+ while (link_list) {
27859
+ link_hold = link_list->next;
27860
+ delete_os2_drive_link(link_list,1);
27861
+ link_list = link_hold;
27868
+ LOG_ENTRY_EXIT("Finished Checking OS/2 Logical Volumes\n" );
27875
+/* BBR_Transfer_IO
27877
+ * Transfer the responsibility for completing the specified IO from
27878
+ * the thread that requested it to the BBR Worker Thread
27880
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record)
27882
+ unsigned long flags;
27883
+ int Wake_Worker_Thread = 0; /* Assume that the worker is already awake. */
27885
+ spin_lock_irqsave(&BBR_Queue_Lock, flags);
27887
+ /* The BBR IO List is a singly linked list. BBR_IO_List_Head points
27888
+ to the first item in the list, and BBR_IO_List_Tail points to the
27889
+ last item in the list. */
27890
+ Transfer_Record->Next = NULL;
27891
+ if ( !BBR_IO_List_Tail ) { /* Empty list */
27892
+ BBR_IO_List_Head = Transfer_Record;
27893
+ Wake_Worker_Thread = 1; /* Wake up the worker thread. */
27895
+ else /* Items already in the list. */
27896
+ BBR_IO_List_Tail->Next = Transfer_Record;
27898
+ BBR_IO_List_Tail = Transfer_Record;
27900
+ spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
27901
+ if ( Wake_Worker_Thread )
27902
+ evms_cs_wakeup_thread(BBR_Worker_Thread);
27908
+/* OS2_DL_Callback
27910
+ * This is the callback function used when an I/O request has to be broken
27911
+ * into two parts because it crosses a drive link boundary.
27914
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate)
27917
+ DL_IO_Tracking_Record_t * Tracking_Record;
27918
+ struct buffer_head * Original;
27920
+ Tracking_Record = bh->b_private;
27922
+ /* Is this a read or a write? */
27923
+ if ( Tracking_Record->Link1_Transfer_Record ||
27924
+ Tracking_Record->Link2_Transfer_Record ) {
27925
+ /* We have a write here. Was it successful? */
27926
+ if ( ! uptodate) {
27927
+ /* Have we tried BBR yet? */
27928
+ if ( ( bh == Tracking_Record->Link1.bh ) &&
27929
+ ( ! Tracking_Record->Link1_BBR_Attempted ) ){
27930
+ /* Attempt BBR. */
27931
+ BBR_Transfer_IO(Tracking_Record->Link1_Transfer_Record);
27932
+ Tracking_Record->Link1_BBR_Attempted = 1;
27935
+ else if ( ( bh == Tracking_Record->Link2.bh ) &&
27936
+ ( ! Tracking_Record->Link2_BBR_Attempted ) ) {
27937
+ /* Attempt BBR. */
27938
+ BBR_Transfer_IO(Tracking_Record->Link2_Transfer_Record);
27939
+ Tracking_Record->Link2_BBR_Attempted = 1;
27947
+ Tracking_Record->IO_In_Progress -= 1;
27948
+ if ( Tracking_Record->IO_In_Progress) {
27949
+ Tracking_Record->Up_To_Date = uptodate;
27951
+ Original = Tracking_Record->Original.bh;
27953
+ if ( ! Tracking_Record->IO_In_Progress ) {
27954
+ uptodate &= Tracking_Record->Up_To_Date;
27955
+ /* If this is a write, then Transfer Records will have been set up for both Link1 and Link2.
27956
+ If the transfer records were used because of BBR, then the BBR worker thread will have
27957
+ disposed of the transfer records. If the transfer records were not used, then we must
27958
+ dispose of them here to prevent memory leaks. */
27959
+ if ( Tracking_Record->Link1_Transfer_Record &&
27960
+ ( ! Tracking_Record->Link1_BBR_Attempted) ) {
27961
+ evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link1_Transfer_Record);
27963
+ if ( Tracking_Record->Link2_Transfer_Record &&
27964
+ ( ! Tracking_Record->Link2_BBR_Attempted) ) {
27965
+ evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link2_Transfer_Record);
27967
+ evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link1.bh);
27968
+ evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link2.bh);
27969
+ evms_cs_deallocate_to_pool(DL_Tracking_Pool,Tracking_Record);
27970
+ Original->b_end_io(Original,uptodate);
27976
+/* OS2_BBR_Write_Callback
27978
+ * This is the callback for normal write requests. Check for an error
27979
+ * during the I/O, and send to the worker thread for processing if necessary.
27981
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
27982
+ struct buffer_head * bh,
27986
+ if ( ! uptodate ) {
27987
+ BBR_Transfer_IO(Transfer_Record);
27991
+ evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Transfer_Record);
28000
+/* Worker thread to handle:
28002
+ I/O to drive/partitions/objects where bad blocks are known to exist
28003
+ I/O to drive/partition/object where a new bad block has been discovered and the I/O must be redriven.
28006
+static void BBR_Worker( void * Not_Used)
28008
+ unsigned long flags;
28009
+ BBR_IO_Transfer_Record_t * Current_IO;
28013
+ // Process bbr_io_list, one entry at a time.
28014
+ spin_lock_irqsave(&BBR_Queue_Lock, flags);
28016
+ /* Is there any work for us? */
28017
+ if ( ! BBR_IO_List_Head ) {
28018
+ spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
28019
+ break; /* List empty - nothing to do. */
28022
+ /* Get the IO to perform. */
28023
+ Current_IO = BBR_IO_List_Head;
28024
+ BBR_IO_List_Head = Current_IO->Next;
28025
+ if (! BBR_IO_List_Head )
28026
+ BBR_IO_List_Tail = BBR_IO_List_Head;
28028
+ spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
28030
+ /* Now lets process the I/O request. */
28031
+ complete = do_os2_bbr_io(Current_IO->Partition_Data,Current_IO->Write_Flag, Current_IO->eio.rsector, Current_IO->eio.rsize, Current_IO->eio.bh->b_data);
28033
+ /* We need to do the callback. */
28034
+ Current_IO->eio.bh->b_end_io(Current_IO->eio.bh, (complete == 0) );
28036
+ /* Now cleanup */
28037
+ evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Current_IO);
28040
+ return; /* Go to sleep. */
28046
+ * Sector_Is_Remapped
28048
+ * This function returns 1 if the specified sector has been remapped, 0 if it has not
28050
+ * If the sector has been remapped, then the new sector is returned in Replacement_Sector
28053
+static int Sector_Is_Remapped(os2_drivelink_runtime_entry_t * io_dlentry, evms_sector_t Source_Sector, evms_sector_t * Replacement_Sector)
28055
+ LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )io_dlentry->bbr_data;
28056
+ unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
28057
+ unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
28058
+ unsigned int BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
28059
+ BBR_Table_Entry * BBR_Table_Entry;
28060
+ unsigned int Guard1;
28062
+ /* Default value is no remap. */
28063
+ *Replacement_Sector = Source_Sector;
28066
+ Guard1 = io_dlentry->Guard1; /* Lamport's Theorem */
28068
+ for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
28069
+ Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
28070
+ BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
28071
+ if ( BBR_Table_Entry->BadSector == Source_Sector ){
28072
+ *Replacement_Sector = BBR_Table_Entry->ReplacementSector;
28077
+ } while ( Guard1 != io_dlentry->Guard2 ); /* Lamport's Theorem */
28079
+ if ( *Replacement_Sector != Source_Sector )
28087
+ * Invalidate_Mapping
28089
+ * This function either frees a replacement sector to be reused, or it
28090
+ * marks the replacement sector as bad.
28093
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t * dlentry,
28094
+ evms_sector_t Source_Sector,
28095
+ int Replacement_Sector_Is_Bad)
28097
+ LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )dlentry->bbr_data;
28098
+ unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
28099
+ unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
28100
+ unsigned int BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
28101
+ BBR_Table_Entry * BBR_Table_Entry = NULL;
28103
+ /* Lock for the BBR Table. */
28104
+ down( &(dlentry->BBR_Table_Lock) );
28106
+ /* Find the entry to invalidate. */
28107
+ for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
28108
+ Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
28109
+ BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
28110
+ if ( BBR_Table_Entry->BadSector == Source_Sector ){
28115
+ /* Now that we have found the entry, we must invalidate it. */
28116
+ if ( Replacement_Sector_Is_Bad ) {
28117
+ BBR_Table_Entry->BadSector = (u_int32_t) -1;
28119
+ /* OS/2 supported a method for clearing out bad block remappings if the filesystem on the volume supported
28120
+ the tracking of bad blocks. We don't support that under Linux, so there is no else case here. */
28122
+ /* Unlock the BBR Table */
28123
+ up( &(dlentry->BBR_Table_Lock) );
28129
+ * Create_New_BBR_Table_Entry
28131
+ * Finds bad blocks within the range specified, allocates replacement sectors,
28132
+ * writes the data to the replacement sectors, and updates the BBR metadata on
28133
+ * disk to reflect the new mapping. Returns 1 if successful, 0 otherwise.
28136
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t * dlentry,
28137
+ evms_sector_t starting_lsn,
28138
+ unsigned int count,
28141
+ evms_sector_t lsn;
28142
+ BBR_Table_Entry *Table_Entry;
28143
+ unsigned int Sector_Index;
28144
+ unsigned int Table_Index;
28147
+ u_int32_t New_Sequence_Number;
28148
+ LVM_BBR_Feature *BBR_Data = (LVM_BBR_Feature*) dlentry->bbr_data;
28150
+ for ( lsn = starting_lsn; lsn < (starting_lsn + count); lsn++) {
28151
+ rc = INIT_IO(dlentry->link_partition, 1, lsn, 1, buffer);
28154
+ /* Lock for the BBR Table. */
28155
+ down( &(dlentry->BBR_Table_Lock) );
28157
+ /* Increment the second guard value. This will cause those reading the BBR Table to spin.*/
28158
+ dlentry->Guard2++;
28160
+ /* Ensure that the bbr active flag is set. */
28161
+ dlentry->bbr_is_active = 1;
28163
+ /* Allocate a replacement sector */
28164
+ if ( BBR_Data->control.Table_Entries_In_Use < BBR_Data->control.Table_Size ) {
28165
+ Sector_Index = BBR_Data->control.Table_Entries_In_Use / BBR_TABLE_ENTRIES_PER_SECTOR;
28166
+ Table_Index = BBR_Data->control.Table_Entries_In_Use % BBR_TABLE_ENTRIES_PER_SECTOR;
28167
+ BBR_Data->control.Table_Entries_In_Use = BBR_Data->control.Table_Entries_In_Use + 1;
28168
+ Table_Entry = (BBR_Table_Entry *) &(BBR_Data->remap[Sector_Index].BBR_Table[Table_Index]);
28169
+ Table_Entry->BadSector = lsn;
28172
+ /* There are no more replacement sectors available! Time to bail ... */
28173
+ up( &(dlentry->BBR_Table_Lock) );
28177
+ /* Now that we have a replacement sector, increment the first guard value. This will free any
28178
+ threads reading the BBR Table. */
28179
+ dlentry->Guard1++;
28181
+ /* Release the lock now that we have a replacement sector. */
28182
+ up( &(dlentry->BBR_Table_Lock) );
28184
+ /* Test the replacement sector. */
28185
+ rc = INIT_IO(dlentry->link_partition, 1, Table_Entry->ReplacementSector, 1, buffer);
28187
+ /* The replacement sector was bad. Lets mark it bad in the table and try again. */
28188
+ Table_Entry->BadSector = (u_int32_t) -1;
28191
+ } /* End of processing for the current sector. */
28193
+ } /* end of loop to test each sector in the I/O and remap any bad ones found. */
28195
+ /* Need to write the modified BBR Table back to disk. This includes updating the sequence numbers and CRCs. */
28197
+ /* Lock for the BBR Table. */
28198
+ down( &(dlentry->BBR_Table_Lock) );
28200
+ /* Increment the sequence numbers. */
28201
+ New_Sequence_Number = BBR_Data->control.Sequence_Number + 1;
28202
+ BBR_Data->control.Sequence_Number = New_Sequence_Number;
28203
+ for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
28204
+ BBR_Data->remap[Sector_Index].Sequence_Number = New_Sequence_Number;
28207
+ /* Calculate the new CRC values. */
28208
+ BBR_Data->control.CRC = 0;
28209
+ BBR_Data->control.CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->control),OS2_BYTES_PER_SECTOR);
28210
+ for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
28211
+ BBR_Data->remap[Sector_Index].CRC = 0;
28212
+ BBR_Data->remap[Sector_Index].CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->remap[Sector_Index]),OS2_BYTES_PER_SECTOR);
28215
+ /* Now we must write the table back to the partition from whence it came. */
28217
+ /* Write the first copy. */
28218
+ rc = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy1,dlentry->BBR_Feature_Size,BBR_Data);
28220
+ /* Write the second copy. */
28221
+ rc2 = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy2,dlentry->BBR_Feature_Size,BBR_Data);
28223
+ /* If both copies failed to reach the disk, then fail the I/O. */
28224
+ if ( rc && rc2 ) {
28230
+ /* Unlock the BBR Table */
28231
+ up( &(dlentry->BBR_Table_Lock) );
28233
+ /* Indicate success. */
28239
+ * Clone_Bufferhead
28241
+ * Prepares a usable copy of an existing bufferhead.
28244
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child)
28246
+ Child->b_next = NULL;
28247
+ Child->b_blocknr = Source->b_blocknr;
28248
+ Child->b_size = Source->b_size;
28249
+ Child->b_list = 0;
28250
+ Child->b_dev = Source->b_dev;
28251
+ Child->b_count = Source->b_count;
28252
+ Child->b_rdev = Source->b_rdev;
28253
+ Child->b_state = Source->b_state;
28254
+ Child->b_flushtime = 0;
28255
+ Child->b_next_free = NULL;
28256
+ Child->b_prev_free = NULL;
28257
+ Child->b_this_page = NULL;
28258
+ Child->b_reqnext = NULL;
28259
+ Child->b_pprev = NULL;
28260
+ Child->b_data = Source->b_data;
28261
+ Child->b_page = Source->b_page;
28262
+ Child->b_end_io = Source->b_end_io;
28263
+ Child->b_private = Source->b_private;
28264
+ Child->b_rsector = Source->b_rsector;
28265
+ Child->b_inode = NULL;
28266
+ Child->b_inode_buffers.next = NULL;
28267
+ Child->b_inode_buffers.prev = NULL;
28270
diff -Naur linux-2002-03-28/drivers/evms/s390_part.c evms-2002-03-28/drivers/evms/s390_part.c
28271
--- linux-2002-03-28/drivers/evms/s390_part.c Wed Dec 31 18:00:00 1969
28272
+++ evms-2002-03-28/drivers/evms/s390_part.c Tue Mar 26 14:28:49 2002
28274
+/* -*- linux-c -*- */
28278
+ * Copyright (c) International Business Machines Corp., 2000
28280
+ * This program is free software; you can redistribute it and/or modify
28281
+ * it under the terms of the GNU General Public License as published by
28282
+ * the Free Software Foundation; either version 2 of the License, or
28283
+ * (at your option) any later version.
28285
+ * This program is distributed in the hope that it will be useful,
28286
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28287
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
28288
+ * the GNU General Public License for more details.
28290
+ * You should have received a copy of the GNU General Public License
28291
+ * along with this program; if not, write to the Free Software
28292
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28297
+ * linux/drivers/evms/s390_part.c
28299
+ * EVMS S/390 partition manager
28301
+ * Partial code extracted from
28303
+ * linux/fs/partitions/ibm.c
28307
+#include <linux/config.h>
28308
+#include <linux/module.h>
28309
+#include <linux/kernel.h>
28310
+#include <linux/config.h>
28311
+#include <linux/string.h>
28312
+#include <linux/blk.h>
28313
+#include <asm/ebcdic.h>
28314
+#include <asm/uaccess.h>
28315
+#include <asm/dasd.h>
28316
+#include <asm/vtoc.h>
28317
+#include <linux/evms/evms_kernel.h>
28319
+/* prefix used in logging messages */
28320
+#define LOG_PREFIX "s390_part: "
28322
+/* Private instance data structure for node we produced */
28323
+typedef struct local_instance_data_s {
28324
+ evms_logical_node_t * source_disk;
28325
+ evms_sector_t start_sect; /* starting LBA */
28326
+ evms_sector_t nr_sects; /* number of sectors */
28327
+ unsigned char type; /* partition type or filesystem format indicator, can be set to 0 */
28328
+} local_instance_data_t;
28330
+static int exported_nodes; /* total # of exported segments
28331
+ * produced during this discovery.
28335
+static int s390_partition_discover(evms_logical_node_t **);
28336
+static int s390_partition_delete(evms_logical_node_t *);
28337
+static void s390_partition_read(evms_logical_node_t *,
28339
+static void s390_partition_write(evms_logical_node_t *,
28341
+static int s390_partition_ioctl(evms_logical_node_t *,
28346
+static int s390_partition_init_io(evms_logical_node_t *,
28352
+static evms_plugin_function_table_t function_table = {
28353
+ discover: &s390_partition_discover,
28354
+ delete : &s390_partition_delete,
28355
+ read : &s390_partition_read,
28356
+ write : &s390_partition_write,
28357
+ init_io : &s390_partition_init_io,
28358
+ ioctl : &s390_partition_ioctl
28361
+#define EVMS_S390_PARTITION_MANAGER_ID 2
28363
+static evms_plugin_header_t plugin_header = {
28364
+ id : SetPluginID(
28366
+ EVMS_SEGMENT_MANAGER,
28367
+ EVMS_S390_PARTITION_MANAGER_ID),
28373
+ required_common_services_version : {
28378
+ function_table : &function_table
28381
+/***************************************************/
28382
+/* List Support - Typedefs, Variables, & Functions */
28383
+/***************************************************/
28387
+typedef struct local_segment_list_node_s {
28388
+ evms_logical_node_t *segment;
28389
+ struct local_segment_list_node_s *next;
28390
+} local_segment_list_node_t;
28392
+typedef struct local_disk_list_node_s {
28393
+ evms_logical_node_t *disk;
28394
+ local_segment_list_node_t *segment_list;
28395
+ struct local_disk_list_node_s *next;
28396
+} local_disk_list_node_t;
28400
+static local_disk_list_node_t *my_disk_list;
28404
+static local_disk_list_node_t **
28406
+ evms_logical_node_t *disk)
28408
+ local_disk_list_node_t **ldln;
28410
+ ldln = &my_disk_list;
28412
+ if ((*ldln)->disk == disk)
28414
+ ldln = &(*ldln)->next;
28419
+static local_segment_list_node_t **
28421
+ local_disk_list_node_t *disk,
28422
+ evms_logical_node_t *segment)
28424
+ local_segment_list_node_t **lsln;
28426
+ lsln = &disk->segment_list;
28428
+ if ((*lsln)->segment == segment)
28430
+ lsln = &(*lsln)->next;
28435
+static evms_logical_node_t *
28436
+find_segment_on_disk(
28437
+ evms_logical_node_t *disk,
28438
+ u_int64_t start_sect,
28439
+ u_int64_t nr_sects)
28441
+ evms_logical_node_t *rc = NULL;
28442
+ local_disk_list_node_t **ldln;
28443
+ local_segment_list_node_t **lsln;
28444
+ local_instance_data_t *lid;
28446
+ ldln = lookup_disk(disk);
28448
+ /* disk found in list */
28449
+ /* attempt to find segment */
28451
+ lsln = &(*ldln)->segment_list;
28453
+ lid = (*lsln)->segment->instance_data;
28454
+ if (lid->start_sect == start_sect)
28455
+ if (lid->nr_sects == nr_sects)
28457
+ lsln = &(*lsln)->next;
28460
+ rc = (*lsln)->segment;
28465
+/* function description: add_segment_to_disk
28467
+ * this function attempts to add a segment to the segment
28468
+ * list of a disk. if the specified disk is not found, it
28469
+ * will be added to the global disk list. this function will
28470
+ * return a pointer to the matching segment in the disk's
28471
+ * segment list. the caller must compare the returned pointer
28472
+ * to the specified segment to see if the
28473
+ * specified segment was already present in the disk's segment
28474
+ * list. if the return pointer matches the specified segment,
28475
+ * then the specified segment was added to the list. if the
28476
+ * return segment pointer to does not match the specified
28477
+ * segment pointer, then the specified segment pointer was
28478
+ * a duplicate and can be thrown away.
28481
+add_segment_to_disk(
28482
+ evms_logical_node_t *disk,
28483
+ evms_logical_node_t *segment)
28486
+ local_disk_list_node_t **ldln, *new_disk;
28487
+ local_segment_list_node_t **lsln, *new_segment;
28489
+ ldln = lookup_disk(disk);
28490
+ if (*ldln == NULL) {
28491
+ /* disk not in list, add disk */
28492
+ rc = evms_cs_allocate_memory((void **)&new_disk,
28493
+ sizeof(*new_disk));
28495
+ new_disk->disk = disk;
28496
+ *ldln = new_disk;
28500
+ /* attempt to add segment */
28501
+ lsln = lookup_segment(*ldln, segment);
28502
+ if (*lsln == NULL) {
28503
+ /* segment not in list, add segment */
28504
+ rc = evms_cs_allocate_memory((void **)&new_segment,
28505
+ sizeof(*new_segment));
28507
+ new_segment->segment = segment;
28508
+ *lsln = new_segment;
28517
+remove_segment_from_disk(
28518
+ evms_logical_node_t *disk,
28519
+ evms_logical_node_t *segment,
28520
+ evms_logical_node_t **empty_disk)
28523
+ local_disk_list_node_t **ldln, *tmp_disk_node;
28524
+ local_segment_list_node_t **lsln, *tmp_segment_node;
28526
+ *empty_disk = NULL;
28527
+ ldln = lookup_disk(disk);
28528
+ if (*ldln == NULL) {
28531
+ /* disk found in list */
28532
+ /* attempt to add segment */
28533
+ lsln = lookup_segment(*ldln, segment);
28534
+ if (*lsln == NULL) {
28537
+ tmp_segment_node = *lsln;
28538
+ /* remove segment from list */
28539
+ *lsln = (*lsln)->next;
28540
+ /* free the segment list node */
28541
+ evms_cs_deallocate_memory(tmp_segment_node);
28543
+ if ((*ldln)->segment_list == NULL) {
28544
+ tmp_disk_node = *ldln;
28545
+ *empty_disk = tmp_disk_node->disk;
28546
+ /* remove disk from list */
28547
+ *ldln = (*ldln)->next;
28548
+ /* free the disk list node */
28549
+ evms_cs_deallocate_memory(tmp_disk_node);
28557
+ * Function: add_segment
28560
+s390_process_segment(
28561
+ evms_logical_node_t **discover_list,
28562
+ evms_logical_node_t *node,
28563
+ u_int64_t start_sect,
28564
+ u_int64_t nr_sects,
28565
+ unsigned char type,
28568
+ local_instance_data_t *InstData = NULL;
28569
+ evms_logical_node_t *segment;
28572
+ segment = find_segment_on_disk(node, start_sect, nr_sects);
28574
+ LOG_DETAILS("exporting segment '%s'.\n",
28577
+ rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
28579
+ InstData->source_disk = node;
28580
+ InstData->start_sect = start_sect;
28581
+ InstData->nr_sects = nr_sects;
28582
+ InstData->type = type;
28583
+ rc = evms_cs_allocate_logical_node(&segment);
28586
+ segment->plugin = &plugin_header;
28587
+ segment->system_id = (unsigned int)type;
28588
+ segment->total_vsectors = nr_sects;
28589
+ segment->block_size = node->block_size;
28590
+ segment->hardsector_size = node->hardsector_size;
28591
+ segment->instance_data = InstData;
28592
+ segment->flags = node->flags;
28593
+ strcpy(segment->name, node->name);
28594
+ sprintf(segment->name + strlen(segment->name), "%d", part_num);
28595
+ LOG_DETAILS("creating segment '%s'.\n",
28597
+ rc = add_segment_to_disk(node, segment);
28599
+ LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
28600
+ __FUNCTION__, rc, segment->name);
28603
+ MOD_INC_USE_COUNT;
28608
+ evms_cs_deallocate_memory(InstData);
28610
+ evms_cs_deallocate_logical_node(segment);
28614
+ evms_cs_add_logical_node_to_list(discover_list, segment);
28615
+ exported_nodes++;
28621
+ ibm_partition_lnx1 = 0,
28622
+ ibm_partition_vol1 = 1,
28623
+ ibm_partition_cms1 = 2,
28624
+ ibm_partition_none = 3
28625
+} ibm_partition_t;
28627
+static char* part_names[] = {
28628
+ [ibm_partition_lnx1] = "LNX1",
28629
+ [ibm_partition_vol1] = "VOL1",
28630
+ [ibm_partition_cms1] = "CMS1",
28631
+ [ibm_partition_none] = "(nonl)"
28634
+static ibm_partition_t
28635
+get_partition_type ( char * type )
28638
+ for ( i = 0; i < 3; i ++) {
28639
+ if ( ! strncmp (type,part_names[i],4) )
28646
+ * compute the block number from a
28647
+ * cyl-cyl-head-head structure
28650
+cchh2blk (cchh_t *ptr, struct hd_geometry *geo) {
28651
+ return ptr->cc * geo->heads * geo->sectors +
28652
+ ptr->hh * geo->sectors;
28657
+ * compute the block number from a
28658
+ * cyl-cyl-head-head-block structure
28661
+cchhb2blk (cchhb_t *ptr, struct hd_geometry *geo) {
28662
+ return ptr->cc * geo->heads * geo->sectors +
28663
+ ptr->hh * geo->sectors +
28667
+void print_mem( void *buffer, int length )
28670
+ unsigned char *bufptr;
28672
+ bufptr = (unsigned char *)buffer;
28675
+ if ( (i % 16) == 0 )
28676
+ printk(KERN_INFO "\n0x%p->", buffer + i);
28677
+ printk(KERN_INFO "%02x ", bufptr[i]);
28678
+ if ( ++i >= length )
28681
+ printk(KERN_INFO "\n");
28685
+s390_probe_for_segments(
28686
+ evms_logical_node_t **discover_list,
28687
+ evms_logical_node_t *disk)
28689
+ char type[5] = {0,}, name[7] = {0,};
28690
+ int rc, vsects_per_hardsect = 0;
28691
+ unsigned int blk;
28693
+ dasd_information_t *info = NULL;
28694
+ struct hd_geometry *geo = NULL;
28695
+ unchar *data = NULL;
28697
+ /* allocate space for DASD ioctl packet
28699
+ rc = evms_cs_allocate_memory((void **)&info, sizeof(dasd_information_t));
28701
+ LOG_DEBUG("probing '%s' for 390 DASD info...\n",
28703
+ /* issue DASD info ioctl
28705
+ rc = evms_cs_kernel_ioctl(disk, BIODASDINFO, (unsigned long)info);
28707
+ LOG_DEBUG("error(%d) from BIODASDINFO ioctl.\n", rc);
28708
+ LOG_DEBUG("assuming '%s' is not a valid 390 device!\n",
28713
+ /* if we successfully completed the previous
28714
+ * get DASD info ioctl, we will assume that
28715
+ * the device is a valid 390 disk.
28717
+ * remove it from the discover list.
28719
+ rc = evms_cs_remove_logical_node_from_list(
28720
+ discover_list, disk);
28722
+ LOG_ERROR("error(%d) removing disk(%s) from discover list.\n",
28727
+ /* allocate space for the geometry packet
28729
+ rc = evms_cs_allocate_memory((void **)&geo, sizeof(struct hd_geometry));
28731
+ /* issue the Get GEO ioctl
28733
+ rc = evms_cs_kernel_ioctl(disk, HDIO_GETGEO, (unsigned long)geo);
28735
+ LOG_ERROR("error(%d) from HDIO_GETGEO ioctl.\n", rc);
28739
+ /* retrieve the vsects_per_hardsect (hardsector size)
28741
+ vsects_per_hardsect = disk->hardsector_size;
28742
+ vsects_per_hardsect >>= EVMS_VSECTOR_SIZE_SHIFT;
28743
+ rc = evms_cs_allocate_memory((void **)&data, EVMS_VSECTOR_SIZE);
28746
+ /* go read the 1st block on the disk
28748
+ io_start = info->label_block * vsects_per_hardsect;
28749
+ rc = INIT_IO(disk, READ, io_start, 1, data);
28751
+ LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28752
+ rc, io_start, disk->name);
28754
+// print_mem(data, EVMS_VSECTOR_SIZE);
28758
+ int offset, size, psize, counter = 0;
28759
+ format1_label_t f1;
28760
+ volume_label_t vlabel;
28761
+ ibm_partition_t partition_type;
28763
+ /* determine the format type
28766
+ strncpy (type, data, 4);
28767
+ if ((!info->FBA_layout) && (!strcmp(info->type,"ECKD"))) {
28768
+ strncpy ( name, data + 8, 6);
28770
+ strncpy ( name, data + 4, 6);
28772
+ memcpy (&vlabel, data, sizeof(volume_label_t));
28776
+ partition_type = get_partition_type(type);
28777
+ LOG_DETAILS("disk: raw type(%s), type(%s), name(%s)\n",
28778
+ type, part_names[partition_type], name);
28779
+ switch ( partition_type ) {
28780
+ case ibm_partition_cms1:
28781
+ if (*((long *)data + 13) != 0) {
28782
+ /* disk is reserved minidisk */
28783
+ long *label=(long*)data;
28784
+ vsects_per_hardsect = label[3] >> EVMS_VSECTOR_SIZE_SHIFT;
28785
+ offset = label[13];
28786
+ size = (label[7] - 1) * vsects_per_hardsect;
28787
+ LOG_DEBUG("(MDSK)");
28789
+ offset = info->label_block + 1;
28790
+ size = disk->total_vsectors;
28792
+ offset *= vsects_per_hardsect;
28793
+ /* adjust for 0 thru label block offset
28796
+ rc = s390_process_segment(discover_list,
28803
+ case ibm_partition_lnx1:
28804
+ case ibm_partition_none:
28805
+ offset = info->label_block + 1;
28806
+ offset *= vsects_per_hardsect;
28807
+ size = disk->total_vsectors;
28808
+ /* adjust for 0 thru label block offset
28811
+ rc = s390_process_segment(discover_list,
28818
+ case ibm_partition_vol1:
28819
+ /* get block number and read then first format1 label */
28820
+ blk = cchhb2blk(&vlabel.vtoc, geo) + 1;
28821
+ io_start = blk * vsects_per_hardsect;
28822
+ rc = INIT_IO(disk, READ, io_start, 1, data);
28824
+ LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28825
+ rc, io_start, disk->name);
28828
+// print_mem(data, EVMS_VSECTOR_SIZE);
28830
+ memcpy (&f1, data, sizeof(format1_label_t));
28832
+ while (f1.DS1FMTID == _ascebc['1']) {
28833
+ offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
28834
+ psize = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
28835
+ offset + geo->sectors;
28838
+ rc = s390_process_segment(discover_list,
28840
+ offset * vsects_per_hardsect,
28841
+ psize * vsects_per_hardsect,
28846
+ io_start = blk * vsects_per_hardsect;
28847
+ rc = INIT_IO(disk, READ, io_start, 1, data);
28849
+ LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28850
+ rc, io_start, disk->name);
28853
+// print_mem(data, EVMS_VSECTOR_SIZE);
28855
+ memcpy (&f1, data, sizeof(format1_label_t));
28859
+ rc = s390_process_segment(discover_list,
28860
+ disk, 0, 0, 0, 1);
28865
+ evms_cs_deallocate_memory(info);
28868
+ evms_cs_deallocate_memory(geo);
28871
+ evms_cs_deallocate_memory(data);
28877
+ * Function: s390_partition_discover
28881
+s390_partition_discover(evms_logical_node_t **discover_list)
28884
+ evms_logical_node_t *node, *next_node;
28886
+ LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
28888
+ /* initialize global variable */
28889
+ exported_nodes = 0;
28891
+ /* examine each node on the discover list */
28892
+ next_node = *discover_list;
28893
+ while(next_node) {
28894
+ node = next_node;
28895
+ next_node = node->next;
28896
+ if (GetPluginType(node->plugin->id) != EVMS_DEVICE_MANAGER)
28897
+ /* only process disk nodes
28900
+ s390_probe_for_segments(discover_list, node);
28903
+ LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
28904
+ __FUNCTION__, exported_nodes, rc);
28905
+ if (exported_nodes)
28906
+ rc = exported_nodes;
28911
+ * Function: s390_partition_delete
28915
+s390_partition_delete(evms_logical_node_t *segment)
28918
+ local_instance_data_t *LID;
28919
+ evms_logical_node_t *empty_disk = NULL;
28921
+ LOG_DETAILS("deleting segment '%s'.\n",segment->name);
28926
+ LID = segment->instance_data;
28928
+ /* remove the segment from the
28929
+ * disk's segment list
28931
+ rc = remove_segment_from_disk(
28932
+ LID->source_disk,
28935
+ /* free the local instance data */
28936
+ evms_cs_deallocate_memory(LID);
28938
+ /* free the segment node */
28939
+ evms_cs_deallocate_logical_node(segment);
28940
+ MOD_DEC_USE_COUNT;
28941
+ /* if the last segment on the disk was
28942
+ * deleted, delete the disk node too
28945
+ DELETE(empty_disk);
28951
+ * function: s390_partition_io_error
28953
+ * this function was primarily created because the function
28954
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
28955
+ * to be set on inline functions. Since this was an error path
28956
+ * and not mainline, I decided to add a trace statement to help
28957
+ * report on the failing condition.
28961
+s390_partition_io_error(
28962
+ evms_logical_node_t *node,
28966
+ LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
28967
+ (io_flag) ? "WRITE" : "READ",
28968
+ node->total_vsectors - 1,
28972
+ EVMS_IO_ERROR(eio);
28976
+ * Function: s390_partition_read
28980
+s390_partition_read(
28981
+ evms_logical_node_t *partition,
28984
+ local_instance_data_t *LID = partition->instance_data;
28986
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
28987
+ eio->rsector += LID->start_sect;
28988
+ R_IO(LID->source_disk, eio);
28990
+ s390_partition_io_error(partition, READ, eio);
28994
+ * Function: s390_partition_write
28998
+s390_partition_write(
28999
+ evms_logical_node_t *partition,
29002
+ local_instance_data_t *LID = partition->instance_data;
29004
+ if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
29005
+ eio->rsector += LID->start_sect;
29006
+ W_IO(LID->source_disk, eio);
29008
+ s390_partition_io_error(partition, WRITE, eio);
29012
+ * Function: s390_partition_init_io
29016
+s390_partition_init_io(
29017
+ evms_logical_node_t *partition,
29018
+ int io_flag, /* 0=read, 1=write*/
29019
+ evms_sector_t sect_nr, /* disk LBA */
29020
+ evms_sector_t num_sects, /* # of sectors */
29021
+ void *buf_addr) /* buffer address */
29024
+ local_instance_data_t *LID = partition->instance_data;
29026
+ if ((sect_nr + num_sects) <= partition->total_vsectors) {
29027
+ rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
29029
+ LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
29030
+ (io_flag) ? "WRITE" : "READ",
29032
+ (LID->nr_sects - 1),
29033
+ sect_nr, num_sects);
29041
+ * Function: s390_partition_ioctl
29045
+s390_partition_ioctl (
29046
+ evms_logical_node_t *partition,
29047
+ struct inode *inode,
29048
+ struct file *file,
29049
+ unsigned int cmd,
29050
+ unsigned long arg)
29052
+ local_instance_data_t *LID;
29053
+ struct hd_geometry hd_geo;
29057
+ LID = partition->instance_data;
29061
+ case HDIO_GETGEO:
29063
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
29065
+ if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
29068
+ hd_geo.start = LID->start_sect;
29069
+ if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
29073
+ case EVMS_GET_BMAP:
29075
+ evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
29076
+ bmap->rsector += LID->start_sect;
29077
+ /* intentionally fall thru to
29078
+ * default ioctl down to device
29083
+ rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
29089
+ * Function: s390_part_init
29093
+s390_part_init(void)
29095
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
29098
+static void __exit
29099
+s390_part_exit(void)
29101
+ evms_cs_unregister_plugin(&plugin_header);
29104
+module_init(s390_part_init);
29105
+module_exit(s390_part_exit);
29106
+#ifdef MODULE_LICENSE
29107
+MODULE_LICENSE("GPL");
29110
diff -Naur linux-2002-03-28/drivers/evms/snapshot.c evms-2002-03-28/drivers/evms/snapshot.c
29111
--- linux-2002-03-28/drivers/evms/snapshot.c Wed Dec 31 18:00:00 1969
29112
+++ evms-2002-03-28/drivers/evms/snapshot.c Thu Mar 21 16:17:47 2002
29114
+/* -*- linux-c -*- */
29119
+ * Copyright (c) International Business Machines Corp., 2000
29121
+ * This program is free software; you can redistribute it and/or modify
29122
+ * it under the terms of the GNU General Public License as published by
29123
+ * the Free Software Foundation; either version 2 of the License, or
29124
+ * (at your option) any later version.
29126
+ * This program is distributed in the hope that it will be useful,
29127
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
29128
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
29129
+ * the GNU General Public License for more details.
29131
+ * You should have received a copy of the GNU General Public License
29132
+ * along with this program; if not, write to the Free Software
29133
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29138
+ * linux/drivers/evms/snapshot.c
29141
+ * EVMS SnapShot Feature.
29143
+ * This feature provides the ability to Snapshot ANY existing EVMS volume(including compatibility)
29144
+ * to a new EVMS volume that is created when the SnapShot is enabled.
29146
+ * This feature will appear in the call stack for both the original and the snapshot volume.
29149
+#include <linux/module.h>
29150
+#include <linux/kernel.h>
29151
+#include <linux/config.h>
29152
+#include <linux/genhd.h>
29153
+#include <linux/major.h>
29154
+#include <linux/string.h>
29155
+#include <linux/blk.h>
29156
+#include <linux/init.h>
29157
+#include <linux/slab.h>
29158
+#include <linux/vmalloc.h>
29159
+#include <linux/evms/evms_kernel.h>
29160
+#include <linux/evms/evms_snapshot.h>
29161
+#include <asm/system.h>
29162
+#include <asm/uaccess.h>
29164
+#define LOG_PREFIX "snapshot: "
29166
+static struct proc_dir_entry * snap_proc = NULL;
29168
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list );
29169
+static int delete_snapshot_volume( evms_logical_node_t * node );
29170
+static void read_snap( evms_logical_node_t * node,
29172
+static void write_snap( evms_logical_node_t * node,
29174
+static int init_io_snap( evms_logical_node_t * node,
29176
+ evms_sector_t sect_nr,
29177
+ evms_sector_t num_sects,
29178
+ void * buf_addr );
29179
+static int ioctl_snap( evms_logical_node_t * node,
29180
+ struct inode * inode,
29181
+ struct file * file,
29182
+ unsigned int cmd,
29183
+ unsigned long arg );
29184
+static int add_snapshot(evms_logical_node_t * node,
29185
+ snapshot_metadata_t * metadata,
29186
+ evms_logical_node_t ** evms_node_list );
29187
+static int snap_proc_read(char * page,
29195
+/********** Required Plugin Functions **********/
29198
+static evms_plugin_function_table_t function_table = {
29199
+ discover: &discover_snapshot_volumes,
29200
+ delete : &delete_snapshot_volume,
29201
+ read : &read_snap,
29202
+ write : &write_snap,
29203
+ init_io : &init_io_snap,
29204
+ ioctl : &ioctl_snap
29208
+static evms_plugin_header_t plugin_header = {
29209
+ id : SetPluginID(
29211
+ EVMS_ASSOCIATIVE_FEATURE, // Feature class
29212
+ EVMS_SNAPSHOT_FEATURE_ID ), // Unique ID within features
29218
+ required_common_services_version : {
29219
+ major : EVMS_COMMON_SERVICES_MAJOR,
29220
+ minor : EVMS_COMMON_SERVICES_MINOR,
29221
+ patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
29223
+ function_table : &function_table // function table for this plugin
29227
+ * Function: convert_metadata
29229
+ * Performs endian conversion on metadata sector.
29231
+static int convert_metadata( snapshot_metadata_t * metadata ){
29233
+ metadata->chunk_size = le32_to_cpu(metadata->chunk_size);
29234
+ metadata->flags = le32_to_cpu(metadata->flags);
29235
+ metadata->lba_of_COW_table = le64_to_cpu(metadata->lba_of_COW_table);
29236
+ metadata->lba_of_first_chunk = le64_to_cpu(metadata->lba_of_first_chunk);
29237
+ metadata->original_size = le64_to_cpu(metadata->original_size);
29238
+ metadata->signature = le32_to_cpu(metadata->signature);
29239
+ metadata->total_chunks = le32_to_cpu(metadata->total_chunks);
29240
+ metadata->version.major = le32_to_cpu(metadata->version.major);
29241
+ metadata->version.minor = le32_to_cpu(metadata->version.minor);
29242
+ metadata->version.patchlevel = le32_to_cpu(metadata->version.patchlevel);
29243
+ metadata->CRC = le32_to_cpu(metadata->CRC);
29249
+ * Function: insert_snapshot_hash_entry
29251
+ * This function inserts a new entry into a snapshot hash chain, immediately
29252
+ * following the specified entry. This function should not be used to add an
29253
+ * entry into an empty list, or as the first entry in an existing list. For
29254
+ * that case, use insert_snapshot_map_entry_at_head().
29256
+static int insert_snapshot_hash_entry( snapshot_hash_entry_t * entry,
29257
+ snapshot_hash_entry_t * base )
29259
+ entry->next = base->next;
29260
+ entry->prev = base;
29261
+ base->next = entry;
29262
+ if ( entry->next ) {
29263
+ entry->next->prev = entry;
29269
+ * Function: insert_snapshot_hash_entry_at_head
29271
+ * This function inserts a new entry into a snapshot chain as the first
29272
+ * entry in the chain.
29274
+static int insert_snapshot_hash_entry_at_head( snapshot_hash_entry_t * entry,
29275
+ snapshot_hash_entry_t ** head )
29277
+ entry->next = *head;
29278
+ entry->prev = NULL;
29280
+ if ( entry->next ) {
29281
+ entry->next->prev = entry;
29288
+ * Function: set_snapshot_flags
29290
+ * Set a bit in the flags field of the metadata to mark the snapshot node
29291
+ * as either disabled or full, and write the metadata sector to the
29292
+ * snapshot volume. The node passed in to this function should be the
29293
+ * "lower" of the snapshot nodes, meaning the one passed into the snapshot
29294
+ * plugin, not the one exported from the plugin. Currently, appropriate
29295
+ * values for "flag" are EVMS_SNAPSHOT_DISABLED and EVMS_SNAPSHOT_FULL.
29297
+static int set_snapshot_flags( evms_logical_node_t * snap_node,
29298
+ unsigned long flag )
29300
+ unsigned char data[EVMS_VSECTOR_SIZE] = {0};
29301
+ snapshot_metadata_t * metadata = (snapshot_metadata_t*)data;
29303
+ // Read the metadata sector
29304
+ if ( INIT_IO( snap_node, 0, snap_node->total_vsectors-3, 1, data ) ) {
29307
+ // Set the appropriate flag.
29308
+ // do endian conversion on the fly
29309
+ metadata->flags |= cpu_to_le32(flag);
29310
+ metadata->CRC = 0;
29311
+ metadata->CRC = evms_cs_calculate_crc(
29312
+ EVMS_INITIAL_CRC,
29313
+ metadata, sizeof(snapshot_metadata_t));
29314
+ // Write the metadata sector back to the volume
29315
+ if ( INIT_IO( snap_node, 1, snap_node->total_vsectors-3, 1, data ) ) {
29323
+ * Function: discover_snapshot_volumes
29325
+ * Inspect the global node list, looking for volumes with a valid
29326
+ * snapshot metadata sector.
29328
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list )
29330
+ evms_logical_node_t * node;
29331
+ evms_logical_node_t * next_node;
29332
+ snapshot_metadata_t * metadata = NULL;
29334
+ int org_crc, final_crc;
29336
+ if ( evms_cs_allocate_memory( (void**)&metadata, EVMS_VSECTOR_SIZE )) {
29340
+ for ( node = *evms_node_list; node && (rc == 0); node = next_node) {
29341
+ next_node = node->next;
29342
+ // if the id of this node is ours, skip to next node because this
29343
+ // must be one we put back on the list
29344
+ if (node->plugin->id == plugin_header.id) {
29347
+ if (node->feature_header && node->feature_header->feature_id == plugin_header.id) {
29348
+ // Read next to last sector for the snapshot metadata. Check for
29349
+ // a valid snapshot signature.
29350
+ if ( INIT_IO(node, 0, node->total_vsectors-3, 1, metadata) ) {
29351
+ LOG_ERROR("IO error on '%s' sector %Ld.\n",
29352
+ node->name, node->total_vsectors-3);
29353
+ rc = -EVMS_FEATURE_FATAL_ERROR;
29354
+ evms_cs_remove_logical_node_from_list(evms_node_list,node);
29358
+ if ( le32_to_cpu(metadata->signature) == EVMS_SNAPSHOT_SIGNATURE ) {
29359
+ org_crc = le32_to_cpu(metadata->CRC);
29360
+ metadata->CRC = 0;
29361
+ final_crc = evms_cs_calculate_crc(
29362
+ EVMS_INITIAL_CRC,
29363
+ metadata, sizeof(snapshot_metadata_t));
29364
+ if (final_crc != org_crc) {
29365
+ LOG_ERROR("CRC error in feature data on '%s'.\n", node->name);
29366
+ rc = -EVMS_FEATURE_FATAL_ERROR;
29367
+ evms_cs_remove_logical_node_from_list(evms_node_list,node);
29370
+ convert_metadata(metadata);
29371
+ if (metadata->version.major > plugin_header.version.major) {
29372
+ LOG_ERROR("ERROR: unsuppoprted version of feature in meta data on '%s'.\n",
29374
+ rc = -EVMS_FEATURE_FATAL_ERROR;
29375
+ evms_cs_remove_logical_node_from_list(evms_node_list,node);
29378
+ rc = add_snapshot(node, metadata, evms_node_list);
29385
+ evms_cs_deallocate_memory(metadata);
29392
+ * Function: check_quiesce
29394
+ * Make sure a snapshot and it's original volume quiesced.
29396
+static int check_quiesce( snapshot_volume_t * org_volume )
29398
+ snapshot_volume_t * next_vol;
29399
+ for ( next_vol = org_volume; next_vol; next_vol = next_vol->snapshot_next ) {
29400
+ if ( ! (next_vol->flags & EVMS_SNAPSHOT_QUIESCED) ) {
29401
+ LOG_ERROR("Can't delete snapshot, volume '%s' not quiesced.\n",
29402
+ next_vol->logical_node->name);
29411
+ * Function: remove_snapshot_from_chain
29413
+ * Remove the specified snapshot volume from its original's chain of
29416
+static int remove_snapshot_from_chain( snapshot_volume_t * snap_volume )
29418
+ snapshot_volume_t * org_volume = snap_volume->snapshot_org;
29420
+ if ( org_volume ) {
29421
+ while ( org_volume->snapshot_next && org_volume->snapshot_next != snap_volume ) {
29422
+ org_volume = org_volume->snapshot_next;
29424
+ if ( org_volume->snapshot_next ) {
29425
+ org_volume->snapshot_next = org_volume->snapshot_next->snapshot_next;
29428
+ snap_volume->snapshot_org = NULL;
29429
+ snap_volume->snapshot_next = NULL;
29435
+ * Function: delete_snapshot_hash_chain
29437
+ * Delete all items in a single chain in the hash table.
29439
+static int delete_snapshot_hash_chain( snapshot_hash_entry_t * head )
29441
+ snapshot_hash_entry_t * next;
29444
+ next = head->next;
29445
+ evms_cs_deallocate_memory(head);
29453
+ * Function: delete_snapshot_volume
29455
+ * Delete the in-memory representation of a volume. The specified node
29456
+ * can actually be either a snapshot or an original. Deleting a snapshot
29457
+ * causes it to be removed from its original's chain of snapshots.
29459
+static int delete_snapshot_volume(evms_logical_node_t * node)
29461
+ snapshot_volume_t * volume = (snapshot_volume_t *) node->instance_data;
29462
+ snapshot_volume_t * org_volume = volume->snapshot_org;
29463
+ snapshot_volume_t * next_vol;
29467
+ // Delete the instance data
29469
+ if (volume->flags & EVMS_SNAPSHOT) {
29470
+ // This node is a snapshot. Remove it from the
29471
+ // original's list. Check all snapshots in the chain
29472
+ // for quiesce before this is done.
29473
+ if ( !(volume->flags & EVMS_SNAPSHOT_QUIESCED) ){
29476
+ if ( volume->snapshot_org &&
29477
+ !(org_volume->flags & EVMS_SNAPSHOT_QUIESCED)) {
29481
+ remove_snapshot_from_chain( volume );
29483
+ // If we just deleted the only/last snapshot for this
29484
+ // original, the original will not be modified. It is
29485
+ // the engine's responsibility to delete the original
29486
+ // and rediscover in order to clear it of its snapshot
29487
+ // information. Even if that doesn't happen, the state
29488
+ // of the kernel will still be safe. I/O's coming into
29489
+ // this plugin for the original will just be passed
29490
+ // down without any other action or modification.
29492
+ // Unregister the proc-fs entry for this node.
29493
+ if ( snap_proc ) {
29494
+ remove_proc_entry(node->volume_info->volume_name, snap_proc);
29498
+ // This is an original. It's the engine's responsibility
29499
+ // to delete all snapshots before deleting an original.
29500
+ // Otherwise, a snapshot could be left pointing to an
29501
+ // original that no longer exists. Thus, we just need to
29502
+ // make sure there are no snapshots in the chain.
29503
+ if ( (rc = check_quiesce(volume)) ) {
29504
+// if ( volume->snapshot_next ) {
29507
+ // loop through all snapshots left on this original, and
29508
+ // NULL out their org pointer and mark disabled, in case they don't get deleted.
29509
+ for ( next_vol = volume->snapshot_next;
29510
+ next_vol; next_vol = next_vol->snapshot_next ) {
29511
+ next_vol->snapshot_org = NULL;
29512
+ next_vol->flags |= EVMS_SNAPSHOT_DISABLED; // disable in memory only.
29516
+ // Free up all memory used by the instance data, including
29517
+ // the underlying node, the hash table, and the data buffer.
29518
+ if (volume->logical_node) {
29519
+ if ( (rc = DELETE(volume->logical_node)) ) {
29523
+ if (volume->snapshot_map) {
29524
+ // Delete all of the hash chains, then the actual table.
29525
+ for ( i = 0; i < volume->hash_table_size; i++ ) {
29526
+ delete_snapshot_hash_chain( volume->snapshot_map[i] );
29528
+ vfree(volume->snapshot_map);
29530
+ if (volume->chunk_data_buffer) {
29531
+ evms_cs_deallocate_memory(volume->chunk_data_buffer);
29534
+ evms_cs_deallocate_memory(volume);
29537
+ evms_cs_deallocate_logical_node(node);
29539
+ MOD_DEC_USE_COUNT;
29545
+ * Function: search_snapshot_hash_chain
29547
+ * This function will search the hash chain that is anchored at the
29548
+ * specified head pointer. If the sector number is found, a pointer to that
29549
+ * entry in the chain is set, and a 1 is returned. If the sector is not
29550
+ * found, a pointer to the previous entry is set and 0 is returned. If the
29551
+ * return pointer is NULL, this means either the list is empty, or the
29552
+ * specified sector should become the first list item.
29554
+static int search_snapshot_hash_chain( u_int64_t chunk,
29555
+ snapshot_hash_entry_t * head,
29556
+ snapshot_hash_entry_t ** result )
29558
+ snapshot_hash_entry_t * curr = head;
29559
+ snapshot_hash_entry_t * prev = head;
29560
+ while ( curr && curr->org_chunk < chunk ) {
29562
+ curr = curr->next;
29564
+ if (!curr) { // Either an empty chain or went off the end of the chain.
29568
+ else if ( curr->org_chunk != chunk ) {
29569
+ *result = curr->prev;
29580
+ * Function: snapshot_remap_chunk
29582
+ * This function performs a sector remap on a snapshot volume. This should
29583
+ * be called from the I/O read path, It first determines the base sector of
29584
+ * the chunk containing the specified sector, and saves the remainder. Then
29585
+ * it performs a search through the snapshot map for the specified volume.
29586
+ * If a match is found, the sector number is changed to the new value. If
29587
+ * no match is found, the value is left the same, meaning the read should
29588
+ * proceed down the original volume.
29590
+static int snapshot_remap_chunk(snapshot_volume_t * snap_volume,
29591
+ evms_sector_t * sector )
29593
+ snapshot_hash_entry_t * result;
29594
+ unsigned long hash_value;
29596
+ unsigned long remainder;
29598
+ remainder = *sector & (u_int64_t)( snap_volume->chunk_size -1);
29599
+ chunk = *sector >> snap_volume->chunk_shift;
29600
+ hash_value = ((unsigned long)chunk) % snap_volume->hash_table_size;
29602
+ if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &result ) ) {
29603
+ *sector = (result->snap_chunk << snap_volume->chunk_shift) + remainder;
29611
+ * Function: read_snap
29613
+static void read_snap( evms_logical_node_t * node, eio_t *eio)
29615
+ snapshot_volume_t * volume = (snapshot_volume_t * ) node->instance_data;
29618
+ if ( (eio->rsector + eio->rsize) > node->total_vsectors ) {
29619
+ EVMS_IO_ERROR(eio);
29623
+ // On a read to the original, we can just pass it through completely
29624
+ // untouched. Only reads to the snapshot can be broken up.
29625
+ if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
29626
+ R_IO(volume->logical_node,eio);
29630
+ // Lock the snapshot before processing the request.
29631
+ down(&volume->snap_semaphore);
29633
+ // Make sure the snapshot is not full/disabled, and that
29634
+ // the original is present.
29635
+ if ( (volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL)) ||
29636
+ (! volume->snapshot_org) ) {
29637
+ EVMS_IO_ERROR(eio);
29638
+ up(&volume->snap_semaphore);
29643
+ // Check if this sector has been remapped
29644
+ if ( snapshot_remap_chunk(volume, &eio->rsector)){
29645
+ // Has not been remapped. Send IO to the original.
29646
+ R_IO(volume->snapshot_org->logical_node,eio);
29648
+ // Sector was remapped. Send IO to the snapshot.
29649
+ R_IO(volume->logical_node,eio);
29652
+ up(&volume->snap_semaphore);
29656
+static int snapshot_copy_1( snapshot_volume_t * snap_volume, evms_sector_t org_sector,
29657
+ u_int64_t * remap_chunk) {
29659
+ snapshot_hash_entry_t * target_entry;
29660
+ snapshot_hash_entry_t * new_map_entry;
29661
+ snapshot_volume_t * org_volume = snap_volume->snapshot_org;
29662
+ unsigned long hash_value;
29664
+ u_int32_t io_size = snap_volume->chunk_size;
29665
+ int i, iterations = 1;
29667
+ if ( SNAPSHOT_CHUNK_BUFFER_SIZE < snap_volume->chunk_size ) {
29668
+ iterations = snap_volume->chunk_size / org_volume->chunk_size;
29669
+ io_size = org_volume->chunk_size;
29672
+ // Lock out this snapshot while we are remapping.
29673
+ down(&snap_volume->snap_semaphore);
29675
+ // Make sure the snapshot has not been disabled.
29676
+ if ( snap_volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL) ) {
29677
+ up(&snap_volume->snap_semaphore);
29681
+ // Search the hash table to see if this sector has already been
29682
+ // remapped on this snapshot.
29683
+ chunk = org_sector >> snap_volume->chunk_shift;
29684
+ hash_value = (long)chunk % snap_volume->hash_table_size;
29685
+ if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &target_entry ) ) {
29686
+ // Chunk is already remapped.
29687
+ up(&snap_volume->snap_semaphore);
29688
+ *remap_chunk = target_entry->snap_chunk;
29692
+ // Is there enough room remaining on the snapshot to
29693
+ // remap this chunk?
29694
+ if ( snap_volume->next_free_chunk >= snap_volume->num_chunks ) {
29695
+ // Once the snapshot becomes full, further writes to the
29696
+ // original can't be remapped, and thus this snapshot
29697
+ // will become "corrupted".
29698
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_FULL);
29699
+ snap_volume->flags |= EVMS_SNAPSHOT_FULL;
29700
+ up(&snap_volume->snap_semaphore);
29705
+ for ( i = 0; i < iterations; i++ ) {
29706
+ // Read the part of all chunk from the original volume.
29707
+ if ( INIT_IO( org_volume->logical_node, 0, chunk * snap_volume->chunk_size + i*io_size, io_size, org_volume->chunk_data_buffer ) ) {
29708
+ // An error reading from the original volume is very bad.
29709
+ // If the read fails, the original write will likely fail
29710
+ // as well, so let's just return an error.
29711
+ up(&snap_volume->snap_semaphore);
29715
+ // save of chunk number of the destination in snapshot of where this remap is going.
29716
+ *remap_chunk = snap_volume->next_free_chunk;
29717
+ // Write this chunk to the snapshot volume.
29718
+ if ( INIT_IO( snap_volume->logical_node, 1, (snap_volume->next_free_chunk * snap_volume->chunk_size + i*io_size), io_size, org_volume->chunk_data_buffer) ) {
29719
+ // An error writing to the snapshot is the same
29720
+ // situation as a full snapshot.
29721
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29722
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29723
+ up(&snap_volume->snap_semaphore);
29724
+ LOG_ERROR("I/O error on COW on '%s' disabling snapshot.\n",
29725
+ snap_volume->logical_node->name);
29729
+ // Fill in the appropriate COW table entry and write that
29730
+ // metadata sector back to the snapshot volume.
29731
+ // convert to little endian on disk
29732
+ snap_volume->cow_table[snap_volume->next_cow_entry] = cpu_to_le64(chunk);
29733
+ if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
29734
+ // The data was written to the snapshot, but writing the
29735
+ // metadata failed.
29736
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29737
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29738
+ up(&snap_volume->snap_semaphore);
29739
+ LOG_ERROR("I/O error on COW table on '%s' disabling snapshot.\n",
29740
+ snap_volume->logical_node->name);
29743
+ snap_volume->next_cow_entry++;
29744
+ if ( snap_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u_int64_t)) ) {
29745
+ snap_volume->next_cow_entry = 0;
29746
+ snap_volume->current_cow_sector++;
29747
+ memset( snap_volume->cow_table, 0xff, SECTOR_SIZE );
29748
+ if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
29749
+ // Can't clear out the next sector of metadata. This
29750
+ // is bad and would kill us on a new discover, so
29751
+ // disable the snapshot now before we really screw up.
29752
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29753
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29754
+ up(&snap_volume->snap_semaphore);
29755
+ LOG_ERROR("I/O error on COW table init on '%s' disabling snapshot.\n",
29756
+ snap_volume->logical_node->name);
29761
+ // Create a new snapshot map entry and add it in the appropriate
29762
+ // place in the map.
29763
+ if ( evms_cs_allocate_memory((void **)&new_map_entry, sizeof(snapshot_hash_entry_t)) ) {
29764
+ set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29765
+ snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29766
+ up(&snap_volume->snap_semaphore);
29767
+ LOG_ERROR("no memory for remap entry, on '%s' disabling snapshot.\n",
29768
+ snap_volume->logical_node->name);
29771
+ new_map_entry->org_chunk = chunk;
29772
+ new_map_entry->snap_chunk = snap_volume->next_free_chunk;
29774
+ if ( target_entry ) {
29775
+ insert_snapshot_hash_entry( new_map_entry, target_entry );
29778
+ insert_snapshot_hash_entry_at_head( new_map_entry, &(snap_volume->snapshot_map[hash_value]) );
29780
+ snap_volume->next_free_chunk++;
29782
+ up(&snap_volume->snap_semaphore);
29787
+ * Function: snapshot_copy_data
29789
+ * On a write to a snapshotted volume, check all snapshots to see if the
29790
+ * specified chunk has already been remapped. If it has not, read the
29791
+ * original data from the volume, write the data to the next available
29792
+ * chunk on the snapshot, update the COW table, write the COW table to
29793
+ * the snapshot, and insert a new entry into the snapshot map.
29795
+static int snapshot_copy_data( snapshot_volume_t * org_volume,
29796
+ evms_sector_t org_sector)
29798
+ snapshot_volume_t * snap_volume;
29799
+ snapshot_volume_t * next_volume;
29800
+ u_int64_t remap_chunk; // unused here, needed for call to copy1
29802
+ // Volumes can be snapshotted multiple times. Check every snapshot.
29803
+ for ( snap_volume = org_volume->snapshot_next; snap_volume; snap_volume = next_volume ) {
29804
+ next_volume = snap_volume->snapshot_next;
29805
+ snapshot_copy_1(snap_volume, org_sector, &remap_chunk);
29814
+ * Function: write_snap
29816
+static void write_snap( evms_logical_node_t * node, eio_t * eio)
29818
+ snapshot_volume_t * volume = (snapshot_volume_t *) node->instance_data;
29820
+ u_int64_t remap_chunk;
29821
+ u_int64_t remainder;
29825
+ if ( eio->rsector + eio->rsize > node->total_vsectors) {
29826
+ EVMS_IO_ERROR(eio);
29830
+ // if this is a snapshot
29831
+ if ( volume->flags & EVMS_SNAPSHOT ) {
29832
+ if ( volume->flags & EVMS_SNAPSHOT_WRITEABLE) {
29833
+ if (snapshot_copy_1(volume, eio->rsector, &remap_chunk)){
29834
+ EVMS_IO_ERROR(eio);
29836
+ remainder = eio->rsector & (u_int64_t)(volume->chunk_size -1);
29837
+ eio->rsector = (remap_chunk * volume->chunk_size) + remainder;
29838
+ W_IO(volume->logical_node,eio);
29841
+ EVMS_IO_ERROR(eio);
29845
+ } else{ // write to original
29846
+ // Remap this sector if necessary.
29847
+ if ( (rc = snapshot_copy_data(volume, eio->rsector)) ) {
29850
+ W_IO(volume->logical_node,eio);
29857
+ * Function: ioctl_snap
29860
+static int ioctl_snap( evms_logical_node_t * logical_node,
29861
+ struct inode * inode,
29862
+ struct file * file,
29863
+ unsigned int cmd,
29864
+ unsigned long arg)
29867
+ snapshot_volume_t * volume = (snapshot_volume_t*)logical_node->instance_data;
29869
+ if (!inode || !logical_node) {
29873
+ case EVMS_QUIESCE_VOLUME:
29875
+ evms_quiesce_volume_t *tmp = (evms_quiesce_volume_t*)arg;
29876
+ if ( tmp->command ) { // Quiesce
29877
+ volume->flags |= EVMS_SNAPSHOT_QUIESCED;
29879
+ else { // Un-quiesce
29880
+ volume->flags &= ~EVMS_SNAPSHOT_QUIESCED;
29885
+ case EVMS_GET_BMAP:
29887
+ if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
29888
+ rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29896
+ case EVMS_PLUGIN_IOCTL:
29898
+ evms_plugin_ioctl_t tmp, *user_parms;
29899
+ int percent_full;
29900
+ user_parms = (evms_plugin_ioctl_t *)arg;
29902
+ /* copy user's parameters to kernel space */
29903
+ if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
29907
+ /* is this cmd targetted at this feature ? */
29908
+ if (tmp.feature_id == logical_node->plugin->id) {
29909
+ switch(tmp.feature_command) {
29910
+ case SNAPSHOT_QUERY_PERCENT_FULL:
29911
+ if (volume->flags & EVMS_SNAPSHOT_FULL) {
29912
+ percent_full = -1;
29913
+ } else if (volume->flags & EVMS_SNAPSHOT_DISABLED) {
29914
+ percent_full = -2;
29916
+ percent_full = (volume->next_free_chunk * 100) / volume->num_chunks;
29918
+ rc = copy_to_user(tmp.feature_ioctl_data, &percent_full, sizeof(percent_full));
29922
+ } else { /* broadcast this cmd to all children */
29923
+ rc = IOCTL(logical_node,inode, file, cmd, arg);
29929
+ case EVMS_CHECK_MEDIA_CHANGE:
29930
+ case EVMS_REVALIDATE_DISK:
29931
+ case EVMS_GET_DISK_LIST:
29933
+ if (!(volume->flags & EVMS_SNAPSHOT_ORG)) {
29934
+ volume = volume->snapshot_org;
29936
+ while ( volume ) {
29937
+ rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29938
+ volume = volume->snapshot_next;
29943
+ rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29950
+static int init_io_snap(evms_logical_node_t * node,
29951
+ int io_flag, /* 0=read, 1=write*/
29952
+ evms_sector_t sect_nr, /* disk LBA */
29953
+ evms_sector_t num_sects, /* # of sectors */
29954
+ void * buf_addr ) /* buffer address */
29956
+ snapshot_volume_t * volume = (snapshot_volume_t *)(node->instance_data);
29958
+ // no init io access to snapshot, and no writes allowed to original
29959
+ // since they would not be snapshotted.
29960
+ if (io_flag || (volume->flags & EVMS_SNAPSHOT)) {
29963
+ return INIT_IO(volume->logical_node, io_flag, sect_nr, num_sects, buf_addr);
29969
+ * Function: snapshot_init
29972
+int __init snapshot_init(void)
29974
+ struct proc_dir_entry * pde;
29976
+ // Register a directory in proc-fs.
29977
+ pde = evms_cs_get_evms_proc_dir();
29979
+ snap_proc = create_proc_entry("snapshot", S_IFDIR, pde);
29982
+ return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
29986
+ * Function: snapshot_exit
29988
+void __exit snapshot_exit(void)
29990
+ struct proc_dir_entry * pde;
29992
+ // Unregister the directory in proc-fs.
29993
+ pde = evms_cs_get_evms_proc_dir();
29995
+ remove_proc_entry("snapshot", pde);
29998
+ evms_cs_unregister_plugin(&plugin_header);
30001
+module_init(snapshot_init);
30002
+module_exit(snapshot_exit);
30003
+#ifdef MODULE_LICENSE
30004
+MODULE_LICENSE("GPL");
30008
+/********** SnapShot Functions **********/
30013
+ * Function: add_cow_entry_to_snapshot_map
30015
+ * This function takes a cow table entry (from the on-disk data), and
30016
+ * converts it into an appropriate entry for the snapshot map, and
30017
+ * inserts it into the appropriate map for the specified volume.
30019
+static int add_cow_entry_to_snapshot_map( u_int64_t org_chunk,
30020
+ u_int64_t snap_chunk,
30021
+ snapshot_volume_t * volume )
30023
+ snapshot_hash_entry_t * new_entry;
30024
+ snapshot_hash_entry_t * target_entry;
30025
+ unsigned long hash_value;
30027
+ evms_cs_allocate_memory((void **)&new_entry,sizeof (snapshot_hash_entry_t));
30028
+ if (!new_entry) {
30031
+ new_entry->org_chunk = org_chunk;
30032
+ new_entry->snap_chunk = snap_chunk;
30034
+ hash_value = (long)org_chunk % volume->hash_table_size;
30035
+ if ( search_snapshot_hash_chain( org_chunk, volume->snapshot_map[hash_value], &target_entry ) ) {
30036
+ // This means a duplicate mapping was found. This should not happen.
30039
+ if ( target_entry ) {
30040
+ insert_snapshot_hash_entry( new_entry, target_entry );
30043
+ insert_snapshot_hash_entry_at_head( new_entry, &(volume->snapshot_map[hash_value]) );
30051
+ * Function: build_snapshot_maps
30053
+ * Construct the initial hash table state based on
30054
+ * existing COW tables on the disk.
30056
+static int build_snapshot_maps(snapshot_volume_t * volume)
30062
+ // Read in one sector's worth of COW tables.
30063
+ if ( INIT_IO(volume->logical_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
30066
+ // Translate every valid COW table entry into
30067
+ // a snapshot map entry.
30068
+ for ( volume->next_cow_entry = 0;
30069
+ volume->next_cow_entry < (SECTOR_SIZE/sizeof(u_int64_t)) &&
30070
+ volume->cow_table[volume->next_cow_entry] != 0xffffffffffffffff;
30071
+ volume->next_cow_entry++, volume->next_free_chunk++ ) {
30072
+ if ( (rc = add_cow_entry_to_snapshot_map( le64_to_cpu(volume->cow_table[volume->next_cow_entry]),
30073
+ volume->next_free_chunk, volume ))) {
30077
+ // Move on to the next sector if necessary.
30078
+ if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u_int64_t)) ) {
30079
+ volume->current_cow_sector++;
30090
+ * Function: add_snapshot
30092
+ * Initializes a snapshot instance and exports an evms_logical_node to
30093
+ * the global list.
30095
+static int add_snapshot(evms_logical_node_t * snap_node,
30096
+ snapshot_metadata_t * metadata,
30097
+ evms_logical_node_t ** evms_node_list )
30099
+ evms_logical_node_t * new_snap_node;
30100
+ evms_logical_node_t * new_org_node;
30101
+ evms_logical_node_t * org_node;
30102
+ snapshot_volume_t * snap_volume;
30103
+ snapshot_volume_t * org_volume;
30104
+ snapshot_volume_t * tmp_volume;
30107
+ evms_cs_remove_logical_node_from_list(evms_node_list,snap_node);
30109
+ // Make sure the snapshot is not full or disabled.
30110
+ if ( metadata->flags & (EVMS_SNAPSHOT_DISABLED | EVMS_SNAPSHOT_FULL) ) {
30111
+ LOG_WARNING("Error: Snapshot %s discovered as disabled/full.\n", snap_node->name);
30112
+ LOG_WARNING(" Deleting from further use.\n");
30113
+ DELETE(snap_node);
30117
+ // Inspect the global list until a node is found with the name of
30118
+ // this snapshot's original. There can only be one original for
30119
+ // each snapshot.
30120
+ for ( org_node = *evms_node_list;
30122
+ strncmp(EVMS_GET_NODE_NAME(org_node), metadata->original_volume, EVMS_VOLUME_NAME_SIZE);
30123
+ org_node = org_node->next ) {
30127
+ // No original was found. Disable and delete the snapshot.
30128
+ LOG_WARNING("Error: No original found for snapshot %s, looking for %s\n", snap_node->name,metadata->original_volume);
30129
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30130
+ DELETE(snap_node);
30134
+ LOG_EXTRA("Adding snapshot for volume %s\n",org_node->name);
30136
+ // ok, we found the original on the list.
30137
+ // verify the size to be sure the name didn't change for compatibility
30138
+ if (org_node->total_vsectors != metadata->original_size) {
30139
+ LOG_WARNING("Error: Original volume size does not match\n");
30140
+ LOG_WARNING(" vol=%s: org_size=%d, current size=%d\n",
30141
+ org_node->name, (int)(metadata->original_size), (int)(org_node->total_vsectors));
30142
+ // The snapshot no longer points at a valid original.
30143
+ // Disable and delete the snapshot.
30144
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30145
+ DELETE(snap_node);
30149
+ // New EVMS node for the snapshot
30150
+ if ( evms_cs_allocate_logical_node( &new_snap_node ) ) {
30151
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30152
+ DELETE( snap_node );
30156
+ MOD_INC_USE_COUNT;
30158
+ // Instance data for the snapshot
30159
+ if ( evms_cs_allocate_memory( (void**)&snap_volume, sizeof(snapshot_volume_t) )) {
30160
+ delete_snapshot_volume( new_snap_node );
30161
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30162
+ DELETE( snap_node );
30166
+ // Initialize the snapshot node
30167
+ if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
30168
+ new_snap_node->flags = snap_node->flags;
30169
+ }else { // if not writeable, set read only
30170
+ new_snap_node->flags = snap_node->flags | EVMS_VOLUME_SET_READ_ONLY;
30172
+ new_snap_node->flags = new_snap_node->flags |
30173
+ (org_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30174
+ new_snap_node->system_id = 0x536e4170; // SnAp
30175
+ new_snap_node->total_vsectors = org_node->total_vsectors; // Lying about the size.
30176
+ new_snap_node->block_size = snap_node->block_size;
30177
+ new_snap_node->hardsector_size = snap_node->hardsector_size;
30178
+ new_snap_node->plugin = &plugin_header;
30179
+ new_snap_node->instance_data = (void*)snap_volume;
30180
+ // Get the new node's name from the consumed node's feature
30182
+ strcpy(new_snap_node->name, snap_node->feature_header->object_name);
30183
+ // No problem with propagating the volume name up.
30184
+ new_snap_node->volume_info = snap_node->volume_info;
30186
+ // Initialize the instance data
30187
+ snap_volume->logical_node = snap_node;
30188
+ snap_volume->chunk_size = metadata->chunk_size;
30189
+ snap_volume->chunk_shift = evms_cs_log2((u_int64_t)metadata->chunk_size);
30190
+ snap_volume->num_chunks = metadata->total_chunks;
30191
+ snap_volume->current_cow_sector = metadata->lba_of_COW_table;
30192
+ snap_volume->hash_table_size = (metadata->total_chunks)/MAX_HASH_CHAIN_ENTRIES + 1;
30193
+ snap_volume->flags = EVMS_SNAPSHOT;
30194
+ if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
30195
+ snap_volume->flags |= EVMS_SNAPSHOT_WRITEABLE;
30198
+ // Snapshot hash table
30199
+ snap_volume->snapshot_map = vmalloc(snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
30200
+ if ( !snap_volume->snapshot_map) {
30201
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30202
+ delete_snapshot_volume( new_snap_node );
30206
+ memset(snap_volume->snapshot_map, 0, snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
30208
+ if ( (rc = build_snapshot_maps(snap_volume)) ){
30209
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30210
+ delete_snapshot_volume( new_snap_node );
30214
+ // check to see if the node we found is one we put back on the list due to
30215
+ // another snapshot of the original, if so then don't allocate a new
30216
+ // node and volume info, just get the old
30217
+ if (org_node->plugin->id != plugin_header.id) {
30219
+ // New EVMS node for the original
30220
+ if ( evms_cs_allocate_logical_node( &new_org_node ) ) {
30221
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30222
+ delete_snapshot_volume( new_snap_node );
30226
+ MOD_INC_USE_COUNT;
30228
+ // Instance data for the original
30229
+ if ( evms_cs_allocate_memory( (void**)&org_volume, sizeof(snapshot_volume_t) )) {
30230
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30231
+ delete_snapshot_volume( new_snap_node );
30232
+ delete_snapshot_volume( new_org_node );
30236
+ // Initialize the new node
30237
+ new_org_node->flags = org_node->flags |
30238
+ (snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30239
+ new_org_node->system_id = 0x4f724967; // OrIg
30240
+ new_org_node->total_vsectors = org_node->total_vsectors;
30241
+ new_org_node->block_size = org_node->block_size;
30242
+ new_org_node->hardsector_size = org_node->hardsector_size;
30243
+ new_org_node->plugin = &plugin_header;
30244
+ new_org_node->instance_data = (void*)org_volume;
30245
+ // Must reuse the original node's name
30246
+ strcpy(new_org_node->name, org_node->name);
30247
+ new_org_node->volume_info = org_node->volume_info;
30249
+ // Initialize the instance data
30250
+ org_volume->chunk_size = SNAPSHOT_CHUNK_BUFFER_SIZE;
30251
+ org_volume->num_chunks = 0;
30252
+ org_volume->current_cow_sector = 0;
30253
+ org_volume->flags = EVMS_SNAPSHOT_ORG;
30254
+ org_volume->snapshot_next = snap_volume;
30255
+ snap_volume->snapshot_next = NULL;
30257
+ // Buffer for copying data from the original to the snapshot
30258
+ if ( evms_cs_allocate_memory( (void**)(&org_volume->chunk_data_buffer), SNAPSHOT_CHUNK_BUFFER_SIZE * SECTOR_SIZE)) {
30259
+ set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30260
+ delete_snapshot_volume( new_snap_node );
30261
+ delete_snapshot_volume( new_org_node );
30265
+ // remove the original volume from the global list, then
30266
+ // add the new version of the original to the global list.
30267
+ evms_cs_remove_logical_node_from_list(evms_node_list,org_node);
30268
+ org_volume->logical_node = org_node;
30269
+ evms_cs_add_logical_node_to_list(evms_node_list,new_org_node);
30272
+ // There is already at least one snapshot for this original.
30273
+ new_org_node = org_node;
30274
+ org_volume = (snapshot_volume_t*)org_node->instance_data;
30276
+ // propagate the flags from the new snapshot node to the original, and then to every other snapshot
30277
+ for (tmp_volume=org_volume; tmp_volume;tmp_volume=tmp_volume->snapshot_next) {
30278
+ tmp_volume->logical_node->flags = org_node->flags |
30279
+ (snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30281
+ // Insert the new snapshot at the start of the original's chain.
30282
+ snap_volume->snapshot_next = org_volume->snapshot_next;
30283
+ org_volume->snapshot_next = snap_volume;
30286
+ if ( snap_proc ) {
30287
+ create_proc_read_entry(snap_node->feature_header->volume_name, S_IFREG, snap_proc, snap_proc_read, new_snap_node);
30290
+ init_MUTEX( &snap_volume->snap_semaphore );
30291
+ snap_volume->snapshot_org = org_volume;
30292
+ evms_cs_add_logical_node_to_list(evms_node_list,new_snap_node);
30299
+/* Function: snap_proc_read
30301
+ * Callback function for the proc-fs entry for each snapshot node.
30302
+ * Print out pertinent information about this snapshot. The "data"
30303
+ * parameter is a pointer to an EVMS logical node.
30305
+static int snap_proc_read(char * page,
30312
+ evms_logical_node_t * snap_node = data;
30313
+ snapshot_volume_t * snap_volume = snap_node->instance_data;
30316
+ PROCPRINT("Snapshot of : %s\n", (snap_volume->snapshot_org) ? EVMS_GET_NODE_NAME(snap_volume->snapshot_org->logical_node) : "Unknown");
30317
+ PROCPRINT("Size (KB) : %ld\n", (snap_volume->num_chunks * snap_volume->chunk_size)/2);
30318
+ PROCPRINT("Chunk Size (KB): %ld\n", (snap_volume->chunk_size)/2);
30319
+ PROCPRINT("Writeable : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_WRITEABLE) ? "True" : "False");
30320
+ PROCPRINT("Usage : %ld%%\n", (snap_volume->next_free_chunk * 100) / snap_volume->num_chunks);
30321
+ PROCPRINT("Status : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_FULL) ? "Full / Disabled" : (snap_volume->flags & EVMS_SNAPSHOT_DISABLED) ? "Disabled" : "Active");
30326
diff -Naur linux-2002-03-28/include/linux/evms/evms.h evms-2002-03-28/include/linux/evms/evms.h
30327
--- linux-2002-03-28/include/linux/evms/evms.h Wed Dec 31 18:00:00 1969
30328
+++ evms-2002-03-28/include/linux/evms/evms.h Mon Mar 25 15:51:13 2002
30330
+/* -*- linux-c -*- */
30333
+ * Copyright (c) International Business Machines Corp., 2000
30335
+ * This program is free software; you can redistribute it and/or modify
30336
+ * it under the terms of the GNU General Public License as published by
30337
+ * the Free Software Foundation; either version 2 of the License, or
30338
+ * (at your option) any later version.
30340
+ * This program is distributed in the hope that it will be useful,
30341
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
30342
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
30343
+ * the GNU General Public License for more details.
30345
+ * You should have received a copy of the GNU General Public License
30346
+ * along with this program; if not, write to the Free Software
30347
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30350
+ * linux/include/linux/evms/evms.h
30352
+ * EVMS public kernel header file
30356
+#ifndef __EVMS_INCLUDED__
30357
+#define __EVMS_INCLUDED__
30359
+#include <linux/genhd.h>
30360
+#include <linux/fs.h>
30361
+#include <linux/iobuf.h>
30362
+#include <linux/kdev_t.h>
30363
+#include <linux/hdreg.h>
30364
+#include <linux/slab.h>
30365
+#include <linux/proc_fs.h>
30370
+/* tracing info */
30371
+#define EVMS_INFO_CRITICAL 0
30372
+#define EVMS_INFO_SERIOUS 1
30373
+#define EVMS_INFO_ERROR 2
30374
+#define EVMS_INFO_WARNING 3
30375
+#define EVMS_INFO_DEFAULT 5
30376
+#define EVMS_INFO_DETAILS 6
30377
+#define EVMS_INFO_DEBUG 7
30378
+#define EVMS_INFO_EXTRA 8
30379
+#define EVMS_INFO_ENTRY_EXIT 9
30380
+#define EVMS_INFO_EVERYTHING 10
30382
+extern int evms_info_level;
30383
+/* information message: e.g., configuration, major event */
30384
+#define evmsTRACE(info_level,prspec) { if (evms_info_level >= info_level) printk prspec; }
30385
+#define evmsTRACE2(info_level,statement) { if (evms_info_level >= info_level) statement; }
30386
+// sample - be sure to use enclose "prspec" or "statement" with parens ()
30387
+// evmsTRACE(info_level,(KERN_INFO "evms_myfunction: name = %s\n", name));
30388
+// evmsTRACE2(info_level,(print_mem( buffer_address, buffer_length)));
30390
+/* LOG MACROS to make evms log messages look much
30391
+ * cleaner in the source.
30393
+#define EVMS_LOG_PREFIX "evms: "
30394
+#define LOG_CRITICAL(msg, args...) evmsTRACE(EVMS_INFO_CRITICAL, (KERN_CRIT EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30395
+#define LOG_SERIOUS(msg, args...) evmsTRACE(EVMS_INFO_SERIOUS, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30396
+#define LOG_ERROR(msg, args...) evmsTRACE(EVMS_INFO_ERROR, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30397
+#define LOG_WARNING(msg, args...) evmsTRACE(EVMS_INFO_WARNING, (KERN_WARNING EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30398
+#define LOG_DEFAULT(msg, args...) evmsTRACE(EVMS_INFO_DEFAULT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30399
+#define LOG_DETAILS(msg, args...) evmsTRACE(EVMS_INFO_DETAILS, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30400
+#define LOG_DEBUG(msg, args...) evmsTRACE(EVMS_INFO_DEBUG, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30401
+#define LOG_EXTRA(msg, args...) evmsTRACE(EVMS_INFO_EXTRA, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30402
+#define LOG_ENTRY_EXIT(msg, args...) evmsTRACE(EVMS_INFO_ENTRY_EXIT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30403
+#define LOG_EVERYTHING(msg, args...) evmsTRACE(EVMS_INFO_EVERYTHING, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30405
+#ifdef CONFIG_PROC_FS
30406
+#define PROCPRINT(msg, args...) (sz += sprintf(page + sz, msg, ## args))
30409
+#define EVMS_HANDLE_KEY 0x89ABCDEF
30411
+/* Plugin structure definitions */
30413
+typedef struct evms_plugin_header_s {
30415
+ evms_version_t version;
30416
+ evms_version_t required_common_services_version;
30417
+ struct evms_plugin_function_table_s *function_table;
30418
+} evms_plugin_header_t;
30420
+typedef struct evms_volume_info_s {
30421
+/* 0*/ u_int64_t volume_serial_number;
30422
+/* 8*/ u_int32_t volume_system_id; /* the minor is stored here */
30423
+/* 12*/ char volume_name[EVMS_VOLUME_NAME_SIZE+1];
30425
+} evms_volume_info_t;
30427
+/* flags field bit definitions in evms_common.h */
30428
+/* iflags field used internally by the kernel only */
30429
+#define EVMS_FEATURE_BOTTOM (1<<0)
30430
+typedef struct evms_logical_node_s {
30431
+/* 0*/ evms_sector_t total_vsectors;
30432
+/* 8*/ evms_plugin_header_t * plugin;
30433
+/* 12*/ void * instance_data; /* ptr to private instance data */
30434
+/* 16*/ unsigned int flags;
30435
+/* 20*/ unsigned int iflags;
30436
+/* 24*/ int hardsector_size;
30437
+/* 28*/ int block_size;
30438
+/* 32*/ unsigned int system_id;
30439
+/* 36*/ evms_volume_info_t * volume_info;
30440
+/* 40*/ evms_feature_header_t * feature_header;
30441
+/* 44*/ struct evms_logical_node_s * next;
30442
+/* 48*/ char name[EVMS_VOLUME_NAME_SIZE+1];
30444
+} evms_logical_node_t;
30446
+/* this macro will retrieve the appropriate kernel node name
30447
+ * based on the node type.
30449
+#define EVMS_GET_NODE_NAME(node) \
30450
+ ((node->flags & EVMS_VOLUME_FLAG) ? \
30451
+ node->volume_info->volume_name : \
30454
+/* bit definitions of FLAGS field in logical volume struct */
30455
+/* NOTE: these bit field definitions can be found in
30456
+ * evms_ioctl.h above the evms_volume_data_t structure
30458
+typedef struct evms_logical_volume_s {
30459
+ char * name; /* devfs name if any */
30460
+ evms_logical_node_t * node; /* ptr to top logical node */
30463
+ int vfs_quiesced;
30464
+ atomic_t requests_in_progress;
30465
+ wait_queue_head_t wait_queue;
30466
+ devfs_handle_t devfs_handle;
30468
+ request_queue_t request_queue;
30470
+} evms_logical_volume_t;
30472
+/* EVMS generic I/O structure */
30473
+typedef struct eio_s {
30474
+ evms_sector_t rsector;
30475
+ evms_sector_t rsize;
30476
+ struct buffer_head *bh;
30479
+/* Abstraction MACROs */
30480
+#define EVMS_IO_ERROR(eio) (buffer_IO_error(eio->bh))
30483
+ * The following function table is used for all plugins.
30485
+typedef struct evms_plugin_function_table_s {
30486
+ int (* discover)(evms_logical_node_t **);
30487
+ int (* end_discover)(evms_logical_node_t **);
30488
+ int (* delete) (evms_logical_node_t *);
30489
+ void (* read) (evms_logical_node_t *, eio_t *);
30490
+ void (* write) (evms_logical_node_t *, eio_t *);
30491
+ int (* init_io) (evms_logical_node_t *, int, evms_sector_t,
30492
+ evms_sector_t, void *);
30493
+ int (* ioctl) (evms_logical_node_t *, struct inode *,
30494
+ struct file *, unsigned int, unsigned long);
30495
+ int (* direct_ioctl)(struct inode *, struct file *,
30496
+ unsigned int, unsigned long);
30497
+} evms_plugin_function_table_t;
30500
+ * These macros facilitate easier use of the
30501
+ * entry points in the function table
30503
+#define DISCOVER(node, list) ((node)->plugin->function_table->discover(list))
30504
+#define END_DISCOVER(node, list) ((node)->plugin->function_table->end_discover(list))
30505
+#define DELETE(node) ((node)->plugin->function_table->delete(node))
30506
+#define R_IO(node, eio) ((node)->plugin->function_table->read(node, eio))
30507
+#define W_IO(node, eio) ((node)->plugin->function_table->write(node, eio))
30508
+#define INIT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->init_io(node, rw_flag, start_sec, num_secs, buf_addr))
30509
+#define INT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->int_io(node, rw_flag, start_sec, num_secs, buf_addr))
30510
+#define IOCTL(node, inode, file, cmd, arg) ((node)->plugin->function_table->ioctl(node, inode, file, cmd, arg))
30511
+#define DIRECT_IOCTL(reg_record, inode, file, cmd, arg) ((reg_record)->plugin->function_table->direct_ioctl(inode, file, cmd, arg))
30513
+typedef struct evms_list_node_s {
30515
+ struct evms_list_node_s *next;
30516
+} evms_list_node_t;
30518
+/* pool management structure */
30519
+typedef struct evms_pool_mgmt_s {
30520
+ kmem_cache_t *cachep;
30523
+ atomic_t waiters;
30524
+ wait_queue_head_t wait_queue;
30525
+ /* WARNING!!! pool name MUST be less than 20 chars */
30527
+} evms_pool_mgmt_t;
30531
+ * All of the following kernel thread functions belong to EVMS base.
30532
+ * These functions were copied from md_core.c
30534
+#define EVMS_THREAD_WAKEUP 0
30535
+typedef struct evms_thread_s {
30536
+ void (*run) (void *data);
30538
+ wait_queue_head_t wqueue;
30539
+ unsigned long flags;
30540
+ struct completion *event;
30541
+ struct task_struct *tsk;
30542
+ const char *name;
30545
+/* EVMS (common services) exported functions prototypes */
30546
+#define EVMS_COMMON_SERVICES_MAJOR 0
30547
+#define EVMS_COMMON_SERVICES_MINOR 6
30548
+#define EVMS_COMMON_SERVICES_PATCHLEVEL 0
30550
+void evms_cs_get_version(int *, int *);
30551
+int evms_cs_check_version(evms_version_t *, evms_version_t *);
30552
+int evms_cs_register_plugin(evms_plugin_header_t *);
30553
+int evms_cs_unregister_plugin(evms_plugin_header_t *);
30554
+#ifdef EVMS_MEM_DEBUG
30555
+int evms_cs_verify_memory_integrity(int);
30557
+int evms_cs_allocate_memory(void **, int);
30558
+int evms_cs_deallocate_memory(void *);
30559
+int evms_cs_allocate_logical_node(evms_logical_node_t **);
30560
+void evms_cs_deallocate_volume_info(evms_logical_node_t *);
30561
+int evms_cs_deallocate_logical_node(evms_logical_node_t *);
30562
+int evms_cs_add_logical_node_to_list(evms_logical_node_t **,
30563
+ evms_logical_node_t *);
30564
+int evms_cs_remove_logical_node_from_list(evms_logical_node_t **,
30565
+ evms_logical_node_t *);
30566
+int evms_cs_kernel_ioctl(evms_logical_node_t *, unsigned int,
30568
+int evms_cs_get_hardsect_size(evms_logical_node_t *, int *);
30569
+int evms_cs_get_blocksize_size(evms_logical_node_t *, int *);
30570
+unsigned long evms_cs_size_in_sectors(unsigned long, unsigned long);
30571
+unsigned long evms_cs_size_in_vsectors(long long);
30572
+int evms_cs_log2(long long);
30573
+u_int32_t evms_cs_calculate_crc(u_int32_t, void *, u_int32_t);
30574
+int evms_cs_register_for_end_io_notification(void *,
30575
+ struct buffer_head *,
30576
+ void *callback_function);
30577
+evms_pool_mgmt_t * evms_cs_create_pool(
30580
+ void (*ctor)(void*, kmem_cache_t *, unsigned long),
30581
+ void (*dtor)(void*, kmem_cache_t *, unsigned long));
30582
+#define EVMS_BLOCKABLE TRUE
30583
+void * evms_cs_allocate_from_pool(evms_pool_mgmt_t *, int);
30584
+void evms_cs_deallocate_to_pool(evms_pool_mgmt_t *, void *);
30585
+void evms_cs_destroy_pool(evms_pool_mgmt_t *);
30586
+int evms_cs_add_item_to_list(evms_list_node_t **, void *);
30587
+int evms_cs_remove_item_from_list(evms_list_node_t **, void *);
30588
+int evms_cs_register_device(evms_logical_node_t *);
30589
+int evms_cs_unregister_device(evms_logical_node_t *);
30590
+int evms_cs_find_next_device(evms_logical_node_t *,
30591
+ evms_logical_node_t **);
30592
+void evms_cs_signal_event(int);
30593
+evms_thread_t * evms_cs_register_thread (
30594
+ void (*run) (void *),
30596
+ const char *name);
30597
+void evms_cs_unregister_thread (evms_thread_t *thread);
30598
+void evms_cs_wakeup_thread(evms_thread_t *thread);
30599
+void evms_cs_interrupt_thread (evms_thread_t *thread);
30600
+struct proc_dir_entry *evms_cs_get_evms_proc_dir(void);
30601
+int evms_cs_volume_request_in_progress(kdev_t, int, int *);
30604
+/* EVMS exported global variables */
30605
+extern evms_pool_mgmt_t *evms_bh_pool;
30606
+extern char *evms_primary_string;
30607
+extern char *evms_secondary_string;
30609
diff -Naur linux-2002-03-28/include/linux/evms/evms_aix.h evms-2002-03-28/include/linux/evms/evms_aix.h
30610
--- linux-2002-03-28/include/linux/evms/evms_aix.h Wed Dec 31 18:00:00 1969
30611
+++ evms-2002-03-28/include/linux/evms/evms_aix.h Wed Mar 27 19:27:56 2002
30614
+* The following structures are nested within the structures used by the
30615
+* system management routines. These structures and sizes were pulled from the AIX
30618
+#define LVM_MAXLPS 65535 /* max number of logical partitions allowed */
30619
+#define LVM_NAMESIZ 64 /* maximum size for the logical volume name */
30620
+#define LVM_NUMCOPIES 3 /* max number of copies allowed of a logical partition */
30621
+#define LVM_MAXVGS 255
30622
+#define LVM_MAXPVS 32
30623
+#define LVM_MAXLVS 256
30624
+#define AIX_MIN_BLOCK_SIZE 4096
30625
+#define VGSA_BT_PV 127
30628
+#define OFFSET_CONSTANT 144
30629
+#define SLEEP_TIME 0
30630
+#define MAXLVS_OFFSET 16
30631
+#define PHYS_VOL_OFFSET 34
30632
+#define AIX_PVHPP_LENGTH PHYS_VOL_OFFSET
30633
+#define MAX_SECTORS_NAMELIST 32
30634
+#define AIX_DEFAULT_MIRRORING 1
30635
+#define AIX_FIRST_MIRROR 2
30636
+#define AIX_MAX_MIRRORS 3 // AIX defines ALL copies as mirrors - 3 mirrors MAX - 1 orig and 2 copies
30638
+#define PSN_LVM_REC 7
30639
+#define PSN_VGSA_REC 128
30640
+#define PSN_NAMELIST_REC 2065
30641
+#define PSN_VGT_TRAILER 135
30642
+#define PSN_LVE_REC 1
30643
+#define PSN_PPH_OFFSET 17
30644
+#define PSN_PVH_INCREMENT 34
30645
+#define AIX_SECTOR_SIZE 512
30646
+#define MAX_PPENT_SECTOR 16
30647
+#define NAME_LEN 128 /* don't change!!! */
30648
+#define UUID_LEN 32 /* don't change!!! */
30649
+#define MAX_SECTORS_LV_ENTRIES 16
30650
+#define AIX_MIN_MIRROR_POOL 10
30651
+#define AIX_MIRROR_POOL_CHANGE 10
30653
+#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1)
30654
+#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1)
30655
+#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1)
30656
+#define LV_BMAP _IOWR ( 0xfe, 0x30, 1)
30658
+#define LV_ACTIVE 0x01 /* lv_status */
30659
+#define LV_SPINDOWN 0x02 /* " */
30660
+#define LV_ERROR 0x99 /* " */
30662
+#define VG_ACTIVE 0x01 /* vg_status */
30664
+#define AIX_LV_READ 0x00 /* lv_access */
30665
+#define AIX_LV_WRITE 0x01 /* " */
30666
+#define EVMS_LV_NEW 0x10 // volume was created during the current discovery pass
30667
+#define EVMS_LV_INCOMPLETE 0x20 // volume has an incomplete LE map
30668
+#define EVMS_LV_INVALID 0x40 // volume has a memory-corruption problem
30669
+#define EVMS_VG_DIRTY 0x01 // group has had a new PV added during this discovery
30670
+#define AIX_VG_INCOMPLETE 0x20 // volume group is incomplete
30673
+#define LOG_PREFIX "--AIXlvm: "
30675
+// Entries in the list of physical volumes (PV)
30676
+// in a volume group (VG)
30678
+typedef struct unique_id_s {
30685
+typedef struct _partition_list_entry {
30686
+ evms_logical_node_t * logical_node;
30687
+ u_int32_t pv_number;
30688
+ u_int32_t block_size; // bytes
30689
+ u_int32_t hard_sect_size; // bytes
30690
+ struct _partition_list_entry * next;
30692
+} partition_list_entry_t;
30694
+// Table for mapping logical extents (LE) to physical extents (PE)
30695
+typedef struct _pe_table_entry {
30696
+ partition_list_entry_t * owning_pv;
30697
+ u_int64_t pe_sector_offset;
30698
+} pe_table_entry_t;
30700
+// Logical volumes (LV) in a volume group (VG)
30701
+typedef struct _aix_logical_volume {
30702
+ u_int32_t lv_number;
30703
+ u_int64_t lv_size; // Sectors
30704
+ u_int32_t lv_access; // Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_QUIESCE
30705
+ u_int32_t lv_status; // Flags: LV_ACTIVE, LV_SPINDOWN
30706
+ u_int32_t lv_minor; // Device minor number
30707
+ u_int32_t mirror_copies; // Do we have mirroring and how many ?
30708
+ u_int32_t mirror_number; // mirror number - which copy is this ?
30709
+ u_int32_t mirror_iterations; // Which mirror should we be writing to ?
30710
+ u_int32_t stripes;
30711
+ u_int32_t stripe_size; // Sectors
30712
+ u_int32_t stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
30713
+ u_int32_t pe_size; // Sectors
30714
+ u_int32_t pe_size_shift; // Number of bits to shift right instead of dividing by pe_size
30715
+ u_int32_t num_le; // Number of entries in the le_to_pe_map
30716
+ u_int32_t new_volume; // Flag to indicate if this volume needs to be exported
30717
+ struct _aix_volume_group * group; // Pointer back to parent volume group
30718
+ unsigned char name[EVMS_VOLUME_NAME_SIZE+1]; // Dev-tree volume name (eg: /dev/group0/vol0)
30719
+ pe_table_entry_t * le_to_pe_map; // Mapping of logical to physical extents
30720
+ pe_table_entry_t * le_to_pe_map_mir1; // Mapping of logical to physical extents for mirror 1
30721
+ pe_table_entry_t * le_to_pe_map_mir2; // Mapping of logical to physical extents for mirror 2
30722
+ evms_logical_node_t * volume_node; // Pointer to the parent EVMS node representing this volume
30724
+} aix_logical_volume_t;
30726
+// Volume groups (VG)
30727
+typedef struct _aix_volume_group {
30728
+ unique_id vg_id; // volume group number */
30729
+ u_int32_t numpvs; // Number of PVs found on this VG.
30730
+ u_int32_t numlvs; // Number of LVs found on this VG.
30731
+ u_int32_t hard_sect_size; // The largest hard_sect_size and block_size
30732
+ u_int32_t block_size; // values of all partitions in this group.
30733
+ u_int32_t flags; //
30734
+ u_int32_t lv_max; // maximum logical volumes */
30735
+ u_int32_t pe_size; // physical extent size in sectors */
30736
+ partition_list_entry_t * partition_list; // List of partitions/segments/PVs that make up this VG
30737
+ u_int32_t partition_count;
30738
+ struct _aix_logical_volume ** volume_list; // Array of volumes found in this VG.
30739
+ struct _aix_volume_group * next; // Pointer to the next VG
30740
+ u_int32_t CleanVGInfo; // Do we have a clean VG Info to work with ?
30741
+ daddr_t vgda_psn; // Which VGDA we should use
30742
+ long vgda_len; // length of the volume group descriptor area */
30743
+ struct _vg_header * AIXvgh; // Pointer to valid data area on disk for the VG
30744
+} aix_volume_group_t;
30746
+typedef struct _aix_mirror_bh {
30747
+ atomic_t remaining;
30748
+ int iteration; // 'have we finished' count, used from IRQ handlers
30750
+ u_int64_t mir_sector1;
30751
+ u_int64_t mir_sector2;
30752
+ struct buffer_head *master_bh;
30753
+ struct buffer_head bh_req;
30754
+ struct _aix_mirror_bh *mirror_bh_list;
30755
+ evms_logical_node_t *node; // map to evms node (READ only)
30756
+ evms_logical_node_t *mir_node1; //
30757
+ evms_logical_node_t *mir_node2; //
30759
+ struct _aix_mirror_bh *next_r1; // next for retry or in free list
30760
+} aix_mirror_bh_t;
30762
+typedef struct _timestruc_t
30769
+typedef struct ipl_rec_area
30771
+ unsigned int IPL_record_id; /* This physical volume contains a */
30772
+ /* valid IPL record if and only if */
30773
+ /* this field contains IPLRECID */
30775
+#define IPLRECID 0xc9c2d4c1 /* Value is EBCIDIC 'IBMA' */
30777
+ char reserved1[20];
30778
+ unsigned int formatted_cap; /* Formatted capacity. The number of */
30779
+ /* sectors available after formatting*/
30780
+ /* The presence or absence of bad */
30781
+ /* blocks does not alter this value. */
30783
+ char last_head; /* THIS IS DISKETTE INFORMATION */
30784
+ /* The number of heads minus 1. Heads*/
30785
+ /* are number from 0 to last_head. */
30787
+ char last_sector; /* THIS IS DISKETTE INFORMATION */
30788
+ /* The number of sectors per track. */
30789
+ /* Sectors are numbered from 1 to */
30790
+ /* last_sector. */
30792
+ char reserved2[6];
30794
+ unsigned int boot_code_length; /* Boot code length in sectors. A 0 */
30795
+ /* value implies no boot code present*/
30797
+ unsigned int boot_code_offset; /* Boot code offset. Must be 0 if no */
30798
+ /* boot code present, else contains */
30799
+ /* byte offset from start of boot */
30800
+ /* code to first instruction. */
30802
+ unsigned int boot_lv_start; /* Contains the PSN of the start of */
30805
+ unsigned int boot_prg_start; /* Boot code start. Must be 0 if no */
30806
+ /* boot code present, else contains */
30807
+ /* the PSN of the start of boot code.*/
30809
+ unsigned int boot_lv_length; /* BLV length in sectors. */
30811
+ unsigned int boot_load_add; /* 512 byte boundary load address for*/
30814
+ char boot_frag; /* Boot code fragmentation flag. Must*/
30815
+ /* be 0 if no fragmentation allowed, */
30816
+ /* else must be 0x01. */
30818
+ char boot_emulation; /* ROS network emulation flag */
30819
+ /* 0x0 => not an emul support image */
30820
+ /* 0x1 => ROS network emulation code */
30821
+ /* 0x2 => AIX code supporting ROS emul*/
30823
+ char reserved3[2];
30825
+ ushort basecn_length; /* Number of sectors for base */
30826
+ /* customization. Normal mode. */
30828
+ ushort basecs_length; /* Number of sectors for base */
30829
+ /* customization. Service mode. */
30831
+ unsigned int basecn_start; /* Starting PSN value for base */
30832
+ /* customization. Normal mode. */
30834
+ unsigned int basecs_start; /* Starting PSN value for base */
30835
+ /* customization. Service mode. */
30837
+ char reserved4[24];
30839
+ unsigned int ser_code_length; /* Service code length in sectors. */
30840
+ /* A 0 value implies no service code */
30843
+ unsigned int ser_code_offset; /* Service code offset. Must be 0 if */
30844
+ /* no service code is present, else */
30845
+ /* contains byte offset from start of*/
30846
+ /* service code to first instruction.*/
30848
+ unsigned int ser_lv_start; /* Contains the PSN of the start of */
30851
+ unsigned int ser_prg_start; /* Service code start. Must be 0 if */
30852
+ /* service code is not present, else */
30853
+ /* contains the PSN of the start of */
30854
+ /* service code. */
30856
+ unsigned int ser_lv_length; /* SLV length in sectors. */
30858
+ unsigned int ser_load_add; /* 512 byte boundary load address for*/
30859
+ /* service code. */
30861
+ char ser_frag; /* Service code fragmentation flag. */
30862
+ /* Must be 0 if no fragmentation */
30863
+ /* allowed, else must be 0x01. */
30865
+ char ser_emulation; /* ROS network emulation flag */
30866
+ /* 0x0 => not an emul support image */
30867
+ /* 0x1 => ROS network emulation code */
30868
+ /* 0x2 => AIX code supporting ROS emul*/
30870
+ char reserved5[2];
30872
+ unique_id pv_id; /* The unique identifier for this */
30873
+ /* physical volume. */
30874
+ char dummy[512 - 128 - sizeof(unique_id)];
30875
+}AIXIPL_REC, *AIXIPL_REC_PTR;
30878
+typedef struct AIXlvm_rec_s
30879
+ /* structure which describes the physical volume LVM record */
30881
+ long lvm_id; /* LVM id field which identifies whether the PV is a member of a volume group */
30883
+#define AIX_LVM_LVMID 0x5F4C564D /* LVM id field of ASCII "_LVM" */
30885
+ unique_id vg_id; /* the id of the volume group to which this physical volume belongs */
30886
+ long lvmarea_len; /* the length of the LVM reserved area */
30887
+ long vgda_len; /* length of the volume group descriptor area */
30888
+ daddr_t vgda_psn [2]; /* the physical sector numbers of the beginning of the volume group descriptor area copies on this disk */
30889
+ daddr_t reloc_psn; /* the physical sector number of the beginning of a pool of blocks */
30890
+ /* (located at the end of the PV) which are reserved for the relocation of bad blocks */
30891
+ long reloc_len; /* the length in number of sectors of the pool of bad block relocation blocks */
30892
+ short int pv_num; /* the physical volume number within the volume group of this physical volume */
30893
+ short int pp_size; /* the size in bytes for the partition, expressed as a power of 2 (i.e., the partition size is 2 to the power pp_size) */
30894
+ long vgsa_len; /* length of the volume group status area */
30895
+ daddr_t vgsa_psn [2]; /* the physical sector numbers of the beginning of the volume group status area copies on this disk */
30896
+ short int version; /* the version number of this volume group descriptor and status area */
30898
+#define LVM_VERSION_1 1 /* first version - AIX 3.0 */
30899
+#define LVM_STRIPE_ENHANCE 2 /* version with striped lv's - AIX 4.1 */
30900
+#define LVM_1024_PPSIZE 3 /* ppsizes of 512 and 1024 */
30901
+#define LVM_GT_1016 4 /* version with support for > 1016 pps/pv */
30902
+#define LVM_MAX_VERSION LVM_GT_1016 /* max version # */
30904
+ char res1 [450]; /* reserved area */
30910
+/* II.Volume Group Descriptor Area */
30912
+typedef struct _vgsa_area
30914
+ timestruc_t b_tmstamp; /* Beginning timestamp */
30915
+ unsigned int pv_missing [(LVM_MAXPVS + (NBPI -1)) / NBPI]; /* Bit per PV */
30916
+ unsigned char stalepp [LVM_MAXPVS] [VGSA_BT_PV];
30918
+ char resv[10]; /* Padding */
30919
+ timestruc_t e_tmstamp; /* Ending timestamp */
30923
+typedef struct _vg_header
30925
+ timestruc_t vg_timestamp; /* time of last update */
30926
+ unique_id vg_id; /* unique id for volume group */
30927
+ short numlvs; /* number of lvs in vg */
30928
+ short maxlvs; /* max number of lvs allowed in vg */
30929
+ short pp_size; /* size of pps in the vg */
30930
+ short numpvs; /* number of pvs in the vg */
30931
+ short total_vgdas; /* number of copies of vg */
30932
+ /* descriptor area on disk */
30933
+ short vgda_size; /* size of volume group descriptor */
30936
+ short auto_varyon;
30941
+typedef struct _lv_entries
30943
+ short lvname; /* name of LV */
30944
+ short res1; /* reserved area */
30945
+ int maxsize; /* maximum number of partitions allowed */
30946
+ char lv_state; /* state of logical volume */
30947
+ char mirror; /* none,single, or double */
30948
+ short mirror_policy; /* type of writing used to write */
30949
+ int num_lps; /* number of logical partitions on the lv */
30951
+ char permissions; /* read write or read only */
30952
+ char bb_relocation; /* specifies if bad block */
30953
+ /* relocation is desired */
30954
+ char write_verify; /* verify all writes to the LV */
30955
+ char mirwrt_consist; /* mirror write consistency flag */
30956
+ unsigned short stripe_exp; /* stripe size in exponent value */
30957
+ unsigned short striping_width; /* stripe width */
30958
+ unsigned short lv_avoid;
30959
+ unsigned short child_minor_num;
30960
+ char res4[4]; /* reserved area on disk */
30964
+typedef struct _pv_header
30966
+ unique_id pv_id; /* unique identifier of PV */
30967
+ unsigned short pp_count; /* number of physical partitions */
30969
+ char pv_state; /* state of physical volume */
30970
+ char res1; /* reserved area on disk */
30971
+ daddr_t psn_part1; /* physical sector number of 1st pp */
30972
+ short pvnum_vgdas;/* number of vg descriptor areas */
30973
+ /* on the physical volume */
30974
+ short pv_num; /* PV number */
30975
+ long res2; /* reserved area on disk */
30979
+typedef struct _pp_entries
30981
+ short lv_index; /* index to lv pp is on */
30982
+ short res_1; /* reserved area on disk */
30983
+ long lp_num; /* log. part. number */
30984
+ char copy; /* the copy of the logical partition */
30985
+ /* that this pp is allocated for */
30986
+ char pp_state; /* current state of pp */
30987
+ char fst_alt_vol; /* pv where partition allocation for*/
30988
+ /* first mirror begins */
30989
+ char snd_alt_vol; /* pv where partition allocation for*/
30990
+ /* second mirror begins */
30991
+ short fst_alt_part; /* partition to begin first mirror */
30992
+ short snd_alt_part; /*partition to begin second mirror */
30993
+ double res_3; /* reserved area on disk */
30994
+ double res_4; /* reserved area on disk */
30997
+typedef struct _namelist
30999
+ char name[LVM_MAXLVS][LVM_NAMESIZ];
31002
+typedef struct _vg_trailer
31004
+ timestruc_t timestamp; /* time of last update */
31005
+ short concurrency;
31006
+ /* MS Nibble = concurrent capable */
31007
+ /* LS Nibble = concurrent auto-varyon */
31009
+ int res_3; /* reserved area on disk */
31010
+ double res_4; /* reserved area on disk */
31011
+ double res_5; /* reserved area on disk */
31014
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr.h evms-2002-03-28/include/linux/evms/evms_bbr.h
31015
--- linux-2002-03-28/include/linux/evms/evms_bbr.h Wed Dec 31 18:00:00 1969
31016
+++ evms-2002-03-28/include/linux/evms/evms_bbr.h Tue Mar 26 16:04:31 2002
31020
+ * Copyright (c) International Business Machines Corp., 2000
31022
+ * This program is free software; you can redistribute it and/or modify
31023
+ * it under the terms of the GNU General Public License as published by
31024
+ * the Free Software Foundation; either version 2 of the License, or
31025
+ * (at your option) any later version.
31027
+ * This program is distributed in the hope that it will be useful,
31028
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31029
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31030
+ * the GNU General Public License for more details.
31032
+ * You should have received a copy of the GNU General Public License
31033
+ * along with this program; if not, write to the Free Software
31034
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31038
+ * linux/include/linux/evms_bbr.h
31040
+ * EVMS Bad Block Relocation Feature kernel header file
31044
+#ifndef EVMS_BBR_INCLUDED
31046
+#define EVMS_BBR_INCLUDED
31048
+#define EVMS_BBR_VERSION_MAJOR 1
31049
+#define EVMS_BBR_VERSION_MINOR 0
31050
+#define EVMS_BBR_VERSION_PATCHLEVEL 0
31052
+#define EVMS_BBR_FEATURE_ID 6
31053
+#define EVMS_BBR_SIGNATURE 0x42627246 /* BbrF */
31055
+/* The following defines establish the minimum and maximum number of
31056
+ * replacement sectors which can be allocated for Bad Block Relocation.
31057
+ * Otherwise, 1 replacement sector per MB of disk space is allocated. */
31058
+#define EVMS_BBR_ENTRIES_PER_SECT 31 /* Assume sector size is 512 bytes*/
31059
+#define EVMS_BBR_LIMIT 4096
31061
+#define EVMS_BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
31063
+typedef struct evms_bbr_table_entry_s {
31064
+ u_int64_t bad_sect;
31065
+ u_int64_t replacement_sect;
31066
+} evms_bbr_table_entry_t;
31068
+typedef struct evms_bbr_table_s {
31069
+ u_int32_t signature; /* Signature for a sector of the bbr table (EVMS_BBR_TABLE_SIGNATURE) */
31070
+ u_int32_t crc; /* CRC for this sector of the BBR Table. */
31071
+ u_int32_t sequence_number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
31072
+ u_int32_t in_use_cnt; /* number of in-use entries */
31073
+ evms_bbr_table_entry_t entries[EVMS_BBR_ENTRIES_PER_SECT]; /* BBR table entries available for this sector of the BBR table */
31074
+} evms_bbr_table_t;
31076
+/* description of on disk meta data sector for bbr feature */
31077
+typedef struct evms_bbr_metadata_s {
31078
+/* 0*/ u_int32_t signature; /* EVMS_BBR_SIGNATURE */
31079
+/* 4*/ u_int32_t crc;
31080
+/* 8*/ u_int32_t block_size; /* block size in bytes */
31081
+/*12*/ u_int32_t flags; /* Global flag used by BBR */
31082
+/*16*/ u_int64_t sequence_number;
31083
+/*24*/ u_int64_t start_sect_bbr_table; /* start 64-bit LBA of the BBR table */
31084
+/*32*/ u_int64_t nr_sects_bbr_table; /* number of sectors to hold the BBR table */
31085
+/*40*/ u_int64_t start_replacement_sect; /* start 64-bit LBA of the replacement sectors */
31086
+/*48*/ u_int64_t nr_replacement_blks; /* number of replacement blocks. */
31087
+/*56*/ char pads[456]; /* padding for 512-byte sector alignment */
31088
+} evms_bbr_metadata_t;
31091
+// BBR direct ioctl commands.
31092
+#define BBR_GET_INFO_CMD 1 // Return the total number of sectors
31093
+ // that are currently remapped for the
31095
+#define BBR_STOP_REMAP_CMD 2 // Stop ... do not remap any new sectors
31096
+ // or even honor any existing remaps for
31097
+ // the bbr object until after the next
31098
+ // rediscover command is received.
31099
+#define BBR_SECTOR_IO_CMD 3 // Process an I/O from the engine directly
31100
+ // through the bbr object.
31102
+typedef struct evms_notify_bbr_s {
31103
+ char object_name[EVMS_VOLUME_NAME_SIZE+1]; // Input - Name of bbr object from feature header
31104
+ u_int64_t count; // Output - Count of remapped sectors
31105
+ u_int64_t start_sect; // Input - Starting sector for sector_io
31106
+ u_int64_t nr_sect; // Input - Number of sectors for sector_io
31107
+ unsigned long buffer; // Input - Pointer to buffer for sector_io
31108
+ int rw; // Input - READ or WRITE for sector_io
31109
+} evms_notify_bbr_t;
31114
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr_k.h evms-2002-03-28/include/linux/evms/evms_bbr_k.h
31115
--- linux-2002-03-28/include/linux/evms/evms_bbr_k.h Wed Dec 31 18:00:00 1969
31116
+++ evms-2002-03-28/include/linux/evms/evms_bbr_k.h Wed Mar 27 16:08:55 2002
31118
+#ifndef __EVMS_BBR_K__
31119
+#define __EVMS_BBR_K__
31123
+ * Copyright (c) International Business Machines Corp., 2000
31125
+ * This program is free software; you can redistribute it and/or modify
31126
+ * it under the terms of the GNU General Public License as published by
31127
+ * the Free Software Foundation; either version 2 of the License, or
31128
+ * (at your option) any later version.
31130
+ * This program is distributed in the hope that it will be useful,
31131
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31132
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31133
+ * the GNU General Public License for more details.
31135
+ * You should have received a copy of the GNU General Public License
31136
+ * along with this program; if not, write to the Free Software
31137
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31140
+/* linux/include/linux/evms/evms_bbr_k.h
31142
+ * Kernel header file for Bad Block Relocation (BBR) Feature
31144
+ * BBR feature is designed to remap I/O write failures to another safe location on disk.
31145
+ * Note that most disk drives have BBR built into them, this means that our software BBR
31146
+ * will be only activated when all hardware BBR replacement sectors have been used.
31149
+#include <linux/config.h>
31150
+#include <linux/module.h>
31151
+#include <linux/kernel.h>
31152
+#include <linux/sched.h>
31153
+#include <linux/smp_lock.h>
31154
+#include <linux/locks.h>
31155
+#include <linux/delay.h>
31156
+#include <linux/reboot.h>
31157
+#include <linux/completion.h>
31158
+#include <linux/vmalloc.h>
31159
+#include <asm/uaccess.h>
31160
+#include <linux/blk.h>
31162
+#include <linux/evms/evms_kernel.h>
31163
+#include <linux/evms/evms_bbr.h>
31165
+#define BBR_POOL_NAME_LENGTH 20
31167
+/* Required common services version */
31168
+#define EVMS_BBR_COMMON_SERVICES_MAJOR 0
31169
+#define EVMS_BBR_COMMON_SERVICES_MINOR 6
31170
+#define EVMS_BBR_COMMON_SERVICES_PATCHLEVEL 0
31173
+static int bbr_notify_reboot(
31174
+ struct notifier_block *this,
31175
+ unsigned long code,
31178
+typedef struct bbr_runtime_remap_s {
31179
+ evms_bbr_table_entry_t remap;
31180
+ struct bbr_runtime_remap_s *left; /** for binary tree */
31181
+ struct bbr_runtime_remap_s *right; /** for binary tree */
31182
+}bbr_runtime_remap_t;
31185
+/* local instance data structure definition */
31187
+#define BBR_STOP_REMAP (1<<0)
31189
+typedef struct bbr_instance_data_s {
31190
+ struct bbr_instance_data_s *next; /* link all bbr_instances */
31191
+ evms_logical_node_t *node; /* bbr_node */
31192
+ evms_logical_node_t *source; /* consumed node */
31193
+ evms_bbr_table_t *bbr_table;
31194
+ u_int64_t lba_table1;
31195
+ u_int64_t lba_table2;
31196
+ u_int64_t nr_sects_bbr_table;
31197
+ u_int64_t nr_replacement_blks;
31198
+ u_int64_t start_replacement_sect;
31199
+ u_int32_t blksize_in_sects;
31200
+ evms_pool_mgmt_t *bbr_bh_pool;
31201
+ char bh_pool_name[BBR_POOL_NAME_LENGTH+1];
31202
+ evms_pool_mgmt_t *remap_pool;
31203
+ char remap_pool_name[BBR_POOL_NAME_LENGTH+1];
31204
+ atomic_t in_use_replacement_blks;
31205
+ bbr_runtime_remap_t *remap_root; /* for binary tree */
31206
+ spinlock_t bbr_id_lock; /* lock for runtime remap table */
31208
+ evms_sector_t total_vsectors;
31209
+} bbr_instance_data_t;
31211
+#define BBR_BH_USE_EVMS_CALLBACK (1<<0) // Set if an EVMS callback was registered for this I/O
31213
+typedef struct bbr_bh_s {
31214
+ struct bbr_bh_s *next; // Used by bbr_io_list.
31215
+ bbr_instance_data_t *BBRID; // Object for this request.
31216
+ eio_t eio; // Original eio.
31217
+ atomic_t waiters; // Used by bbr_init_io.
31218
+ int rw; // READ or WRITE
31219
+ int rc; // Return code from bbr_io_handler.
31220
+ unsigned long flag;
31224
+/* --- discovery support functions --- */
31225
+static int load_feature_data(
31226
+ evms_logical_node_t *node,
31227
+ bbr_instance_data_t **ID);
31229
+static int load_meta_data(
31230
+ evms_logical_node_t *node,
31231
+ evms_sector_t LSN,
31232
+ evms_bbr_metadata_t **md,
31233
+ evms_bbr_table_t **bbr_table);
31235
+static int validate_meta_data(evms_bbr_metadata_t *md);
31236
+static int validate_bbr_table_sector(evms_bbr_table_t *p);
31237
+static u_int32_t validate_bbr_table(
31238
+ evms_bbr_metadata_t *md,
31239
+ evms_bbr_table_t *p);
31240
+static u_int32_t validate_bbr_tables(
31241
+ evms_logical_node_t *node,
31242
+ evms_bbr_metadata_t *MD1,
31243
+ evms_bbr_metadata_t *MD2,
31244
+ evms_bbr_table_t *p1,
31245
+ evms_bbr_table_t *p2);
31246
+void update_invalid_bbr_table_sector(
31247
+ evms_logical_node_t *node,
31248
+ evms_bbr_table_t *valid,
31249
+ evms_bbr_table_t *invalid,
31250
+ evms_sector_t LSN);
31252
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID);
31254
+static int bbr_create_pools(bbr_instance_data_t *BBRID);
31255
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID);
31257
+#ifdef EVMS_BBR_DEBUG
31258
+static void print_meta_data(evms_bbr_metadata_t *md);
31259
+static void print_bbr_table_sector(evms_bbr_table_t *bbr_table);
31260
+static void print_remap_list(bbr_instance_data_t *BBRID);
31261
+#define BBR_DEBUG_PRINT_META_DATA(md) print_meta_data(md)
31262
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) print_bbr_table_sector(table)
31263
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID) print_remap_list(BBRID)
31265
+#define BBR_DEBUG_PRINT_META_DATA(md)
31266
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table)
31267
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID)
31270
+#define BBR_BUG(msg) LOG_SERIOUS(__FUNCTION__ msg "\n")
31272
+/* -- Mapping functions -- */
31273
+void bbr_binary_tree_insert(
31274
+ bbr_runtime_remap_t **node,
31275
+ bbr_runtime_remap_t *newnode);
31276
+bbr_runtime_remap_t * bbr_binary_search(
31277
+ bbr_runtime_remap_t *node,
31278
+ evms_sector_t bad_sect);
31279
+static int bbr_insert_remap_entry(
31280
+ bbr_instance_data_t *BBRID,
31281
+ evms_bbr_table_entry_t *new_bbr_entry);
31282
+static evms_bbr_table_entry_t * bbr_search_remap_entry(
31283
+ bbr_instance_data_t *BBRID,
31284
+ evms_sector_t sect);
31285
+static inline int bbr_remap(
31286
+ bbr_instance_data_t *BBRID,
31287
+ evms_sector_t *lsn);
31288
+static void bbr_free_remap(bbr_instance_data_t *BBRID);
31289
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID);
31290
+static inline void bbr_list_add(bbr_instance_data_t *BBRID);
31291
+static void bbr_list_remove(bbr_instance_data_t *BBRID);
31292
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name);
31294
+/* --- runtime support functions --- */
31295
+static bbr_bh_t * allocate_bbr_bh(
31296
+ bbr_instance_data_t *BBRID,
31298
+static void bbr_io_handler( void * void_data );
31300
+/* -- EVMS Plugin interface functions -- */
31301
+static int bbr_discover(evms_logical_node_t **);
31302
+static int bbr_delete(evms_logical_node_t *);
31303
+static void bbr_read(evms_logical_node_t *, eio_t *);
31304
+static void bbr_write(evms_logical_node_t *, eio_t *);
31305
+static int bbr_ioctl (
31306
+ evms_logical_node_t *bbr_node,
31307
+ struct inode *inode,
31308
+ struct file *file,
31309
+ unsigned int cmd,
31310
+ unsigned long arg);
31311
+static int bbr_direct_ioctl (
31312
+ struct inode *inode,
31313
+ struct file *file,
31314
+ unsigned int cmd,
31315
+ unsigned long arg);
31317
+static int bbr_init_io(
31318
+ evms_logical_node_t * bbr_node,
31320
+ evms_sector_t startLSN,
31321
+ evms_sector_t nr_sects,
31325
diff -Naur linux-2002-03-28/include/linux/evms/evms_common.h evms-2002-03-28/include/linux/evms/evms_common.h
31326
--- linux-2002-03-28/include/linux/evms/evms_common.h Wed Dec 31 18:00:00 1969
31327
+++ evms-2002-03-28/include/linux/evms/evms_common.h Wed Mar 27 15:51:36 2002
31329
+/* -*- linux-c -*- */
31332
+ * Copyright (c) International Business Machines Corp., 2000
31334
+ * This program is free software; you can redistribute it and/or modify
31335
+ * it under the terms of the GNU General Public License as published by
31336
+ * the Free Software Foundation; either version 2 of the License, or
31337
+ * (at your option) any later version.
31339
+ * This program is distributed in the hope that it will be useful,
31340
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31341
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31342
+ * the GNU General Public License for more details.
31344
+ * You should have received a copy of the GNU General Public License
31345
+ * along with this program; if not, write to the Free Software
31346
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31349
+ * linux/include/linux/evms/evms_common.h
31351
+ * EVMS common (kernel and user) header file
31355
+#ifndef __EVMS_COMMON_INCLUDED__
31356
+#define __EVMS_COMMON_INCLUDED__
31358
+/* version info */
31359
+#define EVMS_MAJOR 63 /* use experimental major 63 for now */
31360
+#define EVMS_MAJOR_VERSION 1
31361
+#define EVMS_MINOR_VERSION 0
31362
+#define EVMS_PATCHLEVEL_VERSION 0
31364
+#define MAX_EVMS_VOLUMES 256 /* There are 256 minors */
31365
+#define EVMS_VOLUME_NAME_SIZE 127
31367
+#define IBM_OEM_ID 8112 // could be anything, but used
31368
+ // I=8, B=1, M=12
31369
+// this one going away as well.
31370
+#define EVMS_OEM_IBM IBM_OEM_ID
31372
+#define EVMS_INITIAL_CRC 0xFFFFFFFF
31373
+#define EVMS_MAGIC_CRC 0x31415926
31375
+#define EVMS_VSECTOR_SIZE 512
31376
+#define EVMS_VSECTOR_SIZE_SHIFT 9
31378
+#define DEV_PATH "/dev"
31379
+#define EVMS_DIR_NAME "evms"
31380
+#define EVMS_DEV_NAME "block_device"
31381
+#define EVMS_DEV_NODE_PATH DEV_PATH "/" EVMS_DIR_NAME "/"
31382
+#define EVMS_DEVICE_NAME DEV_PATH "/" EVMS_DIR_NAME "/" EVMS_DEV_NAME
31384
+/* EVMS will always use 64-bit fields */
31385
+typedef u_int64_t evms_sector_t;
31387
+typedef struct evms_version_s {
31388
+ /* major changes when incompatible differences are introduced */
31390
+ /* minor changes when additions are made */
31392
+ /* patchlevel changes when bugs are fixed */
31393
+ u_int32_t patchlevel;
31396
+typedef enum evms_plugin_code_s {
31397
+ EVMS_NO_PLUGIN, // 0
31398
+ EVMS_DEVICE_MANAGER, // 1
31399
+ EVMS_SEGMENT_MANAGER, // 2
31400
+ EVMS_REGION_MANAGER, // 3
31401
+ EVMS_FEATURE, // 4
31402
+ EVMS_ASSOCIATIVE_FEATURE, // 5
31403
+ EVMS_FILESYSTEM_INTERFACE_MODULE, // 6
31404
+ EVMS_CLUSTER_MANAGER_INTERFACE_MODULE, // 7
31405
+ EVMS_DISTRIBUTED_LOCK_MANAGER_INTERFACE_MODULE // 8
31406
+} evms_plugin_code_t;
31408
+#define SetPluginID(oem, type, id) ((oem << 16) | (type << 12) | id)
31409
+#define GetPluginOEM(pluginid) (pluginid >> 16)
31410
+#define GetPluginType(pluginid) ((pluginid >> 12) & 0xf)
31411
+#define GetPluginID(pluginid) (pluginid & 0xfff)
31413
+/* bit definitions for the flags field in
31414
+ * the EVMS LOGICAL NODE (kernel) and
31415
+ * the EVMS LOGICAL VOLUME (user) structures.
31417
+#define EVMS_FLAGS_WIDTH 32
31418
+#define EVMS_VOLUME_FLAG (1<<0)
31419
+#define EVMS_VOLUME_PARTIAL_FLAG (1<<1)
31420
+#define EVMS_VOLUME_PARTIAL (1<<1)
31421
+#define EVMS_VOLUME_SET_READ_ONLY (1<<2)
31422
+#define EVMS_VOLUME_READ_ONLY (1<<2)
31423
+/* queued flags bits */
31424
+#define EVMS_REQUESTED_DELETE (1<<5)
31425
+#define EVMS_REQUESTED_QUIESCE (1<<6)
31426
+#define EVMS_REQUESTED_VFS_QUIESCE (1<<7)
31427
+/* this bit indicates corruption */
31428
+#define EVMS_VOLUME_CORRUPT (1<<8)
31429
+/* these bits define the source of the corruption */
31430
+#define EVMS_VOLUME_SOFT_DELETED (1<<9)
31431
+#define EVMS_VOLUME_GENDISK_GONE (1<<10)
31432
+/* these bits define volume status */
31433
+#define EVMS_MEDIA_CHANGED (1<<20)
31434
+#define EVMS_DEVICE_UNPLUGGED (1<<21)
31435
+/* these bits used for removable status */
31436
+#define EVMS_DEVICE_MEDIA_PRESENT (1<<24)
31437
+#define EVMS_DEVICE_PRESENT (1<<25)
31438
+#define EVMS_DEVICE_LOCKABLE (1<<26)
31439
+#define EVMS_DEVICE_REMOVABLE (1<<27)
31441
+/* version info for evms_feature_header_t */
31442
+#define EVMS_FEATURE_HEADER_MAJOR 3
31443
+#define EVMS_FEATURE_HEADER_MINOR 0
31444
+#define EVMS_FEATURE_HEADER_PATCHLEVEL 0
31446
+/* bit definitions of FEATURE HEADER bits in the FLAGS field */
31447
+#define EVMS_FEATURE_ACTIVE (1<<0)
31448
+#define EVMS_FEATURE_VOLUME_COMPLETE (1<<1)
31449
+/* bit definitions for VOLUME bits in the FLAGS field */
31450
+#define EVMS_VOLUME_DATA_OBJECT (1<<16)
31451
+#define EVMS_VOLUME_DATA_STOP (1<<17)
31453
+#define EVMS_FEATURE_HEADER_SIGNATURE 0x54414546 //FEAT
31454
+typedef struct evms_feature_header_s {
31455
+/* 0*/ u_int32_t signature;
31456
+/* 4*/ u_int32_t crc;
31457
+/* 8*/ evms_version_t version; /* structure version */
31458
+/* 20*/ evms_version_t engine_version; /* version of the Engine that */
31459
+ /* wrote this feature header */
31460
+/* 32*/ u_int32_t flags;
31461
+/* 36*/ u_int32_t feature_id;
31462
+/* 40*/ u_int64_t sequence_number;
31463
+/* 48*/ u_int64_t alignment_padding;
31464
+ //required: starting lsn to 1st copy of feature's metadata.
31465
+/* 56*/ evms_sector_t feature_data1_start_lsn;
31466
+/* 64*/ evms_sector_t feature_data1_size; //in 512 byte units
31467
+ //optional: starting lsn to 2nd copy of feature's metadata.
31468
+ // if unused set size field to 0.
31469
+/* 72*/ evms_sector_t feature_data2_start_lsn;
31470
+/* 80*/ evms_sector_t feature_data2_size; //in 512 byte units
31471
+/* 88*/ u_int64_t volume_serial_number;
31472
+/* 96*/ u_int32_t volume_system_id; /* the minor is stored here */
31473
+/*100*/ u_int32_t object_depth; /* depth of object in the volume tree */
31474
+/*104*/ char object_name[EVMS_VOLUME_NAME_SIZE+1];
31475
+/*232*/ char volume_name[EVMS_VOLUME_NAME_SIZE+1];
31476
+/*360*/ unsigned char pad[152];
31478
+} evms_feature_header_t;
31480
+/* EVMS specific error codes */
31481
+#define EVMS_FEATURE_FATAL_ERROR 257
31482
+#define EVMS_VOLUME_FATAL_ERROR 258
31484
+#define EVMS_FEATURE_INCOMPLETE_ERROR 259
31487
diff -Naur linux-2002-03-28/include/linux/evms/evms_drivelink.h evms-2002-03-28/include/linux/evms/evms_drivelink.h
31488
--- linux-2002-03-28/include/linux/evms/evms_drivelink.h Wed Dec 31 18:00:00 1969
31489
+++ evms-2002-03-28/include/linux/evms/evms_drivelink.h Wed Dec 12 09:37:43 2001
31491
+/* -*- linux-c -*- */
31494
+ * Copyright (c) International Business Machines Corp., 2000
31496
+ * This program is free software; you can redistribute it and/or modify
31497
+ * it under the terms of the GNU General Public License as published by
31498
+ * the Free Software Foundation; either version 2 of the License, or
31499
+ * (at your option) any later version.
31501
+ * This program is distributed in the hope that it will be useful,
31502
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31503
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31504
+ * the GNU General Public License for more details.
31506
+ * You should have received a copy of the GNU General Public License
31507
+ * along with this program; if not, write to the Free Software
31508
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31511
+ * linux/include/linux/evms_drvlink.h
31513
+ * EVMS DriveLink Feature kernel header file
31517
+#ifndef __EVMS_DRIVELINK_INCLUDED__
31518
+#define __EVMS_DRIVELINK_INCLUDED__
31520
+#define EVMS_DRIVELINK_VERSION_MAJOR 2
31521
+#define EVMS_DRIVELINK_VERSION_MINOR 0
31522
+#define EVMS_DRIVELINK_VERSION_PATCHLEVEL 0
31524
+#define EVMS_DRIVELINK_FEATURE_ID 1
31525
+#define EVMS_DRIVELINK_SIGNATURE 0x4C767244 //DrvL
31526
+#define EVMS_DRIVELINK_MAX_ENTRIES 60
31528
+// description of on disk meta data sector for drivelink feature
31530
+typedef struct evms_dl_ordering_table_entry_s {
31531
+ u_int64_t child_serial_number;
31532
+ evms_sector_t child_vsize;
31533
+} evms_dl_ordering_table_entry_t;
31535
+typedef struct evms_drivelink_metadata_s {
31536
+/* 0*/ u_int32_t signature;
31537
+/* 4*/ u_int32_t crc;
31538
+/* 8*/ evms_version_t version;
31539
+/* 20*/ u_int32_t flags;
31540
+/* 24*/ u_int64_t sequence_number;
31541
+/* 32*/ u_int64_t child_serial_number;
31542
+/* 40*/ u_int64_t parent_serial_number;
31543
+/* 48*/ u_int64_t child_count;
31544
+/* 56*/ u_int64_t pad;
31545
+/* 64*/ evms_dl_ordering_table_entry_t ordering_table[EVMS_DRIVELINK_MAX_ENTRIES];
31547
+} evms_drivelink_metadata_t;
31550
+// description of in memory meta data for drivelink feature
31551
+typedef struct evms_drivelink_runtime_entry_s {
31552
+ u_int64_t block_size;
31553
+ evms_sector_t voffset;
31554
+ evms_sector_t vsize;
31555
+ evms_logical_node_t *child_node;
31556
+ evms_drivelink_metadata_t *child_metadata;
31557
+} evms_drivelink_runtime_entry_t;
31559
+typedef struct evms_drivelink_runtime_data_s {
31560
+ u_int64_t block_size;
31561
+ // keep the fields below this point in order
31562
+ u_int64_t parent_serial_number;
31563
+ u_int64_t child_count;
31564
+ evms_drivelink_runtime_entry_t *child_table;
31565
+} evms_drivelink_runtime_data_t;
31569
diff -Naur linux-2002-03-28/include/linux/evms/evms_ecr.h evms-2002-03-28/include/linux/evms/evms_ecr.h
31570
--- linux-2002-03-28/include/linux/evms/evms_ecr.h Wed Dec 31 18:00:00 1969
31571
+++ evms-2002-03-28/include/linux/evms/evms_ecr.h Wed Nov 7 14:32:21 2001
31575
+ * Copyright (c) International Business Machines Corp., 2000
31577
+ * This program is free software; you can redistribute it and/or modify
31578
+ * it under the terms of the GNU General Public License as published by
31579
+ * the Free Software Foundation; either version 2 of the License, or
31580
+ * (at your option) any later version.
31582
+ * This program is distributed in the hope that it will be useful,
31583
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31584
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31585
+ * the GNU General Public License for more details.
31587
+ * You should have received a copy of the GNU General Public License
31588
+ * along with this program; if not, write to the Free Software
31589
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31593
+ * linux/include/linux/evms_ecr.h
31595
+ * EVMS Cluster enablement kernel header file
31599
+#ifndef __EVMS_ECR__
31601
+#define __EVMS_ECR__
31603
+#define ECR_SUCCESS 0
31604
+#define ECR_FAIL -1
31607
+ * Beginning of group messaging API
31609
+typedef int ecr_group_t;
31610
+typedef int ecr_nodeid_t;
31611
+typedef void ecr_cred_t;
31612
+typedef void ecr_instance_t;
31613
+typedef void ecr_message_t;
31615
+typedef enum ecr_type_s {
31616
+ ECR_GROUP_START, /* 0th entry is reserved */
31617
+ ECR_P2P, /* Point to Point message type */
31618
+ ECR_BROADCAST, /* Broadcast message type */
31619
+ ECR_ATOMIC_EXECUTE, /* Atomic execute type */
31620
+ ECR_GROUP_LAST /* Just a last enum type, not a message type */
31623
+typedef struct ecr_table_s {
31624
+ void (*join) (ecr_nodeid_t, uint, ecr_nodeid_t *, ecr_instance_t *);
31625
+ int (*can_join)(ecr_nodeid_t, ecr_cred_t *, size_t, ecr_instance_t *);
31626
+ void (*leave) (ecr_nodeid_t, ecr_instance_t *);
31627
+ void (*recover)(ecr_nodeid_t, ecr_instance_t *);
31628
+ void (*message)(ecr_message_t *, ecr_type_t, ecr_nodeid_t,
31629
+ void *, size_t, ecr_instance_t *);
31630
+ void (*vol_leave)(ecr_nodeid_t, ecr_instance_t *);
31634
+#define ECR_GROUPNAME_MAX_SIZE NAME_SIZE /* maximum size of a group name */
31636
+ecr_group_t ecr_group_join(char *, ecr_table_t *, ecr_cred_t *, size_t,
31637
+ ecr_instance_t *);
31638
+void ecr_group_leave(ecr_group_t);
31639
+int ecr_group_send(ecr_group_t, ecr_nodeid_t, void *, size_t,
31640
+ ecr_instance_t *,
31641
+ void callback(int, ecr_instance_t *));
31642
+int ecr_group_send_wait(ecr_group_t, ecr_nodeid_t, void *, size_t,
31644
+int ecr_group_broadcast(ecr_group_t, void *, size_t, ecr_instance_t *,
31645
+ void callback(u_char, ecr_instance_t *));
31646
+int ecr_group_broadcast_wait(ecr_group_t, void *, size_t, u_char *);
31647
+int ecr_group_atomic_execute(ecr_group_t, void *, size_t,
31648
+ ecr_instance_t *,
31649
+ void callback(ecr_instance_t *));
31650
+int ecr_group_atomic_execute_wait(ecr_group_t, void *, size_t);
31651
+void ecr_group_success_response(ecr_message_t *);
31652
+void ecr_group_failure_response(ecr_message_t *, int);
31657
+ * Beginning of distributed lock API
31660
+typedef int ecr_lock_t;
31661
+typedef enum ecr_lock_mode_s {
31662
+ ECR_LOCK_START, /* 0th entry is reserved */
31663
+ ECR_LOCK_CONCURRENT, /* concurrent access */
31664
+ ECR_LOCK_EXCLUSIVE, /* exclusive access */
31665
+ ECR_LOCK_LAST /* Just a last enum type, not a lock type */
31666
+} ecr_lock_mode_t;
31668
+typedef u_char ecr_mode_t;
31671
+#define ECR_LOCKNAME_MAX_SIZE NAME_SIZE /* maximum size of a lock name */
31672
+#define ECR_BLOCK 1 /* waitflag set */
31674
+ecr_lock_t ecr_lock_create(char * /* lock name */);
31675
+int ecr_lock(ecr_lock_t, u_int64_t, u_int64_t, ecr_lock_mode_t,
31676
+ u_char /*waitflag*/);
31677
+int ecr_unlock(ecr_lock_t, u_int64_t, u_int64_t);
31679
+#endif /* __EVMS_ECR__ */
31680
diff -Naur linux-2002-03-28/include/linux/evms/evms_ioctl.h evms-2002-03-28/include/linux/evms/evms_ioctl.h
31681
--- linux-2002-03-28/include/linux/evms/evms_ioctl.h Wed Dec 31 18:00:00 1969
31682
+++ evms-2002-03-28/include/linux/evms/evms_ioctl.h Thu Mar 21 14:08:50 2002
31684
+/* -*- linux-c -*- */
31687
+ * Copyright (c) International Business Machines Corp., 2000
31689
+ * This program is free software; you can redistribute it and/or modify
31690
+ * it under the terms of the GNU General Public License as published by
31691
+ * the Free Software Foundation; either version 2 of the License, or
31692
+ * (at your option) any later version.
31694
+ * This program is distributed in the hope that it will be useful,
31695
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31696
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31697
+ * the GNU General Public License for more details.
31699
+ * You should have received a copy of the GNU General Public License
31700
+ * along with this program; if not, write to the Free Software
31701
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31704
+ * linux/include/linux/evms.h
31706
+ * EVMS public kernel header file
31710
+#ifndef __EVMS_IOCTL_INCLUDED__
31711
+#define __EVMS_IOCTL_INCLUDED__
31713
+#include <linux/hdreg.h>
31715
+/* IOCTL interface version definitions */
31716
+#define EVMS_IOCTL_INTERFACE_MAJOR 10
31717
+#define EVMS_IOCTL_INTERFACE_MINOR 0
31718
+#define EVMS_IOCTL_INTERFACE_PATCHLEVEL 0
31720
+/* IOCTL definitions */
31721
+typedef enum evms_ioctl_cmds_s {
31722
+ /* version commands */
31723
+ EVMS_GET_IOCTL_VERSION_NUMBER = 0,
31724
+ EVMS_GET_VERSION_NUMBER,
31726
+ /* EVMS internal commands */
31727
+ EVMS_GET_DISK_LIST_NUMBER = 0x40,
31728
+ EVMS_CHECK_MEDIA_CHANGE_NUMBER,
31729
+ EVMS_REVALIDATE_DISK_NUMBER,
31730
+ EVMS_OPEN_VOLUME_NUMBER,
31731
+ EVMS_CLOSE_VOLUME_NUMBER,
31732
+ EVMS_QUIESCE_VOLUME_NUMBER,
31734
+ /* configuration commands */
31735
+ EVMS_GET_INFO_LEVEL_NUMBER = 0x80,
31736
+ EVMS_SET_INFO_LEVEL_NUMBER,
31737
+ EVMS_REDISCOVER_VOLUMES_NUMBER,
31738
+ EVMS_DELETE_VOLUME_NUMBER,
31739
+ EVMS_PLUGIN_IOCTL_NUMBER,
31740
+ EVMS_PROCESS_NOTIFY_EVENT_NUMBER,
31741
+ /* query info commands */
31742
+ EVMS_GET_LOGICAL_DISK_NUMBER = 0xC0,
31743
+ EVMS_GET_LOGICAL_DISK_INFO_NUMBER,
31744
+ EVMS_SECTOR_IO_NUMBER,
31745
+ EVMS_GET_MINOR_NUMBER,
31746
+ EVMS_GET_VOLUME_DATA_NUMBER,
31747
+ EVMS_GET_PLUGIN_NUMBER,
31748
+ EVMS_COMPUTE_CSUM_NUMBER,
31749
+ EVMS_GET_BMAP_NUMBER,
31750
+} evms_ioctl_cmds_t;
31752
+/* version commands */
31753
+#define EVMS_GET_IOCTL_VERSION_STRING "EVMS_GET_IOCTL_VERSION"
31754
+#define EVMS_GET_IOCTL_VERSION _IOR(EVMS_MAJOR, EVMS_GET_IOCTL_VERSION_NUMBER, evms_version_t)
31756
+#define EVMS_GET_VERSION_STRING "EVMS_GET_VERSION"
31757
+#define EVMS_GET_VERSION _IOR(EVMS_MAJOR, EVMS_GET_VERSION_NUMBER, evms_version_t)
31761
+/* EVMS internal commands */
31762
+#define EVMS_GET_DISK_LIST_STRING "EVMS_GET_DISK_LIST"
31763
+#define EVMS_GET_DISK_LIST _IOWR(EVMS_MAJOR, EVMS_GET_DISK_LIST_NUMBER, evms_list_node_t **)
31765
+#define EVMS_CHECK_MEDIA_CHANGE_STRING "EVMS_CHECK_MEDIA_CHANGE"
31766
+#define EVMS_CHECK_MEDIA_CHANGE _IO(EVMS_MAJOR, EVMS_CHECK_MEDIA_CHANGE_NUMBER)
31768
+#define EVMS_REVALIDATE_DISK_STRING "EVMS_REVALIDATE_DISK"
31769
+#define EVMS_REVALIDATE_DISK _IO(EVMS_MAJOR, EVMS_REVALIDATE_DISK_NUMBER)
31771
+#define EVMS_OPEN_VOLUME_STRING "EVMS_OPEN_VOLUME"
31772
+#define EVMS_OPEN_VOLUME _IO(EVMS_MAJOR, EVMS_OPEN_VOLUME_NUMBER)
31774
+#define EVMS_CLOSE_VOLUME_STRING "EVMS_CLOSE_VOLUME"
31775
+#define EVMS_CLOSE_VOLUME _IO(EVMS_MAJOR, EVMS_CLOSE_VOLUME_NUMBER)
31777
+/* field: command: defines */
31778
+#define EVMS_UNQUIESCE 0
31779
+#define EVMS_QUIESCE 1
31781
+/* field: do_vfs: defines */
31782
+/* see evms_delete_volume */
31783
+typedef struct evms_quiesce_volume_s {
31784
+ int command; /* 0 = unquiesce, 1 = quiesce */
31785
+ int minor; /* minor device number of target volume */
31786
+ int do_vfs; /* 0 = do nothing, 1 = also perform equivalent VFS operation */
31787
+ int status; /* 0 = success */
31788
+} evms_quiesce_volume_t;
31790
+#define EVMS_QUIESCE_VOLUME_STRING "EVMS_QUIESCE_VOLUME"
31791
+#define EVMS_QUIESCE_VOLUME _IOR(EVMS_MAJOR, EVMS_QUIESCE_VOLUME_NUMBER, evms_quiesce_volume_t)
31795
+/* configuration commands */
31796
+#define EVMS_GET_INFO_LEVEL_STRING "EVMS_GET_INFO_LEVEL"
31797
+#define EVMS_GET_INFO_LEVEL _IOR(EVMS_MAJOR, EVMS_GET_INFO_LEVEL_NUMBER, int)
31799
+#define EVMS_SET_INFO_LEVEL_STRING "EVMS_SET_INFO_LEVEL"
31800
+#define EVMS_SET_INFO_LEVEL _IOW(EVMS_MAJOR, EVMS_SET_INFO_LEVEL_NUMBER, int)
31802
+/* field: drive_count: defines */
31803
+#define REDISCOVER_ALL_DEVICES 0xFFFFFFFF
31804
+typedef struct evms_rediscover_s {
31806
+ unsigned int drive_count; /* 0xffffffff = rediscover all known disks */
31807
+ unsigned long *drive_array;
31808
+} evms_rediscover_t;
31810
+#define EVMS_REDISCOVER_VOLUMES_STRING "EVMS_REDISCOVER_VOLUMES"
31811
+#define EVMS_REDISCOVER_VOLUMES _IOWR(EVMS_MAJOR, EVMS_REDISCOVER_VOLUMES_NUMBER, evms_rediscover_t)
31813
+/* field: command: defines */
31814
+#define EVMS_SOFT_DELETE 0
31815
+#define EVMS_HARD_DELETE 1
31817
+/* field: do_vfs: defines */
31818
+#define EVMS_VFS_DO_NOTHING 0
31819
+#define EVMS_VFS_DO 1
31820
+typedef struct evms_delete_volume_s {
31821
+ int command; /* 0 = "temp", 1 = "permanent" */
31822
+ int minor; /* minor device number of target volume */
31823
+ int do_vfs; /* 0 = do nothing, 1 = perform VFS operations */
31824
+ int associative_minor; /* optional minor of associative volume */
31825
+ /* must be 0 when not in use */
31826
+ int status; /* 0 = success, other is error */
31827
+} evms_delete_volume_t;
31829
+#define EVMS_DELETE_VOLUME_STRING "EVMS_DELETE_VOLUME"
31830
+#define EVMS_DELETE_VOLUME _IOR(EVMS_MAJOR, EVMS_DELETE_VOLUME_NUMBER, evms_delete_volume_t)
31832
+typedef struct evms_plugin_ioctl_s {
31833
+ unsigned long feature_id; /* ID of feature to receive this ioctl */
31834
+ int feature_command; /* feature specific ioctl command */
31835
+ int status; /* 0 = completed, non-0 = error */
31836
+ void *feature_ioctl_data; /* ptr to feature specific struct */
31837
+} evms_plugin_ioctl_t;
31839
+#define EVMS_PLUGIN_IOCTL_STRING "EVMS_PLUGIN_IOCTL"
31840
+#define EVMS_PLUGIN_IOCTL _IOR(EVMS_MAJOR, EVMS_PLUGIN_IOCTL_NUMBER, evms_plugin_ioctl_t)
31842
+/* field: eventid: defines */
31843
+#define EVMS_EVENT_END_OF_DISCOVERY 0
31844
+typedef struct evms_event_s {
31845
+ int pid; /* PID to act on */
31846
+ int eventid; /* event id to respond to */
31847
+ int signo; /* signal # to send when event occurs */
31850
+/* field: command: defines */
31851
+#define EVMS_EVENT_UNREGISTER 0
31852
+#define EVMS_EVENT_REGISTER 1
31853
+typedef struct evms_notify_s {
31854
+ int command; /* 0 = unregister, 1 = register */
31855
+ evms_event_t eventry; /* event structure */
31856
+ int status; /* return status */
31859
+#define EVMS_PROCESS_NOTIFY_EVENT_STRING "EVMS_PROCESS_NOTIFY_EVENT"
31860
+#define EVMS_PROCESS_NOTIFY_EVENT _IOWR(EVMS_MAJOR, EVMS_PROCESS_NOTIFY_EVENT_NUMBER, evms_notify_t)
31862
+/* query info commands */
31864
+/* field: command: defines */
31865
+#define EVMS_FIRST_DISK 0
31866
+#define EVMS_NEXT_DISK 1
31868
+/* field: status: defines */
31869
+#define EVMS_DISK_INVALID 0
31870
+#define EVMS_DISK_VALID 1
31871
+typedef struct evms_user_disk_s {
31872
+ int command; /* 0 = first disk, 1 = next disk */
31873
+ int status; /* 0 = no more disks, 1 = valid disk info */
31874
+ unsigned long disk_handle; /* only valid when status == 1 */
31875
+} evms_user_disk_t;
31877
+#define EVMS_GET_LOGICAL_DISK_STRING "EVMS_GET_LOGICAL_DISK"
31878
+#define EVMS_GET_LOGICAL_DISK _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_NUMBER, evms_user_disk_t)
31880
+/* flags fields described in evms_common.h */
31881
+typedef struct evms_user_disk_info_s {
31882
+ unsigned int status;
31883
+ unsigned int flags;
31884
+ unsigned long disk_handle;
31885
+ unsigned int disk_dev;
31886
+ struct hd_geometry geometry;
31887
+ unsigned int block_size;
31888
+ unsigned int hardsect_size;
31889
+ u_int64_t total_sectors;
31890
+ char disk_name[EVMS_VOLUME_NAME_SIZE];
31891
+} evms_user_disk_info_t;
31893
+#define EVMS_GET_LOGICAL_DISK_INFO_STRING "EVMS_GET_LOGICAL_DISK_INFO"
31894
+#define EVMS_GET_LOGICAL_DISK_INFO _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_INFO_NUMBER, evms_user_disk_info_t)
31896
+/* field: io_flag: defines */
31897
+#define EVMS_SECTOR_IO_READ 0
31898
+#define EVMS_SECTOR_IO_WRITE 1
31899
+typedef struct evms_sector_io_s {
31900
+ unsigned long disk_handle; /* valid disk handle */
31901
+ int io_flag; /* 0 = READ, 1 = WRITE */
31902
+ evms_sector_t starting_sector; /* disk relative LBA */
31903
+ evms_sector_t sector_count; /* number of sectors in IO */
31904
+ unsigned char *buffer_address; /* IO address */
31905
+ int status; /* 0 = success, not 0 = error */
31906
+} evms_sector_io_t;
31908
+#define EVMS_SECTOR_IO_STRING "EVMS_SECTOR_IO"
31909
+#define EVMS_SECTOR_IO _IOWR(EVMS_MAJOR, EVMS_SECTOR_IO_NUMBER, evms_sector_io_t)
31911
+/* field: command: defines */
31912
+#define EVMS_FIRST_VOLUME 0
31913
+#define EVMS_NEXT_VOLUME 1
31915
+/* field: status: defines */
31916
+#define EVMS_VOLUME_INVALID 0
31917
+#define EVMS_VOLUME_VALID 1
31918
+typedef struct evms_user_minor_s {
31919
+ int command; /* 0 = first volume, 1 = next volume */
31920
+ int status; /* 0 = no more, 1 = valid info */
31921
+ int minor; /* only valid when status == 1 */
31922
+} evms_user_minor_t;
31924
+#define EVMS_GET_MINOR_STRING "EVMS_GET_MINOR"
31925
+#define EVMS_GET_MINOR _IOWR(EVMS_MAJOR, EVMS_GET_MINOR_NUMBER, evms_user_minor_t)
31927
+/* flags field described in evms_common.h */
31928
+typedef struct evms_volume_data_s {
31929
+ int minor; /* minor of target volume */
31931
+ char volume_name[EVMS_VOLUME_NAME_SIZE + 1];
31933
+} evms_volume_data_t;
31935
+#define EVMS_GET_VOLUME_DATA_STRING "EVMS_GET_VOLUME_DATA"
31936
+#define EVMS_GET_VOLUME_DATA _IOWR(EVMS_MAJOR, EVMS_GET_VOLUME_DATA_NUMBER, evms_volume_data_t)
31938
+/* field: command: defines */
31939
+#define EVMS_FIRST_PLUGIN 0
31940
+#define EVMS_NEXT_PLUGIN 1
31942
+/* field: status: defines */
31943
+#define EVMS_PLUGIN_INVALID 0
31944
+#define EVMS_PLUGIN_VALID 1
31945
+typedef struct evms_kernel_plugin_s {
31946
+ int command; /* 0 = first item, 1 = next item */
31947
+ u_int32_t id; /* returned plugin id */
31948
+ evms_version_t version; /* maj,min,patch of plugin */
31949
+ int status; /* 0 = no more, 1 = valid info */
31950
+} evms_kernel_plugin_t;
31952
+#define EVMS_GET_PLUGIN_STRING "EVMS_GET_PLUGIN"
31953
+#define EVMS_GET_PLUGIN _IOWR(EVMS_MAJOR, EVMS_GET_PLUGIN_NUMBER, evms_kernel_plugin_t)
31955
+typedef struct evms_compute_csum_s {
31956
+ unsigned char *buffer_address; /* IO address */
31957
+ int buffer_size; /* byte size of buffer */
31958
+ unsigned int insum; /* previous csum to be factored in */
31959
+ unsigned int outsum; /* resulting csum value of buffer */
31960
+ int status; /* 0 = success, not 0 = error */
31961
+} evms_compute_csum_t;
31963
+#define EVMS_COMPUTE_CSUM_STRING "EVMS_COMPUTE_CSUM"
31964
+#define EVMS_COMPUTE_CSUM _IOWR(EVMS_MAJOR, EVMS_COMPUTE_CSUM_NUMBER, evms_compute_csum_t)
31966
+typedef struct evms_get_bmap_s {
31967
+ u_int64_t rsector; /* input: volume relative rsector value */
31968
+ /* output: disk relative rsector value */
31969
+ u_int32_t dev; /* output = physical device */
31970
+ int status; /* 0 = success, not 0 = error */
31971
+} evms_get_bmap_t;
31973
+#define EVMS_GET_BMAP_STRING "EVMS_GET_BMAP"
31974
+#define EVMS_GET_BMAP _IOWR(EVMS_MAJOR, EVMS_GET_BMAP_NUMBER, evms_get_bmap_t)
31977
diff -Naur linux-2002-03-28/include/linux/evms/evms_kernel.h evms-2002-03-28/include/linux/evms/evms_kernel.h
31978
--- linux-2002-03-28/include/linux/evms/evms_kernel.h Wed Dec 31 18:00:00 1969
31979
+++ evms-2002-03-28/include/linux/evms/evms_kernel.h Wed May 16 13:40:56 2001
31981
+/* -*- linux-c -*- */
31984
+ * Copyright (c) International Business Machines Corp., 2000
31986
+ * This program is free software; you can redistribute it and/or modify
31987
+ * it under the terms of the GNU General Public License as published by
31988
+ * the Free Software Foundation; either version 2 of the License, or
31989
+ * (at your option) any later version.
31991
+ * This program is distributed in the hope that it will be useful,
31992
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
31993
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
31994
+ * the GNU General Public License for more details.
31996
+ * You should have received a copy of the GNU General Public License
31997
+ * along with this program; if not, write to the Free Software
31998
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32001
+ * linux/include/linux/evms_kernel.h
32003
+ * EVMS (master) kernel header file
32007
+#include <linux/evms/evms_common.h>
32008
+#include <linux/evms/evms.h>
32009
+#include <linux/evms/evms_ioctl.h>
32010
diff -Naur linux-2002-03-28/include/linux/evms/evms_linear.h evms-2002-03-28/include/linux/evms/evms_linear.h
32011
--- linux-2002-03-28/include/linux/evms/evms_linear.h Wed Dec 31 18:00:00 1969
32012
+++ evms-2002-03-28/include/linux/evms/evms_linear.h Thu Jan 10 12:51:50 2002
32014
+#ifndef __EVMS_LINEAR_H
32015
+#define __EVMS_LINEAR_H
32017
+#include <linux/evms/evms_md.h>
32020
+ evms_logical_node_t *node;
32022
+ unsigned long size;
32023
+ unsigned long offset;
32026
+typedef struct dev_info dev_info_t;
32028
+struct linear_hash
32030
+ dev_info_t *dev0, *dev1;
32033
+struct linear_private_data
32035
+ struct linear_hash *hash_table;
32036
+ dev_info_t disks[MD_SB_DISKS];
32037
+ dev_info_t *smallest;
32042
+typedef struct linear_private_data linear_conf_t;
32044
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
32047
diff -Naur linux-2002-03-28/include/linux/evms/evms_lvm.h evms-2002-03-28/include/linux/evms/evms_lvm.h
32048
--- linux-2002-03-28/include/linux/evms/evms_lvm.h Wed Dec 31 18:00:00 1969
32049
+++ evms-2002-03-28/include/linux/evms/evms_lvm.h Thu Mar 21 16:30:34 2002
32051
+/* -*- linux-c -*- */
32053
+ * Copyright (c) International Business Machines Corp., 2000
32055
+ * This program is free software; you can redistribute it and/or modify
32056
+ * it under the terms of the GNU General Public License as published by
32057
+ * the Free Software Foundation; either version 2 of the License, or
32058
+ * (at your option) any later version.
32060
+ * This program is distributed in the hope that it will be useful,
32061
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32062
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32063
+ * the GNU General Public License for more details.
32065
+ * You should have received a copy of the GNU General Public License
32066
+ * along with this program; if not, write to the Free Software
32067
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32070
+ * linux/include/linux/evms_lvm.h
32072
+ * EVMS LVM VGE kernel header file
32076
+#ifndef __EVMS_LVM_H__
32077
+#define __EVMS_LVM_H__
32079
+#define EVMS_LVM_VERSION_MAJOR 1
32080
+#define EVMS_LVM_VERSION_MINOR 0
32081
+#define EVMS_LVM_VERSION_PATCH 0
32083
+// The following definitions and data structures are copied from lvm.h and
32084
+// liblvm.h from the LVM 0.9.1beta8 distribution. Since the metadata format
32085
+// changed in beta8, lvm.h changed significantly enough that this module would
32086
+// no longer compile. Instead of requiring evms users to install the latest lvm
32087
+// release, the required definitions and data structures will now be included
32088
+// in this header file.
32090
+#ifndef SECTOR_SIZE
32091
+#define SECTOR_SIZE 512
32094
+#define MAX_LV 256
32095
+#define MAX_PV 256 /* caused by 8 bit minor */
32096
+#define NAME_LEN 128 /* don't change!!! */
32097
+#define UUID_LEN 32 /* don't change!!! */
32098
+#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1)
32099
+#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1)
32100
+#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1)
32101
+#define LV_SNAPSHOT_USE_RATE _IOWR ( 0xfe, 0x2c, 1)
32102
+#define LV_BMAP _IOWR ( 0xfe, 0x30, 1)
32103
+#define LVM_VGDA_ALIGN 4096UL /* some metadata on the disk need to be aligned */
32104
+#define LVM_PV_DISK_BASE 0L /* base of PV structure in disk partition */
32105
+#define LVM_PV_DISK_SIZE 1024L /* size reserved for PV structure on disk */
32106
+#define LVM_VG_DISK_BASE round_up(LVM_PV_DISK_BASE + LVM_PV_DISK_SIZE, LVM_VGDA_ALIGN)
32107
+ /* base of VG structure in disk partition */
32108
+#define LVM_VG_DISK_SIZE (8*512L) /* size reserved for VG structure */
32113
+/* logical volume */
32114
+#define LV_ACTIVE 0x01 /* lv_status */
32115
+#define LV_READ 0x01 /* lv_access */
32116
+#define LV_WRITE 0x02 /* " */
32117
+#define LV_SNAPSHOT 0x04 /* " */
32118
+#define LV_SNAPSHOT_ORG 0x08 /* " */
32120
+/* copy on write tables in disk format */
32121
+typedef struct lv_COW_table_disk_v1 {
32122
+ uint64_t pv_org_number;
32123
+ uint64_t pv_org_rsector;
32124
+ uint64_t pv_snap_number;
32125
+ uint64_t pv_snap_rsector;
32126
+} lv_COW_table_disk_t;
32128
+/* disk stored pe information */
32134
+/* disk stored PV, VG, LV and PE size and offset information */
32138
+} lvm_disk_data_t;
32141
+typedef struct pv_disk_v2 {
32142
+ uint8_t id[2]; /* Identifier */
32143
+ uint16_t version; /* HM lvm version */
32144
+ lvm_disk_data_t pv_on_disk;
32145
+ lvm_disk_data_t vg_on_disk;
32146
+ lvm_disk_data_t pv_uuidlist_on_disk;
32147
+ lvm_disk_data_t lv_on_disk;
32148
+ lvm_disk_data_t pe_on_disk;
32149
+ uint8_t pv_uuid[NAME_LEN];
32150
+ uint8_t vg_name[NAME_LEN];
32151
+ uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */
32152
+ uint32_t pv_major;
32153
+ uint32_t pv_number;
32154
+ uint32_t pv_status;
32155
+ uint32_t pv_allocatable;
32156
+ uint32_t pv_size; /* HM */
32158
+ uint32_t pe_size;
32159
+ uint32_t pe_total;
32160
+ uint32_t pe_allocated;
32162
+ /* new in struct version 2 */
32163
+ uint32_t pe_start; /* in sectors */
32168
+typedef struct lv_disk_v3 {
32169
+ uint8_t lv_name[NAME_LEN];
32170
+ uint8_t vg_name[NAME_LEN];
32171
+ uint32_t lv_access;
32172
+ uint32_t lv_status;
32173
+ uint32_t lv_open; /* HM */
32174
+ uint32_t lv_dev; /* HM */
32175
+ uint32_t lv_number; /* HM */
32176
+ uint32_t lv_mirror_copies; /* for future use */
32177
+ uint32_t lv_recovery; /* " */
32178
+ uint32_t lv_schedule; /* " */
32179
+ uint32_t lv_size;
32180
+ uint32_t lv_snapshot_minor;/* minor number of original */
32181
+ uint16_t lv_chunk_size; /* chunk size of snapshot */
32183
+ uint32_t lv_allocated_le;
32184
+ uint32_t lv_stripes;
32185
+ uint32_t lv_stripesize;
32186
+ uint32_t lv_badblock; /* for future use */
32187
+ uint32_t lv_allocation;
32188
+ uint32_t lv_io_timeout; /* for future use */
32189
+ uint32_t lv_read_ahead; /* HM */
32193
+typedef struct vg_disk_v2 {
32194
+ uint8_t vg_uuid[UUID_LEN]; /* volume group UUID */
32195
+ uint8_t vg_name_dummy[NAME_LEN-UUID_LEN]; /* rest of v1 VG name */
32196
+ uint32_t vg_number; /* volume group number */
32197
+ uint32_t vg_access; /* read/write */
32198
+ uint32_t vg_status; /* active or not */
32199
+ uint32_t lv_max; /* maximum logical volumes */
32200
+ uint32_t lv_cur; /* current logical volumes */
32201
+ uint32_t lv_open; /* open logical volumes */
32202
+ uint32_t pv_max; /* maximum physical volumes */
32203
+ uint32_t pv_cur; /* current physical volumes FU */
32204
+ uint32_t pv_act; /* active physical volumes */
32206
+ uint32_t vgda; /* volume group descriptor arrays FU */
32207
+ uint32_t pe_size; /* physical extent size in sectors */
32208
+ uint32_t pe_total; /* total of physical extents */
32209
+ uint32_t pe_allocated; /* allocated physical extents */
32210
+ uint32_t pvg_total; /* physical volume groups FU */
32213
+/* useful inlines */
32214
+static inline ulong round_up(ulong n, ulong size) {
32216
+ return (n + size) & ~size;
32219
+static inline ulong div_up(ulong n, ulong size) {
32220
+ return round_up(n, size) / size;
32223
+// End of lvm.h imported data structures
32226
+#define DEV_DIRECTORY "/dev/"
32227
+#define LVM_DEV_DIRECTORY "lvm/"
32228
+#define LVM_PROC_NAME "lvm"
32229
+#define LVM_PROC_VG_NAME "VGs"
32230
+#define LVM_PROC_LV_NAME "LVs"
32231
+#define LVM_PROC_PV_NAME "PVs"
32232
+#define LVM_PROC_GLOBAL_NAME "global"
32233
+#define IO_BUFFER_SECTORS 8
32235
+// Structure for doing PV remove ioctls
32237
+#define EVMS_LVM_PV_REMOVE_IOCTL 0x01
32238
+#define EVMS_LVM_SNAPSHOT_STAT_IOCTL 0x02
32240
+typedef struct lvm_pv_remove_ioctl_s {
32241
+ unsigned char vg_uuid[UUID_LEN];
32243
+ struct lvm_pv_remove_ioctl_s * next;
32244
+} lvm_pv_remove_ioctl_t;
32247
+// Structure for doing snapshot stat ioctls
32248
+typedef struct lvm_snapshot_stat_ioctl_s {
32249
+ unsigned char vg_uuid[UUID_LEN];
32251
+ evms_sector_t next_free_chunk;
32252
+ u_int32_t lv_status;
32253
+} lvm_snapshot_stat_ioctl_t;
32256
+// Entries in the list of physical volumes (PV)
32257
+// in a volume group (VG)
32258
+typedef struct lvm_physical_volume_s {
32259
+ evms_logical_node_t * logical_node;
32260
+ pv_disk_t * pv; // Copy of on-disk PV struct
32261
+ pe_disk_t * pe_map;
32262
+ u_int32_t pv_number;
32263
+ struct lvm_physical_volume_s * next;
32264
+} lvm_physical_volume_t;
32267
+// Table for mapping logical extents (LE) to physical extents (PE)
32268
+typedef struct le_table_entry_s {
32269
+ lvm_physical_volume_t * owning_pv;
32270
+ evms_sector_t pe_sector_offset;
32271
+} le_table_entry_t;
32274
+// Entries in the snapshot remapping structure
32275
+typedef struct snapshot_map_entry_s {
32276
+ evms_sector_t org_sector;
32277
+ evms_sector_t snap_sector;
32278
+ lvm_physical_volume_t * snap_pv;
32279
+ struct snapshot_map_entry_s * next;
32280
+ struct snapshot_map_entry_s * prev;
32281
+} snapshot_map_entry_t;
32284
+// Logical volumes (LV) in a volume group (VG)
32285
+#define EVMS_LV_NEW 0x10 // volume was created during the current discovery pass
32286
+#define EVMS_LV_INCOMPLETE 0x20 // volume has an incomplete LE map
32287
+#define EVMS_LV_INVALID 0x40 // volume has a memory-corruption problem
32288
+#define EVMS_LV_QUIESCED 0x80 // volume is in quiesced state
32289
+#define MAX_HASH_CHAIN_ENTRIES 10
32290
+#define CHUNK_DATA_BUFFER_SIZE 64 // 32k in sectors. Feel free to change, but must be power of 2!
32292
+typedef struct lvm_logical_volume_s {
32293
+ u_int32_t lv_number;
32294
+ evms_sector_t lv_size; // Sectors
32295
+ u_int32_t lv_access; // Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_LV_*
32296
+ u_int32_t lv_status; // Flags: LV_ACTIVE, LV_SPINDOWN
32297
+ u_int32_t lv_minor; // Device minor number
32298
+ u_int32_t stripes;
32299
+ u_int32_t stripe_size; // Sectors
32300
+ u_int32_t stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
32301
+ u_int32_t pe_size; // Sectors
32302
+ u_int32_t pe_size_shift; // Number of bits to shift right instead of dividing by pe_size
32303
+ u_int32_t num_le; // Number of entries in the le_to_pe_map
32304
+ struct lvm_volume_group_s * group; // Pointer back to parent volume group
32305
+ unsigned char name[NAME_LEN]; // Dev-tree volume name (eg: /dev/group0/vol0)
32306
+ le_table_entry_t * le_map; // Mapping of logical to physical extents
32307
+ evms_logical_node_t * volume_node; // Pointer to the parent EVMS node representing this volume
32309
+ // Snapshotting information
32310
+ u_int32_t chunk_size; // Sectors
32311
+ u_int32_t num_chunks; // lv_size/chunk_size
32312
+ u_int32_t snap_org_minor; // Minor number of snapshot original
32313
+ u_int32_t next_cow_entry; // Index into current COW table
32314
+ evms_sector_t current_cow_sector; // LOGICAL sector of current COW table
32315
+ evms_sector_t next_free_chunk; // Starting LOGICAL sector of next free chunk
32316
+ u_int32_t hash_table_size; // Number of pointers in each hash table
32317
+ lv_COW_table_disk_t * cow_table; // Pointer to one sector's worth of COW tables
32318
+ unsigned char * chunk_data_buffer; // Buffer for reading data when doing a copy-on-write
32319
+ struct semaphore snap_semaphore; // For locking during snapshot I/O operations
32320
+ snapshot_map_entry_t *** snapshot_map; // Pointer to the remapping hash tables
32321
+ struct lvm_logical_volume_s * snapshot_next; // Linked list of volumes snapshotting the original
32322
+ struct lvm_logical_volume_s * snapshot_org; // Pointer to volume being snapshotted
32323
+} lvm_logical_volume_t;
32326
+// Volume groups (VG)
32328
+#define EVMS_VG_DIRTY (1 << 0) // group is new or has had a PV added during this discovery
32329
+#define EVMS_VG_PARTIAL_PVS (1 << 1) // group contains at least one partial PV.
32330
+#define EVMS_VG_REMOVABLE_PVS (1 << 2) // group contains at least one removeable PV.
32332
+typedef struct lvm_volume_group_s {
32333
+ vg_disk_t * vg; // Copy of on-disk VG metadata
32334
+ lvm_physical_volume_t * pv_list; // List of PVs that make up this group
32335
+ lvm_logical_volume_t * volume_list[MAX_LV+1]; // Array of volumes
32336
+ lv_disk_t * lv_array; // Array of LV metadata
32337
+ unsigned char * uuid_list; // List of PV UUIDs
32338
+ unsigned char vg_uuid[UUID_LEN]; // UUID from the VG metadata
32339
+ char vg_name[NAME_LEN]; // Name from the PV metadata
32340
+ u_int32_t pv_count; // Number of PVs found in this group
32341
+ u_int32_t volume_count; // Number of LVs found in this group
32342
+ int hard_sect_size; // The largest hard_sect_size and block_size
32343
+ int block_size; // values of all PVs in this group.
32344
+ u_int32_t flags; // EVMS_VG_?
32345
+ struct lvm_volume_group_s * next_group;
32346
+} lvm_volume_group_t;
32351
diff -Naur linux-2002-03-28/include/linux/evms/evms_md.h evms-2002-03-28/include/linux/evms/evms_md.h
32352
--- linux-2002-03-28/include/linux/evms/evms_md.h Wed Dec 31 18:00:00 1969
32353
+++ evms-2002-03-28/include/linux/evms/evms_md.h Thu Mar 14 17:01:39 2002
32356
+ * Copyright (c) International Business Machines Corp., 2000
32358
+ * This program is free software; you can redistribute it and/or modify
32359
+ * it under the terms of the GNU General Public License as published by
32360
+ * the Free Software Foundation; either version 2 of the License, or
32361
+ * (at your option) any later version.
32363
+ * This program is distributed in the hope that it will be useful,
32364
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32365
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32366
+ * the GNU General Public License for more details.
32368
+ * You should have received a copy of the GNU General Public License
32369
+ * along with this program; if not, write to the Free Software
32370
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32372
+ * linux/include/linux/evms/evms_md.h
32374
+ * EVMS Linux MD Region Manager Public Header File
32376
+ * 'evms_md.h' is an EVMS version of linux/include/linux/raid/md.h modified
32377
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
32381
+#ifndef __EVMS_MD_INCLUDED
32382
+#define __EVMS_MD_INCLUDED
32384
+#include <linux/mm.h>
32385
+#include <linux/fs.h>
32386
+#include <linux/blkdev.h>
32387
+#include <asm/semaphore.h>
32388
+#include <linux/major.h>
32389
+#include <linux/ioctl.h>
32390
+#include <linux/types.h>
32391
+#include <asm/bitops.h>
32392
+#include <linux/module.h>
32393
+#include <linux/hdreg.h>
32394
+#include <linux/proc_fs.h>
32395
+#include <linux/smp_lock.h>
32396
+#include <linux/delay.h>
32397
+#include <net/checksum.h>
32398
+#include <linux/random.h>
32399
+#include <linux/locks.h>
32400
+#include <linux/kernel_stat.h>
32401
+#include <asm/io.h>
32402
+#include <linux/completion.h>
32404
+#include <linux/evms/evms_kernel.h>
32406
+#include <linux/raid/md_compatible.h>
32408
+ * 'md_p.h' holds the 'physical' layout of RAID devices
32409
+ * 'md_u.h' holds the user <=> kernel API
32411
+ * 'md_k.h' holds kernel internal definitions
32414
+#include <linux/evms/evms_md_p.h>
32415
+#include <linux/evms/evms_md_u.h>
32416
+#include <linux/evms/evms_md_k.h>
32418
+#ifndef MAX_READAHEAD /* The following #defines were removed as of 2.4.16 kernel */
32420
+#define MAX_READAHEAD 31
32421
+#define MIN_READAHEAD 3
32426
+ * Different major versions are not compatible.
32427
+ * Different minor versions are only downward compatible.
32428
+ * Different patchlevel versions are downward and upward compatible.
32430
+#define MD_MAJOR_VERSION 0
32431
+#define MD_MINOR_VERSION 90
32432
+#define MD_PATCHLEVEL_VERSION 0
32434
+#define EVMS_MD_COMMON_SERVICES_MAJOR 0
32435
+#define EVMS_MD_COMMON_SERVICES_MINOR 5
32436
+#define EVMS_MD_COMMON_SERVICES_PATCHLEVEL 0
32439
+extern int evms_md_size[MAX_MD_DEVS];
32441
+extern void evms_md_add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
32442
+extern void evms_md_del_mddev_mapping (mddev_t *mddev, kdev_t dev);
32443
+extern char * evms_md_partition_name (evms_logical_node_t *node);
32444
+extern int evms_register_md_personality (int p_num, mdk_personality_t *p);
32445
+extern int evms_unregister_md_personality (int p_num);
32447
+extern int evms_md_update_sb (mddev_t *mddev);
32448
+extern int evms_md_check_ordering (mddev_t *mddev);
32449
+extern void evms_md_print_devices (void);
32451
+extern int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
32452
+extern void evms_md_done_sync(mddev_t *mddev, int blocks, int ok);
32453
+extern void evms_md_sync_acct(kdev_t dev, unsigned long nr_sectors);
32454
+extern void evms_md_recover_arrays (void);
32455
+extern int evms_md_error (mddev_t *mddev, evms_logical_node_t *node);
32457
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); evms_md_print_devices(); }
32462
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_k.h evms-2002-03-28/include/linux/evms/evms_md_k.h
32463
--- linux-2002-03-28/include/linux/evms/evms_md_k.h Wed Dec 31 18:00:00 1969
32464
+++ evms-2002-03-28/include/linux/evms/evms_md_k.h Mon Mar 11 22:58:16 2002
32467
+ * Copyright (c) International Business Machines Corp., 2000
32469
+ * This program is free software; you can redistribute it and/or modify
32470
+ * it under the terms of the GNU General Public License as published by
32471
+ * the Free Software Foundation; either version 2 of the License, or
32472
+ * (at your option) any later version.
32474
+ * This program is distributed in the hope that it will be useful,
32475
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32476
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32477
+ * the GNU General Public License for more details.
32479
+ * You should have received a copy of the GNU General Public License
32480
+ * along with this program; if not, write to the Free Software
32481
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32484
+ * linux/include/linux/evms/evms_md_k.h
32486
+ * EVMS Linux MD Region Manager Public Header File
32488
+ * 'evms_md_k.h' is an EVMS version of linux/include/linux/raid/md_k.h modified
32489
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
32493
+#ifndef __EVMS_MD_K_INC__
32494
+#define __EVMS_MD_K_INC__
32496
+#define MD_RESERVED 0UL
32497
+#define LINEAR 1UL
32501
+#define TRANSLUCENT 5UL
32503
+#define MULTIPATH 7UL
32504
+#define MAX_PERSONALITY 8UL
32506
+static inline int pers_to_level (int pers)
32509
+ case MULTIPATH: return -4;
32510
+ case HSM: return -3;
32511
+ case TRANSLUCENT: return -2;
32512
+ case LINEAR: return -1;
32513
+ case RAID0: return 0;
32514
+ case RAID1: return 1;
32515
+ case RAID5: return 5;
32518
+ return MD_RESERVED;
32521
+static inline int level_to_pers (int level)
32524
+ case -3: return HSM;
32525
+ case -2: return TRANSLUCENT;
32526
+ case -1: return LINEAR;
32527
+ case 0: return RAID0;
32528
+ case 1: return RAID1;
32530
+ case 5: return RAID5;
32532
+ return MD_RESERVED;
32535
+typedef struct mddev_s mddev_t;
32536
+typedef struct mdk_rdev_s mdk_rdev_t;
32538
+#if (MINORBITS != 8)
32539
+#error MD doesnt handle bigger kdev yet
32542
+#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
32545
+ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
32546
+ * the personality. (eg. HSM uses this to identify individual LVs)
32548
+typedef struct dev_mapping_s {
32554
+extern dev_mapping_t evms_mddev_map [MAX_MD_DEVS];
32555
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
32557
+ if (MAJOR(dev) != MD_MAJOR)
32559
+ return evms_mddev_map[MINOR(dev)].mddev;
32563
+ * options passed in raidrun:
32566
+#define MAX_CHUNK_SIZE (4096*1024)
32569
+ * default readahead
32571
+#define MD_READAHEAD MAX_READAHEAD
32573
+static inline int disk_faulty(mdp_disk_t * d)
32575
+ return d->state & (1 << MD_DISK_FAULTY);
32578
+static inline int disk_active(mdp_disk_t * d)
32580
+ return d->state & (1 << MD_DISK_ACTIVE);
32583
+static inline int disk_sync(mdp_disk_t * d)
32585
+ return d->state & (1 << MD_DISK_SYNC);
32588
+static inline int disk_spare(mdp_disk_t * d)
32590
+ return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
32593
+static inline int disk_removed(mdp_disk_t * d)
32595
+ return d->state & (1 << MD_DISK_REMOVED);
32598
+static inline void mark_disk_faulty(mdp_disk_t * d)
32600
+ d->state |= (1 << MD_DISK_FAULTY);
32603
+static inline void mark_disk_active(mdp_disk_t * d)
32605
+ d->state |= (1 << MD_DISK_ACTIVE);
32606
+ d->state &= ~(1 << MD_DISK_PENDING_ACTIVE);
32609
+static inline void mark_disk_sync(mdp_disk_t * d)
32611
+ d->state |= (1 << MD_DISK_SYNC);
32614
+static inline void mark_disk_spare(mdp_disk_t * d)
32619
+static inline void mark_disk_removed(mdp_disk_t * d)
32621
+ d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
32624
+static inline void mark_disk_inactive(mdp_disk_t * d)
32626
+ d->state &= ~(1 << MD_DISK_ACTIVE);
32629
+static inline void mark_disk_nonsync(mdp_disk_t * d)
32631
+ d->state &= ~(1 << MD_DISK_SYNC);
32635
+ * MD's 'extended' device
32639
+ struct md_list_head same_set; /* RAID devices within the same set */
32640
+ struct md_list_head all; /* all RAID devices */
32641
+ struct md_list_head pending; /* undetected RAID devices */
32642
+ evms_logical_node_t *node; /* EVMS device node */
32643
+ kdev_t dev; /* Device number */
32644
+ kdev_t old_dev; /* "" when it was last imported */
32645
+ unsigned long size; /* Device size (in blocks) */
32646
+ mddev_t *mddev; /* RAID array if running */
32647
+ unsigned long last_events; /* IO event timestamp */
32649
+ struct block_device *bdev; /* block device handle */
32652
+ unsigned long sb_offset; /* in blocks */
32654
+ int virtual_spare; /* "virtual" spare added via IOCTL */
32655
+ int alias_device; /* device alias to the same disk */
32656
+ int faulty; /* if faulty do not issue IO requests */
32657
+ int desc_nr; /* descriptor index in the superblock */
32662
+ * disk operations in a working array:
32664
+#define DISKOP_SPARE_INACTIVE 0
32665
+#define DISKOP_SPARE_WRITE 1
32666
+#define DISKOP_SPARE_ACTIVE 2
32667
+#define DISKOP_HOT_SPARE_ACTIVE 3
32668
+#define DISKOP_HOT_REMOVE_SPARE 4
32669
+#define DISKOP_HOT_REMOVE_DISK 5
32670
+#define DISKOP_HOT_ADD_DISK 6
32671
+#define DISKOP_HOT_DEACTIVATE_DISK 7
32673
+typedef struct mdk_personality_s mdk_personality_t;
32675
+#define EVMS_MD_INCOMPLETE (1<<0)
32680
+ mdk_personality_t *pers;
32681
+ evms_logical_node_t *node; /* evms node */
32682
+ unsigned long flag;
32683
+ int nr_raid_disks;
32687
+ struct md_list_head disks;
32689
+ mdu_param_t param;
32691
+ unsigned long curr_resync; /* blocks scheduled */
32692
+ unsigned long resync_mark; /* a recent timestamp */
32693
+ unsigned long resync_mark_cnt;/* blocks written at resync_mark */
32695
+ int recovery_running;
32696
+ struct semaphore reconfig_sem;
32697
+ struct semaphore recovery_sem;
32698
+ struct semaphore resync_sem;
32701
+ atomic_t recovery_active; /* blocks scheduled, but not written */
32702
+ md_wait_queue_head_t recovery_wait;
32704
+ struct md_list_head all_mddevs;
32707
+struct mdk_personality_s
32710
+ int (* init_io) (mddev_t *mddev, int rw, evms_sector_t LSN, evms_sector_t nr_sects, void *data);
32711
+ int (*make_request)(mddev_t *mddev, int rw, eio_t *eio);
32712
+ int (*run)(mddev_t *mddev);
32713
+ int (*stop)(mddev_t *mddev);
32714
+ int (*status)(char *page, mddev_t *mddev);
32715
+ int (*error_handler)(mddev_t *mddev, evms_logical_node_t *node);
32718
+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
32719
+ * hot-removed. Hot removal is different from failure. (failure marks
32720
+ * a disk inactive, but the disk is still part of the array) The interface
32721
+ * to such operations is the 'pers->diskop()' function, can be NULL.
32723
+ * the diskop function can change the pointer pointing to the incoming
32724
+ * descriptor, but must do so very carefully. (currently only
32725
+ * SPARE_ACTIVE expects such a change)
32727
+ int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
32729
+ int (*stop_resync)(mddev_t *mddev);
32730
+ int (*restart_resync)(mddev_t *mddev);
32731
+ int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
32732
+ int (*evms_ioctl)(mddev_t *mddev, struct inode *inode, struct file *file,
32733
+ unsigned int cmd, unsigned long arg);
32734
+ int (*md_pers_ioctl)(mddev_t *mddev, int cmd, void* pers_arg);
32737
+/* This structure is required for activating a spare device */
32738
+typedef struct evms_md_activate_spare_s {
32739
+ struct evms_md_activate_spare_s *next; /* next entry */
32740
+ mddev_t *mddev; /* target mddev */
32741
+ mdp_disk_t *spare; /* spare to activate */
32742
+} evms_md_activate_spare_t;
32745
+ * Currently we index md_array directly, based on the minor
32746
+ * number. This will have to change to dynamic allocation
32747
+ * once we start supporting partitioning of md devices.
32749
+static inline int mdidx (mddev_t * mddev)
32751
+ return mddev->__minor;
32754
+static inline kdev_t mddev_to_kdev(mddev_t * mddev)
32756
+ return MKDEV(MD_MAJOR, mdidx(mddev));
32759
+extern mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev);
32760
+extern mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr);
32761
+extern mdp_disk_t *get_spare(mddev_t *mddev);
32764
+ * iterates through some rdev ringlist. It's safe to remove the
32765
+ * current 'rdev'. Dont touch 'tmp' though.
32767
+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
32769
+ for (tmp = head.next; \
32770
+ rdev = md_list_entry(tmp, mdk_rdev_t, field), \
32771
+ tmp = tmp->next, tmp->prev != &head \
32774
+ * iterates through the 'same array disks' ringlist
32776
+#define ITERATE_RDEV(mddev,rdev,tmp) \
32777
+ ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
32780
+ * Same as above, but assumes that the device has rdev->desc_nr numbered
32781
+ * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
32783
+#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
32784
+ for (i = 0; rdev = evms_md_find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
32788
+ * Iterates through all 'RAID managed disks'
32790
+#define ITERATE_RDEV_ALL(rdev,tmp) \
32791
+ ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
32794
+ * Iterates through 'pending RAID disks'
32796
+#define ITERATE_RDEV_PENDING(rdev,tmp) \
32797
+ ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
32800
+ * iterates through all used mddevs in the system.
32802
+#define ITERATE_MDDEV(mddev,tmp) \
32804
+ for (tmp = all_mddevs.next; \
32805
+ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
32806
+ tmp = tmp->next, tmp->prev != &all_mddevs \
32809
+static inline int lock_mddev (mddev_t * mddev)
32811
+ return down_interruptible(&mddev->reconfig_sem);
32814
+static inline void unlock_mddev (mddev_t * mddev)
32816
+ up(&mddev->reconfig_sem);
32819
+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
32820
+ x = y; y = __tmp; } while (0)
32822
+#define MAX_DISKNAME_LEN 64
32824
+typedef struct dev_name_s {
32825
+ struct md_list_head list;
32827
+ char namebuf [MAX_DISKNAME_LEN];
32832
+#define __wait_event_lock_irq(wq, condition, lock) \
32834
+ wait_queue_t __wait; \
32835
+ init_waitqueue_entry(&__wait, current); \
32837
+ add_wait_queue(&wq, &__wait); \
32839
+ set_current_state(TASK_UNINTERRUPTIBLE); \
32842
+ spin_unlock_irq(&lock); \
32843
+ run_task_queue(&tq_disk); \
32845
+ spin_lock_irq(&lock); \
32847
+ current->state = TASK_RUNNING; \
32848
+ remove_wait_queue(&wq, &__wait); \
32851
+#define wait_event_lock_irq(wq, condition, lock) \
32855
+ __wait_event_lock_irq(wq, condition, lock); \
32859
+#define __wait_disk_event(wq, condition) \
32861
+ wait_queue_t __wait; \
32862
+ init_waitqueue_entry(&__wait, current); \
32864
+ add_wait_queue(&wq, &__wait); \
32866
+ set_current_state(TASK_UNINTERRUPTIBLE); \
32869
+ run_task_queue(&tq_disk); \
32872
+ current->state = TASK_RUNNING; \
32873
+ remove_wait_queue(&wq, &__wait); \
32876
+#define wait_disk_event(wq, condition) \
32880
+ __wait_disk_event(wq, condition); \
32885
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_p.h evms-2002-03-28/include/linux/evms/evms_md_p.h
32886
--- linux-2002-03-28/include/linux/evms/evms_md_p.h Wed Dec 31 18:00:00 1969
32887
+++ evms-2002-03-28/include/linux/evms/evms_md_p.h Tue Mar 26 18:58:57 2002
32890
+ * Copyright (c) International Business Machines Corp., 2000
32892
+ * This program is free software; you can redistribute it and/or modify
32893
+ * it under the terms of the GNU General Public License as published by
32894
+ * the Free Software Foundation; either version 2 of the License, or
32895
+ * (at your option) any later version.
32897
+ * This program is distributed in the hope that it will be useful,
32898
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32899
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32900
+ * the GNU General Public License for more details.
32902
+ * You should have received a copy of the GNU General Public License
32903
+ * along with this program; if not, write to the Free Software
32904
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32907
+ * linux/include/linux/evms/evms_md_p.h
32909
+ * EVMS Linux MD Region Manager Public Header File
32911
+ * 'evms_md_p.h' is an EVMS version of linux/include/linux/raid/md_p.h modified
32912
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, March 2002.
32916
+#ifndef __EVMS_MD_P_INC__
32917
+#define __EVMS_MD_P_INC__
32920
+ * RAID superblock.
32922
+ * The RAID superblock maintains some statistics on each RAID configuration.
32923
+ * Each real device in the RAID set contains it near the end of the device.
32924
+ * Some of the ideas are copied from the ext2fs implementation.
32926
+ * We currently use 4096 bytes as follows:
32928
+ * word offset function
32930
+ * 0 - 31 Constant generic RAID device information.
32931
+ * 32 - 63 Generic state information.
32932
+ * 64 - 127 Personality specific information.
32933
+ * 128 - 511 12 32-words descriptors of the disks in the raid set.
32934
+ * 512 - 911 Reserved.
32935
+ * 912 - 1023 Disk specific descriptor.
32939
+ * If x is the real device size in bytes, we return an apparent size of:
32941
+ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
32943
+ * and place the 4kB superblock at offset y.
32945
+#define MD_RESERVED_BYTES (64 * 1024)
32946
+#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
32947
+#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
32949
+#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
32950
+#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
32952
+#define MD_SB_BYTES 4096
32953
+#define MD_SB_WORDS (MD_SB_BYTES / 4)
32954
+#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
32955
+#define MD_SB_SECTORS (MD_SB_BYTES / 512)
32958
+ * The following are counted in 32-bit words
32960
+#define MD_SB_GENERIC_OFFSET 0
32961
+#define MD_SB_PERSONALITY_OFFSET 64
32962
+#define MD_SB_DISKS_OFFSET 128
32963
+#define MD_SB_DESCRIPTOR_OFFSET 992
32965
+#define MD_SB_GENERIC_CONSTANT_WORDS 32
32966
+#define MD_SB_GENERIC_STATE_WORDS 32
32967
+#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
32968
+#define MD_SB_PERSONALITY_WORDS 64
32969
+#define MD_SB_DESCRIPTOR_WORDS 32
32970
+#define MD_SB_DISKS 27
32971
+#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
32972
+#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
32973
+#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
32976
+ * Device "operational" state bits
32978
+#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
32979
+#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
32980
+#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
32981
+#define MD_DISK_REMOVED 3 /* disk has kind of been removed, but not really or it would not be here */
32982
+#define MD_DISK_NEW 4 /* disk has just been added to the raid set */
32983
+#define MD_DISK_PENDING_ACTIVE 5 /* disk was spare, but should be activated */
32985
+typedef struct mdp_device_descriptor_s {
32986
+ __u32 number; /* 0 Device number in the entire set */
32987
+ __u32 major; /* 1 Device major number */
32988
+ __u32 minor; /* 2 Device minor number */
32989
+ __u32 raid_disk; /* 3 The role of the device in the raid set */
32990
+ __u32 state; /* 4 Operational state */
32991
+ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
32994
+#define MD_SB_MAGIC 0xa92b4efc
32997
+ * Superblock state bits
32999
+#define MD_SB_CLEAN 0
33000
+#define MD_SB_ERRORS 1
33002
+typedef struct mdp_superblock_s {
33004
+ * Constant generic information
33006
+ __u32 md_magic; /* 0 MD identifier */
33007
+ __u32 major_version; /* 1 major version to which the set conforms */
33008
+ __u32 minor_version; /* 2 minor version ... */
33009
+ __u32 patch_version; /* 3 patchlevel version ... */
33010
+ __u32 gvalid_words; /* 4 Number of used words in this section */
33011
+ __u32 set_uuid0; /* 5 Raid set identifier */
33012
+ __u32 ctime; /* 6 Creation time */
33013
+ __u32 level; /* 7 Raid personality */
33014
+ __u32 size; /* 8 Apparent size of each individual disk */
33015
+ __u32 nr_disks; /* 9 total disks in the raid set */
33016
+ __u32 raid_disks; /* 10 disks in a fully functional raid set */
33017
+ __u32 md_minor; /* 11 preferred MD minor device number */
33018
+ __u32 not_persistent; /* 12 does it have a persistent superblock */
33019
+ __u32 set_uuid1; /* 13 Raid set identifier #2 */
33020
+ __u32 set_uuid2; /* 14 Raid set identifier #3 */
33021
+ __u32 set_uuid3; /* 15 Raid set identifier #4 */
33022
+ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
33025
+ * Generic state information
33027
+ __u32 utime; /* 0 Superblock update time */
33028
+ __u32 state; /* 1 State bits (clean, ...) */
33029
+ __u32 active_disks; /* 2 Number of currently active disks */
33030
+ __u32 working_disks; /* 3 Number of working disks */
33031
+ __u32 failed_disks; /* 4 Number of failed disks */
33032
+ __u32 spare_disks; /* 5 Number of spare disks */
33033
+ __u32 sb_csum; /* 6 checksum of the whole superblock */
33035
+#ifdef __BIG_ENDIAN
33036
+ __u32 events_hi; /* 7 high-order of superblock update count */
33037
+ __u32 events_lo; /* 8 low-order of superblock update count */
33039
+ __u32 events_lo; /* 7 low-order of superblock update count */
33040
+ __u32 events_hi; /* 8 high-order of superblock update count */
33043
+#if __BYTE_ORDER == __BIG_ENDIAN
33044
+ __u32 events_hi; /* 7 high-order of superblock update count */
33045
+ __u32 events_lo; /* 8 low-order of superblock update count */
33047
+ __u32 events_lo; /* 7 low-order of superblock update count */
33048
+ __u32 events_hi; /* 8 high-order of superblock update count */
33051
+ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
33054
+ * Personality information
33056
+ __u32 layout; /* 0 the array's physical layout */
33057
+ __u32 chunk_size; /* 1 chunk size in bytes */
33058
+ __u32 root_pv; /* 2 LV root PV */
33059
+ __u32 root_block; /* 3 LV root block */
33060
+ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
33063
+ * Disks information
33065
+ mdp_disk_t disks[MD_SB_DISKS];
33070
+ __u32 reserved[MD_SB_RESERVED_WORDS];
33073
+ * Active descriptor
33075
+ mdp_disk_t this_disk;
33079
+static inline __u64 md_event(mdp_super_t *sb) {
33080
+ __u64 ev = sb->events_hi;
33081
+ return (ev<<32)| sb->events_lo;
33086
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_u.h evms-2002-03-28/include/linux/evms/evms_md_u.h
33087
--- linux-2002-03-28/include/linux/evms/evms_md_u.h Wed Dec 31 18:00:00 1969
33088
+++ evms-2002-03-28/include/linux/evms/evms_md_u.h Wed Mar 6 17:08:40 2002
33091
+ * Copyright (c) International Business Machines Corp., 2000
33093
+ * This program is free software; you can redistribute it and/or modify
33094
+ * it under the terms of the GNU General Public License as published by
33095
+ * the Free Software Foundation; either version 2 of the License, or
33096
+ * (at your option) any later version.
33098
+ * This program is distributed in the hope that it will be useful,
33099
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
33100
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33101
+ * the GNU General Public License for more details.
33103
+ * You should have received a copy of the GNU General Public License
33104
+ * along with this program; if not, write to the Free Software
33105
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33108
+ * linux/include/linux/evms/evms_md_u.h
33110
+ * EVMS MD Region Manager, User <-> Kernel common file
33114
+#ifndef _EVMS_MD_U_INC_
33115
+#define _EVMS_MD_U_INC_
33117
+#define EVMS_MD_ID 4
33118
+#define MD_SET_PLUGIN_ID SetPluginID(IBM_OEM_ID,EVMS_REGION_MANAGER,EVMS_MD_ID)
33120
+#define EVMS_MD_PERS_IOCTL_CMD 1 /* personality specific ioctl command */
33121
+#define EVMS_MD_ADD 2
33122
+#define EVMS_MD_REMOVE 3
33123
+#define EVMS_MD_ACTIVATE 4
33124
+#define EVMS_MD_DEACTIVATE 5
33125
+#define EVMS_MD_GET_ARRAY_INFO 6
33127
+/* structure definition to use with MD_ADD, MD_REMOVE, MD_ACTIVATE */
33128
+typedef struct evms_md_kdev_s {
33129
+ u_int32_t major; /* 1 Device major number */
33130
+ u_int32_t minor; /* 2 Device minor number */
33133
+/* structure definition to use with MD_GET_ARRAY_INFO */
33134
+#define EVMS_MD_ARRAY_DEGRADED (1<<0)
33135
+#define EVMS_MD_ARRAY_SYNCING (1<<1)
33136
+typedef struct evms_md_array_info_s {
33137
+ unsigned long state; /* degraded mode, syncing,...*/
33138
+ mdp_super_t *sb; /* array super block */
33139
+} evms_md_array_info_t;
33141
+typedef struct evms_md_ioctl_s {
33142
+ int mddev_idx; /* same as __minor in mddev_s struct */
33143
+ int cmd; /* Command for personality */
33144
+ void *arg; /* Command specific ioctl command structure */
33145
+} evms_md_ioctl_t;
33147
+/* Needed by mddev_s structure in evms_md_k.h */
33148
+typedef struct mdu_param_s
33150
+ int personality; /* 1,2,3,4 */
33151
+ int chunk_size; /* in bytes */
33152
+ int max_fault; /* unused for now */
33158
diff -Naur linux-2002-03-28/include/linux/evms/evms_os2.h evms-2002-03-28/include/linux/evms/evms_os2.h
33159
--- linux-2002-03-28/include/linux/evms/evms_os2.h Wed Dec 31 18:00:00 1969
33160
+++ evms-2002-03-28/include/linux/evms/evms_os2.h Wed Mar 27 23:55:42 2002
33164
+ * Copyright (c) International Business Machines Corp., 2000
33166
+ * This program is free software; you can redistribute it and/or modify
33167
+ * it under the terms of the GNU General Public License as published by
33168
+ * the Free Software Foundation; either version 2 of the License, or
33169
+ * (at your option) any later version.
33171
+ * This program is distributed in the hope that it will be useful,
33172
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
33173
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33174
+ * the GNU General Public License for more details.
33176
+ * You should have received a copy of the GNU General Public License
33177
+ * along with this program; if not, write to the Free Software
33178
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33180
+ * Module: linux/include/linux/evms/evms_os2.h
33184
+ * Change History:
33189
+ * Description: This module defines the disk structures used by the OS/2
33190
+ * Logical Volume Manager, including that of the Master
33191
+ * Boot Record (MBR) and Extended Boot Records (EBR).
33193
+ * Notes: LVM Drive Letter Assignment Tables (DLA_Tables) appear on the
33194
+ * last sector of each track containing a valid MBR or EBR. Since
33195
+ * partitions must be track aligned, any track containing an MBR or
33196
+ * EBR will be almost all empty sectors. We will grab the last
33197
+ * of these empty sectors for our DLT_Tables.
33202
+#ifndef OS2LVM_INCLUDED__
33203
+#define OS2LVM_INCLUDED__
33205
+/* The following define the values used to indicate that a partition table entry is for an EBR, not a partition. */
33206
+#define EBR_BOOT_INDICATOR 0
33207
+#define EBR_FORMAT_INDICATOR 5
33209
+/* The following define is used as the default Format_Indicator for new non-primary partitions. */
33210
+#define NEW_LOGICAL_DRIVE_FORMAT_INDICATOR 0x6
33212
+/* The following define is used as the default Format_Indicator for a new non-active primary partitions. */
33213
+#define NEW_PRIMARY_PARTITION_FORMAT_INDICATOR 0x16
33215
+/* The following define is used as the default Format_Indicator for a new active primary partition. */
33216
+#define NEW_ACTIVE_PRIMARY_PARTITION_FORMAT_INDICATOR 0x06
33218
+/* The following define is used to hold the value of the Boot_Indicator for active partitions. */
33219
+#define ACTIVE_PARTITION 0x80
33221
+/* Define the size of a Partition Name. Partition Names are user defined names given to a partition. */
33222
+#define PARTITION_NAME_SIZE 20
33224
+/* Define the size of a volume name. Volume Names are user defined names given to a volume. */
33225
+#define VOLUME_NAME_SIZE 20
33227
+/* Define the size of a disk name. Disk Names are user defined names given to physical disk drives in the system. */
33228
+#define DISK_NAME_SIZE 20
33230
+/* The name of the filesystem in use on a partition. This name may be up to 12 ( + NULL terminator) characters long. */
33231
+#define FILESYSTEM_NAME_SIZE 20
33233
+/* The comment field is reserved but is not currently used. This is for future expansion and use. */
33234
+#define COMMENT_SIZE 81
33237
+/* Define the minimum number of sectors to reserve on the disk for Boot Manager. */
33238
+#define BOOT_MANAGER_SIZE 2048
33240
+#define OS2_BYTES_PER_SECTOR 512
33241
+#define OS2_SECTOR_SHIFT 9
33244
+/*--------------------------------------------------
33245
+ * Type definitions
33246
+ --------------------------------------------------*/
33248
+/* The following definitions define the drive letter assignment table used by LVM.
33249
+ For each partition table on the disk, there will be a drive letter assignment table in the last sector
33250
+ of the track containing the partition table. */
33252
+/* NOTE: DLA stands for Drive Letter Assignment. */
33254
+#define DLA_TABLE_SIGNATURE1 0x424D5202L
33255
+#define DLA_TABLE_SIGNATURE2 0x44464D50L
33258
+typedef struct _DLA_Entry { /* DE */
33259
+ u_int32_t Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
33260
+ u_int32_t Partition_Serial_Number; /* The serial number of this partition. */
33261
+ u_int32_t Partition_Size; /* The size of the partition, in sectors. */
33262
+ u_int32_t Partition_Start; /* The starting sector of the partition. */
33263
+ unsigned char On_Boot_Manager_Menu; /* Set to TRUE if this volume/partition is on the Boot Manager Menu. */
33264
+ unsigned char Installable; /* Set to TRUE if this volume is the one to install the operating system on. */
33265
+ char Drive_Letter; /* The drive letter assigned to the partition. */
33266
+ unsigned char Reserved;
33267
+ char Volume_Name[VOLUME_NAME_SIZE]; /* The name assigned to the volume by the user. */
33268
+ char Partition_Name[PARTITION_NAME_SIZE]; /* The name assigned to the partition. */
33271
+typedef struct _DLA_Table_Sector { /* DTS */
33272
+ u_int32_t DLA_Signature1; /* The magic signature (part 1) of a Drive Letter Assignment Table. */
33273
+ u_int32_t DLA_Signature2; /* The magic signature (part 2) of a Drive Letter Assignment Table. */
33274
+ u_int32_t DLA_CRC; /* The 32 bit CRC for this sector. Calculated assuming that this field and all unused space in the sector is 0. */
33275
+ u_int32_t Disk_Serial_Number; /* The serial number assigned to this disk. */
33276
+ u_int32_t Boot_Disk_Serial_Number; /* The serial number of the disk used to boot the system. This is for conflict resolution when multiple volumes
33277
+ want the same drive letter. Since LVM.EXE will not let this situation happen, the only way to get this situation
33278
+ is for the disk to have been altered by something other than LVM.EXE, or if a disk drive has been moved from one
33279
+ machine to another. If the drive has been moved, then it should have a different Boot_Disk_Serial_Number. Thus,
33280
+ we can tell which disk drive is the "foreign" drive and therefore reject its claim for the drive letter in question.
33281
+ If we find that all of the claimants have the same Boot_Disk_Serial_Number, then we must assign drive letters on
33282
+ a first come, first serve basis.*/
33283
+ u_int32_t Install_Flags; /* Used by the Install program. */
33284
+ u_int32_t Cylinders;
33285
+ u_int32_t Heads_Per_Cylinder;
33286
+ u_int32_t Sectors_Per_Track;
33287
+ char Disk_Name[DISK_NAME_SIZE]; /* The name assigned to the disk containing this sector. */
33288
+ unsigned char Reboot; /* For use by Install. Used to keep track of reboots initiated by install. */
33289
+ unsigned char Reserved[3]; /* Alignment. */
33290
+ DLA_Entry DLA_Array[4]; /* These are the four entries which correspond to the entries in the partition table. */
33291
+} DLA_Table_Sector;
33294
+/* The following definitions define the LVM signature sector which will appear as the last sector in an LVM partition. */
33297
+#define OS2LVM_PRIMARY_SIGNATURE 0x4A435332L
33298
+#define OS2LVM_SECONDARY_SIGNATURE 0x4252444BL
33301
+#define CURRENT_OS2LVM_MAJOR_VERSION_NUMBER 2 /* Define as appropriate. */
33302
+#define CURRENT_OS2LVM_MINOR_VERSION_NUMBER 0 /* Define as appropriate. */
33305
+/* The following definitions limit the number of LVM features that can be applied to a volume, as well as defining a "NULL" feature for use in feature table entries that are not being used. */
33306
+#define OS2LVM_MAX_FEATURES_PER_VOLUME 10 /* The maximum number of LVM features that can be applied to a volume. */
33307
+#define OS2LVM_NULL_FEATURE 0 /* No feature. Used in all unused entries of the feature array in the LVM Signature sector. */
33310
+/* The following structure is used to hold the location of the feature specific data for LVM features. */
33311
+typedef struct _LVM_Feature_Data { /* LFD */
33312
+ u_int32_t Feature_ID; /* The ID of the feature. */
33313
+ u_int32_t Location_Of_Primary_Feature_Data; /* The u_int32_t of the starting sector of the private data for this feature. */
33314
+ u_int32_t Location_Of_Secondary_Feature_Data; /* The u_int32_t of the starting sector of the backup copy of the private data for this feature. */
33315
+ u_int32_t Feature_Data_Size; /* The number of sectors used by this feature for its private data. */
33316
+ u_int16_t Feature_Major_Version_Number; /* The integer portion of the version number of this feature. */
33317
+ u_int16_t Feature_Minor_Version_Number; /* The decimal portion of the version number of this feature. */
33318
+ unsigned char Feature_Active; /* TRUE if this feature is active on this partition/volume, FALSE otherwise. */
33319
+ unsigned char Reserved[3]; /* Alignment. */
33320
+} LVM_Feature_Data;
33323
+/* The following structure defines the LVM Signature Sector. This is the last sector of every partition which is part of an LVM volume. It gives vital
33324
+ information about the version of LVM used to create the LVM volume that it is a part of, as well as which LVM features (BBR, drive linking, etc.) are
33325
+ active on the volume that this partition is a part of. */
33326
+typedef struct _LVM_Signature_Sector { /* LSS */
33327
+ u_int32_t LVM_Signature1; /* The first part of the magic LVM signature. */
33328
+ u_int32_t LVM_Signature2; /* The second part of the magic LVM signature. */
33329
+ u_int32_t Signature_Sector_CRC; /* 32 bit CRC for this sector. Calculated using 0 for this field. */
33330
+ u_int32_t Partition_Serial_Number; /* The LVM assigned serial number for this partition. */
33331
+ u_int32_t Partition_Start; /* u_int32_t of the first sector of this partition. */
33332
+ u_int32_t Partition_End; /* u_int32_t of the last sector of this partition. */
33333
+ u_int32_t Partition_Sector_Count; /* The number of sectors in this partition. */
33334
+ u_int32_t LVM_Reserved_Sector_Count; /* The number of sectors reserved for use by LVM. */
33335
+ u_int32_t Partition_Size_To_Report_To_User; /* The size of the partition as the user sees it - i.e. (the actual size of the partition - LVM reserved sectors) rounded to a track boundary. */
33336
+ u_int32_t Boot_Disk_Serial_Number; /* The serial number of the boot disk for the system. If the system contains Boot Manager, then this is the serial number of the disk containing the active copy of Boot Manager. */
33337
+ u_int32_t Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
33338
+ u_int32_t Fake_EBR_Location; /* The location, on disk, of a Fake EBR, if one has been allocated. */
33339
+ u_int16_t LVM_Major_Version_Number; /* Major version number of the LVM that created this partition. */
33340
+ u_int16_t LVM_Minor_Version_Number; /* Minor version number of the LVM that created this partition. */
33341
+ char Partition_Name[PARTITION_NAME_SIZE]; /* User defined partition name. */
33342
+ char Volume_Name[VOLUME_NAME_SIZE]; /* The name of the volume that this partition belongs to. */
33343
+ LVM_Feature_Data LVM_Feature_Array[OS2LVM_MAX_FEATURES_PER_VOLUME]; /* The feature array. This indicates which LVM features, if any, are active on this volume
33344
+ and what order they should be applied in. */
33345
+ char Drive_Letter; /* The drive letter assigned to the volume that this partition is part of. */
33346
+ unsigned char Fake_EBR_Allocated; /* If TRUE, then a fake EBR has been allocated. */
33347
+ char Comment[COMMENT_SIZE]; /* User comment. */
33348
+ char Disk_Name[DISK_NAME_SIZE]; /* Added to allow BBR to report the name of a disk when bad sectors are encountered on that disk. */
33349
+ u_int32_t Sequence_Number; /* This indicates the order that partitions within a volume are used. This number is 1 based. A 0 here indicates that the volume was made by LVM Ver. 1. */
33350
+ u_int32_t Next_Aggregate_Number; /* Used during volume creation and expansion when creating unique names for aggregates. */
33351
+ /* The remainder of the sector is reserved for future use and should be all zero or else the CRC will not come out correctly. */
33352
+} LVM_Signature_Sector;
33355
+/* The following definitions define the format of a partition table and the Master Boot Record (MBR). */
33356
+typedef struct _Partition_Record { /* PR */
33357
+ unsigned char Boot_Indicator; /* 80h = active partition. */
33358
+ unsigned char Starting_Head;
33359
+ unsigned char Starting_Sector; /* Bits 0-5 are the sector. Bits 6 and 7 are the high order bits of the starting cylinder. */
33360
+ unsigned char Starting_Cylinder; /* The cylinder number is a 10 bit value. The high order bits of the 10 bit value come from bits 6 & 7 of the Starting_Sector field. */
33361
+ unsigned char Format_Indicator; /* An indicator of the format/operation system on this partition. */
33362
+ unsigned char Ending_Head;
33363
+ unsigned char Ending_Sector;
33364
+ unsigned char Ending_Cylinder;
33365
+ u_int32_t Sector_Offset; /* The number of sectors on the disk which are prior to the start of this partition. */
33366
+ u_int32_t Sector_Count; /* The number of sectors in this partition. */
33367
+} Partition_Record;
33369
+typedef struct _Master_Boot_Record { /* MBR */
33370
+ unsigned char Reserved[446];
33371
+ Partition_Record Partition_Table[4];
33372
+ u_int16_t Signature; /* AA55h in this field indicates that this is a valid partition table/MBR. */
33373
+} Master_Boot_Record;
33375
+typedef Master_Boot_Record Extended_Boot_Record;
33377
+/* The following definition covers the Boot Manager Alias Table in the EBR.
33379
+ The Alias Table in the EBR has 2 entries in it, although only the first one is actually used. */
33380
+#define ALIAS_NAME_SIZE 8
33381
+typedef struct _AliasTableEntry { /* ATE */
33382
+ unsigned char On_Boot_Manager_Menu;
33383
+ char Name[ALIAS_NAME_SIZE];
33384
+} AliasTableEntry;
33386
+#define ALIAS_TABLE_OFFSET 0x18A
33389
+/* The following text is used for the Boot Manager Alias for items that were placed on the Boot Manager Menu by FDISK and
33390
+ which have since been migrated to the new LVM format. This text is put into the Name field of an AliasTableEntry so
33391
+ that, if FDISK ( or another program which understands the old Boot Manager Menu format) is run, it will display
33392
+ something for those partitions/volumes which are on the Boot Manager Menu.
33394
+ NOTE: This text must be exactly ALIAS_NAME_SIZE characters in length! */
33395
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT "--> LVM "
33396
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT2 "--> LVM*"
33400
+/* The following is the signature used for an Master Boot Record, an Extended Boot Record, and a Boot Sector. */
33401
+#define MBR_EBR_SIGNATURE 0xAA55
33404
+/* The following list of definitions defines the values of interest for the Format_Indicator in a Partition_Record. */
33405
+#define EBR_INDICATOR 0x5
33406
+#define WINDOZE_EBR_INDICATOR 0xF
33407
+#define UNUSED_INDICATOR 0x0
33408
+#define IFS_INDICATOR 0x7
33409
+#define FAT12_INDICATOR 0x1
33410
+#define FAT16_SMALL_PARTITION_INDICATOR 0x4
33411
+#define FAT16_LARGE_PARTITION_INDICATOR 0x6
33412
+#define BOOT_MANAGER_HIDDEN_PARTITION_FLAG 0x10
33413
+#define LVM_PARTITION_INDICATOR 0x35
33414
+#define BOOT_MANAGER_INDICATOR 0x0A
33417
+/* The following is the signature used in the Boot Sector for Boot Manager. */
33418
+#define OS2LVM_BOOT_MANAGER_SIGNATURE "APJ&WN"
33421
+/* The following is used for determining the synthetic geometry reported for Volumes employing drive linking. */
33422
+#define OS2LVM_SYNTHETIC_SECTORS_PER_TRACK 63
33425
+/*--------------------------------------------------
33426
+ * Declares for Drive Linking feature:
33427
+ *--------------------------------------------------*/
33429
+/* The following defines uniquely identify Drive Linking. */
33430
+#define DRIVE_LINKING_FEATURE_ID 100
33431
+#define DRIVE_LINKING_MAJOR_VERSION 1
33432
+#define DRIVE_LINKING_MINOR_VERSION 0
33434
+/* The following definitions are used for the disk structures supporting drive linking. */
33436
+#define LINK_TABLE_MASTER_SIGNATURE 0x434E4157L
33437
+#define LINK_TABLE_SIGNATURE 0X4D4D5652L
33439
+#define MAXIMUM_LINKS 246
33441
+#define DRIVE_LINKING_RESERVED_SECTOR_COUNT 4
33443
+#define LINKS_IN_FIRST_SECTOR 60
33445
+#define LINKS_IN_NEXT_SECTOR 62
33447
+typedef struct _Drive_Link {
33448
+ u_int32_t Drive_Serial_Number;
33449
+ u_int32_t Partition_Serial_Number;
33452
+typedef struct _LVM_Link_Table_First_Sector {
33453
+ u_int32_t Link_Table_Signature; /* Use the LINK_TABLE_MASTER_SIGNATURE here. */
33454
+ u_int32_t Link_Table_CRC;
33455
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
33456
+ u_int32_t Links_In_Use;
33457
+ Drive_Link Link_Table[LINKS_IN_FIRST_SECTOR];
33458
+} LVM_Link_Table_First_Sector;
33460
+typedef struct _LVM_Link_Table_Sector {
33461
+ u_int32_t Link_Table_Signature; /* Use LINK_TABLE_SIGNATURE here. */
33462
+ u_int32_t Link_Table_CRC;
33463
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
33464
+ Drive_Link Link_Table[LINKS_IN_NEXT_SECTOR];
33465
+} LVM_Link_Table_Sector;
33468
+/*--------------------------------------------------
33469
+ * Declares for Bad Block Relocation feature:
33470
+ *--------------------------------------------------*/
33472
+/* The following definition is the numeric ID for Bad Block Relocation. */
33473
+#define BBR_FEATURE_ID 101
33475
+#define BBR_FEATURE_MAJOR_VERSION 0x0001
33476
+#define BBR_FEATURE_MINOR_VERSION 0x0000
33478
+/* The following definitions are used for the disk structures supporting bad block relocation. */
33480
+/* NOTE: BBR stands for Bad Block Relocation. */
33482
+#define BBR_TABLE_MASTER_SIGNATURE 0x00726D62
33483
+#define BBR_TABLE_SIGNATURE 0x01726276
33486
+typedef struct _BBR_Table_Entry {
33487
+ u_int32_t BadSector;
33488
+ u_int32_t ReplacementSector;
33489
+} BBR_Table_Entry;
33491
+typedef struct _LVM_BBR_Table_First_Sector {
33492
+ u_int32_t Signature; /* Signature for the first sector of the BBR Table. Use BBR_TABLE_MASTER_SIGNATURE here.*/
33493
+ u_int32_t CRC;/* CRC for this sector.*/
33494
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match.*/
33495
+ u_int32_t Table_Size; /* The number of BBR_Table_Entries in the BBR Table.*/
33496
+ u_int32_t Table_Entries_In_Use;/* The number of BBR Table entries which are in use.*/
33497
+ u_int32_t Sectors_Per_Table; /* The number of LVM_BBR_Table_Sectors used to hold the BBR Table.*/
33498
+ u_int32_t First_Replacement_Sector; /* The location of the first replacement sector.*/
33499
+ u_int32_t Last_Replacement_Sector; /* The location of the last replacement sector.*/
33500
+ u_int32_t Replacement_Sector_Count; /* The number of replacement sectors.*/
33501
+ u_int32_t Flags; /* Flags global to the Bad Block Relocation Feature.*/
33502
+} LVM_BBR_Table_First_Sector;
33504
+/* Flags for LVM_BBR_Table_First_Sector */
33505
+#define BBR_Flag_Write_Verify 0x00000001/* Indicate convert Write I/O to Write/Verify*/
33507
+#define BBR_TABLE_ENTRIES_PER_SECTOR 62
33509
+typedef struct _LVM_BBR_Table_Sector {
33510
+ u_int32_t Signature;/* Signature for a sector of the BBR_Table which is not the first sector of the BBR Table. Use BBR_TABLE_SIGNATURE here.*/
33511
+ u_int32_t CRC;/* CRC for this sector of the BBR Table.*/
33512
+ u_int32_t Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match.*/
33513
+ BBR_Table_Entry BBR_Table[BBR_TABLE_ENTRIES_PER_SECTOR];
33514
+ u_int32_t reserved1;/* for block alignment*/
33515
+} LVM_BBR_Table_Sector;
33518
+// Combined structure to hold entire BBR feature data as it exists on disk.
33519
+typedef struct _LVM_BBR_Feature
33521
+ LVM_BBR_Table_First_Sector control;
33522
+ char reserved1[OS2_BYTES_PER_SECTOR - sizeof(LVM_BBR_Table_First_Sector)];
33523
+ LVM_BBR_Table_Sector remap[1];
33524
+} LVM_BBR_Feature;
33526
+/* The following defines establish the minimum and maximum number of replacement sectors which can be allocated for
33527
+ Bad Block Relocation. Otherwise, 1 replacement sector per MB of disk space is allocated. */
33528
+#define BBR_FLOOR 62
33529
+#define BBR_LIMIT 4096
33533
+// In-memory Meta Data for Bad Block Relocation
33534
+// In-memory Meta Data for Drive Linking
33535
+typedef struct os2_drivelink_runtime_entry_s {
33536
+ evms_sector_t start_sector;
33537
+ evms_sector_t sector_count;
33538
+ evms_sector_t Drive_Link_Data_Copy1; /* LSN of first on-disk copy of drive linking data. */
33539
+ evms_sector_t Drive_Link_Data_Copy2; /* LSN of the second on-disk copy of drive linking data. */
33541
+ u_int32_t Partition_Serial_Number;
33542
+ evms_sector_t BBR_Data_Copy1; /* LSN of the first on-disk copy of the BBR data.*/
33543
+ evms_sector_t BBR_Data_Copy2; /* LSN of the second on-disk copy of the BBR data.*/
33544
+ u_int32_t BBR_Feature_Size; /* # of sectors of BBR data. */
33545
+ u_int32_t bbr_is_active;
33546
+ struct semaphore BBR_Table_Lock; /* Used to serialize writers */
33547
+ unsigned int Guard1; /* Lamport's Theorem for mutual exclusion */
33549
+ unsigned int Guard2; /* Lamport's Theorem for mutual exclusion */
33550
+ evms_logical_node_t *link_partition;
33551
+ struct os2_drivelink_runtime_entry_s *next;
33552
+} os2_drivelink_runtime_entry_t;
33554
+// In-memory Meta Data for each OS/2 LVM Volume:
33555
+typedef struct os2_volume_runtime_entry_s {
33557
+ u_int32_t Export_Needed;
33558
+ evms_sector_t size_in_sectors;
33559
+ u_int32_t Volume_Serial_Number;
33560
+ u_int32_t drive_link_count;
33561
+ os2_drivelink_runtime_entry_t *drive_link;
33562
+ evms_logical_node_t *next_os2lvm_node;
33563
+} os2_volume_runtime_entry_t;
33569
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid0.h evms-2002-03-28/include/linux/evms/evms_raid0.h
33570
--- linux-2002-03-28/include/linux/evms/evms_raid0.h Wed Dec 31 18:00:00 1969
33571
+++ evms-2002-03-28/include/linux/evms/evms_raid0.h Thu Jan 3 13:15:19 2002
33576
+#include <linux/evms/evms_md.h>
33580
+ unsigned long zone_offset; /* Zone offset in md_dev */
33581
+ unsigned long dev_offset; /* Zone offset in real dev */
33582
+ unsigned long size; /* Zone size */
33583
+ int nb_dev; /* # of devices attached to the zone */
33584
+ mdk_rdev_t *dev[MD_SB_DISKS]; /* Devices attached to the zone */
33589
+ struct strip_zone *zone0, *zone1;
33592
+struct raid0_private_data
33594
+ struct raid0_hash *hash_table; /* Dynamically allocated */
33595
+ struct strip_zone *strip_zone; /* This one too */
33596
+ int nr_strip_zones;
33597
+ struct strip_zone *smallest;
33601
+typedef struct raid0_private_data raid0_conf_t;
33603
+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
33606
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid1.h evms-2002-03-28/include/linux/evms/evms_raid1.h
33607
--- linux-2002-03-28/include/linux/evms/evms_raid1.h Wed Dec 31 18:00:00 1969
33608
+++ evms-2002-03-28/include/linux/evms/evms_raid1.h Mon Mar 11 22:58:26 2002
33610
+#ifndef _EVMS_RAID1_H
33611
+#define _EVMS_RAID1_H
33613
+#include <linux/evms/evms_md.h>
33615
+struct mirror_info {
33618
+ evms_logical_node_t *node;
33621
+ int head_position;
33633
+struct raid1_private_data {
33635
+ struct mirror_info mirrors[MD_SB_DISKS];
33638
+ int working_disks;
33640
+ unsigned long next_sect;
33642
+ evms_thread_t *thread, *resync_thread;
33643
+ int resync_mirrors;
33644
+ struct mirror_info *spare;
33645
+ md_spinlock_t device_lock;
33647
+ /* buffer pool */
33648
+ /* buffer_heads that we have pre-allocated have b_pprev -> &freebh
33649
+ * and are linked into a stack using b_next
33650
+ * raid1_bh that are pre-allocated have R1BH_PreAlloc set.
33651
+ * All these variable are protected by device_lock
33653
+ struct buffer_head *freebh;
33654
+ int freebh_cnt; /* how many are on the list */
33655
+ int freebh_blocked;
33656
+ struct raid1_bh *freer1;
33657
+ int freer1_blocked;
33659
+ struct raid1_bh *freebuf; /* each bh_req has a page allocated */
33660
+ md_wait_queue_head_t wait_buffer;
33662
+ /* for use when syncing mirrors: */
33663
+ unsigned long start_active, start_ready,
33664
+ start_pending, start_future;
33665
+ int cnt_done, cnt_active, cnt_ready,
33666
+ cnt_pending, cnt_future;
33669
+ md_wait_queue_head_t wait_done;
33670
+ md_wait_queue_head_t wait_ready;
33671
+ md_spinlock_t segment_lock;
33674
+typedef struct raid1_private_data raid1_conf_t;
33677
+ * this is the only point in the RAID code where we violate
33678
+ * C type safety. mddev->private is an 'opaque' pointer.
33680
+#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
33683
+ * this is our 'private' 'collective' RAID1 buffer head.
33684
+ * it contains information about what kind of IO operations were started
33685
+ * for this RAID1 operation, and about their status:
33688
+/* This structure is used to map a buffer head to a evms logical node */
33689
+typedef struct raid1_node_map_s {
33690
+ evms_logical_node_t *node;
33691
+ struct buffer_head *bh;
33692
+} raid1_node_map_t;
33695
+ atomic_t remaining; /* 'have we finished' count,
33696
+ * used from IRQ handlers
33699
+ unsigned long state;
33701
+ struct buffer_head *master_bh;
33702
+ struct buffer_head *mirror_bh_list;
33703
+ raid1_node_map_t mirror_node_map[MD_SB_DISKS];
33704
+ struct buffer_head bh_req;
33705
+ evms_logical_node_t *node; /* map to evms node (READ only) */
33707
+ struct raid1_bh *next_r1; /* next for retry or in free list */
33709
+/* bits for raid1_bh.state */
33710
+#define R1BH_Uptodate 1
33711
+#define R1BH_SyncPhase 2
33712
+#define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */
33714
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid5.h evms-2002-03-28/include/linux/evms/evms_raid5.h
33715
--- linux-2002-03-28/include/linux/evms/evms_raid5.h Wed Dec 31 18:00:00 1969
33716
+++ evms-2002-03-28/include/linux/evms/evms_raid5.h Mon Mar 11 22:58:36 2002
33721
+#include <linux/evms/evms_md.h>
33722
+#include <linux/evms/evms_xor.h>
33726
+ * Each stripe contains one buffer per disc. Each buffer can be in
33727
+ * one of a number of states determined by bh_state. Changes between
33728
+ * these states happen *almost* exclusively under a per-stripe
33729
+ * spinlock. Some very specific changes can happen in b_end_io, and
33730
+ * these are not protected by the spin lock.
33732
+ * The bh_state bits that are used to represent these states are:
33733
+ * BH_Uptodate, BH_Lock
33735
+ * State Empty == !Uptodate, !Lock
33736
+ * We have no data, and there is no active request
33737
+ * State Want == !Uptodate, Lock
33738
+ * A read request is being submitted for this block
33739
+ * State Dirty == Uptodate, Lock
33740
+ * Some new data is in this buffer, and it is being written out
33741
+ * State Clean == Uptodate, !Lock
33742
+ * We have valid data which is the same as on disc
33744
+ * The possible state transitions are:
33746
+ * Empty -> Want - on read or write to get old data for parity calc
33747
+ * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
33748
+ * Empty -> Clean - on compute_block when computing a block for failed drive
33749
+ * Want -> Empty - on failed read
33750
+ * Want -> Clean - on successful completion of read request
33751
+ * Dirty -> Clean - on successful completion of write request
33752
+ * Dirty -> Clean - on failed write
33753
+ * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
33755
+ * The Want->Empty, Want->Clean, Dirty->Clean, transitions
33756
+ * all happen in b_end_io at interrupt time.
33757
+ * Each sets the Uptodate bit before releasing the Lock bit.
33758
+ * This leaves one multi-stage transition:
33759
+ * Want->Dirty->Clean
33760
+ * This is safe because thinking that a Clean buffer is actually dirty
33761
+ * will at worst delay some action, and the stripe will be scheduled
33762
+ * for attention after the transition is complete.
33764
+ * There is one possibility that is not covered by these states. That
33765
+ * is if one drive has failed and there is a spare being rebuilt. We
33766
+ * can't distinguish between a clean block that has been generated
33767
+ * from parity calculations, and a clean block that has been
33768
+ * successfully written to the spare ( or to parity when resyncing).
33769
+ * To distingush these states we have a stripe bit STRIPE_INSYNC that
33770
+ * is set whenever a write is scheduled to the spare, or to the parity
33771
+ * disc if there is no spare. A sync request clears this bit, and
33772
+ * when we find it set with no buffers locked, we know the sync is
33775
+ * Buffers for the md device that arrive via make_request are attached
33776
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
33777
+ * One list (bh_read) for read requests, one (bh_write) for write.
33778
+ * There should never be more than one buffer on the two lists
33779
+ * together, but we are not guaranteed of that so we allow for more.
33781
+ * If a buffer is on the read list when the associated cache buffer is
33782
+ * Uptodate, the data is copied into the read buffer and it's b_end_io
33783
+ * routine is called. This may happen in the end_request routine only
33784
+ * if the buffer has just successfully been read. end_request should
33785
+ * remove the buffers from the list and then set the Uptodate bit on
33786
+ * the buffer. Other threads may do this only if they first check
33787
+ * that the Uptodate bit is set. Once they have checked that they may
33788
+ * take buffers off the read queue.
33790
+ * When a buffer on the write list is committed for write is it copied
33791
+ * into the cache buffer, which is then marked dirty, and moved onto a
33792
+ * third list, the written list (bh_written). Once both the parity
33793
+ * block and the cached buffer are successfully written, any buffer on
33794
+ * a written list can be returned with b_end_io.
33796
+ * The write list and read list both act as fifos. The read list is
33797
+ * protected by the device_lock. The write and written lists are
33798
+ * protected by the stripe lock. The device_lock, which can be
33799
+ * claimed while the stipe lock is held, is only for list
33800
+ * manipulations and will only be held for a very short time. It can
33801
+ * be claimed from interrupts.
33804
+ * Stripes in the stripe cache can be on one of two lists (or on
33805
+ * neither). The "inactive_list" contains stripes which are not
33806
+ * currently being used for any request. They can freely be reused
33807
+ * for another stripe. The "handle_list" contains stripes that need
33808
+ * to be handled in some way. Both of these are fifo queues. Each
33809
+ * stripe is also (potentially) linked to a hash bucket in the hash
33810
+ * table so that it can be found by sector number. Stripes that are
33811
+ * not hashed must be on the inactive_list, and will normally be at
33812
+ * the front. All stripes start life this way.
33814
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
33816
+ * - stripes on the inactive_list never have their stripe_lock held.
33817
+ * - stripes have a reference counter. If count==0, they are on a list.
33818
+ * - If a stripe might need handling, STRIPE_HANDLE is set.
33819
+ * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
33820
+ * handle_list else inactive_list
33822
+ * This, combined with the fact that STRIPE_HANDLE is only ever
33823
+ * cleared while a stripe has a non-zero count means that if the
33824
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
33825
+ * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
33826
+ * the stripe is on inactive_list.
33828
+ * The possible transitions are:
33829
+ * activate an unhashed/inactive stripe (get_active_stripe())
33830
+ * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
33831
+ * activate a hashed, possibly active stripe (get_active_stripe())
33832
+ * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
33833
+ * attach a request to an active stripe (add_stripe_bh())
33834
+ * lockdev attach-buffer unlockdev
33835
+ * handle a stripe (handle_stripe())
33836
+ * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
33837
+ * release an active stripe (release_stripe())
33838
+ * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
33840
+ * The refcount counts each thread that have activated the stripe,
33841
+ * plus raid5d if it is handling it, plus one for each active request
33842
+ * on a cached buffer.
33844
+struct stripe_head {
33845
+ struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
33846
+ struct list_head lru; /* inactive_list or handle_list */
33847
+ struct raid5_private_data *raid_conf;
33848
+ struct buffer_head *bh_cache[MD_SB_DISKS]; /* buffered copy */
33849
+ struct buffer_head *bh_read[MD_SB_DISKS]; /* read request buffers of the MD device */
33850
+ struct buffer_head *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */
33851
+ struct buffer_head *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
33852
+ struct page *bh_page[MD_SB_DISKS]; /* saved bh_cache[n]->b_page when reading around the cache */
33853
+ evms_logical_node_t *node[MD_SB_DISKS]; /* the target device node */
33854
+ unsigned long sector; /* sector of this row */
33855
+ int size; /* buffers size */
33856
+ int pd_idx; /* parity disk index */
33857
+ unsigned long state; /* state flags */
33858
+ atomic_t count; /* nr of active thread/requests */
33867
+#define RECONSTRUCT_WRITE 1
33868
+#define READ_MODIFY_WRITE 2
33869
+/* not a write method, but a compute_parity mode */
33870
+#define CHECK_PARITY 3
33875
+#define STRIPE_ERROR 1
33876
+#define STRIPE_HANDLE 2
33877
+#define STRIPE_SYNCING 3
33878
+#define STRIPE_INSYNC 4
33879
+#define STRIPE_PREREAD_ACTIVE 5
33880
+#define STRIPE_DELAYED 6
33885
+ * To improve write throughput, we need to delay the handling of some
33886
+ * stripes until there has been a chance that several write requests
33887
+ * for the one stripe have all been collected.
33888
+ * In particular, any write request that would require pre-reading
33889
+ * is put on a "delayed" queue until there are no stripes currently
33890
+ * in a pre-read phase. Further, if the "delayed" queue is empty when
33891
+ * a stripe is put on it then we "plug" the queue and do not process it
33892
+ * until an unplg call is made. (the tq_disk list is run).
33894
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
33895
+ * it to the count of prereading stripes.
33896
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
33897
+ * clear the PREREAD_ACTIVE flag and decrement the count
33898
+ * Whenever the delayed queue is empty and the device is not plugged, we
33899
+ * move any strips from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
33900
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
33901
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
33902
+ * HANDLE gets cleared if stripe_handle leave nothing locked.
33906
+struct disk_info {
33908
+ evms_logical_node_t *node;
33917
+struct raid5_private_data {
33918
+ struct stripe_head **stripe_hashtbl;
33920
+ evms_thread_t *thread, *resync_thread;
33921
+ struct disk_info disks[MD_SB_DISKS];
33922
+ struct disk_info *spare;
33924
+ int chunk_size, level, algorithm;
33925
+ int raid_disks, working_disks, failed_disks;
33926
+ int resync_parity;
33927
+ int max_nr_stripes;
33929
+ struct list_head handle_list; /* stripes needing handling */
33930
+ struct list_head delayed_list; /* stripes that have plugged requests */
33931
+ atomic_t preread_active_stripes; /* stripes with scheduled io */
33933
+ * Free stripes pool
33935
+ atomic_t active_stripes;
33936
+ struct list_head inactive_list;
33937
+ md_wait_queue_head_t wait_for_stripe;
33938
+ int inactive_blocked; /* release of inactive stripes blocked,
33939
+ * waiting for 25% to be free
33941
+ md_spinlock_t device_lock;
33944
+ struct tq_struct plug_tq;
33947
+typedef struct raid5_private_data raid5_conf_t;
33949
+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
33952
+ * Our supported algorithms
33954
+#define ALGORITHM_LEFT_ASYMMETRIC 0
33955
+#define ALGORITHM_RIGHT_ASYMMETRIC 1
33956
+#define ALGORITHM_LEFT_SYMMETRIC 2
33957
+#define ALGORITHM_RIGHT_SYMMETRIC 3
33960
+#define EVMS_MD_RAID5_INIT_IO 1
33962
+typedef struct raid5_ioctl_init_io_s {
33964
+ evms_sector_t lsn;
33965
+ evms_sector_t nr_sects;
33967
+} raid5_ioctl_init_io_t;
33969
diff -Naur linux-2002-03-28/include/linux/evms/evms_snapshot.h evms-2002-03-28/include/linux/evms/evms_snapshot.h
33970
--- linux-2002-03-28/include/linux/evms/evms_snapshot.h Wed Dec 31 18:00:00 1969
33971
+++ evms-2002-03-28/include/linux/evms/evms_snapshot.h Thu Dec 6 18:42:08 2001
33973
+/* -*- linux-c -*- */
33976
+ * Copyright (c) International Business Machines Corp., 2000
33978
+ * This program is free software; you can redistribute it and/or modify
33979
+ * it under the terms of the GNU General Public License as published by
33980
+ * the Free Software Foundation; either version 2 of the License, or
33981
+ * (at your option) any later version.
33983
+ * This program is distributed in the hope that it will be useful,
33984
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
33985
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33986
+ * the GNU General Public License for more details.
33988
+ * You should have received a copy of the GNU General Public License
33989
+ * along with this program; if not, write to the Free Software
33990
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33993
+ * linux/include/linux/evms_snapshot.h
33995
+ * EVMS Snapshot Feature kernel header file
33999
+#ifndef __EVMS_SNAPSHOT_INCLUDED__
34000
+#define __EVMS_SNAPSHOT_INCLUDED__
34002
+#define EVMS_SNAPSHOT_VERSION_MAJOR 2
34003
+#define EVMS_SNAPSHOT_VERSION_MINOR 0
34004
+#define EVMS_SNAPSHOT_VERSION_PATCHLEVEL 0
34006
+#define EVMS_SNAPSHOT_FEATURE_ID 104
34008
+#define EVMS_SNAPSHOT_SIGNATURE 0x536e4170 // SnAp
34009
+#define MAX_HASH_CHAIN_ENTRIES 10
34011
+#define EVMS_SNAPSHOT 0x01 // Status flags
34012
+#define EVMS_SNAPSHOT_ORG 0x02
34013
+#define EVMS_SNAPSHOT_DISABLED 0x04
34014
+#define EVMS_SNAPSHOT_FULL 0x08
34015
+#define EVMS_SNAPSHOT_QUIESCED 0x10
34016
+#define EVMS_SNAPSHOT_WRITEABLE 0x20
34018
+ // option definitions
34019
+#define SNAP_OPTION_ORG_VOLUME_NAME "original" // original volume
34020
+#define SNAP_OPTION_ORG_VOLUME_INDEX 0 // original volume
34021
+#define SNAP_OPTION_SNAPSHOT_NAME "snapshot" // snapshot volume
34022
+#define SNAP_OPTION_SNAPSHOT_INDEX 1 // snapshot volume
34023
+#define SNAP_OPTION_CHUNKSIZE_NAME "chunksize" // chunksize
34024
+#define SNAP_OPTION_CHUNKSIZE_INDEX 2 // chunksize
34025
+#define SNAP_OPTION_WRITEABLE_NAME "writeable" // writeable snapshot
34026
+#define SNAP_OPTION_WRITEABLE_INDEX 3 // writeable snapshot
34028
+#define SNAPSHOT_DEFAULT_CHUNK_SIZE 128 //sectors
34029
+#define SNAPSHOT_MIN_CHUNK_SIZE 16 // 8k
34030
+#define SNAPSHOT_MAX_CHUNK_SIZE 2048 // = 1Meg
34031
+#define SNAPSHOT_CHUNK_BUFFER_SIZE 128 // copy buffer
34033
+#define SNAPSHOT_QUERY_PERCENT_FULL 1 // ioctl internal command to query percent full
34035
+#define SECTOR_SIZE 512
34037
+// description of on disk meta data sector for snapshot feature
34038
+typedef struct _snapshot_metadata {
34039
+/* 0*/ u_int32_t signature;
34040
+/* 4*/ u_int32_t CRC;
34041
+/* 8*/ evms_version_t version; /* structure version */
34042
+/*12*/ u_int32_t flags;
34043
+/*16*/ char original_volume[128];
34044
+/*144*/ u_int64_t original_size;
34045
+/*152*/ u_int64_t lba_of_COW_table;
34046
+/*160*/ u_int64_t lba_of_first_chunk;
34047
+/*168*/ u_int32_t chunk_size; // in sectors
34048
+/*172*/ u_int32_t total_chunks;
34049
+} snapshot_metadata_t;
34054
+// Entries in the snapshot remapping structure
34055
+typedef struct _snapshot_hash_entry {
34056
+ unsigned long long org_chunk;
34057
+ unsigned long long snap_chunk;
34058
+ struct _snapshot_hash_entry * next;
34059
+ struct _snapshot_hash_entry * prev;
34060
+} snapshot_hash_entry_t;
34063
+typedef struct _snapshot_volume {
34064
+ evms_logical_node_t * logical_node; // node below us
34065
+ unsigned long chunk_size; // Sectors
34066
+ unsigned long chunk_shift; // shift value for chunk size
34067
+ unsigned long num_chunks; // in this volume
34068
+ unsigned long next_cow_entry; // Index into current COW table
34069
+ unsigned long long current_cow_sector; // LOGICAL sector of current COW table
34070
+ unsigned long next_free_chunk; // index of next free chunk (not LBA!)
34071
+ u_int64_t cow_table[64]; // Pointer to one sector's worth of COW tables
34072
+ unsigned long hash_table_size; // size of the hash table for the remap
34073
+ unsigned long flags; // status flags
34074
+ snapshot_hash_entry_t ** snapshot_map; // array of remapped chunks
34075
+ struct _snapshot_volume * snapshot_next; // Linked list of volumes snapshotting this original
34076
+ struct _snapshot_volume * snapshot_org; // Pointer to volume being snapshotted
34077
+ struct semaphore snap_semaphore; // Semaphore for locking of snapshots
34078
+ unsigned char * chunk_data_buffer; // Buffer for reading data when doing a copy-on-write
34079
+} snapshot_volume_t;
34082
+typedef struct _snapshot_volume {
34083
+ storage_object_t * object; // our exported object
34084
+ storage_object_t * child_object; // our child object
34085
+ unsigned long chunk_size; // Sectors
34086
+ unsigned long num_chunks; // in this volume
34087
+ unsigned long next_cow_entry; // Index into current COW table
34088
+ unsigned long long current_cow_sector; // LOGICAL sector of current COW table
34089
+ unsigned long next_free_chunk; // index of next free chunk (not LBA!)
34090
+ u_int64_t cow_table[64]; // Pointer to one sector's worth of COW tables
34091
+ unsigned long hash_table_size; // size of the hash table for the remap
34092
+ unsigned long flags; // status flags
34093
+// snapshot_hash_entry_t ** snapshot_map; // array of remapped chunks
34094
+ struct _snapshot_volume * snapshot_next; // Linked list of volumes snapshotting this original
34095
+ struct _snapshot_volume * snapshot_org; // Pointer to volume being snapshotted
34096
+// struct semaphore snap_semaphore; // Semaphore for locking of snapshots
34097
+// unsigned char * chunk_data_buffer; // Buffer for reading data when doing a copy-on-write
34098
+ snapshot_metadata_t meta_data; // copy of metadata if not original
34099
+} snapshot_volume_t;
34104
diff -Naur linux-2002-03-28/include/linux/evms/evms_user.h evms-2002-03-28/include/linux/evms/evms_user.h
34105
--- linux-2002-03-28/include/linux/evms/evms_user.h Wed Dec 31 18:00:00 1969
34106
+++ evms-2002-03-28/include/linux/evms/evms_user.h Wed May 16 13:40:56 2001
34108
+/* -*- linux-c -*- */
34111
+ * Copyright (c) International Business Machines Corp., 2000
34113
+ * This program is free software; you can redistribute it and/or modify
34114
+ * it under the terms of the GNU General Public License as published by
34115
+ * the Free Software Foundation; either version 2 of the License, or
34116
+ * (at your option) any later version.
34118
+ * This program is distributed in the hope that it will be useful,
34119
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
34120
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
34121
+ * the GNU General Public License for more details.
34123
+ * You should have received a copy of the GNU General Public License
34124
+ * along with this program; if not, write to the Free Software
34125
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
34128
+ * linux/include/linux/evms_user.h
34130
+ * EVMS (master) user header file
34134
+#include <linux/evms/evms_common.h>
34135
+#include <linux/evms/evms_ioctl.h>
34136
diff -Naur linux-2002-03-28/include/linux/evms/evms_xor.h evms-2002-03-28/include/linux/evms/evms_xor.h
34137
--- linux-2002-03-28/include/linux/evms/evms_xor.h Wed Dec 31 18:00:00 1969
34138
+++ evms-2002-03-28/include/linux/evms/evms_xor.h Mon Feb 4 09:58:43 2002
34143
+#include <linux/evms/evms_md.h>
34145
+#define MAX_XOR_BLOCKS 5
34147
+extern void evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr);
34149
+struct xor_block_template {
34150
+ struct xor_block_template *next;
34151
+ const char *name;
34153
+ void (*do_2)(unsigned long, unsigned long *, unsigned long *);
34154
+ void (*do_3)(unsigned long, unsigned long *, unsigned long *,
34155
+ unsigned long *);
34156
+ void (*do_4)(unsigned long, unsigned long *, unsigned long *,
34157
+ unsigned long *, unsigned long *);
34158
+ void (*do_5)(unsigned long, unsigned long *, unsigned long *,
34159
+ unsigned long *, unsigned long *, unsigned long *);