~ubuntu-branches/ubuntu/gutsy/evms/gutsy

« back to all changes in this revision

Viewing changes to kernel/evms-1.0.0-linux-2.4.patch

  • Committer: Bazaar Package Importer
  • Author(s): Steinar H. Gunderson
  • Date: 2006-09-14 19:32:30 UTC
  • mfrom: (2.1.13 edgy)
  • Revision ID: james.westby@ubuntu.com-20060914193230-4b1pmy0coqk81sqa
Tags: 2.5.5-18
* Apply patches from upstream:
  * cli_query_segfault.patch, fixes a segfault in the CLI when doing a
    query.
  * cli_reload_options.patch, reloads the right option descriptors after
    a change.
  * ntfs_unmkfs.patch, fixes a bug in the wiping of NTFS file systems.
  * raid5_remove_spare_fix.patch + raid5_remove_spare_fix_2.patch, lets the
    user remove a spare if resync does not run.
  * raid5_algorithm.patch, makes EVMS heed the parity algorithm the user
    selects when creating a RAID-5 array.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
diff -Naur linux-2002-03-28/drivers/evms/AIXlvm_vge.c evms-2002-03-28/drivers/evms/AIXlvm_vge.c
2
 
--- linux-2002-03-28/drivers/evms/AIXlvm_vge.c  Wed Dec 31 18:00:00 1969
3
 
+++ evms-2002-03-28/drivers/evms/AIXlvm_vge.c   Thu Mar 28 13:53:07 2002
4
 
@@ -0,0 +1,2540 @@
5
 
+/* -*- linux-c -*- */
6
 
+
7
 
+/*
8
 
+ *
9
 
+ *
10
 
+ *   Copyright (c) International Business Machines  Corp., 2000
11
 
+ *
12
 
+ *   This program is free software;  you can redistribute it and/or modify
13
 
+ *   it under the terms of the GNU General Public License as published by
14
 
+ *   the Free Software Foundation; either version 2 of the License, or
15
 
+ *   (at your option) any later version.
16
 
+ *
17
 
+ *   This program is distributed in the hope that it will be useful,
18
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
19
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
20
 
+ *   the GNU General Public License for more details.
21
 
+ *
22
 
+ *   You should have received a copy of the GNU General Public License
23
 
+ *   along with this program;  if not, write to the Free Software
24
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
 
+ *
26
 
+ *
27
 
+ */
28
 
+/*
29
 
+ * linux/drivers/evms/AIXlvm_vge.c
30
 
+ *
31
 
+ * EVMS AIX LVM Volume Group Emulator
32
 
+ *
33
 
+ *
34
 
+ */
35
 
+
36
 
+#define EVMS_DEBUG     1
37
 
+#define EVMS_AIX_DEBUG 1
38
 
+
39
 
+#define AIX_COMMON_SERVICES_MAJOR        0  // Required common services levels for the AIX kernel plugin
40
 
+#define AIX_COMMON_SERVICES_MINOR        5  // These must be incremented if new function is added to common
41
 
+#define AIX_COMMON_SERVICES_PATCHLEVEL   0  // services and the AIX kernel plugin uses the new function.
42
 
+#define AIX_INCREMENT_REQUEST            1
43
 
+#define AIX_DECREMENT_REQUEST           -1
44
 
+
45
 
+
46
 
+#include <linux/module.h>
47
 
+#include <linux/kernel.h>
48
 
+#include <linux/config.h>
49
 
+
50
 
+#include <linux/genhd.h>
51
 
+#include <linux/major.h>
52
 
+#include <linux/string.h>
53
 
+#include <linux/blk.h>
54
 
+#include <linux/init.h>
55
 
+#include <linux/slab.h>
56
 
+
57
 
+#include <linux/evms/evms_kernel.h>
58
 
+#include <linux/evms/evms_aix.h>
59
 
+#include <asm/system.h>
60
 
+#include <asm/uaccess.h>
61
 
+
62
 
+#include <linux/sched.h>
63
 
+#include <linux/smp_lock.h>
64
 
+#include <linux/locks.h>
65
 
+#include <linux/delay.h>
66
 
+#include <linux/reboot.h>
67
 
+#include <linux/completion.h>
68
 
+#include <linux/vmalloc.h>
69
 
+
70
 
+#ifdef EVMS_AIX_DEBUG
71
 
+static int AIX_volume_group_dump(void);
72
 
+#endif
73
 
+
74
 
+static aix_volume_group_t      * AIXVolumeGroupList=NULL;
75
 
+static evms_thread_t           * AIX_mirror_thread;
76
 
+static evms_pool_mgmt_t        * AIX_BH_list_pool = NULL;
77
 
+static aix_mirror_bh_t         * AIX_retry_list = NULL;
78
 
+static aix_mirror_bh_t         ** AIX_retry_tail = NULL;
79
 
+static spinlock_t               AIX_retry_list_lock = SPIN_LOCK_UNLOCKED;
80
 
+
81
 
+// Plugin API prototypes
82
 
+
83
 
+static void AIXiod (void *data);
84
 
+static int  discover_aix(evms_logical_node_t ** evms_logical_disk_head);
85
 
+static int  discover_volume_groups( evms_logical_node_t ** );
86
 
+static int  discover_logical_volumes( void );
87
 
+static int  end_discover_aix(evms_logical_node_t ** evms_logical_disk_head);
88
 
+static void read_aix(evms_logical_node_t     * node,  eio_t      * eio);
89
 
+static void write_aix(evms_logical_node_t     * node, eio_t      * eio);
90
 
+static int  ioctl_aix(   evms_logical_node_t     * logical_node,
91
 
+                        struct inode            * inode,
92
 
+                        struct file             * file,
93
 
+                        unsigned int            cmd,
94
 
+                        unsigned long           arg);
95
 
+static int  AIX_remap_sector(evms_logical_node_t        * node,
96
 
+                            evms_sector_t           org_sector,                     // logical sector to remap
97
 
+                            evms_sector_t           size,                               // size (in sectors) of request to remap
98
 
+                            evms_sector_t           * new_sector,                   // remapped sector
99
 
+                            evms_sector_t           * new_size,                         // new size (in sectors)
100
 
+                            partition_list_entry_t  ** partition,               // new node for which new_sector is relative
101
 
+                            u_int32_t               * le,
102
 
+                            u_int32_t               * offset_in_le);
103
 
+
104
 
+static int validate_build_volume_group_disk_info(evms_logical_node_t   * logical_node,
105
 
+                                                AIXlvm_rec_t                * AIXlvm);
106
 
+
107
 
+static int add_VG_data_to_VG_list ( evms_logical_node_t   * logical_node, 
108
 
+                                   aix_volume_group_t  * new_group,
109
 
+                                   short int             pvNum);
110
 
+static int add_PV_to_volume_group( aix_volume_group_t  * group,
111
 
+                                  evms_logical_node_t * evms_partition,
112
 
+                                  int                   pvNum);
113
 
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t   * logical_node,
114
 
+                                                   AIXlvm_rec_t          * AIXlvm);
115
 
+
116
 
+static int  AIX_update_volume_group(aix_volume_group_t    * AIXVGLptr,
117
 
+                                   evms_logical_node_t   * logical_node,
118
 
+                                   AIXlvm_rec_t          * AIXlvm);
119
 
+
120
 
+static int  AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node);
121
 
+
122
 
+
123
 
+static int  export_volumes( evms_logical_node_t ** evms_logical_disk_head );
124
 
+static int  lvm_cleanup( void );
125
 
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2);
126
 
+static int  build_pe_maps( aix_volume_group_t * volume_group);
127
 
+
128
 
+static aix_logical_volume_t * new_logical_volume(lv_entries         *AIXlvent, 
129
 
+                                                aix_volume_group_t *group, 
130
 
+                                                char               *lv_name,
131
 
+                                                u_int32_t           stripesize);
132
 
+
133
 
+static int  check_log_volume_and_pe_maps( aix_volume_group_t * group );
134
 
+static int  check_volume_groups(void);
135
 
+static int  init_io_aix( evms_logical_node_t     * node,
136
 
+                        int                                   io_flag,  /* 0=read, 1=write*/
137
 
+                        evms_sector_t             sect_nr,      /* disk LBA */
138
 
+                        evms_sector_t             num_sects,    /* # of sectors */
139
 
+                        void                        * buf_addr );       /* buffer address */
140
 
+
141
 
+
142
 
+static int delete_logical_volume( aix_logical_volume_t * volume );
143
 
+static int  delete_aix_node( evms_logical_node_t * logical_node );
144
 
+static int deallocate_volume_group( aix_volume_group_t * group );
145
 
+
146
 
+static void AIX_handle_read_mirror_drives(struct buffer_head      * bh,
147
 
+                                         int                      uptodate);
148
 
+
149
 
+static void AIX_handle_write_mirror_drives(struct buffer_head      * bh,
150
 
+                                          int                      uptodate);
151
 
+
152
 
+static void aix_notify_cache_ctor(void * foo, kmem_cache_t * cachep, unsigned long flags);
153
 
+
154
 
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t   * node,
155
 
+                                      eio_t                 * eio,
156
 
+                                      uint32_t                mirror_copies,
157
 
+                                      evms_sector_t           org_sector,
158
 
+                                      int                     cmd);
159
 
+
160
 
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t   * node,
161
 
+                                      evms_logical_node_t   * node2,
162
 
+                                      evms_logical_node_t   * node3,
163
 
+                                      eio_t                 * eio,
164
 
+                                      uint32_t                mirror_copies,
165
 
+                                      evms_sector_t           new_sector2,
166
 
+                                      evms_sector_t           new_sector3);
167
 
+
168
 
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2);
169
 
+//****************************************************************************************************
170
 
+
171
 
+/* END of PROTOTYPES */
172
 
+
173
 
+#define GET_PHYSICAL_PART_SIZE(v1) (1 << v1)
174
 
+
175
 
+#define AIX_PVH_DATA_PSN(vgda_psn, pvNum)  (vgda_psn + PSN_PPH_OFFSET + ((pvNum -1) * PSN_PVH_INCREMENT))
176
 
+
177
 
+#define COMPARE_TIMESTAMPS(t1, t2)     ( (t1).tv_sec  == (t2).tv_sec && \
178
 
+                                         (t1).tv_nsec == (t2).tv_nsec )
179
 
+
180
 
+#define COMPARE_UNIQUE_IDS(id1, id2)   ( (id1).word1 == (id2).word1 && \
181
 
+                                         (id1).word2 == (id2).word2 && \
182
 
+                                         (id1).word3 == (id2).word3 && \
183
 
+                                         (id1).word4 == (id2).word4 )
184
 
+
185
 
+#define AIX_PV_STATE_VALID              0      // Both VGDAs are valid and match.
186
 
+#define AIX_PV_STATE_FIRST_VGDA                 1      // Only the first VGDA is valid.
187
 
+#define AIX_PV_STATE_SECOND_VGDA        2      // Only the second VGDA is valid.
188
 
+#define AIX_PV_STATE_EITHER_VGDA       -1      // Both VGDAs are valid, but do not match each other.
189
 
+#define AIX_PV_STATE_INVALID        -2  // We're in an invalid state but there's more PVs in this group
190
 
+
191
 
+
192
 
+#ifndef EVMS_AIX_DEBUG
193
 
+       #define AIX_VOLUME_GROUP_DUMP()
194
 
+#else
195
 
+       #define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \
196
 
+                                AIX_volume_group_dump()
197
 
+#endif
198
 
+
199
 
+// Global LVM data structures
200
 
+
201
 
+static evms_plugin_function_table_t AIXlvm_function_table = {
202
 
+       discover: &discover_aix,
203
 
+       end_discover: &end_discover_aix,
204
 
+       delete  : &delete_aix_node,
205
 
+       read    : &read_aix,
206
 
+       write   : &write_aix,
207
 
+       init_io : &init_io_aix,
208
 
+       ioctl   : &ioctl_aix
209
 
+};
210
 
+
211
 
+static evms_plugin_header_t plugin_header = {
212
 
+       id              : SetPluginID(
213
 
+                                    IBM_OEM_ID,
214
 
+                                    EVMS_REGION_MANAGER,            // Region Manager class
215
 
+                                    3 ),                            // Unique ID within VGEs
216
 
+       version         : { 
217
 
+               major      : 1, 
218
 
+               minor      : 0, 
219
 
+               patchlevel : 0 
220
 
+       },                  // Major, Minor, Patchlevel
221
 
+       required_common_services_version: {
222
 
+               major      : AIX_COMMON_SERVICES_MAJOR,
223
 
+               minor      : AIX_COMMON_SERVICES_MINOR,
224
 
+               patchlevel : AIX_COMMON_SERVICES_PATCHLEVEL
225
 
+       },
226
 
+       function_table  : &AIXlvm_function_table               // Function table for this plugin
227
 
+};
228
 
+
229
 
+
230
 
+
231
 
+
232
 
+/*
233
 
+ * Function: remap sector 
234
 
+ *  Common function to remap volume lba to partition lba in appropriate PE
235
 
+ */
236
 
+static int AIX_remap_sector(evms_logical_node_t * node,
237
 
+                           evms_sector_t           org_sector,             // logical sector to remap
238
 
+                           evms_sector_t           size,                   // size (in sectors) of request to remap
239
 
+                           evms_sector_t           * new_sector,           // remapped sector
240
 
+                           evms_sector_t           * new_size,             // new size (in sectors)
241
 
+                           partition_list_entry_t  ** partition,   // new node for which new_sector is relative
242
 
+                           u_int32_t               * le,
243
 
+                           u_int32_t               * offset_in_le)
244
 
+{
245
 
+       aix_logical_volume_t    * volume;
246
 
+
247
 
+       u_int32_t               sectors_per_stripe;
248
 
+       u_int32_t       partition_to_use;
249
 
+       u_int32_t               column;
250
 
+       u_int32_t               stripe_in_column;
251
 
+
252
 
+       u_int32_t                 org_sector32; // Until striping is 64-bit enabled.
253
 
+
254
 
+       volume = (aix_logical_volume_t *) node->instance_data; 
255
 
+
256
 
+#ifdef EVMS_DEBUG
257
 
+       LOG_DEBUG("-- %s volume:%p lv:%d size:%Ld Name:%s\n",__FUNCTION__, volume,volume->lv_number,size,volume->name);
258
 
+       LOG_DEBUG(" node %p node_name [%s] org_sector:%Ld\n",node, node->name, org_sector);
259
 
+       LOG_DEBUG(" mirror_copies:%d volume->lv_size:%Ld\n",volume->mirror_copies,volume->lv_size);
260
 
+#endif
261
 
+
262
 
+       org_sector32 = org_sector;
263
 
+
264
 
+       *(new_size) = size;
265
 
+
266
 
+       // Check if volume is striped. Reset the size if the request
267
 
+       // crosses a stripe boundary.
268
 
+       if ( volume->stripes > 1 ) {
269
 
+#ifdef EVMS_DEBUG
270
 
+               LOG_DEBUG(" *** STRIPED ***\n");
271
 
+               LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n",volume->stripe_size, org_sector32, volume->stripes);
272
 
+#endif
273
 
+
274
 
+               *(le)              = org_sector >> volume->pe_size_shift;       // 64-bit safe
275
 
+               *(offset_in_le)    = org_sector & (volume->pe_size - 1);        // 64-bit safe
276
 
+
277
 
+#ifdef EVMS_DEBUG
278
 
+               LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n",*(le), *(offset_in_le));
279
 
+#endif
280
 
+
281
 
+               sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE;
282
 
+               partition_to_use   = (org_sector32 / sectors_per_stripe) % volume->stripes;
283
 
+               stripe_in_column   = ((((org_sector32 / volume->stripe_size) / volume->stripes) * volume->stripe_size) + (org_sector32 % sectors_per_stripe)); 
284
 
+               column             = ((org_sector32 / sectors_per_stripe) / volume->stripes) * sectors_per_stripe;
285
 
+
286
 
+#ifdef EVMS_DEBUG
287
 
+               LOG_DEBUG("offset_in_le:%d org_sector:%Ld pe_shift:%d stripe_shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift,volume->stripe_size_shift);
288
 
+
289
 
+               LOG_DEBUG(" org_sector:%d  sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n",org_sector32, sectors_per_stripe, partition_to_use,stripe_in_column,column);
290
 
+               LOG_DEBUG(" offset_in_le + size:%Ld volume->pe_size:%d volume->lv_size:%Ld\n",(*(offset_in_le)+size),volume->pe_size ,volume->lv_size);
291
 
+#endif
292
 
+
293
 
+               if ( *(offset_in_le) + size > volume->pe_size ) {
294
 
+                       *new_size = volume->pe_size - *(offset_in_le);
295
 
+                       LOG_DEBUG("  new_size %Ld\n",*new_size);
296
 
+               }
297
 
+
298
 
+       }
299
 
+       // Non-striped volume. Just find LE and offset. Reset the size
300
 
+       // if the request crosses an LE boundary.
301
 
+       else {
302
 
+#ifdef EVMS_DEBUG
303
 
+               LOG_DEBUG(" *** NON-STRIPED ***\n");
304
 
+#endif
305
 
+
306
 
+               *(le)            = org_sector >> volume->pe_size_shift;  // 64-bit safe
307
 
+               *(offset_in_le)  = org_sector & (volume->pe_size - 1);   // 64-bit safe
308
 
+
309
 
+       }
310
 
+
311
 
+#ifdef EVMS_DEBUG
312
 
+       LOG_DEBUG(" offset_in_le:%d org_sector:%Ld shift:%d\n",*(offset_in_le), org_sector, volume->pe_size_shift);
313
 
+
314
 
+       if (*(le) >= volume->num_le) {
315
 
+               LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n",*(le),volume->num_le);
316
 
+               return EINVAL;
317
 
+       }
318
 
+#endif
319
 
+
320
 
+       *(new_sector)       = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le);
321
 
+       *(partition)        = volume->le_to_pe_map[*(le)].owning_pv;
322
 
+
323
 
+#ifdef EVMS_DEBUG
324
 
+       LOG_DEBUG(" new_sector:%Ld\n", *(new_sector));
325
 
+       LOG_DEBUG(" Owning Part %p\n",*(partition));
326
 
+       LOG_DEBUG(" End %s\n",__FUNCTION__);
327
 
+#endif
328
 
+
329
 
+       return(0);
330
 
+}
331
 
+
332
 
+
333
 
+/*
334
 
+ * Function: read_aix
335
 
+ */
336
 
+static void read_aix(evms_logical_node_t     * node,
337
 
+                    eio_t      * eio)
338
 
+{
339
 
+       partition_list_entry_t  * partition;
340
 
+       evms_sector_t                   org_sector;
341
 
+       evms_sector_t                   new_sector;
342
 
+       evms_sector_t                   new_size;
343
 
+       aix_logical_volume_t    * volume;
344
 
+       aix_mirror_bh_t         * tmp_bh;
345
 
+       u_int32_t                 le, offset_in_le,count;
346
 
+
347
 
+
348
 
+       volume = (aix_logical_volume_t *) node->instance_data; 
349
 
+#ifdef EVMS_DEBUG
350
 
+       LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
351
 
+#endif
352
 
+
353
 
+
354
 
+#ifdef EVMS_DEBUG
355
 
+       LOG_DEBUG(" node->total_vsectors:%Lu\n",node->total_vsectors);
356
 
+       LOG_DEBUG(" rsector:%Lu rsize:%Lu node_flags:%u\n",eio->rsector,eio->rsize,node->flags);
357
 
+#endif
358
 
+
359
 
+       // Check if I/O goes past end of logical volume.
360
 
+       if ( eio->rsector + eio->rsize > node->total_vsectors ) {
361
 
+               LOG_CRITICAL(" read_aix ERROR %d\n",__LINE__);
362
 
+               EVMS_IO_ERROR(eio);
363
 
+               return;
364
 
+       }
365
 
+
366
 
+
367
 
+       // Logical-to-physical remapping.
368
 
+       if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) || 
369
 
+            (!partition || !new_sector)) {
370
 
+               LOG_CRITICAL(" read_aix bh: ERROR %d\n",__LINE__);
371
 
+               EVMS_IO_ERROR(eio);
372
 
+               return;
373
 
+       }
374
 
+
375
 
+       org_sector   = eio->rsector;
376
 
+       eio->rsector = new_sector;
377
 
+       eio->rsize   = new_size;  
378
 
+
379
 
+#ifdef EVMS_DEBUG
380
 
+       LOG_DEBUG(" read_aix Mirror_Copies:%d\n",volume->mirror_copies);
381
 
+#endif
382
 
+
383
 
+       if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
384
 
+
385
 
+
386
 
+               tmp_bh = AIX_alloc_rbh(node, eio, 1, new_sector, AIX_LV_READ);
387
 
+
388
 
+               if (!tmp_bh) {
389
 
+                       EVMS_IO_ERROR(eio);
390
 
+                       return;
391
 
+               }
392
 
+
393
 
+               if (volume->le_to_pe_map_mir1) {
394
 
+                       tmp_bh->mir_node1   = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
395
 
+                       tmp_bh->mir_sector1 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
396
 
+               }
397
 
+
398
 
+               if (volume->mirror_copies == AIX_MAX_MIRRORS) {
399
 
+                       tmp_bh->mir_node2 = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
400
 
+                       tmp_bh->mir_sector2 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
401
 
+               }
402
 
+
403
 
+               if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
404
 
+                       EVMS_IO_ERROR(eio);
405
 
+                       return;
406
 
+               }
407
 
+
408
 
+               R_IO(partition->logical_node, &tmp_bh->eio); 
409
 
+       } else {
410
 
+
411
 
+               R_IO(partition->logical_node, eio);
412
 
+       }
413
 
+
414
 
+
415
 
+#ifdef EVMS_DEBUG
416
 
+       LOG_DEBUG(" ***** %s ***** returning\n",__FUNCTION__);
417
 
+#endif
418
 
+       return;     
419
 
+}
420
 
+
421
 
+
422
 
+/*
423
 
+ * Function: write_aix
424
 
+ */
425
 
+static void write_aix(   evms_logical_node_t     * node,
426
 
+                        eio_t      * eio)
427
 
+{
428
 
+       partition_list_entry_t  * partition;
429
 
+       evms_sector_t           new_sector, new_sector2 = 0, new_sector3 = 0;
430
 
+       evms_sector_t           org_sector;
431
 
+       evms_sector_t           new_size;
432
 
+       aix_logical_volume_t    * volume;
433
 
+       aix_mirror_bh_t         * tmp_bh;
434
 
+       evms_logical_node_t     * node2 = NULL, *node3 = NULL;
435
 
+       u_int32_t                 le, offset_in_le, count;
436
 
+
437
 
+       volume = (aix_logical_volume_t *) node->instance_data; 
438
 
+
439
 
+#ifdef EVMS_DEBUG
440
 
+       LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n",__FUNCTION__,eio->bh, volume->mirror_iterations);
441
 
+       LOG_DEBUG(" write_aix rsector:%Lu rsize:%Lu\n",eio->rsector,eio->rsize);
442
 
+       LOG_DEBUG(" write_aix total_sectors:%Lu\n",node->total_vsectors);
443
 
+#endif
444
 
+
445
 
+       if (volume->lv_access & EVMS_LV_INCOMPLETE) {   //No writes allowed on incomplete volumes
446
 
+               LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n",__LINE__);
447
 
+               EVMS_IO_ERROR(eio);
448
 
+               return;
449
 
+       }
450
 
+
451
 
+
452
 
+       // Check if I/O goes past end of logical volume.
453
 
+       if ( eio->rsector + eio->rsize > node->total_vsectors ) {
454
 
+               LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
455
 
+               EVMS_IO_ERROR(eio);
456
 
+               return;
457
 
+       }
458
 
+
459
 
+       // Logical-to-Physical remapping
460
 
+       if ( AIX_remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &partition, &le, &offset_in_le) ||
461
 
+            (!new_sector || !partition)) {
462
 
+               LOG_CRITICAL(" write_aix ERROR %d\n",__LINE__);
463
 
+               EVMS_IO_ERROR(eio);
464
 
+               return;
465
 
+       }
466
 
+
467
 
+       org_sector = eio->rsector; 
468
 
+       eio->rsector = new_sector;
469
 
+       eio->rsize   = new_size;  
470
 
+
471
 
+#ifdef EVMS_DEBUG
472
 
+       LOG_DEBUG(" write_aix  Mirror_Copies:%d\n", volume->mirror_copies);
473
 
+#endif
474
 
+
475
 
+
476
 
+       if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
477
 
+
478
 
+               if (volume->le_to_pe_map_mir1) {
479
 
+                       new_sector2 = volume->le_to_pe_map_mir1[le].pe_sector_offset + offset_in_le;
480
 
+                       node2       = volume->le_to_pe_map_mir1[le].owning_pv->logical_node;
481
 
+               }
482
 
+
483
 
+               if (volume->mirror_copies == AIX_MAX_MIRRORS) {
484
 
+
485
 
+                       new_sector3 = volume->le_to_pe_map_mir2[le].pe_sector_offset + offset_in_le;
486
 
+                       node3       = volume->le_to_pe_map_mir2[le].owning_pv->logical_node;
487
 
+               }
488
 
+
489
 
+               tmp_bh = AIX_alloc_wbh(partition->logical_node, node2, node3, eio, volume->mirror_copies, new_sector2, new_sector3);
490
 
+
491
 
+               if (!tmp_bh) {
492
 
+                       EVMS_IO_ERROR(eio);
493
 
+                       return;
494
 
+               }
495
 
+               tmp_bh->node = node;
496
 
+
497
 
+               tmp_bh = tmp_bh->mirror_bh_list;
498
 
+
499
 
+               if (evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_dev , AIX_INCREMENT_REQUEST, &count)) {
500
 
+                       EVMS_IO_ERROR(eio);
501
 
+                       // free memory here
502
 
+                       return;
503
 
+               }
504
 
+
505
 
+               W_IO(tmp_bh->node, &tmp_bh->eio);
506
 
+
507
 
+               tmp_bh = tmp_bh->next_r1;
508
 
+
509
 
+               if (tmp_bh) {
510
 
+                       W_IO(tmp_bh->node, &tmp_bh->eio);
511
 
+                       tmp_bh = tmp_bh->next_r1;
512
 
+               }
513
 
+
514
 
+               if (tmp_bh) {
515
 
+                       W_IO(tmp_bh->node, &tmp_bh->eio);
516
 
+               }
517
 
+
518
 
+       } else {
519
 
+
520
 
+               W_IO(partition->logical_node, eio);
521
 
+       }
522
 
+
523
 
+
524
 
+#ifdef EVMS_DEBUG
525
 
+       LOG_DEBUG(" ***** %s returning *****\n",__FUNCTION__);
526
 
+#endif
527
 
+       return;     
528
 
+}
529
 
+
530
 
+
531
 
+/*
532
 
+ * Function: ioctl_aix
533
 
+ *
534
 
+ */
535
 
+static int ioctl_aix(   evms_logical_node_t     * logical_node,
536
 
+                       struct inode            * inode,
537
 
+                       struct file             * file,
538
 
+                       unsigned int            cmd,
539
 
+                       unsigned long           arg)
540
 
+{
541
 
+       aix_logical_volume_t    * volume = (aix_logical_volume_t*)(logical_node->instance_data);
542
 
+       int                     rc = 0;
543
 
+
544
 
+       LOG_EXTRA(" Ioctl %u\n",cmd);
545
 
+
546
 
+
547
 
+       switch (cmd) {
548
 
+       
549
 
+       case HDIO_GETGEO:
550
 
+               {
551
 
+                       // Fixed geometry for all LVM volumes 
552
 
+                       unsigned char heads = 64;
553
 
+                       unsigned char sectors = 32;
554
 
+                       long start = 0;
555
 
+                       struct hd_geometry *hd = (struct hd_geometry *)arg;
556
 
+                       short cylinders;
557
 
+                       cylinders = logical_node->total_vsectors;
558
 
+                       cylinders = (cylinders / heads) / sectors;
559
 
+
560
 
+                       if (hd == NULL) {
561
 
+                               return EINVAL;
562
 
+                       }
563
 
+
564
 
+                       if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
565
 
+                            copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
566
 
+                            copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
567
 
+                            copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
568
 
+                               return EFAULT;
569
 
+                       }
570
 
+               }
571
 
+               break;
572
 
+
573
 
+       case EVMS_QUIESCE_VOLUME:
574
 
+               break;
575
 
+
576
 
+       case EVMS_GET_DISK_LIST:
577
 
+       case EVMS_CHECK_MEDIA_CHANGE:
578
 
+       case EVMS_REVALIDATE_DISK:
579
 
+       case EVMS_OPEN_VOLUME:
580
 
+       case EVMS_CLOSE_VOLUME:
581
 
+               {
582
 
+                       // These five ioctl all need to be broadcast to all PVs.
583
 
+                       aix_volume_group_t * group = volume->group;
584
 
+                       partition_list_entry_t * partition;
585
 
+                       for ( partition = group->partition_list; partition; partition = partition->next ) {
586
 
+                               rc |= IOCTL(partition->logical_node, inode, file, cmd, arg);
587
 
+                       }
588
 
+               }
589
 
+               break;
590
 
+
591
 
+       default:
592
 
+               // Currently the VGE does not send any ioctl's down to the
593
 
+               // partitions. Which partition would they go to?
594
 
+               rc = EINVAL;
595
 
+       }
596
 
+
597
 
+       return rc;
598
 
+}
599
 
+
600
 
+
601
 
+/*
602
 
+ * Function: init_io_aix
603
 
+ *
604
 
+ */
605
 
+static int init_io_aix( evms_logical_node_t     * node,
606
 
+                       int                     io_flag,        /* 0=read, 1=write*/
607
 
+                       evms_sector_t   sect_nr,        /* disk LBA */
608
 
+                       evms_sector_t           num_sects,      /* # of sectors */
609
 
+                       void                    * buf_addr )    /* buffer address */
610
 
+{
611
 
+       partition_list_entry_t  * partition;
612
 
+       evms_sector_t           new_sector = 0;
613
 
+       evms_sector_t           new_size   = 0;
614
 
+       int                     rc = 0;
615
 
+       u_int32_t               le, offset;
616
 
+
617
 
+       LOG_DEBUG(" ************ init_io_aix() num_sects:%Ld node:%p sect_nr:%Ld\n",num_sects, node, sect_nr);
618
 
+
619
 
+       // Init IO needs to deal with the possibility that a request can come
620
 
+       // in that spans PEs or stripes. This is possible because there is no
621
 
+       // limit on num_sects. To fix this, we loop through AIX_remap_sector and
622
 
+       // INIT_IO until num_sects reaches zero.
623
 
+
624
 
+
625
 
+       while ( num_sects > 0 ) {
626
 
+
627
 
+               if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size,  &partition, &le, &offset) ||
628
 
+                   (!new_sector || !partition)) {
629
 
+                       LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n",__LINE__);
630
 
+                       return -EIO;
631
 
+               }
632
 
+
633
 
+               LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:%Ld new_size:%Ld\n",__LINE__,partition->logical_node, io_flag, new_sector, new_size);
634
 
+
635
 
+               rc = INIT_IO(partition->logical_node, io_flag, new_sector, new_size, buf_addr);
636
 
+               num_sects       -= new_size;
637
 
+               sect_nr         += new_size;
638
 
+               buf_addr        = (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
639
 
+       }
640
 
+
641
 
+       return rc;
642
 
+}
643
 
+
644
 
+/*
645
 
+ * Function: AIXlvm_vge_init
646
 
+ *
647
 
+ */
648
 
+int __init AIXlvm_vge_init(void)
649
 
+{
650
 
+       const char * name = "evms_AIXiod";
651
 
+
652
 
+       LOG_DEBUG(" %s --------\n",__FUNCTION__);
653
 
+
654
 
+       AIX_mirror_thread = evms_cs_register_thread(AIXiod, NULL, name);
655
 
+
656
 
+       MOD_INC_USE_COUNT;
657
 
+       return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
658
 
+}
659
 
+
660
 
+module_init(AIXlvm_vge_init);
661
 
+
662
 
+
663
 
+
664
 
+
665
 
+/********** Required Plugin Functions **********/
666
 
+
667
 
+
668
 
+/*
669
 
+ * Function: discover_aix
670
 
+ *
671
 
+ *  This is the entry point into the LVM discovery process.
672
 
+ */
673
 
+static int discover_aix(evms_logical_node_t ** evms_logical_disk_head)
674
 
+{
675
 
+       int rc = 0, count = 0;
676
 
+
677
 
+       LOG_DEBUG("[%s] discover_volume_groups\n",__FUNCTION__); 
678
 
+
679
 
+       rc = discover_volume_groups(evms_logical_disk_head);
680
 
+
681
 
+       if (rc) {
682
 
+               LOG_ERROR("[%s] discover_volume_groups rc=%d\n",__FUNCTION__ ,rc); 
683
 
+       }
684
 
+
685
 
+       if (AIXVolumeGroupList) {
686
 
+
687
 
+               LOG_DEBUG("[%s] discover_logical_volumes\n",__FUNCTION__); 
688
 
+
689
 
+               rc = discover_logical_volumes();
690
 
+
691
 
+               if (rc) {
692
 
+                       LOG_ERROR("[%s] discover_logical_volumes rc=%d\n",__FUNCTION__ ,rc); 
693
 
+               }
694
 
+
695
 
+
696
 
+               LOG_DEBUG("[%s] export_volumes\n",__FUNCTION__); 
697
 
+
698
 
+               count = export_volumes(evms_logical_disk_head);
699
 
+
700
 
+               LOG_DEBUG("[%s] export_volumes count=%d\n",__FUNCTION__ ,count); 
701
 
+       }
702
 
+
703
 
+       return(count);
704
 
+}
705
 
+
706
 
+
707
 
+
708
 
+static int discover_volume_groups(evms_logical_node_t ** evms_logical_disk_head)
709
 
+{
710
 
+       evms_logical_node_t     * logical_node;
711
 
+       evms_logical_node_t     * next_node;
712
 
+       AIXIPL_REC              * AIXpv;
713
 
+       AIXlvm_rec_t            * AIXlvm; // Temp holder for the LVM on disk rec
714
 
+
715
 
+
716
 
+       LOG_DEBUG(" Begin %s\n", __FUNCTION__); 
717
 
+
718
 
+       if (evms_cs_allocate_memory((void**)&AIXpv, AIX_SECTOR_SIZE)) {
719
 
+               return -ENOMEM;
720
 
+       }
721
 
+
722
 
+       // We'll create at least one volume entry, if we don't find any AIX volumes we'll clean it up later
723
 
+
724
 
+       if (evms_cs_allocate_memory((void**)&AIXlvm, sizeof(AIXlvm_rec_t))) {
725
 
+               evms_cs_deallocate_memory(AIXpv);
726
 
+               return -ENOMEM;
727
 
+       }
728
 
+
729
 
+
730
 
+       for ( logical_node = *evms_logical_disk_head; logical_node; logical_node = next_node ) {
731
 
+
732
 
+               // Grab the next list item in case we remove this partition from the global list.
733
 
+               next_node = logical_node->next;
734
 
+
735
 
+               // Read the first sector and see if it has a valid AIX PV signature.
736
 
+
737
 
+               if ( INIT_IO(logical_node, 0, 0, 1, AIXpv) ) {
738
 
+                       // On an I/O error, continue on to the next
739
 
+                       // partition. The group that this partition
740
 
+                       // belongs to will be incomplete, but we still
741
 
+                       // need to discover any other groups.
742
 
+
743
 
+                       LOG_ERROR(" Error reading PV [%p]\n",logical_node);
744
 
+                       continue;
745
 
+               }
746
 
+
747
 
+
748
 
+               if (AIXpv->IPL_record_id == IPLRECID) {
749
 
+
750
 
+                       // This partition is definitely a PV,
751
 
+                       // but is it part of a valid VG?
752
 
+                       LOG_DEBUG(" DVG removing node from list logical_node %p\n", logical_node); 
753
 
+
754
 
+                       if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) {
755
 
+                               LOG_ERROR(" Error reading PV [%p]\n",logical_node);
756
 
+                               continue;
757
 
+                       }
758
 
+
759
 
+                       if (AIXlvm->lvm_id == AIX_LVM_LVMID) {
760
 
+
761
 
+                               if (validate_build_volume_group_disk_info(logical_node, AIXlvm) ) {
762
 
+                                       // Again, continue on and we'll
763
 
+                                       // clean up later.
764
 
+                                       continue;
765
 
+                               }
766
 
+
767
 
+                               evms_cs_remove_logical_node_from_list( evms_logical_disk_head, logical_node );
768
 
+
769
 
+                       } else {
770
 
+                               LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %ld)\n",AIXlvm->lvm_id);
771
 
+                               continue;
772
 
+                       }
773
 
+               } else {
774
 
+                       LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n",logical_node);
775
 
+               }
776
 
+       }
777
 
+
778
 
+       AIX_VOLUME_GROUP_DUMP();
779
 
+
780
 
+       if (check_volume_groups()) {
781
 
+               return -EINVAL;
782
 
+       }
783
 
+
784
 
+       evms_cs_deallocate_memory(AIXpv);
785
 
+       evms_cs_deallocate_memory(AIXlvm);
786
 
+
787
 
+       return 0;
788
 
+}
789
 
+
790
 
+
791
 
+/*
792
 
+ * Function:  validate_build_volume_group_disk_info
793
 
+ *
794
 
+ *  Creates and validates the volume groups found on the disk structures.
795
 
+ *  
796
 
+ */
797
 
+static int validate_build_volume_group_disk_info(evms_logical_node_t   * logical_node,
798
 
+                                                AIXlvm_rec_t                * AIXlvm)
799
 
+{
800
 
+
801
 
+       aix_volume_group_t    * AIXVGLptr = AIXVolumeGroupList;
802
 
+
803
 
+       LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num);
804
 
+
805
 
+       while (AIXVGLptr) {
806
 
+               if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) {
807
 
+                       break;
808
 
+               }
809
 
+               AIXVGLptr = AIXVGLptr->next;  // There is more than one so walk the list 
810
 
+       }
811
 
+
812
 
+       if (!AIXVGLptr) {
813
 
+               LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
814
 
+               AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm); 
815
 
+               AIXVGLptr->next = AIXVolumeGroupList; 
816
 
+               AIXVolumeGroupList = AIXVGLptr; 
817
 
+       } else {
818
 
+               LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
819
 
+
820
 
+               if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) {
821
 
+                       LOG_DEBUG(" VBVGDI ERROR on Rediscover AIXVGLptr:%p  line:%d\n", AIXVGLptr,__LINE__);
822
 
+               }
823
 
+       }
824
 
+
825
 
+       if (!AIXVGLptr) {
826
 
+
827
 
+               LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
828
 
+               LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
829
 
+               LOG_CRITICAL("Unable to allocate volume group data struct Volume Group Corruption !!\n");
830
 
+               return -EINVAL;
831
 
+       } else {
832
 
+
833
 
+               LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n", AIXVolumeGroupList,__LINE__);
834
 
+               LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
835
 
+               LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
836
 
+
837
 
+               if ( add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num) ) {
838
 
+                       return -EINVAL;                                                             
839
 
+               }
840
 
+       }
841
 
+
842
 
+       return 0;
843
 
+}
844
 
+/*
845
 
+ * Function: add_VG_data_to_VG_list
846
 
+ *
847
 
+ *  Allocate space for a new LVM volume group and all of its sub-fields.
848
 
+ *  Initialize the appropriate fields.
849
 
+ */
850
 
+
851
 
+static int add_VG_data_to_VG_list ( evms_logical_node_t   * logical_node,
852
 
+                                   aix_volume_group_t    * new_group,
853
 
+                                   short int             pvNum)
854
 
+{
855
 
+       int pvh_pos;
856
 
+
857
 
+       pv_header *AIXpvh;
858
 
+
859
 
+       // The array of pointer to the logical volumes.
860
 
+       // Leave this allocation at the max permitted, the lv numbering may not be sequential so you may have gaps
861
 
+       // in the array allocation i.e. 1,2,3,4,5,6,7,8,11,15,21,33 etc. even though you only have 12 LVs.
862
 
+
863
 
+       LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n",pvNum, new_group->vgda_psn);
864
 
+
865
 
+       pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum);
866
 
+
867
 
+       if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
868
 
+               return ENOMEM;
869
 
+       }
870
 
+
871
 
+       LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos);
872
 
+
873
 
+       if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) {
874
 
+               return EIO;
875
 
+       }
876
 
+
877
 
+       LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum);
878
 
+
879
 
+       if (!new_group->volume_list) {
880
 
+               if ( evms_cs_allocate_memory((void**)&(new_group->volume_list), LVM_MAXLVS*sizeof(aix_logical_volume_t*)) ) {
881
 
+                       evms_cs_deallocate_memory(AIXpvh);
882
 
+                       return -ENOMEM;
883
 
+               }
884
 
+       }
885
 
+
886
 
+       new_group->vg_id.word1      = new_group->AIXvgh->vg_id.word1;
887
 
+       new_group->vg_id.word2      = new_group->AIXvgh->vg_id.word2;
888
 
+       new_group->vg_id.word3      = new_group->AIXvgh->vg_id.word3;
889
 
+       new_group->vg_id.word4      = new_group->AIXvgh->vg_id.word4;
890
 
+       new_group->numpvs           = new_group->AIXvgh->numpvs;
891
 
+       new_group->numlvs           = new_group->AIXvgh->numlvs;
892
 
+       new_group->lv_max           = new_group->AIXvgh->maxlvs;      
893
 
+       new_group->pe_size          = (GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) / AIX_SECTOR_SIZE);
894
 
+
895
 
+       new_group->block_size       = 0;
896
 
+       new_group->hard_sect_size   = 0;
897
 
+       new_group->flags           |= EVMS_VG_DIRTY;
898
 
+
899
 
+       evms_cs_deallocate_memory(AIXpvh);
900
 
+
901
 
+
902
 
+       LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2);
903
 
+
904
 
+
905
 
+       return 0;
906
 
+}
907
 
+
908
 
+
909
 
+/*
910
 
+ * Function: add_PV_to_volume_group
911
 
+ *
912
 
+ *  Create a new partition_list_entry for the specified volume group.
913
 
+ *  Initialize the new partition with the evms node and lvm pv information,
914
 
+ *  and add the new partition to the group's list.
915
 
+ */
916
 
+
917
 
+static int add_PV_to_volume_group( aix_volume_group_t  * group,
918
 
+                                  evms_logical_node_t * evms_partition,
919
 
+                                  int                   pvNum)
920
 
+{
921
 
+       partition_list_entry_t  * new_partition;
922
 
+
923
 
+       LOG_DEBUG(" APVVG Entering pvNum:%d\n",pvNum);
924
 
+
925
 
+       group->flags |= EVMS_VG_DIRTY;
926
 
+
927
 
+       for (new_partition = group->partition_list; new_partition != NULL; new_partition=new_partition->next) {
928
 
+               if (new_partition->logical_node == evms_partition) {
929
 
+                       return 0;
930
 
+               }
931
 
+       }
932
 
+
933
 
+       if ( evms_cs_allocate_memory((void**)&new_partition, sizeof(partition_list_entry_t)) ) {
934
 
+               return -ENOMEM;
935
 
+       }
936
 
+
937
 
+       // Add this partition to this group's list.
938
 
+       new_partition->logical_node            = evms_partition;
939
 
+       new_partition->pv_number               = pvNum;
940
 
+
941
 
+       group->hard_sect_size   = evms_partition->hardsector_size;
942
 
+       group->block_size       = evms_partition->block_size;
943
 
+
944
 
+       // Add this partition to the beginning of its group's list.
945
 
+       new_partition->next     = group->partition_list;
946
 
+       group->partition_list       = new_partition;
947
 
+       group->partition_count++;
948
 
+
949
 
+       LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n",group->partition_count, pvNum);
950
 
+
951
 
+       return 0;
952
 
+}
953
 
+/****************************************************
954
 
+*
955
 
+*
956
 
+*
957
 
+*****************************************************/
958
 
+static aix_volume_group_t * AIX_create_volume_group(evms_logical_node_t   * logical_node,
959
 
+                                                   AIXlvm_rec_t          * AIXlvm)
960
 
+{
961
 
+       vg_header             * AIXvgh, *AIXvgh2;
962
 
+       vg_trailer            * AIXvgt, *AIXvgt2;
963
 
+       aix_volume_group_t    * AIXVGLptr;
964
 
+
965
 
+
966
 
+
967
 
+       if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
968
 
+               return NULL;
969
 
+       }
970
 
+
971
 
+       if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
972
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
973
 
+               return NULL;
974
 
+       }
975
 
+
976
 
+       if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
977
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
978
 
+               return NULL;
979
 
+       }
980
 
+
981
 
+       if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
982
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
983
 
+               return NULL;
984
 
+       }
985
 
+
986
 
+       // First time thru we want to read this in, we may only have one PV in this group, all others 
987
 
+       // may be corrupt, etc. If the info is clean we shouldn't get here.
988
 
+
989
 
+       if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
990
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
991
 
+               return NULL;
992
 
+       }
993
 
+
994
 
+       if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
995
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
996
 
+               return NULL;
997
 
+       }
998
 
+
999
 
+       if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
1000
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1001
 
+               return NULL;
1002
 
+       }
1003
 
+
1004
 
+       if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
1005
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1006
 
+               return NULL;
1007
 
+       }
1008
 
+
1009
 
+       LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1010
 
+       LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1011
 
+       LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
1012
 
+       LOG_DEBUG("CVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
1013
 
+
1014
 
+
1015
 
+       LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",sizeof(aix_volume_group_t));
1016
 
+       if (evms_cs_allocate_memory((void**)&AIXVGLptr, sizeof(aix_volume_group_t))) {
1017
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1018
 
+               return NULL;       
1019
 
+
1020
 
+       }
1021
 
+
1022
 
+       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1023
 
+       AIXVGLptr->flags       |= EVMS_VG_DIRTY;
1024
 
+
1025
 
+       LOG_DEBUG("CVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
1026
 
+
1027
 
+       if (evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
1028
 
+               evms_cs_deallocate_memory(AIXVGLptr);
1029
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1030
 
+               return NULL;
1031
 
+
1032
 
+       }
1033
 
+
1034
 
+
1035
 
+       LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1036
 
+
1037
 
+       if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1038
 
+               if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
1039
 
+                       if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
1040
 
+                               if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
1041
 
+                                       // All timestamps match. Yea!
1042
 
+                                       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1043
 
+                               } else {
1044
 
+                                       // Both VGDAs are good, but timestamps are
1045
 
+                                       // different. Can't tell yet which one is
1046
 
+                                       // correct. 
1047
 
+                                       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
1048
 
+                               }
1049
 
+                       } else {
1050
 
+                               // First VGDA is good, second is bad.
1051
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
1052
 
+                       }
1053
 
+               } else {
1054
 
+                       if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
1055
 
+                               // First VGDA is bad, second is good.
1056
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
1057
 
+                       } else if (AIXvgh->numpvs == 1) {                      // We only have 1 PV in this group, mismatch or not this will have to do 
1058
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1059
 
+                       } else {
1060
 
+                               // This should never happen.
1061
 
+                               LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
1062
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1063
 
+
1064
 
+                       }
1065
 
+               }
1066
 
+
1067
 
+               LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1068
 
+
1069
 
+               switch (AIXVGLptr->CleanVGInfo) {
1070
 
+               case AIX_PV_STATE_VALID:
1071
 
+               case AIX_PV_STATE_FIRST_VGDA:
1072
 
+
1073
 
+                       LOG_DEBUG("CVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1074
 
+
1075
 
+                       AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
1076
 
+
1077
 
+                       AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1078
 
+                       AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1079
 
+                       break;
1080
 
+
1081
 
+               case AIX_PV_STATE_SECOND_VGDA:
1082
 
+                       LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1083
 
+
1084
 
+                       AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2);  // Get the info. we need
1085
 
+
1086
 
+                       AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1087
 
+                       AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1088
 
+                       break;
1089
 
+
1090
 
+               case AIX_PV_STATE_EITHER_VGDA:
1091
 
+                       LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1092
 
+                       if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
1093
 
+
1094
 
+                               AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
1095
 
+
1096
 
+                               AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1097
 
+                               AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1098
 
+                       } else {
1099
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1100
 
+                               // Not sure where this PV belongs. It thinks it is
1101
 
+                               // supposed to be in two different containers. We will
1102
 
+                               // probably need to put this on a separate, temporary
1103
 
+                               // list, and determine later which container is missing
1104
 
+                               // a PV.
1105
 
+                       }
1106
 
+                       break;
1107
 
+
1108
 
+               default:
1109
 
+                       LOG_ERROR("Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
1110
 
+                       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1111
 
+                       break;
1112
 
+               }
1113
 
+
1114
 
+       }
1115
 
+
1116
 
+       add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1117
 
+
1118
 
+       AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1119
 
+
1120
 
+       LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1121
 
+
1122
 
+       return AIXVGLptr;
1123
 
+}
1124
 
+/****************************************************
1125
 
+*
1126
 
+*
1127
 
+*
1128
 
+*****************************************************/
1129
 
+static int AIX_update_volume_group(aix_volume_group_t    * AIXVGLptr,
1130
 
+                                  evms_logical_node_t   * logical_node,
1131
 
+                                  AIXlvm_rec_t          * AIXlvm)
1132
 
+{
1133
 
+       vg_header             * AIXvgh, *AIXvgh2;
1134
 
+       vg_trailer            * AIXvgt, *AIXvgt2;
1135
 
+
1136
 
+
1137
 
+
1138
 
+       if (evms_cs_allocate_memory((void**)&AIXvgh, AIX_SECTOR_SIZE)) {
1139
 
+               return -ENOMEM;
1140
 
+       }
1141
 
+
1142
 
+       if (evms_cs_allocate_memory((void**)&AIXvgh2, AIX_SECTOR_SIZE)) {
1143
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1144
 
+               return -ENOMEM;
1145
 
+       }
1146
 
+
1147
 
+       if (evms_cs_allocate_memory((void**)&AIXvgt, AIX_SECTOR_SIZE)) {
1148
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1149
 
+               return -ENOMEM;
1150
 
+       }
1151
 
+
1152
 
+       if (evms_cs_allocate_memory((void**)&AIXvgt2, AIX_SECTOR_SIZE)) {
1153
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1154
 
+               return -ENOMEM;
1155
 
+       }
1156
 
+
1157
 
+       // First time thru we want to read this in, we may only have one PV in this group, all others 
1158
 
+       // may be corrupt, etc. If the info is clean we shouldn't get here.
1159
 
+
1160
 
+       if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1161
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1162
 
+               return -ENOMEM;
1163
 
+       }
1164
 
+
1165
 
+       if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1166
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1167
 
+               return -ENOMEM;
1168
 
+       }
1169
 
+
1170
 
+       if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1), 1, AIXvgt)) {
1171
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1172
 
+               return -ENOMEM;
1173
 
+       }
1174
 
+
1175
 
+       if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1), 1, AIXvgt2)) {
1176
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1177
 
+               return -ENOMEM;
1178
 
+       }
1179
 
+
1180
 
+       LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1181
 
+       LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1182
 
+       LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 0, (AIXlvm->vgda_psn[0]+AIXlvm->vgda_len-1));
1183
 
+       LOG_DEBUG("UVG AIXvgt psn[%d]:%ld\n", 1, (AIXlvm->vgda_psn[1]+AIXlvm->vgda_len-1));
1184
 
+
1185
 
+
1186
 
+       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1187
 
+       AIXVGLptr->flags       |= EVMS_VG_DIRTY;
1188
 
+
1189
 
+       LOG_DEBUG("UVG AIXVGLptr:%p line %d\n",AIXVGLptr, __LINE__);
1190
 
+
1191
 
+       if (evms_cs_allocate_memory((void**)&AIXVGLptr->AIXvgh, sizeof(vg_header))) {
1192
 
+               AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1193
 
+               return -ENOMEM;
1194
 
+
1195
 
+       }
1196
 
+
1197
 
+
1198
 
+       LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1199
 
+
1200
 
+       if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1201
 
+               if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgt->timestamp)) {
1202
 
+                       if (COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp,AIXvgt2->timestamp)) {
1203
 
+                               if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp,AIXvgh2->vg_timestamp)) {
1204
 
+                                       // All timestamps match. Yea!
1205
 
+                                       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1206
 
+                               } else {
1207
 
+                                       // Both VGDAs are good, but timestamps are
1208
 
+                                       // different. Can't tell yet which one is
1209
 
+                                       // correct. 
1210
 
+                                       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_EITHER_VGDA;
1211
 
+                               }
1212
 
+                       } else {
1213
 
+                               // First VGDA is good, second is bad.
1214
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_FIRST_VGDA;
1215
 
+                       }
1216
 
+               } else {
1217
 
+                       if ( COMPARE_TIMESTAMPS(AIXvgh2->vg_timestamp, AIXvgt2->timestamp) ) {
1218
 
+                               // First VGDA is bad, second is good.
1219
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_SECOND_VGDA;
1220
 
+                       } else if (AIXvgh->numpvs == 1) {                      // We only have 1 PV in this group, mismatch or not this will have to do 
1221
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1222
 
+                       } else {
1223
 
+                               // This should never happen.
1224
 
+                               LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", AIXVGLptr->vg_id.word2);
1225
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1226
 
+
1227
 
+                       }
1228
 
+               }
1229
 
+
1230
 
+               LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1231
 
+
1232
 
+               switch (AIXVGLptr->CleanVGInfo) {
1233
 
+               case AIX_PV_STATE_VALID:
1234
 
+               case AIX_PV_STATE_FIRST_VGDA:
1235
 
+
1236
 
+                       LOG_DEBUG("UVG SWITCH VALID %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1237
 
+
1238
 
+                       AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
1239
 
+
1240
 
+                       AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1241
 
+                       AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1242
 
+                       break;
1243
 
+
1244
 
+               case AIX_PV_STATE_SECOND_VGDA:
1245
 
+                       LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1246
 
+
1247
 
+                       AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2);  // Get the info. we need
1248
 
+
1249
 
+                       AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1250
 
+                       AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1251
 
+                       break;
1252
 
+
1253
 
+               case AIX_PV_STATE_EITHER_VGDA:
1254
 
+                       LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n",AIXVGLptr->CleanVGInfo,sizeof(vg_header));
1255
 
+                       if ( COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id) ) {
1256
 
+
1257
 
+                               AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh);  // Get the info. we need
1258
 
+
1259
 
+                               AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1260
 
+                               AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1261
 
+                       } else {
1262
 
+                               AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1263
 
+                               // Not sure where this PV belongs. It thinks it is
1264
 
+                               // supposed to be in two different containers. We will
1265
 
+                               // probably need to put this on a separate, temporary
1266
 
+                               // list, and determine later which container is missing
1267
 
+                               // a PV.
1268
 
+                       }
1269
 
+                       break;
1270
 
+
1271
 
+               default:
1272
 
+                       LOG_ERROR("UVG Invalid PV state (%d) for %d\n",AIXVGLptr->CleanVGInfo , AIXVGLptr->vg_id.word2);
1273
 
+                       AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1274
 
+                       break;
1275
 
+               }
1276
 
+
1277
 
+       }
1278
 
+
1279
 
+       add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1280
 
+
1281
 
+       AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1282
 
+
1283
 
+       LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1284
 
+
1285
 
+       return 0;
1286
 
+}
1287
 
+/****************************************************
1288
 
+* Function: check_volume_groups
1289
 
+*
1290
 
+* We just want to make sure the volume groups have found
1291
 
+* all their drives.
1292
 
+*
1293
 
+* If not, we'll continue and build what we can
1294
 
+*****************************************************/
1295
 
+static int check_volume_groups(void)
1296
 
+{
1297
 
+       aix_volume_group_t      * group;
1298
 
+       partition_list_entry_t  * partitions;
1299
 
+       int                     NumPVS = 0; 
1300
 
+
1301
 
+
1302
 
+       LOG_DEBUG("CHVG Checking volume groups:\n");
1303
 
+
1304
 
+       group = AIXVolumeGroupList;
1305
 
+
1306
 
+       while (group) {
1307
 
+               partitions = group->partition_list;
1308
 
+               while (partitions) {
1309
 
+                       NumPVS++;
1310
 
+                       partitions = partitions->next;
1311
 
+               }
1312
 
+
1313
 
+               if (NumPVS != group->numpvs) {
1314
 
+                       group->flags |= AIX_VG_INCOMPLETE;
1315
 
+                       LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n",group->flags);
1316
 
+                       LOG_ERROR("CHVG Found %d PVs should have %d PVs\n",NumPVS, group->numpvs);
1317
 
+               }
1318
 
+
1319
 
+               group = group->next;
1320
 
+               NumPVS = 0;
1321
 
+       }
1322
 
+
1323
 
+       LOG_DEBUG("CHVG Finished Checking volume groups:\n");
1324
 
+       return 0;
1325
 
+
1326
 
+}
1327
 
+
1328
 
+/************************************************************************
1329
 
+ * Function: discover_logical_volumes
1330
 
+ *
1331
 
+ *  After all PVs have been claimed and added to the appropriate VG list,
1332
 
+ *  the volumes for each VG must be constructed.
1333
 
+ *
1334
 
+ *
1335
 
+ */
1336
 
+static int discover_logical_volumes( void )
1337
 
+{
1338
 
+
1339
 
+       aix_volume_group_t        * AIXVGLPtr;
1340
 
+       aix_logical_volume_t      * new_LV;
1341
 
+       partition_list_entry_t    * partition;
1342
 
+       evms_logical_node_t       * node;
1343
 
+       lv_entries                * AIXlvent, *AIXlventHead;
1344
 
+       int                         j, lv_found, all_lvs_found, rc;
1345
 
+       namelist                  * AIXnamelist;
1346
 
+       char                      * NameBuffer;
1347
 
+
1348
 
+       AIXVGLPtr = AIXVolumeGroupList;
1349
 
+
1350
 
+       LOG_DEBUG("DLV Discover Logical volume AIXVGLPtr:%p\n",AIXVGLPtr);
1351
 
+
1352
 
+       if ( evms_cs_allocate_memory((void**)&AIXlventHead, MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE) ) {
1353
 
+               return -ENOMEM;
1354
 
+       }
1355
 
+
1356
 
+       if ( evms_cs_allocate_memory((void**)&NameBuffer, MAX_SECTORS_NAMELIST * EVMS_VSECTOR_SIZE) ) {
1357
 
+               evms_cs_deallocate_memory(AIXlventHead);
1358
 
+               return -ENOMEM;
1359
 
+       }
1360
 
+
1361
 
+       while (AIXVGLPtr) {
1362
 
+               partition = AIXVGLPtr->partition_list;
1363
 
+               node = partition->logical_node;
1364
 
+
1365
 
+
1366
 
+               LOG_DEBUG("DLV INIT_IO AIXNameList position:%ld\n",((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST));
1367
 
+
1368
 
+               if (INIT_IO(node, 0, ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST, NameBuffer)) {
1369
 
+                       continue;
1370
 
+               }
1371
 
+
1372
 
+               LOG_DEBUG("DLV INIT_IO AIXNameList\n");
1373
 
+
1374
 
+               if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC, MAX_SECTORS_LV_ENTRIES, AIXlventHead)) {
1375
 
+                       continue;
1376
 
+               }
1377
 
+               AIXlvent = AIXlventHead;
1378
 
+               AIXnamelist = (namelist *)NameBuffer;
1379
 
+
1380
 
+               LOG_DEBUG("DLV INIT_IO AIXlvent\n");
1381
 
+               // Search through the LV structs for valid LV entries
1382
 
+               // We're just going to search until all valid LVs are found
1383
 
+               // The max. allowable LVs is 256 and we want don't want to
1384
 
+               // search for 255 if only 8 are defined 1-8 however, there 
1385
 
+               // could be gaps in the LV numbering. i.e 1,2,3,4,5,6,7,8, 27,43, etc.
1386
 
+
1387
 
+               for ( j = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) {
1388
 
+
1389
 
+                       LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n",AIXlvent->num_lps, AIXnamelist->name[j], j, AIXlvent->lvname);
1390
 
+                       LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n", AIXlvent->striping_width, GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp), AIXlvent->lv_state);
1391
 
+                       LOG_DEBUG(" DVIG Group:%x.Access:%x\n",(unsigned int)AIXVGLPtr->vg_id.word2,AIXlvent->permissions);
1392
 
+                       LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n", AIXlvent->mirror, AIXlvent->mirror_policy, AIXlvent->mirwrt_consist);
1393
 
+
1394
 
+                       // This is the same check we used in "diskedit" and "readdisk"
1395
 
+                       if ( AIXlvent->lv_state    != 0 &&
1396
 
+                            AIXlvent->permissions <= 0x10 ) {
1397
 
+
1398
 
+
1399
 
+                               lv_found++;
1400
 
+                               if (lv_found == AIXVGLPtr->numlvs) {
1401
 
+                                       all_lvs_found = TRUE;
1402
 
+                               }
1403
 
+
1404
 
+                               LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n", lv_found, all_lvs_found);
1405
 
+
1406
 
+                               // Create a new logical volume and place it in the appropriate
1407
 
+                               // spot in this VG's volume list. For re-discovery, make sure
1408
 
+                               // this volume does not already exist.
1409
 
+                               if ( !AIXVGLPtr->volume_list[AIXlvent->lvname] ) {
1410
 
+                                       new_LV = new_logical_volume( AIXlvent, AIXVGLPtr, AIXnamelist->name[j],GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp));
1411
 
+                                       if (!new_LV) {
1412
 
+                                               continue;
1413
 
+                                       }
1414
 
+                                       LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n",new_LV->lv_number, AIXVGLPtr->vg_id.word2);
1415
 
+                                       AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV;
1416
 
+                               } else {
1417
 
+                                       LOG_DEBUG("DVIG Updating Vol Exists\n");
1418
 
+                               }
1419
 
+                       }
1420
 
+               }
1421
 
+
1422
 
+
1423
 
+               // Build the le_to_pe_map for each volume that was discovered above.
1424
 
+               // This has to be done after all volumes in the group are discovered
1425
 
+               if ( (rc = build_pe_maps(AIXVGLPtr)) ) {
1426
 
+                       continue;
1427
 
+               }
1428
 
+
1429
 
+               check_log_volume_and_pe_maps( AIXVGLPtr );
1430
 
+
1431
 
+               AIXVGLPtr = AIXVGLPtr->next;
1432
 
+       }
1433
 
+
1434
 
+       evms_cs_deallocate_memory(NameBuffer);
1435
 
+       evms_cs_deallocate_memory(AIXlventHead);
1436
 
+
1437
 
+       return 0;
1438
 
+}
1439
 
+/*
1440
 
+ * Function: new_logical_volume
1441
 
+ *
1442
 
+ *  Allocate space for a new LVM logical volume, including space for the
1443
 
+ *  PE map 
1444
 
+ */
1445
 
+static aix_logical_volume_t * new_logical_volume(lv_entries         *AIXlvent, 
1446
 
+                                                aix_volume_group_t *volume_group, 
1447
 
+                                                char               *lv_name,
1448
 
+                                                u_int32_t           stripesize)
1449
 
+{
1450
 
+       aix_logical_volume_t    * new_volume;
1451
 
+
1452
 
+
1453
 
+       LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n", AIXlvent->lvname, 
1454
 
+                 AIXlvent->num_lps,
1455
 
+                 AIXlvent->num_lps * volume_group->pe_size);
1456
 
+
1457
 
+       // Allocate space for the new logical volume.
1458
 
+       if ( evms_cs_allocate_memory((void**)&new_volume, sizeof(aix_logical_volume_t)) ) {
1459
 
+               return NULL;
1460
 
+       }
1461
 
+
1462
 
+       // Allocate space for the LE to PE mapping table
1463
 
+       // We add 1 for the allocated le to ease mapping later on, all AIX le are 1 based
1464
 
+       if ( evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1465
 
+               delete_logical_volume( new_volume );
1466
 
+               return NULL;
1467
 
+       }
1468
 
+
1469
 
+       if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) {
1470
 
+               if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir1), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1471
 
+                       delete_logical_volume( new_volume );
1472
 
+                       return NULL;
1473
 
+               }
1474
 
+       }
1475
 
+
1476
 
+       if (AIXlvent->mirror == AIX_MAX_MIRRORS) {
1477
 
+               if (evms_cs_allocate_memory((void**)&(new_volume->le_to_pe_map_mir2), (AIXlvent->num_lps+1)*sizeof(pe_table_entry_t)) ) {
1478
 
+                       delete_logical_volume( new_volume );
1479
 
+                       return NULL;
1480
 
+               }
1481
 
+       }
1482
 
+
1483
 
+
1484
 
+       // Initialize the rest of the new volume.
1485
 
+       new_volume->lv_number         = AIXlvent->lvname;
1486
 
+       new_volume->lv_size           = AIXlvent->num_lps * (volume_group->pe_size);
1487
 
+       new_volume->lv_access         = AIXlvent->permissions | EVMS_LV_NEW;     // All volumes start new.
1488
 
+       new_volume->lv_status         = AIXlvent->lv_state;
1489
 
+       //new_volume->lv_minor          = MINOR(1);
1490
 
+       new_volume->mirror_copies     = AIXlvent->mirror;
1491
 
+       new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING;
1492
 
+       new_volume->stripes           = AIXlvent->striping_width;
1493
 
+       new_volume->stripe_size       = stripesize;
1494
 
+       new_volume->stripe_size_shift = evms_cs_log2(stripesize);
1495
 
+       new_volume->pe_size           = volume_group->pe_size;
1496
 
+       new_volume->pe_size_shift     = evms_cs_log2(volume_group->pe_size);
1497
 
+       new_volume->num_le            = AIXlvent->num_lps;
1498
 
+       new_volume->new_volume        = TRUE;
1499
 
+       new_volume->group             = volume_group;
1500
 
+
1501
 
+       sprintf(new_volume->name, "aix/%s", lv_name);
1502
 
+
1503
 
+       if (!AIX_BH_list_pool && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1504
 
+               AIX_BH_list_pool = evms_cs_create_pool(sizeof(aix_mirror_bh_t), "EVMS_AIX_BH", aix_notify_cache_ctor, NULL);
1505
 
+               if (!AIX_BH_list_pool) {
1506
 
+                       return NULL;
1507
 
+               }
1508
 
+       }
1509
 
+
1510
 
+       LOG_DEBUG("NLV lv_number:%d name:%s lv_size %Ld \n", new_volume->lv_number, new_volume->name, new_volume->lv_size); 
1511
 
+       LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n", new_volume->stripe_size, new_volume->stripe_size_shift); 
1512
 
+
1513
 
+       return new_volume;             
1514
 
+}
1515
 
+/* 
1516
 
+ * Function: aix_notify_cache_ctor
1517
 
+ * this function initializes the b_wait field in the buffer heads
1518
 
+ * in our private buffer head pool.
1519
 
+ */
1520
 
+static void 
1521
 
+aix_notify_cache_ctor(
1522
 
+                    void * foo, 
1523
 
+                    kmem_cache_t * cachep, 
1524
 
+                    unsigned long flags)
1525
 
+{
1526
 
+       if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
1527
 
+           SLAB_CTOR_CONSTRUCTOR) {
1528
 
+               aix_mirror_bh_t *rbh = (aix_mirror_bh_t *)foo;
1529
 
+               memset(rbh, 0, sizeof(aix_mirror_bh_t));
1530
 
+               init_waitqueue_head(&rbh->bh_req.b_wait);
1531
 
+       }
1532
 
+}
1533
 
+
1534
 
+/*
1535
 
+ * Function: build_pe_maps
1536
 
+ *
1537
 
+ *  After all logical volumes have been discovered, the mappings from
1538
 
+ *  logical extents to physical extents must be constructed. Each PV
1539
 
+ *  contains a map on-disk of its PEs. Each PE map entry contains the
1540
 
+ *  logical volume number and the logical extent number on that volume.
1541
 
+ *  Our internal map is the reverse of this map for each volume, listing
1542
 
+ *  the PV node and sector offset for every logical extent on the volume.
1543
 
+ */
1544
 
+static int build_pe_maps( aix_volume_group_t * volume_group)
1545
 
+{
1546
 
+       partition_list_entry_t  * partition;
1547
 
+       partition_list_entry_t  * mirror_partition;
1548
 
+       pp_entries              * AIXppent, *AIXppent_buff;
1549
 
+       pv_header               * AIXpvh;
1550
 
+       u_int64_t               offset;
1551
 
+       u_int32_t               le_number;
1552
 
+       u_int32_t               j, pp_count,pvh_pos;
1553
 
+       u_int32_t               MirrorFound;
1554
 
+#ifdef EVMS_DEBUG_MIRRORS
1555
 
+       u_int32_t               lv_found, all_lvs_found;
1556
 
+       u_int32_t               mirs = 0;
1557
 
+#endif
1558
 
+
1559
 
+       LOG_DEBUG(" *** BPEM ***\n");
1560
 
+       // For every partition in this VG
1561
 
+
1562
 
+       if (evms_cs_allocate_memory((void**)&AIXppent_buff, (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET))) {
1563
 
+               return -ENOMEM;
1564
 
+       }
1565
 
+
1566
 
+       if (evms_cs_allocate_memory((void**)&AIXpvh, AIX_SECTOR_SIZE)) {
1567
 
+               evms_cs_deallocate_memory(AIXppent_buff);
1568
 
+               return -ENOMEM;
1569
 
+       }
1570
 
+
1571
 
+       LOG_DEBUG(" BPEM AIXppent_buff:%d \n", (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET));
1572
 
+
1573
 
+       for ( partition = volume_group->partition_list; partition; partition = partition->next ) {
1574
 
+
1575
 
+               LOG_DEBUG(" BPEM partition:%p next:%p\n", partition, partition->next);
1576
 
+
1577
 
+               pvh_pos = AIX_PVH_DATA_PSN(volume_group->vgda_psn, partition->pv_number);
1578
 
+
1579
 
+               LOG_DEBUG(" BPEM pvh_pos:%d\n", pvh_pos);
1580
 
+
1581
 
+               if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) {
1582
 
+                       evms_cs_deallocate_memory(AIXppent_buff);
1583
 
+                       evms_cs_deallocate_memory(AIXpvh);
1584
 
+                       return EIO;
1585
 
+               }
1586
 
+
1587
 
+               // For every entry in the PE map, calculate the PE's sector offset
1588
 
+               // and update the correct LV's PE map. LV number of 0 marks an unused PE.
1589
 
+               // For re-discovery, only compute entries for new volumes.
1590
 
+
1591
 
+               if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH, AIXppent_buff)) {
1592
 
+                       evms_cs_deallocate_memory(AIXppent_buff);
1593
 
+                       evms_cs_deallocate_memory(AIXpvh);
1594
 
+                       return -EIO;
1595
 
+               }
1596
 
+
1597
 
+               AIXppent = AIXppent_buff;
1598
 
+               AIXppent++;
1599
 
+
1600
 
+               pp_count = AIXpvh->pp_count;
1601
 
+
1602
 
+               LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n",
1603
 
+                         volume_group->vg_id.word2,
1604
 
+                         AIXpvh->pv_num, 
1605
 
+                         partition, 
1606
 
+                         partition->next,
1607
 
+                         AIXppent->lv_index,
1608
 
+                         pp_count);
1609
 
+
1610
 
+               for (j = 0; j < pp_count; j++) {
1611
 
+                       if (AIXppent->lv_index && AIXppent->pp_state ) {
1612
 
+
1613
 
+                               LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%ld cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n",
1614
 
+                                         volume_group->vg_id.word2, j+1, AIXppent->pp_state, volume_group->volume_list[AIXppent->lv_index-1]->name,
1615
 
+                                         AIXppent->lv_index,
1616
 
+                                         AIXppent->lp_num, AIXppent->copy,
1617
 
+                                         AIXppent->fst_alt_vol, AIXppent->fst_alt_part,
1618
 
+                                         AIXppent->snd_alt_vol, AIXppent->snd_alt_part);
1619
 
+
1620
 
+                               le_number = AIXppent->lp_num -1; // AIX lp's start @ 1, we want a 0 index
1621
 
+                               offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1);
1622
 
+
1623
 
+                               LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n",
1624
 
+                                         le_number,
1625
 
+                                         partition, 
1626
 
+                                         AIXppent->lv_index, 
1627
 
+                                         volume_group->volume_list[AIXppent->lv_index-1]->name);
1628
 
+
1629
 
+                               if (volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map && 
1630
 
+                                   le_number <= volume_group->volume_list[AIXppent->lv_index-1]->num_le) {
1631
 
+                                       volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].owning_pv = partition;
1632
 
+                                       volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map[le_number].pe_sector_offset = offset;
1633
 
+                               }
1634
 
+
1635
 
+
1636
 
+                               if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies > AIX_DEFAULT_MIRRORING) {
1637
 
+
1638
 
+                                       LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n", AIXppent->lv_index);
1639
 
+
1640
 
+                                       for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
1641
 
+
1642
 
+                                               if (mirror_partition->pv_number == AIXppent->fst_alt_vol) {
1643
 
+
1644
 
+                                                       offset = (((AIXppent->fst_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
1645
 
+
1646
 
+
1647
 
+                                                       volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].owning_pv  = mirror_partition;
1648
 
+                                                       volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset;
1649
 
+
1650
 
+                                                       LOG_EXTRA(" PE Map: mirror_partition:%p \n", mirror_partition);
1651
 
+                                                       LOG_EXTRA(" PE Map: mirror_sector_offet:%d\n", AIXppent->fst_alt_part);
1652
 
+
1653
 
+                                                       MirrorFound = TRUE;
1654
 
+                                               }
1655
 
+                                       }
1656
 
+
1657
 
+                                       if (volume_group->volume_list[AIXppent->lv_index-1]->mirror_copies == AIX_MAX_MIRRORS) {
1658
 
+
1659
 
+                                               for ( mirror_partition = volume_group->partition_list, MirrorFound = FALSE; mirror_partition && !MirrorFound; mirror_partition = mirror_partition->next ) {
1660
 
+
1661
 
+                                                       if (mirror_partition->pv_number == AIXppent->snd_alt_vol) {
1662
 
+
1663
 
+                                                               offset = (((AIXppent->snd_alt_part -1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
1664
 
+
1665
 
+                                                               volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv  = mirror_partition;
1666
 
+                                                               volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset;
1667
 
+
1668
 
+                                                               LOG_EXTRA(" PE Map: mirror_partition2:%p \n", mirror_partition);
1669
 
+                                                               LOG_EXTRA(" PE Map: mirror_sector_offet2:%d\n", AIXppent->snd_alt_part);
1670
 
+
1671
 
+                                                               MirrorFound = TRUE;
1672
 
+                                                       }
1673
 
+                                               }
1674
 
+                                       }
1675
 
+
1676
 
+
1677
 
+                               } // End of if mirroring is enabled 
1678
 
+
1679
 
+                       }
1680
 
+
1681
 
+                       AIXppent++;
1682
 
+
1683
 
+               } 
1684
 
+       }
1685
 
+
1686
 
+//     LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs);
1687
 
+
1688
 
+#ifdef EVMS_DEBUG_MIRRORS
1689
 
+       for (mirs = 0, lv_found = 0, all_lvs_found = FALSE; !all_lvs_found && mirs < LVM_MAXLVS; mirs++) {
1690
 
+
1691
 
+               if (volume_group->volume_list[mirs] != NULL) {
1692
 
+                       if (volume_group->volume_list[mirs]->lv_status == LV_ACTIVE) {
1693
 
+
1694
 
+                               lv_found++;
1695
 
+
1696
 
+                               LOG_DEBUG(" PE Map: owning part lv %d -- %p\n", mirs, volume_group->volume_list[mirs]->le_to_pe_map[0].owning_pv);
1697
 
+                               if (volume_group->volume_list[mirs]->mirror_copies > AIX_DEFAULT_MIRRORING) {
1698
 
+                                       LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir1[0].owning_pv);
1699
 
+                               }
1700
 
+                               if (volume_group->volume_list[mirs]->mirror_copies == AIX_MAX_MIRRORS) {
1701
 
+                                       LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", mirs, volume_group->volume_list[mirs]->le_to_pe_map_mir2[0].owning_pv);
1702
 
+                               }
1703
 
+                       }
1704
 
+                       if (lv_found == volume_group->numlvs) {
1705
 
+                               all_lvs_found = TRUE;
1706
 
+                               LOG_DEBUG(" PE Map: all_lvs_found\n" );
1707
 
+                       }
1708
 
+               }
1709
 
+       }
1710
 
+#endif
1711
 
+
1712
 
+       evms_cs_deallocate_memory(AIXpvh);
1713
 
+       evms_cs_deallocate_memory(AIXppent_buff);
1714
 
+
1715
 
+       return 0;
1716
 
+}
1717
 
+/*
1718
 
+ * Function: check_log_volume_and_pe_maps
1719
 
+ *
1720
 
+ *  Make sure all volumes in this group have valid LE-to-PE maps.
1721
 
+ *  Any volume that doesn't is deleted. This is safe for re-discovery
1722
 
+ *  because only new volumes could have corrupted PE maps.
1723
 
+ */
1724
 
+static int check_log_volume_and_pe_maps( aix_volume_group_t * group )
1725
 
+{
1726
 
+       aix_logical_volume_t * volume;
1727
 
+       int i, j, lv_found, all_lvs_found;
1728
 
+
1729
 
+       LOG_DEBUG(" check_pe_map.\n");
1730
 
+
1731
 
+       for ( i = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && i < LVM_MAXLVS; i++ ) {
1732
 
+               if ( ! group->volume_list[i] ) {
1733
 
+                       LOG_DEBUG(" CPEM No Volume %d found \n",i);
1734
 
+                       continue;
1735
 
+               }
1736
 
+
1737
 
+               volume = group->volume_list[i];
1738
 
+               if ( ! volume->le_to_pe_map ) {
1739
 
+                       LOG_DEBUG(" CPEM Volume %s has no PE map.\n",volume->name);
1740
 
+                       delete_logical_volume(volume);
1741
 
+                       continue;
1742
 
+               }
1743
 
+
1744
 
+               LOG_DEBUG(" CPEM volume %s num_le: %d \n",volume->name, volume->num_le);
1745
 
+
1746
 
+               lv_found++;
1747
 
+
1748
 
+               if (lv_found == group->numlvs) {
1749
 
+                       all_lvs_found = TRUE;
1750
 
+               }
1751
 
+
1752
 
+
1753
 
+
1754
 
+               for ( j = 0; j < volume->num_le; j++) {
1755
 
+                       if ( ! volume->le_to_pe_map[j].owning_pv ||
1756
 
+                            ! volume->le_to_pe_map[j].pe_sector_offset ) {
1757
 
+                               LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n",volume->name, j);
1758
 
+                               volume->lv_access |= EVMS_LV_INCOMPLETE;
1759
 
+                       }
1760
 
+
1761
 
+                       if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1762
 
+                               if ( ! volume->le_to_pe_map_mir1[j].owning_pv ||
1763
 
+                                    ! volume->le_to_pe_map_mir1[j].pe_sector_offset ) {
1764
 
+                                       LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n",volume->name, j);
1765
 
+                                       volume->lv_access |= EVMS_LV_INCOMPLETE;
1766
 
+                               }
1767
 
+
1768
 
+                               if (volume->mirror_copies == AIX_MAX_MIRRORS) {
1769
 
+                                       if ( ! volume->le_to_pe_map_mir2[j].owning_pv ||
1770
 
+                                            ! volume->le_to_pe_map_mir2[j].pe_sector_offset ) {
1771
 
+                                               LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n",volume->name, j);
1772
 
+                                               volume->lv_access |= EVMS_LV_INCOMPLETE;
1773
 
+                                       }
1774
 
+                               }
1775
 
+                       }
1776
 
+               }
1777
 
+       }
1778
 
+
1779
 
+       LOG_EXTRA(" Leaving check_pe_map.\n");
1780
 
+       return 0;
1781
 
+}
1782
 
+/*
1783
 
+ * Function: export_volumes
1784
 
+ *
1785
 
+ *  The last thing this VGE must do is take each constructed volume and
1786
 
+ *  place it back on the evms logical partition list.
1787
 
+ */
1788
 
+static int export_volumes( evms_logical_node_t ** evms_partition_list )
1789
 
+{
1790
 
+       aix_volume_group_t        * AIXVGLPtr;
1791
 
+       evms_logical_node_t * new_node;
1792
 
+       aix_logical_volume_t    * volume;
1793
 
+       int  j, lv_found, all_lvs_found;
1794
 
+       int count = 0;
1795
 
+
1796
 
+       AIXVGLPtr = AIXVolumeGroupList;
1797
 
+
1798
 
+       while (AIXVGLPtr) {
1799
 
+
1800
 
+               if (AIXVGLPtr->flags & EVMS_VG_DIRTY) {
1801
 
+
1802
 
+                       LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n",AIXVGLPtr->numpvs,AIXVGLPtr->numlvs);
1803
 
+
1804
 
+                       // Export every valid volume in the group. For re-discovery,
1805
 
+                       // make sure we are only exporting "new" volumes.
1806
 
+
1807
 
+                       for ( j = 0, all_lvs_found = FALSE, lv_found = 0; !all_lvs_found && j < LVM_MAXLVS ; j++ ) {
1808
 
+                               if (AIXVGLPtr->volume_list[j] != NULL ) {
1809
 
+                                       if (AIXVGLPtr->volume_list[j]->new_volume == TRUE) {
1810
 
+
1811
 
+                                               LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n",j, AIXVGLPtr->volume_list[j]);
1812
 
+                                               volume = AIXVGLPtr->volume_list[j];
1813
 
+                                               lv_found++;
1814
 
+
1815
 
+                                               if (lv_found == AIXVGLPtr->numlvs) {
1816
 
+                                                       all_lvs_found = TRUE;
1817
 
+                                               }
1818
 
+
1819
 
+                                               // For new volumes, create a new EVMS node and 
1820
 
+                                               // initialize the appropriate fields.
1821
 
+                                               if ( volume->lv_access & EVMS_LV_NEW ) {
1822
 
+                                                       if ( evms_cs_allocate_logical_node( &new_node ) ) {
1823
 
+                                                               LOG_DEBUG(" Export Vol Error allocating node !!\n");
1824
 
+                                                               continue;
1825
 
+                                                       } else {
1826
 
+                                                               LOG_DEBUG(" EV Node allocated OK\n");
1827
 
+                                                       }
1828
 
+
1829
 
+                                                       volume->new_volume          = 0;
1830
 
+                                                       volume->volume_node         = new_node;
1831
 
+                                                       volume->lv_access          &= (~EVMS_LV_NEW);
1832
 
+                                                       new_node->hardsector_size   = AIXVGLPtr->hard_sect_size;
1833
 
+                                                       new_node->block_size        = AIXVGLPtr->block_size;
1834
 
+                                                       new_node->plugin            = &plugin_header;
1835
 
+                                                       new_node->instance_data     = volume;
1836
 
+                                                       new_node->total_vsectors     = volume->lv_size;
1837
 
+
1838
 
+
1839
 
+                                                       LOG_DEBUG(" EV volume->name:[%s]\n",volume->name);
1840
 
+
1841
 
+                                                       strncpy(new_node->name, volume->name, EVMS_VOLUME_NAME_SIZE+1);
1842
 
+
1843
 
+
1844
 
+                                                       // Is the volume read-only?
1845
 
+                                                       if ( !(volume->lv_access & AIX_LV_WRITE) ||
1846
 
+                                                            volume->lv_access & EVMS_LV_INCOMPLETE ) {
1847
 
+                                                               new_node->flags |= EVMS_VOLUME_SET_READ_ONLY;
1848
 
+                                                               LOG_DEBUG(" EV Read Only volume->lv_access:%d\n",volume->lv_access);
1849
 
+                                                       }
1850
 
+                                               } else {
1851
 
+                                                       LOG_DEBUG(" EV Node [%s] allocated previously\n",volume->name);
1852
 
+                                               }
1853
 
+
1854
 
+                                               evms_cs_add_logical_node_to_list( evms_partition_list, new_node );
1855
 
+                                               count++;
1856
 
+
1857
 
+                                               LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n", volume, new_node,new_node->name);
1858
 
+                                       } else {
1859
 
+                                               evms_cs_add_logical_node_to_list( evms_partition_list, AIXVGLPtr->volume_list[j]->volume_node);
1860
 
+                                               count++;
1861
 
+                                               LOG_DEBUG(" ELV vol_list[%d]%p\n",j, AIXVGLPtr->volume_list[j]);
1862
 
+                                       }
1863
 
+                               } else {
1864
 
+                                       LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j);
1865
 
+                               }
1866
 
+                       } // end checking all lvs
1867
 
+
1868
 
+               } else {
1869
 
+                       LOG_DEBUG(" ELV Existing volume -- %d\n",AIXVGLPtr->vg_id.word2);
1870
 
+               }
1871
 
+
1872
 
+               AIXVGLPtr->flags &= ~EVMS_VG_DIRTY;
1873
 
+               AIXVGLPtr = AIXVGLPtr->next;
1874
 
+       }
1875
 
+
1876
 
+       return count;
1877
 
+
1878
 
+}
1879
 
+
1880
 
+/*
1881
 
+ * Function: delete_logical_volume
1882
 
+ *
1883
 
+ *  This function deletes the in-memory representation of a single LVM
1884
 
+ *  logical volume, including its PE map and any snapshot data. It does
1885
 
+ *  not alter the parent volume group, except to remove this volume from
1886
 
+ *  its volume list.
1887
 
+ */
1888
 
+static int delete_logical_volume( aix_logical_volume_t * volume )
1889
 
+{
1890
 
+       aix_volume_group_t      * group = volume->group;
1891
 
+
1892
 
+       LOG_DEBUG(" Deleting volume %s\n",volume->name);
1893
 
+
1894
 
+       // Now free up all the memory. This includes the LE-to-PE map, any
1895
 
+       // mirror PEs, etc.
1896
 
+       if ( volume->le_to_pe_map ) {
1897
 
+               evms_cs_deallocate_memory( volume->le_to_pe_map );
1898
 
+               volume->le_to_pe_map = NULL;
1899
 
+       }
1900
 
+
1901
 
+       if ( volume->le_to_pe_map_mir1 ) {
1902
 
+               evms_cs_deallocate_memory( volume->le_to_pe_map_mir1 );
1903
 
+               volume->le_to_pe_map_mir1 = NULL;
1904
 
+       }
1905
 
+
1906
 
+       if ( volume->le_to_pe_map_mir2 ) {
1907
 
+               evms_cs_deallocate_memory( volume->le_to_pe_map_mir2 );
1908
 
+               volume->le_to_pe_map_mir2 = NULL;
1909
 
+       }
1910
 
+
1911
 
+       // Remove this volume from the volume-group's list.
1912
 
+       if ( group && group->volume_list[volume->lv_number] == volume ) {
1913
 
+               group->volume_list[volume->lv_number] = NULL;
1914
 
+               group->numlvs--;
1915
 
+       }
1916
 
+
1917
 
+       evms_cs_deallocate_memory(volume);
1918
 
+
1919
 
+       return 0;
1920
 
+}
1921
 
+
1922
 
+
1923
 
+/* Function: remove_group_from_list
1924
 
+ *
1925
 
+ *     Remove an LVM volume group from the global LVM list.
1926
 
+ */
1927
 
+static int remove_group_from_list( aix_volume_group_t * group )
1928
 
+{
1929
 
+       aix_volume_group_t ** p_group;
1930
 
+
1931
 
+       for ( p_group = &AIXVolumeGroupList; *p_group; p_group = &(*p_group)->next ) {
1932
 
+               if ( *p_group == group ) {
1933
 
+                       *p_group = (*p_group)->next;
1934
 
+                       group->next = NULL;
1935
 
+                       break;
1936
 
+               }
1937
 
+       }
1938
 
+       return 0;
1939
 
+}
1940
 
+
1941
 
+
1942
 
+/*
1943
 
+ * Function: delete_aix_node
1944
 
+ *
1945
 
+ *  This function deletes the in-memory representation of an LVM
1946
 
+ *  logical volume. Right now it makes a lot of assumptions about
1947
 
+ *  the data in the group not being corrupted. It would be possible
1948
 
+ *  to put in a lot of consistency checks before deleting everything
1949
 
+ *  to indicate if problems have occurred during the lifetime of the
1950
 
+ *  volume and its volume group.
1951
 
+ */
1952
 
+static int delete_aix_node( evms_logical_node_t * logical_node )
1953
 
+{
1954
 
+       aix_logical_volume_t    * volume = (aix_logical_volume_t*)(logical_node->instance_data);
1955
 
+       aix_volume_group_t      * group = volume->group;
1956
 
+
1957
 
+       if ( delete_logical_volume(volume) ) {
1958
 
+               return -EINVAL;
1959
 
+       }
1960
 
+
1961
 
+       // If we just removed the last volume from this group, the entire group
1962
 
+       // can also be deleted.
1963
 
+       if ( group && group->numlvs == 0) {
1964
 
+               remove_group_from_list(group);
1965
 
+               deallocate_volume_group(group);
1966
 
+       }
1967
 
+
1968
 
+       // Free the logical node.
1969
 
+       evms_cs_deallocate_logical_node(logical_node);
1970
 
+
1971
 
+       return 0;
1972
 
+}
1973
 
+
1974
 
+/* Function: deallocate_volume_group
1975
 
+ *
1976
 
+ *  This function deletes the entire in-memory representation of an LVM
1977
 
+ *  volume group, including all partitions and logical volumes. If this
1978
 
+ *  group is on the VGE's volume group list, it is removed.
1979
 
+ */
1980
 
+static int deallocate_volume_group( aix_volume_group_t * group )
1981
 
+{
1982
 
+       partition_list_entry_t  * partition;
1983
 
+       partition_list_entry_t  * next_part;
1984
 
+       int                     i;
1985
 
+
1986
 
+       LOG_DEBUG(" Deleting volume group %x\n",group->vg_id.word2);
1987
 
+
1988
 
+
1989
 
+       // Delete all partitions from the group's list.
1990
 
+       for ( partition = group->partition_list; partition; partition = next_part ) {
1991
 
+
1992
 
+               next_part = partition->next;
1993
 
+
1994
 
+               if ( partition->logical_node ) {
1995
 
+                       // Send a delete command down to the partition manager.
1996
 
+                       LOG_DEBUG(" Deleting PV %d from group %x\n",partition->pv_number,group->vg_id.word2);
1997
 
+                       DELETE(partition->logical_node);
1998
 
+               }
1999
 
+               evms_cs_deallocate_memory(partition);
2000
 
+
2001
 
+       }
2002
 
+
2003
 
+       // Delete all logical volumes, and the array of pointers.
2004
 
+       for ( i = 0; i < LVM_MAXLVS; i++ ) {
2005
 
+               if ( group->volume_list[i] ) {
2006
 
+                       delete_logical_volume(group->volume_list[i]);
2007
 
+               }
2008
 
+       }
2009
 
+
2010
 
+       evms_cs_deallocate_memory(group);
2011
 
+
2012
 
+       return 0;
2013
 
+}
2014
 
+/* Function: end_discover_aix
2015
 
+ *
2016
 
+ *     The discovery process at the region-manager level is now iterative,
2017
 
+ *     much like the EVMS feature level. To accomplish this correctly, and
2018
 
+ *     also to accomplish partial volume discovery, a second discover
2019
 
+ *     entry point is needed, so EVMS can tell the region managers that
2020
 
+ *     discovery is over, and to finish up any discovery that is not yet
2021
 
+ *     complete. When this function is called, it should be assumed that
2022
 
+ *     the node list has had nothing new added to it since the last call
2023
 
+ *     of the regular discover function. Therefore, when this function is
2024
 
+ *     called, we do not need to try to discovery any additional volume
2025
 
+ *     groups. We will, however, look for logical volumes once more. This
2026
 
+ *     gives us the ability to export (read-only) volumes that have
2027
 
+ *     partially corrupted LE maps due to missing PVs in their VG.
2028
 
+ */
2029
 
+static int end_discover_aix(evms_logical_node_t ** evms_logical_disk_head)
2030
 
+{
2031
 
+
2032
 
+       int rc;
2033
 
+
2034
 
+       LOG_DEBUG("Final Discovery:\n");
2035
 
+
2036
 
+
2037
 
+       if ( (rc = discover_logical_volumes()) ) {
2038
 
+               return rc;
2039
 
+       }
2040
 
+
2041
 
+       rc = export_volumes(evms_logical_disk_head);
2042
 
+
2043
 
+       lvm_cleanup();
2044
 
+
2045
 
+       return rc;
2046
 
+}
2047
 
+/****************************************************
2048
 
+* Function: AIX_alloc_wbh
2049
 
+*
2050
 
+* Alloc any buffer heads from the pool and return a linked list
2051
 
+*
2052
 
+*
2053
 
+*****************************************************/
2054
 
+static aix_mirror_bh_t * AIX_alloc_wbh(evms_logical_node_t   * node,
2055
 
+                                      evms_logical_node_t   * node2,
2056
 
+                                      evms_logical_node_t   * node3,
2057
 
+                                      eio_t                 * eio,
2058
 
+                                      uint32_t                mirror_copies,
2059
 
+                                      evms_sector_t           new_sector2,
2060
 
+                                      evms_sector_t           new_sector3)
2061
 
+
2062
 
+{
2063
 
+       aix_mirror_bh_t  * tmp_bh = NULL, *head_bh = NULL;
2064
 
+       int i;
2065
 
+
2066
 
+       head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2067
 
+
2068
 
+       if (!head_bh) {
2069
 
+               LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2070
 
+               return NULL;
2071
 
+       }
2072
 
+
2073
 
+       head_bh->master_bh = eio->bh;
2074
 
+    head_bh->mirror_bh_list = NULL;
2075
 
+       atomic_set(&head_bh->remaining, 0);
2076
 
+
2077
 
+       for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) {
2078
 
+
2079
 
+               tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2080
 
+               if (!tmp_bh) {
2081
 
+                       LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2082
 
+                       return NULL;
2083
 
+               }
2084
 
+
2085
 
+               tmp_bh->next_r1 = head_bh->mirror_bh_list;
2086
 
+               head_bh->mirror_bh_list = tmp_bh;
2087
 
+               atomic_inc(&head_bh->remaining);
2088
 
+
2089
 
+               memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
2090
 
+               init_waitqueue_head(&tmp_bh->bh_req.b_wait);
2091
 
+//             tmp_bh->master_bh       = eio->bh;
2092
 
+//             tmp_bh->iteration       = AIX_DEFAULT_MIRRORING + i;
2093
 
+               tmp_bh->eio.rsize       = eio->rsize;
2094
 
+               tmp_bh->eio.bh          = &tmp_bh->bh_req;
2095
 
+
2096
 
+               switch (i) {
2097
 
+               
2098
 
+               case AIX_DEFAULT_MIRRORING:
2099
 
+                       tmp_bh->node            = node;
2100
 
+                       tmp_bh->eio.rsector     = eio->rsector;
2101
 
+                       break;
2102
 
+
2103
 
+               case AIX_FIRST_MIRROR:
2104
 
+                       tmp_bh->node            = node2;
2105
 
+                       tmp_bh->eio.rsector     = new_sector2;
2106
 
+                       break;
2107
 
+
2108
 
+               case AIX_MAX_MIRRORS:
2109
 
+                       tmp_bh->node            = node3;
2110
 
+                       tmp_bh->eio.rsector     = new_sector3;
2111
 
+                       break;
2112
 
+               }
2113
 
+
2114
 
+               tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives;  //setup callback routine 
2115
 
+               tmp_bh->bh_req.b_private = (void*)head_bh;
2116
 
+
2117
 
+       }
2118
 
+
2119
 
+       return head_bh;
2120
 
+
2121
 
+}
2122
 
+/****************************************************
2123
 
+* Function: AIX_handle_write_mirror_drives
2124
 
+*
2125
 
+* Handles a write from a set of mirrored AIX LVs
2126
 
+
2127
 
+*
2128
 
+*
2129
 
+*****************************************************/
2130
 
+static void AIX_handle_write_mirror_drives(struct buffer_head      * bh,
2131
 
+                                          int                      uptodate)
2132
 
+{
2133
 
+       aix_logical_volume_t * volume;                                
2134
 
+       evms_logical_node_t   * node;
2135
 
+       aix_mirror_bh_t  * tmp_bh = NULL, * tmp_bh2 = NULL;
2136
 
+       kdev_t          tmp_b_dev = bh->b_dev; 
2137
 
+       u_int32_t       count;
2138
 
+
2139
 
+       tmp_bh = (aix_mirror_bh_t *)bh->b_private;
2140
 
+       node   = tmp_bh->node;
2141
 
+       volume = (aix_logical_volume_t *) node->instance_data; 
2142
 
+
2143
 
+       LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
2144
 
+
2145
 
+       if (!uptodate) {
2146
 
+
2147
 
+               AIX_evms_cs_notify_lv_io_error(node);
2148
 
+       }
2149
 
+
2150
 
+       if (atomic_dec_and_test(&tmp_bh->remaining)) {
2151
 
+               tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2152
 
+               tmp_bh2 = tmp_bh->mirror_bh_list;
2153
 
+               evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2154
 
+
2155
 
+               while (tmp_bh2) {
2156
 
+                       tmp_bh = tmp_bh2->next_r1;
2157
 
+                       evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2);
2158
 
+                       tmp_bh2 = tmp_bh;
2159
 
+               }
2160
 
+
2161
 
+               evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
2162
 
+       }
2163
 
+
2164
 
+       return;
2165
 
+}
2166
 
+
2167
 
+/****************************************************
2168
 
+* Function: AIX_alloc_rbh
2169
 
+*
2170
 
+* Alloc any buffer heads from the pool and return a linked list
2171
 
+*
2172
 
+*
2173
 
+*****************************************************/
2174
 
+static aix_mirror_bh_t * AIX_alloc_rbh(evms_logical_node_t   * node,
2175
 
+                                      eio_t                 * eio,
2176
 
+                                      uint32_t                mirror_copies,
2177
 
+                                      evms_sector_t           org_sector,
2178
 
+                                      int                     cmd)
2179
 
+{
2180
 
+       aix_mirror_bh_t  * tmp_bh = NULL;
2181
 
+
2182
 
+       tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2183
 
+
2184
 
+       if (!tmp_bh) {
2185
 
+               LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",__LINE__);
2186
 
+               return NULL;
2187
 
+       }
2188
 
+
2189
 
+       memcpy(&tmp_bh->bh_req, eio->bh, sizeof(struct buffer_head));
2190
 
+       tmp_bh->node            = node;
2191
 
+       tmp_bh->master_bh       = eio->bh;
2192
 
+       tmp_bh->iteration       = AIX_FIRST_MIRROR;
2193
 
+       tmp_bh->eio.rsector     = eio->rsector;
2194
 
+       tmp_bh->eio.rsize       = eio->rsize;
2195
 
+       tmp_bh->eio.bh          = &tmp_bh->bh_req;
2196
 
+
2197
 
+
2198
 
+       tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives;  //setup callback routine 
2199
 
+       tmp_bh->bh_req.b_private = (void*)tmp_bh;
2200
 
+
2201
 
+       tmp_bh->cmd       = cmd;
2202
 
+       tmp_bh->next_r1   = NULL;
2203
 
+       tmp_bh->node      = node;
2204
 
+
2205
 
+       return tmp_bh;
2206
 
+
2207
 
+}
2208
 
+
2209
 
+static void AIX_reschedule_retry (aix_mirror_bh_t *aix_bh)
2210
 
+{
2211
 
+       unsigned long flags;
2212
 
+
2213
 
+       spin_lock_irqsave(&AIX_retry_list_lock, flags);
2214
 
+       if (AIX_retry_list == NULL)
2215
 
+               AIX_retry_tail = &AIX_retry_list;
2216
 
+       *AIX_retry_tail = aix_bh;
2217
 
+       AIX_retry_tail = &aix_bh->next_r1;
2218
 
+       aix_bh->next_r1 = NULL;
2219
 
+       spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2220
 
+       evms_cs_wakeup_thread(AIX_mirror_thread);
2221
 
+}
2222
 
+/****************************************************
2223
 
+* Function: AIX_handle_read_mirror_drives
2224
 
+*
2225
 
+* Handles a read from a set of mirrored AIX LVs
2226
 
+
2227
 
+*
2228
 
+*
2229
 
+*****************************************************/
2230
 
+static void AIX_handle_read_mirror_drives(struct buffer_head      * bh,
2231
 
+                                         int                      uptodate)
2232
 
+{
2233
 
+       aix_logical_volume_t * volume;                                
2234
 
+       evms_logical_node_t   * node;
2235
 
+       aix_mirror_bh_t  * tmp_bh;
2236
 
+       kdev_t          tmp_b_dev = bh->b_dev; 
2237
 
+       u_int32_t       count;
2238
 
+
2239
 
+       tmp_bh = (aix_mirror_bh_t *)bh->b_private;
2240
 
+       volume = (aix_logical_volume_t *) tmp_bh->node->instance_data; 
2241
 
+       node   = tmp_bh->node;
2242
 
+
2243
 
+       LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", node, bh->b_state,uptodate, volume->mirror_copies);
2244
 
+
2245
 
+       if (!uptodate && tmp_bh->iteration < volume->mirror_copies) {
2246
 
+               AIX_evms_cs_notify_lv_io_error(node);
2247
 
+               AIX_reschedule_retry(tmp_bh);
2248
 
+       } else {
2249
 
+               tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2250
 
+               evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2251
 
+               evms_cs_volume_request_in_progress(tmp_b_dev, AIX_DECREMENT_REQUEST, &count);
2252
 
+
2253
 
+       }
2254
 
+
2255
 
+
2256
 
+
2257
 
+       return;
2258
 
+}
2259
 
+/****************************************************
2260
 
+* This is a temporary function until a common EVMS
2261
 
+* notification function can be created.
2262
 
+*
2263
 
+*****************************************************/
2264
 
+static int  AIX_evms_cs_notify_lv_io_error(evms_logical_node_t * node)
2265
 
+{
2266
 
+       aix_logical_volume_t * volume;
2267
 
+
2268
 
+       volume = (aix_logical_volume_t *)node->instance_data;
2269
 
+
2270
 
+       LOG_CRITICAL("Notify_ERROR !!  node:%p volume->lv_status:%d volume->name:[%s]\n", node, volume->lv_status,volume->name);
2271
 
+
2272
 
+       return 0;
2273
 
+}
2274
 
+
2275
 
+/* Function: lvm_cleanup
2276
 
+ *
2277
 
+ *     This function runs through the entire lvm data structure, removing
2278
 
+ *     all items that are not needed at runtime. Currently, this is just the
2279
 
+ *     vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
2280
 
+ *     groups that don't contain any volumes are deleted. All of the other
2281
 
+ *     volume_group, logical_volume and evms_logical_node structures will be
2282
 
+ *     kept around at run-time.
2283
 
+ */
2284
 
+static int lvm_cleanup( void )
2285
 
+{
2286
 
+       aix_volume_group_t      * group;
2287
 
+
2288
 
+       group = AIXVolumeGroupList;
2289
 
+
2290
 
+       while (group) {
2291
 
+
2292
 
+               if (group->AIXvgh) {
2293
 
+                       evms_cs_deallocate_memory(group->AIXvgh);
2294
 
+                       group->AIXvgh = NULL;
2295
 
+               }
2296
 
+
2297
 
+               group = group->next;
2298
 
+       }
2299
 
+
2300
 
+       return 0;
2301
 
+}
2302
 
+
2303
 
+/****************************************************
2304
 
+* Function: AIX_copy_header_info
2305
 
+*
2306
 
+* Copy the disk header info into the volume struct
2307
 
+* so we can use it later.
2308
 
+*
2309
 
+* 
2310
 
+*
2311
 
+*****************************************************/
2312
 
+static int AIX_copy_header_info(vg_header *AIXvgh, vg_header *AIXvgh2)
2313
 
+{
2314
 
+
2315
 
+       LOG_DEBUG("CHI  AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
2316
 
+
2317
 
+       if (AIXvgh) {
2318
 
+
2319
 
+               AIXvgh->vg_timestamp.tv_sec     = AIXvgh2->vg_timestamp.tv_sec; 
2320
 
+               AIXvgh->vg_timestamp.tv_nsec    = AIXvgh2->vg_timestamp.tv_nsec; 
2321
 
+               AIXvgh->vg_id.word1             = AIXvgh2->vg_id.word1;
2322
 
+               AIXvgh->vg_id.word2             = AIXvgh2->vg_id.word2;
2323
 
+               AIXvgh->vg_id.word3             = AIXvgh2->vg_id.word3;
2324
 
+               AIXvgh->vg_id.word4             = AIXvgh2->vg_id.word4;
2325
 
+               AIXvgh->numlvs                  = AIXvgh2->numlvs;       
2326
 
+               AIXvgh->maxlvs                  = AIXvgh2->maxlvs;       
2327
 
+               AIXvgh->pp_size                 = AIXvgh2->pp_size;
2328
 
+               AIXvgh->numpvs                  = AIXvgh2->numpvs;     
2329
 
+               AIXvgh->total_vgdas             = AIXvgh2->total_vgdas;
2330
 
+               AIXvgh->vgda_size               = AIXvgh2->vgda_size;  
2331
 
+               AIXvgh->bigvg                   = AIXvgh2->bigvg;      
2332
 
+               AIXvgh->quorum                  = AIXvgh2->quorum;     
2333
 
+               AIXvgh->auto_varyon             = AIXvgh2->auto_varyon;
2334
 
+               AIXvgh->checksum                = AIXvgh2->checksum;   
2335
 
+               AIXvgh->bigda_size              = AIXvgh2->bigda_size; 
2336
 
+
2337
 
+       } else {
2338
 
+               return -ENOMEM;
2339
 
+       }
2340
 
+
2341
 
+       LOG_DEBUG("Returning CHI  AIXvgh:%p AIXvgh2:%p\n",AIXvgh,AIXvgh2);
2342
 
+
2343
 
+       return 0;
2344
 
+}
2345
 
+/****************************************************
2346
 
+* Function: AIX_free_header
2347
 
+*
2348
 
+* 
2349
 
+* 
2350
 
+* 
2351
 
+*
2352
 
+*****************************************************/
2353
 
+static void AIX_free_headers(vg_header *AIXvgh, vg_header *AIXvgh2, vg_trailer *AIXvgt, vg_trailer *AIXvgt2)
2354
 
+{
2355
 
+
2356
 
+       if (AIXvgh) {
2357
 
+               evms_cs_deallocate_memory(AIXvgh);
2358
 
+               AIXvgh = NULL;
2359
 
+       }
2360
 
+
2361
 
+       if (AIXvgh2) {
2362
 
+               evms_cs_deallocate_memory(AIXvgh2);
2363
 
+               AIXvgh2 = NULL;
2364
 
+       }
2365
 
+
2366
 
+       if (AIXvgt) {
2367
 
+               evms_cs_deallocate_memory(AIXvgt);
2368
 
+               AIXvgt = NULL;
2369
 
+       }
2370
 
+
2371
 
+       if (AIXvgt2) {
2372
 
+               evms_cs_deallocate_memory(AIXvgt2);
2373
 
+               AIXvgt2 = NULL;
2374
 
+       }
2375
 
+
2376
 
+}
2377
 
+
2378
 
+/****************************************************
2379
 
+* Function: AIXiod
2380
 
+*
2381
 
+* This is a kernel thread that handles read/write of mirrorss
2382
 
+* This shouldn't ever run on a non-mirrored LV read/write
2383
 
+* 
2384
 
+*
2385
 
+*****************************************************/
2386
 
+static void AIXiod (void *data)
2387
 
+{
2388
 
+       aix_mirror_bh_t         * r1_bh;
2389
 
+       evms_logical_node_t     * node;
2390
 
+       unsigned long flags;
2391
 
+
2392
 
+
2393
 
+       while(1){
2394
 
+
2395
 
+               spin_lock_irqsave(&AIX_retry_list_lock, flags);
2396
 
+               if (AIX_retry_list == NULL){
2397
 
+                       spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2398
 
+                       break;
2399
 
+               }
2400
 
+               r1_bh = AIX_retry_list;
2401
 
+               AIX_retry_list = r1_bh->next_r1;
2402
 
+               spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2403
 
+               r1_bh->next_r1 = NULL; // for mark
2404
 
+
2405
 
+       switch (r1_bh->cmd) {
2406
 
+       case AIX_LV_READ:
2407
 
+
2408
 
+                       r1_bh->iteration++;
2409
 
+               LOG_DEBUG("Report from thread AIXiod READ\n");
2410
 
+
2411
 
+               if (r1_bh->iteration == AIX_FIRST_MIRROR) {
2412
 
+                       node = r1_bh->mir_node1;
2413
 
+                       r1_bh->eio.rsector = r1_bh->mir_sector1;
2414
 
+               } else {
2415
 
+                       node = r1_bh->mir_node2;
2416
 
+                       r1_bh->eio.rsector = r1_bh->mir_sector2;
2417
 
+               }
2418
 
+
2419
 
+
2420
 
+               R_IO(node, &r1_bh->eio);
2421
 
+
2422
 
+               break;
2423
 
+
2424
 
+       default:
2425
 
+               LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n", r1_bh->cmd);
2426
 
+               break;
2427
 
+       }
2428
 
+
2429
 
+       }
2430
 
+       return;
2431
 
+}
2432
 
+/****************************************************
2433
 
+* Function: AIX_volume_group_dump
2434
 
+*
2435
 
+* This is for debug purposes and will walk the volume group list
2436
 
+* and LV's within the volume groups
2437
 
+*
2438
 
+* It can be called at anytime however the output to the display is large
2439
 
+*
2440
 
+*****************************************************/
2441
 
+#ifdef EVMS_AIX_DEBUG
2442
 
+static int AIX_volume_group_dump(void)
2443
 
+{
2444
 
+       aix_volume_group_t      * AIXVGLDebugPtr;
2445
 
+       partition_list_entry_t  * DebugPartitionList;
2446
 
+       aix_logical_volume_t    * DebugLVList;
2447
 
+       int i;
2448
 
+
2449
 
+       AIXVGLDebugPtr = AIXVolumeGroupList;
2450
 
+
2451
 
+       if (!AIXVGLDebugPtr) {
2452
 
+               LOG_DEBUG("***********************************************\n");
2453
 
+               LOG_DEBUG("ERROR Nothing built in the list to check !!!   \n");
2454
 
+               LOG_DEBUG("***********************************************\n");
2455
 
+               return 0;
2456
 
+       }
2457
 
+
2458
 
+       LOG_DEBUG("***********************************************    \n");
2459
 
+       LOG_DEBUG("Begin Volume Group Dump \n");
2460
 
+       LOG_DEBUG("***********************************************    \n");
2461
 
+
2462
 
+       while (AIXVGLDebugPtr) {
2463
 
+
2464
 
+               LOG_DEBUG("vg_number      %x\n",AIXVGLDebugPtr->vg_id.word2   );
2465
 
+               LOG_DEBUG("numpvs         %d\n",AIXVGLDebugPtr->numpvs        );         
2466
 
+               LOG_DEBUG("numlvs         %d\n",AIXVGLDebugPtr->numlvs        );         
2467
 
+               LOG_DEBUG("hard_sect_size %d\n",AIXVGLDebugPtr->hard_sect_size);         
2468
 
+               LOG_DEBUG("block_size     %d\n",AIXVGLDebugPtr->block_size    );         
2469
 
+               LOG_DEBUG("flags          %d\n",AIXVGLDebugPtr->flags         );         
2470
 
+               LOG_DEBUG("lv_max         %d\n",AIXVGLDebugPtr->lv_max        );         
2471
 
+               LOG_DEBUG("pe_size        %d\n",AIXVGLDebugPtr->pe_size       );         
2472
 
+               LOG_DEBUG("CleanVGInfo    %d\n",AIXVGLDebugPtr->CleanVGInfo   );
2473
 
+
2474
 
+               DebugPartitionList = AIXVGLDebugPtr->partition_list;
2475
 
+
2476
 
+               LOG_DEBUG("********* Begin Volume Partition Dump ********* \n");
2477
 
+
2478
 
+               if (!DebugPartitionList) {
2479
 
+                       LOG_DEBUG("No partitions to check !!  \n");
2480
 
+               }
2481
 
+
2482
 
+
2483
 
+               while (DebugPartitionList) {
2484
 
+                       LOG_DEBUG("logical_node       %p\n",DebugPartitionList->logical_node       );
2485
 
+                       LOG_DEBUG("pv_number          %d\n",DebugPartitionList->pv_number          );
2486
 
+                       LOG_DEBUG("block_size         %d\n",DebugPartitionList->block_size         );
2487
 
+                       LOG_DEBUG("hard_sect_size     %d\n",DebugPartitionList->hard_sect_size     );
2488
 
+                       LOG_DEBUG("-------------------------------------------------------------\n");
2489
 
+                       DebugPartitionList = DebugPartitionList->next;
2490
 
+               }
2491
 
+
2492
 
+               LOG_DEBUG("********* End Volume Partition Dump **********\n");
2493
 
+
2494
 
+               LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n");
2495
 
+
2496
 
+               DebugLVList = AIXVGLDebugPtr->volume_list[0];
2497
 
+
2498
 
+               if (!DebugLVList) {
2499
 
+                       LOG_DEBUG("No logical volumes to check !!  \n");
2500
 
+               }
2501
 
+
2502
 
+               for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) {
2503
 
+
2504
 
+                       DebugLVList = AIXVGLDebugPtr->volume_list[i];
2505
 
+
2506
 
+                       if (DebugLVList) {
2507
 
+                               LOG_DEBUG("volume_list #    %d \n",  i                             );
2508
 
+                               LOG_DEBUG("lv_number        %d \n",  DebugLVList->lv_number        );
2509
 
+                               LOG_DEBUG("LV name          %s \n",  DebugLVList->name             );
2510
 
+                               LOG_DEBUG("lv_size          %Ld \n", DebugLVList->lv_size          );
2511
 
+                               LOG_DEBUG("lv_access        %d \n",  DebugLVList->lv_access        );
2512
 
+                               LOG_DEBUG("lv_status        %d \n",  DebugLVList->lv_status        );
2513
 
+                               LOG_DEBUG("lv_minor         %d \n",  DebugLVList->lv_minor         );
2514
 
+                               LOG_DEBUG("mirror_copies    %d \n",  DebugLVList->mirror_copies    );
2515
 
+                               LOG_DEBUG("mirror_number    %d \n",  DebugLVList->mirror_number    );
2516
 
+                               LOG_DEBUG("stripes          %d \n",  DebugLVList->stripes          );
2517
 
+                               LOG_DEBUG("stripe_size      %d \n",  DebugLVList->stripe_size      );
2518
 
+                               LOG_DEBUG("stripe_size_shift%d \n",  DebugLVList->stripe_size_shift);
2519
 
+                               LOG_DEBUG("pe_size          %d \n",  DebugLVList->pe_size          );
2520
 
+                               LOG_DEBUG("pe_size_shift    %d \n",  DebugLVList->pe_size_shift    );
2521
 
+                               LOG_DEBUG("num_le           %d \n",  DebugLVList->num_le           );
2522
 
+                               LOG_DEBUG("new_volume       %d \n",  DebugLVList->new_volume       );
2523
 
+                               LOG_DEBUG("group            %p \n",  DebugLVList->group            );
2524
 
+                       }
2525
 
+
2526
 
+
2527
 
+               }
2528
 
+
2529
 
+               AIXVGLDebugPtr = AIXVGLDebugPtr->next;
2530
 
+
2531
 
+               LOG_DEBUG("********** End Logical Volume Partition Dump **********\n");
2532
 
+
2533
 
+
2534
 
+       }
2535
 
+
2536
 
+       LOG_DEBUG("***********************************************\n");
2537
 
+       LOG_DEBUG("End Volume Group Dump                          \n");
2538
 
+       LOG_DEBUG("***********************************************\n");
2539
 
+
2540
 
+       return 0;
2541
 
+
2542
 
+}
2543
 
+#endif
2544
 
+
2545
 
diff -Naur linux-2002-03-28/drivers/evms/Config.in evms-2002-03-28/drivers/evms/Config.in
2546
 
--- linux-2002-03-28/drivers/evms/Config.in     Wed Dec 31 18:00:00 1969
2547
 
+++ evms-2002-03-28/drivers/evms/Config.in      Mon Mar 18 16:54:45 2002
2548
 
@@ -0,0 +1,60 @@
2549
 
+#
2550
 
+#   Copyright (c) International Business Machines  Corp., 2000
2551
 
+#
2552
 
+#   This program is free software;  you can redistribute it and/or modify
2553
 
+#   it under the terms of the GNU General Public License as published by
2554
 
+#   the Free Software Foundation; either version 2 of the License, or
2555
 
+#   (at your option) any later version.
2556
 
+#
2557
 
+#   This program is distributed in the hope that it will be useful,
2558
 
+#   but WITHOUT ANY WARRANTY;  without even the implied warranty of
2559
 
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
2560
 
+#   the GNU General Public License for more details.
2561
 
+#
2562
 
+#   You should have received a copy of the GNU General Public License
2563
 
+#   along with this program;  if not, write to the Free Software
2564
 
+#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2565
 
+#
2566
 
+#
2567
 
+# EVMS driver configuration
2568
 
+#
2569
 
+
2570
 
+mainmenu_option next_comment
2571
 
+comment 'Enterprise Volume Management System'
2572
 
+
2573
 
+tristate     'EVMS Kernel Runtime' CONFIG_EVMS
2574
 
+dep_tristate '  EVMS Local Device Manager Plugin' CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN $CONFIG_EVMS
2575
 
+dep_tristate '  EVMS DOS Partition Manager Plugin' CONFIG_EVMS_DOS_PARTITION_PLUGIN $CONFIG_EVMS
2576
 
+dep_tristate '  EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT_PLUGIN $CONFIG_EVMS
2577
 
+dep_tristate '  EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK_PLUGIN $CONFIG_EVMS
2578
 
+dep_tristate '  EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR_PLUGIN $CONFIG_EVMS
2579
 
+dep_tristate '  EVMS Linux LVM Package' CONFIG_EVMS_LVM_PLUGIN $CONFIG_EVMS
2580
 
+dep_tristate '  EVMS Linux MD Package' CONFIG_EVMS_MD_PLUGIN $CONFIG_EVMS
2581
 
+dep_tristate '    EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR_PERS $CONFIG_EVMS_MD_PLUGIN
2582
 
+dep_tristate '    EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0_PERS $CONFIG_EVMS_MD_PLUGIN
2583
 
+dep_tristate '    EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1_PERS $CONFIG_EVMS_MD_PLUGIN
2584
 
+dep_tristate '    EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5_PERS $CONFIG_EVMS_MD_PLUGIN
2585
 
+dep_tristate '  EVMS AIX LVM Package' CONFIG_EVMS_AIX_PLUGIN $CONFIG_EVMS
2586
 
+dep_tristate '  EVMS OS/2 LVM Package' CONFIG_EVMS_OS2_PLUGIN $CONFIG_EVMS
2587
 
+dep_tristate '  EVMS Clustering Package' CONFIG_EVMS_ECR_PLUGIN $CONFIG_EVMS
2588
 
+
2589
 
+if [ "$CONFIG_ARCH_S390" = "y" ]; then
2590
 
+dep_tristate '  EVMS s390 Partition Manager Plugin' CONFIG_EVMS_S390_PART_PLUGIN $CONFIG_EVMS
2591
 
+fi
2592
 
+
2593
 
+if [ "$CONFIG_EVMS" != "n" ]; then
2594
 
+       choice '  EVMS Debug Level' \
2595
 
+               "Critical       CONFIG_EVMS_INFO_CRITICAL \
2596
 
+                Serious        CONFIG_EVMS_INFO_SERIOUS \
2597
 
+                Error          CONFIG_EVMS_INFO_ERROR \
2598
 
+                Warning        CONFIG_EVMS_INFO_WARNING \
2599
 
+                Default        CONFIG_EVMS_INFO_DEFAULT \
2600
 
+                Details        CONFIG_EVMS_INFO_DETAILS \
2601
 
+                Debug          CONFIG_EVMS_INFO_DEBUG \
2602
 
+                Extra          CONFIG_EVMS_INFO_EXTRA \
2603
 
+                Entry_Exit     CONFIG_EVMS_INFO_ENTRY_EXIT \
2604
 
+                Everything     CONFIG_EVMS_INFO_EVERYTHING" Default
2605
 
+fi
2606
 
+
2607
 
+endmenu
2608
 
+
2609
 
diff -Naur linux-2002-03-28/drivers/evms/Makefile evms-2002-03-28/drivers/evms/Makefile
2610
 
--- linux-2002-03-28/drivers/evms/Makefile      Wed Dec 31 18:00:00 1969
2611
 
+++ evms-2002-03-28/drivers/evms/Makefile       Thu Mar 28 15:13:34 2002
2612
 
@@ -0,0 +1,60 @@
2613
 
+#
2614
 
+# Makefile for the kernel EVMS driver and modules.
2615
 
+#
2616
 
+# 08 March 2001, Mark Peloquin <peloquin@us.ibm.com>
2617
 
+#
2618
 
+
2619
 
+O_TARGET := evmsdrvr.o
2620
 
+
2621
 
+export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o md_raid1.o md_raid5.o md_xor.o s390_part.o
2622
 
+
2623
 
+# Link order is important! Plugins must come first, then the EVMS core.
2624
 
+
2625
 
+obj-$(CONFIG_EVMS_LOCAL_DEV_MGR_PLUGIN)        += ldev_mgr.o
2626
 
+obj-$(CONFIG_EVMS_DOS_PARTITION_PLUGIN)        += dos_part.o
2627
 
+obj-$(CONFIG_EVMS_MD_PLUGIN)           += md_core.o
2628
 
+obj-$(CONFIG_EVMS_MD_LINEAR_PERS)      += md_linear.o
2629
 
+obj-$(CONFIG_EVMS_MD_RAID0_PERS)       += md_raid0.o
2630
 
+obj-$(CONFIG_EVMS_MD_RAID1_PERS)       += md_raid1.o
2631
 
+obj-$(CONFIG_EVMS_MD_RAID5_PERS)       += md_raid5.o md_xor.o
2632
 
+obj-$(CONFIG_EVMS_LVM_PLUGIN)          += lvm_vge.o
2633
 
+obj-$(CONFIG_EVMS_AIX_PLUGIN)          += AIXlvm_vge.o
2634
 
+obj-$(CONFIG_EVMS_OS2_PLUGIN)          += os2lvm_vge.o
2635
 
+obj-$(CONFIG_EVMS_DRIVELINK_PLUGIN)    += evms_drivelink.o
2636
 
+obj-$(CONFIG_EVMS_BBR_PLUGIN)          += evms_bbr.o
2637
 
+obj-$(CONFIG_EVMS_SNAPSHOT_PLUGIN)     += snapshot.o
2638
 
+obj-$(CONFIG_EVMS_ECR_PLUGIN)          += evms_ecr.o
2639
 
+obj-$(CONFIG_EVMS_S390_PART_PLUGIN)    += s390_part.o
2640
 
+obj-$(CONFIG_EVMS)                     += evms_passthru.o evms.o
2641
 
+
2642
 
+EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT
2643
 
+ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y)
2644
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL
2645
 
+endif
2646
 
+ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y)
2647
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS
2648
 
+endif
2649
 
+ifeq ($(CONFIG_EVMS_INFO_ERROR),y)
2650
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR
2651
 
+endif
2652
 
+ifeq ($(CONFIG_EVMS_INFO_WARNING),y)
2653
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING
2654
 
+endif
2655
 
+ifeq ($(CONFIG_EVMS_INFO_DETAILS),y)
2656
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS
2657
 
+endif
2658
 
+ifeq ($(CONFIG_EVMS_INFO_DEBUG),y)
2659
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG
2660
 
+endif
2661
 
+ifeq ($(CONFIG_EVMS_INFO_EXTRA),y)
2662
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA
2663
 
+endif
2664
 
+ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y)
2665
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT
2666
 
+endif
2667
 
+ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y)
2668
 
+       EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING
2669
 
+endif
2670
 
+
2671
 
+include $(TOPDIR)/Rules.make
2672
 
+
2673
 
diff -Naur linux-2002-03-28/drivers/evms/dos_part.c evms-2002-03-28/drivers/evms/dos_part.c
2674
 
--- linux-2002-03-28/drivers/evms/dos_part.c    Wed Dec 31 18:00:00 1969
2675
 
+++ evms-2002-03-28/drivers/evms/dos_part.c     Wed Mar 27 21:24:20 2002
2676
 
@@ -0,0 +1,1407 @@
2677
 
+/* -*- linux-c -*- */
2678
 
+/*
2679
 
+ *
2680
 
+ *
2681
 
+ *   Copyright (c) International Business Machines  Corp., 2000
2682
 
+ *
2683
 
+ *   This program is free software;  you can redistribute it and/or modify
2684
 
+ *   it under the terms of the GNU General Public License as published by
2685
 
+ *   the Free Software Foundation; either version 2 of the License, or
2686
 
+ *   (at your option) any later version.
2687
 
+ *
2688
 
+ *   This program is distributed in the hope that it will be useful,
2689
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
2690
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
2691
 
+ *   the GNU General Public License for more details.
2692
 
+ *
2693
 
+ *   You should have received a copy of the GNU General Public License
2694
 
+ *   along with this program;  if not, write to the Free Software
2695
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2696
 
+ *
2697
 
+ *
2698
 
+ */
2699
 
+/*
2700
 
+ * linux/drivers/evms/dos_part.c
2701
 
+ *
2702
 
+ * EVMS DOS partition manager
2703
 
+ *
2704
 
+ * Partial code extracted from
2705
 
+ *
2706
 
+ *  linux/fs/partitions/msdos.c
2707
 
+ *
2708
 
+ */
2709
 
+
2710
 
+#include <linux/config.h>
2711
 
+#include <linux/module.h>
2712
 
+#include <linux/kernel.h>
2713
 
+#include <linux/config.h>
2714
 
+#include <linux/fs.h>
2715
 
+#include <linux/genhd.h>
2716
 
+#include <linux/major.h>
2717
 
+#include <linux/string.h>
2718
 
+#include <linux/blk.h>
2719
 
+#include <linux/init.h>
2720
 
+#include <linux/iobuf.h> /* for kiobuf stuffs */
2721
 
+
2722
 
+#ifdef CONFIG_BLK_DEV_IDE
2723
 
+#include <linux/ide.h>  /* IDE xlate */
2724
 
+#endif /* CONFIG_BLK_DEV_IDE */
2725
 
+
2726
 
+#include <linux/evms/evms_kernel.h>
2727
 
+#include <linux/evms/evms_os2.h>
2728
 
+
2729
 
+#include <asm/system.h>
2730
 
+#include <asm/uaccess.h>
2731
 
+
2732
 
+/* prefix used in logging messages */
2733
 
+#define LOG_PREFIX "dos_part: "
2734
 
+
2735
 
+/* #include "msdos.h" */
2736
 
+#define MSDOS_LABEL_MAGIC               0xAA55
2737
 
+
2738
 
+/* Skeletal MBR/EBR structure useful for our purposes */
2739
 
+typedef struct mbr_ebr_s {
2740
 
+        u_int8_t                unused1[0x1be];
2741
 
+        struct partition        partitions[4];
2742
 
+        u_int16_t               signature;
2743
 
+} mbr_ebr_t;
2744
 
+
2745
 
+/* Private instance data structure for node we produced */
2746
 
+typedef struct local_instance_data_s {
2747
 
+        evms_logical_node_t     * source_disk;
2748
 
+        evms_sector_t           start_sect;     /* starting LBA */
2749
 
+        evms_sector_t           nr_sects;       /* number of sectors */
2750
 
+        unsigned char           type;           /* partition type or filesystem format indicator, can be set to 0 */
2751
 
+} local_instance_data_t;
2752
 
+
2753
 
+/* Structure used to track progress traversing an EBR chain */
2754
 
+typedef struct extended_part_s {
2755
 
+        int                  partition_number;
2756
 
+        struct partition    *extended;
2757
 
+        u_int64_t            start_sect;
2758
 
+        u_int64_t            next_ebr_start;
2759
 
+        int                  done;
2760
 
+} extended_part_t;
2761
 
+
2762
 
+/* Global variables */
2763
 
+static int cur_comp_part_num;   /* used to track non-primary
2764
 
+                                 * partition numbers
2765
 
+                                 */
2766
 
+static int exported_nodes;      /* total # of exported segments
2767
 
+                                 * produced during this discovery.
2768
 
+                                 */
2769
 
+
2770
 
+/* External references */
2771
 
+#if CONFIG_BLK_DEV_MD && CONFIG_AUTODETECT_RAID
2772
 
+extern void md_autodetect_dev(kdev_t dev);
2773
 
+#endif
2774
 
+
2775
 
+/* Prototypes */
2776
 
+static int  mbr_ebr_partition_discover(evms_logical_node_t **);
2777
 
+static int  mbr_ebr_partition_delete(evms_logical_node_t *);
2778
 
+static void mbr_ebr_partition_read(evms_logical_node_t *,
2779
 
+                                   eio_t *);
2780
 
+static void mbr_ebr_partition_write(evms_logical_node_t *,
2781
 
+                                    eio_t *);
2782
 
+static int  mbr_ebr_partition_ioctl(evms_logical_node_t *,
2783
 
+                                    struct inode *,
2784
 
+                                    struct file *,
2785
 
+                                    unsigned int,
2786
 
+                                    unsigned long);
2787
 
+static int  mbr_ebr_partition_init_io(evms_logical_node_t *,
2788
 
+                                      int,
2789
 
+                                      evms_sector_t,
2790
 
+                                      evms_sector_t,
2791
 
+                                      void *);
2792
 
+
2793
 
+static evms_plugin_function_table_t function_table = {
2794
 
+        discover: &mbr_ebr_partition_discover,
2795
 
+        delete  : &mbr_ebr_partition_delete,
2796
 
+        read    : &mbr_ebr_partition_read,
2797
 
+        write   : &mbr_ebr_partition_write,
2798
 
+        init_io : &mbr_ebr_partition_init_io,
2799
 
+        ioctl   : &mbr_ebr_partition_ioctl
2800
 
+};
2801
 
+
2802
 
+#define EVMS_MSDOS_PARTITION_MANAGER_ID 1
2803
 
+
2804
 
+static evms_plugin_header_t plugin_header = {
2805
 
+        id              : SetPluginID(
2806
 
+                IBM_OEM_ID,
2807
 
+                EVMS_SEGMENT_MANAGER,
2808
 
+                EVMS_MSDOS_PARTITION_MANAGER_ID),
2809
 
+        version         : {
2810
 
+                major      : 1,
2811
 
+                minor      : 0,
2812
 
+                patchlevel : 0
2813
 
+        },
2814
 
+        required_common_services_version : {
2815
 
+                major      : 0,
2816
 
+                minor      : 5,
2817
 
+                patchlevel : 0
2818
 
+        },
2819
 
+        function_table  : &function_table
2820
 
+};
2821
 
+
2822
 
+/*
2823
 
+ * Many architectures don't like unaligned accesses, which is
2824
 
+ * frequently the case with the nr_sects and start_sect partition
2825
 
+ * table entries.
2826
 
+ */
2827
 
+#include <asm/unaligned.h>
2828
 
+
2829
 
+#define SYS_IND(p)      (get_unaligned(&p->sys_ind))
2830
 
+#define NR_SECTS(p)     (u_int64_t)({ __typeof__(p->nr_sects) __a =        \
2831
 
+                                get_unaligned(&p->nr_sects);    \
2832
 
+                                le32_to_cpu(__a); \
2833
 
+                        })
2834
 
+
2835
 
+#define START_SECT(p)   (u_int64_t)({ __typeof__(p->start_sect) __a =      \
2836
 
+                                get_unaligned(&p->start_sect);  \
2837
 
+                                le32_to_cpu(__a); \
2838
 
+                        })
2839
 
+
2840
 
+
2841
 
+/***************************************************/
2842
 
+/* List Support - Typedefs, Variables, & Functions */
2843
 
+/***************************************************/
2844
 
+
2845
 
+/* Typedefs */
2846
 
+
2847
 
+typedef struct local_segment_list_node_s {
2848
 
+        evms_logical_node_t              *segment;
2849
 
+        struct local_segment_list_node_s *next;
2850
 
+} local_segment_list_node_t;
2851
 
+
2852
 
+typedef struct local_disk_list_node_s {
2853
 
+        evms_logical_node_t           *disk;
2854
 
+        local_segment_list_node_t     *segment_list;
2855
 
+        struct local_disk_list_node_s *next;
2856
 
+} local_disk_list_node_t;
2857
 
+
2858
 
+/* Variables */
2859
 
+
2860
 
+static local_disk_list_node_t *my_disk_list;
2861
 
+
2862
 
+/* Functions */
2863
 
+
2864
 
+static local_disk_list_node_t **
2865
 
+lookup_disk(
2866
 
+        evms_logical_node_t *disk)
2867
 
+{
2868
 
+        local_disk_list_node_t **ldln;
2869
 
+
2870
 
+        ldln = &my_disk_list;
2871
 
+        while(*ldln) {
2872
 
+                if ((*ldln)->disk == disk)
2873
 
+                        break;
2874
 
+                ldln = &(*ldln)->next;
2875
 
+        }
2876
 
+        return(ldln);
2877
 
+}
2878
 
+
2879
 
+static local_segment_list_node_t **
2880
 
+lookup_segment(
2881
 
+        local_disk_list_node_t *disk,
2882
 
+        evms_logical_node_t    *segment)
2883
 
+{
2884
 
+        local_segment_list_node_t **lsln;
2885
 
+
2886
 
+        lsln = &disk->segment_list;
2887
 
+        while(*lsln) {
2888
 
+                if ((*lsln)->segment == segment)
2889
 
+                        break;
2890
 
+                lsln = &(*lsln)->next;
2891
 
+        }
2892
 
+        return(lsln);
2893
 
+}
2894
 
+
2895
 
+static evms_logical_node_t *
2896
 
+find_segment_on_disk(
2897
 
+        evms_logical_node_t *disk,
2898
 
+        u_int64_t start_sect,
2899
 
+        u_int64_t nr_sects)
2900
 
+{
2901
 
+        evms_logical_node_t *rc = NULL;
2902
 
+        local_disk_list_node_t **ldln;
2903
 
+        local_segment_list_node_t **lsln;
2904
 
+        local_instance_data_t *lid;
2905
 
+
2906
 
+        ldln = lookup_disk(disk);
2907
 
+        if (*ldln) {
2908
 
+                /* disk found in list */
2909
 
+                /* attempt to find segment */
2910
 
+
2911
 
+                lsln = &(*ldln)->segment_list;
2912
 
+                while(*lsln) {
2913
 
+                        lid = (*lsln)->segment->instance_data;
2914
 
+                        if (lid->start_sect == start_sect)
2915
 
+                                if (lid->nr_sects == nr_sects)
2916
 
+                                        break;
2917
 
+                        lsln = &(*lsln)->next;
2918
 
+                }
2919
 
+                if (*lsln)
2920
 
+                        rc = (*lsln)->segment;
2921
 
+        }
2922
 
+        return(rc);
2923
 
+}
2924
 
+
2925
 
+/* function description: add_segment_to_disk
2926
 
+ *
2927
 
+ * this function attempts to add a segment to the segment
2928
 
+ * list of a disk. if the specified disk is not found, it
2929
 
+ * will be added to the global disk list. this function will
2930
 
+ * return a pointer to the matching segment in the disk's
2931
 
+ * segment list. the caller must compare the returned pointer
2932
 
+ * to the specified segment to see if the
2933
 
+ * specified segment was already present in the disk's segment
2934
 
+ * list. if the return pointer matches the specified segment,
2935
 
+ * then the specified segment was added to the list. if the
2936
 
+ * return segment pointer to does not match the specified
2937
 
+ * segment pointer, then the specified segment pointer was
2938
 
+ * a duplicate and can be thrown away.
2939
 
+ */
2940
 
+static int
2941
 
+add_segment_to_disk(
2942
 
+        evms_logical_node_t *disk,
2943
 
+        evms_logical_node_t *segment)
2944
 
+{
2945
 
+        int rc = 0;
2946
 
+        local_disk_list_node_t **ldln, *new_disk;
2947
 
+        local_segment_list_node_t **lsln, *new_segment;
2948
 
+
2949
 
+        ldln = lookup_disk(disk);
2950
 
+        if (*ldln == NULL) {
2951
 
+                /* disk not in list, add disk */
2952
 
+                rc = evms_cs_allocate_memory((void **)&new_disk,
2953
 
+                                             sizeof(*new_disk));
2954
 
+                if (!rc) {
2955
 
+                        new_disk->disk = disk;
2956
 
+                        *ldln = new_disk;
2957
 
+                }
2958
 
+        }
2959
 
+        if (!rc) {
2960
 
+                /* attempt to add segment */
2961
 
+                lsln = lookup_segment(*ldln, segment);
2962
 
+                if (*lsln == NULL) {
2963
 
+                        /* segment not in list, add segment */
2964
 
+                        rc = evms_cs_allocate_memory((void **)&new_segment,
2965
 
+                                                     sizeof(*new_segment));
2966
 
+                        if (!rc) {
2967
 
+                                new_segment->segment = segment;
2968
 
+                                *lsln = new_segment;
2969
 
+                        }
2970
 
+                } else
2971
 
+                        rc = -1;
2972
 
+        }
2973
 
+        return(rc);
2974
 
+}
2975
 
+
2976
 
+static int
2977
 
+remove_segment_from_disk(
2978
 
+        evms_logical_node_t *disk,
2979
 
+        evms_logical_node_t *segment,
2980
 
+        evms_logical_node_t **empty_disk)
2981
 
+{
2982
 
+        int rc = 0;
2983
 
+        local_disk_list_node_t **ldln, *tmp_disk_node;
2984
 
+        local_segment_list_node_t **lsln, *tmp_segment_node;
2985
 
+
2986
 
+        *empty_disk = NULL;
2987
 
+        ldln = lookup_disk(disk);
2988
 
+        if (*ldln == NULL) {
2989
 
+                rc = -1;
2990
 
+        } else {
2991
 
+                /* disk found in list */
2992
 
+                /* attempt to add segment */
2993
 
+                lsln = lookup_segment(*ldln, segment);
2994
 
+                if (*lsln == NULL) {
2995
 
+                        rc = -2;
2996
 
+                } else {
2997
 
+                        tmp_segment_node = *lsln;
2998
 
+                        /* remove segment from list */
2999
 
+                        *lsln = (*lsln)->next;
3000
 
+                        /* free the segment list node */
3001
 
+                        evms_cs_deallocate_memory(tmp_segment_node);
3002
 
+
3003
 
+                        if ((*ldln)->segment_list == NULL) {
3004
 
+                                tmp_disk_node = *ldln;
3005
 
+                                *empty_disk = tmp_disk_node->disk;
3006
 
+                                /* remove disk from list */
3007
 
+                                *ldln = (*ldln)->next;
3008
 
+                                /* free the disk list node */
3009
 
+                                evms_cs_deallocate_memory(tmp_disk_node);
3010
 
+                        }
3011
 
+                }
3012
 
+        }
3013
 
+        return(rc);
3014
 
+}
3015
 
+
3016
 
+static inline int
3017
 
+is_extended_partition(struct partition *p)
3018
 
+{
3019
 
+        return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
3020
 
+                SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
3021
 
+                SYS_IND(p) == LINUX_EXTENDED_PARTITION);
3022
 
+}
3023
 
+
3024
 
+static inline u64
3025
 
+part_start(struct partition *part, u64 ext_start, u64 ebr_start)
3026
 
+{
3027
 
+       u64 pstart = START_SECT(part);
3028
 
+       pstart += (is_extended_partition(part)) ? ext_start : ebr_start;
3029
 
+       return(pstart);
3030
 
+}
3031
 
+
3032
 
+static int
3033
 
+validate_mbr_ebr(
3034
 
+        evms_logical_node_t *node,
3035
 
+        mbr_ebr_t *mbr_ebr,
3036
 
+       u_int64_t ext_start,
3037
 
+       u_int64_t ebr_start)
3038
 
+{
3039
 
+        int valid_mbr_ebr, i, j, mbr_flag;
3040
 
+        struct partition *pi, *pj;
3041
 
+        u_int64_t pi_start, pi_end, pj_start, pj_end;
3042
 
+
3043
 
+        /* assume an MBR */
3044
 
+        mbr_flag = TRUE;
3045
 
+
3046
 
+        /* assume its valid */
3047
 
+        valid_mbr_ebr = TRUE;
3048
 
+
3049
 
+        /* check for valid signature */
3050
 
+        if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) {
3051
 
+                LOG_DEBUG("%s: invalid signature on '%s'!\n",
3052
 
+                         __FUNCTION__, node->name);
3053
 
+                valid_mbr_ebr = FALSE;
3054
 
+        }
3055
 
+
3056
 
+       /* check for an AIX IPL signature */
3057
 
+       #define IPLRECID 0xc9c2d4c1 /* Value is EBCIDIC 'IBMA'           */
3058
 
+       if ( *(unsigned int *)mbr_ebr == IPLRECID ) {
3059
 
+               LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n",
3060
 
+                       __FUNCTION__, node->name);
3061
 
+               valid_mbr_ebr = FALSE;
3062
 
+       }
3063
 
+       
3064
 
+
3065
 
+        /* check for boot sector fields */
3066
 
+
3067
 
+#if 0 //Remove checking of the first byte
3068
 
+
3069
 
+        /* attempt to make some initial assumptions about
3070
 
+         * what type of data structure this could be. we
3071
 
+         * start by checking the 1st byte. we can tell a
3072
 
+         * few things based on what is or isn't there.
3073
 
+         */
3074
 
+        if (valid_mbr_ebr == TRUE)
3075
 
+                switch(*(u_char *)mbr_ebr) {
3076
 
+                        /* check for JMP as 1st instruction
3077
 
+                         * if found, assume (for now), that
3078
 
+                         * this is a boot sector.
3079
 
+                         */
3080
 
+            /* Removed the JMP opcode check because it's not enough to determine
3081
 
+             * that this sector does not have a valid MBR.
3082
 
+             * Note:  To avoid going thru validation process of partition table,
3083
 
+             * it's necessary to have a better boot sector check
3084
 
+             * (eg. JMP opcode && other conditions) */
3085
 
+            /*
3086
 
+                        case 0xEB:
3087
 
+                                LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__);
3088
 
+                                valid_mbr_ebr = FALSE;
3089
 
+                */
3090
 
+                        /* let this fall thru to pick up the
3091
 
+                         * mbr_flag == FALSE.
3092
 
+                         */
3093
 
+
3094
 
+
3095
 
+                        /* the MBR should contain boot strap
3096
 
+                         * code, so we don't expect the 1st
3097
 
+                         * byte to be a 0x0. If the 1st byte
3098
 
+                         * IS 0x0, its assumed (for now) to
3099
 
+                         * be an EBR.
3100
 
+                         */
3101
 
+                        case 0:
3102
 
+                                mbr_flag = FALSE;
3103
 
+                                break;
3104
 
+                }
3105
 
+#endif //Remove checking of the first byte
3106
 
+
3107
 
+        if (valid_mbr_ebr == TRUE) {
3108
 
+               /* dump the partition table entries in debug mode */
3109
 
+               LOG_DEBUG("%s: disk relative starts: ext_part(%Ld), ebr(%Ld).\n",
3110
 
+                         __FUNCTION__, ext_start, ebr_start);
3111
 
+                for (i = 0; i < 4; i++) {
3112
 
+                        pi = &mbr_ebr->partitions[i];
3113
 
+                       LOG_DEBUG("%s: Partition: index(%d), start(%Ld), size(%Ld), sys(0x%x).\n",
3114
 
+                                 __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi), SYS_IND(pi));
3115
 
+               }
3116
 
+                /* check for mbr/ebr partition table validity */
3117
 
+                       for (i = 0; i < 4; i++) {
3118
 
+                        pi = &mbr_ebr->partitions[i];
3119
 
+                        if (NR_SECTS(pi)) {
3120
 
+                                /* check for partition extending past end of node */
3121
 
+                               pi_start = part_start(pi, ext_start, ebr_start);
3122
 
+                               pi_end = pi_start + NR_SECTS(pi) - 1;
3123
 
+                                if ( pi_end >= node->total_vsectors) {
3124
 
+                                        LOG_DEBUG("%s: partition(%d) ends(%Ld) beyond the end of the disk(%s,%Ld)!\n",
3125
 
+                                                 __FUNCTION__, i, pi_end, 
3126
 
+                                                node->name, node->total_vsectors);
3127
 
+                                        valid_mbr_ebr = FALSE;
3128
 
+                                }
3129
 
+                                if (valid_mbr_ebr == FALSE) break;
3130
 
+
3131
 
+                                /* check for partition overlap */
3132
 
+                                for (j = i + 1; j < 4; j++) {
3133
 
+                                        pj = &mbr_ebr->partitions[j];
3134
 
+                                        if (NR_SECTS(pj)) {
3135
 
+                                               pj_start = part_start(pj, ext_start, ebr_start);
3136
 
+                                               pj_end = pj_start + NR_SECTS(pj) - 1;
3137
 
+                                                if (pi_start == pj_start) {
3138
 
+                                                        valid_mbr_ebr = FALSE;
3139
 
+                                                } else if (pi_start < pj_start) {
3140
 
+                                                        if (pi_end >= pj_start)
3141
 
+                                                                valid_mbr_ebr = FALSE;
3142
 
+                                                } else if (pi_start <= pj_end)
3143
 
+                                                        valid_mbr_ebr = FALSE;
3144
 
+
3145
 
+                                                if (valid_mbr_ebr == FALSE) {
3146
 
+                                                        LOG_DEBUG("%s: overlapping partitions(%d,%d) detected on '%s'!\n",
3147
 
+                                                                 __FUNCTION__,i,j, node->name);
3148
 
+                                                        break;
3149
 
+                                                }
3150
 
+                                        }
3151
 
+                                }
3152
 
+                                if (valid_mbr_ebr == FALSE) break;
3153
 
+                        }
3154
 
+                }
3155
 
+        }
3156
 
+        if (valid_mbr_ebr == TRUE) {
3157
 
+                LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__,
3158
 
+                         (mbr_flag == TRUE) ? 'M' : 'E', node->name);
3159
 
+        } else {
3160
 
+                LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n",
3161
 
+                         __FUNCTION__, node->name);
3162
 
+        }
3163
 
+        return(valid_mbr_ebr);
3164
 
+}
3165
 
+
3166
 
+/*
3167
 
+ * Function:  add_segment
3168
 
+ */
3169
 
+static int
3170
 
+mbr_ebr_process_segment(
3171
 
+        evms_logical_node_t **discover_list,
3172
 
+        evms_logical_node_t *node,
3173
 
+        u_int64_t            start_sect,
3174
 
+        u_int64_t            nr_sects,
3175
 
+        unsigned char        type,
3176
 
+        int                  part_num,
3177
 
+        char                *partition_name)
3178
 
+{
3179
 
+        local_instance_data_t *InstData = NULL;
3180
 
+        evms_logical_node_t *segment;
3181
 
+        int rc = 0;
3182
 
+
3183
 
+        segment = find_segment_on_disk(node, start_sect, nr_sects);
3184
 
+        if (segment) {
3185
 
+               LOG_DETAILS("exporting segment '%s'.\n",
3186
 
+                           segment->name);
3187
 
+       } else {
3188
 
+                rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
3189
 
+                if (!rc) {
3190
 
+                        InstData->source_disk = node;
3191
 
+                        InstData->start_sect = start_sect;
3192
 
+                        InstData->nr_sects = nr_sects;
3193
 
+                        InstData->type = type;
3194
 
+                        rc = evms_cs_allocate_logical_node(&segment);
3195
 
+                }
3196
 
+                if (!rc) {
3197
 
+                        segment->plugin = &plugin_header;
3198
 
+                        segment->system_id = (unsigned int)type;
3199
 
+                        segment->total_vsectors = nr_sects;
3200
 
+                        segment->block_size = node->block_size;
3201
 
+                        segment->hardsector_size = node->hardsector_size;
3202
 
+                        segment->instance_data = InstData;
3203
 
+                       segment->flags = node->flags;
3204
 
+                        if (partition_name)
3205
 
+                                strcpy(segment->name, partition_name);
3206
 
+                        else {
3207
 
+                                strcpy(segment->name, node->name);
3208
 
+                                sprintf(segment->name + strlen(segment->name), "%d", part_num);
3209
 
+                        }
3210
 
+                        LOG_DETAILS("creating segment '%s'.\n",
3211
 
+                                segment->name);
3212
 
+                        rc = add_segment_to_disk(node, segment);
3213
 
+                        if (rc) {
3214
 
+                                LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
3215
 
+                                        __FUNCTION__, rc, segment->name);
3216
 
+                                rc = 0;
3217
 
+                        } else {
3218
 
+                               MOD_INC_USE_COUNT;
3219
 
+                       }
3220
 
+                }
3221
 
+                if (rc) {
3222
 
+                        if (InstData)
3223
 
+                                evms_cs_deallocate_memory(InstData);
3224
 
+                        if (segment)
3225
 
+                                evms_cs_deallocate_logical_node(segment);
3226
 
+                }
3227
 
+        }
3228
 
+        if (!rc) {
3229
 
+                evms_cs_add_logical_node_to_list(discover_list, segment);
3230
 
+                exported_nodes++;
3231
 
+        }
3232
 
+        return rc;
3233
 
+}
3234
 
+
3235
 
+static void
3236
 
+print_partition_info( char *leading_comment, struct partition *p )
3237
 
+{
3238
 
+        LOG_EXTRA("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA(%Lu), sizeLBA(%Lu)\n",
3239
 
+                leading_comment,p->boot_ind,p->sys_ind,p->cyl,p->head,p->sector,
3240
 
+                p->end_cyl,p->end_head,p->end_sector,START_SECT(p),NR_SECTS(p));
3241
 
+}
3242
 
+
3243
 
+#ifdef CONFIG_BSD_DISKLABEL
3244
 
+#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1
3245
 
+static void
3246
 
+print_bsd_partition_info( char *leading_comment, struct bsd_partition *p )
3247
 
+{
3248
 
+        LOG_EXTRA("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n",
3249
 
+                leading_comment,p->p_size, p->p_offset, p->p_fsize, p->p_fstype, p->p_frag, p->p_cpg);
3250
 
+}
3251
 
+
3252
 
+/*
3253
 
+ * bsd_disklabel_partition
3254
 
+ *
3255
 
+ * Return:
3256
 
+ *     - 0 for 0 partition
3257
 
+ *     - (positive) number for number of BSD partitions found
3258
 
+ *     - (negative) error code
3259
 
+ */
3260
 
+static int
3261
 
+bsd_disklabel_partition(
3262
 
+       evms_logical_node_t **discover_list,
3263
 
+       evms_logical_node_t *node,
3264
 
+       struct partition *bsd)
3265
 
+{
3266
 
+        struct bsd_disklabel *l;
3267
 
+        struct bsd_partition *p;
3268
 
+        int max_partitions;
3269
 
+        char *data;
3270
 
+        int rc = 0;
3271
 
+       int count = 0;
3272
 
+
3273
 
+        rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3274
 
+        if (!rc)
3275
 
+                rc = INIT_IO(node,
3276
 
+                        0,
3277
 
+                        START_SECT(bsd) + BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET,
3278
 
+                        1,
3279
 
+                        data);
3280
 
+        if (!rc) {
3281
 
+
3282
 
+                l = (struct bsd_disklabel *) data;
3283
 
+                if (l->d_magic == BSD_DISKMAGIC) {
3284
 
+
3285
 
+                        max_partitions = ((SYS_IND(bsd) == OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS
3286
 
+                                : BSD_MAXPARTITIONS);
3287
 
+                        if (l->d_npartitions < max_partitions)
3288
 
+                                max_partitions = l->d_npartitions;
3289
 
+                        for (p = l->d_partitions; p - l->d_partitions <  max_partitions; p++) {
3290
 
+                                if (p->p_fstype != BSD_FS_UNUSED) {
3291
 
+                                        evmsTRACE2(EVMS_INFO_EXTRA,
3292
 
+                                                (print_bsd_partition_info(__FUNCTION__, p)));
3293
 
+                                        rc = mbr_ebr_process_segment(
3294
 
+                                                discover_list,
3295
 
+                                                node,
3296
 
+                                                (u_int64_t)p->p_offset,
3297
 
+                                                (u_int64_t)p->p_size,
3298
 
+                                                p->p_fstype,
3299
 
+                                                cur_comp_part_num++,
3300
 
+                                                NULL);
3301
 
+                                        if (rc)
3302
 
+                                                break;
3303
 
+                                       count++;
3304
 
+                                }
3305
 
+                        }
3306
 
+                }
3307
 
+        }
3308
 
+        if (data)
3309
 
+                evms_cs_deallocate_memory(data);
3310
 
+       if (!rc)
3311
 
+               rc = count;
3312
 
+       LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
3313
 
+        return rc;
3314
 
+}
3315
 
+#endif
3316
 
+
3317
 
+#ifdef CONFIG_UNIXWARE_DISKLABEL
3318
 
+#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29
3319
 
+
3320
 
+/*
3321
 
+ * unixware_partition
3322
 
+ *
3323
 
+ * Return:
3324
 
+ *     - 0 for 0 partition
3325
 
+ *     - (positive) number for number of UNIXWARE partitions found
3326
 
+ *     - (negative) error code
3327
 
+ */
3328
 
+static int
3329
 
+unixware_partition(
3330
 
+    evms_logical_node_t **discover_list,
3331
 
+    evms_logical_node_t *node,
3332
 
+        struct partition *unixware_part)
3333
 
+{
3334
 
+        struct unixware_disklabel *l;
3335
 
+        struct unixware_slice *p;
3336
 
+        char *data = NULL;
3337
 
+        int rc = 0;
3338
 
+       int count = 0;
3339
 
+
3340
 
+        rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3341
 
+        if (!rc)
3342
 
+                rc = INIT_IO(node,
3343
 
+                        0,
3344
 
+                        START_SECT(unixware_part) + UNIXWARE_PART_TABLE_SECTOR_OFFSET,
3345
 
+                        1,
3346
 
+                        data);
3347
 
+        if (!rc) {
3348
 
+                l = (struct unixware_disklabel *)data;
3349
 
+                if ( le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC &&
3350
 
+                                le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) {
3351
 
+                        p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */
3352
 
+                        while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
3353
 
+                                if (p->s_label != UNIXWARE_FS_UNUSED) {
3354
 
+                                        rc = mbr_ebr_process_segment(
3355
 
+                                                discover_list,
3356
 
+                                                node,
3357
 
+                                                START_SECT(p),
3358
 
+                                                NR_SECTS(p),
3359
 
+                                                UNIXWARE_PARTITION,
3360
 
+                                                cur_comp_part_num++,
3361
 
+                                                NULL);
3362
 
+                                       if (rc)
3363
 
+                                               break;
3364
 
+                                       count++;
3365
 
+                               }
3366
 
+                                p++;
3367
 
+                        }
3368
 
+                }
3369
 
+        }
3370
 
+        if (data)
3371
 
+                evms_cs_deallocate_memory(data);
3372
 
+       if (!rc)
3373
 
+               rc = count;
3374
 
+       LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
3375
 
+        return rc;
3376
 
+}
3377
 
+#endif
3378
 
+
3379
 
+#ifdef CONFIG_SOLARIS_X86_PARTITION
3380
 
+#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1
3381
 
+/*
3382
 
+ * solaris_x86_partition
3383
 
+ *
3384
 
+ * Return:
3385
 
+ *     - 0 for 0 partition
3386
 
+ *     - (positive) number for number of solaris partitions found
3387
 
+ *     - (negative) error code
3388
 
+ */
3389
 
+static int
3390
 
+solaris_x86_partition(
3391
 
+       evms_logical_node_t **discover_list,
3392
 
+        evms_logical_node_t *node,
3393
 
+        struct partition *solaris_x86,
3394
 
+       int probe_only) /* if TRUE, do not add segments */
3395
 
+{
3396
 
+        long offset = START_SECT(solaris_x86);
3397
 
+        struct solaris_x86_vtoc *v;
3398
 
+        struct solaris_x86_slice *s;
3399
 
+        int i;
3400
 
+        char *data = NULL;
3401
 
+        int rc=0;
3402
 
+       int count = 0;
3403
 
+
3404
 
+        rc = evms_cs_allocate_memory((void**) &data, node->hardsector_size);
3405
 
+        if (!rc)
3406
 
+                rc = INIT_IO(node,
3407
 
+                        0,
3408
 
+                        START_SECT(solaris_x86) + SOLARIS_X86_PART_TABLE_SECTOR_OFFSET,
3409
 
+                        1,
3410
 
+                        data);
3411
 
+        if (!rc) {
3412
 
+
3413
 
+                v = (struct solaris_x86_vtoc *)data;
3414
 
+
3415
 
+                if (v->v_sanity == SOLARIS_X86_VTOC_SANE) {
3416
 
+                        if (v->v_version != 1) {
3417
 
+                                LOG_WARNING("%s: cannot handle version %d vtoc>\n", __FUNCTION__, v->v_version);
3418
 
+                        } else {
3419
 
+                                for (i=0; i<v->v_nparts; i++) {
3420
 
+                                        s = &v->v_slice[i];
3421
 
+                                       LOG_EXTRA("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n",
3422
 
+                                               i,s->s_tag, s->s_flag, s->s_start, s->s_size, s->s_start + s->s_size -1);
3423
 
+
3424
 
+                                        if ((s->s_size == 0) || (s->s_tag == 0x05))
3425
 
+                                                continue;
3426
 
+                                       if (!probe_only) {
3427
 
+                                               rc = mbr_ebr_process_segment(
3428
 
+                                                       discover_list,
3429
 
+                                                       node,
3430
 
+                                                       (u_int64_t)(s->s_start+offset),
3431
 
+                                                       (u_int64_t)s->s_size,
3432
 
+                                                       SOLARIS_X86_PARTITION,
3433
 
+                                                       cur_comp_part_num++,
3434
 
+                                                       NULL);
3435
 
+                                               if (rc)
3436
 
+                                                       break;
3437
 
+                                       }
3438
 
+                                       count++;
3439
 
+                                }
3440
 
+                        }
3441
 
+                }
3442
 
+        }
3443
 
+        if (data)
3444
 
+                evms_cs_deallocate_memory(data);
3445
 
+       if (!rc)
3446
 
+               rc = count;
3447
 
+       LOG_DETAILS("%s: %s (%d) partitions\n", 
3448
 
+               __FUNCTION__, probe_only ? " " : "exported", rc);
3449
 
+        return rc;
3450
 
+}
3451
 
+#endif
3452
 
+
3453
 
+/*
3454
 
+ * os2lvm_partition() looks for DLAT at last sector of the track containing MBR/EBR
3455
 
+ *
3456
 
+ * Returns:     1 - os2 DLAT was found
3457
 
+ *              0 otherwise
3458
 
+ *
3459
 
+ */
3460
 
+static int
3461
 
+os2lvm_partition(
3462
 
+        u_int64_t MBR_EBR_sect,
3463
 
+        evms_logical_node_t *node,
3464
 
+        DLA_Table_Sector *dlat)
3465
 
+{
3466
 
+        struct hd_geometry geometry;
3467
 
+        int rc;
3468
 
+        u_int32_t crc_hold;
3469
 
+
3470
 
+        rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long)&geometry);
3471
 
+        if (rc) {
3472
 
+                LOG_SERIOUS("%s: ioctl failed(%u) on '%s'\n", 
3473
 
+                           __FUNCTION__, rc, node->name);
3474
 
+        } else if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat)) {
3475
 
+                if ( (dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1)) &&
3476
 
+                        (dlat->DLA_Signature2 == cpu_to_le32(DLA_TABLE_SIGNATURE2)) ) {
3477
 
+                                crc_hold = le32_to_cpu( dlat->DLA_CRC );
3478
 
+                                dlat->DLA_CRC = 0;
3479
 
+                                if ( evms_cs_calculate_crc( EVMS_INITIAL_CRC, (void *)dlat,
3480
 
+                                     node->hardsector_size ) == crc_hold )
3481
 
+                                        return 1;
3482
 
+                }
3483
 
+        }
3484
 
+        return 0;
3485
 
+}
3486
 
+
3487
 
+static int
3488
 
+mbr_ebr_process_logical_drive(
3489
 
+        evms_logical_node_t **discover_list,
3490
 
+        evms_logical_node_t *node,
3491
 
+        extended_part_t *ext_info,
3492
 
+        int i,
3493
 
+        struct partition *p,
3494
 
+        int os2lvm,
3495
 
+        DLA_Table_Sector *dlat)
3496
 
+{
3497
 
+        int rc = 0;
3498
 
+        char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
3499
 
+
3500
 
+        LOG_EXTRA("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
3501
 
+                 __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
3502
 
+
3503
 
+        if (NR_SECTS(p)) {
3504
 
+                if (is_extended_partition(p)) {
3505
 
+                        ext_info->next_ebr_start =
3506
 
+                                (u_int64_t)(START_SECT(p) + START_SECT(ext_info->extended));
3507
 
+                        ext_info->done = FALSE; /* not done yet */
3508
 
+                } else {
3509
 
+                        partition_name = NULL;
3510
 
+                        if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
3511
 
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == ( ext_info->start_sect + START_SECT(p) ) &&
3512
 
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
3513
 
+                             dlat->DLA_Array[i].Drive_Letter != '\0' ) {
3514
 
+                                sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
3515
 
+                                partition_name = tmp_buf;
3516
 
+                        }
3517
 
+                        evmsTRACE2(EVMS_INFO_EXTRA,
3518
 
+                                (print_partition_info(__FUNCTION__, p)));
3519
 
+
3520
 
+                        rc = mbr_ebr_process_segment(
3521
 
+                                discover_list,
3522
 
+                                node,
3523
 
+                                ext_info->start_sect + START_SECT(p),
3524
 
+                                NR_SECTS(p),
3525
 
+                                p->sys_ind,
3526
 
+                                cur_comp_part_num++,
3527
 
+                                partition_name);
3528
 
+                }
3529
 
+        }
3530
 
+        return(rc);
3531
 
+}
3532
 
+
3533
 
+static int
3534
 
+mbr_ebr_process_ebr(
3535
 
+        evms_logical_node_t **discover_list,
3536
 
+        evms_logical_node_t *node,
3537
 
+        extended_part_t *ext_info,
3538
 
+        mbr_ebr_t *ebr)
3539
 
+{
3540
 
+        int rc = 0, i, os2lvm;
3541
 
+        struct partition *p;
3542
 
+        DLA_Table_Sector *dlat = NULL;
3543
 
+
3544
 
+        /* allocate space for the OS2 DLAT info */
3545
 
+        rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
3546
 
+        if (!rc) {
3547
 
+                /* read the dlat for this mbr */
3548
 
+                os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat);
3549
 
+
3550
 
+                /* walk thru the partition table in the mbr
3551
 
+                 * processing each partition record.
3552
 
+                 */
3553
 
+                for (i = 0; i < 4; i++) {
3554
 
+                        p = &ebr->partitions[i];
3555
 
+                        rc = mbr_ebr_process_logical_drive(
3556
 
+                                discover_list,
3557
 
+                                node,
3558
 
+                                ext_info,
3559
 
+                                i,
3560
 
+                                p,
3561
 
+                                os2lvm,
3562
 
+                                dlat);
3563
 
+                }
3564
 
+        }
3565
 
+
3566
 
+        /* free the space used for OS2 DLAT info */
3567
 
+        if (dlat)
3568
 
+                evms_cs_deallocate_memory(dlat);
3569
 
+
3570
 
+        return(rc);
3571
 
+}
3572
 
+
3573
 
+static int
3574
 
+mbr_ebr_probe_for_ebr(
3575
 
+        evms_logical_node_t **discover_list,
3576
 
+        evms_logical_node_t *node,
3577
 
+        extended_part_t *ext_info)
3578
 
+{
3579
 
+        int rc = 0;
3580
 
+        u_char *sector_buffer = NULL;
3581
 
+        mbr_ebr_t *ebr = NULL;
3582
 
+
3583
 
+        /* allocate a sector size buffer */
3584
 
+        rc = evms_cs_allocate_memory((void **)&sector_buffer,
3585
 
+                                     node->hardsector_size);
3586
 
+        if (!rc)
3587
 
+                /* read the location of the mbr sector */
3588
 
+                rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer);
3589
 
+
3590
 
+        if (!rc) {
3591
 
+                ebr = (mbr_ebr_t *)sector_buffer;
3592
 
+                if (validate_mbr_ebr(node, ebr, 
3593
 
+                                    START_SECT(ext_info->extended),
3594
 
+                                    ext_info->start_sect) == TRUE)
3595
 
+                        rc = mbr_ebr_process_ebr(
3596
 
+                                discover_list,
3597
 
+                                node,
3598
 
+                                ext_info,
3599
 
+                                ebr);
3600
 
+        }
3601
 
+
3602
 
+        if (sector_buffer)
3603
 
+                evms_cs_deallocate_memory(sector_buffer);
3604
 
+
3605
 
+        return(rc);
3606
 
+}
3607
 
+
3608
 
+static int
3609
 
+mbr_ebr_process_extended_partition(
3610
 
+        evms_logical_node_t **discover_list,
3611
 
+        evms_logical_node_t *node,
3612
 
+        struct partition *p)
3613
 
+{
3614
 
+        int rc = 0;
3615
 
+        extended_part_t ext_info;
3616
 
+
3617
 
+        memset(&ext_info, 0, sizeof(ext_info));
3618
 
+        ext_info.done = FALSE;
3619
 
+        ext_info.extended = p;
3620
 
+        ext_info.next_ebr_start = START_SECT(p);
3621
 
+        while (ext_info.done == FALSE) {
3622
 
+                ext_info.done = TRUE; /* assume done, unless we find another EBR */
3623
 
+                ext_info.start_sect = ext_info.next_ebr_start;
3624
 
+                rc = mbr_ebr_probe_for_ebr(
3625
 
+                        discover_list,
3626
 
+                        node,
3627
 
+                        &ext_info);
3628
 
+        }
3629
 
+        return rc;
3630
 
+}
3631
 
+
3632
 
+/*
3633
 
+ * is_non_dos_extended
3634
 
+ *
3635
 
+ * This function returns TRUE if the partition entry represents a non-DOS
3636
 
+ * extended partition such as UnixWare, Solaris x86 and BSD
3637
 
+ */
3638
 
+static int
3639
 
+is_non_dos_extended(
3640
 
+        evms_logical_node_t **discover_list,
3641
 
+        evms_logical_node_t *node,
3642
 
+        struct partition *p)
3643
 
+{
3644
 
+        if (NR_SECTS(p)) {
3645
 
+               #ifdef CONFIG_BSD_DISKLABEL
3646
 
+                if (SYS_IND(p) == BSD_PARTITION ||
3647
 
+                        SYS_IND(p) == NETBSD_PARTITION ||
3648
 
+                        SYS_IND(p) == OPENBSD_PARTITION)
3649
 
+                        return TRUE;
3650
 
+                #endif
3651
 
+
3652
 
+                #ifdef CONFIG_UNIXWARE_DISKLABEL
3653
 
+                if (SYS_IND(p) == UNIXWARE_PARTITION)
3654
 
+                        return TRUE;
3655
 
+                #endif
3656
 
+
3657
 
+                #ifdef CONFIG_SOLARIS_X86_PARTITION
3658
 
+                if ( (SYS_IND(p) == SOLARIS_X86_PARTITION) &&
3659
 
+                       (solaris_x86_partition(discover_list, node, p, TRUE) > 0) )
3660
 
+                        return TRUE;
3661
 
+                #endif
3662
 
+        }
3663
 
+        return(FALSE);
3664
 
+}
3665
 
+
3666
 
+/*
3667
 
+ * mbr_ebr_process_other_primary_partition
3668
 
+ * This function processes other (non-DOS) primary partitions such as
3669
 
+ * UnixWare, Solaris x86 and BSD
3670
 
+ */
3671
 
+static int
3672
 
+mbr_ebr_process_other_primary_partition(
3673
 
+        evms_logical_node_t **discover_list,
3674
 
+        evms_logical_node_t *node,
3675
 
+        struct partition *p)
3676
 
+{
3677
 
+        if (NR_SECTS(p)) {
3678
 
+               #ifdef CONFIG_BSD_DISKLABEL
3679
 
+                if (SYS_IND(p) == BSD_PARTITION ||
3680
 
+                        SYS_IND(p) == NETBSD_PARTITION ||
3681
 
+                        SYS_IND(p) == OPENBSD_PARTITION)
3682
 
+                        return  bsd_disklabel_partition(discover_list, node, p);
3683
 
+                #endif
3684
 
+
3685
 
+                #ifdef CONFIG_UNIXWARE_DISKLABEL
3686
 
+                if (SYS_IND(p) == UNIXWARE_PARTITION)
3687
 
+                        return unixware_partition(discover_list, node, p);
3688
 
+                #endif
3689
 
+
3690
 
+                #ifdef CONFIG_SOLARIS_X86_PARTITION
3691
 
+                if (SYS_IND(p) == SOLARIS_X86_PARTITION)
3692
 
+                        return solaris_x86_partition(discover_list, node, p, FALSE);
3693
 
+                #endif
3694
 
+        }
3695
 
+        return(0);
3696
 
+}
3697
 
+
3698
 
+static int
3699
 
+mbr_ebr_process_dos_primary_partition(
3700
 
+        evms_logical_node_t **discover_list,
3701
 
+        evms_logical_node_t *node,
3702
 
+        int i,
3703
 
+        struct partition *p,
3704
 
+        int os2lvm,
3705
 
+        DLA_Table_Sector *dlat)
3706
 
+{
3707
 
+        int rc = 0;
3708
 
+        char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
3709
 
+
3710
 
+        LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start(%Lu), Size(%Lu)\n",
3711
 
+                 __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
3712
 
+
3713
 
+        if (NR_SECTS(p)) {
3714
 
+
3715
 
+                if (is_extended_partition(p))
3716
 
+                        rc = mbr_ebr_process_extended_partition(
3717
 
+                                discover_list,node,p);
3718
 
+
3719
 
+                else {
3720
 
+                        partition_name = NULL;
3721
 
+                        if ( os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
3722
 
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Start ) == START_SECT(p) &&
3723
 
+                             le32_to_cpu( dlat->DLA_Array[i].Partition_Size ) == NR_SECTS(p) &&
3724
 
+                             dlat->DLA_Array[i].Drive_Letter != '\0' ) {
3725
 
+                                sprintf( tmp_buf, "os2/%c", dlat->DLA_Array[i].Drive_Letter );
3726
 
+                                partition_name = tmp_buf;
3727
 
+                        }
3728
 
+                        evmsTRACE2(EVMS_INFO_EXTRA,
3729
 
+                                (print_partition_info(__FUNCTION__, p)));
3730
 
+
3731
 
+                        rc = mbr_ebr_process_segment(
3732
 
+                                discover_list,
3733
 
+                                node,
3734
 
+                                START_SECT(p),
3735
 
+                                NR_SECTS(p),
3736
 
+                                p->sys_ind,
3737
 
+                                i+1,
3738
 
+                                partition_name);
3739
 
+                }
3740
 
+        }
3741
 
+        return(rc);
3742
 
+}
3743
 
+
3744
 
+static int
3745
 
+mbr_ebr_process_mbr(
3746
 
+        evms_logical_node_t **discover_list,
3747
 
+        evms_logical_node_t *node,
3748
 
+        mbr_ebr_t *mbr)
3749
 
+{
3750
 
+        int rc = 0, i, os2lvm;
3751
 
+        struct partition *p;
3752
 
+        DLA_Table_Sector *dlat = NULL;
3753
 
+
3754
 
+        cur_comp_part_num = 5; /* set this value for each disk */
3755
 
+
3756
 
+        /* allocate space for the OS2 DLAT info */
3757
 
+        rc = evms_cs_allocate_memory((void **)&dlat, node->hardsector_size);
3758
 
+        if (!rc) {
3759
 
+                /* read the dlat for this mbr */
3760
 
+                os2lvm = os2lvm_partition(0, node, dlat);
3761
 
+
3762
 
+                /* Pass 1: walk thru the partition table in the mbr
3763
 
+                 * processing each partition record.
3764
 
+                 */
3765
 
+                for (i = 0; i < 4; i++) {
3766
 
+                        p = &mbr->partitions[i];
3767
 
+                       if (is_non_dos_extended(discover_list, node, p)) {
3768
 
+                               LOG_DETAILS(" Found and skip a non-dos extended partition.\n");
3769
 
+                               continue;
3770
 
+                       }
3771
 
+                               
3772
 
+                        mbr_ebr_process_dos_primary_partition(
3773
 
+                                discover_list,
3774
 
+                                node,
3775
 
+                                i,
3776
 
+                                p,
3777
 
+                                os2lvm,
3778
 
+                                dlat);
3779
 
+                }
3780
 
+
3781
 
+                /* Pass 2: walk thru the partition table in the mbr
3782
 
+                 * processing each partition record for non-DOS extended partitions
3783
 
+                 */
3784
 
+                for (i = 0; i < 4; i++) {
3785
 
+                        p = &mbr->partitions[i];
3786
 
+                        mbr_ebr_process_other_primary_partition(
3787
 
+                                discover_list,
3788
 
+                                node,
3789
 
+                                p);
3790
 
+                }
3791
 
+
3792
 
+        }
3793
 
+
3794
 
+        /* free the space used for OS2 DLAT info */
3795
 
+        if (dlat)
3796
 
+                evms_cs_deallocate_memory(dlat);
3797
 
+
3798
 
+        return(rc);
3799
 
+}
3800
 
+
3801
 
+static int
3802
 
+mbr_ebr_probe_for_mbr(
3803
 
+        evms_logical_node_t **discover_list,
3804
 
+        evms_logical_node_t *node)
3805
 
+{
3806
 
+        int rc = 0;
3807
 
+        u_char *sector_buffer = NULL;
3808
 
+        mbr_ebr_t *mbr = NULL;
3809
 
+
3810
 
+        LOG_DEBUG("%s: probing (%s).\n",
3811
 
+                 __FUNCTION__, node->name);
3812
 
+
3813
 
+        /* allocate a sector size buffer */
3814
 
+        rc = evms_cs_allocate_memory((void **)&sector_buffer,
3815
 
+                                     node->hardsector_size);
3816
 
+        if (!rc)
3817
 
+                /* read the location of the mbr sector */
3818
 
+                rc = INIT_IO(node, 0, 0, 1, sector_buffer);
3819
 
+        if (rc) {
3820
 
+                LOG_ERROR("%s: read error(%d) on '%s'.\n",
3821
 
+                         __FUNCTION__, rc, node->name);
3822
 
+        } else {
3823
 
+                mbr = (mbr_ebr_t *)sector_buffer;
3824
 
+                if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) {
3825
 
+                       /* since it looks like this disk has a
3826
 
+                        * valid MBR, remove the disk node from
3827
 
+                        * the discover list. it may already be
3828
 
+                        * on the global list, or it will be
3829
 
+                        * added to it. in the case of an mbr
3830
 
+                        * with no partitions, it is simply
3831
 
+                        * removed and forgotten. when one or
3832
 
+                        * more partitions are created, the
3833
 
+                        * disk will be examined and handled
3834
 
+                        * properly during the following
3835
 
+                        * rediscover operation.
3836
 
+                        */
3837
 
+                       evms_cs_remove_logical_node_from_list(
3838
 
+                               discover_list, node);
3839
 
+
3840
 
+                        rc = mbr_ebr_process_mbr(discover_list,node,mbr);
3841
 
+               }
3842
 
+        }
3843
 
+
3844
 
+        if (sector_buffer)
3845
 
+                evms_cs_deallocate_memory(sector_buffer);
3846
 
+
3847
 
+        return(rc);
3848
 
+}
3849
 
+
3850
 
+/*
3851
 
+ * Function: mbr_ebr_partition_discover
3852
 
+ *
3853
 
+ */
3854
 
+static int
3855
 
+mbr_ebr_partition_discover(evms_logical_node_t **discover_list)
3856
 
+{
3857
 
+        int rc = 0;
3858
 
+        evms_logical_node_t *node, *next_node;
3859
 
+
3860
 
+        LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
3861
 
+
3862
 
+        /* initialize global variable */
3863
 
+        exported_nodes = 0;
3864
 
+
3865
 
+        /* examine each node on the discover list */
3866
 
+        next_node = *discover_list;
3867
 
+        while(next_node) {
3868
 
+                node = next_node;
3869
 
+                next_node = node->next;
3870
 
+               if (node->plugin->id == plugin_header.id)
3871
 
+                       /* don't recurse into our own objects
3872
 
+                        */
3873
 
+                       continue;
3874
 
+                mbr_ebr_probe_for_mbr(discover_list,node);
3875
 
+        }
3876
 
+
3877
 
+        LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
3878
 
+                        __FUNCTION__, exported_nodes, rc);
3879
 
+        if (exported_nodes)
3880
 
+                rc = exported_nodes;
3881
 
+        return(rc);
3882
 
+}
3883
 
+
3884
 
+/*
3885
 
+ * Function: mbr_ebr_partition_delete
3886
 
+ *
3887
 
+ */
3888
 
+static int
3889
 
+mbr_ebr_partition_delete(evms_logical_node_t *segment)
3890
 
+{
3891
 
+        int rc = 0;
3892
 
+        local_instance_data_t *LID;
3893
 
+        evms_logical_node_t *empty_disk = NULL;
3894
 
+
3895
 
+        LOG_DETAILS("deleting segment '%s'.\n",segment->name);
3896
 
+
3897
 
+        if (!segment) {
3898
 
+                rc = -ENODEV;
3899
 
+        } else {
3900
 
+                LID = segment->instance_data;
3901
 
+                if (LID) {
3902
 
+                        /* remove the segment from the
3903
 
+                         * disk's segment list
3904
 
+                         */
3905
 
+                        rc = remove_segment_from_disk(
3906
 
+                                LID->source_disk,
3907
 
+                                segment,
3908
 
+                                &empty_disk);
3909
 
+                        /* free the local instance data */
3910
 
+                        evms_cs_deallocate_memory(LID);
3911
 
+                }
3912
 
+                /* free the segment node */
3913
 
+                evms_cs_deallocate_logical_node(segment);
3914
 
+                MOD_DEC_USE_COUNT;
3915
 
+                /* if the last segment on the disk was
3916
 
+                 * deleted, delete the disk node too
3917
 
+                 */
3918
 
+                if (empty_disk)
3919
 
+                        DELETE(empty_disk);
3920
 
+        }
3921
 
+        return(rc);
3922
 
+}
3923
 
+
3924
 
+/*
3925
 
+ * function: mbr_ebr_partition_io_error
3926
 
+ *
3927
 
+ * this function was primarily created because the function
3928
 
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
3929
 
+ * to be set on inline functions. Since this was an error path
3930
 
+ * and not mainline, I decided to add a trace statement to help
3931
 
+ * report on the failing condition.
3932
 
+ *
3933
 
+ */
3934
 
+static void
3935
 
+mbr_ebr_partition_io_error(
3936
 
+        evms_logical_node_t *node,
3937
 
+        int io_flag,
3938
 
+        eio_t *eio)
3939
 
+{
3940
 
+        LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
3941
 
+                (io_flag) ? "WRITE" : "READ",
3942
 
+                node->total_vsectors - 1,
3943
 
+                node->name,
3944
 
+                eio->rsector);
3945
 
+
3946
 
+        EVMS_IO_ERROR(eio);
3947
 
+}
3948
 
+
3949
 
+/*
3950
 
+ * Function: mbr_ebr_partition_read
3951
 
+ *
3952
 
+ */
3953
 
+static void
3954
 
+mbr_ebr_partition_read(
3955
 
+        evms_logical_node_t *partition,
3956
 
+        eio_t *eio)
3957
 
+{
3958
 
+        local_instance_data_t *LID = partition->instance_data;
3959
 
+
3960
 
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
3961
 
+                eio->rsector += LID->start_sect;
3962
 
+                R_IO(LID->source_disk, eio);
3963
 
+        } else
3964
 
+                mbr_ebr_partition_io_error(partition, READ, eio);
3965
 
+}
3966
 
+
3967
 
+/*
3968
 
+ * Function: mbr_ebr_partition_write
3969
 
+ *
3970
 
+ */
3971
 
+static void
3972
 
+mbr_ebr_partition_write(
3973
 
+        evms_logical_node_t *partition,
3974
 
+        eio_t *eio)
3975
 
+{
3976
 
+        local_instance_data_t *LID = partition->instance_data;
3977
 
+
3978
 
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
3979
 
+                eio->rsector += LID->start_sect;
3980
 
+                W_IO(LID->source_disk, eio);
3981
 
+        } else
3982
 
+                mbr_ebr_partition_io_error(partition, WRITE, eio);
3983
 
+}
3984
 
+
3985
 
+/*
3986
 
+ * Function: mbr_ebr_partition_init_io
3987
 
+ *
3988
 
+ */
3989
 
+static int
3990
 
+mbr_ebr_partition_init_io(
3991
 
+        evms_logical_node_t *partition,
3992
 
+        int                  io_flag,        /* 0=read, 1=write*/
3993
 
+        evms_sector_t        sect_nr,        /* disk LBA */
3994
 
+        evms_sector_t        num_sects,      /* # of sectors */
3995
 
+        void                *buf_addr)       /* buffer address */
3996
 
+{
3997
 
+        int rc;
3998
 
+        local_instance_data_t *LID = partition->instance_data;
3999
 
+
4000
 
+        if ((sect_nr + num_sects) <= partition->total_vsectors) {
4001
 
+                rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
4002
 
+        } else {
4003
 
+                LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
4004
 
+                        (io_flag) ? "WRITE" : "READ",
4005
 
+                       partition->name,
4006
 
+                        (LID->nr_sects - 1),
4007
 
+                        sect_nr, num_sects);
4008
 
+                rc = -EINVAL;
4009
 
+        }
4010
 
+
4011
 
+        return(rc);
4012
 
+}
4013
 
+
4014
 
+/*
4015
 
+ * Function: mbr_ebr_partition_ioctl
4016
 
+ *
4017
 
+ */
4018
 
+static int
4019
 
+mbr_ebr_partition_ioctl (
4020
 
+        evms_logical_node_t *partition,
4021
 
+        struct inode        *inode,
4022
 
+        struct file         *file,
4023
 
+        unsigned int         cmd,
4024
 
+        unsigned long        arg)
4025
 
+{
4026
 
+        local_instance_data_t *LID;
4027
 
+        struct hd_geometry hd_geo;
4028
 
+        int rc;
4029
 
+
4030
 
+        rc = 0;
4031
 
+        LID = partition->instance_data;
4032
 
+        if (!inode)
4033
 
+                return -EINVAL;
4034
 
+        switch (cmd) {
4035
 
+                case HDIO_GETGEO:
4036
 
+                {
4037
 
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
4038
 
+                        if (rc) break;
4039
 
+                        if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
4040
 
+                                rc = -EFAULT;
4041
 
+                        if (rc) break;
4042
 
+                        hd_geo.start = LID->start_sect;
4043
 
+                        if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
4044
 
+                                rc = -EFAULT;
4045
 
+                }
4046
 
+                break;
4047
 
+               case EVMS_GET_BMAP:
4048
 
+                       {
4049
 
+                               evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
4050
 
+                               bmap->rsector += LID->start_sect;
4051
 
+                               /* intentionally fall thru to
4052
 
+                                * default ioctl down to device
4053
 
+                                * manager.
4054
 
+                                */
4055
 
+                       }
4056
 
+                default:
4057
 
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
4058
 
+        }
4059
 
+        return rc;
4060
 
+}
4061
 
+
4062
 
+/*
4063
 
+ * Function: dos_part_init
4064
 
+ *
4065
 
+ */
4066
 
+static int __init
4067
 
+dos_part_init(void)
4068
 
+{
4069
 
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
4070
 
+}
4071
 
+
4072
 
+static void __exit
4073
 
+dos_part_exit(void)
4074
 
+{
4075
 
+        evms_cs_unregister_plugin(&plugin_header);
4076
 
+}
4077
 
+
4078
 
+module_init(dos_part_init);
4079
 
+module_exit(dos_part_exit);
4080
 
+#ifdef MODULE_LICENSE
4081
 
+MODULE_LICENSE("GPL");
4082
 
+#endif
4083
 
+
4084
 
diff -Naur linux-2002-03-28/drivers/evms/evms.c evms-2002-03-28/drivers/evms/evms.c
4085
 
--- linux-2002-03-28/drivers/evms/evms.c        Wed Dec 31 18:00:00 1969
4086
 
+++ evms-2002-03-28/drivers/evms/evms.c Thu Mar 28 15:43:00 2002
4087
 
@@ -0,0 +1,5153 @@
4088
 
+/* -*- linux-c -*- */
4089
 
+/*
4090
 
+ *
4091
 
+ *
4092
 
+ *   Copyright (c) International Business Machines  Corp., 2000
4093
 
+ *
4094
 
+ *   This program is free software;  you can redistribute it and/or modify
4095
 
+ *   it under the terms of the GNU General Public License as published by
4096
 
+ *   the Free Software Foundation; either version 2 of the License, or
4097
 
+ *   (at your option) any later version.
4098
 
+ *
4099
 
+ *   This program is distributed in the hope that it will be useful,
4100
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
4101
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
4102
 
+ *   the GNU General Public License for more details.
4103
 
+ *
4104
 
+ *   You should have received a copy of the GNU General Public License
4105
 
+ *   along with this program;  if not, write to the Free Software
4106
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
4107
 
+ *
4108
 
+ *
4109
 
+ */
4110
 
+/*
4111
 
+ *
4112
 
+ * linux/drivers/evms/evms.c
4113
 
+ *
4114
 
+ * EVMS Base and Common Services
4115
 
+ *
4116
 
+ */
4117
 
+
4118
 
+#define DEVICE_NR(device) MINOR(device)         /* evms has no partition bits */
4119
 
+#define DEVICE_NAME "evms"                      /* name for messaging */
4120
 
+#define DEVICE_NO_RANDOM                        /* no entropy to contribute */
4121
 
+#define DEVICE_OFF(d)                           /* do nothing */
4122
 
+
4123
 
+#include <linux/config.h>
4124
 
+#include <linux/module.h>
4125
 
+#include <linux/errno.h>
4126
 
+#include <linux/kernel.h>
4127
 
+#include <linux/init.h>
4128
 
+#include <linux/fs.h>
4129
 
+#include <linux/major.h>
4130
 
+#include <linux/slab.h>
4131
 
+#include <asm/uaccess.h>
4132
 
+#include <linux/blk.h>      /* must be included by all block drivers */
4133
 
+#include <linux/blkdev.h>
4134
 
+#include <linux/blkpg.h>
4135
 
+#include <linux/iobuf.h>
4136
 
+#include <linux/genhd.h>
4137
 
+#include <linux/major.h>
4138
 
+#include <linux/sched.h>
4139
 
+#include <linux/version.h>
4140
 
+#include <linux/swap.h>
4141
 
+#include <net/checksum.h>
4142
 
+#include <linux/sysctl.h>
4143
 
+#include <linux/smp_lock.h>
4144
 
+#include <linux/evms/evms_kernel.h>
4145
 
+
4146
 
+//#define VFS_PATCH_PRESENT
4147
 
+
4148
 
+/* prefix used in logging messages */
4149
 
+#define LOG_PREFIX
4150
 
+
4151
 
+typedef struct evms_registered_plugin_s {
4152
 
+        evms_plugin_header_t            * plugin;
4153
 
+        struct evms_registered_plugin_s * next;
4154
 
+} evms_registered_plugin_t;
4155
 
+static evms_registered_plugin_t * registered_plugin_head = NULL;
4156
 
+
4157
 
+static evms_list_node_t *evms_global_device_list = NULL;
4158
 
+static evms_list_node_t *evms_global_feature_node_list = NULL;
4159
 
+static evms_list_node_t *evms_global_notify_list = NULL;
4160
 
+
4161
 
+int                               evms_info_level = EVMS_INFO_LEVEL;
4162
 
+struct proc_dir_entry           *evms_proc_dir = NULL;
4163
 
+EXPORT_SYMBOL(evms_info_level);
4164
 
+static evms_logical_volume_t    * evms_logical_volumes;
4165
 
+static int                        evms_volumes = 0;
4166
 
+/* a few variables to aid in detecting memory leaks.
4167
 
+ * these variables are always in use, regardless of
4168
 
+ * the state of EVMS_MEM_DEBUG.
4169
 
+ */
4170
 
+static atomic_t                   evms_allocs;
4171
 
+static atomic_t                   evms_logical_nodes;
4172
 
+
4173
 
+char *evms_primary_string = "primary";
4174
 
+EXPORT_SYMBOL(evms_primary_string);
4175
 
+char *evms_secondary_string = "secondary";
4176
 
+EXPORT_SYMBOL(evms_secondary_string);
4177
 
+
4178
 
+static evms_version_t evms_svc_version = {
4179
 
+        major      : EVMS_COMMON_SERVICES_MAJOR,
4180
 
+        minor      : EVMS_COMMON_SERVICES_MINOR,
4181
 
+        patchlevel : EVMS_COMMON_SERVICES_PATCHLEVEL
4182
 
+};
4183
 
+
4184
 
+static int evms_discover_volumes(evms_rediscover_t *);
4185
 
+
4186
 
+/* Handles for "private" EVMS object pools */
4187
 
+static evms_pool_mgmt_t *evms_io_notify_pool;
4188
 
+
4189
 
+/* Handles for "public" EVMS object pools */
4190
 
+evms_pool_mgmt_t *evms_bh_pool;
4191
 
+EXPORT_SYMBOL(evms_bh_pool);
4192
 
+
4193
 
+/* Handle for the devfs directory entry */
4194
 
+devfs_handle_t evms_dir_devfs_handle;
4195
 
+devfs_handle_t evms_blk_devfs_handle;
4196
 
+
4197
 
+
4198
 
+/**********************************************************/
4199
 
+/* SYSCTL - EVMS folder                                          */
4200
 
+/**********************************************************/
4201
 
+
4202
 
+#ifdef CONFIG_PROC_FS
4203
 
+static struct ctl_table_header *evms_table_header;
4204
 
+static int evms_info_level_min = EVMS_INFO_CRITICAL;
4205
 
+static int evms_info_level_max = EVMS_INFO_EVERYTHING;
4206
 
+
4207
 
+static ctl_table evms_table[] = {
4208
 
+       {DEV_EVMS_INFO_LEVEL, "evms_info_level",
4209
 
+        &evms_info_level, sizeof(int), 0644, NULL, 
4210
 
+        &proc_dointvec_minmax, &sysctl_intvec,
4211
 
+        NULL, &evms_info_level_min, &evms_info_level_max},
4212
 
+       {0}
4213
 
+};
4214
 
+
4215
 
+static ctl_table evms_dir_table[] = {
4216
 
+       {DEV_EVMS, "evms", NULL, 0, 0555, evms_table},
4217
 
+       {0}
4218
 
+};
4219
 
+
4220
 
+static ctl_table dev_dir_table[] = {
4221
 
+       {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
4222
 
+       {0}
4223
 
+};
4224
 
+#endif
4225
 
+
4226
 
+/**********************************************************/
4227
 
+/* START -- exported functions/Common Services            */
4228
 
+/**********************************************************/
4229
 
+
4230
 
+/*
4231
 
+ * Function:     evms_cs_get_version
4232
 
+ * Description: This function returns the current EVMS version
4233
 
+ */
4234
 
+void 
4235
 
+evms_cs_get_version(int * major, int *minor)
4236
 
+{
4237
 
+        *major = EVMS_MAJOR_VERSION;
4238
 
+        *minor = EVMS_MINOR_VERSION;
4239
 
+}
4240
 
+EXPORT_SYMBOL(evms_cs_get_version);
4241
 
+
4242
 
+int 
4243
 
+evms_cs_check_version(
4244
 
+       evms_version_t *required, 
4245
 
+       evms_version_t *actual)
4246
 
+{
4247
 
+        int rc = 0;
4248
 
+
4249
 
+        if (required->major != actual->major)
4250
 
+                rc = -EINVAL;
4251
 
+        else if (required->minor > actual->minor)
4252
 
+                rc = -EINVAL;
4253
 
+        else if (required->minor == actual->minor)
4254
 
+                if (required->patchlevel > actual->patchlevel)
4255
 
+                        rc = -EINVAL;
4256
 
+        return(rc);
4257
 
+}
4258
 
+EXPORT_SYMBOL(evms_cs_check_version);
4259
 
+
4260
 
+#ifdef EVMS_MEM_DEBUG
4261
 
+#define EVMS_MEM_SSIGNATURE 0x4D444D63   //SMEM
4262
 
+typedef struct memobj_head_s {
4263
 
+        unsigned int            ssignature;
4264
 
+        struct memobj_head_s    *next;
4265
 
+        int                     size;
4266
 
+        struct memobj_tail_s    *tail;
4267
 
+} memobj_head_t;
4268
 
+#define EVMS_MEM_ESIGNATURE 0x4D444D44   //EMEM
4269
 
+typedef struct memobj_tail_s {
4270
 
+        unsigned int            esignature;
4271
 
+        memobj_head_t           *head;
4272
 
+} memobj_tail_t;
4273
 
+
4274
 
+static memobj_head_t *memobj_head = NULL;
4275
 
+static spinlock_t mem_debug_lock = SPIN_LOCK_UNLOCKED;
4276
 
+
4277
 
+/* 
4278
 
+ * function description: evms_cs_verify_memory_integrity
4279
 
+ *   Verifies:
4280
 
+ *      the count of memory objects in the list
4281
 
+ *      the starting signature (SSIGNATURE) hasn't been overwritten
4282
 
+ *      the ending signature (ESIGNATURE) hasn't been overwritten
4283
 
+ *
4284
 
+ *   op_flag: controls the behaviour when a problem is found
4285
 
+ *      0  = stop immediately where a problem is found
4286
 
+ *      !0 = don't stop, but report problem(s) exist, via return code
4287
 
+ */
4288
 
+int 
4289
 
+evms_cs_verify_memory_integrity(int op_flag)
4290
 
+{
4291
 
+        int rc = 0, objcount;
4292
 
+        memobj_head_t *mobj, **ppmobj;
4293
 
+        memobj_tail_t *mobjtail;
4294
 
+
4295
 
+        /* verify each object in the linked list */
4296
 
+        objcount = 0;
4297
 
+       spin_lock(&mem_debug_lock);
4298
 
+        ppmobj = &memobj_head;
4299
 
+        while(*ppmobj) {
4300
 
+                objcount++;
4301
 
+                mobj = *ppmobj;
4302
 
+                /* verify starting signature */
4303
 
+                if (mobj->ssignature != EVMS_MEM_SSIGNATURE) {
4304
 
+                        if (op_flag == 0)
4305
 
+                                BUG();
4306
 
+                        else
4307
 
+                                rc++;
4308
 
+                }
4309
 
+                /* verify ending signature */
4310
 
+                mobjtail = mobj->tail;
4311
 
+                if (mobjtail->esignature != EVMS_MEM_ESIGNATURE) {
4312
 
+                        if (op_flag == 0)
4313
 
+                                BUG();
4314
 
+                        else
4315
 
+                                rc++;
4316
 
+                }
4317
 
+                ppmobj = &(*ppmobj)->next;
4318
 
+        }
4319
 
+       spin_unlock(&mem_debug_lock);
4320
 
+        /* verify object count */
4321
 
+        if (objcount != evms_allocs) {
4322
 
+                if (op_flag == 0)
4323
 
+                        BUG();
4324
 
+                else
4325
 
+                        rc++;
4326
 
+        }
4327
 
+        return(rc);
4328
 
+}
4329
 
+EXPORT_SYMBOL(evms_cs_verify_memory_integrity);
4330
 
+#endif 
4331
 
+
4332
 
+/*
4333
 
+ * function: evms_cs_allocate_memory
4334
 
+ *
4335
 
+ * This function is a wrapper function for the kernel malloc
4336
 
+ * (kmalloc) function. It provides a consistent method of
4337
 
+ * allocating kernel memory for all evms code.
4338
 
+ *
4339
 
+ *
4340
 
+ * This function takes as arguments:
4341
 
+ *
4342
 
+ *  **pp: the address of the pointer which is to contain the
4343
 
+ *        the address of the allocated memory object.
4344
 
+ *  size: the size in bytes of the memory object to be
4345
 
+ *        allocated.
4346
 
+ *
4347
 
+ *
4348
 
+ * This function returns:
4349
 
+ *
4350
 
+ *  *pp = NULL, and return set to -ENOMEM when there is 
4351
 
+ *     insufficient memory to satisfy the request.
4352
 
+ *
4353
 
+ * OR
4354
 
+ *
4355
 
+ *  *pp = NULL, and return set to 0 when the specified 
4356
 
+ *     size is invalid.
4357
 
+ *
4358
 
+ * OR
4359
 
+ *
4360
 
+ * *pp is set to the address of the allocated memory object
4361
 
+ *     and return code is set to 0.
4362
 
+ *
4363
 
+ *
4364
 
+ * NOTE: Defining EVMS_MEM_DEBUG turns on memory integrity
4365
 
+ *       checking. This wraps each memory object with a
4366
 
+ *       header and trailer. The header and trailer contain
4367
 
+ *      signatures and sizes that are used to verify that
4368
 
+ *      existing memory objects have not been overwritten.
4369
 
+ *      Refer to the evms_cs_verify_memory_integrity 
4370
 
+ *      function for more details.
4371
 
+ */
4372
 
+int 
4373
 
+evms_cs_allocate_memory(void **pp, int size)
4374
 
+{
4375
 
+        int rc = 0;
4376
 
+
4377
 
+#ifdef EVMS_MEM_DEBUG
4378
 
+        memobj_head_t *mobj, **ppmobj;
4379
 
+        memobj_tail_t *mobjtail;
4380
 
+#endif
4381
 
+       /* verify a valid size parameter was specified */
4382
 
+       if (size <= 0)
4383
 
+               /* return NULL on invalid size */
4384
 
+               *pp = NULL;
4385
 
+       else {
4386
 
+#ifdef EVMS_MEM_DEBUG
4387
 
+               size += sizeof(memobj_head_t) + sizeof(memobj_tail_t);
4388
 
+#endif
4389
 
+//             *pp = kmalloc(size, GFP_KERNEL);
4390
 
+               *pp = kmalloc(size, GFP_NOIO);
4391
 
+               if (*pp == NULL)
4392
 
+                       rc = -ENOMEM;
4393
 
+               else {
4394
 
+#ifdef EVMS_MEM_DEBUG
4395
 
+                       /* adjust variables to caller values */
4396
 
+                       mobj = (memobj_head_t *)*pp;
4397
 
+                       *pp += sizeof(memobj_head_t);
4398
 
+                       size -= sizeof(memobj_head_t) + sizeof(memobj_tail_t);
4399
 
+
4400
 
+                       /* setup memobj head */
4401
 
+                       mobj->ssignature = EVMS_MEM_SSIGNATURE;
4402
 
+                       mobj->size = size;
4403
 
+                
4404
 
+                       /* setup memobj tail */
4405
 
+                       mobjtail = (memobj_tail_t *)(*pp + size);
4406
 
+                       mobjtail->esignature = EVMS_MEM_ESIGNATURE;
4407
 
+                       mobj->tail = mobjtail;
4408
 
+                       mobjtail->head = mobj;
4409
 
+
4410
 
+                       /* add mobj to linked list */
4411
 
+
4412
 
+                       spin_lock(&mem_debug_lock);
4413
 
+                       ppmobj = &memobj_head;
4414
 
+                       while(*ppmobj > mobj)
4415
 
+                               ppmobj = &(*ppmobj)->next;
4416
 
+                       mobj->next = *ppmobj;
4417
 
+                       *ppmobj = mobj;
4418
 
+                       spin_unlock(&mem_debug_lock);
4419
 
+#endif
4420
 
+                       memset(*pp, 0, size);
4421
 
+                       atomic_inc(&evms_allocs);
4422
 
+               }
4423
 
+       }
4424
 
+
4425
 
+#ifdef EVMS_MEM_DEBUG
4426
 
+        evms_cs_verify_memory_integrity(0);
4427
 
+#endif
4428
 
+        return(rc);
4429
 
+}
4430
 
+EXPORT_SYMBOL(evms_cs_allocate_memory);
4431
 
+
4432
 
+int 
4433
 
+evms_cs_deallocate_memory(void *p)
4434
 
+{
4435
 
+#ifdef EVMS_MEM_DEBUG
4436
 
+        memobj_head_t *mobj, **ppmobj;
4437
 
+
4438
 
+        evms_cs_verify_memory_integrity(0);
4439
 
+
4440
 
+        /* init ptr to memobj structure */
4441
 
+        mobj = (memobj_head_t *)(p - sizeof(memobj_head_t));
4442
 
+
4443
 
+        /* find mobj in linked list */
4444
 
+       spin_lock(&mem_debug_lock);
4445
 
+        ppmobj = &memobj_head;
4446
 
+       while(*ppmobj != mobj)
4447
 
+               ppmobj = &(*ppmobj)->next;
4448
 
+       *ppmobj = mobj->next;
4449
 
+       spin_unlock(&mem_debug_lock);
4450
 
+#endif
4451
 
+        kfree(p);
4452
 
+        atomic_dec(&evms_allocs);
4453
 
+        return(0);
4454
 
+}
4455
 
+EXPORT_SYMBOL(evms_cs_deallocate_memory);
4456
 
+
4457
 
+int 
4458
 
+evms_cs_allocate_logical_node(evms_logical_node_t **pp)
4459
 
+{
4460
 
+        int rc;
4461
 
+
4462
 
+        rc = evms_cs_allocate_memory((void **)pp, sizeof(evms_logical_node_t));
4463
 
+        if (!rc)
4464
 
+               atomic_inc(&evms_logical_nodes);
4465
 
+        return(rc);
4466
 
+}
4467
 
+EXPORT_SYMBOL(evms_cs_allocate_logical_node);
4468
 
+
4469
 
+void
4470
 
+evms_cs_deallocate_volume_info(evms_logical_node_t *p)
4471
 
+{
4472
 
+        if (p->iflags & EVMS_FEATURE_BOTTOM) {
4473
 
+               evms_cs_remove_item_from_list(
4474
 
+                       &evms_global_feature_node_list, p);
4475
 
+               evms_cs_deallocate_memory(p->volume_info);
4476
 
+               p->volume_info = NULL;
4477
 
+               p->iflags &= ~EVMS_FEATURE_BOTTOM;
4478
 
+       }
4479
 
+}
4480
 
+EXPORT_SYMBOL(evms_cs_deallocate_volume_info);
4481
 
+
4482
 
+int 
4483
 
+evms_cs_deallocate_logical_node(evms_logical_node_t *p)
4484
 
+{
4485
 
+        if (p->next) {
4486
 
+                LOG_SERIOUS("Deallocating object whose NEXT ptr is not null!!\n");
4487
 
+        }
4488
 
+       evms_cs_deallocate_volume_info(p);
4489
 
+       if (p->feature_header) {
4490
 
+               evms_cs_deallocate_memory(p->feature_header);
4491
 
+               p->feature_header = NULL;
4492
 
+       }
4493
 
+        evms_cs_deallocate_memory(p);
4494
 
+        atomic_dec(&evms_logical_nodes);
4495
 
+        return(0);
4496
 
+}
4497
 
+EXPORT_SYMBOL(evms_cs_deallocate_logical_node);
4498
 
+
4499
 
+/*
4500
 
+ * Function:     evms_cs_register_plugin
4501
 
+ * Description: This function is exported so that all plugins can register with EVMS
4502
 
+ */
4503
 
+int 
4504
 
+evms_cs_register_plugin(evms_plugin_header_t * plugin)
4505
 
+{
4506
 
+        int rc = 0;
4507
 
+        evms_registered_plugin_t    *reg_record, **pp;
4508
 
+        evms_version_t *ver;
4509
 
+
4510
 
+        ver = &plugin->required_common_services_version;
4511
 
+
4512
 
+       LOG_EXTRA("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
4513
 
+                  GetPluginOEM(plugin->id),
4514
 
+                  GetPluginType(plugin->id),
4515
 
+                  GetPluginID(plugin->id),
4516
 
+                  plugin->version.major,
4517
 
+                  plugin->version.minor,
4518
 
+                  plugin->version.patchlevel,
4519
 
+                  ver->major,
4520
 
+                  ver->minor,
4521
 
+                  ver->patchlevel);
4522
 
+
4523
 
+        /* check common services requirements */
4524
 
+        rc = evms_cs_check_version(ver, &evms_svc_version);
4525
 
+        if (rc) {
4526
 
+                LOG_SERIOUS("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n",
4527
 
+                           EVMS_COMMON_SERVICES_MAJOR,
4528
 
+                           EVMS_COMMON_SERVICES_MINOR,
4529
 
+                           EVMS_COMMON_SERVICES_PATCHLEVEL);
4530
 
+        }
4531
 
+       if (!rc) {
4532
 
+               /* ensure a plugin with this feature id is
4533
 
+                * not already loaded.
4534
 
+                */
4535
 
+                for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
4536
 
+                       if ((*pp)->plugin->id == plugin->id) {
4537
 
+                               rc = -EBUSY;
4538
 
+                               LOG_ERROR("error(%d) attempting to load another plugin with id(%x).\n",
4539
 
+                                         rc, plugin->id);
4540
 
+                       }
4541
 
+               }
4542
 
+       }
4543
 
+       if (!rc) {
4544
 
+               /* ensure the plugin has provided functions for
4545
 
+                * the mandatory entry points.
4546
 
+                */
4547
 
+               if (!plugin->function_table->discover) {
4548
 
+                       rc = -EINVAL;
4549
 
+               } else if (!plugin->function_table->init_io) {
4550
 
+                       rc = -EINVAL;
4551
 
+               } else if (!plugin->function_table->ioctl) {
4552
 
+                       rc = -EINVAL;
4553
 
+               } else if (!plugin->function_table->read) {
4554
 
+                       rc = -EINVAL;
4555
 
+               } else if (!plugin->function_table->write) {
4556
 
+                       rc = -EINVAL;
4557
 
+               } else if (!plugin->function_table->delete) {
4558
 
+                       rc = -EINVAL;
4559
 
+               }
4560
 
+       }
4561
 
+        if (!rc) {
4562
 
+                /* allocate a new plugin registration record */
4563
 
+                rc = evms_cs_allocate_memory((void **)&reg_record, 
4564
 
+                                          sizeof(evms_registered_plugin_t));
4565
 
+        }
4566
 
+        if (!rc) {
4567
 
+                /* store ptr to plugin header in new registration record */
4568
 
+                reg_record->plugin = plugin;
4569
 
+
4570
 
+                /* terminate the record */
4571
 
+                reg_record->next = NULL;
4572
 
+
4573
 
+                /* find end of the plugin registration list */
4574
 
+                for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next);
4575
 
+                /* add registration record to list */
4576
 
+                *pp = reg_record;
4577
 
+
4578
 
+               /* increment the usage count */
4579
 
+               MOD_INC_USE_COUNT;
4580
 
+        }
4581
 
+        
4582
 
+        return(rc);
4583
 
+}
4584
 
+EXPORT_SYMBOL(evms_cs_register_plugin);
4585
 
+
4586
 
+/*
4587
 
+ * Function:     evms_cs_unregister_plugin
4588
 
+ * Description: This function is exported so that all plugins can 
4589
 
+ * unregister with EVMS
4590
 
+ */
4591
 
+int 
4592
 
+evms_cs_unregister_plugin(evms_plugin_header_t * plugin)
4593
 
+{
4594
 
+        int rc = 0, found = FALSE;
4595
 
+        evms_registered_plugin_t **pp;
4596
 
+        evms_version_t *ver;
4597
 
+
4598
 
+        ver = &plugin->required_common_services_version;
4599
 
+
4600
 
+       LOG_EXTRA("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
4601
 
+                  GetPluginOEM(plugin->id),
4602
 
+                  GetPluginType(plugin->id),
4603
 
+                  GetPluginID(plugin->id),
4604
 
+                  plugin->version.major,
4605
 
+                  plugin->version.minor,
4606
 
+                  plugin->version.patchlevel,
4607
 
+                  ver->major,
4608
 
+                  ver->minor,
4609
 
+                  ver->patchlevel);
4610
 
+       /* ensure a plugin with this feature id is
4611
 
+        * currently loaded.
4612
 
+        */
4613
 
+        for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
4614
 
+               if ((*pp)->plugin->id == plugin->id) {
4615
 
+                       found = TRUE;
4616
 
+                       break;
4617
 
+               }
4618
 
+       }
4619
 
+       if (!found) {
4620
 
+               rc = -ENOPKG;
4621
 
+               LOG_ERROR("error(%d) attempt to unload a non-loaded plugin with id(%x).\n",
4622
 
+                         rc, plugin->id);
4623
 
+       }
4624
 
+       /* actually unload the plugin now */
4625
 
+       if (!rc) {
4626
 
+               evms_registered_plugin_t * tmp = *pp;
4627
 
+
4628
 
+               /* remove the plugin record from our 
4629
 
+                * internal plugin list
4630
 
+                */
4631
 
+               *pp = (*pp)->next;
4632
 
+                /* deallocate the plugin registration record
4633
 
+                */
4634
 
+                evms_cs_deallocate_memory(tmp);
4635
 
+
4636
 
+               /* decrement the usage count */
4637
 
+               MOD_DEC_USE_COUNT;
4638
 
+        }
4639
 
+        return(rc);
4640
 
+}
4641
 
+EXPORT_SYMBOL(evms_cs_unregister_plugin);
4642
 
+
4643
 
+/* function: evms_cs_add_logical_node_to_list
4644
 
+ *
4645
 
+ * This functions adds a new logical node to the end of a
4646
 
+ * node list.
4647
 
+ * 
4648
 
+ * NOTE: This function is only expected to be called at
4649
 
+ * discovery time, which is singled threaded by nature,
4650
 
+ * and therefore doesn't need to be made SMP safe.
4651
 
+ */
4652
 
+int 
4653
 
+evms_cs_add_logical_node_to_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
4654
 
+{
4655
 
+        int rc = 0;
4656
 
+        evms_logical_node_t **pp = NULL;
4657
 
+
4658
 
+        /* check to make sure node is not already on a list */
4659
 
+        if (node->next)
4660
 
+                rc = 1;
4661
 
+        else
4662
 
+                /* check to make sure node being added is not already in the list */
4663
 
+                for (pp = list_head; *pp; pp = &(*pp)->next)
4664
 
+                        if (*pp == node) {
4665
 
+                                rc = 2;
4666
 
+                               break;
4667
 
+                       }
4668
 
+
4669
 
+        /* add node to the end of the list */
4670
 
+        if (!rc)
4671
 
+                *pp = node;
4672
 
+
4673
 
+        return(rc);
4674
 
+}
4675
 
+EXPORT_SYMBOL(evms_cs_add_logical_node_to_list);
4676
 
+
4677
 
+/* function: evms_cs_remove_logical_node_from_list
4678
 
+ *
4679
 
+ * This functions removes a new logical node from a node list.
4680
 
+ * 
4681
 
+ * NOTE: This function is only expected to be called at
4682
 
+ * discovery time, which is singled threaded by nature,
4683
 
+ * and therefore doesn't need to be made SMP safe.
4684
 
+ */
4685
 
+int 
4686
 
+evms_cs_remove_logical_node_from_list(evms_logical_node_t ** list_head, evms_logical_node_t * node)
4687
 
+{
4688
 
+        /* remove this node from the head of the list */
4689
 
+        int rc = 1; /* assume failure until target node is found */
4690
 
+        evms_logical_node_t **pp;
4691
 
+        for (pp = list_head; *pp; pp = &(*pp)->next)
4692
 
+                if (*pp == node) {
4693
 
+                        *pp = (*pp)->next;
4694
 
+                        node->next = NULL;
4695
 
+                        rc = 0;
4696
 
+                        break;
4697
 
+                }
4698
 
+        return(rc);
4699
 
+}
4700
 
+EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list);
4701
 
+
4702
 
+int 
4703
 
+evms_cs_kernel_ioctl(evms_logical_node_t *node, unsigned int cmd, unsigned long arg)
4704
 
+{
4705
 
+        int rc = 0;
4706
 
+        struct inode tmp_inode;
4707
 
+        mm_segment_t fs;
4708
 
+
4709
 
+       lock_kernel();
4710
 
+        fs = get_fs();
4711
 
+        set_fs(get_ds());
4712
 
+        rc = IOCTL(node, &tmp_inode, NULL, cmd, arg);
4713
 
+        set_fs(fs);
4714
 
+       unlock_kernel();
4715
 
+
4716
 
+        return(rc);
4717
 
+
4718
 
+}
4719
 
+EXPORT_SYMBOL(evms_cs_kernel_ioctl);
4720
 
+
4721
 
+/*
4722
 
+ * function: evms_cs_size_in_vsectors
4723
 
+ *
4724
 
+ * In EVMS a V(irtual)Sector is 512 bytes in size.
4725
 
+ * This function computes the number of VSECTORs an specified
4726
 
+ * item size would require.
4727
 
+ *
4728
 
+ * NOTE: This function has been coded to work with 64 bit values.
4729
 
+ */
4730
 
+unsigned long 
4731
 
+evms_cs_size_in_vsectors(long long item_size)
4732
 
+{
4733
 
+        long long sectors;
4734
 
+
4735
 
+        sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT;
4736
 
+        if (item_size & (EVMS_VSECTOR_SIZE - 1))
4737
 
+                sectors++;
4738
 
+        
4739
 
+        return(sectors);
4740
 
+}
4741
 
+EXPORT_SYMBOL(evms_cs_size_in_vsectors);
4742
 
+
4743
 
+/*
4744
 
+ * function: evms_cs_log2
4745
 
+ *
4746
 
+ * this function computes the power of the 2 of specified
4747
 
+ * value. If the value is 0, a -1 is returned. If the value
4748
 
+ * is NOT a power of 2, a -2 is return. Otherwise the power
4749
 
+ * of 2 is returned.
4750
 
+ */
4751
 
+int evms_cs_log2(long long value)
4752
 
+{
4753
 
+       int result = -1;
4754
 
+       long long tmp;
4755
 
+
4756
 
+       if (value) {
4757
 
+               tmp = value;
4758
 
+               result++;
4759
 
+               while(!(tmp & 1)) {
4760
 
+                       result++;
4761
 
+                       tmp >>= 1;
4762
 
+               }
4763
 
+               if (tmp != 1) {
4764
 
+                       result = -2;
4765
 
+               }
4766
 
+       }
4767
 
+       return(result);
4768
 
+}
4769
 
+EXPORT_SYMBOL(evms_cs_log2);
4770
 
+
4771
 
+/*
4772
 
+ * Functions: 
4773
 
+ *
4774
 
+ *              build_crc_table()
4775
 
+ *              calculate_crc()
4776
 
+ *
4777
 
+ *
4778
 
+ * Description: The functions in this module provide a means of calculating
4779
 
+ *              the 32 bit CRC for a block of data.  build_crc_table must
4780
 
+ *              be called to initialize this module.  calculate_crc must
4781
 
+ *              NOT be used until after build_crc_table has been called.
4782
 
+ *              Once build_crc_table has been called, calculate_crc can
4783
 
+ *              be used to calculate the crc of the data residing in a
4784
 
+ *              user specified buffer.
4785
 
+ *
4786
 
+ */
4787
 
+
4788
 
+#define CRC_POLYNOMIAL     0xEDB88320L
4789
 
+
4790
 
+static u_int32_t crc_table[256];     
4791
 
+static u_int32_t crc_table_built = FALSE;
4792
 
+
4793
 
+/*********************************************************************/
4794
 
+/*                                                                   */
4795
 
+/*   Function Name: build_crc_table                                  */
4796
 
+/*                                                                   */
4797
 
+/*   Descriptive Name: This module implements the crc function using */
4798
 
+/*                     a table driven method.  The required table    */
4799
 
+/*                     must be setup before the calculate_crc        */
4800
 
+/*                     function can be used.  This table only needs  */
4801
 
+/*                     to be set up once.  This function sets up the */
4802
 
+/*                     crc table needed by calculate_crc.            */
4803
 
+/*                                                                   */
4804
 
+/*   Input: None                                                     */
4805
 
+/*                                                                   */
4806
 
+/*   Output: None                                                    */
4807
 
+/*                                                                   */
4808
 
+/*   Error Handling: N/A                                             */
4809
 
+/*                                                                   */
4810
 
+/*   Side Effects:  The internal crc table is initialized.           */
4811
 
+/*                                                                   */
4812
 
+/*   Notes:  None.                                                   */
4813
 
+/*                                                                   */
4814
 
+/*********************************************************************/
4815
 
+static void 
4816
 
+build_crc_table( void )
4817
 
+{
4818
 
+       u_int32_t  i, j, crc;
4819
 
+
4820
 
+       for (i = 0; i <= 255; i++) {
4821
 
+               crc = i;
4822
 
+               for (j = 8; j > 0; j--) {
4823
 
+                       if (crc & 1)
4824
 
+                               crc = (crc >> 1) ^ CRC_POLYNOMIAL;
4825
 
+                       else
4826
 
+                               crc >>= 1;
4827
 
+               }       
4828
 
+               crc_table[i] = crc;
4829
 
+       }
4830
 
+       crc_table_built = TRUE;
4831
 
+}
4832
 
+
4833
 
+/*********************************************************************/
4834
 
+/*                                                                   */
4835
 
+/*   Function Name: calculate_crc                                    */
4836
 
+/*                                                                   */
4837
 
+/*   Descriptive Name: This function calculates the crc value for    */
4838
 
+/*                     the data in the buffer specified by Buffer.   */
4839
 
+/*                                                                   */
4840
 
+/*   Input: u_int32_t    crc : This is the starting crc.  If you are */
4841
 
+/*                             starting a new crc calculation, then  */
4842
 
+/*                             this should be set to 0xFFFFFFFF.  If */
4843
 
+/*                             you are continuing a crc calculation  */
4844
 
+/*                             (i.e. all of the data did not fit in  */
4845
 
+/*                             the buffer so you could not calculate */
4846
 
+/*                             the crc in a single operation), then  */
4847
 
+/*                             this is the crc output by the last    */
4848
 
+/*                             calculate_crc call.                   */
4849
 
+/*                                                                   */
4850
 
+/*   Output: The crc for the data in the buffer, based upon the value*/
4851
 
+/*           of the input parameter crc.                             */
4852
 
+/*                                                                   */
4853
 
+/*   Error Handling: None.                                           */
4854
 
+/*                                                                   */
4855
 
+/*   Side Effects:  None.                                            */
4856
 
+/*                                                                   */
4857
 
+/*   Notes:  None.                                                   */
4858
 
+/*                                                                   */
4859
 
+/*********************************************************************/
4860
 
+u_int32_t 
4861
 
+evms_cs_calculate_crc(u_int32_t crc, void * buffer, u_int32_t buffersize)
4862
 
+{
4863
 
+       unsigned char    * current_byte;
4864
 
+       u_int32_t        temp1, temp2, i;
4865
 
+
4866
 
+       current_byte = (unsigned char *) buffer;
4867
 
+       /* Make sure the crc table is available */
4868
 
+       if (crc_table_built==FALSE)  build_crc_table();
4869
 
+       /* Process each byte in the buffer. */
4870
 
+       for (i = 0; i < buffersize; i++) {
4871
 
+               temp1 = (crc >> 8) & 0x00FFFFFF;
4872
 
+               temp2 = crc_table[(crc ^ (u_int32_t)*current_byte) & (u_int32_t)0xff];
4873
 
+               current_byte++;
4874
 
+               crc = temp1 ^ temp2;
4875
 
+       }
4876
 
+    return(crc);
4877
 
+}
4878
 
+EXPORT_SYMBOL(evms_cs_calculate_crc);
4879
 
+
4880
 
+#define EVMS_ORIGINAL_CALLBACK_FLAG    1<<0
4881
 
+typedef struct io_notify_s {
4882
 
+       unsigned int         flags;
4883
 
+       void                 *private;
4884
 
+       struct buffer_head  *bh;
4885
 
+       u_int64_t            rsector;
4886
 
+       void                *b_private; 
4887
 
+       void (*callback_function)(evms_logical_node_t *node,
4888
 
+                                 struct buffer_head *bh,
4889
 
+                                 int uptodate, int *redrive);
4890
 
+       struct io_notify_s  *next;
4891
 
+} io_notify_t;
4892
 
+
4893
 
+evms_pool_mgmt_t *
4894
 
+evms_cs_create_pool(
4895
 
+       int objsize, 
4896
 
+       char *pool_name,
4897
 
+        void (*ctor)(void*, kmem_cache_t *, unsigned long),
4898
 
+       void (*dtor)(void*, kmem_cache_t *, unsigned long))
4899
 
+{
4900
 
+       evms_pool_mgmt_t *pool;
4901
 
+
4902
 
+       /* create the pool management structure */
4903
 
+       if (evms_cs_allocate_memory((void **)&pool, sizeof(evms_pool_mgmt_t))) {
4904
 
+               panic("Cannot create %s fpool mgmt structure", pool_name);
4905
 
+       }
4906
 
+       /* initialize various field in pool mgmt structure */
4907
 
+       pool->member_size = objsize;
4908
 
+       pool->name = pool_name;
4909
 
+       atomic_set(&pool->waiters, 0);
4910
 
+       init_waitqueue_head(&pool->wait_queue);
4911
 
+       /* go create the pool */
4912
 
+       pool->cachep = kmem_cache_create(
4913
 
+               pool->name,
4914
 
+               pool->member_size,
4915
 
+               0, 
4916
 
+               SLAB_HWCACHE_ALIGN, 
4917
 
+               ctor, dtor);
4918
 
+       if(!pool->cachep)
4919
 
+               panic("Cannot create %s SLAB cache", pool->name);
4920
 
+       return(pool);
4921
 
+}
4922
 
+EXPORT_SYMBOL(evms_cs_create_pool);
4923
 
+
4924
 
+void *
4925
 
+evms_cs_allocate_from_pool(evms_pool_mgmt_t *pool, int blockable)
4926
 
+{
4927
 
+       void *objp;
4928
 
+
4929
 
+       while (1) {
4930
 
+               objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO);
4931
 
+               if (objp || !blockable) {
4932
 
+                       return(objp);
4933
 
+               } else {
4934
 
+                       /* block and wait for an object to
4935
 
+                        * be returned to the pool
4936
 
+                        */
4937
 
+                       atomic_inc(&pool->waiters);
4938
 
+                       wait_event(pool->wait_queue, 
4939
 
+                               (!atomic_read(&pool->waiters)));
4940
 
+               }
4941
 
+       }
4942
 
+       return(objp);
4943
 
+}
4944
 
+EXPORT_SYMBOL(evms_cs_allocate_from_pool);
4945
 
+
4946
 
+void
4947
 
+evms_cs_deallocate_to_pool(evms_pool_mgmt_t *pool, void *objp)
4948
 
+{
4949
 
+       kmem_cache_free(pool->cachep, objp);
4950
 
+       atomic_set(&pool->waiters,0);
4951
 
+       if (waitqueue_active(&pool->wait_queue)) {
4952
 
+               wake_up(&pool->wait_queue);
4953
 
+       }
4954
 
+}
4955
 
+EXPORT_SYMBOL(evms_cs_deallocate_to_pool);
4956
 
+
4957
 
+void
4958
 
+evms_cs_destroy_pool(evms_pool_mgmt_t *pool)
4959
 
+{
4960
 
+       kmem_cache_destroy(pool->cachep);
4961
 
+       evms_cs_deallocate_memory(pool);
4962
 
+}
4963
 
+EXPORT_SYMBOL(evms_cs_destroy_pool);
4964
 
+
4965
 
+/* 
4966
 
+ * function: evms_end_io
4967
 
+ *
4968
 
+ * This is a support function for 
4969
 
+ * evms_cs_register_for_end_io_notification.
4970
 
+ * This function is called during I/O completion on any buffer
4971
 
+ * head that was registered by a plugin. Control is passed here
4972
 
+ * and this routine will, thru the use of the I/O notify entry
4973
 
+ * stored in the b_private field of the buffer head, restore
4974
 
+ * the b_rsector value the buffer head had at the time of 
4975
 
+ * registration and pass control to the registered callback
4976
 
+ * address, with pointers to the buffer head and an optional
4977
 
+ * plugin private data. Upon completion of the callback,
4978
 
+ * control is returned back here. The io notify list entry
4979
 
+ * is deleted. This process repeats until this routine
4980
 
+ * detects that all registered plugins have been called back
4981
 
+ * and the buffer head's original end_io function has been
4982
 
+ * called. At this point the DONE flag is set, and we terminate
4983
 
+ * callback loop and exit.
4984
 
+ *
4985
 
+ * Plugins may desire to break or interrupt the callback
4986
 
+ * sequence or chain. This may be useful to redrive I/O or
4987
 
+ * to wait for other buffer heads to complete before
4988
 
+ * allowing the original buffer head callback to occur.
4989
 
+ * To interrupt the callback "chain", a registered
4990
 
+ * plugin's callback must return with the DONE flag set.
4991
 
+ *
4992
 
+ * NOTE: If a plugin set the DONE flag, and wishes to redrive
4993
 
+ * a buffer head, the plugin MUST reregister the buffer head
4994
 
+ * to receive another callback on this buffer head. Also, the
4995
 
+ * plugin MUST ensure that the original buffer head end_io
4996
 
+ * function get called at some point, either by reregistering
4997
 
+ * this buffer head and receiving another callback, or by
4998
 
+ * means of buffer head aggregation triggered by the callbacks
4999
 
+ * of other buffer heads.
5000
 
+ *
5001
 
+ */
5002
 
+static void 
5003
 
+evms_end_io(struct buffer_head *bh, int uptodate)
5004
 
+{
5005
 
+       io_notify_t *entry;
5006
 
+       int done;
5007
 
+
5008
 
+       done = FALSE;
5009
 
+       while (!done) {
5010
 
+               /* retrieve the io_notify_entry ptr from
5011
 
+                * the b_private field in the buffer head.
5012
 
+                */
5013
 
+               entry = (io_notify_t *)bh->b_private;
5014
 
+
5015
 
+               /* restore the b_private value to
5016
 
+                * the previous b_private value (which
5017
 
+                * should be a previous io_notify_entry
5018
 
+                * or the original b_private pointer).
5019
 
+                */
5020
 
+               bh->b_private = entry->b_private;
5021
 
+
5022
 
+               /* check for original callback for this bh */
5023
 
+               if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) {
5024
 
+                       /* this is the original for bh */
5025
 
+
5026
 
+                       /* turn off flag marking this as the original */
5027
 
+                       entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG;
5028
 
+                       
5029
 
+                       /* decrement volume's requests_in_progress var */
5030
 
+                       atomic_dec(&evms_logical_volumes[MINOR(bh->b_dev)].requests_in_progress);
5031
 
+
5032
 
+                       /* restore b_end_io to original value */
5033
 
+                       bh->b_end_io = (void *)entry->callback_function;
5034
 
+                       if (bh->b_end_io) {
5035
 
+                               /* invoke original callback function 
5036
 
+                                * if it exists.
5037
 
+                                */
5038
 
+                               bh->b_end_io(bh, uptodate);
5039
 
+                       }
5040
 
+                       done = TRUE;
5041
 
+               } else {
5042
 
+                       /* this is a plugin callback */
5043
 
+
5044
 
+                       /* restore the rsector value to the
5045
 
+                        * value at the time of callback
5046
 
+                        * registration.
5047
 
+                        */
5048
 
+                       bh->b_rsector = entry->rsector;
5049
 
+                       /* invoke plugin callback function */
5050
 
+                       entry->callback_function(entry->private, bh, uptodate, &done);
5051
 
+               }
5052
 
+               /* free the io notify entry */
5053
 
+               evms_cs_deallocate_to_pool(evms_io_notify_pool, entry);
5054
 
+       }
5055
 
+}
5056
 
+
5057
 
+/*
5058
 
+ * function: evms_cs_register_for_end_io_notification
5059
 
+ *
5060
 
+ * This function is an evms common service.
5061
 
+ * This routine allows a (plugin) function to register to
5062
 
+ * participate in the io completion notification process.
5063
 
+ * This is useful for plugins which alter data after it
5064
 
+ * has been read from the disk (i.e. encryption or
5065
 
+ * compression).
5066
 
+ *
5067
 
+ * This routine also records the rsector value at the time
5068
 
+ * of registration, so that it can be restored to that value
5069
 
+ * prior to the callback to a plugin, thus allowing that
5070
 
+ * plugin to work with the value it had seen during the
5071
 
+ * initiating I/O request.
5072
 
+ *
5073
 
+ * This routine also records a private data pointer at the
5074
 
+ * time of registration, and is returned to the plugin
5075
 
+ * at callback time. This private data pointer was designed
5076
 
+ * to contain context/callback/buffer_head specific data, and
5077
 
+ * frees the plugin from having to store and find associated
5078
 
+ * data at the time of the callback. This field is not used 
5079
 
+ * by this function and is optional (NULL if unused). It is
5080
 
+ * recorded and returned as a convenience for the plugins.
5081
 
+ *
5082
 
+ * DANGER!!! - WILL ROBINSON - DANGER!!!
5083
 
+ * This routine uses the b_private field in the
5084
 
+ * buffer_head structure. If any lower level driver uses this
5085
 
+ * field and do NOT restore it, the I/O callback will fail!!
5086
 
+ *
5087
 
+ * Any plugins writers requiring a field for private storage
5088
 
+ * should instead use the private field parameter in this
5089
 
+ * function to store their private data.
5090
 
+ *
5091
 
+ */
5092
 
+
5093
 
+int 
5094
 
+evms_cs_register_for_end_io_notification(
5095
 
+       void *private,
5096
 
+       struct buffer_head *bh,
5097
 
+       void *callback_function)
5098
 
+{
5099
 
+       int rc = 0, done;
5100
 
+       io_notify_t *new_entry;
5101
 
+
5102
 
+       done = FALSE;
5103
 
+       while (!done) {
5104
 
+               /* allocate a notify entry */
5105
 
+               new_entry = evms_cs_allocate_from_pool(evms_io_notify_pool, EVMS_BLOCKABLE);
5106
 
+               if (!new_entry) {
5107
 
+                       schedule();
5108
 
+                       continue;
5109
 
+               }
5110
 
+
5111
 
+               /* initialize notify entry */
5112
 
+               new_entry->private = private;
5113
 
+               new_entry->bh = bh;
5114
 
+               new_entry->rsector = bh->b_rsector;
5115
 
+               new_entry->b_private = bh->b_private;
5116
 
+               new_entry->flags = 0;
5117
 
+
5118
 
+               /* is this the first callback for this bh? */
5119
 
+               if (bh->b_end_io != evms_end_io) {
5120
 
+                       /* yes, first callback */
5121
 
+                       new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG;
5122
 
+                       new_entry->callback_function = (void *)bh->b_end_io;
5123
 
+                       
5124
 
+                       /* increment volume's requests_in_progress var */
5125
 
+                       atomic_inc(&evms_logical_volumes[MINOR(bh->b_dev)].requests_in_progress);
5126
 
+
5127
 
+                       /* set b_end_io so we get control */
5128
 
+                       bh->b_end_io = evms_end_io;
5129
 
+               } else {
5130
 
+                       /* no, not first callback */
5131
 
+                       new_entry->callback_function = callback_function;
5132
 
+                       done = TRUE;
5133
 
+               }
5134
 
+               /* set b_private to aid in quick lookup */
5135
 
+               bh->b_private = new_entry;
5136
 
+       }
5137
 
+       return(rc);
5138
 
+}
5139
 
+EXPORT_SYMBOL(evms_cs_register_for_end_io_notification);
5140
 
+
5141
 
+/* function description: evms_lookup_item_in_list
5142
 
+ *     
5143
 
+ * this function searches for the specified item in the
5144
 
+ * specified node list. it returns the address of the
5145
 
+ * evms_list_node containing the specified item.
5146
 
+ */
5147
 
+static evms_list_node_t **
5148
 
+evms_lookup_item_in_list(
5149
 
+       evms_list_node_t **node_list,
5150
 
+       void *item)
5151
 
+{
5152
 
+       evms_list_node_t **list_node;
5153
 
+
5154
 
+       list_node = node_list;
5155
 
+       while(*list_node) {
5156
 
+               if ((*list_node)->item == item)
5157
 
+                       break;
5158
 
+               list_node = &(*list_node)->next;
5159
 
+       }
5160
 
+       return(list_node);
5161
 
+}
5162
 
+
5163
 
+/* function description: evms_add_item_to_list
5164
 
+ *
5165
 
+ * this function adds an item to the list. the
5166
 
+ * node for the new item is added to the end
5167
 
+ * of the list. the list is traversed to find the end.
5168
 
+ * while the traversal occurs, the list is checked
5169
 
+ * for the presence of the specified item. if already 
5170
 
+ * present in the list, and error code is returned.
5171
 
+ */
5172
 
+/* function description: evms_cs_add_item_to_list
5173
 
+ *
5174
 
+ * this function adds an item to an item list.
5175
 
+ * 
5176
 
+ * RC == 0 is returned for:
5177
 
+ *     a successful add of a new item
5178
 
+ *
5179
 
+ * RC == 1 is returned when:
5180
 
+ *     the item is already on the list
5181
 
+ *
5182
 
+ * RC < 0 is returned for an error attempting to add the item.
5183
 
+ */
5184
 
+int 
5185
 
+evms_cs_add_item_to_list(
5186
 
+       evms_list_node_t **list,
5187
 
+       void *item)
5188
 
+{
5189
 
+       int rc = 0;
5190
 
+       evms_list_node_t **list_node, *new_node;
5191
 
+
5192
 
+       list_node = evms_lookup_item_in_list(list, item);
5193
 
+       if (*list_node == NULL) {
5194
 
+               rc = evms_cs_allocate_memory(
5195
 
+                       (void **)&new_node, 
5196
 
+                        sizeof(evms_list_node_t));
5197
 
+               if (!rc) {
5198
 
+                       new_node->item = item;
5199
 
+                       *list_node = new_node;
5200
 
+               }
5201
 
+       } else {
5202
 
+               rc = 1;
5203
 
+               LOG_DEBUG("warning: attempt to add duplicate item(%p) to list(%p).\n",
5204
 
+                          item, list);
5205
 
+       }
5206
 
+       return(rc);
5207
 
+}
5208
 
+EXPORT_SYMBOL(evms_cs_add_item_to_list);
5209
 
+
5210
 
+/* function description: evms_remove_item_from_list
5211
 
+ *
5212
 
+ * this function removes a specified item from the
5213
 
+ * specified list. if the specified item is not
5214
 
+ * found in the list, and error is returned.
5215
 
+ */
5216
 
+int 
5217
 
+evms_cs_remove_item_from_list(
5218
 
+       evms_list_node_t **list,
5219
 
+       void *item)
5220
 
+{
5221
 
+       int rc = 0;
5222
 
+       evms_list_node_t **list_node;
5223
 
+
5224
 
+       /* check to see if item is in the list */
5225
 
+       list_node = evms_lookup_item_in_list(list, item);
5226
 
+
5227
 
+       /* was the node found in the list? */
5228
 
+       if (*list_node) {
5229
 
+               /* yes, it was found */
5230
 
+               evms_list_node_t *tmp_node;
5231
 
+
5232
 
+               /* save ptr to node being removed*/
5233
 
+               tmp_node = *list_node;
5234
 
+               /* remove it from the global list */
5235
 
+               *list_node = tmp_node->next;
5236
 
+               /* delete removed node */
5237
 
+               evms_cs_deallocate_memory(tmp_node);
5238
 
+       } else {
5239
 
+               /* no, it was not found */
5240
 
+               rc = -1;
5241
 
+               LOG_ERROR("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n",
5242
 
+                          rc, item, list);
5243
 
+       }
5244
 
+       return(rc);
5245
 
+}
5246
 
+EXPORT_SYMBOL(evms_cs_remove_item_from_list);
5247
 
+
5248
 
+/* function description: evms_cs_register_device
5249
 
+ *
5250
 
+ * this function adds a device to the EVMS global device list.
5251
 
+ * 
5252
 
+ * RC == 0 is returned for:
5253
 
+ *     a successful add of a new device
5254
 
+ *
5255
 
+ * RC == 1 is returned when:
5256
 
+ *     the device is already on the list
5257
 
+ *
5258
 
+ * RC < 0 is returned for an error attempting to add the device.
5259
 
+ */
5260
 
+int 
5261
 
+evms_cs_register_device(evms_logical_node_t *device)
5262
 
+{
5263
 
+       return(evms_cs_add_item_to_list(
5264
 
+               &evms_global_device_list, 
5265
 
+               device));
5266
 
+}
5267
 
+EXPORT_SYMBOL(evms_cs_register_device);
5268
 
+
5269
 
+/* function description: evms_cs_unregister_device
5270
 
+ *
5271
 
+ * this function removes a device from the EVMS global device list.
5272
 
+ * 
5273
 
+ * RC == 0 is returned for:
5274
 
+ *     a successful removal of the specified device
5275
 
+ *
5276
 
+ * RC < 0 is returned for an error attempting to add the device.
5277
 
+ *     -ENODATA is returned if specified device is not found.
5278
 
+ */
5279
 
+int 
5280
 
+evms_cs_unregister_device(evms_logical_node_t *device)
5281
 
+{
5282
 
+       return(evms_cs_remove_item_from_list(
5283
 
+               &evms_global_device_list, 
5284
 
+               device));
5285
 
+}
5286
 
+EXPORT_SYMBOL(evms_cs_unregister_device);
5287
 
+
5288
 
+static evms_list_node_t *find_first_next_list_node = NULL;
5289
 
+int 
5290
 
+evms_cs_find_next_device(
5291
 
+       evms_logical_node_t *in_device,
5292
 
+       evms_logical_node_t **out_device)
5293
 
+{
5294
 
+       int rc = 0;
5295
 
+       evms_list_node_t **list_node;
5296
 
+
5297
 
+       if (in_device == NULL)
5298
 
+               find_first_next_list_node = evms_global_device_list;
5299
 
+       else {
5300
 
+               list_node = evms_lookup_item_in_list(
5301
 
+                       &evms_global_device_list, 
5302
 
+                       in_device);
5303
 
+               find_first_next_list_node = *list_node;
5304
 
+               if (find_first_next_list_node == NULL)
5305
 
+                       rc = -ENODATA;
5306
 
+               else
5307
 
+                       find_first_next_list_node = 
5308
 
+                       find_first_next_list_node->next;
5309
 
+       }
5310
 
+    
5311
 
+       if (find_first_next_list_node == NULL)
5312
 
+               *out_device = NULL;
5313
 
+       else
5314
 
+               *out_device = (evms_logical_node_t *)
5315
 
+                       find_first_next_list_node->item;
5316
 
+
5317
 
+       return(rc);
5318
 
+}
5319
 
+EXPORT_SYMBOL(evms_cs_find_next_device);
5320
 
+
5321
 
+void
5322
 
+evms_cs_signal_event(int eventid)
5323
 
+{
5324
 
+       int rc;
5325
 
+       evms_list_node_t **list_node;
5326
 
+
5327
 
+       /* signal PID(s) of specified event */
5328
 
+       list_node = &evms_global_notify_list;
5329
 
+       while(*list_node) {
5330
 
+               evms_event_t *event;
5331
 
+
5332
 
+               event = (*list_node)->item;
5333
 
+               if (event->eventid == eventid) {
5334
 
+                       struct task_struct *tsk;
5335
 
+
5336
 
+                       tsk = find_task_by_pid(event->pid);
5337
 
+                       if (tsk) {
5338
 
+                               struct siginfo siginfo;
5339
 
+
5340
 
+                               siginfo.si_signo = event->signo;
5341
 
+                               siginfo.si_errno = 0;
5342
 
+                               siginfo.si_code = 0;
5343
 
+                               rc = send_sig_info(event->signo,
5344
 
+                                             &siginfo,
5345
 
+                                             tsk);
5346
 
+                       } else {
5347
 
+                               /* TODO:
5348
 
+                                * unregister this stale 
5349
 
+                                * notification record
5350
 
+                                */
5351
 
+                       }
5352
 
+               }
5353
 
+               list_node = &(*list_node)->next;
5354
 
+       }
5355
 
+}
5356
 
+EXPORT_SYMBOL(evms_cs_signal_event);
5357
 
+
5358
 
+static inline void 
5359
 
+evms_flush_signals (void)
5360
 
+{
5361
 
+       spin_lock(&current->sigmask_lock);
5362
 
+       flush_signals(current);
5363
 
+       spin_unlock(&current->sigmask_lock);
5364
 
+}
5365
 
+
5366
 
+static inline void 
5367
 
+evms_init_signals (void)
5368
 
+{
5369
 
+        current->exit_signal = SIGCHLD;
5370
 
+        siginitsetinv(&current->blocked, sigmask(SIGKILL));
5371
 
+}
5372
 
+
5373
 
+static int 
5374
 
+evms_thread(void * arg)
5375
 
+{
5376
 
+       evms_thread_t *thread = arg;
5377
 
+       lock_kernel();
5378
 
+       
5379
 
+       /*
5380
 
+        * Detach thread
5381
 
+        */
5382
 
+
5383
 
+       daemonize();
5384
 
+
5385
 
+       sprintf(current->comm, thread->name);
5386
 
+       evms_init_signals();
5387
 
+       evms_flush_signals();
5388
 
+       thread->tsk = current;
5389
 
+
5390
 
+       current->policy = SCHED_OTHER;
5391
 
+       current->nice = -20;
5392
 
+       unlock_kernel();
5393
 
+       
5394
 
+       complete(thread->event);
5395
 
+       while (thread->run) {
5396
 
+               void (*run)(void *data);
5397
 
+               DECLARE_WAITQUEUE(wait, current);
5398
 
+
5399
 
+               add_wait_queue(&thread->wqueue, &wait);
5400
 
+               set_task_state(current, TASK_INTERRUPTIBLE);
5401
 
+               if (!test_bit(EVMS_THREAD_WAKEUP, &thread->flags)) {
5402
 
+                       schedule();
5403
 
+               }
5404
 
+               current->state = TASK_RUNNING;
5405
 
+               remove_wait_queue(&thread->wqueue, &wait);
5406
 
+               clear_bit(EVMS_THREAD_WAKEUP, &thread->flags);
5407
 
+
5408
 
+               run = thread->run;
5409
 
+               if (run) {
5410
 
+                       run(thread->data);
5411
 
+                       run_task_queue(&tq_disk);
5412
 
+               }
5413
 
+               if (signal_pending(current)) {
5414
 
+                       evms_flush_signals();
5415
 
+               }
5416
 
+       }
5417
 
+       complete(thread->event);
5418
 
+       return 0;
5419
 
+}
5420
 
+
5421
 
+evms_thread_t *
5422
 
+evms_cs_register_thread (
5423
 
+       void (*run) (void *), 
5424
 
+       void *data, 
5425
 
+       const char *name)
5426
 
+{
5427
 
+       evms_thread_t *thread;
5428
 
+       int ret;
5429
 
+       struct completion event;
5430
 
+       
5431
 
+       if (evms_cs_allocate_memory((void**)&thread,sizeof(evms_thread_t)))
5432
 
+               return NULL;
5433
 
+       
5434
 
+       memset(thread, 0, sizeof(evms_thread_t));
5435
 
+       init_waitqueue_head(&thread->wqueue);
5436
 
+
5437
 
+       init_completion(&event);        
5438
 
+       thread->event = &event;
5439
 
+       thread->run = run;
5440
 
+       thread->data = data;
5441
 
+       thread->name = name;
5442
 
+       ret = kernel_thread(evms_thread, thread, 0);
5443
 
+       if (ret < 0) {
5444
 
+               evms_cs_deallocate_memory(thread);
5445
 
+               return NULL;
5446
 
+       }
5447
 
+       wait_for_completion(&event);
5448
 
+       return thread;
5449
 
+}
5450
 
+EXPORT_SYMBOL(evms_cs_register_thread);
5451
 
+
5452
 
+void 
5453
 
+evms_cs_unregister_thread (evms_thread_t *thread)
5454
 
+{
5455
 
+       struct completion event;
5456
 
+
5457
 
+       init_completion(&event);
5458
 
+       
5459
 
+       thread->event = &event;
5460
 
+       thread->run = NULL;
5461
 
+       thread->name = NULL;
5462
 
+       evms_cs_interrupt_thread(thread);
5463
 
+       wait_for_completion(&event);
5464
 
+       evms_cs_deallocate_memory(thread);
5465
 
+}
5466
 
+EXPORT_SYMBOL(evms_cs_unregister_thread);
5467
 
+
5468
 
+void 
5469
 
+evms_cs_wakeup_thread(evms_thread_t *thread)
5470
 
+{
5471
 
+       set_bit(EVMS_THREAD_WAKEUP, &thread->flags);
5472
 
+       wake_up(&thread->wqueue);
5473
 
+}
5474
 
+EXPORT_SYMBOL(evms_cs_wakeup_thread);
5475
 
+
5476
 
+void 
5477
 
+evms_cs_interrupt_thread (evms_thread_t *thread)
5478
 
+{
5479
 
+       if (!thread->tsk) {
5480
 
+               LOG_ERROR("error: attempted to interrupt an invalid thread!\n");
5481
 
+               return;
5482
 
+       }
5483
 
+       send_sig(SIGKILL, thread->tsk, 1);
5484
 
+}
5485
 
+EXPORT_SYMBOL(evms_cs_interrupt_thread);
5486
 
+
5487
 
+struct proc_dir_entry *
5488
 
+evms_cs_get_evms_proc_dir(void)
5489
 
+{
5490
 
+#ifdef CONFIG_PROC_FS
5491
 
+       if (!evms_proc_dir) {
5492
 
+               evms_proc_dir = create_proc_entry("evms", S_IFDIR, &proc_root);
5493
 
+       }
5494
 
+#endif
5495
 
+       return(evms_proc_dir);
5496
 
+}
5497
 
+EXPORT_SYMBOL(evms_cs_get_evms_proc_dir);
5498
 
+
5499
 
+int
5500
 
+evms_cs_volume_request_in_progress(
5501
 
+       kdev_t dev, 
5502
 
+       int operation, 
5503
 
+       int *current_count)
5504
 
+{
5505
 
+       int rc = 0;
5506
 
+       evms_logical_volume_t *volume;
5507
 
+
5508
 
+       volume = &evms_logical_volumes[MINOR(dev)];
5509
 
+       if (volume->node) {
5510
 
+               if (operation > 0) {
5511
 
+                       atomic_inc(&volume->requests_in_progress);
5512
 
+               } else if (operation < 0) {
5513
 
+                       atomic_dec(&volume->requests_in_progress);
5514
 
+               }
5515
 
+               if (current_count) {
5516
 
+                       *current_count = atomic_read(&volume->requests_in_progress);
5517
 
+               }
5518
 
+       } else {
5519
 
+               rc = -ENODEV;
5520
 
+       }
5521
 
+       return(rc);
5522
 
+}
5523
 
+EXPORT_SYMBOL(evms_cs_volume_request_in_progress);
5524
 
+
5525
 
+/**********************************************************/
5526
 
+/* END -- exported functions/Common Services              */
5527
 
+/**********************************************************/
5528
 
+
5529
 
+/**********************************************************/
5530
 
+/* START -- Proc FS Support functions                     */
5531
 
+/**********************************************************/
5532
 
+
5533
 
+#ifdef CONFIG_PROC_FS
5534
 
+static int 
5535
 
+evms_info_read_proc(
5536
 
+       char *page, 
5537
 
+       char **start, 
5538
 
+       off_t off,
5539
 
+       int count, 
5540
 
+       int *eof, 
5541
 
+       void *data)
5542
 
+{
5543
 
+       int sz = 0;
5544
 
+       char *info_level_text = NULL;
5545
 
+
5546
 
+       PROCPRINT("Enterprise Volume Management System: Info\n");
5547
 
+       switch(evms_info_level) {
5548
 
+               case 1:
5549
 
+                       info_level_text = "critical";
5550
 
+                       break;
5551
 
+               case 2:
5552
 
+                       info_level_text = "serious";
5553
 
+                       break;
5554
 
+               case 3:
5555
 
+                       info_level_text = "error";
5556
 
+                       break;
5557
 
+               case 4:
5558
 
+                       info_level_text = "warning";
5559
 
+                       break;
5560
 
+               case 5:
5561
 
+                       info_level_text = "default";
5562
 
+                       break;
5563
 
+               case 6:
5564
 
+                       info_level_text = "details";
5565
 
+                       break;
5566
 
+               case 7:
5567
 
+                       info_level_text = "debug";
5568
 
+                       break;
5569
 
+               case 8:
5570
 
+                       info_level_text = "extra";
5571
 
+                       break;
5572
 
+               case 9:
5573
 
+                       info_level_text = "entry exit";
5574
 
+                       break;
5575
 
+               case 10:
5576
 
+                       info_level_text = "everything";
5577
 
+                       break;
5578
 
+               default:
5579
 
+                       info_level_text = "unknown";
5580
 
+                       break;
5581
 
+       }
5582
 
+       PROCPRINT("EVMS info level: %d (%s).\n",
5583
 
+                 evms_info_level, info_level_text);
5584
 
+
5585
 
+       PROCPRINT("EVMS kernel version: %d.%d.%d\n",
5586
 
+                 EVMS_MAJOR_VERSION, 
5587
 
+                 EVMS_MINOR_VERSION,
5588
 
+                 EVMS_PATCHLEVEL_VERSION);
5589
 
+       
5590
 
+       PROCPRINT("EVMS IOCTL interface version: %d.%d.%d\n",
5591
 
+                 EVMS_IOCTL_INTERFACE_MAJOR,
5592
 
+                 EVMS_IOCTL_INTERFACE_MINOR,
5593
 
+                 EVMS_IOCTL_INTERFACE_PATCHLEVEL);
5594
 
+
5595
 
+       PROCPRINT("EVMS Common Services version: %d.%d.%d\n",
5596
 
+                 EVMS_COMMON_SERVICES_MAJOR,
5597
 
+                 EVMS_COMMON_SERVICES_MINOR,
5598
 
+                 EVMS_COMMON_SERVICES_PATCHLEVEL);
5599
 
+
5600
 
+       return sz;
5601
 
+}
5602
 
+
5603
 
+static int 
5604
 
+evms_plugins_read_proc(
5605
 
+       char *page, 
5606
 
+       char **start, 
5607
 
+       off_t off,
5608
 
+       int count, 
5609
 
+       int *eof, 
5610
 
+       void *data)
5611
 
+{
5612
 
+       int sz = 0;
5613
 
+       evms_registered_plugin_t *rp = NULL;
5614
 
+
5615
 
+       PROCPRINT("Enterprise Volume Management System: Plugins\n");
5616
 
+       /*             0    1    1    2    2    3    3    4    4    5    5    6    6    7*/
5617
 
+       /*         1   5    0    5    0    5    0    5    0    5    0    5    0    5    0*/
5618
 
+       PROCPRINT(" ---------Plugin----------      required services\n");
5619
 
+       PROCPRINT(" ----id----        version      version\n\n");
5620
 
+       for (rp = registered_plugin_head; rp; rp = rp->next) {
5621
 
+               PROCPRINT(" %x.%x.%x\t   %d.%d.%d\t%d.%d.%d\n", 
5622
 
+                         GetPluginOEM(rp->plugin->id),
5623
 
+                         GetPluginType(rp->plugin->id),
5624
 
+                         GetPluginID(rp->plugin->id),
5625
 
+                         rp->plugin->version.major,
5626
 
+                         rp->plugin->version.minor,
5627
 
+                         rp->plugin->version.patchlevel,
5628
 
+                         rp->plugin->required_common_services_version.major,
5629
 
+                         rp->plugin->required_common_services_version.minor,
5630
 
+                         rp->plugin->required_common_services_version.patchlevel);
5631
 
+       }
5632
 
+
5633
 
+       return sz;
5634
 
+}
5635
 
+
5636
 
+static int 
5637
 
+evms_volumes_read_proc(
5638
 
+       char *page, 
5639
 
+       char **start, 
5640
 
+       off_t off,
5641
 
+       int count, 
5642
 
+       int *eof, 
5643
 
+       void *data)
5644
 
+{
5645
 
+       int sz = 0, j;
5646
 
+
5647
 
+       PROCPRINT("Enterprise Volume Management System: Volumes\n");
5648
 
+       PROCPRINT("major   minor          #blocks type   flags name\n\n");
5649
 
+       for (j = 1; j < MAX_EVMS_VOLUMES; j++) {
5650
 
+               evms_logical_volume_t *volume;
5651
 
+
5652
 
+               volume = &evms_logical_volumes[j];
5653
 
+               if (volume->node) {
5654
 
+                       PROCPRINT("%5d %7d %16Ld %s %s %s %s%s\n",
5655
 
+                               EVMS_MAJOR, j,
5656
 
+                               volume->node->total_vsectors >> 1,
5657
 
+                               (volume->flags & EVMS_VOLUME_FLAG) ? "evms  " : "compat",
5658
 
+                               (volume->flags & EVMS_VOLUME_READ_ONLY) ? "ro" : "rw",
5659
 
+                               (volume->flags & EVMS_VOLUME_PARTIAL) ? "p " : "  ",
5660
 
+                               EVMS_DEV_NODE_PATH,
5661
 
+                               volume->name);
5662
 
+               }
5663
 
+       }
5664
 
+
5665
 
+       return sz;
5666
 
+}
5667
 
+#endif
5668
 
+
5669
 
+/**********************************************************/
5670
 
+/* END -- Proc FS Support functions                       */
5671
 
+/**********************************************************/
5672
 
+
5673
 
+/**********************************************************/
5674
 
+/* START -- FOPS functions definitions                    */
5675
 
+/**********************************************************/
5676
 
+
5677
 
+/************************************************/
5678
 
+/* START -- IOCTL commands -- EVMS specific     */
5679
 
+/************************************************/
5680
 
+
5681
 
+static int 
5682
 
+evms_ioctl_cmd_get_ioctl_version (void * arg)
5683
 
+{
5684
 
+        int rc = 0;
5685
 
+        evms_version_t ver;
5686
 
+
5687
 
+        ver.major = EVMS_IOCTL_INTERFACE_MAJOR;
5688
 
+        ver.minor = EVMS_IOCTL_INTERFACE_MINOR;
5689
 
+        ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL;
5690
 
+
5691
 
+        /* copy info to userspace */
5692
 
+        if (copy_to_user(arg, &ver, sizeof(ver)))
5693
 
+                rc = -EFAULT;
5694
 
+
5695
 
+        return (rc);
5696
 
+}
5697
 
+
5698
 
+static int 
5699
 
+evms_ioctl_cmd_get_version (void * arg)
5700
 
+{
5701
 
+        int rc = 0;
5702
 
+        evms_version_t ver;
5703
 
+
5704
 
+        ver.major = EVMS_MAJOR_VERSION;
5705
 
+        ver.minor = EVMS_MINOR_VERSION;
5706
 
+        ver.patchlevel = EVMS_PATCHLEVEL_VERSION;
5707
 
+
5708
 
+        /* copy info to userspace */
5709
 
+        if (copy_to_user(arg, &ver, sizeof(ver)))
5710
 
+                rc = -EFAULT;
5711
 
+
5712
 
+        return (rc);
5713
 
+}
5714
 
+
5715
 
+static int 
5716
 
+evms_ioctl_cmd_get_info_level (void * arg)
5717
 
+{
5718
 
+        int rc = 0;
5719
 
+
5720
 
+        /* copy info to userspace */
5721
 
+        if (copy_to_user(arg, &evms_info_level, sizeof(evms_info_level)))
5722
 
+                rc = -EFAULT;
5723
 
+
5724
 
+        return (rc);
5725
 
+}
5726
 
+
5727
 
+static int 
5728
 
+evms_ioctl_cmd_set_info_level (void * arg)
5729
 
+{
5730
 
+        int rc = 0;
5731
 
+
5732
 
+        /* copy info from userspace */
5733
 
+        if (copy_from_user(&evms_info_level, arg, sizeof(evms_info_level)))
5734
 
+                rc = -EFAULT;
5735
 
+
5736
 
+        return (rc);
5737
 
+}                  
5738
 
+
5739
 
+/* function: evms_quiesce_volume
5740
 
+ *
5741
 
+ * this function performs the actual quiesce operation on
5742
 
+ * a volume in kernel memory. 
5743
 
+ *
5744
 
+ * when quiescing, all new I/Os to a volume are stopped,
5745
 
+ * causing the calling thread to block. this thread then
5746
 
+ * waits until all I/Os in progress are completed, before
5747
 
+ * return control to the caller.
5748
 
+ *
5749
 
+ * when unquiescing, all new I/Os are allowed to proceed
5750
 
+ * unencumbered, and all threads waiting (blocked) on this
5751
 
+ * volume, are woken up and allowed to proceed.
5752
 
+ *
5753
 
+ */
5754
 
+static int 
5755
 
+evms_quiesce_volume(
5756
 
+       evms_logical_volume_t *volume,
5757
 
+       struct inode *inode, 
5758
 
+       struct file *file,
5759
 
+       evms_quiesce_volume_t *qv)
5760
 
+{
5761
 
+        int rc;
5762
 
+
5763
 
+       LOG_DEBUG("%squiescing %s.\n", 
5764
 
+                 ((qv->command) ? "" : "un"), volume->name);
5765
 
+
5766
 
+#ifdef VFS_PATCH_PRESENT
5767
 
+       if (qv->do_vfs) {
5768
 
+               /* VFS function call to sync and lock the filesystem */
5769
 
+               fsync_dev_lockfs(MKDEV(EVMS_MAJOR, qv->minor));
5770
 
+               volume->vfs_quiesced = TRUE;
5771
 
+       }
5772
 
+#endif
5773
 
+        volume->quiesced = qv->command;
5774
 
+
5775
 
+       /* Command specified was "quiesce". */
5776
 
+        if (qv->command) {
5777
 
+               /* After setting the volume to
5778
 
+                * a quiesced state, there could
5779
 
+                * be threads (on SMP systems)
5780
 
+                * that are executing in the
5781
 
+                * function, evms_handle_request,
5782
 
+                * between the "wait_event" and the
5783
 
+                * "atomic_inc" lines. We need to
5784
 
+                * provide a "delay" sufficient
5785
 
+                * to allow those threads to
5786
 
+                * to reach the atomic_inc's
5787
 
+                * before executing the while loop 
5788
 
+                * below. The "schedule" call should 
5789
 
+                * provide this.
5790
 
+                */
5791
 
+               schedule();
5792
 
+               /* wait for outstanding requests
5793
 
+                * to complete
5794
 
+                */
5795
 
+                while(atomic_read(&volume->requests_in_progress)>0)
5796
 
+                       schedule();
5797
 
+       }
5798
 
+        /* send this command down the stack so lower */
5799
 
+        /* layers can know about this                */
5800
 
+        rc = IOCTL(volume->node, inode, file, 
5801
 
+                  EVMS_QUIESCE_VOLUME, (unsigned long)&qv);
5802
 
+       if (!rc) {
5803
 
+               /* Command specified was "unquiesce". */
5804
 
+               if (!qv->command) {
5805
 
+                       /* "wakeup" any I/O requests waiting on
5806
 
+                        * this volume.
5807
 
+                        */
5808
 
+                       if (waitqueue_active(&volume->wait_queue))
5809
 
+                               wake_up(&volume->wait_queue);
5810
 
+#ifdef VFS_PATCH_PRESENT
5811
 
+                       if (volume->vfs_quiesced) {
5812
 
+                               /* VFS function call to unlock the filesystem */
5813
 
+                               unlockfs(MKDEV(EVMS_MAJOR, qv->minor));
5814
 
+                               volume->vfs_quiesced = FALSE;
5815
 
+                       }
5816
 
+#endif
5817
 
+               }
5818
 
+       } else {
5819
 
+               LOG_ERROR("error(%d) %squiescing %s.\n",
5820
 
+                         rc, 
5821
 
+                         ((qv->command) ? "" : "un"),
5822
 
+                         volume->name);
5823
 
+       }
5824
 
+        return(rc);
5825
 
+}
5826
 
+
5827
 
+/* function: evms_delete_volume
5828
 
+ *
5829
 
+ * this function performs the actual delete operation on
5830
 
+ * a volume to purge it from kernel memory. all structures
5831
 
+ * and memory consumed by this volume will be free as well
5832
 
+ * as clearing or unregistering any system services or
5833
 
+ * global data arrays.
5834
 
+ *
5835
 
+ * NOTE: this function will return -EBUSY on attempts to
5836
 
+ * delete mounted volumes.
5837
 
+ *
5838
 
+ */
5839
 
+static int 
5840
 
+evms_delete_volume(
5841
 
+       evms_logical_volume_t *volume,
5842
 
+       evms_delete_volume_t *dv)
5843
 
+{
5844
 
+        int rc = 0;
5845
 
+
5846
 
+       /* if this is a "permament" delete */
5847
 
+        /* check to make sure volume is not mounted */
5848
 
+       if (dv->command) {
5849
 
+               if (is_mounted(MKDEV(EVMS_MAJOR, dv->minor))) {
5850
 
+                       rc = -EBUSY;
5851
 
+               }
5852
 
+       }
5853
 
+
5854
 
+        /* invoke the delete ioctl at the top of the feature stack */
5855
 
+        if (!rc) {
5856
 
+               LOG_DETAILS("deleting '%s'.\n",volume->name);
5857
 
+                rc = DELETE(volume->node);
5858
 
+        }
5859
 
+
5860
 
+       /* the volume has been deleted, do any clean up work
5861
 
+        * required.
5862
 
+        */
5863
 
+        if (!rc) {
5864
 
+               devfs_unregister(volume->devfs_handle);
5865
 
+               if (dv->command) {
5866
 
+                       /* if "permanent" delete, free the name
5867
 
+                        * and NULL the name field.
5868
 
+                        */
5869
 
+                       evms_cs_deallocate_memory(volume->name);
5870
 
+                       volume->name = NULL;
5871
 
+                       volume->flags = 0;
5872
 
+               } else {
5873
 
+                       /* if "soft" delete, leave the name so
5874
 
+                        * we can use it to reassign the same
5875
 
+                        * minor to this volume after a
5876
 
+                        * rediscovery.
5877
 
+                        */
5878
 
+                       volume->flags = EVMS_VOLUME_SOFT_DELETED;
5879
 
+               }
5880
 
+               volume->node = NULL;
5881
 
+                set_device_ro(MKDEV(EVMS_MAJOR,dv->minor),0);
5882
 
+                blk_size[EVMS_MAJOR][dv->minor] = 0;
5883
 
+                blksize_size[EVMS_MAJOR][dv->minor] = 0;
5884
 
+                hardsect_size[EVMS_MAJOR][dv->minor] = 0;
5885
 
+                evms_volumes--;
5886
 
+        } else {
5887
 
+               LOG_ERROR("error(%d) %s deleting %s.\n",
5888
 
+                         rc, 
5889
 
+                         ((dv->command) ? "hard" : "soft"),
5890
 
+                         volume->name);
5891
 
+       }
5892
 
+        return(rc);
5893
 
+}
5894
 
+
5895
 
+/* function: evms_user_delete_volume
5896
 
+ *
5897
 
+ * this function, depending on the parameters, performs
5898
 
+ * a "soft" or a "hard" delete. for a "soft" delete, a
5899
 
+ * quiesce & delete request is queued up, to be executed
5900
 
+ * at the beginning of the next rediscovery. for a
5901
 
+ * "hard" delete, the target volume is quiesced and then
5902
 
+ * deleted. if there is any errors attempting to delete
5903
 
+ * the target, then the target is unquiesced. if an
5904
 
+ * associative volume is specified it is quiesced before
5905
 
+ * the target volume is quiesced, and is unquiesced
5906
 
+ * after the attempt to delete the target volume.
5907
 
+ *
5908
 
+ */
5909
 
+static int
5910
 
+evms_user_delete_volume(
5911
 
+       evms_logical_volume_t *lvt,
5912
 
+       struct inode *inode,
5913
 
+       struct file  *file,
5914
 
+       evms_delete_volume_t *dv)
5915
 
+{
5916
 
+       int rc = 0;
5917
 
+
5918
 
+       if (!dv->command) {
5919
 
+               /* "soft delete" requested */
5920
 
+               lvt->flags |= (EVMS_REQUESTED_QUIESCE |
5921
 
+                              EVMS_REQUESTED_DELETE);
5922
 
+               if (dv->do_vfs) {
5923
 
+                       lvt->flags |= EVMS_REQUESTED_VFS_QUIESCE;
5924
 
+               }
5925
 
+       } else {
5926
 
+               /* "hard delete" requested */
5927
 
+               int qa = FALSE;
5928
 
+               evms_quiesce_volume_t qv;
5929
 
+               evms_logical_volume_t *lva = NULL;
5930
 
+
5931
 
+               if (dv->associative_minor) {
5932
 
+                       /* associative volume specified
5933
 
+                        *
5934
 
+                        * quiesce it
5935
 
+                        */
5936
 
+                       lva = &evms_logical_volumes[dv->associative_minor];
5937
 
+                       /* quiesce associative volume */
5938
 
+                       qv.command = EVMS_QUIESCE;
5939
 
+                       qv.do_vfs = EVMS_VFS_DO_NOTHING;
5940
 
+                       qv.minor = dv->associative_minor;
5941
 
+                       rc = evms_quiesce_volume(lva, inode, file, &qv);
5942
 
+                       qa = (rc) ? FALSE : TRUE;
5943
 
+               }
5944
 
+               if (!rc) {
5945
 
+                       /* quiesce target volume */
5946
 
+                       qv.command = EVMS_QUIESCE;
5947
 
+                       qv.do_vfs = EVMS_VFS_DO_NOTHING;
5948
 
+                       qv.minor = dv->minor;
5949
 
+                       rc = evms_quiesce_volume(lvt, inode, file, &qv);
5950
 
+               }
5951
 
+               if (!rc) {
5952
 
+                       /* delete the target volume */
5953
 
+                       rc = evms_delete_volume(lvt, dv);
5954
 
+                       if (rc) {
5955
 
+                               /* got an error undeleting...
5956
 
+                                *
5957
 
+                                * unquiesce the target
5958
 
+                                */
5959
 
+                               qv.command = EVMS_UNQUIESCE;
5960
 
+                               qv.do_vfs = EVMS_VFS_DO_NOTHING;
5961
 
+                               qv.minor = dv->minor;
5962
 
+                               evms_quiesce_volume(lvt, inode, file, &qv);
5963
 
+                       }
5964
 
+               }
5965
 
+               if (dv->associative_minor) {
5966
 
+                       /* associative volume specified
5967
 
+                        *
5968
 
+                        * unquiesce it
5969
 
+                        */
5970
 
+                       if (qa) {
5971
 
+                               /* only unquiesce associative
5972
 
+                                * if we successfully quiesced
5973
 
+                                * it previously.
5974
 
+                                */
5975
 
+                               qv.command = EVMS_UNQUIESCE;
5976
 
+                               qv.do_vfs = EVMS_VFS_DO_NOTHING;
5977
 
+                               qv.minor = dv->associative_minor;
5978
 
+                               evms_quiesce_volume(lva, inode, file, &qv);
5979
 
+                       }
5980
 
+               }
5981
 
+       }
5982
 
+       return(rc);
5983
 
+}
5984
 
+
5985
 
+/* function: evms_ioctl_cmd_delete_volume
5986
 
+ *
5987
 
+ * this function copy user data to/from the kernel, and
5988
 
+ * validates user parameters. after validation, control
5989
 
+ * is passed to worker routine evms_user_delete_volume.
5990
 
+ *
5991
 
+ */
5992
 
+static int 
5993
 
+evms_ioctl_cmd_delete_volume(
5994
 
+       struct inode *inode,
5995
 
+       struct file  *file,
5996
 
+       unsigned long arg)
5997
 
+{
5998
 
+        int rc = 0;
5999
 
+        evms_delete_volume_t tmp, *user_parms;
6000
 
+        evms_logical_volume_t *volume = NULL;
6001
 
+
6002
 
+        user_parms = (evms_delete_volume_t *)arg;
6003
 
+        /* copy user's parameters to kernel space */
6004
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6005
 
+                rc = -EFAULT;
6006
 
+
6007
 
+        /* check to make sure associative minor is in use */
6008
 
+        if (!rc) {
6009
 
+               if (tmp.associative_minor) {
6010
 
+                       volume = &evms_logical_volumes[tmp.associative_minor];
6011
 
+                       if (volume->node == NULL)
6012
 
+                               rc = -ENXIO;
6013
 
+               }
6014
 
+       }
6015
 
+        /* check to make sure target minor is in use */
6016
 
+       if (!rc) {
6017
 
+               volume = &evms_logical_volumes[tmp.minor];
6018
 
+               if (volume->node == NULL)
6019
 
+                       rc = -ENXIO;
6020
 
+               else
6021
 
+                       rc = evms_user_delete_volume(
6022
 
+                               volume,inode,file,&tmp);
6023
 
+        }
6024
 
+        /* copy the status value back to the user */
6025
 
+        tmp.status = rc;
6026
 
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6027
 
+                rc = -EFAULT;
6028
 
+
6029
 
+        return(rc);
6030
 
+}
6031
 
+
6032
 
+/* function: evms_full_rediscover_prep
6033
 
+ *
6034
 
+ * this function helps to prevent problems when evms is
6035
 
+ * configured with the base built in statically and some
6036
 
+ * plugins built as modules.
6037
 
+ *
6038
 
+ * in these cases, when the initial discovery is done, 
6039
 
+ * only the statically built modules are available for
6040
 
+ * volume construction. as a result, some volumes that
6041
 
+ * require the plugins built as modules (which haven't
6042
 
+ * been loaded), to be fully reconstructed, may come up
6043
 
+ * as compatibility volumes or partial volumes.
6044
 
+ *
6045
 
+ * when parts of evms are built as modules, the
6046
 
+ * evms_rediscovery utility is used, to perform a secondary
6047
 
+ * rediscover, after all the plugins built as modules
6048
 
+ * have been loaded, to construct all the volumes 
6049
 
+ * requiring these plugins.
6050
 
+ *
6051
 
+ * however since some of the volumes, requiring the plugins
6052
 
+ * built as modules, may have been already exported as
6053
 
+ * compatibility or partial volumes, we need to purge these
6054
 
+ * volumes from kernel's memory, so that can be rediscovered
6055
 
+ * and claimed by the appropriate plugins, and reconstructed
6056
 
+ * into the correct volumes.
6057
 
+ *
6058
 
+ * this function purges all compatibility volumes that are
6059
 
+ * not in use(mounted) and all partial volumes, prior to
6060
 
+ * doing the secondary rediscover, thus allowing volumes to
6061
 
+ * rediscovered correctly.
6062
 
+ *
6063
 
+ * NOTE: again, this is only required in cases when a
6064
 
+ * combination of plugins are built statically and as
6065
 
+ * modules.
6066
 
+ *
6067
 
+ */
6068
 
+static void
6069
 
+evms_full_rediscover_prep(struct inode *inode, struct file *file)
6070
 
+{
6071
 
+       int rc = 0, i;
6072
 
+
6073
 
+       LOG_DETAILS("%s: started.\n", __FUNCTION__);
6074
 
+       /* check for acceptable volumes to be deleted */
6075
 
+       for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6076
 
+               evms_logical_volume_t *volume = NULL;
6077
 
+               evms_delete_volume_t dv;
6078
 
+               int volume_mounted, doit;
6079
 
+               kdev_t devp;
6080
 
+        
6081
 
+               volume = &evms_logical_volumes[i];
6082
 
+               if (!volume->node)
6083
 
+                       continue;
6084
 
+               devp = MKDEV(EVMS_MAJOR,i);
6085
 
+               volume_mounted = (is_mounted(devp)) ? 1 : 0;
6086
 
+               /* only proceed on volumes that are:
6087
 
+                *   partial volumes
6088
 
+                *      OR
6089
 
+                *   unmounted compatibility volumes
6090
 
+                */
6091
 
+               doit = FALSE;
6092
 
+               if (volume->flags & EVMS_VOLUME_PARTIAL) {
6093
 
+                       /* do all partial volumes
6094
 
+                        */
6095
 
+                       doit = TRUE;
6096
 
+               } else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
6097
 
+                       /* check all compatibility volumes
6098
 
+                        */
6099
 
+                       if (!volume_mounted && !is_swap_partition(devp)) {
6100
 
+                               /* only do unmounted volumes
6101
 
+                                */
6102
 
+                               doit = TRUE;
6103
 
+                       }
6104
 
+               }
6105
 
+               if (doit == FALSE) {
6106
 
+                       continue;
6107
 
+               }
6108
 
+               /* delete the volume from memory.
6109
 
+                * do a 'soft' delete if volume
6110
 
+                * is mounted, and 'hard' delete
6111
 
+                * if it is not.
6112
 
+                *
6113
 
+                * NOTE: the delete operation will
6114
 
+                * clear the bits in the flags field.
6115
 
+                */
6116
 
+               dv.command = (volume_mounted) ? 
6117
 
+                       EVMS_SOFT_DELETE : EVMS_HARD_DELETE;
6118
 
+               dv.minor = i;
6119
 
+               dv.associative_minor = 0;
6120
 
+               dv.status = 0;
6121
 
+               rc = evms_user_delete_volume(volume,inode,file,&dv);
6122
 
+       }
6123
 
+       LOG_DETAILS("%s: completed.\n", __FUNCTION__);
6124
 
+}
6125
 
+
6126
 
+static int 
6127
 
+evms_ioctl_cmd_rediscover_volumes(
6128
 
+       struct inode *inode, 
6129
 
+       struct file *file,
6130
 
+       unsigned int cmd, 
6131
 
+       unsigned long arg)
6132
 
+{
6133
 
+        int rc, i;
6134
 
+        evms_rediscover_t tmp, *user_parms;
6135
 
+        unsigned long *array_ptr = NULL, array_size = 0;
6136
 
+       evms_logical_volume_t *volume = NULL;
6137
 
+
6138
 
+        rc = tmp.drive_count = 0;
6139
 
+        user_parms = (evms_rediscover_t *)arg;
6140
 
+        /* copy user's parameters to kernel space */
6141
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6142
 
+                rc = -EFAULT;
6143
 
+
6144
 
+       if (tmp.drive_count == REDISCOVER_ALL_DEVICES) {
6145
 
+               evms_full_rediscover_prep(inode, file);
6146
 
+       } 
6147
 
+       /* quiesce all queued volumes */
6148
 
+       for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6149
 
+               evms_quiesce_volume_t qv;
6150
 
+
6151
 
+               volume = &evms_logical_volumes[i];
6152
 
+               if (!volume->node) {
6153
 
+                       continue;
6154
 
+               }
6155
 
+               if (!(volume->flags & EVMS_REQUESTED_QUIESCE)) {
6156
 
+                       continue;
6157
 
+               }
6158
 
+               qv.command = EVMS_QUIESCE;
6159
 
+               qv.minor = i;
6160
 
+               qv.do_vfs = (volume->flags & EVMS_REQUESTED_VFS_QUIESCE) ? 
6161
 
+                       EVMS_VFS_DO : EVMS_VFS_DO_NOTHING,
6162
 
+               qv.status = 0;
6163
 
+               rc = evms_quiesce_volume(volume,inode,file,&qv);
6164
 
+       }
6165
 
+       /* "soft" delete all queued volumes */
6166
 
+       for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6167
 
+               evms_delete_volume_t dv;
6168
 
+
6169
 
+               volume = &evms_logical_volumes[i];
6170
 
+               if (!volume->node) {
6171
 
+                       continue;
6172
 
+               }
6173
 
+               if (!(volume->flags & EVMS_REQUESTED_DELETE)) {
6174
 
+                       continue;
6175
 
+               }
6176
 
+               dv.command = EVMS_SOFT_DELETE;
6177
 
+               dv.minor = i;
6178
 
+               dv.associative_minor = 0;
6179
 
+               dv.status = 0;
6180
 
+               rc = evms_delete_volume(volume, &dv);
6181
 
+       }
6182
 
+
6183
 
+        if (tmp.drive_count &&
6184
 
+          (tmp.drive_count != REDISCOVER_ALL_DEVICES)) {
6185
 
+                if (!rc) {
6186
 
+                        /* create space for userspace drive array */
6187
 
+                        array_size = sizeof(*tmp.drive_array) * tmp.drive_count;
6188
 
+                        array_ptr = tmp.drive_array;
6189
 
+                        rc = evms_cs_allocate_memory((void **)&tmp.drive_array, array_size);
6190
 
+                }
6191
 
+                if (!rc)
6192
 
+                        /* copy rediscover drive array to kernel space */
6193
 
+                        if (copy_from_user(tmp.drive_array, array_ptr, array_size))
6194
 
+                                rc = -EFAULT;
6195
 
+        }
6196
 
+
6197
 
+       if (!rc) {
6198
 
+               /* perform the rediscovery operation */
6199
 
+               rc = evms_discover_volumes(&tmp);
6200
 
+       }
6201
 
+
6202
 
+        /* clean up after operation */
6203
 
+        if (tmp.drive_count &&
6204
 
+          (tmp.drive_count != REDISCOVER_ALL_DEVICES))
6205
 
+                evms_cs_deallocate_memory(tmp.drive_array);
6206
 
+
6207
 
+        /* set return code and copy info to userspace */
6208
 
+        tmp.status = rc;
6209
 
+        if (copy_to_user(&user_parms->status, &tmp.status, sizeof(tmp.status)))
6210
 
+                rc = -EFAULT;
6211
 
+
6212
 
+        return(rc);
6213
 
+}
6214
 
+
6215
 
+static evms_list_node_t *user_disk_ptr;
6216
 
+static int 
6217
 
+evms_ioctl_cmd_get_logical_disk(void * arg)
6218
 
+{
6219
 
+        int rc = 0;
6220
 
+        evms_user_disk_t tmp, *user_parms;
6221
 
+
6222
 
+        user_parms = (evms_user_disk_t *)arg;
6223
 
+        /* copy user's parameters to kernel space */
6224
 
+        if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6225
 
+                rc = -EFAULT;
6226
 
+
6227
 
+        if (!rc) {
6228
 
+                if (tmp.command == EVMS_FIRST_DISK)
6229
 
+                        user_disk_ptr = evms_global_device_list;
6230
 
+               else /* tmp.command == EVMS_NEXT_DISK */
6231
 
+                        user_disk_ptr = user_disk_ptr->next;
6232
 
+                
6233
 
+                if (user_disk_ptr == NULL) 
6234
 
+                        tmp.status = EVMS_DISK_INVALID;
6235
 
+                else {
6236
 
+                        tmp.status = EVMS_DISK_VALID;
6237
 
+                        tmp.disk_handle = (unsigned long)user_disk_ptr->item ^ EVMS_HANDLE_KEY;
6238
 
+                }
6239
 
+                /* copy info to userspace */
6240
 
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6241
 
+                        rc = -EFAULT;
6242
 
+        }
6243
 
+        return(rc);
6244
 
+}
6245
 
+
6246
 
+static int 
6247
 
+evms_ioctl_cmd_get_logical_disk_info(void * arg)
6248
 
+{
6249
 
+        int rc = 0;
6250
 
+        evms_user_disk_info_t tmp, *user_parms;
6251
 
+        evms_list_node_t *p;
6252
 
+
6253
 
+        user_parms = (evms_user_disk_info_t *)arg;
6254
 
+        /* copy user's parameters to kernel space */
6255
 
+        if (copy_from_user(&tmp.disk_handle, &user_parms->disk_handle, sizeof(tmp.disk_handle)))
6256
 
+                rc = -EFAULT;
6257
 
+
6258
 
+        /* check handle for validity */
6259
 
+        if (!rc) {
6260
 
+                rc = -EINVAL;
6261
 
+                for (p = evms_global_device_list; p; p = p->next)
6262
 
+                        if (p->item == (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY)) {
6263
 
+                                rc = 0;
6264
 
+                                user_disk_ptr = p;
6265
 
+                                break;
6266
 
+                        }
6267
 
+        }
6268
 
+
6269
 
+        /* populate kernel copy of user's structure with appropriate info */
6270
 
+        if (!rc) {
6271
 
+               evms_logical_node_t *node = (evms_logical_node_t *)user_disk_ptr->item;
6272
 
+                tmp.flags = node->flags;
6273
 
+               strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH);
6274
 
+               strcat(tmp.disk_name, node->name);
6275
 
+                tmp.total_sectors = node->total_vsectors;
6276
 
+               tmp.hardsect_size = node->hardsector_size;
6277
 
+               tmp.block_size = node->block_size;
6278
 
+                rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, 
6279
 
+                                       (unsigned long)&tmp.geometry);
6280
 
+        }
6281
 
+
6282
 
+        /* set return code and copy info to userspace */
6283
 
+        tmp.status = rc;
6284
 
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6285
 
+                rc = -EFAULT;
6286
 
+
6287
 
+        return(rc);
6288
 
+}
6289
 
+
6290
 
+#define MAX_IO_SIZE 128
6291
 
+static int 
6292
 
+evms_ioctl_cmd_sector_io(void * arg)
6293
 
+{
6294
 
+        int rc;
6295
 
+       evms_sector_t io_size = MAX_IO_SIZE;
6296
 
+        evms_sector_io_t tmp, *user_parms;
6297
 
+        evms_logical_node_t *disk_node = NULL;
6298
 
+        evms_list_node_t *list_node;
6299
 
+        unsigned char *io_buffer;
6300
 
+
6301
 
+        rc = 0;
6302
 
+        list_node = NULL;
6303
 
+        io_buffer = NULL;
6304
 
+
6305
 
+        user_parms = (evms_sector_io_t *)arg;
6306
 
+        /* copy user's parameters to kernel space */
6307
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6308
 
+                rc = -EFAULT;
6309
 
+
6310
 
+        /* check handle for validity */
6311
 
+        if (!rc) {
6312
 
+                rc = -EINVAL;
6313
 
+                disk_node = (evms_logical_node_t *)(tmp.disk_handle ^ EVMS_HANDLE_KEY);
6314
 
+                for (list_node = evms_global_device_list; list_node; list_node = list_node->next)
6315
 
+                        if (list_node->item == disk_node) {
6316
 
+                                rc = 0;
6317
 
+                                break;
6318
 
+                        }
6319
 
+        }
6320
 
+        if (!rc) {
6321
 
+               /* allocate a io buffer upto 64Kbytes in size */
6322
 
+               if (tmp.sector_count < MAX_IO_SIZE)
6323
 
+                       io_size = tmp.sector_count;
6324
 
+
6325
 
+               /* allocate buffer large enough to hold a single sector */
6326
 
+                rc = evms_cs_allocate_memory(
6327
 
+                       (void **)&io_buffer, 
6328
 
+                       io_size << EVMS_VSECTOR_SIZE_SHIFT);
6329
 
+       }
6330
 
+        /* perform io with specified disk */
6331
 
+        if (!rc) {
6332
 
+               evms_sector_t io_sector_offset, io_remaining;
6333
 
+               u_int64_t io_bytes;
6334
 
+               u_char *user_buffer_ptr;
6335
 
+
6336
 
+               io_remaining = tmp.sector_count;
6337
 
+               io_sector_offset = 0;
6338
 
+               user_buffer_ptr = tmp.buffer_address;
6339
 
+               while(io_remaining) {
6340
 
+                       /* compute the io_size for this pass */
6341
 
+                       io_size = (io_remaining >= MAX_IO_SIZE) ? 
6342
 
+                               MAX_IO_SIZE : io_remaining;
6343
 
+
6344
 
+                       io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
6345
 
+                        /* for writes, copy a sector from user to kernel */
6346
 
+                        if (tmp.io_flag == EVMS_SECTOR_IO_WRITE) {
6347
 
+                                /* copy sector from user data buffer */
6348
 
+                                if (copy_from_user(io_buffer, 
6349
 
+                                                  user_buffer_ptr, 
6350
 
+                                                  io_bytes))
6351
 
+                                        rc = -EFAULT;
6352
 
+                        }
6353
 
+                        if (rc) break;
6354
 
+
6355
 
+                        /* perform IO one sector at a time */
6356
 
+                        rc = INIT_IO(
6357
 
+                                disk_node, 
6358
 
+                                tmp.io_flag, 
6359
 
+                                io_sector_offset + tmp.starting_sector, 
6360
 
+                                io_size,
6361
 
+                                io_buffer);
6362
 
+
6363
 
+                        if (rc) break;
6364
 
+
6365
 
+                        if (tmp.io_flag != EVMS_SECTOR_IO_WRITE) {
6366
 
+                                /* copy sector to user data buffer */
6367
 
+                                if (copy_to_user(user_buffer_ptr,
6368
 
+                                                io_buffer, 
6369
 
+                                                io_bytes))
6370
 
+                                        rc = -EFAULT;
6371
 
+                        }
6372
 
+                        if (rc) break;
6373
 
+                        
6374
 
+                       user_buffer_ptr += io_bytes;
6375
 
+                       tmp.buffer_address += io_bytes;
6376
 
+                       io_sector_offset += io_size;
6377
 
+                       io_remaining -= io_size;
6378
 
+               }
6379
 
+        }
6380
 
+
6381
 
+        /* if the sector_buffer was allocated, free it */
6382
 
+        if (io_buffer)
6383
 
+                evms_cs_deallocate_memory(io_buffer);
6384
 
+
6385
 
+        /* copy the status value back to the user */
6386
 
+        tmp.status = rc;
6387
 
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6388
 
+                rc = -EFAULT;
6389
 
+
6390
 
+        return(rc);
6391
 
+}
6392
 
+#undef MAX_IO_SIZE
6393
 
+
6394
 
+static int user_minor;
6395
 
+static int 
6396
 
+evms_ioctl_cmd_get_minor(void * arg)
6397
 
+{
6398
 
+        int rc = 0;
6399
 
+        evms_user_minor_t tmp, *user_parms;
6400
 
+
6401
 
+        user_parms = (evms_user_minor_t *)arg;
6402
 
+        /* copy user's parameters to kernel space */
6403
 
+        if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6404
 
+                rc = -EFAULT;
6405
 
+
6406
 
+        if (!rc) {
6407
 
+                if (tmp.command == EVMS_FIRST_VOLUME)
6408
 
+                        user_minor = 1;
6409
 
+               else /* tmp.command == EVMS_NEXT_VOLUME */
6410
 
+                        user_minor++;
6411
 
+                
6412
 
+                tmp.status = EVMS_VOLUME_INVALID;
6413
 
+                for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) {
6414
 
+                       evms_logical_volume_t *lv;
6415
 
+
6416
 
+                       lv = &evms_logical_volumes[user_minor];
6417
 
+                       /* see if any corrupt volumes have been
6418
 
+                        * unmounted. If so, clean up the
6419
 
+                        * evms_logical_volumes array entry, and
6420
 
+                        * don't report the volume to the user.
6421
 
+                        */
6422
 
+                       if (lv->flags & EVMS_VOLUME_CORRUPT) {
6423
 
+                               if (!get_super(MKDEV(EVMS_MAJOR,user_minor))) {
6424
 
+                                       /* clear logical volume structure
6425
 
+                                       * for this volume so it may be
6426
 
+                                       * reused.
6427
 
+                                       */
6428
 
+                                       LOG_WARNING("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n",
6429
 
+                                               ((lv->flags & EVMS_VOLUME_SOFT_DELETED) ?
6430
 
+                                                "'soft deleted'" : ""),
6431
 
+                                               EVMS_MAJOR, user_minor,
6432
 
+                                               lv->name);
6433
 
+                                       LOG_WARNING("            releasing minor(%d) used by volume(%s)!\n",
6434
 
+                                               user_minor, lv->name);
6435
 
+                                       evms_cs_deallocate_memory(lv->name);
6436
 
+                                       lv->name = NULL;
6437
 
+                                       lv->flags = 0;
6438
 
+                               }
6439
 
+                       }
6440
 
+                        if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) {
6441
 
+                                tmp.status = EVMS_VOLUME_VALID;
6442
 
+                                tmp.minor = user_minor;
6443
 
+                                break;
6444
 
+                        }
6445
 
+               }
6446
 
+
6447
 
+                /* copy info to userspace */
6448
 
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6449
 
+                        rc = -EFAULT;
6450
 
+        }
6451
 
+        return(rc);
6452
 
+}
6453
 
+
6454
 
+static int 
6455
 
+evms_ioctl_cmd_get_volume_data(void * arg)
6456
 
+{
6457
 
+        int rc = 0;
6458
 
+        evms_volume_data_t tmp, *user_parms;
6459
 
+       evms_logical_volume_t *volume = NULL;
6460
 
+        evms_logical_node_t *node = NULL;
6461
 
+
6462
 
+        user_parms = (evms_volume_data_t *)arg;
6463
 
+        /* copy user's parameters to kernel space */
6464
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6465
 
+                rc = -EFAULT;
6466
 
+
6467
 
+        if (!rc) {
6468
 
+                volume = &evms_logical_volumes[tmp.minor];
6469
 
+                node = volume->node;
6470
 
+                if (node == NULL)
6471
 
+                        rc = -ENODEV;
6472
 
+        }
6473
 
+        if (!rc) {
6474
 
+                tmp.flags = volume->flags;
6475
 
+                strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH);
6476
 
+                strcat(tmp.volume_name, volume->name);
6477
 
+        }
6478
 
+
6479
 
+        /* copy return code and info to userspace */
6480
 
+        tmp.status = rc;
6481
 
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6482
 
+                rc = -EFAULT;
6483
 
+        return(rc);
6484
 
+}
6485
 
+
6486
 
+static evms_registered_plugin_t  *ioctl_reg_record;
6487
 
+static int 
6488
 
+evms_ioctl_cmd_get_plugin(void * arg)
6489
 
+{
6490
 
+        int rc = 0;
6491
 
+        evms_kernel_plugin_t tmp, *user_parms;
6492
 
+
6493
 
+        user_parms = (evms_kernel_plugin_t *)arg;
6494
 
+        /* copy user's parameters to kernel space */
6495
 
+        if (copy_from_user(&tmp.command, &user_parms->command, sizeof(tmp.command)))
6496
 
+                rc = -EFAULT;
6497
 
+
6498
 
+        if (!rc) {
6499
 
+               /* if the command is not 0, then verify
6500
 
+                * that ioctl_reg_record is pointing to
6501
 
+                * current and valid plugin header.
6502
 
+                */
6503
 
+               if (tmp.command) { /* tmp.command == EVMS_NEXT_PLUGIN */
6504
 
+                       evms_registered_plugin_t *tmp_reg_record;
6505
 
+                       tmp_reg_record = registered_plugin_head;
6506
 
+                       /* search the current plugin list */
6507
 
+                       while(tmp_reg_record) {
6508
 
+                               if (tmp_reg_record == ioctl_reg_record)
6509
 
+                                       break;
6510
 
+                               tmp_reg_record = tmp_reg_record->next;
6511
 
+                       }
6512
 
+                       /* if the ioctl_reg_record is not in the
6513
 
+                        * current list, then start at the beginning.
6514
 
+                        */
6515
 
+                       if (!tmp_reg_record) 
6516
 
+                               tmp.command = EVMS_FIRST_PLUGIN;
6517
 
+               }
6518
 
+
6519
 
+                if (tmp.command == EVMS_FIRST_PLUGIN)
6520
 
+                       /* start at beginning of plugin list */
6521
 
+                        ioctl_reg_record = registered_plugin_head;
6522
 
+               else /* tmp.command == EVMS_NEXT_PLUGIN */
6523
 
+                       /* continue from current position in list */
6524
 
+                        ioctl_reg_record = ioctl_reg_record->next;
6525
 
+                
6526
 
+               tmp.status = EVMS_PLUGIN_INVALID;
6527
 
+               tmp.id = 0;
6528
 
+               if (ioctl_reg_record) {
6529
 
+                       tmp.id = ioctl_reg_record->plugin->id;
6530
 
+                       tmp.version = ioctl_reg_record->plugin->version;
6531
 
+                       tmp.status = EVMS_PLUGIN_VALID;
6532
 
+               }
6533
 
+
6534
 
+                /* copy info to userspace */
6535
 
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6536
 
+                        rc = -EFAULT;
6537
 
+        }
6538
 
+        return(rc);
6539
 
+}
6540
 
+
6541
 
+static int 
6542
 
+evms_ioctl_cmd_plugin_ioctl(
6543
 
+       struct inode *inode, 
6544
 
+       struct file *file,
6545
 
+       unsigned int cmd, 
6546
 
+       unsigned long arg)
6547
 
+{
6548
 
+        int rc = 0, found = FALSE;
6549
 
+        evms_plugin_ioctl_t tmp, *user_parms;
6550
 
+       evms_registered_plugin_t * p;
6551
 
+
6552
 
+        user_parms = (evms_plugin_ioctl_t *)arg;
6553
 
+        /* copy user's parameters to kernel space */
6554
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6555
 
+                rc = -EFAULT;
6556
 
+
6557
 
+        if (!rc) {
6558
 
+               /* search for the specified plugin */
6559
 
+               for (p = registered_plugin_head; p; p = p->next)
6560
 
+                       /* check for the specified feature id */
6561
 
+                       if (p->plugin->id == tmp.feature_id) {
6562
 
+                               found = TRUE;
6563
 
+                               /* check that entry point is used */
6564
 
+                               if (p->plugin->function_table->direct_ioctl)
6565
 
+                                       rc = DIRECT_IOCTL(p, inode, file, cmd, arg);
6566
 
+                               else
6567
 
+                                       rc = -ENOSYS;
6568
 
+                               break;
6569
 
+                       }
6570
 
+               /* was the specified plugin found? */
6571
 
+               if (found == FALSE)
6572
 
+                       rc = -ENOPKG;
6573
 
+                
6574
 
+               /* copy the status value back to the user */
6575
 
+               tmp.status = rc;
6576
 
+               if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6577
 
+                       rc = -EFAULT;
6578
 
+        }
6579
 
+        return(rc);
6580
 
+}
6581
 
+
6582
 
+#define MAX_BUFFER_SIZE 65536
6583
 
+static int
6584
 
+evms_ioctl_cmd_kernel_partial_csum(void * arg)
6585
 
+{
6586
 
+        int rc = 0;
6587
 
+       u_int64_t compute_size = MAX_BUFFER_SIZE;
6588
 
+        evms_compute_csum_t tmp, *user_parms;
6589
 
+        unsigned char *buffer = NULL;
6590
 
+
6591
 
+        user_parms = (evms_compute_csum_t *)arg;
6592
 
+        /* copy user's parameters to kernel space */
6593
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6594
 
+                rc = -EFAULT;
6595
 
+
6596
 
+        if (!rc) {
6597
 
+               /* allocate a io buffer upto 64Kbytes in size */
6598
 
+               if (tmp.buffer_size < MAX_BUFFER_SIZE)
6599
 
+                       compute_size = tmp.buffer_size;
6600
 
+
6601
 
+               /* allocate buffer large enough to hold a single sector */
6602
 
+                rc = evms_cs_allocate_memory(
6603
 
+                       (void **)&buffer, compute_size);
6604
 
+       }
6605
 
+        /* perform io with specified disk */
6606
 
+        if (!rc) {
6607
 
+               evms_sector_t remaining_bytes;
6608
 
+               u_char *user_buffer_ptr;
6609
 
+               unsigned int insum = tmp.insum;
6610
 
+
6611
 
+               remaining_bytes = tmp.buffer_size;
6612
 
+               user_buffer_ptr = tmp.buffer_address;
6613
 
+               while(remaining_bytes) {
6614
 
+                       /* compute the compute_size for this pass */
6615
 
+                       compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ? 
6616
 
+                               MAX_BUFFER_SIZE : remaining_bytes;
6617
 
+
6618
 
+                        /* copy into kernel from user data buffer */
6619
 
+                        if (copy_from_user(buffer, user_buffer_ptr, 
6620
 
+                                          compute_size))
6621
 
+                               rc = -EFAULT;
6622
 
+                        if (rc) break;
6623
 
+                       /* compute the checksum for this pass */
6624
 
+                       tmp.outsum = csum_partial(buffer, tmp.buffer_size, 
6625
 
+                                         insum);
6626
 
+                       /* set up for another possible pass */
6627
 
+                       insum = tmp.outsum;
6628
 
+                       /* update loop progress variables */
6629
 
+                       user_buffer_ptr += compute_size;
6630
 
+                       tmp.buffer_address += compute_size;
6631
 
+                       remaining_bytes -= compute_size;
6632
 
+               }
6633
 
+        }
6634
 
+
6635
 
+        /* if the sector_buffer was allocated, free it */
6636
 
+        if (buffer)
6637
 
+                evms_cs_deallocate_memory(buffer);
6638
 
+
6639
 
+        /* copy the status value back to the user */
6640
 
+        tmp.status = rc;
6641
 
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6642
 
+                rc = -EFAULT;
6643
 
+
6644
 
+        return(rc);
6645
 
+}
6646
 
+#undef MAX_BUFFER_SIZE
6647
 
+
6648
 
+static int
6649
 
+evms_ioctl_cmd_get_bmap(
6650
 
+       struct inode *inode, 
6651
 
+       struct file *file,
6652
 
+       unsigned int cmd, 
6653
 
+       unsigned long arg)
6654
 
+{
6655
 
+        int rc = 0;
6656
 
+        evms_get_bmap_t tmp, *user_parms;
6657
 
+
6658
 
+        user_parms = (evms_get_bmap_t *)arg;
6659
 
+        /* copy user's parameters to kernel space */
6660
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6661
 
+                rc = -EFAULT;
6662
 
+
6663
 
+        /* pass the ioctl down the volume stack */
6664
 
+        if (!rc) {
6665
 
+               evms_logical_volume_t *volume;
6666
 
+
6667
 
+               volume = &evms_logical_volumes[MINOR(inode->i_rdev)];
6668
 
+               rc = IOCTL(volume->node, inode, file, cmd, (unsigned long)&tmp);
6669
 
+       }
6670
 
+        /* copy the status value back to the user */
6671
 
+        tmp.status = rc;
6672
 
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6673
 
+                rc = -EFAULT;
6674
 
+
6675
 
+        return(rc);
6676
 
+}
6677
 
+
6678
 
+static int
6679
 
+evms_ioctl_cmd_process_notify_event(unsigned long arg)
6680
 
+{
6681
 
+       int rc = 0, found = FALSE;
6682
 
+        evms_notify_t tmp, *user_parms;
6683
 
+       evms_list_node_t **list_node = NULL;
6684
 
+       evms_event_t *event = NULL;
6685
 
+
6686
 
+        user_parms = (evms_notify_t *)arg;
6687
 
+        /* copy user's parameters to kernel space */
6688
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
6689
 
+                rc = -EFAULT;
6690
 
+
6691
 
+        /* check to see if PID has already been registered
6692
 
+        * for this event.
6693
 
+        */
6694
 
+        if (!rc) {
6695
 
+               list_node = &evms_global_notify_list;
6696
 
+               while(*list_node) {
6697
 
+                       event = (*list_node)->item;
6698
 
+                       if ((event->pid == tmp.eventry.pid) &&
6699
 
+                           (event->eventid == tmp.eventry.eventid)) {
6700
 
+                               found = TRUE;
6701
 
+                               break;
6702
 
+                       }
6703
 
+                       list_node = &(*list_node)->next;
6704
 
+               }
6705
 
+       }
6706
 
+       if (tmp.command) { /* tmp.command == EVMS_REGISTER_EVENT */
6707
 
+               /* registration code */
6708
 
+               if (found) {
6709
 
+                       rc = -EBUSY;
6710
 
+                       LOG_ERROR("error(%d) pid(%d) already register to receive signal(%d) on event(%d).\n",
6711
 
+                                 rc, tmp.eventry.pid, tmp.eventry.signo, tmp.eventry.eventid);
6712
 
+               } else {
6713
 
+                       /* register this pid/event type */
6714
 
+                       rc = evms_cs_allocate_memory((void **)&event, sizeof(evms_event_t));
6715
 
+                       if (rc) {
6716
 
+                               LOG_ERROR("error(%d) allocating event structure.\n",
6717
 
+                                         rc);
6718
 
+                       } else {
6719
 
+                               event->pid = tmp.eventry.pid;
6720
 
+                               event->eventid = tmp.eventry.eventid;
6721
 
+                               event->signo = tmp.eventry.signo;
6722
 
+                               rc = evms_cs_add_item_to_list(
6723
 
+                                       &evms_global_notify_list,
6724
 
+                                       event);
6725
 
+                       }
6726
 
+               }
6727
 
+       } else { /* tmp.command == EVMS_UNREGISTER_EVENT */
6728
 
+               /* unregistration code */
6729
 
+               if (!found) {
6730
 
+                       rc = -ENODATA;
6731
 
+                       LOG_ERROR("error(%d) attempting to unregister a non-registered pid(%d) on event(%d).\n",
6732
 
+                                 rc, tmp.eventry.pid, tmp.eventry.eventid);
6733
 
+               } else {
6734
 
+                       event = (*list_node)->item;
6735
 
+                       rc = evms_cs_remove_item_from_list(
6736
 
+                               &evms_global_notify_list,
6737
 
+                               event);
6738
 
+                       if (!rc) {
6739
 
+                               evms_cs_deallocate_memory(event);
6740
 
+                       }
6741
 
+               }
6742
 
+       }
6743
 
+        /* copy the status value back to the user */
6744
 
+        tmp.status = rc;
6745
 
+        if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
6746
 
+                rc = -EFAULT;
6747
 
+
6748
 
+       return(rc);
6749
 
+}
6750
 
+/************************************************/
6751
 
+/* END -- IOCTL commands -- EVMS specific       */
6752
 
+/************************************************/
6753
 
+
6754
 
+/************************************************/
6755
 
+/* START -- IOCTL commands -- Volume specific   */
6756
 
+/************************************************/
6757
 
+
6758
 
+/************************************************/
6759
 
+/* END -- IOCTL commands -- Volume specific     */
6760
 
+/************************************************/
6761
 
+
6762
 
+/************************************************/
6763
 
+/* START -- IOCTL main                          */
6764
 
+/************************************************/
6765
 
+
6766
 
+/* 
6767
 
+ * Function: evms_ioctl
6768
 
+ *
6769
 
+ *  This function is the main ioctl entry point for all of evms.
6770
 
+ */
6771
 
+
6772
 
+static int 
6773
 
+evms_ioctl(
6774
 
+       struct inode *inode, 
6775
 
+       struct file *file,
6776
 
+       unsigned int cmd, 
6777
 
+       unsigned long arg)
6778
 
+{
6779
 
+        unsigned long minor = 0;
6780
 
+        int rc = 0;
6781
 
+        evms_logical_node_t *node = NULL;
6782
 
+
6783
 
+        /* check user access */
6784
 
+        if (!capable(CAP_SYS_ADMIN))
6785
 
+                rc = -EACCES;
6786
 
+
6787
 
+        if (!inode)
6788
 
+                rc = -EINVAL;
6789
 
+
6790
 
+        if (!rc) {
6791
 
+                /* get the minor */
6792
 
+                minor = MINOR(inode->i_rdev);
6793
 
+               LOG_EXTRA("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n",
6794
 
+                           minor,
6795
 
+                           (cmd >> _IOC_DIRSHIFT)  & _IOC_DIRMASK,
6796
 
+                           (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
6797
 
+                           (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK,
6798
 
+                           (cmd >> _IOC_NRSHIFT)   & _IOC_NRMASK);
6799
 
+
6800
 
+                /* insure this minor points to a valid volume */
6801
 
+                if (minor) {
6802
 
+                        node = evms_logical_volumes[minor].node;
6803
 
+                        if (node == NULL)
6804
 
+                                rc = -ENXIO;
6805
 
+                }
6806
 
+        }
6807
 
+
6808
 
+        /* process the IOCTL commands */
6809
 
+        if (!rc) {
6810
 
+                if (!minor) {
6811
 
+                        /* process all EVMS specific commands */
6812
 
+                        switch(cmd) {
6813
 
+                                case EVMS_GET_IOCTL_VERSION:
6814
 
+                                        rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
6815
 
+                                        break;
6816
 
+                                case EVMS_GET_VERSION:
6817
 
+                                        rc = evms_ioctl_cmd_get_version((void *)arg);
6818
 
+                                        break;
6819
 
+                                case EVMS_GET_INFO_LEVEL:
6820
 
+                                        rc = evms_ioctl_cmd_get_info_level((void *)arg);
6821
 
+                                        break;
6822
 
+                                case EVMS_SET_INFO_LEVEL:
6823
 
+                                        rc = evms_ioctl_cmd_set_info_level((void *)arg);
6824
 
+                                        break;
6825
 
+                                case EVMS_REDISCOVER_VOLUMES:
6826
 
+                                        rc = evms_ioctl_cmd_rediscover_volumes(inode, file, cmd, arg);
6827
 
+                                        break;
6828
 
+                                case EVMS_GET_LOGICAL_DISK:
6829
 
+                                        rc = evms_ioctl_cmd_get_logical_disk((void *)arg);
6830
 
+                                        break;
6831
 
+                                case EVMS_GET_LOGICAL_DISK_INFO:
6832
 
+                                        rc = evms_ioctl_cmd_get_logical_disk_info((void *)arg);
6833
 
+                                        break;
6834
 
+                                case EVMS_SECTOR_IO:
6835
 
+                                        rc = evms_ioctl_cmd_sector_io((void *)arg);
6836
 
+                                        break;
6837
 
+                                case EVMS_GET_MINOR:
6838
 
+                                        rc = evms_ioctl_cmd_get_minor((void *)arg);
6839
 
+                                        break;
6840
 
+                                case EVMS_GET_VOLUME_DATA:
6841
 
+                                        rc = evms_ioctl_cmd_get_volume_data((void *)arg);
6842
 
+                                        break;
6843
 
+                                case EVMS_DELETE_VOLUME:
6844
 
+                                        rc = evms_ioctl_cmd_delete_volume(inode, file, arg);
6845
 
+                                        break;
6846
 
+                                case EVMS_GET_PLUGIN:
6847
 
+                                        rc = evms_ioctl_cmd_get_plugin((void *)arg);
6848
 
+                                        break;
6849
 
+                                case EVMS_PLUGIN_IOCTL:
6850
 
+                                        rc = evms_ioctl_cmd_plugin_ioctl(inode, file, cmd, arg);
6851
 
+                                        break;
6852
 
+                                case EVMS_COMPUTE_CSUM:
6853
 
+                                        rc = evms_ioctl_cmd_kernel_partial_csum((void *)arg);
6854
 
+                                        break;
6855
 
+                               case EVMS_PROCESS_NOTIFY_EVENT:
6856
 
+                                       rc = evms_ioctl_cmd_process_notify_event(arg);
6857
 
+                                       break;
6858
 
+                                default:
6859
 
+                                        rc = -EINVAL;
6860
 
+                                        break;
6861
 
+                        }
6862
 
+                } else {
6863
 
+                        /* process Volume specific commands */
6864
 
+                        switch(cmd) {
6865
 
+                                /* pick up standard blk ioctls */
6866
 
+                                case BLKFLSBUF:
6867
 
+                                case BLKROSET:
6868
 
+                                case BLKROGET:
6869
 
+                                case BLKRASET:
6870
 
+                                case BLKRAGET:
6871
 
+                                case BLKBSZGET:
6872
 
+                                case BLKSSZGET:
6873
 
+                                        rc = blk_ioctl(inode->i_rdev, cmd, arg);
6874
 
+                                        break;
6875
 
+                               case BLKGETSIZE:
6876
 
+                                       {
6877
 
+                                               /* casting size down to 32-bits until 
6878
 
+                                                * kernel allows return of 64-bit size 
6879
 
+                                                * values.
6880
 
+                                                */
6881
 
+                                               long size = node->total_vsectors;
6882
 
+                                               if (copy_to_user((long *)arg, &size, sizeof(long)))
6883
 
+                                                       rc = -EFAULT;
6884
 
+                                       }
6885
 
+                                        break;
6886
 
+                               case BLKGETSIZE64:
6887
 
+                                       {
6888
 
+                                               u64 size_in_bytes = node->total_vsectors << EVMS_VSECTOR_SIZE_SHIFT;
6889
 
+                                               if (copy_to_user((u64 *)arg, &size_in_bytes, sizeof(u64)))
6890
 
+                                                       rc = -EFAULT;
6891
 
+                                       }
6892
 
+                                        break;
6893
 
+                                case EVMS_GET_IOCTL_VERSION:
6894
 
+                                        rc = evms_ioctl_cmd_get_ioctl_version((void *)arg);
6895
 
+                                        break;
6896
 
+                                case EVMS_GET_BMAP:
6897
 
+                                        rc = evms_ioctl_cmd_get_bmap(inode, file, cmd, arg);
6898
 
+                                        break;
6899
 
+                                default:
6900
 
+                                        rc = IOCTL(node, inode, file, cmd, arg);
6901
 
+                                        break;
6902
 
+                        }
6903
 
+                }
6904
 
+        }
6905
 
+        return rc;
6906
 
+}
6907
 
+
6908
 
+/************************************************/
6909
 
+/* END -- IOCTL main                            */
6910
 
+/************************************************/
6911
 
+
6912
 
+/************************************************/
6913
 
+/* START -- CHECK MEDIA CHANGE                 */
6914
 
+/************************************************/
6915
 
+
6916
 
+static int 
6917
 
+evms_check_media_change(kdev_t dev)
6918
 
+{
6919
 
+        int rc = 0;
6920
 
+       evms_logical_volume_t *volume = NULL;
6921
 
+
6922
 
+        /* check user access */
6923
 
+        if (!capable(CAP_SYS_ADMIN))
6924
 
+                rc = -EACCES;
6925
 
+       if (!rc) {
6926
 
+               int minor;
6927
 
+               /* get the minor */
6928
 
+               minor = MINOR(dev);
6929
 
+               /* insure this minor points to a valid volume */
6930
 
+               volume = &evms_logical_volumes[minor];
6931
 
+               if (volume->node == NULL) {
6932
 
+                       rc = -ENXIO;
6933
 
+               }
6934
 
+       }
6935
 
+       if (!rc) {
6936
 
+               if (volume->flags & EVMS_DEVICE_REMOVABLE) {
6937
 
+                       /* check for media change */
6938
 
+                       rc = evms_cs_kernel_ioctl(
6939
 
+                               volume->node, 
6940
 
+                               EVMS_CHECK_MEDIA_CHANGE, 
6941
 
+                               (unsigned long)NULL);
6942
 
+                       if (rc < 0) {
6943
 
+                               LOG_ERROR("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
6944
 
+                                         rc, volume->name);
6945
 
+                       }
6946
 
+               }
6947
 
+       }
6948
 
+        return(rc);
6949
 
+}
6950
 
+
6951
 
+/************************************************/
6952
 
+/* END -- CHECK MEDIA CHANGE                   */
6953
 
+/************************************************/
6954
 
+
6955
 
+static void
6956
 
+evms_discover_logical_disks(evms_logical_node_t **);
6957
 
+
6958
 
+static int
6959
 
+evms_check_for_device_changes(
6960
 
+       struct inode *inode,
6961
 
+       struct file *file)
6962
 
+{
6963
 
+       int rc = 0, something_changed = 0, i;
6964
 
+       evms_rediscover_t kernel_rd_pckt = {0,0,NULL};
6965
 
+       evms_list_node_t *disk_list = NULL, *lnode, *next_lnode;
6966
 
+       evms_logical_node_t *disk, *new_device_list = NULL;
6967
 
+       evms_logical_volume_t *volume = NULL;
6968
 
+
6969
 
+       /* check for new devices
6970
 
+        *
6971
 
+        * put all new devices on the disk list so they
6972
 
+        * will be included in the rediscovery process.
6973
 
+        */
6974
 
+        evms_discover_logical_disks(&new_device_list);
6975
 
+        if (new_device_list) {
6976
 
+               LOG_DETAILS("%s: new devices detected.\n", __FUNCTION__);
6977
 
+               something_changed++;
6978
 
+               /* put these new nodes on the disk list */
6979
 
+               while(new_device_list) {
6980
 
+                       disk = new_device_list;
6981
 
+                       rc = evms_cs_remove_logical_node_from_list(
6982
 
+                               &new_device_list,disk);
6983
 
+                       if (rc) {
6984
 
+                               LOG_ERROR("%s: error(%d) removing device(%s) from list.\n",
6985
 
+                                         __FUNCTION__, rc, disk->name);
6986
 
+                       }
6987
 
+                       rc = evms_cs_add_item_to_list(
6988
 
+                               &disk_list,disk);
6989
 
+                       if (rc) {
6990
 
+                               LOG_ERROR("%s: error(%d) adding device(%s) from list.\n",
6991
 
+                                         __FUNCTION__, rc, disk->name);
6992
 
+                       }
6993
 
+               }
6994
 
+       }
6995
 
+
6996
 
+       /* check all devices for changed removable media
6997
 
+        *
6998
 
+        * scan the global device list and issue check
6999
 
+        * media change on each removable media device.
7000
 
+        * put all removable devices that indicate a
7001
 
+        * media change on the disk list.
7002
 
+        */
7003
 
+       for (lnode = evms_global_device_list; lnode; lnode = lnode->next) {
7004
 
+               disk = (evms_logical_node_t *)lnode->item;
7005
 
+               /* only really check removable media devices */
7006
 
+               if (disk->flags & EVMS_DEVICE_REMOVABLE) {
7007
 
+                       /* check for media change */
7008
 
+                       rc = evms_cs_kernel_ioctl(
7009
 
+                               disk, 
7010
 
+                               EVMS_CHECK_MEDIA_CHANGE, 
7011
 
+                               (unsigned long)NULL);
7012
 
+                       if (rc < 0) {
7013
 
+                               LOG_ERROR("%s: error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
7014
 
+                                         __FUNCTION__, rc, disk->name);
7015
 
+                       } else if (rc == 1) {
7016
 
+                               something_changed++;
7017
 
+                               rc = evms_cs_add_item_to_list(
7018
 
+                                       &disk_list, disk);
7019
 
+                       }
7020
 
+               }
7021
 
+       }
7022
 
+       /* log a statement that we detected changed media.
7023
 
+        */
7024
 
+       if (disk_list) {
7025
 
+               LOG_DETAILS("%s: media change detected.\n", __FUNCTION__);
7026
 
+       }
7027
 
+
7028
 
+       /* check for volumes with removed removable media.
7029
 
+        * mark the volumes that reside on changed media.
7030
 
+        */
7031
 
+       for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7032
 
+               volume = &evms_logical_volumes[i];
7033
 
+               if (!volume->node)
7034
 
+                       continue;
7035
 
+               if (!(volume->flags & EVMS_DEVICE_REMOVABLE))
7036
 
+                       continue;
7037
 
+               if (evms_check_media_change(MKDEV(EVMS_MAJOR,i)) <= 0)
7038
 
+                       continue;
7039
 
+               /* remember which volumes have changed media */
7040
 
+               volume->flags |= EVMS_MEDIA_CHANGED;
7041
 
+               something_changed++;
7042
 
+       }
7043
 
+
7044
 
+       /* check for removed hotplug devices */
7045
 
+
7046
 
+       /* do we have some work to do? */
7047
 
+       if (something_changed) {
7048
 
+               /* check for volumes to be deleted */
7049
 
+               for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7050
 
+                       evms_quiesce_volume_t qv;
7051
 
+
7052
 
+                       volume = &evms_logical_volumes[i];
7053
 
+                       if (!volume->node)
7054
 
+                               continue;
7055
 
+                       /* only proceed on volumes with:
7056
 
+                        *  changed media,
7057
 
+                        *  hot-unplugged devices,
7058
 
+                        *  & partial volumes
7059
 
+                        */
7060
 
+                       if (!(volume->flags & 
7061
 
+                               (EVMS_MEDIA_CHANGED | 
7062
 
+                                EVMS_VOLUME_PARTIAL | 
7063
 
+                                EVMS_DEVICE_UNPLUGGED)))
7064
 
+                               continue;
7065
 
+                       /* gather the disk's needing to be
7066
 
+                        * rediscovered to rebuild this
7067
 
+                        * volume.
7068
 
+                        *
7069
 
+                        * this will locate other disks that
7070
 
+                        * the volume resides on that don't
7071
 
+                        * indicate media change.
7072
 
+                        */
7073
 
+                       rc = evms_cs_kernel_ioctl(
7074
 
+                               volume->node,
7075
 
+                               EVMS_GET_DISK_LIST,
7076
 
+                               (unsigned long)&disk_list);
7077
 
+                       if (rc) {
7078
 
+                               LOG_ERROR("%s: error(%d) retrieving underlying disk list for '%s', skipping ...\n",
7079
 
+                                         __FUNCTION__, rc, volume->name);
7080
 
+                               continue;
7081
 
+                       }
7082
 
+                       /* quiesce all the changed volumes
7083
 
+                        * prior to being deleted.
7084
 
+                        */
7085
 
+                       qv.command = 1;    // quiesce
7086
 
+                       qv.minor = i;      // 
7087
 
+                       qv.status = 0;     // reset status
7088
 
+                       qv.do_vfs = 0;
7089
 
+                       rc = evms_quiesce_volume(volume, inode, file, &qv);
7090
 
+                       if (rc) {
7091
 
+                               LOG_ERROR("%s: error(%d) attempting to quiesce '%s%s'.\n",
7092
 
+                                         __FUNCTION__, rc, 
7093
 
+                                         EVMS_DEV_NODE_PATH,
7094
 
+                                         volume->name);
7095
 
+                       }
7096
 
+               }
7097
 
+
7098
 
+               /* we need to revalidate all the changed
7099
 
+                * media. this is accomplished by issuing
7100
 
+                * the revalidate disk ioctl to each device
7101
 
+                * with changed media. the device manager
7102
 
+                * remembers which devices indicated
7103
 
+                * media changed (set by check media
7104
 
+                * changed ioctl issued earlier), and will
7105
 
+                * only issue the revalidate disk ioctl to
7106
 
+                * those disks one time.
7107
 
+                *
7108
 
+                * NOTE:
7109
 
+                * this needs to be done BEFORE deleting
7110
 
+                * the volumes because deleting the 
7111
 
+                * last segment on disk will cause the
7112
 
+                * associated disk node to freed, and we
7113
 
+                * will not be able to issue the 
7114
 
+                * revalidate disk ioctl after that.
7115
 
+                */
7116
 
+               for (lnode = disk_list; lnode; lnode = lnode->next) {
7117
 
+                       disk = (evms_logical_node_t *)lnode->item;
7118
 
+                       /* only really do removable media devices */
7119
 
+                       if (disk->flags & EVMS_MEDIA_CHANGED) {
7120
 
+                               /* go revalidate the change media */
7121
 
+                               rc = evms_cs_kernel_ioctl(
7122
 
+                                       disk,
7123
 
+                                       EVMS_REVALIDATE_DISK,
7124
 
+                                       (unsigned long)NULL);
7125
 
+                       }
7126
 
+               }
7127
 
+
7128
 
+               /* delete all the affected volumes */
7129
 
+               for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7130
 
+                       evms_delete_volume_t dv;
7131
 
+
7132
 
+                       volume = &evms_logical_volumes[i];
7133
 
+                       if (!volume->node)
7134
 
+                               continue;
7135
 
+                       /* only proceed on volumes with:
7136
 
+                        *  changed media,
7137
 
+                        *  hot-unplugged devices,
7138
 
+                        *  & partial volumes
7139
 
+                        */
7140
 
+                       if (!(volume->flags & 
7141
 
+                               (EVMS_MEDIA_CHANGED | 
7142
 
+                                EVMS_VOLUME_PARTIAL | 
7143
 
+                                EVMS_DEVICE_UNPLUGGED)))
7144
 
+                               continue;
7145
 
+                       /* only delete quiesced volumes */
7146
 
+                       if (!volume->quiesced)
7147
 
+                               continue;
7148
 
+                       /* delete the volume from memory.
7149
 
+                        * do a 'soft' delete if volume
7150
 
+                        * is mounted, and 'hard' delete
7151
 
+                        * if it is not.
7152
 
+                        *
7153
 
+                        * NOTE: the delete operation will
7154
 
+                        * clear the bits in the flags field.
7155
 
+                        */
7156
 
+                       dv.command = (is_mounted(MKDEV(EVMS_MAJOR,i))) ? 0 : 1;
7157
 
+                       dv.minor = i;
7158
 
+                       dv.status = 0;
7159
 
+                       rc = evms_delete_volume(volume, &dv);
7160
 
+               }
7161
 
+
7162
 
+               /* at this point all devices indicating
7163
 
+                * media change that had volumes on them
7164
 
+                * should be gone. however, we could still
7165
 
+                * have devices indicating media change
7166
 
+                * that had no volumes on them in the disk
7167
 
+                * list. we need to delete these devices
7168
 
+                * from kernel memory and the global device
7169
 
+                * list.
7170
 
+                */
7171
 
+               for (lnode = evms_global_device_list; lnode; lnode = next_lnode) {
7172
 
+                       next_lnode = lnode->next;
7173
 
+
7174
 
+                       disk = (evms_logical_node_t *)lnode->item;
7175
 
+                       if (disk->flags & EVMS_MEDIA_CHANGED) {
7176
 
+                               rc = DELETE(disk);
7177
 
+                       }
7178
 
+               }
7179
 
+
7180
 
+               /* all the devices that indicated media 
7181
 
+                * change should be gone, both from kernel
7182
 
+                * memory and global device list. we now
7183
 
+                * need to remove any references to these
7184
 
+                * devices from the disk list.
7185
 
+                *
7186
 
+                * when removable media is installed, it
7187
 
+                * will get detected in the device manager's
7188
 
+                * rediscovery as a new device and added to
7189
 
+                * the discover list.
7190
 
+                */
7191
 
+               for (lnode = disk_list; lnode; lnode = next_lnode) {
7192
 
+                       evms_list_node_t *glnode;
7193
 
+                       int lnode_still_there;
7194
 
+
7195
 
+                       next_lnode = lnode->next;
7196
 
+
7197
 
+                       lnode_still_there = FALSE;
7198
 
+                       for (glnode = evms_global_device_list;
7199
 
+                            glnode; glnode = glnode->next) {
7200
 
+                               if (glnode->item == lnode->item) {
7201
 
+                                       lnode_still_there = TRUE;
7202
 
+                                       break;
7203
 
+                               }
7204
 
+                       }
7205
 
+                       if (lnode_still_there == FALSE) {
7206
 
+                               rc = evms_cs_remove_item_from_list(
7207
 
+                                       &disk_list,
7208
 
+                                       lnode->item);
7209
 
+                               if (rc) {
7210
 
+                                       LOG_ERROR("%s: error(%d) attempting to remove item(%p) from disk_list(%p).\n",
7211
 
+                                                 __FUNCTION__, rc, lnode->item, &disk_list);
7212
 
+                               }
7213
 
+                       }
7214
 
+               }
7215
 
+
7216
 
+               /* build the in-kernel rediscover packet */
7217
 
+
7218
 
+               /* allocate the space for the drive_array in
7219
 
+                * the evms_rediscover_t packet. to do this
7220
 
+                * we need to count the number of disk nodes,
7221
 
+                * then allocate the necessary space.
7222
 
+                */
7223
 
+               /* count the disk nodes */
7224
 
+               for (lnode = disk_list; lnode; lnode = lnode->next)
7225
 
+                       kernel_rd_pckt.drive_count++;
7226
 
+               /* allocate the space */
7227
 
+               if (kernel_rd_pckt.drive_count) {
7228
 
+                       rc = evms_cs_allocate_memory(
7229
 
+                               (void **)&kernel_rd_pckt.drive_array,
7230
 
+                               kernel_rd_pckt.drive_count * 
7231
 
+                               sizeof(unsigned long));
7232
 
+                       if (rc) {
7233
 
+                               LOG_ERROR("%s: error(%d) allocating rediscover drive array.\n",
7234
 
+                                         __FUNCTION__, rc);
7235
 
+                       }
7236
 
+               }
7237
 
+               /* populate the drive array
7238
 
+                *
7239
 
+                * this also frees the disk_list which is useful
7240
 
+                * if we had an error allocating the drive array.
7241
 
+                */
7242
 
+               for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) {
7243
 
+                       next_lnode = lnode->next;
7244
 
+
7245
 
+                       /* remove this disk from the disk list */
7246
 
+                       disk = (evms_logical_node_t *)lnode->item;
7247
 
+                       rc = evms_cs_remove_item_from_list(&disk_list, disk);
7248
 
+                       if (!rc) {
7249
 
+                               /* add this disk to rediscover
7250
 
+                                * packet
7251
 
+                                */
7252
 
+                               kernel_rd_pckt.drive_array[i] = 
7253
 
+                                       (unsigned long)disk ^ EVMS_HANDLE_KEY;
7254
 
+                       }
7255
 
+               }
7256
 
+               /* perform the rediscovery operation */
7257
 
+               if (!rc) {
7258
 
+                       rc = evms_discover_volumes(&kernel_rd_pckt);
7259
 
+                       if (kernel_rd_pckt.drive_count) {
7260
 
+                               evms_cs_deallocate_memory(
7261
 
+                                       kernel_rd_pckt.drive_array);
7262
 
+                       }
7263
 
+               }
7264
 
+               LOG_DETAILS("%s: rediscover completed.\n", __FUNCTION__);
7265
 
+       }
7266
 
+
7267
 
+       return(rc);
7268
 
+}
7269
 
+
7270
 
+/************************************************/
7271
 
+/* START -- REVALIDATE DISK                    */
7272
 
+/************************************************/
7273
 
+
7274
 
+static int 
7275
 
+evms_revalidate_disk(kdev_t dev)
7276
 
+{
7277
 
+        int rc = 0;
7278
 
+       evms_logical_volume_t *volume = NULL;
7279
 
+
7280
 
+        /* check user access */
7281
 
+        if (!capable(CAP_SYS_ADMIN))
7282
 
+                rc = -EACCES;
7283
 
+       if (!rc) {
7284
 
+               int minor;
7285
 
+               /* get the minor */
7286
 
+               minor = MINOR(dev);
7287
 
+               /* insure this minor points to a valid volume */
7288
 
+               volume = &evms_logical_volumes[minor];
7289
 
+               if (volume->node == NULL) {
7290
 
+                       rc = -ENXIO;
7291
 
+               }
7292
 
+       }
7293
 
+       if (!rc) {
7294
 
+               /* go revalidate the change media */
7295
 
+               rc = evms_cs_kernel_ioctl(
7296
 
+                       volume->node,
7297
 
+                       EVMS_REVALIDATE_DISK,
7298
 
+                       (unsigned long)NULL);
7299
 
+       }
7300
 
+        return(rc);
7301
 
+}
7302
 
+
7303
 
+/************************************************/
7304
 
+/* END -- REVALIDATE DISK                      */
7305
 
+/************************************************/
7306
 
+
7307
 
+/************************************************/
7308
 
+/* START -- OPEN                               */
7309
 
+/************************************************/
7310
 
+
7311
 
+static int 
7312
 
+evms_open(struct inode * inode, struct file * file)
7313
 
+{
7314
 
+        int rc = 0, minor = 0;
7315
 
+       evms_logical_volume_t *volume = NULL;
7316
 
+
7317
 
+        /* check user access */
7318
 
+        if (!capable(CAP_SYS_ADMIN))
7319
 
+                rc = -EACCES;
7320
 
+       if (!rc) {
7321
 
+               if (!inode)
7322
 
+                       rc = -EINVAL;
7323
 
+       }
7324
 
+       rc = evms_check_for_device_changes(inode, file);
7325
 
+       if (!rc) {
7326
 
+               /* get the minor */
7327
 
+               minor = MINOR(inode->i_rdev);
7328
 
+               if (minor) {
7329
 
+                       /* insure this minor points to a valid volume */
7330
 
+                       volume = &evms_logical_volumes[minor];
7331
 
+                       if (volume->node == NULL) {
7332
 
+                               rc = -ENXIO;
7333
 
+                       }
7334
 
+               }
7335
 
+       }
7336
 
+       /* go "open" the volume */
7337
 
+       if (!rc && minor) {
7338
 
+               rc = IOCTL(volume->node, inode, file,
7339
 
+                          EVMS_OPEN_VOLUME,
7340
 
+                          (unsigned long)NULL);
7341
 
+               if (rc) {
7342
 
+                       LOG_ERROR("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n",
7343
 
+                                 rc, volume->name);
7344
 
+               }
7345
 
+       }
7346
 
+        return(rc);
7347
 
+}
7348
 
+
7349
 
+/************************************************/
7350
 
+/* END -- OPEN                                 */
7351
 
+/************************************************/
7352
 
+
7353
 
+/************************************************/
7354
 
+/* START -- RELEASE                            */
7355
 
+/************************************************/
7356
 
+
7357
 
+static int 
7358
 
+evms_release(struct inode * inode, struct file * file)
7359
 
+{
7360
 
+        int rc = 0, minor = 0;
7361
 
+       evms_logical_volume_t *volume = NULL;
7362
 
+
7363
 
+        /* check user access */
7364
 
+        if (!capable(CAP_SYS_ADMIN))
7365
 
+                rc = -EACCES;
7366
 
+       if (!rc) {
7367
 
+               if (!inode)
7368
 
+                       rc = -EINVAL;
7369
 
+       }
7370
 
+       if (!rc) {
7371
 
+               /* get the minor */
7372
 
+               minor = MINOR(inode->i_rdev);
7373
 
+               if (minor) {
7374
 
+                       /* insure this minor points to a valid volume */
7375
 
+                       volume = &evms_logical_volumes[minor];
7376
 
+                       if (volume->node == NULL) {
7377
 
+                               rc = -ENXIO;
7378
 
+                       }
7379
 
+               }
7380
 
+       }
7381
 
+       /* go "close" the volume */
7382
 
+       if (!rc && minor) {
7383
 
+               rc = IOCTL(volume->node, inode, file,
7384
 
+                          EVMS_CLOSE_VOLUME,
7385
 
+                          (unsigned long)NULL);
7386
 
+               if (rc) {
7387
 
+                       LOG_ERROR("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n",
7388
 
+                                 rc, volume->name);
7389
 
+               } 
7390
 
+       }
7391
 
+        return(rc);
7392
 
+}
7393
 
+
7394
 
+/************************************************/
7395
 
+/* END -- RELEASE                              */
7396
 
+/************************************************/
7397
 
+
7398
 
+struct block_device_operations evms_fops = {
7399
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,14)
7400
 
+       owner:                  THIS_MODULE,
7401
 
+#endif
7402
 
+        open:                   evms_open,
7403
 
+        release:                evms_release,
7404
 
+        ioctl:                  evms_ioctl,
7405
 
+        check_media_change:     evms_check_media_change,
7406
 
+        revalidate:             evms_revalidate_disk
7407
 
+};
7408
 
+
7409
 
+/**********************************************************/
7410
 
+/* END -- FOPS functions definitions                      */
7411
 
+/**********************************************************/
7412
 
+
7413
 
+/**********************************************************/
7414
 
+/* START -- RUNTIME support functions                     */
7415
 
+/**********************************************************/
7416
 
+
7417
 
+static void 
7418
 
+evms_do_request_fn(request_queue_t *q) {
7419
 
+       LOG_WARNING("This function should not be called.\n");        
7420
 
+}
7421
 
+
7422
 
+#ifdef CONFIG_SMP
7423
 
+static request_queue_t *
7424
 
+evms_find_queue(kdev_t dev)
7425
 
+{
7426
 
+       request_queue_t *rq = NULL;
7427
 
+       evms_logical_volume_t *volume;
7428
 
+
7429
 
+       volume = &evms_logical_volumes[MINOR(dev)];
7430
 
+       if (volume->node)
7431
 
+               rq = &volume->request_queue;
7432
 
+       return (rq);
7433
 
+}
7434
 
+#endif
7435
 
+
7436
 
+/*
7437
 
+ * Function:    evms_make_request_fn
7438
 
+ *
7439
 
+ */
7440
 
+static int 
7441
 
+evms_make_request_fn(
7442
 
+       request_queue_t *q, 
7443
 
+       int rw, 
7444
 
+       struct buffer_head *bh)
7445
 
+{
7446
 
+        evms_logical_volume_t *volume;
7447
 
+       eio_t eio;
7448
 
+        
7449
 
+       eio.rsector = bh->b_rsector;
7450
 
+       eio.rsize = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
7451
 
+       eio.bh = bh;
7452
 
+         
7453
 
+        volume = &evms_logical_volumes[MINOR(bh->b_dev)];
7454
 
+       wait_event(volume->wait_queue, (!volume->quiesced));
7455
 
+       if (volume->node) {
7456
 
+               switch (rw) {
7457
 
+                       case READ:
7458
 
+                       case READA:
7459
 
+                               atomic_inc(&volume->requests_in_progress);
7460
 
+                               R_IO(volume->node, &eio);
7461
 
+                               atomic_dec(&volume->requests_in_progress);
7462
 
+                               return 0;
7463
 
+                       case WRITE:
7464
 
+                               atomic_inc(&volume->requests_in_progress);
7465
 
+                               W_IO(volume->node, &eio);
7466
 
+                               atomic_dec(&volume->requests_in_progress);
7467
 
+                               return 0;
7468
 
+                       default:
7469
 
+                               buffer_IO_error(bh);
7470
 
+                               return 0;
7471
 
+               }
7472
 
+       } else {
7473
 
+               LOG_ERROR("request for unknown logical volume [minor(%d)].\n",
7474
 
+                         bh->b_dev);
7475
 
+               buffer_IO_error(bh);
7476
 
+       }
7477
 
+        return 0;
7478
 
+}
7479
 
+
7480
 
+/**********************************************************/
7481
 
+/* END -- RUNTIME support functions                       */
7482
 
+/**********************************************************/
7483
 
+
7484
 
+/**********************************************************/
7485
 
+/* START -- INIT/DISCOVERY support functions              */
7486
 
+/**********************************************************/
7487
 
+
7488
 
+/*
7489
 
+ * Function:     evms_discover_logical_disks
7490
 
+ * Description: Construct the logical disk list by calling all registered device managers.
7491
 
+ */
7492
 
+static void 
7493
 
+evms_discover_logical_disks(evms_logical_node_t **disk_list)
7494
 
+{
7495
 
+        evms_registered_plugin_t * p;
7496
 
+       LOG_EXTRA("discovering logical disks...\n");
7497
 
+        for (p = registered_plugin_head; p; p = p->next) {
7498
 
+                if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) {
7499
 
+                        DISCOVER(p, disk_list);
7500
 
+                }
7501
 
+        }
7502
 
+}
7503
 
+
7504
 
+/*
7505
 
+ * Function:     evms_discover_logical_partitions
7506
 
+ * Description: Construct the logical partition list by calling all registered partition managers.
7507
 
+ */
7508
 
+static void 
7509
 
+evms_discover_logical_partitions(evms_logical_node_t **discover_list)
7510
 
+{
7511
 
+       int rc, done;
7512
 
+
7513
 
+        evms_registered_plugin_t * p;
7514
 
+       LOG_EXTRA("discovering logical partitions...\n");
7515
 
+       do {
7516
 
+               done = TRUE;
7517
 
+               for (p = registered_plugin_head; p; p = p->next) {
7518
 
+                       if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER) {
7519
 
+                               rc = DISCOVER(p, discover_list);
7520
 
+                               /* RC > 0 means the plugin
7521
 
+                                * added something to the
7522
 
+                                * discover list. This also
7523
 
+                                * means we must loop thru
7524
 
+                                * these plugins another time.
7525
 
+                                * RC == 0 means nothing was
7526
 
+                                * added to the discover list
7527
 
+                                * by this plugin.
7528
 
+                                * RC < 0 means the plugin
7529
 
+                                * encountered some error and
7530
 
+                                * nothing was added to the list.
7531
 
+                                * NOTE: If a plugin has both
7532
 
+                                * added something new to the
7533
 
+                                * discover list and encountered
7534
 
+                                * an error, RC > 0 must be
7535
 
+                                * returned.
7536
 
+                                */
7537
 
+                               if (rc > 0)
7538
 
+                                       done = FALSE;
7539
 
+                       }
7540
 
+               }
7541
 
+       } while (done == FALSE);
7542
 
+
7543
 
+       /* send the end of discovery signal to each 
7544
 
+        * partition manager plugin.
7545
 
+        */
7546
 
+       for (p = registered_plugin_head; p; p = p->next) 
7547
 
+               if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER)
7548
 
+                       if (p->plugin->function_table->end_discover)
7549
 
+                               rc = END_DISCOVER(p, discover_list);
7550
 
+}
7551
 
+
7552
 
+/*
7553
 
+ * Function:     evms_discover_volume_groups
7554
 
+ * Description: Find volume groups within the logical partitions list
7555
 
+ */
7556
 
+static void 
7557
 
+evms_discover_volume_groups(evms_logical_node_t **discover_list)
7558
 
+{
7559
 
+       int rc, done;
7560
 
+
7561
 
+        evms_registered_plugin_t * p;
7562
 
+       LOG_EXTRA("discovering logical volume groups...\n");
7563
 
+       do {
7564
 
+               done = TRUE;
7565
 
+               for (p = registered_plugin_head; p; p = p->next) {
7566
 
+                       if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) {
7567
 
+                               rc = DISCOVER(p, discover_list);
7568
 
+                               /* RC > 0 means the plugin
7569
 
+                                * added something to the
7570
 
+                                * discover list. This also
7571
 
+                                * means we must loop thru
7572
 
+                                * these plugins another time.
7573
 
+                                * RC == 0 means nothing was
7574
 
+                                * added to the discover list
7575
 
+                                * by this plugin.
7576
 
+                                * RC < 0 means the plugin
7577
 
+                                * encountered some error and
7578
 
+                                * nothing was added to the list.
7579
 
+                                * NOTE: If a plugin has both
7580
 
+                                * added something new to the
7581
 
+                                * discover list and encountered
7582
 
+                                * an error, RC > 0 must be
7583
 
+                                * returned.
7584
 
+                                */
7585
 
+                               if (rc > 0)
7586
 
+                                       done = FALSE;
7587
 
+                       }
7588
 
+               }
7589
 
+       } while (done == FALSE);
7590
 
+
7591
 
+       /* send the end of discovery signal to each volume
7592
 
+        * group plugin.
7593
 
+        */
7594
 
+       for (p = registered_plugin_head; p; p = p->next) 
7595
 
+               if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER)
7596
 
+                       if (p->plugin->function_table->end_discover)
7597
 
+                               rc = END_DISCOVER(p, discover_list);
7598
 
+}
7599
 
+
7600
 
+/* 
7601
 
+ *
7602
 
+ * convert all the feature header fields into cpu native format
7603
 
+ * from the on-disk Little Endian format. From this point forward
7604
 
+ * all plugins can deal with feature headers natively.
7605
 
+ */
7606
 
+void
7607
 
+le_feature_header_to_cpu(evms_feature_header_t *fh)
7608
 
+{
7609
 
+       fh->signature = le32_to_cpu(fh->signature);
7610
 
+       fh->crc = le32_to_cpu(fh->crc);
7611
 
+       fh->version.major = le32_to_cpu(fh->version.major);
7612
 
+       fh->version.minor = le32_to_cpu(fh->version.minor);
7613
 
+       fh->version.patchlevel = le32_to_cpu(fh->version.patchlevel);
7614
 
+       fh->engine_version.major = le32_to_cpu(fh->engine_version.major);
7615
 
+       fh->engine_version.minor = le32_to_cpu(fh->engine_version.minor);
7616
 
+       fh->engine_version.patchlevel = le32_to_cpu(fh->engine_version.patchlevel);
7617
 
+       fh->flags = le32_to_cpu(fh->flags);
7618
 
+       fh->feature_id = le32_to_cpu(fh->feature_id);
7619
 
+       fh->sequence_number = le64_to_cpu(fh->sequence_number);
7620
 
+       fh->alignment_padding = le64_to_cpu(fh->alignment_padding);
7621
 
+       fh->feature_data1_start_lsn = le64_to_cpu(fh->feature_data1_start_lsn);
7622
 
+       fh->feature_data1_size = le64_to_cpu(fh->feature_data1_size);
7623
 
+       fh->feature_data2_start_lsn = le64_to_cpu(fh->feature_data2_start_lsn);
7624
 
+       fh->feature_data2_size = le64_to_cpu(fh->feature_data2_size);
7625
 
+       fh->volume_serial_number = le64_to_cpu(fh->volume_serial_number);
7626
 
+       fh->volume_system_id = le32_to_cpu(fh->volume_system_id);
7627
 
+       fh->object_depth = le32_to_cpu(fh->object_depth);
7628
 
+}
7629
 
+
7630
 
+static int 
7631
 
+edef_load_feature_header(evms_logical_node_t *node)
7632
 
+{
7633
 
+        int i, rc = 0, rc_array[2] = {0,0};
7634
 
+       unsigned long size_in_bytes;
7635
 
+       u_int64_t size_in_sectors, starting_sector = 0;
7636
 
+        evms_feature_header_t *fh = NULL, *fh1 = NULL, *fh2 = NULL;
7637
 
+       char *location_name = NULL;
7638
 
+       evms_version_t version = {
7639
 
+               EVMS_FEATURE_HEADER_MAJOR,
7640
 
+               EVMS_FEATURE_HEADER_MINOR,
7641
 
+               EVMS_FEATURE_HEADER_PATCHLEVEL
7642
 
+       };
7643
 
+
7644
 
+        if (!node->feature_header) {
7645
 
+               size_in_sectors = evms_cs_size_in_vsectors(sizeof(*fh));
7646
 
+               size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT;
7647
 
+               rc = evms_cs_allocate_memory((void **)&fh1,size_in_bytes);
7648
 
+                if (!rc) {
7649
 
+                       rc = evms_cs_allocate_memory((void **)&fh2,size_in_bytes);
7650
 
+                       if (rc)
7651
 
+                               evms_cs_deallocate_memory(fh1);
7652
 
+               }
7653
 
+               for (i = 0; i < 2; i++) {
7654
 
+                       if (i == 0) {
7655
 
+                               starting_sector = 
7656
 
+                                       node->total_vsectors - 
7657
 
+                                       size_in_sectors;
7658
 
+                               fh = fh1;
7659
 
+                               location_name = evms_primary_string;
7660
 
+                       } else {
7661
 
+                               starting_sector--;
7662
 
+                               fh = fh2;
7663
 
+                               location_name = evms_secondary_string;
7664
 
+                       }
7665
 
+                        /* read header into buffer */
7666
 
+                        rc = INIT_IO(
7667
 
+                                node, 
7668
 
+                                0,
7669
 
+                                starting_sector, 
7670
 
+                               size_in_sectors, 
7671
 
+                                fh);
7672
 
+                       if (rc) {
7673
 
+                               LOG_ERROR("error(%d) probing for %s feature header(at %Ld) on '%s'.\n",
7674
 
+                                         rc, 
7675
 
+                                         location_name,
7676
 
+                                         starting_sector,
7677
 
+                                         node->name);
7678
 
+                               rc_array[i] = rc;
7679
 
+                               continue;
7680
 
+                       }
7681
 
+                        /* validate header signature */
7682
 
+                        if (cpu_to_le32(fh->signature) != EVMS_FEATURE_HEADER_SIGNATURE) {
7683
 
+                                rc = -ENODATA;
7684
 
+                               rc_array[i] = rc;
7685
 
+                               continue;
7686
 
+                       }
7687
 
+                        /* validate header CRC */
7688
 
+                        if (fh->crc != EVMS_MAGIC_CRC) {
7689
 
+                                u_int32_t org_crc, final_crc;
7690
 
+                                org_crc = cpu_to_le32(fh->crc);
7691
 
+                                fh->crc = 0;
7692
 
+                                final_crc = evms_cs_calculate_crc(
7693
 
+                                        EVMS_INITIAL_CRC,
7694
 
+                                        fh, sizeof(*fh));
7695
 
+                                if (final_crc != org_crc) {
7696
 
+                                       LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at %Ld) on '%s'.\n",
7697
 
+                                               org_crc, final_crc, 
7698
 
+                                               location_name,
7699
 
+                                               starting_sector,
7700
 
+                                               node->name);
7701
 
+                                        rc = -EINVAL;
7702
 
+                                       rc_array[i] = rc;
7703
 
+                                       continue;
7704
 
+                                }
7705
 
+                        } else {
7706
 
+                               LOG_WARNING("CRC disabled in %s feature header(at %Ld) on '%s'.\n",
7707
 
+                                       location_name,
7708
 
+                                       starting_sector,
7709
 
+                                       node->name);
7710
 
+                       }
7711
 
+                       /* convert the feature header from the
7712
 
+                        * on-disk format (Little Endian) to
7713
 
+                        * native cpu format.
7714
 
+                        */
7715
 
+                       le_feature_header_to_cpu(fh);
7716
 
+                       /* verify the system data version */
7717
 
+                       rc = evms_cs_check_version(
7718
 
+                               &version, 
7719
 
+                               &fh->version);
7720
 
+                       if (rc) {
7721
 
+                               LOG_ERROR("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n",
7722
 
+                                         fh->version.major,
7723
 
+                                         fh->version.minor,
7724
 
+                                         fh->version.patchlevel,
7725
 
+                                         location_name,
7726
 
+                                         node->name);
7727
 
+                               rc_array[i] = rc;
7728
 
+                       }
7729
 
+               }
7730
 
+
7731
 
+               /* getting same return code for both copies? */
7732
 
+               if (rc_array[0] == rc_array[1]) {
7733
 
+                       rc = rc_array[0];
7734
 
+                       /* if no errors on both copies,
7735
 
+                        * check the sequence numbers.
7736
 
+                        * use the highest sequence number.
7737
 
+                        */
7738
 
+                       if (!rc) {
7739
 
+                               /* compare sequence numbers */
7740
 
+                               if (fh1->sequence_number == fh2->sequence_number) {
7741
 
+                                       fh = fh1;
7742
 
+                               } else {
7743
 
+                                       LOG_WARNING("%s feature header sequence number(%Ld) mismatches %s feature header sequence number(%Ld) on '%s'!\n",
7744
 
+                                                  evms_primary_string,
7745
 
+                                                  fh1->sequence_number,
7746
 
+                                                  evms_secondary_string,
7747
 
+                                                  fh2->sequence_number,
7748
 
+                                                  node->name);
7749
 
+                                       if (fh1->sequence_number > fh2->sequence_number) {
7750
 
+                                               fh = fh1;
7751
 
+                                               location_name = evms_primary_string;
7752
 
+                                               /* indicate bad sequence number of secondary */
7753
 
+                                               rc_array[1] = -1;
7754
 
+                                       } else {
7755
 
+                                               fh = fh2;
7756
 
+                                               location_name = evms_secondary_string;
7757
 
+                                               /* indicate bad sequence number of primary */
7758
 
+                                               rc_array[0] = -1;
7759
 
+                                       }
7760
 
+                               }
7761
 
+                       }
7762
 
+               /* getting different return codes for each copy */
7763
 
+               } else 
7764
 
+                       /* either primary or secondary copy is
7765
 
+                        * valid, so use the valid copy.
7766
 
+                        */
7767
 
+                       if ((rc_array[0] == 0) ||
7768
 
+                           (rc_array[1] == 0)) {
7769
 
+                       char *warn_name = NULL;
7770
 
+
7771
 
+                       /* indicate success */
7772
 
+                       rc = 0;
7773
 
+                       /* set variables based on which copy is valid */
7774
 
+                       if (rc_array[0] == 0) {
7775
 
+                               /* use primary (rear) copy if its good */
7776
 
+                               fh = fh1;
7777
 
+                               location_name = evms_primary_string;
7778
 
+                               warn_name = evms_secondary_string;
7779
 
+                       } else {
7780
 
+                               /* use secondary (front) copy if its good */
7781
 
+                               fh = fh2;
7782
 
+                               location_name = evms_secondary_string;
7783
 
+                               warn_name = evms_primary_string;
7784
 
+                       }
7785
 
+                       /* warn the user about the invalid copy */
7786
 
+                       LOG_WARNING("warning: error(%d) probing/verifying the %s feature header on '%s'.\n",
7787
 
+                                 rc_array[0] + rc_array[1], 
7788
 
+                                 warn_name,
7789
 
+                                 node->name);
7790
 
+               } else 
7791
 
+                       /* both copies had a different error,
7792
 
+                        * and one was a fatal error, so
7793
 
+                        * indicate fatal error.
7794
 
+                        */
7795
 
+                       if ((rc_array[0] == -EINVAL) || 
7796
 
+                          (rc_array[1] == -EINVAL)) {
7797
 
+                       rc = -EINVAL;
7798
 
+               }
7799
 
+
7800
 
+               /* on error, set fh to NULL */
7801
 
+               if (rc) fh = NULL;
7802
 
+
7803
 
+               /* deallocate metadata buffers appropriately */
7804
 
+               if (fh != fh1)
7805
 
+                       evms_cs_deallocate_memory(fh1);
7806
 
+               if (fh != fh2)
7807
 
+                       evms_cs_deallocate_memory(fh2);
7808
 
+
7809
 
+               /* save validated feature header pointer */
7810
 
+               if (!rc) {
7811
 
+                       node->feature_header = fh;
7812
 
+                       if (rc_array[0] != rc_array[1]) {
7813
 
+                               LOG_DETAILS("using %s feature header on '%s'.\n",
7814
 
+                                       location_name,
7815
 
+                                       node->name);
7816
 
+                       }
7817
 
+               }
7818
 
+               
7819
 
+                /* if no signature found, adjust return code */
7820
 
+                if (rc == -ENODATA) {
7821
 
+                        rc = 0;
7822
 
+                       LOG_DEBUG("no feature header found on '%s'.\n",
7823
 
+                               node->name);
7824
 
+               }
7825
 
+        }
7826
 
+        return(rc);
7827
 
+}
7828
 
+
7829
 
+static int 
7830
 
+edef_find_first_features(evms_logical_node_t **discover_list)
7831
 
+{
7832
 
+       int rc;
7833
 
+       evms_logical_node_t *node, *tmp_list_head;
7834
 
+
7835
 
+       tmp_list_head = *discover_list;
7836
 
+       *discover_list = NULL;
7837
 
+
7838
 
+       while(tmp_list_head) {
7839
 
+               node = tmp_list_head;
7840
 
+               rc = evms_cs_remove_logical_node_from_list(
7841
 
+                       &tmp_list_head,
7842
 
+                       node);
7843
 
+               if (rc) BUG();
7844
 
+               /* load the feature header if present */
7845
 
+               rc = edef_load_feature_header(node);
7846
 
+               /* This node have a feature header ?
7847
 
+                * it won't be if there is no header to load
7848
 
+                * OR
7849
 
+                * there was a fatal error attempting to read it.
7850
 
+                */
7851
 
+               if (node->feature_header) {
7852
 
+                       /* check for object flag */
7853
 
+                       if (node->feature_header->flags &
7854
 
+                           EVMS_VOLUME_DATA_OBJECT) {
7855
 
+                               LOG_DEFAULT("object detected, deleting '%s'.\n",
7856
 
+                                         node->name);
7857
 
+                               rc = -EINVAL;
7858
 
+                       } else
7859
 
+                       /* check for stop-data flag */
7860
 
+                               if (node->feature_header->flags &
7861
 
+                                   EVMS_VOLUME_DATA_STOP) {
7862
 
+                               LOG_DEFAULT("stop data detected, deleting '%s'.\n",
7863
 
+                                         node->name);
7864
 
+                               rc = -EINVAL;
7865
 
+                       } else {
7866
 
+                       /* register node on global list */
7867
 
+                               evms_list_node_t **evms_node;
7868
 
+
7869
 
+                               /* check for duplicate pointers */
7870
 
+                               /* search for node in global list */
7871
 
+                               evms_node = evms_lookup_item_in_list(
7872
 
+                                       &evms_global_feature_node_list,
7873
 
+                                       node);
7874
 
+                               /* already present? */
7875
 
+                               if (*evms_node) {
7876
 
+                                       /* yes, already present */
7877
 
+                                       rc = -ENODATA;  /* dont process this node further */
7878
 
+                                       LOG_DEFAULT("deleting duplicate reference to '%s'.\n",
7879
 
+                                                  node->name);
7880
 
+                                       /* forget this node */
7881
 
+                                       node = NULL; 
7882
 
+                               } else {
7883
 
+                                       /* no, not present.
7884
 
+                                        * add it to the list.
7885
 
+                                        */
7886
 
+                                       node->flags |= EVMS_VOLUME_FLAG;
7887
 
+                                       node->iflags |= EVMS_FEATURE_BOTTOM;
7888
 
+                                       rc = evms_cs_allocate_memory(
7889
 
+                                               (void **)&node->volume_info,
7890
 
+                                               sizeof(evms_volume_info_t));
7891
 
+                                       if (!rc) {
7892
 
+                                               node->volume_info->volume_serial_number =
7893
 
+                                                       node->feature_header->volume_serial_number;
7894
 
+                                               node->volume_info->volume_system_id =
7895
 
+                                                       node->feature_header->volume_system_id;
7896
 
+                                               strcpy(node->volume_info->volume_name,
7897
 
+                                                      node->feature_header->volume_name);
7898
 
+                                               rc = evms_cs_add_item_to_list(
7899
 
+                                                       &evms_global_feature_node_list,
7900
 
+                                                       node);
7901
 
+                                       }
7902
 
+                               }
7903
 
+                       }
7904
 
+               }
7905
 
+               /* if any errors, delete the node */
7906
 
+               if (rc) {
7907
 
+                       if (node)
7908
 
+                               DELETE(node);
7909
 
+               } else 
7910
 
+                       /* on successful processing of this node
7911
 
+                        * place it back on the discover list.
7912
 
+                        */
7913
 
+                       evms_cs_add_logical_node_to_list(
7914
 
+                               discover_list,
7915
 
+                               node);
7916
 
+       }
7917
 
+       return(0);
7918
 
+}
7919
 
+
7920
 
+/* These define describe the node types that can be isolated. */
7921
 
+#define ISOLATE_ASSOCIATIVE_FEATURES           0
7922
 
+#define ISOLATE_COMPATIBILITY_VOLUMES          1
7923
 
+#define ISOLATE_EVMS_VOLUMES                   2
7924
 
+#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER      3
7925
 
+#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH        4
7926
 
+static int 
7927
 
+edef_isolate_nodes_by_type(
7928
 
+       unsigned int type,
7929
 
+       evms_logical_node_t **src_list,
7930
 
+        evms_logical_node_t **trg_list,
7931
 
+       u_int32_t compare32,
7932
 
+        u_int64_t compare64)
7933
 
+{
7934
 
+        evms_logical_node_t *node, *next_node;
7935
 
+        int rc = 0, found_node;
7936
 
+       evms_feature_header_t *fh = NULL;
7937
 
+
7938
 
+       for (node = *src_list; node; node = next_node) {
7939
 
+               next_node = node->next;
7940
 
+
7941
 
+               if (node->feature_header)
7942
 
+                       fh = node->feature_header;
7943
 
+                found_node = FALSE;
7944
 
+                switch(type) {
7945
 
+                        case ISOLATE_ASSOCIATIVE_FEATURES:
7946
 
+                                if (fh) {
7947
 
+                                        if (GetPluginType(fh->feature_id) == 
7948
 
+                                            EVMS_ASSOCIATIVE_FEATURE)
7949
 
+                                                found_node = TRUE;
7950
 
+                                }
7951
 
+                                break;
7952
 
+                       case ISOLATE_COMPATIBILITY_VOLUMES:
7953
 
+                                if (!(node->flags & EVMS_VOLUME_FLAG))
7954
 
+                                        found_node = TRUE;
7955
 
+                                break;
7956
 
+                       case ISOLATE_EVMS_VOLUMES:
7957
 
+                                if (node->flags & EVMS_VOLUME_FLAG)
7958
 
+                                        found_node = TRUE;
7959
 
+                                break;
7960
 
+                       /* EVMS volumes with same serial # */
7961
 
+                       case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER:
7962
 
+                                if (node->volume_info->volume_serial_number == compare64)
7963
 
+                                        found_node = TRUE;
7964
 
+                                break;
7965
 
+                       case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH:
7966
 
+                               if (fh)
7967
 
+                                       if (fh->object_depth == compare64)
7968
 
+                                               if (fh->feature_id == compare32)
7969
 
+                                                       found_node = TRUE;
7970
 
+                                break;
7971
 
+                }
7972
 
+                if (found_node == TRUE) {
7973
 
+                        rc = evms_cs_remove_logical_node_from_list(src_list, node);
7974
 
+                        if (rc) break;
7975
 
+                        rc = evms_cs_add_logical_node_to_list(trg_list, node);
7976
 
+                        if (rc) break;
7977
 
+                } 
7978
 
+        }
7979
 
+        return(rc);
7980
 
+}
7981
 
+
7982
 
+static int 
7983
 
+edef_apply_feature(
7984
 
+       evms_logical_node_t *node, 
7985
 
+       evms_logical_node_t **volume_node_list)
7986
 
+{
7987
 
+        evms_registered_plugin_t * p;
7988
 
+        int rc = -1;
7989
 
+
7990
 
+        for (p = registered_plugin_head; p; p = p->next) {
7991
 
+                if (p->plugin->id == 
7992
 
+                    node->feature_header->feature_id) {
7993
 
+                        rc = DISCOVER(p, volume_node_list);
7994
 
+                        break;
7995
 
+                }
7996
 
+        }
7997
 
+        return(rc);
7998
 
+}
7999
 
+
8000
 
+static int 
8001
 
+edef_get_feature_plugin_header(
8002
 
+        u_int32_t id, 
8003
 
+       evms_plugin_header_t **header)
8004
 
+{
8005
 
+        int rc = -ENOPKG;
8006
 
+        evms_registered_plugin_t *p;
8007
 
+        
8008
 
+        for (p = registered_plugin_head; p; p = p->next) {
8009
 
+                if (p->plugin->id == id) {
8010
 
+                        *header = p->plugin;
8011
 
+                        rc = 0;
8012
 
+                        break;
8013
 
+                }
8014
 
+        }
8015
 
+        if (rc) {
8016
 
+                LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n", id);
8017
 
+        }
8018
 
+        return(rc);
8019
 
+}
8020
 
+
8021
 
+typedef struct evms_volume_build_info_s {
8022
 
+       int node_count; 
8023
 
+       int feature_header_count; 
8024
 
+       int feature_count;
8025
 
+       int associative_feature_count;
8026
 
+       u_int64_t max_depth;
8027
 
+       evms_plugin_header_t *plugin;
8028
 
+       evms_logical_node_t *feature_node_list;
8029
 
+} evms_volume_build_info_t;
8030
 
+
8031
 
+/* 
8032
 
+ * edef_evaluate_volume_node_list:
8033
 
+ *   does:
8034
 
+ *     1) put all nodes from feature list back on volume list
8035
 
+ *      2) loads the node's feature headers
8036
 
+ *      3) counts the node list's entries
8037
 
+ *      4) builds the feature node list
8038
 
+ *     5) counts the feature headers for associative features
8039
 
+ *     6) sets feature count to >1 if >1 features to be processed
8040
 
+ */
8041
 
+static int 
8042
 
+edef_evaluate_volume_node_list(
8043
 
+       evms_logical_node_t **volume_node_list,
8044
 
+       evms_volume_build_info_t *vbi,
8045
 
+       int volume_complete)
8046
 
+{
8047
 
+        int rc;
8048
 
+        evms_logical_node_t *node;
8049
 
+
8050
 
+        vbi->node_count = 
8051
 
+               vbi->feature_count = 
8052
 
+               vbi->associative_feature_count = 
8053
 
+               vbi->max_depth = 0;
8054
 
+               vbi->plugin = NULL;
8055
 
+
8056
 
+       /* put all feature nodes back on the volume list */
8057
 
+       rc = edef_isolate_nodes_by_type(
8058
 
+               ISOLATE_EVMS_VOLUMES, 
8059
 
+               &vbi->feature_node_list,
8060
 
+               volume_node_list, 
8061
 
+               0,0);
8062
 
+       if (rc) return(rc);
8063
 
+
8064
 
+       /* load all the feature headers */
8065
 
+       if (!volume_complete) {
8066
 
+               for(node = *volume_node_list; node; node = node->next) {
8067
 
+                       rc = edef_load_feature_header(node);
8068
 
+                       if (rc) return(rc);
8069
 
+               }
8070
 
+       }
8071
 
+
8072
 
+       /* find the 1st max depth object:
8073
 
+        *   record the depth
8074
 
+        *   record the plugin
8075
 
+        */
8076
 
+       for(node = *volume_node_list; node; node = node->next) {
8077
 
+               evms_plugin_header_t *plugin;
8078
 
+               evms_feature_header_t *fh = node->feature_header;
8079
 
+
8080
 
+                /* count the nodes */
8081
 
+                vbi->node_count++;
8082
 
+
8083
 
+               /* no feature header found, continue to next node */
8084
 
+               if (!fh) continue;
8085
 
+
8086
 
+               /* check the depth */
8087
 
+               if (fh->object_depth > vbi->max_depth) {
8088
 
+                       /* record new max depth */
8089
 
+                       vbi->max_depth = fh->object_depth;
8090
 
+                       /* find the plugin header for this feature id */
8091
 
+                       rc = edef_get_feature_plugin_header(
8092
 
+                               fh->feature_id,
8093
 
+                               &plugin);
8094
 
+                       if (rc) return(rc);
8095
 
+                       /* check for >1 plugins */
8096
 
+                       if (vbi->plugin != plugin) {
8097
 
+                               vbi->feature_count++;
8098
 
+                               vbi->plugin = plugin;
8099
 
+                       }
8100
 
+               }
8101
 
+               /* check for "associative" feature indicator */
8102
 
+               if (GetPluginType(vbi->plugin->id) ==
8103
 
+                   EVMS_ASSOCIATIVE_FEATURE)
8104
 
+                       vbi->associative_feature_count++;
8105
 
+       }
8106
 
+       /* build a list of max depth nodes for this feature */
8107
 
+       if (vbi->max_depth) {
8108
 
+               rc = edef_isolate_nodes_by_type(
8109
 
+                       ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH, 
8110
 
+                       volume_node_list, 
8111
 
+                       &vbi->feature_node_list,
8112
 
+                       vbi->plugin->id,
8113
 
+                       vbi->max_depth);
8114
 
+               if (rc) return(rc);
8115
 
+               if (!vbi->plugin) 
8116
 
+                       return(-ENODATA);
8117
 
+               if (!vbi->feature_node_list) 
8118
 
+                       return(-ENODATA);
8119
 
+       }
8120
 
+
8121
 
+        return(rc);
8122
 
+}
8123
 
+
8124
 
+/* function: edef_check_feature_conditions
8125
 
+ *
8126
 
+ * This routine verifies the state of volume based on the features
8127
 
+ * headers and nodes in the current discovery list. All detected 
8128
 
+ * errors are considered fatal.
8129
 
+ */
8130
 
+static int 
8131
 
+edef_check_feature_conditions(evms_volume_build_info_t *vbi)
8132
 
+{
8133
 
+        int rc = 0;
8134
 
+
8135
 
+        if (vbi->associative_feature_count) {
8136
 
+                if (vbi->node_count > 1) {
8137
 
+                        rc = -EVMS_VOLUME_FATAL_ERROR;
8138
 
+                       LOG_ERROR("associative ERROR: > 1 nodes(%d) remaining to be processed!\n",
8139
 
+                               vbi->node_count);
8140
 
+                } else if (vbi->max_depth != 1) {
8141
 
+                        rc = -EVMS_VOLUME_FATAL_ERROR;
8142
 
+                       LOG_ERROR("associative ERROR: associative feature found at node depth(%Ld) != 1!\n",
8143
 
+                                vbi->max_depth);
8144
 
+                } else
8145
 
+                        rc = -EVMS_ASSOCIATIVE_FEATURE;
8146
 
+        }
8147
 
+        if (!rc) {
8148
 
+                if (!vbi->max_depth) {
8149
 
+                       if (vbi->node_count > 1) {
8150
 
+                                rc = -EVMS_VOLUME_FATAL_ERROR;
8151
 
+                               LOG_ERROR("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n",
8152
 
+                                       vbi->node_count);
8153
 
+                       }
8154
 
+               } else if (vbi->max_depth == 1) {
8155
 
+                       if (vbi->feature_count > 1) {
8156
 
+                                rc = -EVMS_VOLUME_FATAL_ERROR;
8157
 
+                               LOG_ERROR("max depth 1 ERROR: > 1 features remaining to be processed!\n");
8158
 
+                       }
8159
 
+               }
8160
 
+       }
8161
 
+        return(rc);
8162
 
+}
8163
 
+
8164
 
+/* function: edef_apply_features
8165
 
+ *
8166
 
+ * This routine applies none, one, or more features to an EVMS
8167
 
+ * volume. The system data structure is first verified and then
8168
 
+ * features are applied and verified recursively until the
8169
 
+ * entire volume has been constructed. Fatal errors result in
8170
 
+ * all nodes in the volume discovery list being deleted.
8171
 
+ */
8172
 
+static int 
8173
 
+edef_apply_features(evms_logical_node_t **volume_node_list)
8174
 
+{
8175
 
+        int rc = 1, done, top_feature_applying;
8176
 
+        evms_volume_build_info_t vbi;
8177
 
+        
8178
 
+       vbi.feature_node_list = NULL;
8179
 
+        rc = edef_evaluate_volume_node_list(
8180
 
+               volume_node_list, 
8181
 
+               &vbi, FALSE);
8182
 
+        
8183
 
+        /* this loop should ONLY get used when 
8184
 
+         * there are features to process.
8185
 
+         */
8186
 
+       done = (rc) ? TRUE : FALSE;
8187
 
+        while(!done) {
8188
 
+                rc = edef_check_feature_conditions(&vbi);
8189
 
+                if (rc) break;
8190
 
+                top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE;
8191
 
+                rc = vbi.plugin->function_table->
8192
 
+                       discover(&vbi.feature_node_list);
8193
 
+               if (!rc) {
8194
 
+                       rc = edef_evaluate_volume_node_list(
8195
 
+                               volume_node_list, 
8196
 
+                               &vbi, top_feature_applying);
8197
 
+                       if (top_feature_applying == TRUE) {
8198
 
+                               if (vbi.node_count > 1) {
8199
 
+                                       rc = -EVMS_VOLUME_FATAL_ERROR;
8200
 
+                                       LOG_ERROR("ERROR: detected > 1 node at volume completion!\n");
8201
 
+                               }
8202
 
+                               done = TRUE;
8203
 
+                       } else {
8204
 
+                               if (!vbi.plugin) {
8205
 
+                                       rc = -EVMS_VOLUME_FATAL_ERROR;
8206
 
+                                       LOG_ERROR("ERROR: depth(%Ld): expected another feature!\n",
8207
 
+                                                 vbi.max_depth);
8208
 
+                                       done = TRUE;
8209
 
+                               }
8210
 
+                       }
8211
 
+               } else { /* rc != 0 */
8212
 
+                       rc = -EVMS_VOLUME_FATAL_ERROR;
8213
 
+                       done = TRUE;
8214
 
+               }
8215
 
+        }
8216
 
+       if (rc)
8217
 
+               /* put all feature nodes back on the volume list */
8218
 
+               if (edef_isolate_nodes_by_type(
8219
 
+                       ISOLATE_EVMS_VOLUMES, 
8220
 
+                       &vbi.feature_node_list,
8221
 
+                       volume_node_list, 
8222
 
+                       0,0))
8223
 
+                       BUG();
8224
 
+        return(rc);
8225
 
+}
8226
 
+
8227
 
+static int 
8228
 
+edef_delete_node( 
8229
 
+       evms_logical_node_t **node_list,
8230
 
+       evms_logical_node_t *node,
8231
 
+       int return_code,
8232
 
+       char *log_text)
8233
 
+{
8234
 
+       int rc;
8235
 
+
8236
 
+       rc = evms_cs_remove_logical_node_from_list(node_list, node);
8237
 
+       if (!rc) {
8238
 
+               LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n",
8239
 
+                        log_text, return_code,
8240
 
+                        node->volume_info->volume_name,
8241
 
+                        node->name);
8242
 
+               rc = DELETE(node);
8243
 
+               if (rc) {
8244
 
+                       LOG_ERROR("error(%d) while deleting node(%s)\n",
8245
 
+                               rc, node->name);
8246
 
+               }
8247
 
+       } else  {
8248
 
+               LOG_WARNING("%s error(%d): node gone, assumed deleted by plugin.\n",
8249
 
+                        log_text, return_code);
8250
 
+               /* plugin must have cleaned up the node. 
8251
 
+                * So just reset the return code and leave.
8252
 
+                */
8253
 
+               rc = 0;
8254
 
+       }
8255
 
+
8256
 
+       return(rc);
8257
 
+}
8258
 
+
8259
 
+static int 
8260
 
+edef_process_evms_volumes(
8261
 
+       evms_logical_node_t **discover_list,
8262
 
+        evms_logical_node_t **associative_feature_list)
8263
 
+{
8264
 
+        int rc = 0;
8265
 
+        evms_logical_node_t *node, *evms_volumes_list, *volume_node_list;
8266
 
+        u_int64_t volume_sn;
8267
 
+
8268
 
+        /* put all EVMS volumes on their own list */
8269
 
+        evms_volumes_list = NULL;
8270
 
+        rc = edef_isolate_nodes_by_type(
8271
 
+               ISOLATE_EVMS_VOLUMES, 
8272
 
+               discover_list, 
8273
 
+               &evms_volumes_list, 
8274
 
+               0,0);
8275
 
+        
8276
 
+        /* apply features to each EVMS volume */
8277
 
+        /* one volume at a time on each pass  */
8278
 
+        while (evms_volumes_list) {
8279
 
+                node = evms_volumes_list;
8280
 
+                /* put all nodes for one EVMS volume on separate list */
8281
 
+                volume_node_list = NULL;
8282
 
+                volume_sn = node->volume_info->volume_serial_number;
8283
 
+                rc = edef_isolate_nodes_by_type(
8284
 
+                       ISOLATE_EVMS_VOLUME_SERIAL_NUMBER, 
8285
 
+                       &evms_volumes_list,
8286
 
+                        &volume_node_list, 
8287
 
+                       0, volume_sn);
8288
 
+                if (rc) break;
8289
 
+                /* go apply all the volume features now */
8290
 
+                rc = edef_apply_features(&volume_node_list);
8291
 
+                switch(rc) {
8292
 
+                        case 0: /* SUCCESS */
8293
 
+                                /* remove volume just processed */
8294
 
+                                node = volume_node_list;
8295
 
+                                rc = evms_cs_remove_logical_node_from_list(&volume_node_list, node);
8296
 
+                                if (rc) break;
8297
 
+                                /* put volume on global list */
8298
 
+                                rc = evms_cs_add_logical_node_to_list(discover_list, node);
8299
 
+                                break;
8300
 
+                        case -EVMS_ASSOCIATIVE_FEATURE:
8301
 
+                                /* put all "associative" features on their own list */
8302
 
+                                rc = edef_isolate_nodes_by_type(
8303
 
+                                       ISOLATE_ASSOCIATIVE_FEATURES, 
8304
 
+                                       &volume_node_list,
8305
 
+                                        associative_feature_list, 
8306
 
+                                       0,0);
8307
 
+                                break;
8308
 
+                        default:/* FATAL ERROR */
8309
 
+                                /* delete each node remaining in the list */
8310
 
+                                if (volume_node_list) {
8311
 
+                                        LOG_ERROR("encountered fatal error building volume '%s'\n",
8312
 
+                                                   volume_node_list->volume_info->volume_name);
8313
 
+                                }
8314
 
+                                while(volume_node_list) {
8315
 
+                                        node = volume_node_list;
8316
 
+                                       edef_delete_node(
8317
 
+                                                &volume_node_list,
8318
 
+                                                node, rc,
8319
 
+                                               "EVMS feature");
8320
 
+                                }
8321
 
+                               rc = 0;
8322
 
+                                break;
8323
 
+                }
8324
 
+                if (rc) break;
8325
 
+        }
8326
 
+        return(rc);
8327
 
+}
8328
 
+
8329
 
+static int 
8330
 
+edef_process_associative_volumes(
8331
 
+        evms_logical_node_t **associative_feature_list,
8332
 
+        evms_logical_node_t **discover_list)
8333
 
+{
8334
 
+        int rc = 0;
8335
 
+        evms_logical_node_t *node;
8336
 
+
8337
 
+        while (*associative_feature_list) {
8338
 
+                node = *associative_feature_list;
8339
 
+                /* remove this node from associative feature list */
8340
 
+                rc = evms_cs_remove_logical_node_from_list(associative_feature_list, node);
8341
 
+                if (rc) break;
8342
 
+                /* put volume on global list */
8343
 
+                rc = evms_cs_add_logical_node_to_list(discover_list, node);
8344
 
+                if (rc) break;
8345
 
+                rc = edef_load_feature_header(node);
8346
 
+                if (rc) break;
8347
 
+                rc = edef_apply_feature(node, discover_list);
8348
 
+               if (rc) 
8349
 
+                       edef_delete_node(
8350
 
+                                discover_list, node, rc,
8351
 
+                               "Associative feature");
8352
 
+        }
8353
 
+        return(rc);
8354
 
+}
8355
 
+        
8356
 
+static int 
8357
 
+edef_check_for_incomplete_volumes(
8358
 
+       evms_logical_node_t **discover_list)
8359
 
+{
8360
 
+        int rc = 0;
8361
 
+        evms_logical_node_t *next_node, *node;
8362
 
+
8363
 
+        /* check to see if any incomplete volumes are left around */
8364
 
+        /* if so, delete them.                                    */
8365
 
+        /* complete volumes should not have feature_headers       */
8366
 
+        /* hanging off them, if we find any, we know the volume   */
8367
 
+        /* is incomplete.                                         */
8368
 
+
8369
 
+       for (node = *discover_list; node; node = next_node) {
8370
 
+               next_node = node->next;
8371
 
+
8372
 
+                if (node->feature_header) {
8373
 
+                       edef_delete_node(
8374
 
+                                discover_list, node, rc,
8375
 
+                               "Unexpected feature header");
8376
 
+               }
8377
 
+        }
8378
 
+        return(rc);
8379
 
+}
8380
 
+
8381
 
+/*
8382
 
+ * Function:     evms_discover_evms_features
8383
 
+ * Description: Find features for nodes on the logical partitions list
8384
 
+ */
8385
 
+static int 
8386
 
+evms_discover_evms_features(evms_logical_node_t **discover_list)
8387
 
+{
8388
 
+        evms_logical_node_t *associative_feature_list;
8389
 
+        int rc = 0;
8390
 
+        
8391
 
+       LOG_EXTRA("discovering evms volume features...\n");
8392
 
+
8393
 
+        /* initialize "associative" features list */
8394
 
+        associative_feature_list = NULL;
8395
 
+
8396
 
+       /* find the bottom features */
8397
 
+       rc = edef_find_first_features(discover_list);
8398
 
+        if (!rc)
8399
 
+                /* process EVMS volumes here */
8400
 
+                rc = edef_process_evms_volumes(discover_list, &associative_feature_list);
8401
 
+        if (!rc)
8402
 
+                /* process "associative" features here */
8403
 
+                rc = edef_process_associative_volumes(
8404
 
+                        &associative_feature_list, discover_list);
8405
 
+        if (!rc)
8406
 
+                /* check for incomplete volumes */
8407
 
+                rc = edef_check_for_incomplete_volumes(discover_list);
8408
 
+
8409
 
+        return(rc);
8410
 
+}
8411
 
+
8412
 
+/*
8413
 
+ * function: eelv_assign_volume_minor
8414
 
+ *
8415
 
+ * This is a support function for evms_export_logical_volumes.
8416
 
+ * This routine assigns a specific minor number to a volume. It
8417
 
+ * also performs the remaining steps to make this volume visible
8418
 
+ * and usable to the kernel.
8419
 
+ *
8420
 
+ */
8421
 
+static void 
8422
 
+eelv_assign_volume_minor(evms_logical_node_t *node, int minor)
8423
 
+{       
8424
 
+        evms_logical_volume_t *volume;
8425
 
+       int rc;
8426
 
+
8427
 
+        /* initialize the logical_node entry in the volume array */
8428
 
+        volume = &evms_logical_volumes[minor];
8429
 
+        volume->node = node;
8430
 
+       rc = evms_cs_allocate_memory((void **)&volume->name, 
8431
 
+                               strlen(EVMS_GET_NODE_NAME(node)) + 1);
8432
 
+       if (rc) BUG();
8433
 
+       strcpy(volume->name, EVMS_GET_NODE_NAME(node));
8434
 
+
8435
 
+       /* copy flags from top level node into volume structure */
8436
 
+       volume->flags = node->flags;
8437
 
+
8438
 
+       /* check for read-only volume */
8439
 
+       if ( volume->flags & EVMS_VOLUME_READ_ONLY ) {
8440
 
+               set_device_ro(MKDEV(EVMS_MAJOR, minor),1);
8441
 
+       }
8442
 
+
8443
 
+       /* initialize the global device arrays */
8444
 
+        blksize_size[EVMS_MAJOR][minor] = node->block_size;
8445
 
+        hardsect_size[EVMS_MAJOR][minor] = node->hardsector_size;
8446
 
+        blk_size[EVMS_MAJOR][minor] = (int)(node->total_vsectors >> 1);
8447
 
+
8448
 
+        /* register this volume with devfs */
8449
 
+        volume->devfs_handle =
8450
 
+                devfs_register(evms_dir_devfs_handle,
8451
 
+                              volume->name,
8452
 
+                               DEVFS_FL_DEFAULT,
8453
 
+                               EVMS_MAJOR, minor,
8454
 
+                               S_IFBLK | S_IRUGO | S_IWUGO,
8455
 
+                               &evms_fops, NULL);
8456
 
+
8457
 
+        evms_volumes++;
8458
 
+
8459
 
+       LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n",
8460
 
+                    EVMS_MAJOR, minor, 
8461
 
+                   EVMS_DEV_NODE_PATH, volume->name);
8462
 
+}
8463
 
+
8464
 
+/*
8465
 
+ * function: eelv_check_for_duplicity
8466
 
+ *
8467
 
+ * This is a support function for evms_export_logical_volumes.
8468
 
+ * This routine compares the serial number in the top most node
8469
 
+ * in the volume to the list of currently exported volumes. If
8470
 
+ * this volumes serial number is found in the list then we know
8471
 
+ * this volume is a duplicate and it is then delete.
8472
 
+ *
8473
 
+ */
8474
 
+static void 
8475
 
+eelv_check_for_duplicity(evms_logical_node_t **discover_list)
8476
 
+{
8477
 
+        evms_logical_node_t *next_node, *node;
8478
 
+       evms_logical_volume_t *lv;
8479
 
+        int i, is_dup;
8480
 
+
8481
 
+       for (node = *discover_list; node; node = next_node) {
8482
 
+               next_node = node->next;
8483
 
+
8484
 
+                is_dup = FALSE;
8485
 
+                for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8486
 
+                        lv = &evms_logical_volumes[i];
8487
 
+                        /* only check exported volumes */
8488
 
+                        if (lv->node) {
8489
 
+                               char *type_ptr = NULL;
8490
 
+
8491
 
+                               /* check for duplicate pointer */
8492
 
+                               if (node == lv->node) {
8493
 
+                                       is_dup = TRUE;
8494
 
+                                       type_ptr = "pointer";
8495
 
+                               /* check for duplicate node */
8496
 
+                               } else if (!strcmp(node->name, 
8497
 
+                                                  lv->node->name)) {
8498
 
+                                       is_dup = TRUE;
8499
 
+                                       type_ptr = "node";
8500
 
+                               }
8501
 
+                               if (is_dup == TRUE) {
8502
 
+                                       evms_cs_remove_logical_node_from_list(discover_list, node);
8503
 
+                                       LOG_DEFAULT("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n",
8504
 
+                                                  type_ptr,
8505
 
+                                                  EVMS_MAJOR, i,
8506
 
+                                                  EVMS_GET_NODE_NAME(node));
8507
 
+                                       /* forget duplicate */
8508
 
+                                       break;
8509
 
+                               }
8510
 
+                        }
8511
 
+                }
8512
 
+        }
8513
 
+}
8514
 
+
8515
 
+/*
8516
 
+ * function: eelv_reassign_soft_deleted_volume_minors
8517
 
+ *
8518
 
+ * This is a support function for evms_export_logical_volumes.
8519
 
+ * This routine reassigns minor numbers to rediscovered "soft"
8520
 
+ * deleted volumes.
8521
 
+ *
8522
 
+ */
8523
 
+static void 
8524
 
+eelv_reassign_soft_deleted_volume_minors(evms_logical_node_t **discover_list)
8525
 
+{
8526
 
+        evms_logical_node_t *next_node, *node;
8527
 
+       evms_logical_volume_t *lv;
8528
 
+        int i, node_removed;
8529
 
+
8530
 
+       for (node = *discover_list; node; node = next_node) {
8531
 
+               next_node = node->next;
8532
 
+
8533
 
+                node_removed = FALSE;
8534
 
+                for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8535
 
+                        lv = &evms_logical_volumes[i];
8536
 
+                        /* only check soft deleted volumes:
8537
 
+                        *  they have a non-NULL name.
8538
 
+                        */
8539
 
+                        if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
8540
 
+                               if (!strcmp(EVMS_GET_NODE_NAME(node),lv->name)) {
8541
 
+                                        /* reassign requested minor */
8542
 
+                                        evms_cs_remove_logical_node_from_list(discover_list, node);
8543
 
+                                        node_removed = TRUE;
8544
 
+                                        LOG_DEFAULT("Re");
8545
 
+                                       /* free the previously used name */
8546
 
+                                       evms_cs_deallocate_memory(lv->name);
8547
 
+                                       lv->name = NULL;
8548
 
+                                       /* clear the EVMS_VOLUME_SOFT_DELETED flag */
8549
 
+                                       lv->flags = 0;
8550
 
+                                        eelv_assign_volume_minor(node, i);
8551
 
+                                       break;
8552
 
+                               }
8553
 
+                        }
8554
 
+                }
8555
 
+        }
8556
 
+}
8557
 
+
8558
 
+/*
8559
 
+ * function: eelv_assign_evms_volume_minors
8560
 
+ *
8561
 
+ * This is a support function for evms_export_logical_volumes.
8562
 
+ * This routine assigns minor numbers to new evms volumes. If
8563
 
+ * the specified minor is already in use, the requested minor
8564
 
+ * is set to 0, and will be assigned next available along with
8565
 
+ * any remaining volumes at the end of evms_export_logical_volumes.
8566
 
+ *
8567
 
+ */
8568
 
+static void 
8569
 
+eelv_assign_evms_volume_minors(evms_logical_node_t **discover_list)
8570
 
+{
8571
 
+        evms_logical_node_t *next_node, *node, *lv_node;
8572
 
+        unsigned int requested_minor, node_removed;
8573
 
+
8574
 
+       for (node = *discover_list; node; node = next_node) {
8575
 
+               next_node = node->next;
8576
 
+
8577
 
+                node_removed = FALSE;
8578
 
+                /* only process evms volumes */
8579
 
+                if (node->flags & EVMS_VOLUME_FLAG) {
8580
 
+                        requested_minor = node->volume_info->volume_system_id;
8581
 
+                        /* is there a requested minor? */
8582
 
+                        if (requested_minor) {
8583
 
+                               int lv_flags = 0;
8584
 
+
8585
 
+                                /* check range of requested minor */
8586
 
+                                if (requested_minor >= MAX_EVMS_VOLUMES)
8587
 
+                                        lv_node = node;
8588
 
+                                else {
8589
 
+                                       evms_logical_volume_t *lv;
8590
 
+                                       lv = &evms_logical_volumes[requested_minor];
8591
 
+                                        lv_node = lv->node;
8592
 
+                                        lv_flags = lv->flags;
8593
 
+                               }
8594
 
+                                if ( (!lv_node) && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED)) ) {
8595
 
+                                        /* assign requested minor */
8596
 
+                                        evms_cs_remove_logical_node_from_list(discover_list, node);
8597
 
+                                        node_removed = TRUE;
8598
 
+                                        eelv_assign_volume_minor(node, requested_minor);
8599
 
+                                } else {
8600
 
+                                        LOG_WARNING("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n",
8601
 
+                                                   node->volume_info->volume_name, 
8602
 
+                                                  requested_minor);
8603
 
+                                        /*
8604
 
+                                         * requested minor is already
8605
 
+                                         * in use, defer assignment
8606
 
+                                         * until later.
8607
 
+                                         */
8608
 
+                                        node->volume_info->volume_system_id = 0;
8609
 
+                                }
8610
 
+                        }
8611
 
+                }
8612
 
+        }
8613
 
+}
8614
 
+
8615
 
+/*
8616
 
+ * function: eelv_assign_remaining_evms_volume_minors
8617
 
+ *
8618
 
+ * This is a support function for evms_export_logical_volumes.
8619
 
+ * This routine assigns minor numbers to new evms volumes that
8620
 
+ * have no/conflicting minor assignments. This function will 
8621
 
+ * search from high(255) minor values down, for the first available
8622
 
+ * minor. Searching high to low minimizes the possibility of
8623
 
+ * conflicting evms volumes causing "compatibility" minor
8624
 
+ * assignments to shift from expected assignments.
8625
 
+ *
8626
 
+ */
8627
 
+static void 
8628
 
+eelv_assign_remaining_evms_volume_minors(
8629
 
+       evms_logical_node_t **discover_list)
8630
 
+{
8631
 
+        evms_logical_node_t *next_node, *node;
8632
 
+        int requested_minor, node_removed;
8633
 
+
8634
 
+       for (node = *discover_list; node; node = next_node) {
8635
 
+               next_node = node->next;
8636
 
+
8637
 
+                node_removed = FALSE;
8638
 
+                /* only process evms volumes */
8639
 
+               /* all remaining evms volumes should now
8640
 
+                * have a minor value of 0, meaning they
8641
 
+                * had no minor assignment, or their minor
8642
 
+                * assignment conflicted with an existing
8643
 
+                * minor assignment.
8644
 
+                */
8645
 
+                if (node->flags & EVMS_VOLUME_FLAG) {
8646
 
+                       evms_cs_remove_logical_node_from_list(discover_list, node);
8647
 
+                       node_removed = TRUE;
8648
 
+                       /* find next available minor number */
8649
 
+                       for (requested_minor = 255; 
8650
 
+                            (evms_logical_volumes[requested_minor].node  ||
8651
 
+                             evms_logical_volumes[requested_minor].name) && 
8652
 
+                            requested_minor; 
8653
 
+                            requested_minor--);
8654
 
+                       /* check range of assigned minor */
8655
 
+                       if (!requested_minor) {
8656
 
+                               LOG_CRITICAL("no more minor numbers available for evms volumes!!!!\n");
8657
 
+                               DELETE(node);
8658
 
+                       } else
8659
 
+                               /* assign requested minor */
8660
 
+                               eelv_assign_volume_minor(node, requested_minor);
8661
 
+                }
8662
 
+        }
8663
 
+}
8664
 
+
8665
 
+/*
8666
 
+ * function: eelv_assign_remaining_volume_minors
8667
 
+ *
8668
 
+ * This is a support function for evms_export_logical_volumes.
8669
 
+ * This routine assigns minor numbers to all remaining unassigned
8670
 
+ * volumes. Minor numbers are assigned on an availability
8671
 
+ * basis. The first free minor number is used in the assignment.
8672
 
+ *
8673
 
+ */
8674
 
+static void 
8675
 
+eelv_assign_remaining_volume_minors(
8676
 
+       evms_logical_node_t **discover_list)
8677
 
+{
8678
 
+        evms_logical_node_t *node;
8679
 
+        int minor;
8680
 
+
8681
 
+        while(*discover_list) {
8682
 
+                node = *discover_list;
8683
 
+                evms_cs_remove_logical_node_from_list(discover_list, node);
8684
 
+
8685
 
+                /* find next available minor number */
8686
 
+                for (minor = 1; 
8687
 
+                     (evms_logical_volumes[minor].node  ||
8688
 
+                      evms_logical_volumes[minor].name) && 
8689
 
+                     minor < MAX_EVMS_VOLUMES; 
8690
 
+                     minor++);
8691
 
+                
8692
 
+                if (minor >= MAX_EVMS_VOLUMES) {
8693
 
+                        LOG_CRITICAL("no more minor numbers available for compatibility volumes!!!!\n");
8694
 
+                        DELETE(node);
8695
 
+                } else
8696
 
+                        /* assign minor */
8697
 
+                        eelv_assign_volume_minor(node, minor);
8698
 
+        }
8699
 
+}
8700
 
+
8701
 
+/*
8702
 
+ * function: eelv_check_for_unreassign_soft_deleted_volume
8703
 
+ *
8704
 
+ * This is a support function for evms_export_logical_volumes.
8705
 
+ * This routine reports any "soft deleted" volumes that were not
8706
 
+ * found after a rediscovery.
8707
 
+ */
8708
 
+static void 
8709
 
+eelv_check_for_unreassign_soft_deleted_volume(void)
8710
 
+{
8711
 
+       evms_logical_volume_t *lv;
8712
 
+       int i;
8713
 
+
8714
 
+       for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8715
 
+               lv = &evms_logical_volumes[i];
8716
 
+               /* only check soft deleted volumes:
8717
 
+                *  they have a NULL node ptr &
8718
 
+                *  they have a non-NULL name.
8719
 
+                */
8720
 
+               if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
8721
 
+                       if (get_super(MKDEV(EVMS_MAJOR, i))) 
8722
 
+                               lv->flags |= EVMS_VOLUME_CORRUPT;
8723
 
+                       LOG_ERROR("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n",
8724
 
+                               ((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"),
8725
 
+                               EVMS_MAJOR, i,
8726
 
+                               lv->name);
8727
 
+                       if (lv->flags & EVMS_VOLUME_CORRUPT) {
8728
 
+                               LOG_ERROR("         flagging volume(%u,%u,%s) as CORRUPT!\n",
8729
 
+                                       EVMS_MAJOR, i,
8730
 
+                                       lv->name);
8731
 
+                       } else {
8732
 
+                               LOG_ERROR("         releasing minor(%d) used by volume(%s)!\n",
8733
 
+                                       i, lv->name);
8734
 
+                               /* clear logical volume structure
8735
 
+                                * for this volume so it may be
8736
 
+                                * reused.
8737
 
+                                */
8738
 
+                               evms_cs_deallocate_memory(lv->name);
8739
 
+                               lv->name = NULL;
8740
 
+                               lv->flags = 0;
8741
 
+                       }
8742
 
+               }
8743
 
+       }
8744
 
+}
8745
 
+
8746
 
+static void 
8747
 
+eelv_unquiesce_volumes(void)
8748
 
+{
8749
 
+       int i;
8750
 
+
8751
 
+       /* check each volume array entry */
8752
 
+       for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8753
 
+               evms_logical_volume_t *volume;
8754
 
+
8755
 
+               volume = &evms_logical_volumes[i];
8756
 
+               /* is this volume "quiesced" ? */
8757
 
+               if (volume->quiesced) {
8758
 
+                       int rc = 1;
8759
 
+                       if (volume->node) {
8760
 
+                               /* "unquiesce" it */
8761
 
+                               struct inode inode;
8762
 
+                               evms_quiesce_volume_t qv;
8763
 
+
8764
 
+                               qv.command = qv.status = 0;
8765
 
+                               qv.do_vfs = 0;
8766
 
+                               qv.minor = i;
8767
 
+                               rc = evms_quiesce_volume(volume, &inode, NULL, &qv);
8768
 
+                       }
8769
 
+                       /* Wake up any waiters */
8770
 
+                       if (rc) {
8771
 
+                               /* clear the flag */
8772
 
+                               volume->quiesced = 0;
8773
 
+                               /* wake up the waiters */
8774
 
+                               if (waitqueue_active(&volume->wait_queue))
8775
 
+                                       wake_up(&volume->wait_queue);
8776
 
+#ifdef VFS_PATCH_PRESENT
8777
 
+                               /* unquiesce VFS if quiesced */
8778
 
+                               if (volume->vfs_quiesced) {
8779
 
+                                       /* VFS function call to unlock the filesystem */
8780
 
+                                       unlockfs(MKDEV(EVMS_MAJOR, i));
8781
 
+                                       volume->vfs_quiesced = FALSE;
8782
 
+                               }
8783
 
+#endif
8784
 
+                       }
8785
 
+               }
8786
 
+       }
8787
 
+}
8788
 
+
8789
 
+/*
8790
 
+ * Function:     evms_export_logical_volumes
8791
 
+ *
8792
 
+ * This function is called from evms_discover_volumes. It
8793
 
+ * check for duplicate volumes, assigns minor values to evms
8794
 
+ * volumes, and assigns minor values to the remaining volumes.
8795
 
+ * In addition to assigning minor values to each volume this
8796
 
+ * function also completes the final steps necessary to allow
8797
 
+ * the volumes to be using by the operating system.
8798
 
+ */
8799
 
+static void 
8800
 
+evms_export_logical_volumes(evms_logical_node_t **discover_list)
8801
 
+{
8802
 
+        LOG_EXTRA("exporting EVMS logical volumes...\n");
8803
 
+
8804
 
+        eelv_check_for_duplicity(discover_list);
8805
 
+
8806
 
+       eelv_reassign_soft_deleted_volume_minors(discover_list);
8807
 
+
8808
 
+        eelv_assign_evms_volume_minors(discover_list);
8809
 
+
8810
 
+        eelv_assign_remaining_evms_volume_minors(discover_list);
8811
 
+
8812
 
+        eelv_assign_remaining_volume_minors(discover_list);
8813
 
+
8814
 
+       eelv_check_for_unreassign_soft_deleted_volume();
8815
 
+
8816
 
+       /* "unquiesce" any "quiesced" volumes */
8817
 
+       eelv_unquiesce_volumes();
8818
 
+}
8819
 
+
8820
 
+static int 
8821
 
+edv_populate_discover_list(
8822
 
+       evms_list_node_t *src_list,
8823
 
+        evms_logical_node_t **trg_list,
8824
 
+        evms_rediscover_t *discover_parms)
8825
 
+{
8826
 
+        int rc = 0, i, move_node, use_all_disks = FALSE;
8827
 
+       evms_list_node_t *src_node;
8828
 
+
8829
 
+
8830
 
+        /* if no discover parameters are specified */
8831
 
+        /* copy ALL the disk nodes into the        */
8832
 
+        /* discovery list.                         */
8833
 
+        if ((discover_parms == NULL) ||
8834
 
+           (discover_parms->drive_count == REDISCOVER_ALL_DEVICES))
8835
 
+                use_all_disks = TRUE;
8836
 
+
8837
 
+        /* copy the disk nodes specified in the */ 
8838
 
+        /* discover_parms over to a discover list */
8839
 
+       src_node = src_list;
8840
 
+        while(src_node) {
8841
 
+                move_node = use_all_disks;
8842
 
+                if (move_node == FALSE)
8843
 
+                        /* check the rediscovery array */
8844
 
+                        for (i = 0; i < discover_parms->drive_count; i++)
8845
 
+                                if (discover_parms->drive_array[i] == ((unsigned long)src_node->item ^ EVMS_HANDLE_KEY)) {
8846
 
+                                        move_node = TRUE;
8847
 
+                                        break;
8848
 
+                                }
8849
 
+               /* check to see if we want this node */
8850
 
+                if (move_node == TRUE)
8851
 
+                        evms_cs_add_logical_node_to_list(
8852
 
+                               trg_list, 
8853
 
+                               (evms_logical_node_t *)src_node->item);
8854
 
+                /* advance to next evms_list_node_t */
8855
 
+               src_node = src_node->next;
8856
 
+        }
8857
 
+        return(rc);
8858
 
+}
8859
 
+
8860
 
+static int 
8861
 
+evms_discover_volumes(evms_rediscover_t *discover_parms)
8862
 
+{       
8863
 
+        int rc = 0;
8864
 
+        evms_logical_node_t *discover_list = NULL;
8865
 
+
8866
 
+        evms_discover_logical_disks(&discover_list);
8867
 
+        if (evms_global_device_list) {
8868
 
+                /* move the appropriate disk nodes, based on */
8869
 
+                /* on the discover parameters, onto the      */
8870
 
+                /* discover list for the partition managers  */
8871
 
+                /* to process                                */
8872
 
+                edv_populate_discover_list(
8873
 
+                        evms_global_device_list,
8874
 
+                        &discover_list, discover_parms);
8875
 
+       }
8876
 
+       if (discover_list) {
8877
 
+                evms_discover_logical_partitions(&discover_list);
8878
 
+        }
8879
 
+       if (discover_list) {
8880
 
+                evms_discover_volume_groups(&discover_list);
8881
 
+        }
8882
 
+       if (discover_list) {
8883
 
+                evms_discover_evms_features(&discover_list);
8884
 
+        }
8885
 
+       if (discover_list) {
8886
 
+                evms_export_logical_volumes(&discover_list);
8887
 
+               evms_cs_signal_event(EVMS_EVENT_END_OF_DISCOVERY);
8888
 
+        }
8889
 
+        return(rc);
8890
 
+}
8891
 
+
8892
 
+/*
8893
 
+ * Function: find_root_fs_dev
8894
 
+ * If "root=/dev/evms/???" was specified on the kernel command line, and devfs
8895
 
+ * is not enabled, we need to determine the appropriate minor number for the
8896
 
+ * specified volume for the root fs.
8897
 
+ */
8898
 
+static void find_root_fs_dev(void)
8899
 
+{
8900
 
+       char * name;
8901
 
+       int i;
8902
 
+
8903
 
+       if ( ! strncmp(root_device_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME)+1) ) {
8904
 
+               name = &root_device_name[strlen(EVMS_DIR_NAME)+1];
8905
 
+
8906
 
+               for ( i = 1; i <= MAX_EVMS_VOLUMES; i++ ) {
8907
 
+                       if ( evms_logical_volumes[i].name &&
8908
 
+                            ! strncmp(name, evms_logical_volumes[i].name, strlen(evms_logical_volumes[i].name)) ) {
8909
 
+                               ROOT_DEV = MKDEV(EVMS_MAJOR,i);
8910
 
+                               return;
8911
 
+                       }
8912
 
+               }
8913
 
+       }
8914
 
+}
8915
 
+
8916
 
+/* 
8917
 
+ * Function: bh_cache_ctor
8918
 
+ * this function initializes the b_wait field in the buffer heads
8919
 
+ * in our private buffer head pool.
8920
 
+ */
8921
 
+static void 
8922
 
+io_notify_cache_ctor(
8923
 
+       void * foo, 
8924
 
+       kmem_cache_t * cachep, 
8925
 
+       unsigned long flags)
8926
 
+{
8927
 
+       if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
8928
 
+           SLAB_CTOR_CONSTRUCTOR)
8929
 
+       {
8930
 
+               io_notify_t *io_notify = (io_notify_t *)foo;
8931
 
+               memset(io_notify, 0, sizeof(*io_notify));
8932
 
+       }
8933
 
+}
8934
 
+
8935
 
+/* 
8936
 
+ * Function: bh_cache_ctor
8937
 
+ * this function initializes the b_wait field in the buffer heads
8938
 
+ * in our private buffer head pool.
8939
 
+ */
8940
 
+static void 
8941
 
+bh_cache_ctor(
8942
 
+       void * foo, 
8943
 
+       kmem_cache_t * cachep, 
8944
 
+       unsigned long flags)
8945
 
+{
8946
 
+       if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
8947
 
+           SLAB_CTOR_CONSTRUCTOR)
8948
 
+       {
8949
 
+               struct buffer_head *bh = (struct buffer_head *)foo;
8950
 
+               memset(bh, 0, sizeof(*bh));
8951
 
+               init_waitqueue_head(&bh->b_wait);
8952
 
+       }
8953
 
+}
8954
 
+
8955
 
+/*
8956
 
+ * Function:  evms_init_module
8957
 
+ * This function runs once at system initialization.
8958
 
+ */
8959
 
+static int __init 
8960
 
+evms_init_module (void)
8961
 
+{
8962
 
+        int rc = 0, i;
8963
 
+        int *evms_blocksizes;
8964
 
+
8965
 
+        LOG_DEFAULT("EVMS v%d.%d.%d initializing .... info level(%d).\n", 
8966
 
+                   EVMS_MAJOR_VERSION, 
8967
 
+                   EVMS_MINOR_VERSION,
8968
 
+                   EVMS_PATCHLEVEL_VERSION,
8969
 
+                   evms_info_level);
8970
 
+
8971
 
+       /* initialize memory management counters */
8972
 
+       atomic_set(&evms_allocs,0);
8973
 
+       atomic_set(&evms_logical_nodes,0);
8974
 
+
8975
 
+       /* initialize the io_notify_entry pool */
8976
 
+       if (!rc)
8977
 
+               evms_io_notify_pool = evms_cs_create_pool(
8978
 
+                       sizeof(io_notify_t), 
8979
 
+                       "EVMS IO Notify",
8980
 
+                       io_notify_cache_ctor,
8981
 
+                       NULL );
8982
 
+
8983
 
+       /* initialize the "public" buffer_head pool */
8984
 
+       if (!rc)
8985
 
+               evms_bh_pool = evms_cs_create_pool(
8986
 
+                       sizeof(struct buffer_head), 
8987
 
+                       "EVMS BH",
8988
 
+                       bh_cache_ctor,
8989
 
+                       NULL );
8990
 
+
8991
 
+       /* allocate the logical volume array */
8992
 
+       if (!rc)
8993
 
+               rc = evms_cs_allocate_memory(
8994
 
+                       (void **)&evms_logical_volumes,
8995
 
+                       sizeof(evms_logical_volume_t) * MAX_EVMS_VOLUMES);
8996
 
+
8997
 
+       /* initialize the logical volume array entries */
8998
 
+       if (!rc)
8999
 
+               for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
9000
 
+                       evms_logical_volume_t *volume;
9001
 
+
9002
 
+                       volume = &evms_logical_volumes[i];
9003
 
+                       init_waitqueue_head(&volume->wait_queue);
9004
 
+#ifdef CONFIG_SMP
9005
 
+                       blk_init_queue(&volume->request_queue, 
9006
 
+                                      evms_do_request_fn);
9007
 
+                       blk_queue_make_request(&volume->request_queue, 
9008
 
+                                              evms_make_request_fn);
9009
 
+#endif
9010
 
+               }
9011
 
+
9012
 
+        /* allocate EVMS' blk_size array */
9013
 
+        if (!rc) {
9014
 
+                rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9015
 
+                if (rc) {
9016
 
+                        LOG_CRITICAL("can't allocate memory for EVMS blk_size\n");
9017
 
+                } else blk_size[EVMS_MAJOR] = evms_blocksizes;
9018
 
+        }
9019
 
+        
9020
 
+        /* allocate EVMS' blksize_size array */
9021
 
+        if (!rc) {
9022
 
+                rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9023
 
+                if (rc) { 
9024
 
+                        LOG_CRITICAL("can't allocate memory for EVMS blksize_size\n");
9025
 
+                } else blksize_size[EVMS_MAJOR] = evms_blocksizes;
9026
 
+        }
9027
 
+
9028
 
+        /* allocate EVMS' hardsect_size array */
9029
 
+        if (!rc) {
9030
 
+                rc = evms_cs_allocate_memory((void **)&evms_blocksizes, MAX_EVMS_VOLUMES * sizeof(int));
9031
 
+                if (rc) { 
9032
 
+                        LOG_CRITICAL("can't allocate memory for EVMS hardsect_size\n");
9033
 
+                } else hardsect_size[EVMS_MAJOR] = evms_blocksizes;
9034
 
+        }
9035
 
+
9036
 
+        /* Register the block device */
9037
 
+        if (!rc) {
9038
 
+                rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME, &evms_fops);
9039
 
+                if (rc) {
9040
 
+                        LOG_CRITICAL("error calling devfs_register_blkdev()  err=%u\n", rc);
9041
 
+                        rc = -EINVAL;
9042
 
+                }
9043
 
+        }
9044
 
+
9045
 
+        /* Register with devfs */
9046
 
+        if (!rc) {
9047
 
+                evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL);
9048
 
+                // A NULL return cannot be fatal.
9049
 
+                // Devfs just might not be running
9050
 
+                if ( ! evms_dir_devfs_handle ) {
9051
 
+                        LOG_EXTRA("NULL return from devfs_mk_dir() for \"%s\"\n", EVMS_DIR_NAME);
9052
 
+                        LOG_EXTRA("Is devfs enabled?\n");
9053
 
+                }
9054
 
+                else {
9055
 
+                        evms_blk_devfs_handle = devfs_register(evms_dir_devfs_handle,
9056
 
+                                                               EVMS_DEV_NAME,
9057
 
+                                                               DEVFS_FL_DEFAULT,
9058
 
+                                                               EVMS_MAJOR, 0,
9059
 
+                                                               S_IFBLK | S_IRUGO | S_IWUGO,
9060
 
+                                                               &evms_fops, NULL);
9061
 
+                        if ( ! evms_blk_devfs_handle ) {
9062
 
+                                LOG_DETAILS("NULL return from devfs_register() for \"%s\"\n", EVMS_DEV_NAME);
9063
 
+                        }
9064
 
+                }
9065
 
+        }
9066
 
+
9067
 
+        if (!rc) {
9068
 
+                read_ahead[EVMS_MAJOR] = 4096;
9069
 
+#ifdef CONFIG_SMP
9070
 
+               blk_dev[EVMS_MAJOR].queue = evms_find_queue;
9071
 
+#else
9072
 
+                blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_do_request_fn);
9073
 
+                blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR), evms_make_request_fn);
9074
 
+#endif
9075
 
+#ifdef CONFIG_PROC_FS
9076
 
+               evms_cs_get_evms_proc_dir();
9077
 
+               if (evms_proc_dir) {
9078
 
+                       create_proc_read_entry("info", 0, evms_proc_dir, evms_info_read_proc, NULL);
9079
 
+                       create_proc_read_entry("plugins", 0, evms_proc_dir, evms_plugins_read_proc, NULL);
9080
 
+                       create_proc_read_entry("volumes", 0, evms_proc_dir, evms_volumes_read_proc, NULL);
9081
 
+               }
9082
 
+               evms_table_header = register_sysctl_table(dev_dir_table, 1);
9083
 
+#endif
9084
 
+        }
9085
 
+
9086
 
+        return rc;
9087
 
+}
9088
 
+
9089
 
+/*
9090
 
+ * Function:  evms_init_module
9091
 
+ * This function runs once at system initialization.
9092
 
+ */
9093
 
+static void __exit
9094
 
+evms_exit_module (void)
9095
 
+{
9096
 
+        int rc = 0, i;
9097
 
+
9098
 
+        LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n", 
9099
 
+                   EVMS_MAJOR_VERSION, 
9100
 
+                   EVMS_MINOR_VERSION,
9101
 
+                   EVMS_PATCHLEVEL_VERSION);
9102
 
+
9103
 
+       /* ensure no EVMS volumes exist
9104
 
+        */
9105
 
+       for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
9106
 
+               if (evms_logical_volumes[i].node) {
9107
 
+                       LOG_ERROR("volume(%d,%d,%s) still exists.\n",
9108
 
+                                 EVMS_MAJOR, i, 
9109
 
+                                 evms_logical_volumes[i].name);
9110
 
+                       rc = -EPERM;
9111
 
+               }
9112
 
+       }
9113
 
+       if (rc) {
9114
 
+               LOG_ERROR("unable to unload until no volumes exist!\n");
9115
 
+       }
9116
 
+       if (!rc) {
9117
 
+               /* ensure no plugins are loaded.
9118
 
+                */
9119
 
+               evms_registered_plugin_t *p;
9120
 
+               int found = FALSE;
9121
 
+
9122
 
+               for (p = registered_plugin_head; p; p = p->next) {
9123
 
+                       found = TRUE;
9124
 
+                       LOG_ERROR("plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d still loaded.\n",
9125
 
+                               GetPluginOEM(p->plugin->id),
9126
 
+                               GetPluginType(p->plugin->id),
9127
 
+                               GetPluginID(p->plugin->id),
9128
 
+                               p->plugin->version.major,
9129
 
+                               p->plugin->version.minor,
9130
 
+                               p->plugin->version.patchlevel);
9131
 
+               }
9132
 
+               if (found) {
9133
 
+                       LOG_ERROR("unable to unload while plugins still loaded!\n");
9134
 
+               }
9135
 
+       }
9136
 
+       if (!rc) {
9137
 
+               /* unregister with devfs 
9138
 
+                */
9139
 
+               devfs_unregister(evms_dir_devfs_handle);
9140
 
+               /* clean up the queue for the block device
9141
 
+                */
9142
 
+               blk_cleanup_queue(blk_get_queue(MKDEV(EVMS_MAJOR,0)));
9143
 
+               /* unregister block device 
9144
 
+                */
9145
 
+                rc = devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME);
9146
 
+       }
9147
 
+       if (!rc) {
9148
 
+               /* deallocate device arrays
9149
 
+                */
9150
 
+                evms_cs_deallocate_memory(blk_size[EVMS_MAJOR]);
9151
 
+               blk_size[EVMS_MAJOR] = NULL;
9152
 
+                evms_cs_deallocate_memory(blksize_size[EVMS_MAJOR]);
9153
 
+               blksize_size[EVMS_MAJOR] = NULL;
9154
 
+                evms_cs_deallocate_memory(hardsect_size[EVMS_MAJOR]);
9155
 
+               hardsect_size[EVMS_MAJOR] = NULL;
9156
 
+                read_ahead[EVMS_MAJOR] = 0;
9157
 
+               /* deallocate logical volumes array
9158
 
+                */
9159
 
+                evms_cs_deallocate_memory(evms_logical_volumes);
9160
 
+               /* destroy buffer head pool
9161
 
+                */
9162
 
+               evms_cs_destroy_pool(evms_bh_pool);
9163
 
+               /* destroy io notify pool
9164
 
+                */
9165
 
+               evms_cs_destroy_pool(evms_io_notify_pool);
9166
 
+#ifdef CONFIG_PROC_FS
9167
 
+               if (evms_proc_dir) {
9168
 
+                       remove_proc_entry("volumes", evms_proc_dir);
9169
 
+                       remove_proc_entry("plugins", evms_proc_dir);
9170
 
+                       remove_proc_entry("info", evms_proc_dir);
9171
 
+                       remove_proc_entry("evms", NULL);
9172
 
+               }
9173
 
+               unregister_sysctl_table(evms_table_header);
9174
 
+#endif
9175
 
+       }
9176
 
+}
9177
 
+
9178
 
+/*
9179
 
+ * Function: evms_init_discover
9180
 
+ * If EVMS is statically built into the kernel, this function will be called
9181
 
+ * to perform an initial volume discovery.
9182
 
+ */
9183
 
+int __init
9184
 
+evms_init_discover (void)
9185
 
+{
9186
 
+       /* go find volumes */
9187
 
+       evms_discover_volumes(NULL);
9188
 
+
9189
 
+       /* Check if the root fs is on EVMS */
9190
 
+       if ( MAJOR(ROOT_DEV) == EVMS_MAJOR ) {
9191
 
+               find_root_fs_dev();
9192
 
+       }
9193
 
+
9194
 
+       return 0;
9195
 
+}
9196
 
+
9197
 
+
9198
 
+/*
9199
 
+ * a placeholder for cluster enablement
9200
 
+ */
9201
 
+void 
9202
 
+evms_cluster_init(int nodeid, int clusterid)
9203
 
+{
9204
 
+       /* dummy */
9205
 
+       return;
9206
 
+}
9207
 
+EXPORT_SYMBOL(evms_cluster_init);
9208
 
+
9209
 
+/*
9210
 
+ * a placeholder for cluster enablement
9211
 
+ */
9212
 
+int
9213
 
+evms_cluster_shutdown(void)
9214
 
+{
9215
 
+       /* dummy */
9216
 
+       return -1;
9217
 
+}
9218
 
+EXPORT_SYMBOL(evms_cluster_shutdown);
9219
 
+
9220
 
+static int __init 
9221
 
+evms_boot_info_level(char *str)
9222
 
+{
9223
 
+    int evms_boot_info_level = (int) simple_strtoul(str, NULL, 10);
9224
 
+    if (evms_boot_info_level) {
9225
 
+        evms_info_level = evms_boot_info_level;
9226
 
+    }
9227
 
+    return 1;
9228
 
+}
9229
 
+
9230
 
+__setup("evms_info_level=", evms_boot_info_level);
9231
 
+module_init(evms_init_module);
9232
 
+module_exit(evms_exit_module);
9233
 
+__initcall(evms_init_discover);
9234
 
+#ifdef MODULE_LICENSE
9235
 
+MODULE_LICENSE("GPL");
9236
 
+#endif
9237
 
+
9238
 
+/**********************************************************/
9239
 
+/* END -- INIT/DISCOVERY support functions                */
9240
 
+/**********************************************************/
9241
 
diff -Naur linux-2002-03-28/drivers/evms/evms_bbr.c evms-2002-03-28/drivers/evms/evms_bbr.c
9242
 
--- linux-2002-03-28/drivers/evms/evms_bbr.c    Wed Dec 31 18:00:00 1969
9243
 
+++ evms-2002-03-28/drivers/evms/evms_bbr.c     Wed Mar 27 19:01:30 2002
9244
 
@@ -0,0 +1,1631 @@
9245
 
+/* -*- linux-c -*- */
9246
 
+/*
9247
 
+ *
9248
 
+ *   Copyright (c) International Business Machines  Corp., 2000
9249
 
+ *
9250
 
+ *   This program is free software;  you can redistribute it and/or modify
9251
 
+ *   it under the terms of the GNU General Public License as published by
9252
 
+ *   the Free Software Foundation; either version 2 of the License, or
9253
 
+ *   (at your option) any later version.
9254
 
+ *
9255
 
+ *   This program is distributed in the hope that it will be useful,
9256
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
9257
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
9258
 
+ *   the GNU General Public License for more details.
9259
 
+ *
9260
 
+ *   You should have received a copy of the GNU General Public License
9261
 
+ *   along with this program;  if not, write to the Free Software
9262
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
9263
 
+ */
9264
 
+
9265
 
+/* linux/driver/evms/evms_bbr.c
9266
 
+ *
9267
 
+ * EVMS - Bad Block Relocation (BBR) Feature Plugin
9268
 
+ *
9269
 
+ * BBR feature is designed to remap I/O write failures to another safe location on disk.
9270
 
+ * Note that most disk drives have BBR built into them, this means that our software BBR
9271
 
+ * will be only activated when all hardware BBR replacement sectors have been used.
9272
 
+ */
9273
 
+
9274
 
+
9275
 
+/* #define EVMS_BBR_DEBUG 1 */
9276
 
+
9277
 
+#include <linux/evms/evms_bbr_k.h>
9278
 
+
9279
 
+#define LOG_PREFIX "bbr: "
9280
 
+
9281
 
+static bbr_instance_data_t *bbr_instances = NULL;
9282
 
+
9283
 
+static struct notifier_block bbr_notifier = {
9284
 
+       notifier_call:  bbr_notify_reboot,
9285
 
+       next:           NULL,
9286
 
+       priority:       INT_MAX, /* before any real devices */
9287
 
+};
9288
 
+
9289
 
+// Data pertaining to the I/O thread.
9290
 
+static evms_thread_t   * bbr_io_thread = NULL;
9291
 
+static spinlock_t      bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
9292
 
+static bbr_bh_t                * bbr_io_list = NULL, **bbr_io_list_tail;
9293
 
+
9294
 
+/* plugin function table definition */
9295
 
+static  evms_plugin_function_table_t function_table = {
9296
 
+        discover   : bbr_discover,
9297
 
+        delete     : bbr_delete,
9298
 
+        read       : bbr_read,
9299
 
+        write      : bbr_write,
9300
 
+        init_io    : bbr_init_io,
9301
 
+        ioctl      : bbr_ioctl,
9302
 
+       direct_ioctl : bbr_direct_ioctl
9303
 
+};
9304
 
+
9305
 
+static evms_plugin_header_t plugin_header = {
9306
 
+        id              : SetPluginID(
9307
 
+                IBM_OEM_ID,
9308
 
+                EVMS_FEATURE,
9309
 
+                EVMS_BBR_FEATURE_ID),
9310
 
+        version         : { 1,0,0 },
9311
 
+        required_common_services_version : {
9312
 
+                EVMS_BBR_COMMON_SERVICES_MAJOR,
9313
 
+                EVMS_BBR_COMMON_SERVICES_MINOR,
9314
 
+                EVMS_BBR_COMMON_SERVICES_PATCHLEVEL
9315
 
+        },
9316
 
+        function_table  : &function_table
9317
 
+};
9318
 
+
9319
 
+
9320
 
+/* 
9321
 
+ * Function: le_meta_data_to_cpu
9322
 
+ *     convert bbr meta data from on-disk (LE) format to the native cpu endian format.
9323
 
+ */
9324
 
+void le_meta_data_to_cpu(evms_bbr_metadata_t *md)
9325
 
+{
9326
 
+       md->signature = le32_to_cpu(md->signature);
9327
 
+       md->crc = le32_to_cpu(md->crc);
9328
 
+       md->block_size = le32_to_cpu(md->block_size);
9329
 
+       md->flags = le32_to_cpu(md->flags);
9330
 
+       md->sequence_number = le64_to_cpu(md->sequence_number);
9331
 
+       md->start_sect_bbr_table = le64_to_cpu(md->start_sect_bbr_table);
9332
 
+       md->nr_sects_bbr_table = le64_to_cpu(md->nr_sects_bbr_table);
9333
 
+       md->start_replacement_sect = le64_to_cpu(md->start_replacement_sect);
9334
 
+       md->nr_replacement_blks = le64_to_cpu(md->nr_replacement_blks);
9335
 
+}
9336
 
+
9337
 
+/*
9338
 
+ * Function: le_bbr_table_sector_to_cpu 
9339
 
+ * convert bbr meta data from on-disk (LE) format to the native cpu endian format.
9340
 
+ */
9341
 
+void le_bbr_table_sector_to_cpu(evms_bbr_table_t *p)
9342
 
+{
9343
 
+       int i;
9344
 
+       p->signature = le32_to_cpu(p->signature);
9345
 
+       p->crc = le32_to_cpu(p->crc);
9346
 
+       p->sequence_number = le32_to_cpu(p->sequence_number);
9347
 
+       p->in_use_cnt = le32_to_cpu(p->in_use_cnt);
9348
 
+       for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9349
 
+               p->entries[i].bad_sect = le64_to_cpu(p->entries[i].bad_sect);
9350
 
+               p->entries[i].replacement_sect = le64_to_cpu(p->entries[i].replacement_sect);
9351
 
+       }
9352
 
+}
9353
 
+
9354
 
+/* 
9355
 
+ * Function: cpu_bbr_table_sector_to_le
9356
 
+ *     convert bbr meta data from cpu endian format to on-disk (LE) format
9357
 
+ */
9358
 
+void cpu_bbr_table_sector_to_le(evms_bbr_table_t *p, evms_bbr_table_t *le)
9359
 
+{
9360
 
+       int i;
9361
 
+       le->signature = cpu_to_le32(p->signature);
9362
 
+       le->crc = cpu_to_le32(p->crc);
9363
 
+       le->sequence_number = cpu_to_le32(p->sequence_number);
9364
 
+       le->in_use_cnt = cpu_to_le32(p->in_use_cnt);
9365
 
+       for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9366
 
+               le->entries[i].bad_sect = cpu_to_le64(p->entries[i].bad_sect);
9367
 
+               le->entries[i].replacement_sect = cpu_to_le64(p->entries[i].replacement_sect);
9368
 
+       }
9369
 
+}
9370
 
+
9371
 
+
9372
 
+
9373
 
+static int validate_bbr_table_sector(evms_bbr_table_t *p)
9374
 
+{
9375
 
+       int rc=0;
9376
 
+       int org_crc, final_crc;
9377
 
+
9378
 
+       if (le32_to_cpu(p->signature) != EVMS_BBR_TABLE_SIGNATURE) {
9379
 
+               LOG_ERROR("BBR_TABLE_SIGNATURE don't match! sector has (0x%08X) expected(0x%08X)\n",
9380
 
+                          le32_to_cpu(p->signature), EVMS_BBR_TABLE_SIGNATURE);
9381
 
+               rc = -EINVAL;
9382
 
+       } else {
9383
 
+               if (p->crc) {
9384
 
+                       org_crc = le32_to_cpu(p->crc);
9385
 
+                       p->crc = 0;
9386
 
+                       final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p, sizeof(*p));
9387
 
+                       if (final_crc != org_crc) {
9388
 
+                               LOG_ERROR("CRC failed! sector has (0x%08X) calculated(0x%08X)\n",
9389
 
+                                          org_crc, final_crc);
9390
 
+                               rc = -EINVAL;
9391
 
+                       }
9392
 
+                       p->crc = cpu_to_le32(org_crc);
9393
 
+               } else {
9394
 
+                       LOG_ERROR("bbr table sector has no crc\n");
9395
 
+                       rc = -EINVAL;
9396
 
+               }
9397
 
+       }
9398
 
+       if (rc)
9399
 
+               BBR_DEBUG_PRINT_TABLE_SECTOR(p);
9400
 
+       le_bbr_table_sector_to_cpu(p);
9401
 
+       return rc;
9402
 
+}
9403
 
+
9404
 
+void update_invalid_bbr_table_sector(
9405
 
+       evms_logical_node_t *node,
9406
 
+       evms_bbr_table_t *valid,
9407
 
+       evms_bbr_table_t *invalid,
9408
 
+       evms_sector_t LSN)
9409
 
+{
9410
 
+       int rc;
9411
 
+       evms_bbr_table_t *tmp_bbr_table;
9412
 
+
9413
 
+       /* Correct the invalid bbr table sector */
9414
 
+       memcpy(invalid, valid, sizeof(evms_bbr_table_t));
9415
 
+
9416
 
+       /* Allocate memory for I/O */
9417
 
+       rc = evms_cs_allocate_memory((void**)&tmp_bbr_table,sizeof(evms_bbr_table_t));
9418
 
+       if (!rc) {
9419
 
+               cpu_bbr_table_sector_to_le(valid, tmp_bbr_table);
9420
 
+               LOG_WARNING("%s: updating LSN=%Lu\n", __FUNCTION__, LSN);
9421
 
+               rc = INIT_IO(node, 1, LSN, 1, tmp_bbr_table);
9422
 
+               if (rc) {
9423
 
+                       LOG_ERROR("Could not update bbr table sector, INIT_IO(rc=%d)\n", rc);
9424
 
+               }
9425
 
+               evms_cs_deallocate_memory(tmp_bbr_table);
9426
 
+       }
9427
 
+}
9428
 
+
9429
 
+static u_int32_t validate_bbr_table(
9430
 
+       evms_bbr_metadata_t *md,
9431
 
+       evms_bbr_table_t *p)
9432
 
+{
9433
 
+       u_int32_t i, nr_sects;
9434
 
+
9435
 
+       nr_sects = md->nr_sects_bbr_table;
9436
 
+
9437
 
+       for (i=0; i<nr_sects; i++, p++) {
9438
 
+               if (validate_bbr_table_sector(p))
9439
 
+                       break;
9440
 
+       }
9441
 
+
9442
 
+       if (i != nr_sects) {
9443
 
+               LOG_SERIOUS("stop validation at sector[%d]\n",i);
9444
 
+               nr_sects = i;
9445
 
+       }
9446
 
+       LOG_DEBUG("processed %d bbr table sectors\n", nr_sects);
9447
 
+       return nr_sects;
9448
 
+}
9449
 
+
9450
 
+
9451
 
+static u_int32_t validate_bbr_tables(
9452
 
+       evms_logical_node_t *node,
9453
 
+       evms_bbr_metadata_t *MD1,
9454
 
+       evms_bbr_metadata_t *MD2,
9455
 
+       evms_bbr_table_t *p1,
9456
 
+       evms_bbr_table_t *p2)
9457
 
+{
9458
 
+       u_int32_t i, rc1, rc2, nr_sects;
9459
 
+
9460
 
+       nr_sects = MD1->nr_sects_bbr_table;
9461
 
+       if (nr_sects != MD2->nr_sects_bbr_table) {
9462
 
+               nr_sects = (MD1->nr_sects_bbr_table < MD2->nr_sects_bbr_table) ?
9463
 
+                       MD1->nr_sects_bbr_table : MD2->nr_sects_bbr_table;
9464
 
+               LOG_SERIOUS("number of bbr table sectors don't match, use %d",nr_sects);
9465
 
+       }
9466
 
+
9467
 
+       for (i=0; i<nr_sects; i++, p1++, p2++) {
9468
 
+               rc1 = rc2 = 0;
9469
 
+               if ((rc1 = validate_bbr_table_sector(p1)))
9470
 
+                       LOG_WARNING("%s: MD1 has invalid bbr table sector at (LSN=%Lu)\n",
9471
 
+                                   __FUNCTION__, MD1->start_sect_bbr_table + i);
9472
 
+
9473
 
+               if ((rc2 = validate_bbr_table_sector(p2)))
9474
 
+                       LOG_WARNING("%s: MD2 has invalid bbr table sector at (LSN=%Lu)\n",
9475
 
+                                   __FUNCTION__, MD2->start_sect_bbr_table + i);
9476
 
+               if (rc1 && rc2) {
9477
 
+                       /* cannot continue */
9478
 
+                       break;
9479
 
+               } else {
9480
 
+                       if (rc1 || rc2) {
9481
 
+                               if (rc1) {
9482
 
+                                       update_invalid_bbr_table_sector(node, p2, p1,
9483
 
+                                                                       MD1->start_sect_bbr_table + i);
9484
 
+                               } else {
9485
 
+                                       update_invalid_bbr_table_sector(node, p1, p2,
9486
 
+                                                                       MD2->start_sect_bbr_table + i);
9487
 
+                               }
9488
 
+                               /* skip sequence number check, advance to next bbr table sector */
9489
 
+                               continue;
9490
 
+                       }
9491
 
+               }
9492
 
+
9493
 
+               if (p1->sequence_number != p2->sequence_number) {
9494
 
+                       LOG_WARNING("at bbr table sector idx[%d] MD1 sequence_nr=%u <> MD2 sequence_nr_2=%u\n",
9495
 
+                                   i, p1->sequence_number, p2->sequence_number);
9496
 
+                       if (p1->sequence_number < p2->sequence_number)
9497
 
+                               update_invalid_bbr_table_sector(node, p2, p1,
9498
 
+                                                               MD1->start_sect_bbr_table + i);
9499
 
+                       else
9500
 
+                               update_invalid_bbr_table_sector(node, p1, p2,
9501
 
+                                                               MD2->start_sect_bbr_table + i);
9502
 
+               }
9503
 
+       }
9504
 
+       if (i != nr_sects) {
9505
 
+               LOG_SERIOUS("stop validation at sector[%d]\n",i);
9506
 
+               nr_sects = i;
9507
 
+       }
9508
 
+       LOG_DEBUG("%s processed %d bbr table sectors\n", __FUNCTION__, nr_sects);
9509
 
+       return nr_sects;
9510
 
+}
9511
 
+
9512
 
+#ifdef EVMS_BBR_DEBUG
9513
 
+static void print_meta_data(evms_bbr_metadata_t *md)
9514
 
+{
9515
 
+       LOG_DEBUG("META DATA SECTOR\n sig(0x%08X) crc(0x%08X) block_size=%d\n"
9516
 
+                  "     start_sect_bbr_table=%Lu, nr_sects_bbr_table=%Lu\n"
9517
 
+                  "     start_replacement_sect=%Lu, nr_replacement_blks=%Lu\n",
9518
 
+                  md->signature,
9519
 
+                  md->crc,
9520
 
+                  md->block_size,
9521
 
+                  md->start_sect_bbr_table,
9522
 
+                  md->nr_sects_bbr_table,
9523
 
+                  md->start_replacement_sect,
9524
 
+                  md->nr_replacement_blks);
9525
 
+}
9526
 
+
9527
 
+static void print_bbr_table_sector(evms_bbr_table_t *p)
9528
 
+{
9529
 
+       int i;
9530
 
+       LOG_DEBUG("BBR TABLE SECTOR\n sig(0x%08X) crc(0x%08X) sequence=%d, in_use_cnt=%d\n ENTRIES:",
9531
 
+                  p->signature, p->crc, p->sequence_number, p->in_use_cnt);
9532
 
+       for (i=0; i<EVMS_BBR_ENTRIES_PER_SECT; i++) {
9533
 
+               LOG_DEBUG("  [%d] bad_sect=%Lu, replacement_sect=%Lu\n",
9534
 
+                          i, p->entries[i].bad_sect, p->entries[i].replacement_sect);
9535
 
+       }
9536
 
+}
9537
 
+
9538
 
+#endif
9539
 
+
9540
 
+static int validate_meta_data(evms_bbr_metadata_t *md)
9541
 
+{
9542
 
+       int org_crc, final_crc;
9543
 
+
9544
 
+       BBR_DEBUG_PRINT_META_DATA(md);
9545
 
+
9546
 
+       if (le32_to_cpu(md->signature) != EVMS_BBR_SIGNATURE) {
9547
 
+               LOG_SERIOUS("EVMS_BBR_SIGNATURE don't match, got(0x%08X), expected(0x%08X)\n",
9548
 
+                           le32_to_cpu(md->signature), EVMS_BBR_SIGNATURE);
9549
 
+               return -EINVAL;
9550
 
+       }
9551
 
+
9552
 
+       if (md->crc) {
9553
 
+               org_crc = le32_to_cpu(md->crc);
9554
 
+               md->crc = 0;
9555
 
+               final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md, sizeof(*md));
9556
 
+               if (final_crc != org_crc) {
9557
 
+                       LOG_SERIOUS("metadata has crc(0x%08X), calculated(0x%08X)\n",
9558
 
+                                   org_crc, final_crc);
9559
 
+                       return -EINVAL;
9560
 
+               }
9561
 
+               md->crc = cpu_to_le32(org_crc);
9562
 
+       } else {
9563
 
+               LOG_WARNING("metadata has no crc!!!\n");
9564
 
+       }
9565
 
+
9566
 
+       le_meta_data_to_cpu(md);
9567
 
+       return 0;
9568
 
+}
9569
 
+
9570
 
+/*
9571
 
+ * Function:  bbr_load_meta_data
9572
 
+ *     Load and validate bbr meta data
9573
 
+ */
9574
 
+static int load_meta_data(
9575
 
+       evms_logical_node_t *node,
9576
 
+       evms_sector_t LSN,
9577
 
+       evms_bbr_metadata_t **md,
9578
 
+       evms_bbr_table_t **bbr_table)
9579
 
+{
9580
 
+       int rc;
9581
 
+
9582
 
+       *md = NULL;
9583
 
+       *bbr_table = NULL;
9584
 
+
9585
 
+       if (!LSN) {
9586
 
+               rc = -ENODATA;
9587
 
+               LOG_WARNING("No meta data\n");
9588
 
+               return rc;
9589
 
+       }
9590
 
+
9591
 
+       rc = evms_cs_allocate_memory((void **)md, sizeof(evms_bbr_metadata_t));
9592
 
+       if (!rc) {
9593
 
+               int metadata_hdr_size;
9594
 
+               metadata_hdr_size = evms_cs_size_in_vsectors(sizeof(evms_bbr_metadata_t));
9595
 
+               rc = INIT_IO(node, 0, LSN, metadata_hdr_size, *md);
9596
 
+               if (!rc) {
9597
 
+                       rc = validate_meta_data(*md);
9598
 
+                       if (!rc) {
9599
 
+                               rc = evms_cs_allocate_memory((void**)bbr_table,
9600
 
+                                                            (*md)->nr_sects_bbr_table * EVMS_VSECTOR_SIZE);
9601
 
+                               if (!rc) {
9602
 
+                                       /* load BBR table but do not validate here */
9603
 
+                                       rc = INIT_IO(node, 0,
9604
 
+                                               (*md)->start_sect_bbr_table,
9605
 
+                                               (*md)->nr_sects_bbr_table,
9606
 
+                                               *bbr_table);
9607
 
+                               }
9608
 
+                       }
9609
 
+               }
9610
 
+       }
9611
 
+
9612
 
+       if (rc) {
9613
 
+               LOG_ERROR("%s failed rc=%d.  Free allocated memory!\n",__FUNCTION__,rc);
9614
 
+               if (*md) {
9615
 
+                       evms_cs_deallocate_memory(*md); 
9616
 
+                       *md = NULL;
9617
 
+               }
9618
 
+
9619
 
+               if (*bbr_table) {
9620
 
+                       evms_cs_deallocate_memory(*bbr_table);
9621
 
+                       *bbr_table = NULL;
9622
 
+               }
9623
 
+       }
9624
 
+       return rc;
9625
 
+}
9626
 
+
9627
 
+
9628
 
+/*
9629
 
+ * Function:  bbr_load_feature_data
9630
 
+ *     Load 2 copies meta data
9631
 
+ *     
9632
 
+ */
9633
 
+static int load_feature_data(
9634
 
+       evms_logical_node_t *node,
9635
 
+       bbr_instance_data_t **ID)
9636
 
+{
9637
 
+       int rc = 0;
9638
 
+       int rc1, rc2;
9639
 
+       evms_bbr_metadata_t *md1 = NULL;
9640
 
+       evms_bbr_metadata_t *md2 = NULL;
9641
 
+       evms_bbr_table_t *table1 = NULL;
9642
 
+       evms_bbr_table_t *table2 = NULL;
9643
 
+       u_int64_t lba_table1 = 0;
9644
 
+       u_int64_t lba_table2 = 0;
9645
 
+       u_int32_t nr_sects = 0;
9646
 
+
9647
 
+       *ID = NULL;
9648
 
+
9649
 
+       /* Loads metadata 1 */
9650
 
+       rc1 = load_meta_data(node,
9651
 
+                            node->feature_header->feature_data1_start_lsn,
9652
 
+                            &md1,
9653
 
+                            &table1);
9654
 
+       /* Loads metadata 2 */
9655
 
+       rc2 = load_meta_data(node,
9656
 
+                            node->feature_header->feature_data2_start_lsn,
9657
 
+                            &md2,
9658
 
+                            &table2);
9659
 
+
9660
 
+       if (rc1 && rc2) { /* both copies are bad ?*/
9661
 
+               rc = -ENODATA; /* cannot continue */
9662
 
+       } else {
9663
 
+               if (!rc1 && !rc2) {
9664
 
+                       lba_table1 = md1->start_sect_bbr_table;
9665
 
+                       lba_table2 = md2->start_sect_bbr_table;
9666
 
+                       nr_sects = validate_bbr_tables(node, md1, md2, table1, table2);
9667
 
+                       if (nr_sects == 0) {
9668
 
+                               rc = -ENODATA;
9669
 
+                       }
9670
 
+               } else {
9671
 
+                       /* only 1 copy of meta data */
9672
 
+                       if (rc1) {
9673
 
+                               lba_table2 = md2->start_sect_bbr_table;
9674
 
+                               /* free meta data 1 */
9675
 
+                               evms_cs_deallocate_memory(table1);
9676
 
+                               table1 = table2;
9677
 
+                               table2 = NULL;
9678
 
+                               evms_cs_deallocate_memory(md1);
9679
 
+                               md1 = md2;
9680
 
+                               md2 = NULL;
9681
 
+                       } else {
9682
 
+                               lba_table1 = md1->start_sect_bbr_table;
9683
 
+                       }
9684
 
+                       nr_sects = validate_bbr_table(md1,table1);
9685
 
+                       if (nr_sects == 0) {
9686
 
+                               rc = -ENODATA;
9687
 
+                       }
9688
 
+               }
9689
 
+       }
9690
 
+
9691
 
+       if (!rc && nr_sects) {
9692
 
+               rc = evms_cs_allocate_memory((void **)ID, sizeof(bbr_instance_data_t));
9693
 
+               if (!rc) {
9694
 
+                       /* memset(*ID, 0, sizeof(bbr_instance_data_t)); */ /* not needed */
9695
 
+                       (*ID)->source = node;
9696
 
+                       (*ID)->blksize_in_sects = md1->block_size >> EVMS_VSECTOR_SIZE_SHIFT;
9697
 
+                       (*ID)->remap_root = NULL;
9698
 
+                       (*ID)->lba_table1 = lba_table1;
9699
 
+                       (*ID)->lba_table2 = lba_table2;
9700
 
+                       (*ID)->bbr_table = table1;      /* use only 1 copy of bbr table */
9701
 
+                       (*ID)->nr_sects_bbr_table = nr_sects;
9702
 
+                       if (nr_sects < md1->nr_sects_bbr_table) {
9703
 
+                               LOG_WARNING(" making bbr node read-only\n");
9704
 
+                               (*ID)->flag |= EVMS_VOLUME_READ_ONLY;
9705
 
+                       }
9706
 
+                       (*ID)->nr_replacement_blks = nr_sects * EVMS_BBR_ENTRIES_PER_SECT;
9707
 
+                       (*ID)->start_replacement_sect = md1->start_replacement_sect;
9708
 
+                       atomic_set(&(*ID)->in_use_replacement_blks,0);
9709
 
+                       (*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED;
9710
 
+                       rc = bbr_create_pools(*ID);
9711
 
+                       if (!rc)
9712
 
+                               atomic_set(&(*ID)->in_use_replacement_blks,bbr_table_to_remap_list(*ID));
9713
 
+               }
9714
 
+       }
9715
 
+
9716
 
+       if (!rc) {
9717
 
+               if (!bbr_io_thread) {
9718
 
+                       const char * name1 = "evms_bbr_io";
9719
 
+                       bbr_io_thread = evms_cs_register_thread(bbr_io_handler, NULL, name1);
9720
 
+                       if (!bbr_io_thread) {
9721
 
+                               rc = -EINVAL;
9722
 
+                       }
9723
 
+               }
9724
 
+       }
9725
 
+
9726
 
+       /* if error, free table1 */
9727
 
+       if (rc) {
9728
 
+               if (table1)
9729
 
+                       evms_cs_deallocate_memory(table1);
9730
 
+               if (*ID) {
9731
 
+                       (*ID)->bbr_table = NULL;
9732
 
+                       bbr_free_instance_data(*ID);
9733
 
+                       (*ID) = NULL;
9734
 
+               }
9735
 
+       }
9736
 
+
9737
 
+       /* Will never use md1, md2 and table2 again */
9738
 
+       if (md1) 
9739
 
+               evms_cs_deallocate_memory(md1);
9740
 
+       if (md2)
9741
 
+               evms_cs_deallocate_memory(md2);
9742
 
+       if (table2)
9743
 
+               evms_cs_deallocate_memory(table2);
9744
 
+
9745
 
+       return rc;
9746
 
+}
9747
 
+
9748
 
+#ifdef EVMS_BBR_DEBUG
9749
 
+
9750
 
+/*
9751
 
+ * bbr_print_binary_tree
9752
 
+ *     Traverse the tree and print out each node
9753
 
+ */
9754
 
+void print_binary_tree(bbr_runtime_remap_t *node)
9755
 
+{
9756
 
+       if (node == NULL) {
9757
 
+               return;
9758
 
+       } else {
9759
 
+               LOG_DEFAULT("[%Lu,%Lu]\n",node->remap.bad_sect, node->remap.replacement_sect);
9760
 
+               print_binary_tree(node->left);
9761
 
+               print_binary_tree(node->right);
9762
 
+       }
9763
 
+
9764
 
+}
9765
 
+
9766
 
+static void print_remap_list(bbr_instance_data_t *BBRID)
9767
 
+{
9768
 
+       if (!BBRID->remap_root)
9769
 
+               return;
9770
 
+       LOG_DEFAULT("%s for %s\n", __FUNCTION__, 
9771
 
+                   BBRID->node ? BBRID->node->name : "?");
9772
 
+       print_binary_tree(BBRID->remap_root);
9773
 
+}
9774
 
+
9775
 
+#endif
9776
 
+
9777
 
+#ifdef BBR_USE_RECURSIVE_FUNCTIONS
9778
 
+
9779
 
+/*
9780
 
+ * Recursive function to insert a node into the binary tree
9781
 
+ */
9782
 
+void bbr_binary_tree_insert(bbr_runtime_remap_t **node, bbr_runtime_remap_t *newnode)
9783
 
+{
9784
 
+       if (*node == NULL) {
9785
 
+               newnode->left = newnode->right = NULL;
9786
 
+               *node = newnode;
9787
 
+               return;
9788
 
+       } else {
9789
 
+               if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
9790
 
+                       return bbr_binary_tree_insert(&((*node)->right),newnode);
9791
 
+               else
9792
 
+                       return bbr_binary_tree_insert(&((*node)->left),newnode);
9793
 
+       }
9794
 
+}
9795
 
+
9796
 
+/*
9797
 
+ * Recursive function to search for a node that contains bad_sect = lsn
9798
 
+ */
9799
 
+bbr_runtime_remap_t * bbr_binary_search(bbr_runtime_remap_t *node, evms_sector_t lsn)
9800
 
+{
9801
 
+       if ((node == NULL) || (node->remap.bad_sect == lsn)) {
9802
 
+               return node;
9803
 
+       } else {
9804
 
+               if (lsn > node->remap.bad_sect)
9805
 
+                       return bbr_binary_search(node->right, lsn);
9806
 
+               else
9807
 
+                       return bbr_binary_search(node->left, lsn);
9808
 
+       }
9809
 
+}
9810
 
+
9811
 
+/*
9812
 
+ * Recursive function to detroy the binary tree
9813
 
+ */
9814
 
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *node, bbr_instance_data_t *BBRID)
9815
 
+{
9816
 
+       if (node) {
9817
 
+               bbr_binary_tree_destroy(node->left, BBRID);
9818
 
+               bbr_binary_tree_destroy(node->right, BBRID);
9819
 
+               evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
9820
 
+       }
9821
 
+}
9822
 
+
9823
 
+#else
9824
 
+
9825
 
+/*
9826
 
+ * Insert a node into the binary tree
9827
 
+ */
9828
 
+void bbr_binary_tree_insert(bbr_runtime_remap_t **root, bbr_runtime_remap_t *newnode)
9829
 
+{
9830
 
+       bbr_runtime_remap_t **node = root;
9831
 
+       while (node && *node) {
9832
 
+               if (newnode->remap.bad_sect > (*node)->remap.bad_sect)
9833
 
+                       node = &((*node)->right);
9834
 
+               else
9835
 
+                       node = &((*node)->left);
9836
 
+       }
9837
 
+       
9838
 
+       newnode->left = newnode->right = NULL;
9839
 
+       *node = newnode;
9840
 
+}
9841
 
+
9842
 
+/*
9843
 
+ * Search for a node that contains bad_sect = lsn
9844
 
+ */
9845
 
+bbr_runtime_remap_t * bbr_binary_search(
9846
 
+       bbr_runtime_remap_t *root,
9847
 
+       evms_sector_t lsn)
9848
 
+{
9849
 
+       bbr_runtime_remap_t *node = root;
9850
 
+       while (node) {
9851
 
+               if (node->remap.bad_sect == lsn)
9852
 
+                       break;
9853
 
+               if (lsn > node->remap.bad_sect)
9854
 
+                       node = node->right;
9855
 
+               else
9856
 
+                       node = node->left;
9857
 
+       }
9858
 
+       return node;
9859
 
+}
9860
 
+
9861
 
+/*
9862
 
+ * Destroy the binary tree
9863
 
+ */
9864
 
+void bbr_binary_tree_destroy(bbr_runtime_remap_t *root, bbr_instance_data_t *BBRID)
9865
 
+{
9866
 
+       bbr_runtime_remap_t **link = NULL;
9867
 
+       bbr_runtime_remap_t *node = root;
9868
 
+
9869
 
+       while (node) {
9870
 
+               if (node->left) {
9871
 
+                       link = &(node->left);
9872
 
+                       node = node->left;
9873
 
+                       continue;
9874
 
+               }
9875
 
+               if (node->right) {
9876
 
+                       link = &(node->right);
9877
 
+                       node = node->right;
9878
 
+                       continue;
9879
 
+               }
9880
 
+
9881
 
+               evms_cs_deallocate_to_pool(BBRID->remap_pool, node);
9882
 
+               
9883
 
+               if (node == root) /* if root is deleted, it's done. */
9884
 
+                       break;
9885
 
+               node = root; /* back to root */
9886
 
+               *link = NULL;
9887
 
+       }
9888
 
+}
9889
 
+
9890
 
+#endif
9891
 
+
9892
 
+static void bbr_free_remap(bbr_instance_data_t *BBRID)
9893
 
+{
9894
 
+       unsigned long flags;
9895
 
+       spin_lock_irqsave(&BBRID->bbr_id_lock, flags);  
9896
 
+       bbr_binary_tree_destroy(BBRID->remap_root, BBRID);
9897
 
+       BBRID->remap_root = NULL;
9898
 
+       spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9899
 
+}
9900
 
+
9901
 
+/*
9902
 
+ * bbr_insert_remap_entry
9903
 
+ */
9904
 
+static int bbr_insert_remap_entry(bbr_instance_data_t *BBRID,
9905
 
+                                 evms_bbr_table_entry_t *new_bbr_entry)
9906
 
+{
9907
 
+       bbr_runtime_remap_t *newnode = NULL;
9908
 
+       unsigned long flags;
9909
 
+       int rc;
9910
 
+
9911
 
+       newnode = kmem_cache_alloc (BBRID->remap_pool->cachep, SLAB_ATOMIC);
9912
 
+       if (!newnode) {
9913
 
+               rc = -ENOMEM;
9914
 
+               LOG_SERIOUS("could not allocate from remap pool! (rc=%d)\n", rc);
9915
 
+               return rc;
9916
 
+       }
9917
 
+       newnode->remap.bad_sect  = new_bbr_entry->bad_sect;
9918
 
+       newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
9919
 
+       spin_lock_irqsave(&BBRID->bbr_id_lock, flags);  
9920
 
+       bbr_binary_tree_insert(&BBRID->remap_root,newnode);
9921
 
+       spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9922
 
+       return 0;
9923
 
+}
9924
 
+
9925
 
+/*
9926
 
+ * bbr_table_to_remap_list
9927
 
+ *
9928
 
+ * The on-disk bbr table is sorted by the replacement sector LBA
9929
 
+ * In order to improve run time performance, the in memory remap
9930
 
+ * list must be sorted by the bad sector LBA.
9931
 
+ * This function is called at discovery time to initialize the remap
9932
 
+ * list.  This function assumes that at least one copy of meta data is valid.
9933
 
+ */
9934
 
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID)
9935
 
+{
9936
 
+       u_int32_t in_use_blks = 0;
9937
 
+       int i, j;
9938
 
+       evms_bbr_table_t *p;
9939
 
+       
9940
 
+
9941
 
+       for (i=0, p=BBRID->bbr_table; i<BBRID->nr_sects_bbr_table; i++, p++) {
9942
 
+               if (!p->in_use_cnt)
9943
 
+                       break;
9944
 
+               in_use_blks += p->in_use_cnt;
9945
 
+               for (j=0; j<p->in_use_cnt; j++) {
9946
 
+                       bbr_insert_remap_entry(BBRID, &p->entries[j]);
9947
 
+               }
9948
 
+       }
9949
 
+
9950
 
+
9951
 
+       return in_use_blks;
9952
 
+}
9953
 
+
9954
 
+/*
9955
 
+ * bbr_search_remap_entry
9956
 
+ *
9957
 
+ * Search remap entry for the specified sector.
9958
 
+ * If found, return pointer to evms_bbr_table_entry_t.
9959
 
+ * Otherwise, return NULL.
9960
 
+ */
9961
 
+static evms_bbr_table_entry_t * bbr_search_remap_entry(bbr_instance_data_t *BBRID, evms_sector_t lsn)
9962
 
+{
9963
 
+       bbr_runtime_remap_t *p;
9964
 
+       unsigned long flags;
9965
 
+
9966
 
+       spin_lock_irqsave(&BBRID->bbr_id_lock, flags);
9967
 
+       p = bbr_binary_search(BBRID->remap_root, lsn);
9968
 
+       spin_unlock_irqrestore(&BBRID->bbr_id_lock, flags);
9969
 
+       if (p)
9970
 
+               return (&p->remap);
9971
 
+       else
9972
 
+               return NULL;
9973
 
+}
9974
 
+
9975
 
+/*
9976
 
+ * bbr_remap
9977
 
+ *     if *lsn is in the remap table, return TRUE and modify *lsn
9978
 
+ *     else, return FALSE.
9979
 
+ */
9980
 
+static inline int bbr_remap(bbr_instance_data_t *BBRID,
9981
 
+                    evms_sector_t *lsn)
9982
 
+{
9983
 
+       evms_bbr_table_entry_t *e;
9984
 
+
9985
 
+       if (atomic_read(&BBRID->in_use_replacement_blks) && 
9986
 
+           !(BBRID->flag & BBR_STOP_REMAP) ) {
9987
 
+               e = bbr_search_remap_entry(BBRID,*lsn);
9988
 
+               if (e) {
9989
 
+                       *lsn = e->replacement_sect;
9990
 
+                       LOG_EXTRA("%s replacement sector(LSN=%Lu)\n", __FUNCTION__, *lsn);
9991
 
+                       return TRUE;
9992
 
+               }
9993
 
+       }
9994
 
+       return FALSE;
9995
 
+}
9996
 
+
9997
 
+/*
9998
 
+ * bbr_remap_probe
9999
 
+ *     if any of the sectors [lsn, lsn+nr_sects] in the remap table
10000
 
+ *             return TRUE
10001
 
+ *     else, return FALSE.
10002
 
+ */
10003
 
+static inline int bbr_remap_probe(
10004
 
+       bbr_instance_data_t *BBRID,
10005
 
+       evms_sector_t lsn,
10006
 
+       evms_sector_t nr_sects)
10007
 
+{
10008
 
+       evms_sector_t tmp, cnt;
10009
 
+
10010
 
+       if (atomic_read(&BBRID->in_use_replacement_blks) &&
10011
 
+           !(BBRID->flag & BBR_STOP_REMAP) ) {
10012
 
+               for (cnt = 0, tmp=lsn;
10013
 
+                    cnt < nr_sects;
10014
 
+                    cnt += BBRID->blksize_in_sects, tmp = lsn + cnt) {
10015
 
+                       if (bbr_remap(BBRID,&tmp))
10016
 
+                               return TRUE;
10017
 
+               }
10018
 
+       }
10019
 
+       return FALSE;
10020
 
+}
10021
 
+
10022
 
+static int bbr_create_pools(bbr_instance_data_t *BBRID)
10023
 
+{
10024
 
+       /* create a memory pool for the remap list */
10025
 
+       sprintf(BBRID->remap_pool_name, "BBR_REMAP_%p", BBRID);
10026
 
+       sprintf(BBRID->bh_pool_name, "BBR_BH_%p", BBRID);
10027
 
+       BBRID->remap_pool = evms_cs_create_pool( 
10028
 
+               sizeof (bbr_runtime_remap_t), BBRID->remap_pool_name, NULL, NULL);
10029
 
+       BBRID->bbr_bh_pool = evms_cs_create_pool( 
10030
 
+                       sizeof(bbr_bh_t), BBRID->bh_pool_name, NULL, NULL);
10031
 
+
10032
 
+       if (!BBRID->remap_pool || !BBRID->bbr_bh_pool) {
10033
 
+               BBR_BUG(" Could not allocate pools!");
10034
 
+               bbr_destroy_pools(BBRID);
10035
 
+               return -ENOMEM;
10036
 
+       }
10037
 
+       return 0;
10038
 
+}
10039
 
+
10040
 
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID)
10041
 
+{
10042
 
+       if (BBRID->bbr_bh_pool)
10043
 
+               evms_cs_destroy_pool(BBRID->bbr_bh_pool);
10044
 
+       if (BBRID->remap_pool)
10045
 
+               evms_cs_destroy_pool(BBRID->remap_pool);
10046
 
+}
10047
 
+
10048
 
+static int bbr_discover(evms_logical_node_t **discover_list)
10049
 
+{
10050
 
+        int rc = 0;
10051
 
+        evms_logical_node_t *node, *next_node;
10052
 
+        evms_logical_node_t *bbr_node = NULL;
10053
 
+        bbr_instance_data_t *BBRID;
10054
 
+       
10055
 
+       next_node = *discover_list;
10056
 
+        while(next_node) {
10057
 
+
10058
 
+               node = next_node;
10059
 
+               next_node = node->next;
10060
 
+
10061
 
+                       if ((!node->feature_header) || (node->feature_header->feature_id != plugin_header.id))
10062
 
+                               continue;  // probably a node we just put on the list, skip and go to next.
10063
 
+
10064
 
+               rc = load_feature_data(node, &BBRID);
10065
 
+               if (rc) {
10066
 
+                       /* error loading feature data */
10067
 
+                       /* This node belongs to us, but metadata is invalid,
10068
 
+                        *   remove it from the discovery list
10069
 
+                        *   delete it
10070
 
+                        *   clear error code then continue.
10071
 
+                        * Will consider creating a read only BBR node in the future.
10072
 
+                        */
10073
 
+                       LOG_SERIOUS(" Error in node (%s) with %Lu sectors.\n",
10074
 
+                                   node->name,node->total_vsectors);
10075
 
+                       evms_cs_remove_logical_node_from_list(discover_list, node);
10076
 
+                       DELETE(node);
10077
 
+                       rc = 0;
10078
 
+                       continue;
10079
 
+               }
10080
 
+
10081
 
+               rc = evms_cs_allocate_logical_node(&bbr_node);
10082
 
+               if (!rc) {
10083
 
+                       int bad_blocks;
10084
 
+
10085
 
+                       bbr_node->volume_info = node->volume_info;
10086
 
+                       bbr_node->flags |= node->flags;
10087
 
+                       bbr_node->plugin = &plugin_header;
10088
 
+                       strcpy(bbr_node->name, node->feature_header->object_name);
10089
 
+                       bbr_node->hardsector_size = node->hardsector_size;
10090
 
+                       bbr_node->total_vsectors = node->total_vsectors;
10091
 
+                       bbr_node->total_vsectors -= (u_int64_t)(evms_cs_size_in_vsectors(sizeof(evms_feature_header_t)) * 2);
10092
 
+                       bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data1_size;
10093
 
+                       bbr_node->total_vsectors -= (u_int64_t)node->feature_header->feature_data2_size;
10094
 
+                       bbr_node->block_size = node->block_size;
10095
 
+                       bbr_node->instance_data = BBRID;
10096
 
+                       BBRID->total_vsectors = bbr_node->total_vsectors;
10097
 
+                       BBRID->node = bbr_node;
10098
 
+
10099
 
+                       /* free the feature header */
10100
 
+                       evms_cs_deallocate_memory(node->feature_header);
10101
 
+                       node->feature_header = NULL;
10102
 
+                       evms_cs_remove_logical_node_from_list(discover_list, node);
10103
 
+
10104
 
+                       /* If bad blocks exist, give warning */
10105
 
+                       bad_blocks = atomic_read(&BBRID->in_use_replacement_blks);
10106
 
+                       if (bad_blocks) {
10107
 
+                               BBR_DEBUG_PRINT_REMAP_LIST(BBRID);
10108
 
+                               LOG_WARNING("%s has %d bad blocks\n", BBRID->source->name, bad_blocks);
10109
 
+                               LOG_WARNING("There are %Lu total replacement blocks.\n",
10110
 
+                                           BBRID->nr_replacement_blks);
10111
 
+                               LOG_WARNING("There are %Lu remaining replacement blocks.\n",
10112
 
+                                           BBRID->nr_replacement_blks - bad_blocks);
10113
 
+                       }
10114
 
+
10115
 
+                       evms_cs_add_logical_node_to_list(discover_list, bbr_node);
10116
 
+
10117
 
+                       MOD_INC_USE_COUNT;
10118
 
+                       bbr_list_add(BBRID);
10119
 
+               } else {
10120
 
+                       LOG_SERIOUS("could not allocate logical node! rc=%d\n",rc);
10121
 
+                       bbr_free_instance_data(BBRID);
10122
 
+               }
10123
 
+        } /* end while()*/
10124
 
+        return( rc );
10125
 
+}
10126
 
+
10127
 
+static inline void bbr_list_add(bbr_instance_data_t *BBRID)
10128
 
+{
10129
 
+               BBRID->next = bbr_instances;
10130
 
+       bbr_instances = BBRID;
10131
 
+}
10132
 
+
10133
 
+static void bbr_list_remove(bbr_instance_data_t *BBRID)
10134
 
+{
10135
 
+       bbr_instance_data_t *p;
10136
 
+
10137
 
+       if (!BBRID)
10138
 
+               return;
10139
 
+
10140
 
+       if (BBRID == bbr_instances) {
10141
 
+               bbr_instances = NULL;
10142
 
+               return;
10143
 
+       }
10144
 
+
10145
 
+       p = bbr_instances;
10146
 
+       while (p) {
10147
 
+               if (p->next == BBRID) {
10148
 
+                       p->next = p->next->next;
10149
 
+                       return;
10150
 
+               }
10151
 
+               p = p->next;
10152
 
+       }
10153
 
+}
10154
 
+
10155
 
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name)
10156
 
+{
10157
 
+       bbr_instance_data_t *p = bbr_instances;
10158
 
+
10159
 
+       while (p) {
10160
 
+               if (!strcmp(p->node->name, object_name))
10161
 
+                       break;
10162
 
+               p = p->next;
10163
 
+       }
10164
 
+       return p;
10165
 
+}
10166
 
+
10167
 
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID)
10168
 
+{
10169
 
+       if (BBRID->remap_root)
10170
 
+               bbr_free_remap(BBRID);
10171
 
+       bbr_destroy_pools(BBRID);
10172
 
+       if (BBRID->bbr_table)
10173
 
+               evms_cs_deallocate_memory(BBRID->bbr_table);
10174
 
+       bbr_list_remove(BBRID);
10175
 
+       evms_cs_deallocate_memory(BBRID);
10176
 
+}
10177
 
+
10178
 
+static int bbr_delete(evms_logical_node_t *bbr_node)
10179
 
+{
10180
 
+       bbr_instance_data_t *BBRID;
10181
 
+       int rc;
10182
 
+       
10183
 
+        BBRID = bbr_node->instance_data;
10184
 
+
10185
 
+        rc = DELETE(BBRID->source);
10186
 
+       if (!rc) {
10187
 
+               /* Now cleanup and go away */
10188
 
+               bbr_free_instance_data(BBRID);
10189
 
+               evms_cs_deallocate_logical_node(bbr_node);
10190
 
+               MOD_DEC_USE_COUNT;
10191
 
+               if (!bbr_instances) {
10192
 
+                       if (bbr_io_thread) {
10193
 
+                               evms_cs_unregister_thread(bbr_io_thread);
10194
 
+                               bbr_io_thread = NULL;
10195
 
+                       }
10196
 
+               }
10197
 
+       }
10198
 
+        return rc;
10199
 
+}
10200
 
+
10201
 
+static bbr_bh_t * allocate_bbr_bh(bbr_instance_data_t *BBRID, int rw)
10202
 
+{
10203
 
+       bbr_bh_t * bbr_bh;
10204
 
+
10205
 
+       bbr_bh = evms_cs_allocate_from_pool(BBRID->bbr_bh_pool, TRUE);
10206
 
+       if (bbr_bh) {
10207
 
+               memset(bbr_bh, 0, sizeof(bbr_bh_t));
10208
 
+               bbr_bh->BBRID = BBRID;
10209
 
+               bbr_bh->rw = rw;
10210
 
+               atomic_set(&bbr_bh->waiters, 0);
10211
 
+       }
10212
 
+       else {
10213
 
+               LOG_WARNING("Could not allocate from BBR BH pool!\n");
10214
 
+       }
10215
 
+       return bbr_bh;
10216
 
+}
10217
 
+
10218
 
+static void free_bbr_bh(bbr_bh_t *bbr_bh)
10219
 
+{
10220
 
+       evms_cs_deallocate_to_pool(bbr_bh->BBRID->bbr_bh_pool, bbr_bh);
10221
 
+}
10222
 
+
10223
 
+
10224
 
+/* bbr_io_remap_error
10225
 
+ *
10226
 
+ *     For the requested range, try to write each sector individually. For each
10227
 
+ *     sector that fails, find the next available remap location and write the
10228
 
+ *     data to that new location. Then update the table and write both copies
10229
 
+ *     of the table to disk. Finally, update the in-memory mapping and do any
10230
 
+ *     other necessary bookkeeping.
10231
 
+ */
10232
 
+static int bbr_io_remap_error( bbr_instance_data_t     * BBRID,
10233
 
+                               int                     rw,
10234
 
+                               evms_sector_t           starting_lsn,
10235
 
+                               evms_sector_t           count,
10236
 
+                               char                    * buffer )
10237
 
+{
10238
 
+       evms_sector_t           lsn, new_lsn;
10239
 
+       evms_bbr_table_t        * bbr_table;
10240
 
+       unsigned long           table_sector_index;
10241
 
+       unsigned long           table_sector_offset;
10242
 
+       unsigned long           index;
10243
 
+       int                     rc;
10244
 
+
10245
 
+       if ( rw == READ ) {
10246
 
+               // Nothing can be done about read errors.
10247
 
+               return -EIO;
10248
 
+       }
10249
 
+
10250
 
+       // For each sector in the request.
10251
 
+       for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
10252
 
+               rc = INIT_IO(BBRID->source, rw, starting_lsn + lsn, 1, buffer);
10253
 
+               while (rc) {
10254
 
+                       if ( BBRID->flag & BBR_STOP_REMAP ) {
10255
 
+                               // Can't allow new remaps if the engine told us to stop.
10256
 
+                               LOG_ERROR("object %s: Bad sector (%Lu), but remapping is turned off.\n",
10257
 
+                                       BBRID->node->name, starting_lsn + lsn);
10258
 
+                               return -EIO;
10259
 
+                       }
10260
 
+
10261
 
+                       // Find the next available relocation sector.
10262
 
+                       new_lsn = atomic_read(&BBRID->in_use_replacement_blks);
10263
 
+                       if ( new_lsn >= BBRID->nr_replacement_blks ) {
10264
 
+                               // No more replacement sectors available.
10265
 
+                               return -EIO;
10266
 
+                       }
10267
 
+                       new_lsn += BBRID->start_replacement_sect;
10268
 
+
10269
 
+                       // Write the data to its new location.
10270
 
+                       LOG_WARNING("object %s: Trying to remap bad sector (%Lu) to sector (%Lu)\n",
10271
 
+                                       BBRID->node->name, starting_lsn + lsn, new_lsn);
10272
 
+                       rc = INIT_IO(BBRID->source, rw, new_lsn, 1, buffer);
10273
 
+                       if (rc) {
10274
 
+                               // This replacement sector is bad. Try the next.
10275
 
+                               LOG_ERROR("object %s: Replacement sector (%Lu) is bad. Skipping.\n",
10276
 
+                                       BBRID->node->name, new_lsn);
10277
 
+                               atomic_inc(&BBRID->in_use_replacement_blks);
10278
 
+                               continue;
10279
 
+                       }
10280
 
+
10281
 
+                       // Add this new entry to the on-disk table.
10282
 
+                       table_sector_index = new_lsn - BBRID->start_replacement_sect;
10283
 
+                       table_sector_offset = table_sector_index / EVMS_BBR_ENTRIES_PER_SECT;
10284
 
+                       index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT;
10285
 
+
10286
 
+                       bbr_table = &BBRID->bbr_table[table_sector_offset];
10287
 
+                       bbr_table->entries[index].bad_sect = starting_lsn + lsn;
10288
 
+                       bbr_table->entries[index].replacement_sect = new_lsn;
10289
 
+                       bbr_table->in_use_cnt++;
10290
 
+                       bbr_table->sequence_number++;
10291
 
+                       bbr_table->crc = 0;
10292
 
+                       bbr_table->crc = evms_cs_calculate_crc( EVMS_INITIAL_CRC,
10293
 
+                                                               bbr_table,
10294
 
+                                                               sizeof(evms_bbr_table_t));
10295
 
+
10296
 
+                       // Write the table to disk.
10297
 
+                       cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
10298
 
+                       if ( BBRID->lba_table1 ) {
10299
 
+                               rc = INIT_IO(BBRID->source, WRITE, BBRID->lba_table1 + table_sector_offset, 1, bbr_table);
10300
 
+                       }
10301
 
+                       if ( BBRID->lba_table2 ) {
10302
 
+                               rc |= INIT_IO(BBRID->source, WRITE, BBRID->lba_table2 + table_sector_offset, 1, bbr_table);
10303
 
+                       }
10304
 
+                       le_bbr_table_sector_to_cpu(bbr_table);
10305
 
+
10306
 
+                       if (rc) {
10307
 
+                               // Error writing one of the tables to disk.
10308
 
+                               LOG_ERROR("object %s: Error updating BBR tables on disk.\n",
10309
 
+                                       BBRID->node->name);
10310
 
+                               return rc;
10311
 
+                       }
10312
 
+
10313
 
+                       // Insert a new entry in the remapping binary-tree.
10314
 
+                       rc = bbr_insert_remap_entry(BBRID, &bbr_table->entries[index]);
10315
 
+                       if (rc) {
10316
 
+                               LOG_ERROR("object %s: Error adding new entry to remap tree.\n",
10317
 
+                                       BBRID->node->name);
10318
 
+                               return rc;
10319
 
+                       }
10320
 
+
10321
 
+                       atomic_inc(&BBRID->in_use_replacement_blks);
10322
 
+               }
10323
 
+       }
10324
 
+
10325
 
+       return 0;
10326
 
+}
10327
 
+
10328
 
+
10329
 
+/* bbr_io_process_request
10330
 
+ *
10331
 
+ *     For each sector in this request, check if the sector has already
10332
 
+ *     been remapped. If so, process all previous sectors in the request,
10333
 
+ *     followed by the remapped sector. Then reset the starting lsn and
10334
 
+ *     count, and keep going with the rest of the request as if it were
10335
 
+ *     a whole new request. If any of the INIT_IO's return an error,
10336
 
+ *     call the remapper to relocate the bad sector(s).
10337
 
+ */
10338
 
+static int bbr_io_process_request( bbr_bh_t * bbr_bh )
10339
 
+{
10340
 
+       bbr_instance_data_t     * BBRID = bbr_bh->BBRID;
10341
 
+       evms_sector_t           starting_lsn = bbr_bh->eio.rsector;
10342
 
+       evms_sector_t           count = bbr_bh->eio.rsize;
10343
 
+       evms_sector_t           lsn, remapped_lsn;
10344
 
+       char                    * buffer = bbr_bh->eio.bh->b_data;
10345
 
+       int                     rc = 0, rw = bbr_bh->rw;
10346
 
+
10347
 
+       // For each sector in this request, check if this sector has already
10348
 
+       // been remapped. If so, process all previous sectors in this request,
10349
 
+       // followed by the remapped sector. Then reset the starting lsn and
10350
 
+       // count and keep going with the rest of the request as if it were
10351
 
+       // a whole new request.
10352
 
+       for ( lsn = 0; lsn < count && !(BBRID->flag & BBR_STOP_REMAP); lsn++ ) {
10353
 
+               remapped_lsn = starting_lsn + lsn;
10354
 
+               rc = bbr_remap(BBRID, &remapped_lsn);
10355
 
+               if (rc) {
10356
 
+                       // Process all sectors in the request up to this one.
10357
 
+                       if (lsn > 0) {
10358
 
+                               rc = INIT_IO(BBRID->source, rw, starting_lsn, lsn, buffer);
10359
 
+                               if (rc) {
10360
 
+                                       // If this I/O failed, then one of the
10361
 
+                                       // sectors in this request needs to be
10362
 
+                                       // relocated.
10363
 
+                                       rc = bbr_io_remap_error(BBRID, rw, starting_lsn, lsn, buffer);
10364
 
+                                       if (rc) {
10365
 
+                                               return rc;
10366
 
+                                       }
10367
 
+                               }
10368
 
+                               buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT);
10369
 
+                       }
10370
 
+
10371
 
+                       // Process the remapped sector.
10372
 
+                       rc = INIT_IO(BBRID->source, rw, remapped_lsn, 1, buffer);
10373
 
+                       if (rc) {
10374
 
+                               // BUGBUG - Need more processing if this caused an error.
10375
 
+                               // If this I/O failed, then the existing remap
10376
 
+                               // is now bad, and we need to find a new remap.
10377
 
+                               // Can't use bbr_io_remap_error(), because the
10378
 
+                               // existing map entry needs to be changed, not
10379
 
+                               // added again, and the original table entry
10380
 
+                               // also needs to be changed.
10381
 
+                               return rc;
10382
 
+                       }
10383
 
+
10384
 
+                       buffer          += EVMS_VSECTOR_SIZE;
10385
 
+                       starting_lsn    += (lsn + 1);
10386
 
+                       count           -= (lsn + 1);
10387
 
+                       lsn             = -1;
10388
 
+               }
10389
 
+       }
10390
 
+
10391
 
+       // Check for any remaining sectors after the last split. This could
10392
 
+       // potentially be the whole request, but that should be a rare case
10393
 
+       // because requests should only be processed by the thread if we know
10394
 
+       // an error occurred or they contained one or more remapped sectors.
10395
 
+       if ( count ) {
10396
 
+               rc = INIT_IO(BBRID->source, rw, starting_lsn, count, buffer);
10397
 
+               if (rc) {
10398
 
+                       // If this I/O failed, then one of the sectors in this
10399
 
+                       // request needs to be relocated.
10400
 
+                       rc = bbr_io_remap_error(BBRID, rw, starting_lsn, lsn, buffer);
10401
 
+                       if (rc) {
10402
 
+                               return rc;
10403
 
+                       }
10404
 
+               }
10405
 
+       }
10406
 
+
10407
 
+       return 0;
10408
 
+}
10409
 
+
10410
 
+
10411
 
+/* bbr_io_handler
10412
 
+ *
10413
 
+ *     This is the handler for the bbr_io_thread. It continuously loops,
10414
 
+ *     taking I/O requests off its list and processing them. If nothing
10415
 
+ *     is on the list, the thread goes back to sleep until specifically
10416
 
+ *     woken up.
10417
 
+ *
10418
 
+ *     I/O requests should only be sent to this thread if we know that:
10419
 
+ *     a) the request contains at least one remapped sector.
10420
 
+ *        or
10421
 
+ *     b) the request caused an error on the normal I/O path.
10422
 
+ *     This function uses synchronous I/O, so sending a request to this
10423
 
+ *     thread that doesn't need special processing will cause severe
10424
 
+ *     performance degredation.
10425
 
+ */
10426
 
+static void bbr_io_handler( void * void_data )
10427
 
+{
10428
 
+       bbr_bh_t                * bbr_bh;
10429
 
+       struct buffer_head      * bh;
10430
 
+       unsigned long           flags;
10431
 
+       int                     rc = 0;
10432
 
+
10433
 
+       while (1) {
10434
 
+               // Process bbr_io_list, one entry at a time.
10435
 
+               spin_lock_irqsave(&bbr_io_list_lock, flags);
10436
 
+               bbr_bh = bbr_io_list;
10437
 
+               if (!bbr_bh) {
10438
 
+                       spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10439
 
+                       break; // No more items on the list.
10440
 
+               }
10441
 
+               bbr_io_list = bbr_bh->next;
10442
 
+               spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10443
 
+
10444
 
+               rc = bbr_io_process_request(bbr_bh);
10445
 
+
10446
 
+               // Clean up and complete the original I/O.
10447
 
+               bh = bbr_bh->eio.bh;
10448
 
+               if (bh->b_end_io) {
10449
 
+                       // A normal request that originated from above EVMS.
10450
 
+                       if ( ! (bbr_bh->flag & BBR_BH_USE_EVMS_CALLBACK) ) {
10451
 
+                               evms_cs_volume_request_in_progress(bh->b_dev, -1, NULL);
10452
 
+                       }
10453
 
+                       free_bbr_bh(bbr_bh);
10454
 
+                       bh->b_end_io(bh, rc ? 0 : 1);
10455
 
+               }
10456
 
+               else {
10457
 
+                       // A request that originated from bbr_init_io.
10458
 
+                       bbr_bh->rc = rc;
10459
 
+                       if ( waitqueue_active(&bh->b_wait) ) {
10460
 
+                               atomic_dec(&bbr_bh->waiters);
10461
 
+                               wake_up(&bh->b_wait);
10462
 
+                       }
10463
 
+               }
10464
 
+       }
10465
 
+}
10466
 
+
10467
 
+
10468
 
+/* bbr_schedule_io
10469
 
+ *
10470
 
+ *     Place the specified bbr_bh on the thread's processing list.
10471
 
+ */
10472
 
+static void bbr_schedule_io( bbr_bh_t * bbr_bh )
10473
 
+{
10474
 
+       unsigned long flags;
10475
 
+
10476
 
+       spin_lock_irqsave(&bbr_io_list_lock, flags);
10477
 
+       if (bbr_io_list == NULL)
10478
 
+               bbr_io_list_tail = &bbr_io_list;
10479
 
+       *bbr_io_list_tail = bbr_bh;
10480
 
+       bbr_io_list_tail = &bbr_bh->next;
10481
 
+       bbr_bh->next = NULL;
10482
 
+       spin_unlock_irqrestore(&bbr_io_list_lock, flags);
10483
 
+       evms_cs_wakeup_thread(bbr_io_thread);
10484
 
+}
10485
 
+
10486
 
+
10487
 
+/* bbr_read
10488
 
+ *
10489
 
+ *     If there are any remapped sectors on this object, send this request over
10490
 
+ *     to the thread for processing. Otherwise send it down the stack normally.
10491
 
+ */
10492
 
+static void bbr_read(  evms_logical_node_t     * bbr_node,
10493
 
+                       eio_t                   * eio )
10494
 
+{
10495
 
+        bbr_instance_data_t    * BBRID = bbr_node->instance_data;
10496
 
+       bbr_bh_t                * bbr_bh;
10497
 
+
10498
 
+       if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors ) {
10499
 
+               if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10500
 
+                    BBRID->flag & BBR_STOP_REMAP ||
10501
 
+                    ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
10502
 
+                       R_IO(BBRID->source, eio);
10503
 
+               }
10504
 
+               else {
10505
 
+                       bbr_bh = allocate_bbr_bh(BBRID, READ);
10506
 
+                       if (bbr_bh) {
10507
 
+                               bbr_bh->eio = *eio;
10508
 
+                               evms_cs_volume_request_in_progress(bbr_bh->eio.bh->b_dev, +1, NULL);
10509
 
+                               bbr_schedule_io(bbr_bh);
10510
 
+                       }
10511
 
+                       else {
10512
 
+                               // Can't get memory to track the I/O.
10513
 
+                               EVMS_IO_ERROR(eio);
10514
 
+                       }
10515
 
+               }
10516
 
+       }
10517
 
+       else {
10518
 
+               // Request is off the end of the object.
10519
 
+               EVMS_IO_ERROR(eio);
10520
 
+       }
10521
 
+}
10522
 
+
10523
 
+
10524
 
+/* bbr_write_callback
10525
 
+ *
10526
 
+ *     This is the callback for normal write requests. Check for an error
10527
 
+ *     during the I/O, and send to the thread for processing if necessary.
10528
 
+ */
10529
 
+static void bbr_write_callback(        bbr_bh_t                * bbr_bh,
10530
 
+                               struct buffer_head      * bh,
10531
 
+                               int                     uptodate,
10532
 
+                               int                     * redrive )
10533
 
+{
10534
 
+       if ( ! uptodate &&
10535
 
+            ! (bbr_bh->BBRID->flag & BBR_STOP_REMAP) ) {
10536
 
+               LOG_ERROR("object %s: Write failure on sector (%Lu). Scheduling for retry.\n",
10537
 
+                       bbr_bh->BBRID->node->name, bbr_bh->eio.rsector);
10538
 
+               bbr_schedule_io(bbr_bh);
10539
 
+               *redrive = TRUE;
10540
 
+       }
10541
 
+       else {
10542
 
+               free_bbr_bh(bbr_bh);
10543
 
+       }
10544
 
+}
10545
 
+
10546
 
+
10547
 
+/* bbr_write
10548
 
+ *
10549
 
+ *     If there are any remapped sectors on this object, send the request over
10550
 
+ *     to the thread for processing. Otherwise, register for callback
10551
 
+ *     notification, and send the request down normally.
10552
 
+ */
10553
 
+static void bbr_write(evms_logical_node_t *bbr_node, eio_t *eio)
10554
 
+{
10555
 
+        bbr_instance_data_t    * BBRID = bbr_node->instance_data;
10556
 
+       bbr_bh_t                * bbr_bh;
10557
 
+
10558
 
+       if ( eio->rsector + eio->rsize <= bbr_node->total_vsectors &&
10559
 
+            ! (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
10560
 
+               bbr_bh = allocate_bbr_bh(BBRID, WRITE);
10561
 
+               if (bbr_bh) {
10562
 
+                       bbr_bh->eio = *eio;
10563
 
+
10564
 
+                       if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10565
 
+                            BBRID->flag & BBR_STOP_REMAP ||
10566
 
+                            ! bbr_remap_probe(BBRID, eio->rsector, eio->rsize) ) {
10567
 
+                               bbr_bh->flag |= BBR_BH_USE_EVMS_CALLBACK;
10568
 
+                               evms_cs_register_for_end_io_notification(bbr_bh, eio->bh, bbr_write_callback);
10569
 
+                               W_IO(BBRID->source, eio);
10570
 
+                       }
10571
 
+                       else {
10572
 
+                               evms_cs_volume_request_in_progress(eio->bh->b_dev, +1, NULL);
10573
 
+                               bbr_schedule_io(bbr_bh);
10574
 
+                       }
10575
 
+               }
10576
 
+               else {
10577
 
+                       // Can't get memory to track the I/O.
10578
 
+                       EVMS_IO_ERROR(eio);
10579
 
+               }
10580
 
+       }
10581
 
+       else {
10582
 
+               // Request is off the end of the object, or this
10583
 
+               // is a read-only object.
10584
 
+               EVMS_IO_ERROR(eio);
10585
 
+       }
10586
 
+}
10587
 
+
10588
 
+
10589
 
+/********************************************************/
10590
 
+/* Required Plugin Function Table Entry Point:          */
10591
 
+/*      Init_io function                                */
10592
 
+/********************************************************/
10593
 
+
10594
 
+
10595
 
+static int bbr_init_io_schedule_io(    bbr_instance_data_t     * BBRID,
10596
 
+                                       int                     rw,
10597
 
+                                       evms_sector_t           lsn,
10598
 
+                                       evms_sector_t           count,
10599
 
+                                       void                    * buffer )
10600
 
+{
10601
 
+       bbr_bh_t                * bbr_bh;
10602
 
+       struct buffer_head      * bh;
10603
 
+       int                     rc = 0;
10604
 
+
10605
 
+       if ( rw == WRITE ) {
10606
 
+               LOG_ERROR("object %s: init_io write failure (sector %Lu: count %Lu). Scheduling for retry.\n",
10607
 
+                       BBRID->node->name, lsn, count);
10608
 
+               bbr_bh = allocate_bbr_bh(BBRID,rw);
10609
 
+               if (bbr_bh) {
10610
 
+                       bbr_bh->eio.rsector = lsn;
10611
 
+                       bbr_bh->eio.rsize = count;
10612
 
+       
10613
 
+                       bh = evms_cs_allocate_from_pool(evms_bh_pool, TRUE);
10614
 
+                       if (bh) {
10615
 
+                               bbr_bh->eio.bh = bh;
10616
 
+
10617
 
+                               memset(bh, 0, sizeof(*bh));
10618
 
+                               init_waitqueue_head(&bh->b_wait);
10619
 
+                               bh->b_data = buffer;
10620
 
+                               bh->b_end_io = NULL;
10621
 
+       
10622
 
+                               atomic_inc(&bbr_bh->waiters);
10623
 
+                               bbr_schedule_io(bbr_bh);
10624
 
+                               wait_event(bh->b_wait, (atomic_read(&bbr_bh->waiters) == 0));
10625
 
+
10626
 
+                               rc = bbr_bh->rc;
10627
 
+
10628
 
+                               evms_cs_deallocate_to_pool(evms_bh_pool, bh);
10629
 
+                       }
10630
 
+                       else {
10631
 
+                               // Couldn't get buffer head.
10632
 
+                               rc = -ENOMEM;
10633
 
+                       }
10634
 
+
10635
 
+                       free_bbr_bh(bbr_bh);
10636
 
+               }
10637
 
+               else {
10638
 
+                       // Couldn't get bbr_bh.
10639
 
+                       rc = -ENOMEM;
10640
 
+               }
10641
 
+       }
10642
 
+       else {
10643
 
+               // Nothing can be done about read failures.
10644
 
+               rc = -EIO;
10645
 
+       }
10646
 
+
10647
 
+       return 0;
10648
 
+}
10649
 
+
10650
 
+static int bbr_init_io(        evms_logical_node_t     * bbr_node,
10651
 
+                       int                     io_flag,
10652
 
+                       evms_sector_t           start_lsn,
10653
 
+                       evms_sector_t           count,
10654
 
+                       void                    * buffer )
10655
 
+{
10656
 
+        bbr_instance_data_t    * BBRID;
10657
 
+       evms_sector_t           lsn;
10658
 
+       int                     rc = 0;
10659
 
+
10660
 
+       if ( start_lsn + count <= bbr_node->total_vsectors ) {
10661
 
+               BBRID = bbr_node->instance_data;
10662
 
+
10663
 
+               if ( io_flag == WRITE && (BBRID->flag & EVMS_VOLUME_READ_ONLY) ) {
10664
 
+                       // Can't write to a read-only object.
10665
 
+                       rc = -EINVAL;
10666
 
+               }
10667
 
+               else {
10668
 
+                       if ( BBRID->flag & BBR_STOP_REMAP ) {
10669
 
+                               // Can't remap at all.
10670
 
+                               rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
10671
 
+                       }
10672
 
+                       else if ( atomic_read(&BBRID->in_use_replacement_blks) == 0 ||
10673
 
+                                 ! bbr_remap_probe(BBRID, start_lsn, count) ) {
10674
 
+                               // Normal case (no existing remaps)
10675
 
+                               rc = INIT_IO(BBRID->source, io_flag, start_lsn, count, buffer);
10676
 
+                               if (rc) {
10677
 
+                                       // Init_io error. Send request over to
10678
 
+                                       // thread for further processing.
10679
 
+                                       rc = bbr_init_io_schedule_io(BBRID, io_flag, start_lsn, count, buffer);
10680
 
+                               }
10681
 
+                       }
10682
 
+                       else {
10683
 
+                               // At least one sector in this request needs to
10684
 
+                               // be remapped. Test and send each one down
10685
 
+                               // individually.
10686
 
+                               for ( lsn = start_lsn; lsn < start_lsn + count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
10687
 
+                                       bbr_remap(BBRID, &lsn);
10688
 
+                                       rc = INIT_IO(BBRID->source, io_flag, lsn, 1, buffer);
10689
 
+                                       if (rc) {
10690
 
+                                               // Init_io error. Send request
10691
 
+                                               // to thread for processing.
10692
 
+                                               rc = bbr_init_io_schedule_io(BBRID, io_flag, lsn, 1, buffer);
10693
 
+                                               if (rc) {
10694
 
+                                                       break;
10695
 
+                                               }
10696
 
+                                       }
10697
 
+                               }
10698
 
+                       }
10699
 
+               }
10700
 
+       }
10701
 
+       else {
10702
 
+               // Request is off the end of the object.
10703
 
+               rc = -EINVAL;
10704
 
+       }
10705
 
+
10706
 
+       return rc;
10707
 
+}
10708
 
+
10709
 
+
10710
 
+/********************************************************/
10711
 
+/* Required Plugin Function Table Entry Point:          */
10712
 
+/*      IOCTL function                                  */
10713
 
+/********************************************************/
10714
 
+
10715
 
+static int bbr_direct_ioctl_sector_io( bbr_instance_data_t     * BBRID,
10716
 
+                                       evms_notify_bbr_t       * ioctl_arg )
10717
 
+{
10718
 
+       char            * buffer, *user_buffer;
10719
 
+       evms_sector_t   lsn;
10720
 
+       int             rc = 0;
10721
 
+
10722
 
+       if ( evms_cs_allocate_memory((void**)&buffer, EVMS_VSECTOR_SIZE) ) {
10723
 
+               return -ENOMEM;
10724
 
+       }
10725
 
+
10726
 
+       user_buffer = (char*)ioctl_arg->buffer;
10727
 
+
10728
 
+       for ( lsn = 0; lsn < ioctl_arg->nr_sect; lsn++, user_buffer += EVMS_VSECTOR_SIZE ) {
10729
 
+               if ( ioctl_arg->rw == WRITE ) {
10730
 
+                       if ( copy_from_user(buffer, user_buffer, EVMS_VSECTOR_SIZE) ) {
10731
 
+                               rc = -EFAULT;
10732
 
+                               break;
10733
 
+                       }
10734
 
+               }
10735
 
+
10736
 
+               rc = bbr_init_io(BBRID->node, ioctl_arg->rw, ioctl_arg->start_sect + lsn, 1, buffer);
10737
 
+               if (rc) {
10738
 
+                       break;
10739
 
+               }
10740
 
+
10741
 
+               if ( ioctl_arg->rw == READ ) {
10742
 
+                       if ( copy_to_user(user_buffer, buffer, EVMS_VSECTOR_SIZE) ) {
10743
 
+                               rc = -EFAULT;
10744
 
+                               break;
10745
 
+                       }
10746
 
+               }
10747
 
+       }
10748
 
+
10749
 
+       evms_cs_deallocate_memory(buffer);
10750
 
+       return rc;
10751
 
+}
10752
 
+
10753
 
+static int bbr_direct_ioctl (
10754
 
+       struct inode *inode,
10755
 
+       struct file *file,
10756
 
+       unsigned int cmd,
10757
 
+       unsigned long arg)
10758
 
+{
10759
 
+       int rc = 0;
10760
 
+       bbr_instance_data_t *BBRID;
10761
 
+       evms_plugin_ioctl_t argument;
10762
 
+       evms_notify_bbr_t ioctl_arg, *usr_ioctl_arg;
10763
 
+
10764
 
+       if ( copy_from_user(&argument, (evms_plugin_ioctl_t *)arg, sizeof(argument)) ) {
10765
 
+               return -EFAULT;
10766
 
+       }
10767
 
+
10768
 
+       if ( argument.feature_id != plugin_header.id ) {
10769
 
+               return -EINVAL;
10770
 
+       }
10771
 
+
10772
 
+       usr_ioctl_arg = (evms_notify_bbr_t*)argument.feature_ioctl_data;
10773
 
+       if ( copy_from_user(&ioctl_arg, usr_ioctl_arg, sizeof(ioctl_arg)) ) {
10774
 
+               rc = -EFAULT;
10775
 
+       }
10776
 
+       else {
10777
 
+               BBRID = bbr_find_instance_data(ioctl_arg.object_name);
10778
 
+               if (!BBRID)
10779
 
+                       rc = -ENODEV;
10780
 
+
10781
 
+               if (!rc) {
10782
 
+
10783
 
+                       switch(argument.feature_command) {
10784
 
+
10785
 
+                       case BBR_STOP_REMAP_CMD:
10786
 
+                               BBRID->flag |= BBR_STOP_REMAP;
10787
 
+                               // Fall through.
10788
 
+
10789
 
+                       case BBR_GET_INFO_CMD:
10790
 
+                               ioctl_arg.count = atomic_read(&BBRID->in_use_replacement_blks);
10791
 
+                               if ( copy_to_user(&usr_ioctl_arg->count,
10792
 
+                                               &ioctl_arg.count,
10793
 
+                                               sizeof(usr_ioctl_arg->count)) ) {
10794
 
+                                       rc = -EFAULT;
10795
 
+                               }
10796
 
+                               break;
10797
 
+
10798
 
+                       case BBR_SECTOR_IO_CMD:
10799
 
+                               rc = bbr_direct_ioctl_sector_io(BBRID, &ioctl_arg);
10800
 
+                               break;
10801
 
+
10802
 
+                       default:
10803
 
+                               rc = -ENOSYS;
10804
 
+                       }
10805
 
+               }
10806
 
+       }
10807
 
+
10808
 
+       argument.status = rc;
10809
 
+       copy_to_user((evms_plugin_ioctl_t*)arg, &argument, sizeof(argument));
10810
 
+       return rc;
10811
 
+}
10812
 
+
10813
 
+static int bbr_ioctl (evms_logical_node_t *bbr_node,
10814
 
+                     struct inode *inode,
10815
 
+                     struct file *file,
10816
 
+                     unsigned int cmd,
10817
 
+                     unsigned long arg)
10818
 
+{
10819
 
+        bbr_instance_data_t *BBRID;
10820
 
+        int rc;
10821
 
+
10822
 
+        rc = 0;
10823
 
+        BBRID = bbr_node->instance_data;
10824
 
+        if (!inode)
10825
 
+                return -EINVAL;
10826
 
+        switch (cmd) {
10827
 
+               case EVMS_PLUGIN_IOCTL:
10828
 
+                       rc = bbr_direct_ioctl(inode,file,cmd,arg);
10829
 
+                       break;
10830
 
+               case EVMS_GET_BMAP:
10831
 
+               {
10832
 
+                       evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
10833
 
+       
10834
 
+                       bbr_remap(BBRID, &bmap->rsector);
10835
 
+                       /* fall thru */
10836
 
+               }
10837
 
+       
10838
 
+               default:
10839
 
+                       rc = IOCTL(BBRID->source, inode, file, cmd, arg);
10840
 
+        }
10841
 
+        return rc;
10842
 
+}
10843
 
+
10844
 
+int bbr_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
10845
 
+{
10846
 
+       if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
10847
 
+
10848
 
+               LOG_DEFAULT("%s unregister BBR threads\n", __FUNCTION__);
10849
 
+               if (bbr_io_thread)
10850
 
+                       evms_cs_unregister_thread(bbr_io_thread);
10851
 
+               mdelay(1000*1); /* delay some */
10852
 
+       }
10853
 
+       return NOTIFY_DONE;
10854
 
+}
10855
 
+
10856
 
+static int __init bbr_init(void)
10857
 
+{
10858
 
+       /* Register for reboot notification */
10859
 
+       register_reboot_notifier(&bbr_notifier);
10860
 
+
10861
 
+        return evms_cs_register_plugin(&plugin_header);
10862
 
+}
10863
 
+
10864
 
+static void __exit bbr_exit(void)
10865
 
+{
10866
 
+       evms_cs_unregister_plugin(&plugin_header);
10867
 
+}
10868
 
+
10869
 
+
10870
 
+module_init(bbr_init);
10871
 
+module_exit(bbr_exit);
10872
 
+#ifdef MODULE_LICENSE
10873
 
+MODULE_LICENSE("GPL");
10874
 
+#endif
10875
 
+
10876
 
diff -Naur linux-2002-03-28/drivers/evms/evms_drivelink.c evms-2002-03-28/drivers/evms/evms_drivelink.c
10877
 
--- linux-2002-03-28/drivers/evms/evms_drivelink.c      Wed Dec 31 18:00:00 1969
10878
 
+++ evms-2002-03-28/drivers/evms/evms_drivelink.c       Wed Mar 27 15:51:36 2002
10879
 
@@ -0,0 +1,1107 @@
10880
 
+/* -*- linux-c -*- */
10881
 
+
10882
 
+/*
10883
 
+ *
10884
 
+ *
10885
 
+ *   Copyright (c) International Business Machines  Corp., 2000
10886
 
+ *
10887
 
+ *   This program is free software;  you can redistribute it and/or modify
10888
 
+ *   it under the terms of the GNU General Public License as published by
10889
 
+ *   the Free Software Foundation; either version 2 of the License, or
10890
 
+ *   (at your option) any later version.
10891
 
+ *
10892
 
+ *   This program is distributed in the hope that it will be useful,
10893
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
10894
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
10895
 
+ *   the GNU General Public License for more details.
10896
 
+ *
10897
 
+ *   You should have received a copy of the GNU General Public License
10898
 
+ *   along with this program;  if not, write to the Free Software
10899
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
10900
 
+ *
10901
 
+ *
10902
 
+ */
10903
 
+/*
10904
 
+ * linux/drivers/evms/drvlink.c
10905
 
+
10906
 
+ *
10907
 
+ * EVMS Drive Linking Feature.
10908
 
+ *
10909
 
+ * This feature provides the ability to link multiple storage objects
10910
 
+ * together as a single virtual storage object.
10911
 
+ *
10912
 
+ */
10913
 
+
10914
 
+#include <linux/module.h>
10915
 
+#include <linux/kernel.h>
10916
 
+#include <linux/config.h>
10917
 
+#include <linux/genhd.h>
10918
 
+#include <linux/blk.h>
10919
 
+#include <linux/evms/evms_kernel.h>
10920
 
+#include <linux/evms/evms_drivelink.h>
10921
 
+#include <asm/uaccess.h>
10922
 
+
10923
 
+#define LOG_PREFIX "drivelink: "
10924
 
+
10925
 
+/* prototypes for mandatory plugin interface functions */
10926
 
+static int  drivelink_discover(evms_logical_node_t **);
10927
 
+static int  drivelink_delete(evms_logical_node_t *);
10928
 
+static void drivelink_read(evms_logical_node_t *, eio_t *);
10929
 
+static void drivelink_write(evms_logical_node_t *, eio_t *);
10930
 
+static int  drivelink_ioctl(evms_logical_node_t *, 
10931
 
+                           struct inode *, 
10932
 
+                            struct file *, 
10933
 
+                           unsigned int, 
10934
 
+                           unsigned long);
10935
 
+static int  drivelink_init_io(evms_logical_node_t *, 
10936
 
+                             int, 
10937
 
+                             evms_sector_t,
10938
 
+                              evms_sector_t, 
10939
 
+                             void *);
10940
 
+
10941
 
+/* plugin function table definition */
10942
 
+static evms_plugin_function_table_t function_table = {
10943
 
+        discover: &drivelink_discover,
10944
 
+        delete  : &drivelink_delete,
10945
 
+        read    : &drivelink_read,
10946
 
+        write   : &drivelink_write,
10947
 
+        init_io : &drivelink_init_io,
10948
 
+        ioctl   : &drivelink_ioctl
10949
 
+};
10950
 
+
10951
 
+/* plugin header definition */
10952
 
+static evms_plugin_header_t plugin_header = {
10953
 
+        id              : SetPluginID(
10954
 
+                IBM_OEM_ID,
10955
 
+                EVMS_FEATURE,                   //FEATURE class
10956
 
+                EVMS_DRIVELINK_FEATURE_ID),     // unique id for feature
10957
 
+        version         : { 
10958
 
+                major      : EVMS_DRIVELINK_VERSION_MAJOR,
10959
 
+                minor      : EVMS_DRIVELINK_VERSION_MINOR,
10960
 
+                patchlevel : EVMS_DRIVELINK_VERSION_PATCHLEVEL 
10961
 
+        },
10962
 
+        required_common_services_version : {
10963
 
+                major      : 0,
10964
 
+                minor      : 5,
10965
 
+                patchlevel : 0
10966
 
+        },
10967
 
+        function_table  : &function_table       // function table for this plugin
10968
 
+};
10969
 
+
10970
 
+/********************************************************/
10971
 
+/* Required Plugin Function Table Entry Point:          */
10972
 
+/*      Discover function & Support routines            */
10973
 
+/********************************************************/
10974
 
+
10975
 
+
10976
 
+/* 
10977
 
+ *
10978
 
+ * convert feature data from on-disk (Little Endian) format
10979
 
+ * to the native cpu endian format.
10980
 
+ */
10981
 
+static void
10982
 
+le_feature_data_to_cpu(evms_drivelink_metadata_t *DLMD)
10983
 
+{
10984
 
+       int i;
10985
 
+
10986
 
+       DLMD->signature = le32_to_cpu(DLMD->signature);
10987
 
+       DLMD->crc = le32_to_cpu(DLMD->crc);
10988
 
+       DLMD->version.major = le32_to_cpu(DLMD->version.major);
10989
 
+       DLMD->version.minor = le32_to_cpu(DLMD->version.minor);
10990
 
+       DLMD->version.patchlevel = le32_to_cpu(DLMD->version.patchlevel);
10991
 
+       DLMD->flags = le32_to_cpu(DLMD->flags);
10992
 
+       DLMD->sequence_number = le64_to_cpu(DLMD->sequence_number);
10993
 
+       DLMD->child_serial_number = le64_to_cpu(DLMD->child_serial_number);
10994
 
+       DLMD->parent_serial_number = le64_to_cpu(DLMD->parent_serial_number);
10995
 
+       DLMD->child_count = le64_to_cpu(DLMD->child_count);
10996
 
+       for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) {
10997
 
+               evms_dl_ordering_table_entry_t *child_entry;
10998
 
+
10999
 
+               child_entry = &DLMD->ordering_table[i];
11000
 
+               child_entry->child_serial_number = 
11001
 
+                       le64_to_cpu(child_entry->child_serial_number);
11002
 
+               child_entry->child_vsize = 
11003
 
+                       le64_to_cpu(child_entry->child_vsize);
11004
 
+       }
11005
 
+}
11006
 
+
11007
 
+static int 
11008
 
+load_feature_data(
11009
 
+       evms_logical_node_t *node, 
11010
 
+        evms_drivelink_metadata_t **DLMD)
11011
 
+{
11012
 
+        int i, rc = 0, rc_array[2] = {0,0}, size_in_bytes;
11013
 
+        u_int64_t real_metadata_size, feature_data_size;
11014
 
+       u_int64_t starting_sector;
11015
 
+       evms_drivelink_metadata_t *cur_DLMD, *DLMD1, *DLMD2;
11016
 
+       char *location_name;
11017
 
+
11018
 
+       /* verify the feature metadata size from the  */
11019
 
+       /* feature header agrees with the real size   */
11020
 
+       /* of the current metadata structure.         */
11021
 
+       real_metadata_size = evms_cs_size_in_vsectors(sizeof(**DLMD));
11022
 
+
11023
 
+        /* allocate a buffer large enough to hold all */
11024
 
+        /* sectors containing the feature's metadata  */
11025
 
+        size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE;
11026
 
+        rc = evms_cs_allocate_memory((void **)&DLMD1, size_in_bytes);
11027
 
+        if (!rc) {
11028
 
+               rc = evms_cs_allocate_memory((void **)&DLMD2, size_in_bytes);
11029
 
+               if (rc) evms_cs_deallocate_memory(DLMD1);
11030
 
+       }
11031
 
+       if (!rc) {
11032
 
+               for (i = 0; i < 2; i++) {
11033
 
+                       if (i == 0) {
11034
 
+                               starting_sector = node->feature_header->feature_data1_start_lsn;
11035
 
+                               feature_data_size = node->feature_header->feature_data1_size;
11036
 
+                               cur_DLMD = DLMD1;
11037
 
+                               location_name = evms_primary_string;
11038
 
+                       } else {
11039
 
+                               starting_sector = node->feature_header->feature_data2_start_lsn;
11040
 
+                               feature_data_size = node->feature_header->feature_data2_size;
11041
 
+                               cur_DLMD = DLMD2;
11042
 
+                               location_name = evms_secondary_string;
11043
 
+                       }
11044
 
+                       /* check that real metadata size matches the  */
11045
 
+                       /* feature data size                          */
11046
 
+                       if (real_metadata_size != feature_data_size) {
11047
 
+                               LOG_ERROR("%s feature data size(%Lu bytes) doesn't match expected size(%Lu bytes).\n",
11048
 
+                                          location_name,
11049
 
+                                          feature_data_size << EVMS_VSECTOR_SIZE_SHIFT,
11050
 
+                                          real_metadata_size << EVMS_VSECTOR_SIZE_SHIFT);
11051
 
+                               rc = -EINVAL;
11052
 
+                               rc_array[i] = rc;
11053
 
+                               continue;
11054
 
+                       }
11055
 
+                       /* load the node's feature data */
11056
 
+                       rc = INIT_IO(node, 
11057
 
+                                    0, 
11058
 
+                                    starting_sector,
11059
 
+                                    feature_data_size, 
11060
 
+                                    cur_DLMD);
11061
 
+                       if (rc) {
11062
 
+                               LOG_ERROR("error(%d) probing for %s feature data at sector(%Ld) on '%s'.\n",
11063
 
+                                         rc, 
11064
 
+                                         location_name,
11065
 
+                                         starting_sector,
11066
 
+                                         node->name);
11067
 
+                               rc_array[i] = rc;
11068
 
+                               continue;
11069
 
+                       }
11070
 
+                       /* check for valid metadata signature */
11071
 
+                       if (le32_to_cpu(cur_DLMD->signature) != EVMS_DRIVELINK_SIGNATURE) {
11072
 
+                               rc = -ENODATA;
11073
 
+                               LOG_SERIOUS("error(%d) invalid signature in %s feature data on '%s'\n",
11074
 
+                                          rc, 
11075
 
+                                          location_name,
11076
 
+                                          node->name);
11077
 
+                               rc_array[i] = rc;
11078
 
+                               continue;
11079
 
+                       }
11080
 
+                       /* validate feature data CRC */
11081
 
+                       if (cur_DLMD->crc != EVMS_MAGIC_CRC) {
11082
 
+                               int org_crc, final_crc;
11083
 
+                               org_crc = le32_to_cpu(cur_DLMD->crc);
11084
 
+                               cur_DLMD->crc = 0;
11085
 
+                               final_crc = evms_cs_calculate_crc(
11086
 
+                                       EVMS_INITIAL_CRC,
11087
 
+                                       cur_DLMD, sizeof(*cur_DLMD));
11088
 
+                               if (final_crc != org_crc) {
11089
 
+                                       LOG_ERROR("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n",
11090
 
+                                                org_crc, final_crc, 
11091
 
+                                                location_name,
11092
 
+                                                node->name);
11093
 
+                                       rc = -EINVAL;
11094
 
+                                       rc_array[i] = rc;
11095
 
+                                       continue;
11096
 
+                               }
11097
 
+                       } else {
11098
 
+                               LOG_WARNING("CRC disabled in %s feature data on '%s'.\n",
11099
 
+                                         location_name,
11100
 
+                                         node->name);
11101
 
+                       }
11102
 
+                       /* convert feature data from on-disk
11103
 
+                        * format (Little Endian) to native
11104
 
+                        * cpu endian format.
11105
 
+                        */
11106
 
+                       le_feature_data_to_cpu(cur_DLMD);
11107
 
+                       /* check for valid structure version */
11108
 
+                       rc = evms_cs_check_version(
11109
 
+                               &plugin_header.version,
11110
 
+                               &cur_DLMD->version);
11111
 
+                       if (rc) {
11112
 
+                               LOG_SERIOUS("error(%d) obsolete version(%d,%d,%d) detected in %s feature data on '%s'\n",
11113
 
+                                          rc, 
11114
 
+                                          cur_DLMD->version.major,
11115
 
+                                          cur_DLMD->version.minor,
11116
 
+                                          cur_DLMD->version.patchlevel,
11117
 
+                                          location_name,
11118
 
+                                          node->name);
11119
 
+                               rc_array[i] = rc;
11120
 
+                       }
11121
 
+               }
11122
 
+               /* getting same return code for both copies? */
11123
 
+               if (rc_array[0] == rc_array[1]) {
11124
 
+                       rc = rc_array[0];
11125
 
+                       /* if no errors on both copies,
11126
 
+                        * check the sequence numbers.
11127
 
+                        * use the highest sequence number.
11128
 
+                        */
11129
 
+                       if (!rc) {
11130
 
+                               /* compare sequence numbers */
11131
 
+                               if (DLMD1->sequence_number == DLMD2->sequence_number) {
11132
 
+                                       cur_DLMD = DLMD1;
11133
 
+                               } else {
11134
 
+                                       LOG_WARNING("sequence number mismatches between front(%Ld) and rear(%Ld) feature data copies on node(%s)!\n",
11135
 
+                                                  DLMD2->sequence_number,
11136
 
+                                                  DLMD1->sequence_number,
11137
 
+                                                  node->name);
11138
 
+                                       if (DLMD1->sequence_number > DLMD2->sequence_number)
11139
 
+                                               cur_DLMD = DLMD1;
11140
 
+                                       else
11141
 
+                                               cur_DLMD = DLMD2;
11142
 
+                                       LOG_WARNING("using %s feature data copy!\n",
11143
 
+                                                  (cur_DLMD == DLMD1) ? 
11144
 
+                                                   evms_primary_string : 
11145
 
+                                                   evms_secondary_string);
11146
 
+                               }
11147
 
+                       }
11148
 
+               /* getting different return codes for each copy */
11149
 
+               } else if (rc_array[0] == 0) {
11150
 
+                       /* use 1st (rear) copy if its good */
11151
 
+                       rc = 0;
11152
 
+                       cur_DLMD = DLMD1;
11153
 
+               } else if (rc_array[1] == 0) {
11154
 
+                       /* use 2nd (front) copy if its good */
11155
 
+                       rc = 0;
11156
 
+                       cur_DLMD = DLMD2;
11157
 
+               } else if ((rc_array[0] == -EINVAL) || 
11158
 
+                          (rc_array[1] == -EINVAL)) {
11159
 
+                       /* fail if either give a fatal error */
11160
 
+                       rc = -EINVAL;
11161
 
+                       cur_DLMD = NULL;
11162
 
+               }
11163
 
+
11164
 
+               /* deallocate metadata buffers appropriately */
11165
 
+               if (rc || (cur_DLMD == DLMD1))
11166
 
+                       evms_cs_deallocate_memory(DLMD2);
11167
 
+               if (rc || (cur_DLMD == DLMD2))
11168
 
+                       evms_cs_deallocate_memory(DLMD1);
11169
 
+
11170
 
+               /* save validated feature header pointer */
11171
 
+               if (!rc)
11172
 
+                       *DLMD = cur_DLMD;
11173
 
+       }
11174
 
+        return(rc);
11175
 
+}
11176
 
+
11177
 
+static int 
11178
 
+find_parent_node_for_child_node(
11179
 
+        evms_logical_node_t *child_node,
11180
 
+        evms_drivelink_metadata_t *DLMD,
11181
 
+        evms_logical_node_t **parent_node,
11182
 
+        evms_drivelink_runtime_data_t **drivelink_instance_data,
11183
 
+        evms_logical_node_t **discover_list)
11184
 
+{
11185
 
+        int rc = 0, parent_found = FALSE;
11186
 
+        evms_logical_node_t *parent = NULL;
11187
 
+        evms_drivelink_runtime_data_t *DLID = NULL;
11188
 
+
11189
 
+        /* find the parent node for this child */
11190
 
+        for (parent = *discover_list; parent; parent = parent->next) {
11191
 
+                /* only parent nodes will have null feature headers */
11192
 
+                if (!parent->feature_header) {
11193
 
+                        DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11194
 
+                        if (DLID->parent_serial_number == DLMD->parent_serial_number) {
11195
 
+                                parent_found = TRUE;
11196
 
+                                break;
11197
 
+                        }
11198
 
+                }
11199
 
+        }
11200
 
+        /* if no parent node found, create it */
11201
 
+        if (parent_found == FALSE) {
11202
 
+                rc = evms_cs_allocate_logical_node(&parent);
11203
 
+                if (!rc) {
11204
 
+                        /* transpose info from child to parent */
11205
 
+                        parent->flags |= child_node->flags;
11206
 
+                       strcpy(parent->name, child_node->feature_header->object_name);
11207
 
+                        /* copy evms system data to parent */
11208
 
+                        parent->volume_info = child_node->volume_info;
11209
 
+                        /* initialize the plugin id field */
11210
 
+                        parent->plugin = &plugin_header;
11211
 
+                        /* allocate parent's instance data */
11212
 
+                        rc = evms_cs_allocate_memory(
11213
 
+                                (void **)&parent->instance_data,
11214
 
+                                sizeof(*DLID));
11215
 
+                }
11216
 
+                if (!rc) {
11217
 
+                        /* initialize some instance data fields */
11218
 
+                        DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11219
 
+                        DLID->parent_serial_number = DLMD->parent_serial_number;
11220
 
+                        DLID->child_count = DLMD->child_count;
11221
 
+                        /* allocate the child table */
11222
 
+                        rc = evms_cs_allocate_memory(
11223
 
+                                (void **)&DLID->child_table,
11224
 
+                                sizeof(evms_drivelink_runtime_entry_t) *
11225
 
+                                DLID->child_count);
11226
 
+                }
11227
 
+                if (!rc) {
11228
 
+                        /* add the parent node to the discover list */
11229
 
+                        rc = evms_cs_add_logical_node_to_list(discover_list, parent);
11230
 
+                        MOD_INC_USE_COUNT;
11231
 
+                }
11232
 
+                /* if any errors encountered, try to clean up */
11233
 
+                if (rc) {
11234
 
+                        LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n",
11235
 
+                                   rc, child_node->name);
11236
 
+                        if (parent) {
11237
 
+                                DELETE(parent);
11238
 
+                                parent = NULL;
11239
 
+                                DLID = NULL;
11240
 
+                        }
11241
 
+                }
11242
 
+        }
11243
 
+
11244
 
+        *drivelink_instance_data = DLID;
11245
 
+        *parent_node = parent;
11246
 
+
11247
 
+        return(rc);
11248
 
+}
11249
 
+
11250
 
+static int 
11251
 
+compute_child_index(
11252
 
+       evms_logical_node_t *node, 
11253
 
+        evms_drivelink_metadata_t *DLMD)
11254
 
+{
11255
 
+        int i, position = -1;
11256
 
+
11257
 
+        for(i = 0; i < DLMD->child_count; i++) {
11258
 
+                if (DLMD->ordering_table[i].child_serial_number == 
11259
 
+                   DLMD->child_serial_number) {
11260
 
+                        position = i;
11261
 
+                        break;
11262
 
+                }
11263
 
+        }
11264
 
+        if (position == -1) {
11265
 
+                LOG_SERIOUS("%s: child not found from '%s'\n",
11266
 
+                           __FUNCTION__, node->name);
11267
 
+        }
11268
 
+        return(position);
11269
 
+}
11270
 
+
11271
 
+static int 
11272
 
+process_child_nodes(evms_logical_node_t **discover_list)
11273
 
+{
11274
 
+        int rc = 0, index = -1;
11275
 
+        evms_logical_node_t *node, *next_node, *parent;
11276
 
+        evms_drivelink_metadata_t *DLMD;
11277
 
+        evms_drivelink_runtime_data_t *DLID;
11278
 
+        evms_drivelink_runtime_entry_t *child_entry = NULL;
11279
 
+
11280
 
+       for (node = *discover_list; node; node = next_node) {
11281
 
+               next_node = node->next;
11282
 
+                if ( (!node->feature_header) || 
11283
 
+                     (node->feature_header->feature_id != plugin_header.id) ) {
11284
 
+                        continue;
11285
 
+                }
11286
 
+
11287
 
+               rc = evms_cs_remove_logical_node_from_list(discover_list, node);
11288
 
+               if (rc) BUG();
11289
 
+               /* we need to load the feature data to   */
11290
 
+               /* find the parent's serial number this  */
11291
 
+               /* child node belongs to.                */
11292
 
+               DLMD = NULL;
11293
 
+               rc = load_feature_data(node,&DLMD);
11294
 
+               if (!rc) {
11295
 
+                       /* find the parent node for this child */
11296
 
+                       parent = NULL;
11297
 
+                       rc = find_parent_node_for_child_node(
11298
 
+                               node, DLMD, &parent, &DLID, discover_list);
11299
 
+               }
11300
 
+               if (!rc) {
11301
 
+                       /* determine position of child in drive link object */
11302
 
+                       index = compute_child_index(node, DLMD);
11303
 
+                       if (index == -1)
11304
 
+                               rc = index;
11305
 
+               }
11306
 
+               if (!rc) {
11307
 
+                       /* check for multiple child index requests */
11308
 
+                       child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[index];
11309
 
+                       /* check to see if this child index is 
11310
 
+                        * already in use.
11311
 
+                        */
11312
 
+                       if (child_entry->child_node) {
11313
 
+                               LOG_SERIOUS("attempt to put '%s' in child index(%d). Already occupied by '%s'.\n",
11314
 
+                                           node->name, index, child_entry->child_node->name);
11315
 
+                               rc = -1;
11316
 
+                       }
11317
 
+               }
11318
 
+               if (!rc) {
11319
 
+                       /* fill in child info in parent */
11320
 
+
11321
 
+                       /* check the sector size for this node */
11322
 
+                       if (node->hardsector_size > parent->hardsector_size)
11323
 
+                               parent->hardsector_size = node->hardsector_size;
11324
 
+                       /* check the block size for this node */
11325
 
+                       if (node->block_size > parent->block_size)
11326
 
+                               parent->block_size = node->block_size;
11327
 
+                       /* set the child node */
11328
 
+                       child_entry->child_node = node;
11329
 
+                       /* set the metadata for this node */
11330
 
+                       child_entry->child_metadata = DLMD;
11331
 
+               }
11332
 
+
11333
 
+               /* on error, clean up accordingly */
11334
 
+                if (rc) {
11335
 
+                        if (DLMD)
11336
 
+                                evms_cs_deallocate_memory(DLMD);
11337
 
+                        LOG_SERIOUS("%s: rc(%d) from '%s'\n",
11338
 
+                                __FUNCTION__, rc, node->name);
11339
 
+                        LOG_SERIOUS("deleting child node '%s'.\n",
11340
 
+                                node->name);
11341
 
+                        rc = DELETE(node);
11342
 
+                       if (rc) {
11343
 
+                               LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
11344
 
+                                           rc, node->name);
11345
 
+                       }
11346
 
+                }
11347
 
+        }
11348
 
+
11349
 
+        /* errors are handled internal to this function */
11350
 
+        /* by deleting the failed node. This will get   */
11351
 
+        /* picked up by finalize_parent_nodes as a      */
11352
 
+        /* missing child node                           */
11353
 
+        return(0);
11354
 
+}
11355
 
+
11356
 
+#define TEST_CHILD_PRESENCE            0
11357
 
+#define TEST_CHILD_COUNT               1
11358
 
+#define TEST_CHILD_PARENTS_SERIAL_NUM  2
11359
 
+#define TEST_CHILD_POSITION            3
11360
 
+#define TEST_CHILD_METADATA            4
11361
 
+
11362
 
+static int 
11363
 
+test_parent_node(evms_logical_node_t *node)
11364
 
+{
11365
 
+        int i, rc = 0;
11366
 
+        evms_drivelink_runtime_data_t *DLID;
11367
 
+        evms_drivelink_runtime_entry_t *child_entry;
11368
 
+
11369
 
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11370
 
+        for(i = 0; i < DLID->child_count; i++) {
11371
 
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11372
 
+
11373
 
+               /* insure each child entry is filled */
11374
 
+                if (!child_entry->child_node) {
11375
 
+                       node->flags |= 
11376
 
+                               EVMS_VOLUME_SET_READ_ONLY |
11377
 
+                               EVMS_VOLUME_PARTIAL;
11378
 
+                        LOG_ERROR("%s: missing child(%d).\n",__FUNCTION__,i);
11379
 
+                } else 
11380
 
+                       /* insure child count is the same */
11381
 
+                        /* in each child's metadata       */
11382
 
+                        if (child_entry->child_metadata->child_count != 
11383
 
+                            DLID->child_count) {
11384
 
+                        rc = -EVMS_FEATURE_FATAL_ERROR;
11385
 
+                        LOG_ERROR("%s: child count wrong for node '%s'\n",
11386
 
+                                __FUNCTION__, node->name);
11387
 
+                } else 
11388
 
+                       /* insure parent serial number is    */
11389
 
+                        /* the same in each child's metadata */
11390
 
+                        if (child_entry->child_metadata->parent_serial_number != 
11391
 
+                            DLID->parent_serial_number) {
11392
 
+                        rc = -EVMS_FEATURE_FATAL_ERROR;
11393
 
+                        LOG_ERROR("%s: incorrect [is(%Ld), should be(%Ld)] child serial number for node '%s'\n",
11394
 
+                               __FUNCTION__,
11395
 
+                                child_entry->child_metadata->parent_serial_number,
11396
 
+                                DLID->parent_serial_number,
11397
 
+                                node->name);
11398
 
+                } else 
11399
 
+                       /* insure each is in the correct entry */
11400
 
+                        if (child_entry->child_metadata->ordering_table[i].child_serial_number !=
11401
 
+                            child_entry->child_metadata->child_serial_number) {
11402
 
+                        rc = -EVMS_FEATURE_FATAL_ERROR;
11403
 
+                        LOG_ERROR("%s: child reports different index for node '%s'\n",
11404
 
+                                __FUNCTION__, node->name);
11405
 
+                } else { 
11406
 
+                       evms_drivelink_runtime_entry_t *other_child_entry;
11407
 
+                       int j, rc2;
11408
 
+                       /* compare the children's metadata */
11409
 
+
11410
 
+                       /* look for another present child to 
11411
 
+                        * compare against.
11412
 
+                        */
11413
 
+                       other_child_entry = NULL;
11414
 
+                       for (j = 0; j < DLID->child_count; j++) {
11415
 
+                               /* skip comparing to ourselves */
11416
 
+                               if (j == i) {
11417
 
+                                       continue;
11418
 
+                               }
11419
 
+                               /* is this child is present? */
11420
 
+                               if (DLID->child_table[j].child_node) {
11421
 
+                                       /* yes, use it */
11422
 
+                                       other_child_entry = &DLID->child_table[j];
11423
 
+                                       break;
11424
 
+                               }
11425
 
+                       }
11426
 
+                       /* if we can't find another valid
11427
 
+                        * child node's metadata to compare
11428
 
+                        * against, just skip this test.
11429
 
+                        */
11430
 
+                       if (!other_child_entry) {
11431
 
+                               continue;
11432
 
+                       }
11433
 
+                        rc2 = memcmp(
11434
 
+                                other_child_entry->child_metadata->ordering_table,
11435
 
+                                child_entry->child_metadata->ordering_table,
11436
 
+                                sizeof(child_entry->child_metadata->ordering_table));
11437
 
+                        if (rc2) {
11438
 
+                                rc = -EVMS_FEATURE_FATAL_ERROR;
11439
 
+                                LOG_ERROR("%s: mismatching child metadata for nodes '%s' and '%s'\n",
11440
 
+                                           __FUNCTION__, DLID->child_table[i-1].child_node->name,
11441
 
+                                           child_entry->child_node->name);
11442
 
+                        }
11443
 
+                }
11444
 
+               /* stop if fatal error encountered */
11445
 
+               if (rc == -EVMS_FEATURE_FATAL_ERROR) {
11446
 
+                       break;
11447
 
+               }
11448
 
+        }
11449
 
+        return(rc);
11450
 
+}
11451
 
+
11452
 
+/*
11453
 
+ * function: perform_final_adjustments
11454
 
+ *
11455
 
+ * This function does the following:
11456
 
+ *           sets the vsize (in vsectors) field in each child node
11457
 
+ *           sets the voffset (in vsectors) field in each child node
11458
 
+ *           frees each child node's metadata
11459
 
+ *           sets the parent's total size field
11460
 
+ */
11461
 
+static void 
11462
 
+perform_final_adjustments(evms_logical_node_t *node)
11463
 
+{
11464
 
+        int i;
11465
 
+        evms_drivelink_runtime_data_t *DLID;
11466
 
+        evms_drivelink_runtime_entry_t *child_entry = NULL;
11467
 
+       evms_drivelink_metadata_t *ref_data = NULL;
11468
 
+
11469
 
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11470
 
+       /* find a valid copy of the ordering table.
11471
 
+        * since all the ordering tables are the same
11472
 
+        * we can just pick one to use for all the
11473
 
+        * child computations.
11474
 
+        */
11475
 
+        for(i = 0; i < DLID->child_count; i++) {
11476
 
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11477
 
+               if (child_entry->child_node) {
11478
 
+                       ref_data = child_entry->child_metadata;
11479
 
+                       break;
11480
 
+               }
11481
 
+       }
11482
 
+       /* if we got this far, there should
11483
 
+        * always be at least one valid child.
11484
 
+        */
11485
 
+       if (!ref_data) BUG();
11486
 
+       /* compute the parent's usable size,
11487
 
+        * and construct the table used to
11488
 
+        * remap parent I/Os to child I/Os */
11489
 
+        for(i = 0; i < DLID->child_count; i++) {
11490
 
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11491
 
+                /* set the LBA count for this child node */
11492
 
+                child_entry->vsize = ref_data->ordering_table[i].child_vsize;
11493
 
+               /* set the start LBA value for this child node */
11494
 
+                child_entry->voffset = node->total_vsectors;
11495
 
+                /* keep a running total of size in sectors */
11496
 
+                node->total_vsectors += child_entry->vsize;
11497
 
+                /* free the metadata for this child node */
11498
 
+               if (ref_data != child_entry->child_metadata) {
11499
 
+                       evms_cs_deallocate_memory(child_entry->child_metadata);
11500
 
+               }
11501
 
+               child_entry->child_metadata = NULL;
11502
 
+               /* free the feature header for this child node */
11503
 
+               if (child_entry->child_node) {
11504
 
+                       evms_cs_deallocate_memory(child_entry->child_node->feature_header);
11505
 
+                       child_entry->child_node->feature_header = NULL;
11506
 
+               }
11507
 
+        }
11508
 
+       /* free the reference data */
11509
 
+       evms_cs_deallocate_memory(ref_data);
11510
 
+}
11511
 
+
11512
 
+static int 
11513
 
+finalize_parent_nodes(evms_logical_node_t **discover_list)
11514
 
+{
11515
 
+        int rc = 0, rc2;
11516
 
+        evms_logical_node_t *node, *next_node;
11517
 
+
11518
 
+       for (node = *discover_list; node; node = next_node) {
11519
 
+               next_node = node->next;
11520
 
+                /* only check parent nodes */
11521
 
+                if (!node->feature_header) {
11522
 
+                       /* valid the children of this parent */
11523
 
+                        rc = test_parent_node(node);
11524
 
+                        if (!rc) {
11525
 
+                               /* compute parent size and
11526
 
+                                * child remap table.
11527
 
+                                */
11528
 
+                                perform_final_adjustments(node);
11529
 
+                        } else {
11530
 
+                               /* fatal error encountered. 
11531
 
+                                * cleanup from this node and
11532
 
+                                * delete it from memory.
11533
 
+                                */
11534
 
+                                evms_cs_remove_logical_node_from_list(discover_list, node);
11535
 
+                                rc2 = DELETE(node);
11536
 
+                               if (rc2) {
11537
 
+                                       LOG_SERIOUS("error(%d) attempting to delete '%s'.\n",
11538
 
+                                                   rc2, node->name);
11539
 
+                               }
11540
 
+                        }
11541
 
+                }
11542
 
+        }
11543
 
+        return(rc);
11544
 
+}
11545
 
+
11546
 
+/*
11547
 
+ * Function: discover drive linked storage objects
11548
 
+ *
11549
 
+ */
11550
 
+static int 
11551
 
+drivelink_discover(evms_logical_node_t **discover_list)
11552
 
+{
11553
 
+        int rc = 0;
11554
 
+
11555
 
+        rc = process_child_nodes(discover_list);
11556
 
+        if (!rc)
11557
 
+                rc = finalize_parent_nodes(discover_list);
11558
 
+
11559
 
+        return(rc);
11560
 
+}
11561
 
+
11562
 
+
11563
 
+/********************************************************/
11564
 
+/* Required Plugin Function Table Entry Point:          */
11565
 
+/*      Delete function                                 */
11566
 
+/********************************************************/
11567
 
+
11568
 
+/*
11569
 
+ * Function: drivelink_delete
11570
 
+ *
11571
 
+ */
11572
 
+static int 
11573
 
+drivelink_delete(evms_logical_node_t * node)
11574
 
+{
11575
 
+        int i, rc = 0;
11576
 
+        evms_drivelink_runtime_data_t *DLID;
11577
 
+        evms_drivelink_runtime_entry_t *child_entry;
11578
 
+
11579
 
+        LOG_DETAILS("deleting '%s'.\n", node->name);
11580
 
+
11581
 
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11582
 
+        if (DLID) {
11583
 
+                for (i = 0; i < DLID->child_count; i++) {
11584
 
+                        child_entry = &DLID->child_table[i];
11585
 
+                        /* delete the child node */
11586
 
+                        if (child_entry->child_node) {
11587
 
+                                rc = DELETE(child_entry->child_node);
11588
 
+                                if (rc) break;
11589
 
+                                child_entry->child_node = NULL;
11590
 
+                        }
11591
 
+                        /* delete the child's metadata */
11592
 
+                        if (child_entry->child_metadata) {
11593
 
+                                evms_cs_deallocate_memory(child_entry->child_metadata);
11594
 
+                                child_entry->child_metadata = NULL;
11595
 
+                        }
11596
 
+                }
11597
 
+                if (!rc) {
11598
 
+                        /* delete the child table */
11599
 
+                        if (DLID->child_table) {
11600
 
+                                evms_cs_deallocate_memory(DLID->child_table);
11601
 
+                                DLID->child_table = NULL;
11602
 
+                        }
11603
 
+                        /* delete the instance data */
11604
 
+                        evms_cs_deallocate_memory(DLID);
11605
 
+                        node->instance_data = NULL;
11606
 
+                }
11607
 
+        }
11608
 
+        if (!rc) {
11609
 
+                evms_cs_deallocate_logical_node(node);
11610
 
+                MOD_DEC_USE_COUNT;
11611
 
+        }
11612
 
+
11613
 
+        return(rc);
11614
 
+}
11615
 
+
11616
 
+/********************************************************/
11617
 
+/* Required Plugin Function Table Entry Point:          */
11618
 
+/*      Read function & Support routines                */
11619
 
+/********************************************************/
11620
 
+
11621
 
+/*
11622
 
+ * function: which_child
11623
 
+ *
11624
 
+ * This function find the child node a parent rsector maps to.
11625
 
+ * It then adjusts the rsector value to be child relative and
11626
 
+ * optionally computes the max # of sectors that can be access
11627
 
+ * from this starting point on the child. The child node, the 
11628
 
+ * child relative rsector and max io size are returned to the 
11629
 
+ * caller.
11630
 
+ *
11631
 
+ */
11632
 
+static evms_logical_node_t *
11633
 
+which_child(
11634
 
+       evms_logical_node_t *parent,
11635
 
+        evms_sector_t *rsector,
11636
 
+        evms_sector_t *max_io_sects)
11637
 
+{
11638
 
+        int i;
11639
 
+        evms_logical_node_t *child = NULL;
11640
 
+        evms_drivelink_runtime_data_t *DLID;
11641
 
+        evms_drivelink_runtime_entry_t *child_entry = NULL;
11642
 
+
11643
 
+        DLID = (evms_drivelink_runtime_data_t *)parent->instance_data;
11644
 
+        for (i = 0; i < DLID->child_count; i++) {
11645
 
+                child_entry = (evms_drivelink_runtime_entry_t *)&DLID->child_table[i];
11646
 
+
11647
 
+                if (*rsector >= child_entry->vsize) {
11648
 
+                        *rsector -= child_entry->vsize;
11649
 
+                } else {
11650
 
+                        /* get the child node */
11651
 
+                        child = child_entry->child_node;
11652
 
+                        /* compute the sector count if requested */
11653
 
+                        if (max_io_sects)
11654
 
+                               /* this is only used for INIT I/O
11655
 
+                                * to return the largest sector
11656
 
+                                * count size for this child based
11657
 
+                                * on first sector in the I/O.
11658
 
+                                */
11659
 
+                                *max_io_sects = 
11660
 
+                                       child_entry->vsize - *rsector;
11661
 
+                        break;
11662
 
+                }
11663
 
+        }
11664
 
+        return(child);
11665
 
+}
11666
 
+
11667
 
+/* 
11668
 
+ * function: drivelink_io_error
11669
 
+ * 
11670
 
+ * this function was primarily created because the function
11671
 
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
11672
 
+ * to be set on inline functions. Since this was an error path
11673
 
+ * and not mainline, I decided to add a trace statement to help
11674
 
+ * report on the failing condition.
11675
 
+ *
11676
 
+ */
11677
 
+static void 
11678
 
+drivelink_io_error(
11679
 
+       evms_logical_node_t *node,
11680
 
+       int io_flag, 
11681
 
+       eio_t *eio)
11682
 
+{
11683
 
+        LOG_SERIOUS("sector remap error %sING on (%s), rsector(%Ld).\n",
11684
 
+                (io_flag) ? "WRIT" : "READ", 
11685
 
+               node->name,
11686
 
+               eio->rsector);
11687
 
+
11688
 
+        EVMS_IO_ERROR(eio);
11689
 
+}
11690
 
+
11691
 
+/*
11692
 
+ * Function: drivelink_read
11693
 
+ */
11694
 
+static void 
11695
 
+drivelink_read(evms_logical_node_t *node, eio_t *eio)
11696
 
+{
11697
 
+        evms_logical_node_t *child;
11698
 
+
11699
 
+       child = which_child(node, &eio->rsector, NULL);
11700
 
+       if (child) {
11701
 
+               R_IO(child, eio);
11702
 
+       } else {
11703
 
+               drivelink_io_error(node, READ, eio);
11704
 
+       }
11705
 
+}
11706
 
+
11707
 
+/********************************************************/
11708
 
+/* Required Plugin Function Table Entry Point:          */
11709
 
+/*      Read function & Support routines                */
11710
 
+/********************************************************/
11711
 
+
11712
 
+/*
11713
 
+ * Function: drivelink_write
11714
 
+ *
11715
 
+ */
11716
 
+static void 
11717
 
+drivelink_write(evms_logical_node_t *node, eio_t *eio)
11718
 
+{
11719
 
+        evms_logical_node_t *child;
11720
 
+
11721
 
+       child = which_child(node, &eio->rsector, NULL);
11722
 
+       if (child) {
11723
 
+               W_IO(child, eio);
11724
 
+       } else {
11725
 
+               drivelink_io_error(node, WRITE, eio);
11726
 
+       }
11727
 
+}
11728
 
+
11729
 
+/********************************************************/
11730
 
+/* Required Plugin Function Table Entry Point:          */
11731
 
+/*      Init I/O function                               */
11732
 
+/********************************************************/
11733
 
+
11734
 
+/*
11735
 
+ * function: init_io
11736
 
+ *
11737
 
+ * This function must determine which child or children a
11738
 
+ * specified I/O request must be passed to. Also if, when,
11739
 
+ * and how a request must be broken up. 
11740
 
+ *
11741
 
+ */
11742
 
+static int 
11743
 
+drivelink_init_io(
11744
 
+       evms_logical_node_t     * node,
11745
 
+       int                     io_flag,        /* 0=read, 1=write*/
11746
 
+        evms_sector_t           sect_nr,        /* disk LBA */
11747
 
+        evms_sector_t           num_sects,      /* # of sectors */
11748
 
+        void                    * buf_addr )    /* buffer address */
11749
 
+{
11750
 
+        int rc = 0;
11751
 
+
11752
 
+        if (!node)
11753
 
+                rc = -EINVAL;
11754
 
+        else {
11755
 
+               evms_sector_t starting_sector, remaining_sectors;
11756
 
+               void *io_buf;
11757
 
+               evms_drivelink_runtime_data_t *DLID;
11758
 
+
11759
 
+               if ( (sect_nr + num_sects) > node->total_vsectors) {
11760
 
+                       LOG_SERIOUS("attempted out of bound(%Ld) %s on '%s' at sector(%Ld), count(%Ld).\n",
11761
 
+                               node->total_vsectors,
11762
 
+                               (io_flag) ? "WRITE" : "READ",
11763
 
+                               node->name,
11764
 
+                               sect_nr, num_sects);
11765
 
+                       rc = -EINVAL;
11766
 
+               } else {
11767
 
+                       DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11768
 
+                       /* make working copies of input parameters */
11769
 
+                       starting_sector = sect_nr;
11770
 
+                       remaining_sectors = num_sects;
11771
 
+                       io_buf = buf_addr;
11772
 
+                       /* loop until all I/O is performed */
11773
 
+                       while(remaining_sectors) {
11774
 
+                               evms_sector_t io_start, io_size;
11775
 
+                               evms_logical_node_t *child;
11776
 
+
11777
 
+                               /* compute the child relative io_start
11778
 
+                                * and max io_size.
11779
 
+                                */
11780
 
+                               io_start = starting_sector;
11781
 
+                               child = which_child(node, &io_start, &io_size);
11782
 
+                               /* adjust io_size based on
11783
 
+                                * original remaining sectors
11784
 
+                                * in this io.
11785
 
+                                */
11786
 
+                               if (io_size > remaining_sectors)
11787
 
+                                       io_size = remaining_sectors;
11788
 
+                               if (child) {
11789
 
+                                       rc = INIT_IO(child, 
11790
 
+                                                    io_flag, 
11791
 
+                                                    io_start,
11792
 
+                                                    io_size, 
11793
 
+                                                    io_buf);
11794
 
+                               } else {
11795
 
+                                       /* if partial volume, return 0's
11796
 
+                                        * for missing children.
11797
 
+                                        */
11798
 
+                                       if (io_flag == READ) {
11799
 
+                                               memset(io_buf, 0, io_size << EVMS_VSECTOR_SIZE_SHIFT);
11800
 
+                                       }
11801
 
+                               }
11802
 
+                               if (!rc) {
11803
 
+                                       /* adjust working copies */
11804
 
+                                       starting_sector += io_size;
11805
 
+                                       remaining_sectors -= io_size;
11806
 
+                                       io_buf += io_size <<
11807
 
+                                               EVMS_VSECTOR_SIZE_SHIFT;
11808
 
+                               } else
11809
 
+                                       break;
11810
 
+                       }
11811
 
+               }
11812
 
+        }
11813
 
+
11814
 
+        return(rc);
11815
 
+}
11816
 
+
11817
 
+/********************************************************/
11818
 
+/* Required Plugin Function Table Entry Point:          */
11819
 
+/*      IOCTL function & Support routines               */
11820
 
+/********************************************************/
11821
 
+
11822
 
+static int 
11823
 
+drivelink_ioctl_cmd_plugin_ioctl(
11824
 
+        evms_logical_node_t *node, 
11825
 
+        struct inode *inode, struct file *file,
11826
 
+        unsigned long cmd, unsigned long arg)
11827
 
+{
11828
 
+        int i, rc = 0;
11829
 
+        evms_drivelink_runtime_data_t *DLID;
11830
 
+        evms_plugin_ioctl_t tmp, *user_parms;
11831
 
+
11832
 
+        user_parms = (evms_plugin_ioctl_t *)arg;
11833
 
+        /* copy user's parameters to kernel space */
11834
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
11835
 
+                rc = -EFAULT;
11836
 
+
11837
 
+        if (!rc) {
11838
 
+                DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11839
 
+                /* is this cmd targetted at this feature ? */
11840
 
+                if (tmp.feature_id == node->plugin->id) {
11841
 
+                        switch(tmp.feature_command) {
11842
 
+                                default:
11843
 
+                                        break;
11844
 
+                        }
11845
 
+                } else { /* broadcast this cmd to all children */
11846
 
+                        for (i = 0; i < DLID->child_count; i++) {
11847
 
+                                rc = IOCTL(DLID->child_table[i].child_node,
11848
 
+                                      inode, file, cmd, arg);
11849
 
+                                if (rc) break;
11850
 
+                        }
11851
 
+                }
11852
 
+                /* copy info to userspace */
11853
 
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
11854
 
+                        rc = -EFAULT;
11855
 
+        }
11856
 
+        return(rc);
11857
 
+}
11858
 
+
11859
 
+static int 
11860
 
+drivelink_ioctl_cmd_broadcast(
11861
 
+        evms_logical_node_t *node,
11862
 
+        struct inode *inode, struct file *file,
11863
 
+        unsigned long cmd, unsigned long arg)
11864
 
+{
11865
 
+        int i, rc = 0;
11866
 
+        evms_drivelink_runtime_data_t *DLID;
11867
 
+
11868
 
+        DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11869
 
+        /* broadcast this cmd to all children */
11870
 
+        for (i = 0; i < DLID->child_count; i++)        {
11871
 
+               evms_logical_node_t *child_node;
11872
 
+
11873
 
+               child_node = DLID->child_table[i].child_node;
11874
 
+               if (child_node) {
11875
 
+                       rc |= IOCTL(child_node, inode, file, cmd, arg);
11876
 
+               }
11877
 
+       }
11878
 
+        return(rc);
11879
 
+}
11880
 
+
11881
 
+/*
11882
 
+ * Function: drivelink_ioctl
11883
 
+ *
11884
 
+ */
11885
 
+static int 
11886
 
+drivelink_ioctl(
11887
 
+       evms_logical_node_t     * node,
11888
 
+        struct inode            * inode,
11889
 
+        struct file             * file,
11890
 
+        unsigned int            cmd,
11891
 
+        unsigned long           arg)
11892
 
+{
11893
 
+        int rc = 0;
11894
 
+        evms_drivelink_runtime_data_t *DLID = NULL;
11895
 
+        struct hd_geometry hdgeo;
11896
 
+        
11897
 
+        if ( (!node) || (!inode) )
11898
 
+                rc = -EINVAL;
11899
 
+
11900
 
+        if (!rc) {
11901
 
+                DLID = (evms_drivelink_runtime_data_t *)node->instance_data;
11902
 
+                switch (cmd) {
11903
 
+                        case HDIO_GETGEO:
11904
 
+                                hdgeo.heads = 255;
11905
 
+                                hdgeo.sectors = 63;
11906
 
+                                hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
11907
 
+                                        hdgeo.heads / hdgeo.sectors;
11908
 
+                                hdgeo.start = 0;
11909
 
+                                if (copy_to_user((int *)arg, 
11910
 
+                                                 &hdgeo, 
11911
 
+                                                 sizeof(hdgeo)))
11912
 
+                                        rc = -EFAULT;
11913
 
+                                break;
11914
 
+                       case EVMS_QUIESCE_VOLUME:
11915
 
+                       case EVMS_GET_DISK_LIST:
11916
 
+                       case EVMS_CHECK_MEDIA_CHANGE:
11917
 
+                       case EVMS_REVALIDATE_DISK:
11918
 
+                       case EVMS_OPEN_VOLUME:
11919
 
+                       case EVMS_CLOSE_VOLUME:
11920
 
+                                rc = drivelink_ioctl_cmd_broadcast(
11921
 
+                                        node, inode, file, cmd, arg);
11922
 
+                                break;
11923
 
+                        case EVMS_PLUGIN_IOCTL:
11924
 
+                                rc = drivelink_ioctl_cmd_plugin_ioctl(
11925
 
+                                        node, inode, file, cmd, arg);
11926
 
+                                break;
11927
 
+                       case EVMS_GET_BMAP:
11928
 
+                               {
11929
 
+                                       evms_get_bmap_t *bmap;
11930
 
+                                       evms_sector_t io_start, io_size;
11931
 
+                                       evms_logical_node_t *child;
11932
 
+
11933
 
+                                       bmap = (evms_get_bmap_t *)arg;
11934
 
+                                       io_start = bmap->rsector;
11935
 
+                                       child = which_child(node, &io_start, &io_size);
11936
 
+                                       if (child) {
11937
 
+                                               if (node->block_size != 
11938
 
+                                                   child->block_size) {
11939
 
+                                                       bmap->status = -EPERM;
11940
 
+                                               } else {
11941
 
+                                                       bmap->rsector = io_start;
11942
 
+                                                       rc = IOCTL(child,
11943
 
+                                                                   inode,
11944
 
+                                                                   file,
11945
 
+                                                                   cmd,
11946
 
+                                                                   arg);
11947
 
+                                               }
11948
 
+                                       }
11949
 
+                               }
11950
 
+                               break;
11951
 
+                        default:
11952
 
+                                rc = -EINVAL;
11953
 
+                                break;
11954
 
+                }
11955
 
+        }
11956
 
+        return(rc);
11957
 
+}
11958
 
+
11959
 
+
11960
 
+/********************************************************/
11961
 
+/* Required Module Entry Point:                         */
11962
 
+/*      drivelink_init                                  */
11963
 
+/********************************************************/
11964
 
+
11965
 
+/*
11966
 
+ * Function: drivelink_init
11967
 
+ *
11968
 
+ */
11969
 
+int __init 
11970
 
+drivelink_init(void)
11971
 
+{
11972
 
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
11973
 
+}
11974
 
+
11975
 
+void __exit
11976
 
+drivelink_exit(void)
11977
 
+{
11978
 
+        evms_cs_unregister_plugin(&plugin_header);
11979
 
+}
11980
 
+
11981
 
+module_init(drivelink_init);
11982
 
+module_exit(drivelink_exit);
11983
 
+#ifdef MODULE_LICENSE
11984
 
+MODULE_LICENSE("GPL");
11985
 
+#endif
11986
 
+
11987
 
diff -Naur linux-2002-03-28/drivers/evms/evms_ecr.c evms-2002-03-28/drivers/evms/evms_ecr.c
11988
 
--- linux-2002-03-28/drivers/evms/evms_ecr.c    Wed Dec 31 18:00:00 1969
11989
 
+++ evms-2002-03-28/drivers/evms/evms_ecr.c     Wed Mar  6 16:01:37 2002
11990
 
@@ -0,0 +1,212 @@
11991
 
+/* -*- linux-c -*- */
11992
 
+/*
11993
 
+ *
11994
 
+ *   Copyright (c) International Business Machines  Corp., 2000
11995
 
+ *
11996
 
+ *   This program is free software;  you can redistribute it and/or modify
11997
 
+ *   it under the terms of the GNU General Public License as published by
11998
 
+ *   the Free Software Foundation; either version 2 of the License, or
11999
 
+ *   (at your option) any later version.
12000
 
+ *
12001
 
+ *   This program is distributed in the hope that it will be useful,
12002
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
12003
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12004
 
+ *   the GNU General Public License for more details.
12005
 
+ *
12006
 
+ *   You should have received a copy of the GNU General Public License
12007
 
+ *   along with this program;  if not, write to the Free Software
12008
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12009
 
+ */
12010
 
+
12011
 
+/* linux/driver/evms/evms_ecr.c
12012
 
+ *
12013
 
+ * EVMS - Cluster enablement (ECR) module
12014
 
+ *
12015
 
+ */
12016
 
+
12017
 
+
12018
 
+#include <linux/kernel.h> 
12019
 
+#include <linux/module.h>
12020
 
+#include <linux/init.h>
12021
 
+#include <linux/types.h>
12022
 
+#include <linux/evms/evms_ecr.h>
12023
 
+
12024
 
+#define LOG_PREFIX "ecr: "
12025
 
+
12026
 
+
12027
 
+/*
12028
 
+ *  ecr_group_join
12029
 
+ */
12030
 
+ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table, 
12031
 
+                  ecr_cred_t * cred, size_t size, ecr_instance_t *instance)
12032
 
+{
12033
 
+       /* dummy */
12034
 
+       return ECR_FAIL;
12035
 
+}
12036
 
+
12037
 
+
12038
 
+
12039
 
+
12040
 
+/*
12041
 
+ *  ecr_group_leave
12042
 
+ */
12043
 
+void  ecr_group_leave(ecr_group_t group)
12044
 
+{
12045
 
+       /* dummy */
12046
 
+       return;
12047
 
+}
12048
 
+
12049
 
+
12050
 
+
12051
 
+/*
12052
 
+ * ecr_group_send
12053
 
+ */
12054
 
+int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message,
12055
 
+               size_t size, ecr_instance_t *instance, 
12056
 
+               void callback(int ret, ecr_instance_t *instance))
12057
 
+{
12058
 
+       /* dummy */
12059
 
+       return ECR_FAIL;
12060
 
+}
12061
 
+
12062
 
+
12063
 
+
12064
 
+/*
12065
 
+ * ecr_group_send_wait
12066
 
+ */
12067
 
+int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message,
12068
 
+               size_t size, int *ret)
12069
 
+{
12070
 
+       /* dummy */
12071
 
+       *ret = ECR_FAIL;
12072
 
+       return ECR_FAIL;
12073
 
+}
12074
 
+
12075
 
+
12076
 
+
12077
 
+/*
12078
 
+ * ecr_group_broadcast
12079
 
+ */
12080
 
+int ecr_group_broadcast(ecr_group_t group, void *message, size_t size,
12081
 
+                       ecr_instance_t *instance,
12082
 
+                       void callback(u_char ret, ecr_instance_t *instance))
12083
 
+{
12084
 
+       /* dummy */
12085
 
+       return ECR_FAIL;
12086
 
+}
12087
 
+
12088
 
+
12089
 
+
12090
 
+/*
12091
 
+ * ecr_group_broadcast_wait
12092
 
+ */
12093
 
+int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size,
12094
 
+                       u_char *ret)
12095
 
+{
12096
 
+       /* dummy */
12097
 
+       *ret = ECR_FAIL;
12098
 
+       return ECR_FAIL;
12099
 
+}
12100
 
+
12101
 
+
12102
 
+
12103
 
+/*
12104
 
+ * ecr_group_atomic_execute
12105
 
+ */
12106
 
+int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size,
12107
 
+                       ecr_instance_t *instance,
12108
 
+                       void callback(ecr_instance_t *instance))
12109
 
+{
12110
 
+       /* dummy */
12111
 
+       return ECR_FAIL;
12112
 
+}
12113
 
+
12114
 
+
12115
 
+
12116
 
+/*
12117
 
+ * ecr_group_atomic_execute_wait
12118
 
+ */
12119
 
+int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size)
12120
 
+{
12121
 
+       /* dummy */
12122
 
+       return ECR_FAIL;
12123
 
+}
12124
 
+
12125
 
+
12126
 
+
12127
 
+/*
12128
 
+ * ecr_group_success_response
12129
 
+ */
12130
 
+void ecr_group_success_response(ecr_message_t *handle)
12131
 
+{
12132
 
+       /* dummy */
12133
 
+       return;
12134
 
+}
12135
 
+
12136
 
+
12137
 
+
12138
 
+
12139
 
+/*
12140
 
+ * ecr_group_failure_response
12141
 
+ */
12142
 
+void ecr_group_failure_response(ecr_message_t *handle, int ret)
12143
 
+{
12144
 
+       /* dummy */
12145
 
+       return;
12146
 
+}
12147
 
+                       
12148
 
+
12149
 
+
12150
 
+/*
12151
 
+ * ecr_lock_create
12152
 
+ */
12153
 
+ecr_lock_t ecr_lock_create(char *lockname)
12154
 
+{
12155
 
+       /* dummy */
12156
 
+       return ECR_FAIL;
12157
 
+}
12158
 
+
12159
 
+/*
12160
 
+ * ecr_lock
12161
 
+ */
12162
 
+int  ecr_lock(ecr_lock_t lock, u_int64_t start, u_int64_t length, 
12163
 
+               ecr_lock_mode_t mode, u_char flag)
12164
 
+{
12165
 
+       /* dummy */
12166
 
+       return ECR_FAIL;
12167
 
+}
12168
 
+
12169
 
+
12170
 
+
12171
 
+/*
12172
 
+ * ecr_unlock
12173
 
+ */
12174
 
+int ecr_unlock(ecr_lock_t lock, u_int64_t start, u_int64_t length)
12175
 
+{
12176
 
+       /* dummy */
12177
 
+       return ECR_FAIL;
12178
 
+}
12179
 
+               
12180
 
+
12181
 
+/********************************************************/
12182
 
+/* Required Module Entry Point:                         */
12183
 
+/*      ecr_init()                                        */
12184
 
+/********************************************************/
12185
 
+
12186
 
+static int __init ecr_init(void)
12187
 
+{
12188
 
+        /* dummy */
12189
 
+       return 0;
12190
 
+}
12191
 
+
12192
 
+static void __exit ecr_exit(void)
12193
 
+{
12194
 
+       return;
12195
 
+}
12196
 
+
12197
 
+module_init(ecr_init);
12198
 
+module_exit(ecr_exit);
12199
 
+#ifdef MODULE_LICENSE
12200
 
+MODULE_LICENSE("GPL");
12201
 
+#endif
12202
 
+
12203
 
diff -Naur linux-2002-03-28/drivers/evms/evms_passthru.c evms-2002-03-28/drivers/evms/evms_passthru.c
12204
 
--- linux-2002-03-28/drivers/evms/evms_passthru.c       Wed Dec 31 18:00:00 1969
12205
 
+++ evms-2002-03-28/drivers/evms/evms_passthru.c        Mon Mar 18 17:39:22 2002
12206
 
@@ -0,0 +1,317 @@
12207
 
+/* -*- linux-c -*- */
12208
 
+
12209
 
+/*
12210
 
+ *
12211
 
+ *
12212
 
+ *   Copyright (c) International Business Machines  Corp., 2000
12213
 
+ *
12214
 
+ *   This program is free software;  you can redistribute it and/or modify
12215
 
+ *   it under the terms of the GNU General Public License as published by
12216
 
+ *   the Free Software Foundation; either version 2 of the License, or
12217
 
+ *   (at your option) any later version.
12218
 
+ *
12219
 
+ *   This program is distributed in the hope that it will be useful,
12220
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
12221
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12222
 
+ *   the GNU General Public License for more details.
12223
 
+ *
12224
 
+ *   You should have received a copy of the GNU General Public License
12225
 
+ *   along with this program;  if not, write to the Free Software
12226
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12227
 
+ *
12228
 
+ *
12229
 
+ */
12230
 
+/*
12231
 
+ * linux/drivers/evms/evms_passthru.c
12232
 
+ *
12233
 
+ * EVMS System Data Manager
12234
 
+ *
12235
 
+ *
12236
 
+ */
12237
 
+
12238
 
+#include <linux/module.h>
12239
 
+#include <linux/kernel.h>
12240
 
+#include <linux/config.h>
12241
 
+#include <linux/genhd.h>
12242
 
+#include <linux/major.h>
12243
 
+#include <linux/string.h>
12244
 
+#include <linux/blk.h>
12245
 
+#include <linux/init.h>
12246
 
+#include <linux/slab.h>
12247
 
+#include <linux/evms/evms_kernel.h>
12248
 
+#include <asm/system.h>
12249
 
+
12250
 
+#define EVMS_PASSTHRU_ID     0
12251
 
+#define LOG_PREFIX "passthru: "
12252
 
+
12253
 
+static int  passthru_mgr_discover(evms_logical_node_t **);
12254
 
+static int  passthru_mgr_delete(evms_logical_node_t *);
12255
 
+static void passthru_mgr_read(evms_logical_node_t *, 
12256
 
+                                eio_t *);
12257
 
+static void passthru_mgr_write(evms_logical_node_t *, 
12258
 
+                                 eio_t *);
12259
 
+static int  passthru_mgr_ioctl(evms_logical_node_t *, 
12260
 
+                                 struct inode *, 
12261
 
+                                 struct file *, 
12262
 
+                                 unsigned int, 
12263
 
+                                 unsigned long);
12264
 
+static int  passthru_mgr_init_io(evms_logical_node_t *, 
12265
 
+                                   int, 
12266
 
+                                   evms_sector_t,
12267
 
+                                   evms_sector_t,
12268
 
+                                   void *);
12269
 
+
12270
 
+static evms_plugin_function_table_t function_table = {
12271
 
+        discover: &passthru_mgr_discover,
12272
 
+        delete  : &passthru_mgr_delete,
12273
 
+        read    : &passthru_mgr_read,
12274
 
+        write   : &passthru_mgr_write,
12275
 
+        init_io : &passthru_mgr_init_io,
12276
 
+        ioctl   : &passthru_mgr_ioctl
12277
 
+};
12278
 
+
12279
 
+static evms_plugin_header_t plugin_header = {
12280
 
+        id              : SetPluginID(
12281
 
+                IBM_OEM_ID,
12282
 
+                EVMS_FEATURE,
12283
 
+                EVMS_PASSTHRU_ID),
12284
 
+        version         : { 
12285
 
+                major      : 1,
12286
 
+                minor      : 0,
12287
 
+                patchlevel : 0
12288
 
+        },
12289
 
+        required_common_services_version : { 
12290
 
+                major      : 0,
12291
 
+                minor      : 5,
12292
 
+                patchlevel : 0
12293
 
+        },
12294
 
+        function_table   : &function_table               // function table for this plugin
12295
 
+};
12296
 
+
12297
 
+/*******************************/
12298
 
+/* discovery support functions */
12299
 
+/*******************************/
12300
 
+
12301
 
+static int
12302
 
+process_passthru_data(evms_logical_node_t **pp)
12303
 
+{
12304
 
+        int rc, size_in_sectors;
12305
 
+        evms_logical_node_t *node, *new_node;
12306
 
+
12307
 
+        node = *pp;
12308
 
+
12309
 
+       size_in_sectors = evms_cs_size_in_vsectors(
12310
 
+               sizeof(evms_feature_header_t));
12311
 
+
12312
 
+       /* allocate "parent" node */
12313
 
+       rc = evms_cs_allocate_logical_node(&new_node);
12314
 
+       if (!rc) {
12315
 
+               /* initialize "parent" node */
12316
 
+               new_node->instance_data = node;
12317
 
+               new_node->flags = node->flags;
12318
 
+               new_node->plugin = &plugin_header;
12319
 
+               new_node->system_id = node->system_id;
12320
 
+               new_node->block_size = node->block_size;
12321
 
+               new_node->hardsector_size = node->hardsector_size;
12322
 
+               new_node->total_vsectors = node->total_vsectors;
12323
 
+               new_node->total_vsectors -= 
12324
 
+                       (size_in_sectors << 1) + 
12325
 
+                       node->feature_header->alignment_padding;
12326
 
+               new_node->volume_info = node->volume_info;
12327
 
+               strcpy(new_node->name, node->name);
12328
 
+               if (strlen(node->feature_header->object_name))
12329
 
+                       strcat(new_node->name, node->feature_header->object_name);
12330
 
+               else
12331
 
+                       strcat(new_node->name, "_Passthru");
12332
 
+
12333
 
+               /* return "parent" node to caller */
12334
 
+               *pp = new_node;
12335
 
+
12336
 
+               MOD_INC_USE_COUNT;
12337
 
+
12338
 
+               LOG_DETAILS("feature header found on '%s', created '%s'.\n",
12339
 
+                       node->name, new_node->name);
12340
 
+               /* we're done with the passthru feature headers
12341
 
+                * so lets delete them now.
12342
 
+                */
12343
 
+               evms_cs_deallocate_memory(node->feature_header);
12344
 
+               node->feature_header = NULL;
12345
 
+       } else {
12346
 
+               /* on any fatal error, delete the node */
12347
 
+               int rc2 = DELETE(node);
12348
 
+               if (rc2) {
12349
 
+                       LOG_DEFAULT("error(%d) attempting to delete node(%p,%s).\n",
12350
 
+                               rc2, node, node->name);
12351
 
+               }
12352
 
+       }
12353
 
+        return(rc);
12354
 
+}
12355
 
+
12356
 
+/********** Required Plugin Functions **********/
12357
 
+
12358
 
+
12359
 
+/*
12360
 
+ * Function: passthru_mgr_discover
12361
 
+ *
12362
 
+ */
12363
 
+static int 
12364
 
+passthru_mgr_discover(evms_logical_node_t **discover_list)
12365
 
+{
12366
 
+        int rc = 0;
12367
 
+        evms_logical_node_t *node, *tmp_list_head;
12368
 
+
12369
 
+        tmp_list_head = *discover_list;
12370
 
+        *discover_list = NULL;
12371
 
+
12372
 
+        while(tmp_list_head) {
12373
 
+                node = tmp_list_head;
12374
 
+                rc = evms_cs_remove_logical_node_from_list(&tmp_list_head, node);
12375
 
+               if (!rc)
12376
 
+                       rc = process_passthru_data(&node);
12377
 
+               if (!rc)
12378
 
+                       if (node)
12379
 
+                               rc = evms_cs_add_logical_node_to_list(discover_list, node);
12380
 
+        }
12381
 
+        return(rc);
12382
 
+}
12383
 
+                                                
12384
 
+/*
12385
 
+ * Function: passthru_mgr_delete
12386
 
+ *
12387
 
+ */                                     
12388
 
+static int 
12389
 
+passthru_mgr_delete(evms_logical_node_t * node)
12390
 
+{
12391
 
+        int rc;
12392
 
+        evms_logical_node_t *p;
12393
 
+
12394
 
+       LOG_DETAILS("deleting '%s'.\n", node->name);
12395
 
+
12396
 
+        p = node->instance_data;
12397
 
+       rc = DELETE(p);
12398
 
+        if (!rc) {
12399
 
+                evms_cs_deallocate_logical_node(node);
12400
 
+                MOD_DEC_USE_COUNT;
12401
 
+        }
12402
 
+        return(rc);
12403
 
+}
12404
 
+
12405
 
+/* 
12406
 
+ * function: passthru_io_error
12407
 
+ * 
12408
 
+ * this function was primarily created because the function
12409
 
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
12410
 
+ * to be set on inline functions. Since this was an error path
12411
 
+ * and not mainline, I decided to add a trace statement to help
12412
 
+ * report on the failing condition.
12413
 
+ *
12414
 
+ */
12415
 
+static void 
12416
 
+passthru_io_error(
12417
 
+       evms_logical_node_t    *node,
12418
 
+       int                     io_flag, 
12419
 
+       eio_t                  *eio)
12420
 
+{
12421
 
+       LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
12422
 
+                (io_flag) ? "WRITE" : "READ", 
12423
 
+               node->total_vsectors - 1,
12424
 
+               node->name,
12425
 
+               eio->rsector);
12426
 
+
12427
 
+        EVMS_IO_ERROR(eio);
12428
 
+}
12429
 
+
12430
 
+/*
12431
 
+ * Function: passthru_mgr_read
12432
 
+ */
12433
 
+static void 
12434
 
+passthru_mgr_read(
12435
 
+       evms_logical_node_t *node,
12436
 
+       eio_t               *eio)
12437
 
+{
12438
 
+       if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
12439
 
+               R_IO(((evms_logical_node_t*)(node->instance_data)),
12440
 
+                    eio);
12441
 
+       } else
12442
 
+                passthru_io_error(node, READ, eio);
12443
 
+}
12444
 
+
12445
 
+/*
12446
 
+ * Function: passthru_mgr_write
12447
 
+ *
12448
 
+ */
12449
 
+static void 
12450
 
+passthru_mgr_write(
12451
 
+       evms_logical_node_t *node,
12452
 
+       eio_t               *eio)
12453
 
+{
12454
 
+       if ((eio->rsector + eio->rsize) <= node->total_vsectors) {
12455
 
+               W_IO(((evms_logical_node_t*)(node->instance_data)),
12456
 
+                    eio);
12457
 
+       } else
12458
 
+                passthru_io_error(node, WRITE, eio);
12459
 
+}
12460
 
+
12461
 
+/*
12462
 
+ * Function: passthru_mgr_ioctl
12463
 
+ *
12464
 
+ */
12465
 
+static int 
12466
 
+passthru_mgr_ioctl(       
12467
 
+       evms_logical_node_t   * node,
12468
 
+        struct inode          * inode,
12469
 
+        struct file           * file,
12470
 
+        unsigned int            cmd,
12471
 
+        unsigned long           arg)
12472
 
+{
12473
 
+        int rc;
12474
 
+        
12475
 
+        if ((!node) || (!inode))
12476
 
+                rc = -EINVAL;
12477
 
+        else
12478
 
+                rc = IOCTL(((evms_logical_node_t*)(node->instance_data)), inode, file, cmd, arg);
12479
 
+        return(rc);
12480
 
+}
12481
 
+
12482
 
+
12483
 
+static int 
12484
 
+passthru_mgr_init_io(
12485
 
+       evms_logical_node_t   * node,
12486
 
+        int                    io_flag,        /* 0=read, 1=write*/
12487
 
+        evms_sector_t           sect_nr,        /* disk LBA */
12488
 
+        evms_sector_t           num_sects,      /* # of sectors */
12489
 
+        void                  * buf_addr )      /* buffer address */
12490
 
+{
12491
 
+       int rc;
12492
 
+       if ((sect_nr + num_sects) <= node->total_vsectors) {
12493
 
+               rc = INIT_IO(((evms_logical_node_t*)(node->instance_data)),
12494
 
+                            io_flag, sect_nr, num_sects, buf_addr);
12495
 
+       } else
12496
 
+               rc = -EINVAL;
12497
 
+        return(rc);
12498
 
+}
12499
 
+
12500
 
+
12501
 
+
12502
 
+/*
12503
 
+ * Function: passthru_init
12504
 
+ *
12505
 
+ */
12506
 
+int __init 
12507
 
+evms_passthru_manager_init(void)
12508
 
+{
12509
 
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
12510
 
+}
12511
 
+
12512
 
+void __exit
12513
 
+evms_passthru_manager_exit(void)
12514
 
+{
12515
 
+       evms_cs_unregister_plugin(&plugin_header);
12516
 
+}
12517
 
+
12518
 
+module_init(evms_passthru_manager_init);
12519
 
+module_exit(evms_passthru_manager_exit);
12520
 
+#ifdef MODULE_LICENSE
12521
 
+MODULE_LICENSE("GPL");
12522
 
+#endif
12523
 
+
12524
 
diff -Naur linux-2002-03-28/drivers/evms/ldev_mgr.c evms-2002-03-28/drivers/evms/ldev_mgr.c
12525
 
--- linux-2002-03-28/drivers/evms/ldev_mgr.c    Wed Dec 31 18:00:00 1969
12526
 
+++ evms-2002-03-28/drivers/evms/ldev_mgr.c     Wed Mar 27 16:25:55 2002
12527
 
@@ -0,0 +1,1262 @@
12528
 
+/* -*- linux-c -*- */
12529
 
+/*
12530
 
+ *
12531
 
+ *   Copyright (c) International Business Machines  Corp., 2000
12532
 
+ *
12533
 
+ *   This program is free software;  you can redistribute it and/or modify
12534
 
+ *   it under the terms of the GNU General Public License as published by
12535
 
+ *   the Free Software Foundation; either version 2 of the License, or
12536
 
+ *   (at your option) any later version.
12537
 
+ *
12538
 
+ *   This program is distributed in the hope that it will be useful,
12539
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
12540
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12541
 
+ *   the GNU General Public License for more details.
12542
 
+ *
12543
 
+ *   You should have received a copy of the GNU General Public License
12544
 
+ *   along with this program;  if not, write to the Free Software
12545
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12546
 
+ */
12547
 
+
12548
 
+/* linux/driver/evms/ldev_mgr.c
12549
 
+ *
12550
 
+ * EVMS - Local Device (Hard Drive) Manager
12551
 
+ *
12552
 
+ *  This plugin walks the gendisk list and creates logical disk structures for each
12553
 
+ *  local ide or scsi device.
12554
 
+ *
12555
 
+ */
12556
 
+
12557
 
+#include <linux/config.h>
12558
 
+#include <linux/module.h>
12559
 
+#include <linux/errno.h>
12560
 
+#include <linux/kernel.h>
12561
 
+#include <linux/fs.h>
12562
 
+#include <linux/major.h>
12563
 
+#include <linux/slab.h>
12564
 
+#include <asm/uaccess.h>
12565
 
+#include <linux/blk.h>      /* must be included by all block drivers */
12566
 
+#include <linux/genhd.h>
12567
 
+#include <linux/ide.h>
12568
 
+#include "../scsi/scsi.h"
12569
 
+#include "../scsi/sd.h"
12570
 
+#include <linux/init.h>
12571
 
+#include <linux/evms/evms_kernel.h>
12572
 
+
12573
 
+#define LOG_PREFIX "ldev_mgr: "
12574
 
+
12575
 
+#define EVMS_LOCAL_DEVICE_MANAGER_ID    1
12576
 
+
12577
 
+/* local instance data structure definition */            
12578
 
+typedef struct ldev_mgr_instance_data_s {
12579
 
+        kdev_t                  dev;
12580
 
+        struct gendisk         *gd;
12581
 
+       int                     media_changed;
12582
 
+} ldev_mgr_instance_data_t;
12583
 
+
12584
 
+/* prototypes for mandatory plugin interface functions */
12585
 
+static int  discover_disks(evms_logical_node_t **);
12586
 
+static int  ldev_mgr_delete(evms_logical_node_t *);
12587
 
+static void ldev_mgr_read(evms_logical_node_t *, eio_t *);
12588
 
+static void ldev_mgr_write(evms_logical_node_t *, eio_t *);
12589
 
+static int  ldev_mgr_ioctl(evms_logical_node_t *, 
12590
 
+                          struct inode *, 
12591
 
+                           struct file *, 
12592
 
+                          unsigned int, 
12593
 
+                          unsigned long);
12594
 
+static int  ldev_init_io(evms_logical_node_t *, 
12595
 
+                        int, 
12596
 
+                        evms_sector_t,
12597
 
+                        evms_sector_t,
12598
 
+                         void *);
12599
 
+
12600
 
+/* plugin function table definition */
12601
 
+static  evms_plugin_function_table_t function_table = {
12602
 
+        discover   : &discover_disks,
12603
 
+        delete     : &ldev_mgr_delete,
12604
 
+        read       : &ldev_mgr_read,
12605
 
+        write      : &ldev_mgr_write,
12606
 
+        init_io    : &ldev_init_io,
12607
 
+        ioctl      : &ldev_mgr_ioctl
12608
 
+};
12609
 
+
12610
 
+/* plugin header definition */
12611
 
+static evms_plugin_header_t plugin_header = {
12612
 
+        id              : SetPluginID(
12613
 
+                IBM_OEM_ID,
12614
 
+                EVMS_DEVICE_MANAGER,
12615
 
+                EVMS_LOCAL_DEVICE_MANAGER_ID),
12616
 
+        version         : { 
12617
 
+                major      : 1,
12618
 
+                minor      : 0,
12619
 
+                patchlevel : 0 
12620
 
+        },
12621
 
+        required_common_services_version : { 
12622
 
+                major      : 0,
12623
 
+                minor      : 5,
12624
 
+                patchlevel : 0
12625
 
+        },
12626
 
+        function_table  : &function_table
12627
 
+};
12628
 
+
12629
 
+#define TYPE_NONE      0
12630
 
+#define TYPE_GENERIC   1
12631
 
+#define TYPE_IDE       2
12632
 
+#define TYPE_SCSI      3
12633
 
+
12634
 
+#define INDEX_ALPHA    0
12635
 
+#define INDEX_NUMERIC  1
12636
 
+
12637
 
+/********************************************************/
12638
 
+/* Required Plugin Function Table Entry Point:          */
12639
 
+/*      Discover function & Support routines            */
12640
 
+/********************************************************/
12641
 
+
12642
 
+#define MAX_NAME_BASE_SIZE     10
12643
 
+#define MAX_NAME_MODIFIER_SIZE 4
12644
 
+typedef struct blk_device_info_s {
12645
 
+       char devnode_name_base[MAX_NAME_BASE_SIZE];
12646
 
+       char null1;
12647
 
+       char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE];
12648
 
+       char null2;
12649
 
+       int devnode_name_index;
12650
 
+       int devnode_name_type;
12651
 
+       int device_type;
12652
 
+} blk_device_info_t;
12653
 
+
12654
 
+static blk_device_info_t *blk_dev_info = NULL;
12655
 
+
12656
 
+#define BLK_DEV_INFO(a,b,c,d,e)                                \
12657
 
+       strncpy(blk_dev_info[a].devnode_name_base, b, MAX_NAME_BASE_SIZE);      \
12658
 
+       blk_dev_info[a].null1 = 0;                              \
12659
 
+       strncpy(blk_dev_info[a].devnode_name_modifier, c, MAX_NAME_MODIFIER_SIZE);      \
12660
 
+       blk_dev_info[a].null2 = 0;                              \
12661
 
+       blk_dev_info[a].devnode_name_index = 0;                 \
12662
 
+       blk_dev_info[a].device_type = d;                        \
12663
 
+       blk_dev_info[a].devnode_name_type = e;
12664
 
+       
12665
 
+static void 
12666
 
+init_blk_dev_info( blk_device_info_t *blk_dev_info )
12667
 
+{
12668
 
+       BLK_DEV_INFO( IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA );
12669
 
+       BLK_DEV_INFO( IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA );
12670
 
+       BLK_DEV_INFO( IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA );
12671
 
+       BLK_DEV_INFO( IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA );
12672
 
+       BLK_DEV_INFO( IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA );
12673
 
+       BLK_DEV_INFO( IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA );
12674
 
+       BLK_DEV_INFO( IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA );
12675
 
+       BLK_DEV_INFO( IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA );
12676
 
+       BLK_DEV_INFO( IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA );
12677
 
+       BLK_DEV_INFO( IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA );
12678
 
+
12679
 
+       BLK_DEV_INFO( SCSI_DISK0_MAJOR, "sd", "a",  TYPE_SCSI, INDEX_ALPHA );
12680
 
+       BLK_DEV_INFO( SCSI_DISK1_MAJOR, "sd", "q",  TYPE_SCSI, INDEX_ALPHA );
12681
 
+       BLK_DEV_INFO( SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA );
12682
 
+       BLK_DEV_INFO( SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA );
12683
 
+       BLK_DEV_INFO( SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA );
12684
 
+       BLK_DEV_INFO( SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA );
12685
 
+       BLK_DEV_INFO( SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA );
12686
 
+       BLK_DEV_INFO( SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA );
12687
 
+
12688
 
+//     BLK_DEV_INFO( MD_MAJOR, "md", "0", TYPE_GENERIC, INDEX_NUMERIC );
12689
 
+
12690
 
+       BLK_DEV_INFO( XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA );
12691
 
+
12692
 
+       BLK_DEV_INFO( CYCLADES_MAJOR, "double", "0", TYPE_GENERIC, INDEX_NUMERIC );
12693
 
+
12694
 
+       BLK_DEV_INFO( MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA );
12695
 
+
12696
 
+       BLK_DEV_INFO( ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA );
12697
 
+
12698
 
+       BLK_DEV_INFO( PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA );
12699
 
+
12700
 
+       BLK_DEV_INFO( 40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA );
12701
 
+       BLK_DEV_INFO( 43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC );
12702
 
+       BLK_DEV_INFO( 44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA );
12703
 
+       BLK_DEV_INFO( 45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA );
12704
 
+       BLK_DEV_INFO( 47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC );
12705
 
+
12706
 
+       BLK_DEV_INFO( DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12707
 
+       BLK_DEV_INFO( DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12708
 
+       BLK_DEV_INFO( DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12709
 
+       BLK_DEV_INFO( DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12710
 
+       BLK_DEV_INFO( DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12711
 
+       BLK_DEV_INFO( DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12712
 
+       BLK_DEV_INFO( DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12713
 
+       BLK_DEV_INFO( DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12714
 
+
12715
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR,  "ida/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12716
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12717
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12718
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12719
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12720
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12721
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12722
 
+       BLK_DEV_INFO( COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12723
 
+
12724
 
+       BLK_DEV_INFO( I2O_MAJOR + 0, "i2o/hd", "a",  TYPE_GENERIC, INDEX_ALPHA );
12725
 
+       BLK_DEV_INFO( I2O_MAJOR + 1, "i2o/hd", "q",  TYPE_GENERIC, INDEX_ALPHA );
12726
 
+       BLK_DEV_INFO( I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA );
12727
 
+       BLK_DEV_INFO( I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA );
12728
 
+       BLK_DEV_INFO( I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA );
12729
 
+       BLK_DEV_INFO( I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA );
12730
 
+       BLK_DEV_INFO( I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA );
12731
 
+       BLK_DEV_INFO( I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA );
12732
 
+
12733
 
+       BLK_DEV_INFO( 92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12734
 
+       BLK_DEV_INFO( 93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA );
12735
 
+
12736
 
+       BLK_DEV_INFO( DASD_MAJOR,  "dasd", "a", TYPE_GENERIC, INDEX_ALPHA );
12737
 
+       BLK_DEV_INFO( MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA );
12738
 
+
12739
 
+       BLK_DEV_INFO( 96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12740
 
+       BLK_DEV_INFO( 97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12741
 
+
12742
 
+       BLK_DEV_INFO( UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC );
12743
 
+
12744
 
+       BLK_DEV_INFO( JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC );
12745
 
+
12746
 
+       BLK_DEV_INFO( 101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC );
12747
 
+       
12748
 
+       BLK_DEV_INFO( 104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12749
 
+       BLK_DEV_INFO( 105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12750
 
+       BLK_DEV_INFO( 106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12751
 
+       BLK_DEV_INFO( 107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12752
 
+       BLK_DEV_INFO( 108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12753
 
+       BLK_DEV_INFO( 108, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12754
 
+       BLK_DEV_INFO( 110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12755
 
+       BLK_DEV_INFO( 111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC );
12756
 
+
12757
 
+       BLK_DEV_INFO( RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC );
12758
 
+
12759
 
+       BLK_DEV_INFO( VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC );
12760
 
+       BLK_DEV_INFO( VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC );
12761
 
+}
12762
 
+
12763
 
+static int 
12764
 
+is_in_device_list(
12765
 
+       struct gendisk *gd, 
12766
 
+       int major, int minor)
12767
 
+{
12768
 
+       int found, done, rc;
12769
 
+       evms_logical_node_t *device = NULL;
12770
 
+       ldev_mgr_instance_data_t *LID;
12771
 
+
12772
 
+       done = found = FALSE;
12773
 
+       while(done == FALSE) {
12774
 
+               rc = evms_cs_find_next_device(device, &device);
12775
 
+               if (rc || !device)
12776
 
+                       done = TRUE;
12777
 
+               else {
12778
 
+                       LID = device->instance_data;
12779
 
+                       if (LID->gd == gd)
12780
 
+                               if (MAJOR(LID->dev) == major)
12781
 
+                                       if (MINOR(LID->dev) == minor)
12782
 
+                                               done = found = TRUE;
12783
 
+               }
12784
 
+       }
12785
 
+       return(found);
12786
 
+}
12787
 
+
12788
 
+static void 
12789
 
+build_devnode_name(char *name_buf, int major)
12790
 
+{
12791
 
+       char buf[11], *modifier, *buf_ptr;
12792
 
+       int int_mod;
12793
 
+       blk_device_info_t *bdi;
12794
 
+
12795
 
+       bdi = &blk_dev_info[major];
12796
 
+
12797
 
+       /* convert the base name modifier to an integer */
12798
 
+       modifier = bdi->devnode_name_modifier;
12799
 
+       int_mod = 0;
12800
 
+       while (*modifier) {
12801
 
+               if (bdi->devnode_name_type == INDEX_ALPHA) {
12802
 
+                       int_mod *= 26;
12803
 
+                       int_mod += *modifier - 'a';
12804
 
+               } else {
12805
 
+                       int_mod *= 10;
12806
 
+                       int_mod += *modifier - '0';
12807
 
+               }
12808
 
+               modifier++;
12809
 
+       }
12810
 
+       /* add in device_index_value */
12811
 
+       int_mod += bdi->devnode_name_index;
12812
 
+       bdi->devnode_name_index++;
12813
 
+
12814
 
+       /* convert integer modifier back to ALPHA/NUMERIC chars */
12815
 
+       memset(buf, 0, sizeof(buf));
12816
 
+       /* fill the buffer from the rear to front with the
12817
 
+        * ascii version of the modifier, leaving space for
12818
 
+        * NULL terminator at the end.
12819
 
+        */
12820
 
+       buf_ptr = &buf[sizeof(buf) - 2];
12821
 
+       do {
12822
 
+               if (bdi->devnode_name_type == INDEX_ALPHA) {
12823
 
+                       *buf_ptr = (int_mod % 26) + 'a';
12824
 
+                       int_mod /= 26;
12825
 
+               } else {
12826
 
+                       *buf_ptr = (int_mod % 10) + '0';
12827
 
+                       int_mod /= 10;
12828
 
+               }
12829
 
+               buf_ptr--;
12830
 
+       } while (int_mod);
12831
 
+
12832
 
+       /* find beginning of modifier in buffer */
12833
 
+       modifier = buf;
12834
 
+       while (!*modifier)
12835
 
+               modifier++;
12836
 
+
12837
 
+       /* build the final device devnode name */
12838
 
+       sprintf(name_buf, "%s%s", 
12839
 
+               bdi->devnode_name_base,
12840
 
+               modifier);
12841
 
+}
12842
 
+
12843
 
+#define DEVICE_KNOWN                   1234
12844
 
+#define DEVICE_UNINITIALIZED           1235
12845
 
+#define DEVICE_MEDIA_NOT_PRESENT       1236
12846
 
+static int 
12847
 
+create_logical_disk(
12848
 
+       evms_logical_node_t **disk_list,
12849
 
+       struct gendisk *gd, 
12850
 
+       int device_index)
12851
 
+{
12852
 
+        int rc = 0, major, minor;
12853
 
+        evms_logical_node_t *new_disk;
12854
 
+        ldev_mgr_instance_data_t *InstData;
12855
 
+       char device_name[EVMS_VOLUME_NAME_SIZE + 1];
12856
 
+
12857
 
+       major = gd->major;
12858
 
+       minor = device_index << gd->minor_shift;
12859
 
+
12860
 
+       /* skip uninitialized devices */
12861
 
+       if (!blk_size[major])
12862
 
+               rc = DEVICE_UNINITIALIZED;
12863
 
+       else if (!blk_size[major][minor])
12864
 
+               rc = DEVICE_UNINITIALIZED;
12865
 
+       if (!rc) {
12866
 
+               /* construct the devnode name for this device */
12867
 
+               build_devnode_name(device_name, major);
12868
 
+
12869
 
+               /* skip devices we already know about */
12870
 
+               if (is_in_device_list(gd, major, minor) == TRUE)
12871
 
+                       rc = DEVICE_KNOWN;
12872
 
+       }
12873
 
+       /* allocate the new node & it's instance data */
12874
 
+       if (!rc)
12875
 
+                rc = evms_cs_allocate_logical_node(&new_disk);
12876
 
+       if (!rc) {
12877
 
+               rc = evms_cs_allocate_memory((void **)&InstData,sizeof(ldev_mgr_instance_data_t));
12878
 
+               if (rc) 
12879
 
+                       evms_cs_deallocate_logical_node(new_disk);
12880
 
+       }
12881
 
+       /* initialize the new node */
12882
 
+       if (!rc) {
12883
 
+               struct hd_geometry dev_geo;
12884
 
+               new_disk->plugin = &plugin_header;
12885
 
+
12886
 
+               /* initialize the instance data */
12887
 
+               new_disk->instance_data = InstData;
12888
 
+               InstData->dev = MKDEV(major, minor);
12889
 
+               InstData->gd = gd;
12890
 
+
12891
 
+               /* determine hardsector size */
12892
 
+               new_disk->hardsector_size = 512;
12893
 
+               if (hardsect_size[major])
12894
 
+                       new_disk->hardsector_size = hardsect_size[major][minor];
12895
 
+
12896
 
+               /* determine block size */
12897
 
+               new_disk->block_size = 1024;
12898
 
+               if (blksize_size[major])
12899
 
+                       new_disk->block_size = blksize_size[major][minor];
12900
 
+
12901
 
+               /* determine the device size in sectors */
12902
 
+               new_disk->total_vsectors = blk_size[major][minor] << 1;
12903
 
+               /* check the size based on the device geometry
12904
 
+                * and use this if its larger than the blk_size
12905
 
+                * info. because of odd(non-even) geometry, the
12906
 
+                * total sector count could be an odd number,
12907
 
+                * and we need to insure we truly reflect the
12908
 
+                * maximum size of the device.
12909
 
+                */
12910
 
+               rc = evms_cs_kernel_ioctl(
12911
 
+                       new_disk,
12912
 
+                       HDIO_GETGEO,
12913
 
+                       (unsigned long)&dev_geo);
12914
 
+               if (rc) {
12915
 
+                       LOG_ERROR("error(%d) retrieving geometry for '%s'.\n",
12916
 
+                                 rc, device_name);
12917
 
+               } else {
12918
 
+                       u64 dev_size;
12919
 
+
12920
 
+                       dev_size = dev_geo.cylinders;
12921
 
+                       dev_size *= (u64)dev_geo.heads;
12922
 
+                       dev_size *= (u64)dev_geo.sectors;
12923
 
+
12924
 
+                       /* convert device size to 512 byte units */
12925
 
+                       dev_size <<= evms_cs_log2(new_disk->hardsector_size) - 9;
12926
 
+
12927
 
+                       if (dev_size > new_disk->total_vsectors) {
12928
 
+                               new_disk->total_vsectors = dev_size;
12929
 
+                       }
12930
 
+                       LOG_DEBUG("blk_size(%Lu), geometry size(%Lu) in 512 byte units.\n",
12931
 
+                                   (u64)blk_size[major][minor] << 1,
12932
 
+                                   dev_size);
12933
 
+               }
12934
 
+
12935
 
+               /* remember removable devices */
12936
 
+               if (gd->flags)
12937
 
+                       if (gd->flags[device_index] & GENHD_FL_REMOVABLE)
12938
 
+                               new_disk->flags |= EVMS_DEVICE_REMOVABLE;
12939
 
+
12940
 
+               /* save the devnode name for this device */
12941
 
+               strcpy(new_disk->name, device_name);
12942
 
+
12943
 
+               /* register this device with evms */
12944
 
+               evms_cs_register_device(new_disk);
12945
 
+               MOD_INC_USE_COUNT;
12946
 
+
12947
 
+               /* append this record the linked list */
12948
 
+               evms_cs_add_logical_node_to_list(disk_list, new_disk);
12949
 
+               LOG_DETAILS("added logical disk(%s) for physical disk(%u,%u,%s), size(%Lu) in 512 byte units\n",
12950
 
+                       new_disk->name,
12951
 
+                       major, minor,
12952
 
+                       new_disk->name,
12953
 
+                       new_disk->total_vsectors);
12954
 
+
12955
 
+       }
12956
 
+       /* reset the "benign" error codes for the caller */
12957
 
+       switch(rc) {
12958
 
+               case DEVICE_UNINITIALIZED:
12959
 
+               case DEVICE_KNOWN:
12960
 
+               case DEVICE_MEDIA_NOT_PRESENT:
12961
 
+                       rc = 0;
12962
 
+       }
12963
 
+        return( rc );
12964
 
+}
12965
 
+
12966
 
+static int 
12967
 
+create_logical_generic_disks(
12968
 
+       evms_logical_node_t **disk_list,
12969
 
+       struct gendisk *gd)
12970
 
+{
12971
 
+        int rc, i;
12972
 
+
12973
 
+        /* This is a generic device */
12974
 
+
12975
 
+        rc = 0;
12976
 
+        LOG_DEBUG("major name = %s\n", gd->major_name);
12977
 
+        LOG_DEBUG("number of real devices = %i\n", gd->nr_real);
12978
 
+        for ( i = 0; i < gd->nr_real; i++ ) {
12979
 
+                LOG_DEBUG("device %d:\n", i);
12980
 
+               rc = create_logical_disk(disk_list, gd, i);
12981
 
+                if (rc) break;
12982
 
+        }
12983
 
+        return( rc );
12984
 
+}
12985
 
+
12986
 
+static int 
12987
 
+create_logical_ide_disks(
12988
 
+       evms_logical_node_t **disk_list,
12989
 
+       struct gendisk *gd) 
12990
 
+{
12991
 
+        int rc = 0, i;
12992
 
+        ide_hwif_t * ide_hwif;
12993
 
+        ide_drive_t * drive;
12994
 
+
12995
 
+        /* This is an IDE device */
12996
 
+       LOG_DEBUG("found IDE major : %i - searching for disks\n",
12997
 
+               gd->major);
12998
 
+
12999
 
+        ide_hwif = gd->real_devices; /* IDE internal data */
13000
 
+        for (i = 0; i < MAX_DRIVES; i++) {
13001
 
+                drive = &(ide_hwif->drives[i]);
13002
 
+                if (drive->present && (drive->media == ide_disk)) {
13003
 
+                       /* force the name index value on ide drives */
13004
 
+                       blk_dev_info[gd->major].devnode_name_index = i;
13005
 
+                       rc = create_logical_disk(disk_list, gd, i);
13006
 
+               }
13007
 
+                if (rc) break;
13008
 
+        }
13009
 
+        return( rc );
13010
 
+}
13011
 
+
13012
 
+static int 
13013
 
+create_logical_scsi_disks(
13014
 
+       evms_logical_node_t **disk_list,
13015
 
+       struct gendisk *gd)
13016
 
+{
13017
 
+        int rc = 0, i;
13018
 
+        Scsi_Disk *SDisks;
13019
 
+        Scsi_Device *SDev;
13020
 
+
13021
 
+        /* This is an SCSI device */
13022
 
+       LOG_DEBUG("found SCSI major : %i - searching for disks\n",gd->major);
13023
 
+        LOG_DEBUG("scsi: major name = %s\n",gd->major_name);
13024
 
+        LOG_DEBUG("scsi: number of real devices = %i\n",gd->nr_real);
13025
 
+        SDisks = gd->real_devices; /* SCSI internal data */
13026
 
+        for ( i = 0; i < gd->nr_real; i++ ) {
13027
 
+                SDev = SDisks[i].device;
13028
 
+                LOG_DEBUG("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n",
13029
 
+                         SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity);
13030
 
+               rc = create_logical_disk(disk_list, gd, i);
13031
 
+                if (rc) break;
13032
 
+        }
13033
 
+        return( rc );
13034
 
+}
13035
 
+
13036
 
+static int 
13037
 
+create_logical_disks(struct gendisk *gd,
13038
 
+                     void * p_disk_list)
13039
 
+{
13040
 
+        int rc = 0;
13041
 
+        evms_logical_node_t **disk_list = p_disk_list;
13042
 
+
13043
 
+        /* create logical disks from all IDE & SCSI devices */
13044
 
+       switch(blk_dev_info[gd->major].device_type) {
13045
 
+               case TYPE_IDE:
13046
 
+                       rc = create_logical_ide_disks(disk_list, gd);
13047
 
+                       break;
13048
 
+               case TYPE_SCSI:
13049
 
+                       rc = create_logical_scsi_disks(disk_list, gd);
13050
 
+                       break;
13051
 
+               case TYPE_GENERIC:
13052
 
+                       rc = create_logical_generic_disks(disk_list, gd);
13053
 
+                       break;
13054
 
+               default:
13055
 
+                       LOG_DEBUG("unrecognized device major : %i\n",gd->major);
13056
 
+                       break;
13057
 
+       }
13058
 
+
13059
 
+        return(rc);
13060
 
+}
13061
 
+                        
13062
 
+static int 
13063
 
+discover_disks(evms_logical_node_t **disk_list)
13064
 
+{
13065
 
+        int rc = 0;
13066
 
+
13067
 
+        LOG_ENTRY_EXIT("%s Entry\n", __FUNCTION__);
13068
 
+
13069
 
+       if (blk_dev_info == NULL) {
13070
 
+               /* allocate space for device info array */
13071
 
+               rc = evms_cs_allocate_memory(
13072
 
+                       (void **)&blk_dev_info,
13073
 
+                       sizeof(blk_device_info_t) * (MAX_BLKDEV + 1));
13074
 
+               if (!rc)
13075
 
+                       /* initialize device info array */
13076
 
+                       init_blk_dev_info(blk_dev_info);
13077
 
+       }
13078
 
+        if (!rc)
13079
 
+                /* create logical disks from the raw devices */
13080
 
+                rc = walk_gendisk(create_logical_disks, disk_list);
13081
 
+
13082
 
+       /* free blk_dev_info table and null the ptr to it */
13083
 
+       evms_cs_deallocate_memory(blk_dev_info);
13084
 
+       blk_dev_info = NULL;
13085
 
+
13086
 
+        LOG_ENTRY_EXIT("%s Exit\n", __FUNCTION__);
13087
 
+        return( rc );
13088
 
+}
13089
 
+
13090
 
+/********************************************************/
13091
 
+/* Required Plugin Function Table Entry Point:          */
13092
 
+/*      Delete function                                 */
13093
 
+/********************************************************/
13094
 
+
13095
 
+static int 
13096
 
+ldev_mgr_delete(evms_logical_node_t *disk)
13097
 
+{
13098
 
+       ldev_mgr_instance_data_t *LID;
13099
 
+
13100
 
+       /* reset any evms volume related info from
13101
 
+        * the device node, because we can't predict
13102
 
+        * how this node will be used in the future.
13103
 
+        */
13104
 
+
13105
 
+       /* removed the feature header if its been used
13106
 
+        */
13107
 
+       if (disk->feature_header) {
13108
 
+               evms_cs_deallocate_memory(disk->feature_header);
13109
 
+               disk->feature_header = NULL;
13110
 
+       }
13111
 
+       /* remove the volume_info structure and flag
13112
 
+        * if this has been used directly by an evms
13113
 
+        * feature.
13114
 
+        */
13115
 
+       evms_cs_deallocate_volume_info(disk);
13116
 
+       /* reset the flags field to the appropriate state
13117
 
+        */
13118
 
+       disk->flags &= ~EVMS_VOLUME_FLAG;
13119
 
+
13120
 
+       /* disk nodes only get deleted when:
13121
 
+        * 1)  there are no references to the disk node
13122
 
+        *      in memory.
13123
 
+        * 2)  the device is removable
13124
 
+        * 3)  the device reported a media change
13125
 
+        *
13126
 
+        * All three of these conditions must be true
13127
 
+        * before the disk node can be deleted. 
13128
 
+        * evms_check_for_device_changes should set
13129
 
+        * and ensure these conditions before issuing
13130
 
+        * deletes.
13131
 
+        *
13132
 
+        * Newly installed removable media will be
13133
 
+        * picked up in this modules discover code.
13134
 
+        */
13135
 
+       if (disk->flags & EVMS_MEDIA_CHANGED) {
13136
 
+               LOG_DETAILS("deleting '%s'.\n",disk->name);
13137
 
+
13138
 
+               evms_cs_unregister_device(disk);
13139
 
+               MOD_DEC_USE_COUNT;
13140
 
+               LID = disk->instance_data;
13141
 
+               if (LID) {
13142
 
+                       evms_cs_deallocate_memory(LID);
13143
 
+               }
13144
 
+               evms_cs_deallocate_logical_node(disk);
13145
 
+       }
13146
 
+        return 0;
13147
 
+}
13148
 
+
13149
 
+/********************************************************/
13150
 
+/* Required Plugin Function Table Entry Point:          */
13151
 
+/*      Read function                                   */
13152
 
+/********************************************************/
13153
 
+
13154
 
+/* 
13155
 
+ * function: ldev_mgr_io_error
13156
 
+ * 
13157
 
+ * this function was primarily created because the function
13158
 
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
13159
 
+ * to be set on inline functions. Since this was an error path
13160
 
+ * and not mainline, I decided to add a trace statement to help
13161
 
+ * report on the failing condition.
13162
 
+ *
13163
 
+ */
13164
 
+static void 
13165
 
+ldev_mgr_io_error(
13166
 
+       evms_logical_node_t *disk,
13167
 
+       int io_flag, 
13168
 
+       eio_t *eio,
13169
 
+       int rc)
13170
 
+{
13171
 
+       if (rc == -EOVERFLOW) {
13172
 
+               LOG_SERIOUS("attempt to %s beyond boundary(%Ld) on (%s), rsector(%Ld).\n",
13173
 
+                       (io_flag) ? "WRITE" : "READ", 
13174
 
+                       disk->total_vsectors - 1,
13175
 
+                       disk->name,
13176
 
+                       eio->rsector);
13177
 
+       } else if (rc == -ENXIO) {
13178
 
+               LOG_SERIOUS("attempt to access a non-existent device(%s).\n",
13179
 
+                           disk->name);
13180
 
+       }
13181
 
+
13182
 
+        EVMS_IO_ERROR(eio);
13183
 
+}
13184
 
+
13185
 
+static void 
13186
 
+ldev_mgr_read(evms_logical_node_t *disk, eio_t *eio)
13187
 
+{
13188
 
+       int rc = 0;
13189
 
+       request_queue_t *q;
13190
 
+        ldev_mgr_instance_data_t *InstData;
13191
 
+
13192
 
+        InstData = disk->instance_data;
13193
 
+       if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
13194
 
+               eio->bh->b_rsector = eio->rsector;
13195
 
+               eio->bh->b_size = eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
13196
 
+               eio->bh->b_rdev = InstData->dev;
13197
 
+               q = blk_get_queue(InstData->dev);
13198
 
+               if (q) {
13199
 
+                       q->make_request_fn(q, READ, eio->bh);
13200
 
+                       return;
13201
 
+               } else {
13202
 
+                       rc = -ENXIO;
13203
 
+                       disk->flags |= EVMS_VOLUME_CORRUPT | 
13204
 
+                               EVMS_VOLUME_GENDISK_GONE;
13205
 
+               }
13206
 
+       } else { 
13207
 
+               rc = -EOVERFLOW;
13208
 
+       }
13209
 
+       if (rc) {
13210
 
+               ldev_mgr_io_error(disk, READ, eio, rc);
13211
 
+       }
13212
 
+}
13213
 
+
13214
 
+/********************************************************/
13215
 
+/* Required Plugin Function Table Entry Point:          */
13216
 
+/*      Write function                                  */
13217
 
+/********************************************************/
13218
 
+
13219
 
+static void 
13220
 
+ldev_mgr_write(evms_logical_node_t *disk, eio_t *eio)
13221
 
+{
13222
 
+       int rc = 0;
13223
 
+        request_queue_t *q;
13224
 
+        ldev_mgr_instance_data_t *InstData;
13225
 
+
13226
 
+        InstData = disk->instance_data;
13227
 
+       if ((eio->rsector + eio->rsize) <= disk->total_vsectors) {
13228
 
+               eio->bh->b_rsector = eio->rsector;
13229
 
+               eio->bh->b_size = eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
13230
 
+               eio->bh->b_rdev = InstData->dev;
13231
 
+               q = blk_get_queue(InstData->dev);
13232
 
+               if (q) {
13233
 
+                       q->make_request_fn(q, WRITE, eio->bh);
13234
 
+                       return;
13235
 
+               } else {
13236
 
+                       rc = -ENXIO;
13237
 
+                       disk->flags |= EVMS_VOLUME_CORRUPT | 
13238
 
+                               EVMS_VOLUME_GENDISK_GONE;
13239
 
+               }
13240
 
+       } else { 
13241
 
+               rc = -EOVERFLOW;
13242
 
+       }
13243
 
+       if (rc) {
13244
 
+               ldev_mgr_io_error(disk, WRITE, eio, rc);
13245
 
+       }
13246
 
+}
13247
 
+
13248
 
+/********************************************************/
13249
 
+/* Required Plugin Function Table Entry Point:          */
13250
 
+/*      Init_io function & Support routines             */
13251
 
+/********************************************************/
13252
 
+
13253
 
+/*
13254
 
+ * function: allocate_bh
13255
 
+ *
13256
 
+ * This function obtains a buffer head from the private 
13257
 
+ * buffer head pool (pre-allocated at EVMS initial 
13258
 
+ * discovery time). 
13259
 
+ *
13260
 
+ * NOTE: All access to the buffer head pool are protected 
13261
 
+ * by a private spinlock.
13262
 
+ *
13263
 
+ */
13264
 
+static inline struct buffer_head *
13265
 
+allocate_bh(void)
13266
 
+{
13267
 
+       struct buffer_head *bh =
13268
 
+               evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
13269
 
+       if (bh) {
13270
 
+               init_waitqueue_head(&bh->b_wait);
13271
 
+       }
13272
 
+       return(bh);
13273
 
+}
13274
 
+
13275
 
+/*
13276
 
+ * function: deallocate_bh
13277
 
+ *
13278
 
+ * This function returns a buffer head to the private 
13279
 
+ * buffer head pool (pre-allocated at EVMS initial 
13280
 
+ * discovery time). 
13281
 
+ *
13282
 
+ * NOTE: All access to the buffer head pool are protected 
13283
 
+ * by a private spinlock.
13284
 
+ *
13285
 
+ */
13286
 
+static inline void 
13287
 
+deallocate_bh(struct buffer_head *bh)
13288
 
+{
13289
 
+       evms_cs_deallocate_to_pool(evms_bh_pool, bh);
13290
 
+}
13291
 
+
13292
 
+/* this is the buffer head control block structure definition */
13293
 
+typedef struct bh_cb_s {
13294
 
+       int                 rc;
13295
 
+        atomic_t            blks_allocated;
13296
 
+        wait_queue_head_t   cb_wait;
13297
 
+} bh_cb_t;
13298
 
+
13299
 
+/*
13300
 
+ * function: __wait_on_bh_cb
13301
 
+ *
13302
 
+ * This is a worker function to wait_on_bh_cb.
13303
 
+ * This function waits for a set of private buffer heads
13304
 
+ * associated to the specified buffer head control block
13305
 
+ * to return from I/O completion. On completion of the
13306
 
+ * last buffer head, the calling function is awakened
13307
 
+ * and continues running.
13308
 
+ *
13309
 
+ * This is the worker function to the function wait_on_bh_cb.
13310
 
+ *
13311
 
+ */
13312
 
+static void 
13313
 
+__wait_on_bh_cb(bh_cb_t *bh_cb)
13314
 
+{
13315
 
+        struct task_struct *tsk = current;
13316
 
+        DECLARE_WAITQUEUE(wait, tsk);
13317
 
+
13318
 
+        add_wait_queue(&bh_cb->cb_wait, &wait);
13319
 
+        do {
13320
 
+                run_task_queue(&tq_disk);
13321
 
+                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
13322
 
+                if (!atomic_read(&bh_cb->blks_allocated))
13323
 
+                        break;
13324
 
+                schedule();
13325
 
+        } while (atomic_read(&bh_cb->blks_allocated));
13326
 
+        tsk->state = TASK_RUNNING;
13327
 
+        remove_wait_queue(&bh_cb->cb_wait, &wait);
13328
 
+}
13329
 
+
13330
 
+/*
13331
 
+ * function: wait_on_bh_cb
13332
 
+ *
13333
 
+ * This function waits for a set of private buffer heads
13334
 
+ * associated to the specified buffer head control block
13335
 
+ * to return from I/O completion. On completion of the
13336
 
+ * last buffer head, the calling function is awakened
13337
 
+ * and continues running.
13338
 
+ *
13339
 
+ */
13340
 
+static void 
13341
 
+wait_on_bh_cb(bh_cb_t *bh_cb)
13342
 
+{
13343
 
+        if (atomic_read(&bh_cb->blks_allocated))
13344
 
+                __wait_on_bh_cb(bh_cb);
13345
 
+       else
13346
 
+               /* if we ended up with no buffer heads on
13347
 
+                * this pass, lets wait a until a few buffer
13348
 
+                * heads have been freed and try again. This
13349
 
+                * should provide a reasonable delay.
13350
 
+                */
13351
 
+               schedule();
13352
 
+}
13353
 
+
13354
 
+/*
13355
 
+ * function: end_bh_cb_io
13356
 
+ *
13357
 
+ * This is the I/O completion function that is called for
13358
 
+ * each private buffer head obtained from the buffer head 
13359
 
+ * pool. Control is return thru this routine so we can track
13360
 
+ * all outstanding requests to know when to awaken the caller,
13361
 
+ * and to regain control after all I/Os have been performed.
13362
 
+ *
13363
 
+ */
13364
 
+static void 
13365
 
+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
13366
 
+{
13367
 
+        bh_cb_t *bh_cb = (bh_cb_t *)bh->b_private;
13368
 
+
13369
 
+       /* record that errors occurred */
13370
 
+       if (!uptodate) {
13371
 
+               bh_cb->rc = -EIO;
13372
 
+       }
13373
 
+        mark_buffer_uptodate(bh, uptodate);
13374
 
+        unlock_buffer(bh);
13375
 
+
13376
 
+        deallocate_bh(bh);
13377
 
+        atomic_dec(&bh_cb->blks_allocated);
13378
 
+        if (!atomic_read(&bh_cb->blks_allocated))
13379
 
+                if (waitqueue_active(&bh_cb->cb_wait))
13380
 
+                    wake_up(&bh_cb->cb_wait);
13381
 
+}
13382
 
+
13383
 
+/*
13384
 
+ * function: ldev_partial_sector_init_io
13385
 
+ *
13386
 
+ * This function is a support function for ldev_init_io,
13387
 
+ * which handles the cases of performing I/O to only a part
13388
 
+ * of non-standard sized hardsector. This function is not 
13389
 
+ * designed to be called directly, but via ldev_init_io.
13390
 
+ *
13391
 
+ */
13392
 
+static int 
13393
 
+ldev_partial_sector_init_io(
13394
 
+       evms_logical_node_t *node,
13395
 
+        int io_flag,
13396
 
+       bh_cb_t *bh_cb,
13397
 
+        u_int64_t next_lsn,
13398
 
+        u_int64_t sector_lsn,
13399
 
+       u_int64_t io_size,
13400
 
+        void *bufptr,
13401
 
+       unsigned char **sector_buf )
13402
 
+{
13403
 
+       int rc = 0;
13404
 
+        ldev_mgr_instance_data_t *InstData = node->instance_data;
13405
 
+        kdev_t dev = InstData->dev;
13406
 
+        struct buffer_head *bh;
13407
 
+
13408
 
+       if (*sector_buf == NULL) {
13409
 
+               /* allocate buffer for incoming sector */
13410
 
+               rc = evms_cs_allocate_memory((void **)sector_buf,
13411
 
+                                            node->hardsector_size);
13412
 
+               if (rc) return(rc);
13413
 
+       }
13414
 
+       /* allocate a buffer head from the pool */
13415
 
+       while((bh = allocate_bh()) == NULL)
13416
 
+               /* yielding the cpu is playing it
13417
 
+                * safe. it might be wiser to just
13418
 
+                * spin. requires more thought.
13419
 
+                */
13420
 
+               schedule();
13421
 
+
13422
 
+       /* set up the buffer head for this sector */
13423
 
+       bh->b_end_io = end_bh_cb_io_sync;
13424
 
+       bh->b_size = node->hardsector_size;
13425
 
+       bh->b_rdev = dev;
13426
 
+       bh->b_rsector = next_lsn - sector_lsn;
13427
 
+       bh->b_data = *sector_buf;
13428
 
+       bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13429
 
+       bh->b_state = 0;
13430
 
+       set_bit(BH_Dirty, &bh->b_state);
13431
 
+       set_bit(BH_Lock, &bh->b_state);
13432
 
+       set_bit(BH_Req, &bh->b_state);
13433
 
+       set_bit(BH_Mapped, &bh->b_state);
13434
 
+       bh->b_private = (void *)bh_cb;
13435
 
+       atomic_inc(&bh_cb->blks_allocated);
13436
 
+
13437
 
+       /* drive the buffer head down   */
13438
 
+       /* to the device                */
13439
 
+       generic_make_request(READ, bh);
13440
 
+
13441
 
+       /* wait for all bh's I/O's to end */
13442
 
+       wait_on_bh_cb(bh_cb);
13443
 
+
13444
 
+       /* copy data to/from user */
13445
 
+       if (io_flag != WRITE)
13446
 
+               /* READ */
13447
 
+               memcpy(bufptr,
13448
 
+                      *sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
13449
 
+                      io_size << EVMS_VSECTOR_SIZE_SHIFT);
13450
 
+       else {
13451
 
+               /* WRITE */
13452
 
+               memcpy(*sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
13453
 
+                      bufptr, 
13454
 
+                      io_size << EVMS_VSECTOR_SIZE_SHIFT);
13455
 
+
13456
 
+               /* allocate a buffer head from the pool */
13457
 
+               while((bh = allocate_bh()) == NULL)
13458
 
+                       /* yielding the cpu is playing it
13459
 
+                        * safe. it might be wiser to just
13460
 
+                        * spin. requires more thought.
13461
 
+                        */
13462
 
+                       schedule();
13463
 
+
13464
 
+               /* set up the buffer head for this sector */
13465
 
+               bh->b_end_io = end_bh_cb_io_sync;
13466
 
+               bh->b_size = node->hardsector_size;
13467
 
+               bh->b_rdev = dev;
13468
 
+               bh->b_rsector = next_lsn - sector_lsn;
13469
 
+               bh->b_data = *sector_buf;
13470
 
+               bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13471
 
+               bh->b_state = 0;
13472
 
+               set_bit(BH_Dirty, &bh->b_state);
13473
 
+               set_bit(BH_Lock, &bh->b_state);
13474
 
+               set_bit(BH_Req, &bh->b_state);
13475
 
+               set_bit(BH_Mapped, &bh->b_state);
13476
 
+               bh->b_private = (void *)bh_cb;
13477
 
+               atomic_inc(&bh_cb->blks_allocated);
13478
 
+
13479
 
+               /* drive the buffer head down   */
13480
 
+               /* to the device                */
13481
 
+               generic_make_request(WRITE, bh);
13482
 
+
13483
 
+               /* wait for all bh's I/O's to end */
13484
 
+               wait_on_bh_cb(bh_cb);
13485
 
+       }
13486
 
+       return(rc);
13487
 
+}
13488
 
+
13489
 
+/*
13490
 
+ * function: ldev_init_io
13491
 
+ *
13492
 
+ * This function provides support for synchronous I/O 
13493
 
+ * operations to the underlying devices. These I/O 
13494
 
+ * operations are NOT buffered in any way including the 
13495
 
+ * operating system's buffer cache.
13496
 
+ *
13497
 
+ * This function can work with any hardsector size that
13498
 
+ * is a power of 2.
13499
 
+ *
13500
 
+ * node           : logical node of the target logical disk
13501
 
+ * io_flag        : 0 = read, 1 = write, 2 = read-a-head
13502
 
+ * starting_lsn   : the 0-based (disk relative) logical
13503
 
+ *               :  (512 byte) sector number (lsn) 
13504
 
+ * num_lsns       : the total number of lsns in this I/O
13505
 
+ * bufptr         : address of the memory to read/write the data
13506
 
+ *
13507
 
+ */
13508
 
+static int 
13509
 
+ldev_init_io( 
13510
 
+       evms_logical_node_t *node,
13511
 
+        int io_flag,
13512
 
+        u_int64_t starting_lsn,
13513
 
+       u_int64_t num_lsns,
13514
 
+        void *bufptr )
13515
 
+{
13516
 
+        int rc = 0, lsns_per_hardsector, lsns_per_blocksize;
13517
 
+       unchar *sector_buf = NULL, *cur_bufptr;
13518
 
+        u_int64_t next_lsn, remaining_lsns, sector_lsn;
13519
 
+        ldev_mgr_instance_data_t *InstData = node->instance_data;
13520
 
+        kdev_t dev = InstData->dev;
13521
 
+        bh_cb_t bh_cb;
13522
 
+
13523
 
+        LOG_EVERYTHING("%s Entry: Disk(%u,%u), ioflag(%u), start_lsn(%Lu), num_lsns(%Lu), bufptr(0x%p)\n",
13524
 
+                  __FUNCTION__, MAJOR(dev), MINOR(dev), io_flag, starting_lsn, num_lsns, bufptr);
13525
 
+
13526
 
+       /* check for valid device */
13527
 
+       if (!blk_size[MAJOR(dev)][MINOR(dev)]) {
13528
 
+               node->flags |= EVMS_VOLUME_CORRUPT | 
13529
 
+                       EVMS_VOLUME_GENDISK_GONE;
13530
 
+               return(-ENXIO);
13531
 
+       }
13532
 
+       /* check for 0 length request */
13533
 
+        if ( num_lsns == 0 ) {
13534
 
+               LOG_ERROR("%s: error requesting 0 sectors.\n", __FUNCTION__);
13535
 
+                return(-EINVAL);
13536
 
+       }
13537
 
+       /* check for out of bound request */
13538
 
+       if ( (starting_lsn + num_lsns) > node->total_vsectors) {
13539
 
+               LOG_ERROR("%s: attempted %s beyond logical disk boundary(%Lu LSNs), requesting LSN(%Lu), total LSNs(%Lu).\n",
13540
 
+                          __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
13541
 
+                       node->total_vsectors,
13542
 
+                       starting_lsn, num_lsns);
13543
 
+               return(-EINVAL);
13544
 
+       }
13545
 
+       /* check for invalid io_flag value */
13546
 
+       switch( io_flag ) {
13547
 
+               case READ:   /* read...   */
13548
 
+               case WRITE:  /* write...  */
13549
 
+               case READA:  /* reada...  */
13550
 
+                       break;
13551
 
+               default:
13552
 
+                       return(-EINVAL);
13553
 
+       }
13554
 
+
13555
 
+       /* compute some per device info once up-front */
13556
 
+       lsns_per_hardsector = node->hardsector_size / EVMS_VSECTOR_SIZE;
13557
 
+       lsns_per_blocksize = node->block_size / EVMS_VSECTOR_SIZE;
13558
 
+
13559
 
+       /* initialize the buffer head control block */
13560
 
+       memset(&bh_cb, 0, sizeof(bh_cb_t));
13561
 
+       init_waitqueue_head(&bh_cb.cb_wait);
13562
 
+
13563
 
+       /* only update the local copy of variables */
13564
 
+       cur_bufptr = bufptr;
13565
 
+       next_lsn = starting_lsn;
13566
 
+       remaining_lsns = num_lsns;
13567
 
+
13568
 
+       /* check for a mid-sector starting offset
13569
 
+        *
13570
 
+        * if found, perform I/O on part of that
13571
 
+        * sector
13572
 
+        */
13573
 
+       sector_lsn = next_lsn & (lsns_per_hardsector - 1);
13574
 
+       if (sector_lsn) {
13575
 
+               u_int64_t io_size;
13576
 
+
13577
 
+               /* determine bytes in IO to this sector */
13578
 
+               io_size = lsns_per_hardsector - sector_lsn;
13579
 
+               if (io_size > remaining_lsns)
13580
 
+                       io_size = remaining_lsns;
13581
 
+
13582
 
+               /* perform the partial sector io */
13583
 
+               rc = ldev_partial_sector_init_io(
13584
 
+                       node,io_flag, &bh_cb,
13585
 
+                       next_lsn,
13586
 
+                       sector_lsn, io_size,
13587
 
+                       cur_bufptr, &sector_buf);
13588
 
+
13589
 
+               if (!rc) {
13590
 
+                       /* update progress in local variables */
13591
 
+                       cur_bufptr += io_size << EVMS_VSECTOR_SIZE_SHIFT;
13592
 
+                       next_lsn += io_size;
13593
 
+                       remaining_lsns -= io_size;
13594
 
+               }
13595
 
+       }
13596
 
+
13597
 
+       /* continue if no errors found */
13598
 
+       if (!rc) {
13599
 
+               /* perform I/O on all the complete sectors
13600
 
+                * in this request.
13601
 
+                *
13602
 
+                * loop until there are no more complete sectors
13603
 
+                * to process.
13604
 
+                */
13605
 
+               while(remaining_lsns >= lsns_per_hardsector) {
13606
 
+                       /* this inner loop attempts to drive as many
13607
 
+                        * bytes (in sector size multiples) down to 
13608
 
+                        * the device as possible using the available
13609
 
+                        * buffer heads in the pool.
13610
 
+                        */
13611
 
+                       while(remaining_lsns >= lsns_per_hardsector) {
13612
 
+                               struct buffer_head *bh;
13613
 
+
13614
 
+                               /* allocate a buffer head from the pool */
13615
 
+                               bh = allocate_bh();
13616
 
+                               if (bh == NULL) break;
13617
 
+
13618
 
+                               /* set up the buffer head for this I/O */
13619
 
+                               bh->b_end_io = end_bh_cb_io_sync;
13620
 
+                               bh->b_size = 
13621
 
+                                       (remaining_lsns >= lsns_per_blocksize) ?
13622
 
+                                        node->block_size :  
13623
 
+                                       node->hardsector_size;
13624
 
+                               bh->b_data = cur_bufptr;
13625
 
+                               bh->b_rdev = dev;
13626
 
+                               bh->b_rsector = next_lsn;
13627
 
+                               bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
13628
 
+                               bh->b_state = 0;
13629
 
+                               set_bit(BH_Dirty, &bh->b_state);
13630
 
+                               set_bit(BH_Lock, &bh->b_state);
13631
 
+                               set_bit(BH_Req, &bh->b_state);
13632
 
+                               set_bit(BH_Mapped, &bh->b_state);
13633
 
+                               bh->b_private = (void *)&bh_cb;
13634
 
+                               atomic_inc(&bh_cb.blks_allocated);
13635
 
+
13636
 
+                               /* drive the buffer head down   */
13637
 
+                               /* to the device                */
13638
 
+                               generic_make_request(io_flag, bh);
13639
 
+
13640
 
+                               /* update progress in local variables */
13641
 
+                               cur_bufptr += bh->b_size;
13642
 
+                               next_lsn += bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
13643
 
+                               remaining_lsns -= bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
13644
 
+                       }
13645
 
+                       /* wait for all bh's I/O's to end */
13646
 
+                       wait_on_bh_cb(&bh_cb);
13647
 
+               }
13648
 
+       }
13649
 
+
13650
 
+       /* continue if no errors found */
13651
 
+       if (!rc)
13652
 
+               /* check for a mid-sector ending offset
13653
 
+                *
13654
 
+                * if found, perform I/O on part of that
13655
 
+                * sector
13656
 
+                */
13657
 
+               if (remaining_lsns)
13658
 
+                       /* perform the partial sector io */
13659
 
+                       rc = ldev_partial_sector_init_io(
13660
 
+                               node, io_flag, &bh_cb,
13661
 
+                               next_lsn,
13662
 
+                               0, remaining_lsns,
13663
 
+                               cur_bufptr, &sector_buf);
13664
 
+
13665
 
+       /* free the sector buffer if it was allocated */
13666
 
+       if (sector_buf)
13667
 
+               evms_cs_deallocate_memory(sector_buf);
13668
 
+
13669
 
+       /* coalesce return codes */
13670
 
+       rc |= bh_cb.rc;
13671
 
+
13672
 
+        LOG_EVERYTHING("%s Exit: rc(%u)\n", __FUNCTION__, rc);
13673
 
+
13674
 
+        return( rc );
13675
 
+}
13676
 
+
13677
 
+/********************************************************/
13678
 
+/* Required Plugin Function Table Entry Point:          */
13679
 
+/*      IOCTL function & Support routines               */
13680
 
+/********************************************************/
13681
 
+
13682
 
+static int 
13683
 
+ldev_mgr_ioctl(
13684
 
+       evms_logical_node_t * disk,
13685
 
+       struct inode * inode,
13686
 
+        struct file * file,
13687
 
+        unsigned int cmd,
13688
 
+        unsigned long arg)
13689
 
+{
13690
 
+        int rc = 0;
13691
 
+        ldev_mgr_instance_data_t *InstData = disk->instance_data;
13692
 
+       kdev_t save_dev;
13693
 
+
13694
 
+        if (!inode || !disk)
13695
 
+                return -EINVAL;
13696
 
+
13697
 
+       save_dev = inode->i_rdev;
13698
 
+       inode->i_rdev = InstData->dev;
13699
 
+        switch (cmd) {
13700
 
+                case EVMS_QUIESCE_VOLUME:
13701
 
+                case EVMS_PLUGIN_IOCTL:
13702
 
+                        rc = 0;
13703
 
+                        break;
13704
 
+               case EVMS_GET_BMAP:
13705
 
+                       {
13706
 
+                               evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
13707
 
+                               bmap->dev = InstData->dev;
13708
 
+                               bmap->status = 0;
13709
 
+                       }
13710
 
+                       break;
13711
 
+               case EVMS_OPEN_VOLUME:
13712
 
+                       rc = InstData->gd->fops->open(inode, file);
13713
 
+                       break;
13714
 
+               case EVMS_CLOSE_VOLUME:
13715
 
+                       rc = InstData->gd->fops->release(inode, file);
13716
 
+                       break;
13717
 
+               case EVMS_CHECK_MEDIA_CHANGE:
13718
 
+                       /* once we detect that media changed
13719
 
+                        * is 'set', don't send any more ioctls
13720
 
+                        * down to the device, until the
13721
 
+                        * media change has been 'reset' by a
13722
 
+                        * revalidate disk ioctl. when already
13723
 
+                        * 'set', just return a 1 w/o actually
13724
 
+                        * performing another ioctl call to the
13725
 
+                        * device.
13726
 
+                        */
13727
 
+                       if (InstData->media_changed == TRUE) {
13728
 
+                               rc = 1;
13729
 
+                               break;
13730
 
+                       }
13731
 
+                       rc = InstData->gd->fops->check_media_change(InstData->dev);
13732
 
+                       if (rc == 1) {
13733
 
+                               InstData->media_changed = TRUE;
13734
 
+                               disk->flags |= EVMS_MEDIA_CHANGED;
13735
 
+                       }
13736
 
+                       break;
13737
 
+               case EVMS_REVALIDATE_DISK:
13738
 
+                       /* don't actually send this ioctl down
13739
 
+                        * to the device, until we know that
13740
 
+                        * previous check media change ioctl
13741
 
+                        * has occurred.
13742
 
+                        *
13743
 
+                        * when we do actually send the ioctl
13744
 
+                        * down, reset the local media_changed
13745
 
+                        * flag.
13746
 
+                        */
13747
 
+                       if (InstData->media_changed == FALSE)
13748
 
+                               break;
13749
 
+                       rc = InstData->gd->fops->revalidate(InstData->dev);
13750
 
+                       InstData->media_changed = FALSE;
13751
 
+                       break;
13752
 
+               case EVMS_GET_DISK_LIST:
13753
 
+                       rc = evms_cs_add_item_to_list(
13754
 
+                               (evms_list_node_t **)arg,
13755
 
+                               disk);
13756
 
+                       if (rc > 0)
13757
 
+                               rc = 0;
13758
 
+                       break;
13759
 
+                default:
13760
 
+                        rc = InstData->gd->fops->ioctl(inode, file, cmd, arg);
13761
 
+                        break;
13762
 
+        }
13763
 
+       inode->i_rdev = save_dev;
13764
 
+
13765
 
+        return( rc );
13766
 
+}
13767
 
+
13768
 
+/********************************************************/
13769
 
+/* Required Module Entry Point:                         */
13770
 
+/*      ldev_mgr_init                                   */
13771
 
+/********************************************************/
13772
 
+
13773
 
+static int __init 
13774
 
+ldev_mgr_init(void)
13775
 
+{
13776
 
+        return evms_cs_register_plugin(&plugin_header);
13777
 
+}
13778
 
+
13779
 
+static void __exit 
13780
 
+ldev_mgr_exit(void)
13781
 
+{
13782
 
+        evms_cs_unregister_plugin(&plugin_header);
13783
 
+}
13784
 
+
13785
 
+module_init(ldev_mgr_init);
13786
 
+module_exit(ldev_mgr_exit);
13787
 
+#ifdef MODULE_LICENSE
13788
 
+MODULE_LICENSE("GPL");
13789
 
+#endif
13790
 
diff -Naur linux-2002-03-28/drivers/evms/lvm_vge.c evms-2002-03-28/drivers/evms/lvm_vge.c
13791
 
--- linux-2002-03-28/drivers/evms/lvm_vge.c     Wed Dec 31 18:00:00 1969
13792
 
+++ evms-2002-03-28/drivers/evms/lvm_vge.c      Thu Mar 28 10:20:25 2002
13793
 
@@ -0,0 +1,3480 @@
13794
 
+/* -*- linux-c -*- */
13795
 
+
13796
 
+/*
13797
 
+ *   Copyright (c) International Business Machines  Corp., 2000
13798
 
+ *
13799
 
+ *   This program is free software;  you can redistribute it and/or modify
13800
 
+ *   it under the terms of the GNU General Public License as published by
13801
 
+ *   the Free Software Foundation; either version 2 of the License, or
13802
 
+ *   (at your option) any later version.
13803
 
+ *
13804
 
+ *   This program is distributed in the hope that it will be useful,
13805
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
13806
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
13807
 
+ *   the GNU General Public License for more details.
13808
 
+ *
13809
 
+ *   You should have received a copy of the GNU General Public License
13810
 
+ *   along with this program;  if not, write to the Free Software
13811
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
13812
 
+ */
13813
 
+/*
13814
 
+ * linux/drivers/evms/lvm_vge.c
13815
 
+ *
13816
 
+ * EVMS Linux LVM Region Manager
13817
 
+ */
13818
 
+
13819
 
+#include <linux/module.h>
13820
 
+#include <linux/kernel.h>
13821
 
+#include <linux/config.h>
13822
 
+#include <linux/genhd.h>
13823
 
+#include <linux/major.h>
13824
 
+#include <linux/string.h>
13825
 
+#include <linux/blk.h>
13826
 
+#include <linux/init.h>
13827
 
+#include <linux/slab.h>
13828
 
+#include <linux/vmalloc.h>
13829
 
+#include <linux/evms/evms_kernel.h>
13830
 
+#include <linux/evms/evms_lvm.h>
13831
 
+#include <asm/system.h>
13832
 
+#include <asm/uaccess.h>
13833
 
+
13834
 
+#define LOG_PREFIX "lvm: "
13835
 
+
13836
 
+// Plugin API prototypes
13837
 
+static int lvm_discover( evms_logical_node_t ** evms_node_list );
13838
 
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list );
13839
 
+static int lvm_delete_node( evms_logical_node_t * logical_node );
13840
 
+static void lvm_read(  evms_logical_node_t     * node,
13841
 
+                       eio_t                   * eio );
13842
 
+static void lvm_write( evms_logical_node_t     * node,
13843
 
+                       eio_t                   * eio );
13844
 
+static int lvm_init_io(        evms_logical_node_t     * node,
13845
 
+                       int                     io_flag,
13846
 
+                       evms_sector_t           sect_nr,
13847
 
+                       evms_sector_t           num_sects,
13848
 
+                       void                    * buf_addr );
13849
 
+static int lvm_ioctl(  evms_logical_node_t     * logical_node,
13850
 
+                       struct inode            * inode,
13851
 
+                       struct file             * file,
13852
 
+                       unsigned int            cmd,
13853
 
+                       unsigned long           arg);
13854
 
+static int lvm_direct_ioctl(   struct inode    * inode,
13855
 
+                               struct file     * file,
13856
 
+                               unsigned int    cmd,
13857
 
+                               unsigned long   args );
13858
 
+
13859
 
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t        org_sector,
13860
 
+                                                       evms_sector_t   snap_sector );
13861
 
+
13862
 
+
13863
 
+// Global LVM data structures
13864
 
+static evms_plugin_function_table_t lvm_function_table = {
13865
 
+       discover        : lvm_discover,
13866
 
+       end_discover    : lvm_discover_end,
13867
 
+       delete          : lvm_delete_node,
13868
 
+       read            : lvm_read,
13869
 
+       write           : lvm_write,
13870
 
+       init_io         : lvm_init_io,
13871
 
+       ioctl           : lvm_ioctl,
13872
 
+       direct_ioctl    : lvm_direct_ioctl
13873
 
+};
13874
 
+
13875
 
+static evms_plugin_header_t lvm_plugin_header = {
13876
 
+       id : SetPluginID(
13877
 
+               IBM_OEM_ID,
13878
 
+               EVMS_REGION_MANAGER,
13879
 
+               0x01 ),
13880
 
+       version : {
13881
 
+               major           : EVMS_LVM_VERSION_MAJOR,
13882
 
+               minor           : EVMS_LVM_VERSION_MINOR,
13883
 
+               patchlevel      : EVMS_LVM_VERSION_PATCH
13884
 
+       },
13885
 
+       required_common_services_version: {
13886
 
+               major           : 0,
13887
 
+               minor           : 5,
13888
 
+               patchlevel      : 0
13889
 
+       },
13890
 
+       function_table : &lvm_function_table
13891
 
+};
13892
 
+
13893
 
+static lvm_volume_group_t      * lvm_group_list = NULL;
13894
 
+static struct proc_dir_entry   * lvm_proc = NULL;
13895
 
+
13896
 
+
13897
 
+
13898
 
+/********** Miscellaneous Functions **********/
13899
 
+
13900
 
+
13901
 
+
13902
 
+/* Function: remap sector 
13903
 
+ *
13904
 
+ *     Common function to remap LV lba to PV lba in appropriate PE. This
13905
 
+ *     function needs to deal with requests that span PEs and/or stripes. If
13906
 
+ *     this occurs, the request will simply be chopped off at the boundary of
13907
 
+ *     the first PE/stripe. It is up to the calling function to loop
13908
 
+ *     accordingly to finish the full remapping. This function is now partially
13909
 
+ *     64-bit enabled. The striping section contains code that currently cannot
13910
 
+ *     eliminate at least one mod operation on 64 bit values.
13911
 
+ */
13912
 
+static int remap_sector(evms_logical_node_t    * node,
13913
 
+                       evms_sector_t           org_sector,     // logical sector to remap
13914
 
+                       evms_sector_t           size,           // size (in sectors) of request to remap
13915
 
+                       evms_sector_t           * new_sector,   // remapped sector
13916
 
+                       evms_sector_t           * new_size,     // new size (in sectors)
13917
 
+                       evms_sector_t           * pe_start_sector,// starting sector of pe - needed for snapshotting
13918
 
+                       lvm_physical_volume_t   ** pv_entry )   // new node for which new_sector is relative
13919
 
+{
13920
 
+       lvm_logical_volume_t    * volume = node->instance_data;
13921
 
+       le_table_entry_t        * le_entry;
13922
 
+       u_int32_t               le;
13923
 
+       u_int32_t               offset_in_le;
13924
 
+
13925
 
+       u_int32_t               sectors_per_column;
13926
 
+       u_int32_t               column;
13927
 
+       u_int32_t               sector_in_column;
13928
 
+       u_int32_t               stripe_in_column;
13929
 
+       u_int32_t               le_in_column;
13930
 
+       u_int32_t               columns;
13931
 
+       u_int32_t               offset_in_stripe;
13932
 
+       u_int32_t               stripe_in_le;
13933
 
+       u_int32_t               org_sector32;   // Needed for striping - not 64-bit enabled
13934
 
+
13935
 
+       *new_size = size;
13936
 
+
13937
 
+       // Check if volume is striped. Reset the size if the request
13938
 
+       // crosses a stripe boundary. Striping in LVM is not 64-bit
13939
 
+       // enabled.
13940
 
+       if ( volume->stripes > 1 ) {
13941
 
+               org_sector32            = org_sector;
13942
 
+               sectors_per_column      = volume->stripes * volume->pe_size;
13943
 
+               column                  = org_sector32 / sectors_per_column;
13944
 
+               sector_in_column        = org_sector32 % sectors_per_column;
13945
 
+               stripe_in_column        = sector_in_column / volume->stripe_size;
13946
 
+               le_in_column            = stripe_in_column % volume->stripes;
13947
 
+               columns                 = volume->num_le / volume->stripes;
13948
 
+               le                      = column + (columns * le_in_column);
13949
 
+
13950
 
+               offset_in_stripe        = org_sector32 % volume->stripe_size;
13951
 
+               stripe_in_le            = stripe_in_column / volume->stripes;
13952
 
+               offset_in_le            = offset_in_stripe + stripe_in_le * volume->stripe_size;
13953
 
+
13954
 
+               if ( offset_in_stripe + size > volume->stripe_size ) {
13955
 
+                       *new_size = volume->stripe_size - offset_in_stripe;
13956
 
+               }
13957
 
+       }
13958
 
+       // Non-striped volume. Just find LE and offset. Reset the size if
13959
 
+       // the request crosses an LE boundary. This path is 64-bit safe.
13960
 
+       else {  
13961
 
+               le              = org_sector >> volume->pe_size_shift;
13962
 
+               offset_in_le    = org_sector & (volume->pe_size - 1);
13963
 
+
13964
 
+               if ( offset_in_le + size > volume->pe_size ) {
13965
 
+                       *new_size = volume->pe_size - offset_in_le;
13966
 
+               }
13967
 
+       }
13968
 
+
13969
 
+       le_entry                = &volume->le_map[le];
13970
 
+       *pe_start_sector        = le_entry->pe_sector_offset;
13971
 
+       *new_sector             = le_entry->pe_sector_offset + offset_in_le;
13972
 
+       *pv_entry               = le_entry->owning_pv;
13973
 
+
13974
 
+       return 0;
13975
 
+}
13976
 
+
13977
 
+
13978
 
+/* Function: add_group_to_list
13979
 
+ *
13980
 
+ *     Add an LVM volume group to the global LVM list. This inserts at
13981
 
+ *     the start of the list, since order isn't particularly important.
13982
 
+ *
13983
 
+ *     So, it appears that order is important. :) Now inserting at the
13984
 
+ *     end of the list instead of the beginning.
13985
 
+ */
13986
 
+static int add_group_to_list( lvm_volume_group_t * group )
13987
 
+{
13988
 
+       lvm_volume_group_t ** p_group;
13989
 
+
13990
 
+       for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
13991
 
+               ;
13992
 
+       }
13993
 
+
13994
 
+       *p_group = group;
13995
 
+       group->next_group = NULL;
13996
 
+
13997
 
+       return 0;
13998
 
+}
13999
 
+
14000
 
+
14001
 
+/* Function: remove_group_from_list
14002
 
+ *
14003
 
+ *     Remove an LVM volume group from the global LVM list.
14004
 
+ */
14005
 
+static int remove_group_from_list( lvm_volume_group_t * group )
14006
 
+{
14007
 
+       lvm_volume_group_t ** p_group;
14008
 
+
14009
 
+       for ( p_group = &lvm_group_list; *p_group; p_group = &(*p_group)->next_group ) {
14010
 
+               if ( *p_group == group ) {
14011
 
+                       *p_group = (*p_group)->next_group;
14012
 
+                       group->next_group = NULL;
14013
 
+                       break;
14014
 
+               }
14015
 
+       }
14016
 
+
14017
 
+       return 0;
14018
 
+}
14019
 
+
14020
 
+
14021
 
+/* Function: find_group_by_uuid
14022
 
+ *
14023
 
+ *     Use the vg_uuid to find the desired volume group.
14024
 
+ */
14025
 
+static int find_group_by_uuid( unsigned char           * vg_uuid,
14026
 
+                               lvm_volume_group_t      ** group)
14027
 
+{
14028
 
+       lvm_volume_group_t * gp;
14029
 
+
14030
 
+       for ( gp = lvm_group_list; gp; gp = gp->next_group ) {
14031
 
+               if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) {
14032
 
+                       *group = gp;
14033
 
+                       return 0;
14034
 
+               }
14035
 
+       }
14036
 
+       *group = NULL;
14037
 
+       return -EINVAL;
14038
 
+}
14039
 
+
14040
 
+
14041
 
+/* Function: find_pv_by_number
14042
 
+ *
14043
 
+ *     Search the PV list of the specified volume group, looking for the
14044
 
+ *     specified PV number. If found, return a pointer to that PV.
14045
 
+ */
14046
 
+static lvm_physical_volume_t * find_pv_by_number(u_int32_t             pv_number,
14047
 
+                                               lvm_volume_group_t      * group )
14048
 
+{
14049
 
+       lvm_physical_volume_t * pv_entry;
14050
 
+
14051
 
+       for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
14052
 
+               if ( pv_entry->pv_number == pv_number ) {
14053
 
+                       return pv_entry;
14054
 
+               }
14055
 
+       }
14056
 
+       return NULL;
14057
 
+}
14058
 
+
14059
 
+
14060
 
+/* Function: translate_lv_name
14061
 
+ *
14062
 
+ *     In LVM, volumes have names based on their dev-node, which follow the
14063
 
+ *     pattern /dev/group_name/volume_name. In EVMS, the same volume needs
14064
 
+ *     to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from
14065
 
+ *     the lv_disk_t needs to be translated before copying to the associated
14066
 
+ *     node. evms_node_name must point to a NAME_LEN sized buffer.
14067
 
+ */
14068
 
+static int translate_lv_name( char * lvm_lv_name, char * evms_node_name )
14069
 
+{
14070
 
+       char * ptr;
14071
 
+
14072
 
+       memset(evms_node_name, 0, NAME_LEN);
14073
 
+
14074
 
+       // Make sure the string starts with /dev/, and skip over it.
14075
 
+       ptr = strstr(lvm_lv_name, DEV_DIRECTORY);
14076
 
+       if ( ptr != lvm_lv_name ) {
14077
 
+               LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name);
14078
 
+               return -EINVAL;
14079
 
+       }
14080
 
+       ptr = &ptr[strlen(DEV_DIRECTORY)];
14081
 
+
14082
 
+       // ptr now points to "group_name/volume_name".
14083
 
+       // Use this to create the name for the EVMS node.
14084
 
+       strcpy(evms_node_name, LVM_DEV_DIRECTORY);
14085
 
+       strncat(evms_node_name, ptr, NAME_LEN-strlen(evms_node_name)-1);
14086
 
+
14087
 
+       return 0;
14088
 
+}
14089
 
+
14090
 
+
14091
 
+/* Function: check_pv_for_lv
14092
 
+ *
14093
 
+ *     Run through all LE maps of all LVs in this group, and make sure the
14094
 
+ *     specified PV is not being pointed to by any LEs.
14095
 
+ */
14096
 
+static int check_pv_for_lv(    lvm_physical_volume_t   * pv_entry,
14097
 
+                               lvm_volume_group_t      * group )
14098
 
+{
14099
 
+       lvm_logical_volume_t    * volume;
14100
 
+       int                     i,j;
14101
 
+
14102
 
+       for ( i = 1; i <= MAX_LV; i++ ) {
14103
 
+               if ( (volume = group->volume_list[i]) ) {
14104
 
+                       for ( j = 0; j < volume->num_le; j++ ) {
14105
 
+                               if ( volume->le_map[j].owning_pv == pv_entry ) {
14106
 
+                                       return -EINVAL;
14107
 
+                               }
14108
 
+                       }
14109
 
+               }
14110
 
+       }
14111
 
+       return 0;
14112
 
+}
14113
 
+
14114
 
+
14115
 
+
14116
 
+/********** Metadata I/O Functions **********/
14117
 
+
14118
 
+
14119
 
+/* Function: endian_convert_pv
14120
 
+ *
14121
 
+ *     Endian-neutral conversion for PV structures.
14122
 
+ */
14123
 
+static inline void endian_convert_pv( pv_disk_t * pv )
14124
 
+{
14125
 
+       pv->version                     = le16_to_cpu(pv->version);
14126
 
+       pv->pv_on_disk.base             = le32_to_cpu(pv->pv_on_disk.base);
14127
 
+       pv->pv_on_disk.size             = le32_to_cpu(pv->pv_on_disk.size);
14128
 
+       pv->vg_on_disk.base             = le32_to_cpu(pv->vg_on_disk.base);
14129
 
+       pv->vg_on_disk.size             = le32_to_cpu(pv->vg_on_disk.size);
14130
 
+       pv->pv_uuidlist_on_disk.base    = le32_to_cpu(pv->pv_uuidlist_on_disk.base);
14131
 
+       pv->pv_uuidlist_on_disk.size    = le32_to_cpu(pv->pv_uuidlist_on_disk.size);
14132
 
+       pv->lv_on_disk.base             = le32_to_cpu(pv->lv_on_disk.base);
14133
 
+       pv->lv_on_disk.size             = le32_to_cpu(pv->lv_on_disk.size);
14134
 
+       pv->pe_on_disk.base             = le32_to_cpu(pv->pe_on_disk.base);
14135
 
+       pv->pe_on_disk.size             = le32_to_cpu(pv->pe_on_disk.size);
14136
 
+       pv->pv_major                    = le32_to_cpu(pv->pv_major);
14137
 
+       pv->pv_number                   = le32_to_cpu(pv->pv_number);
14138
 
+       pv->pv_status                   = le32_to_cpu(pv->pv_status);
14139
 
+       pv->pv_allocatable              = le32_to_cpu(pv->pv_allocatable);
14140
 
+       pv->pv_size                     = le32_to_cpu(pv->pv_size);
14141
 
+       pv->lv_cur                      = le32_to_cpu(pv->lv_cur);
14142
 
+       pv->pe_size                     = le32_to_cpu(pv->pe_size);
14143
 
+       pv->pe_total                    = le32_to_cpu(pv->pe_total);
14144
 
+       pv->pe_allocated                = le32_to_cpu(pv->pe_allocated);
14145
 
+       pv->pe_start                    = le32_to_cpu(pv->pe_start);
14146
 
+}
14147
 
+
14148
 
+
14149
 
+/* Function: read_pv
14150
 
+ *
14151
 
+ *     Read in the PV structure from the specified node. If it contains a
14152
 
+ *     valid PV signature, allocate a new pv_disk_t and copy the data.
14153
 
+ */
14154
 
+static int read_pv(    evms_logical_node_t     * node,
14155
 
+                       pv_disk_t               ** pv )
14156
 
+{
14157
 
+       pv_disk_t * pv_buffer;
14158
 
+
14159
 
+       *pv = NULL;
14160
 
+
14161
 
+       // Buffer for reading the PV metadata.
14162
 
+       pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO);
14163
 
+       if ( ! pv_buffer ) {
14164
 
+               LOG_CRITICAL("Memory error creating buffer to read PV metadata for node %s\n", node->name);
14165
 
+               return -ENOMEM;
14166
 
+       }
14167
 
+
14168
 
+       // Read the first two sectors.
14169
 
+       if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE),
14170
 
+                       evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer) ) {
14171
 
+               LOG_SERIOUS("Error reading PV metadata from node %s\n", node->name);
14172
 
+               kfree(pv_buffer);
14173
 
+               return -EIO;
14174
 
+       }
14175
 
+
14176
 
+       // Endian-neutral conversion of PV metadata.
14177
 
+       endian_convert_pv(pv_buffer);
14178
 
+
14179
 
+       // Check for an LVM signature and make sure the sizes match.
14180
 
+       // Versions 1 and 2 are both valid now. Thanks LVM! :)
14181
 
+       if ( ! ( pv_buffer->id[0] == 'H' &&
14182
 
+                pv_buffer->id[1] == 'M' &&
14183
 
+                (pv_buffer->version == 1 || pv_buffer->version == 2) &&
14184
 
+                pv_buffer->pv_size == node->total_vsectors ) ) {
14185
 
+               LOG_EXTRA("Node %s is not an LVM PV\n", node->name);
14186
 
+               kfree(pv_buffer);
14187
 
+               return -EINVAL;
14188
 
+       }
14189
 
+
14190
 
+       // This is a valid PV. Allocate a new pv_disk_t.
14191
 
+       *pv = kmalloc(sizeof(pv_disk_t), GFP_NOIO);
14192
 
+       if ( ! *pv ) {
14193
 
+               LOG_CRITICAL("Memory error creating new PV for node %s\n", node->name);
14194
 
+               kfree(pv_buffer);
14195
 
+               return -ENOMEM;
14196
 
+       }
14197
 
+
14198
 
+       // Copy the metadata.
14199
 
+       memcpy(*pv, pv_buffer, sizeof(pv_disk_t));
14200
 
+       kfree(pv_buffer);
14201
 
+       return 0;
14202
 
+}
14203
 
+
14204
 
+
14205
 
+/* Function: endian_convert_vg
14206
 
+ *
14207
 
+ *     Endian-neutral conversion for VG structures
14208
 
+ */
14209
 
+static inline void endian_convert_vg( vg_disk_t * vg )
14210
 
+{
14211
 
+       vg->vg_number   = le32_to_cpu(vg->vg_number);
14212
 
+       vg->vg_access   = le32_to_cpu(vg->vg_access);
14213
 
+       vg->vg_status   = le32_to_cpu(vg->vg_status);
14214
 
+       vg->lv_max      = le32_to_cpu(vg->lv_max);
14215
 
+       vg->lv_cur      = le32_to_cpu(vg->lv_cur);
14216
 
+       vg->lv_open     = le32_to_cpu(vg->lv_open);
14217
 
+       vg->pv_max      = le32_to_cpu(vg->pv_max);
14218
 
+       vg->pv_cur      = le32_to_cpu(vg->pv_cur);
14219
 
+       vg->pv_act      = le32_to_cpu(vg->pv_act);
14220
 
+       vg->dummy       = le32_to_cpu(vg->dummy);
14221
 
+       vg->vgda        = le32_to_cpu(vg->vgda);
14222
 
+       vg->pe_size     = le32_to_cpu(vg->pe_size);
14223
 
+       vg->pe_total    = le32_to_cpu(vg->pe_total);
14224
 
+       vg->pe_allocated= le32_to_cpu(vg->pe_allocated);
14225
 
+       vg->pvg_total   = le32_to_cpu(vg->pvg_total);
14226
 
+}
14227
 
+
14228
 
+
14229
 
+/* Function: read_vg
14230
 
+ *
14231
 
+ *     Read in the VG structure from the specified node. Allocate a new
14232
 
+ *     vg_disk_t and copy the data.
14233
 
+ */
14234
 
+static int read_vg(    evms_logical_node_t     * node,
14235
 
+                       pv_disk_t               * pv,
14236
 
+                       vg_disk_t               ** vg )
14237
 
+{
14238
 
+       vg_disk_t       * vg_buffer;
14239
 
+       unsigned long   vg_sectors;
14240
 
+
14241
 
+       // Allocate a buffer to read the VG metadata.
14242
 
+       vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size);
14243
 
+       vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
14244
 
+       if ( ! vg_buffer ) {
14245
 
+               LOG_CRITICAL("Memory error creating buffer to read VG metadata from node %s\n", node->name);
14246
 
+               return -ENOMEM;
14247
 
+       }
14248
 
+
14249
 
+       // Read the VG metadata.
14250
 
+       if ( INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base), vg_sectors, vg_buffer) ) {
14251
 
+               LOG_SERIOUS("Error reading VG metadata from node %s\n", node->name);
14252
 
+               kfree(vg_buffer);
14253
 
+               return -EIO;
14254
 
+       }
14255
 
+
14256
 
+       // Endian-neutral conversion of VG metadata.
14257
 
+       endian_convert_vg(vg_buffer);
14258
 
+
14259
 
+       // Allocate a new vg_disk_t
14260
 
+       *vg = kmalloc(sizeof(vg_disk_t), GFP_NOIO);
14261
 
+       if ( ! *vg ) {
14262
 
+               LOG_CRITICAL("Memory error creating new VG structure for node %s\n", node->name);
14263
 
+               kfree(vg_buffer);
14264
 
+               return -ENOMEM;
14265
 
+       }
14266
 
+
14267
 
+       // Copy the metadata.
14268
 
+       memcpy(*vg, vg_buffer, sizeof(vg_disk_t));
14269
 
+       kfree(vg_buffer);
14270
 
+       return 0;
14271
 
+}
14272
 
+
14273
 
+
14274
 
+/* Function: read_uuid_list
14275
 
+ */
14276
 
+static int read_uuid_list(     evms_logical_node_t     * node,
14277
 
+                               pv_disk_t               * pv,
14278
 
+                               lvm_volume_group_t      * group )
14279
 
+{
14280
 
+       evms_sector_t   start_sector;
14281
 
+       unsigned long   total_sectors;
14282
 
+       unsigned char   * uuid_buffer;
14283
 
+       unsigned long   buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14284
 
+       unsigned long   uuid_list_size;
14285
 
+       int             i;
14286
 
+
14287
 
+       if ( group->uuid_list ) {
14288
 
+               LOG_EXTRA("Already read PV UUIDs for group %s\n", group->vg_name);
14289
 
+               return 0;
14290
 
+       }
14291
 
+
14292
 
+       start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base);
14293
 
+       total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size);
14294
 
+       uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14295
 
+
14296
 
+       // Allocate memory for the UUID array for this group.
14297
 
+       group->uuid_list = vmalloc(uuid_list_size);
14298
 
+       if ( ! group->uuid_list ) {
14299
 
+               LOG_CRITICAL("Memory error creating UUID list for group %s\n", group->vg_name);
14300
 
+               return -ENOMEM;
14301
 
+       }
14302
 
+       memset(group->uuid_list, 0, uuid_list_size);
14303
 
+
14304
 
+       // Allocate a buffer to perform the I/Os.
14305
 
+       uuid_buffer = kmalloc(buffer_size, GFP_NOIO);
14306
 
+       if ( ! uuid_buffer ) {
14307
 
+               LOG_CRITICAL("Memory error creating I/O buffer for UUID list in group %s\n", group->vg_name);
14308
 
+               vfree(group->uuid_list);
14309
 
+               group->uuid_list = NULL;
14310
 
+               return -ENOMEM;
14311
 
+       }
14312
 
+
14313
 
+       for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14314
 
+               if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, uuid_buffer) ) {
14315
 
+                       LOG_SERIOUS("Error reading PV UUID list from node %s\n", node->name);
14316
 
+                       kfree(uuid_buffer);
14317
 
+                       vfree(group->uuid_list);
14318
 
+                       group->uuid_list = NULL;
14319
 
+                       return -EIO;
14320
 
+               }
14321
 
+
14322
 
+               // Copy the I/O buffer into the UUID array.
14323
 
+               memcpy(&(group->uuid_list[i*EVMS_VSECTOR_SIZE]), uuid_buffer, buffer_size);
14324
 
+       }
14325
 
+
14326
 
+       // Clear out the unused portion at the end of the uuid_list
14327
 
+       memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0, uuid_list_size - pv->pv_uuidlist_on_disk.size);
14328
 
+
14329
 
+       kfree(uuid_buffer);
14330
 
+       return 0;
14331
 
+}
14332
 
+
14333
 
+
14334
 
+/* Function: endian_convert_lv
14335
 
+ *
14336
 
+ *     Endian-neutral conversion for LV structures
14337
 
+ */
14338
 
+static inline void endian_convert_lv( lv_disk_t * lv )
14339
 
+{
14340
 
+       lv->lv_access           = le32_to_cpu(lv->lv_access);
14341
 
+       lv->lv_status           = le32_to_cpu(lv->lv_status);
14342
 
+       lv->lv_open             = le32_to_cpu(lv->lv_open);
14343
 
+       lv->lv_dev              = le32_to_cpu(lv->lv_dev);
14344
 
+       lv->lv_number           = le32_to_cpu(lv->lv_number);
14345
 
+       lv->lv_mirror_copies    = le32_to_cpu(lv->lv_mirror_copies);
14346
 
+       lv->lv_recovery         = le32_to_cpu(lv->lv_recovery);
14347
 
+       lv->lv_schedule         = le32_to_cpu(lv->lv_schedule);
14348
 
+       lv->lv_size             = le32_to_cpu(lv->lv_size);
14349
 
+       lv->lv_snapshot_minor   = le32_to_cpu(lv->lv_snapshot_minor);
14350
 
+       lv->lv_chunk_size       = le16_to_cpu(lv->lv_chunk_size);
14351
 
+       lv->dummy               = le16_to_cpu(lv->dummy);
14352
 
+       lv->lv_allocated_le     = le32_to_cpu(lv->lv_allocated_le);
14353
 
+       lv->lv_stripes          = le32_to_cpu(lv->lv_stripes);
14354
 
+       lv->lv_stripesize       = le32_to_cpu(lv->lv_stripesize);
14355
 
+       lv->lv_badblock         = le32_to_cpu(lv->lv_badblock);
14356
 
+       lv->lv_allocation       = le32_to_cpu(lv->lv_allocation);
14357
 
+       lv->lv_io_timeout       = le32_to_cpu(lv->lv_io_timeout);
14358
 
+       lv->lv_read_ahead       = le32_to_cpu(lv->lv_read_ahead);
14359
 
+}
14360
 
+
14361
 
+static inline void endian_convert_lvs( lvm_volume_group_t * group )
14362
 
+{
14363
 
+       int i;
14364
 
+       for ( i = 0; i < group->vg->lv_max; i++ ) {
14365
 
+               endian_convert_lv(&(group->lv_array[i]));
14366
 
+       }
14367
 
+}
14368
 
+
14369
 
+
14370
 
+/* Function: read_lv
14371
 
+ *
14372
 
+ *     Read in the LV structures for the specified group. Do the read from
14373
 
+ *     the first PV in the group. If that one fails, keep trying on the
14374
 
+ *     remaining PVs until one works. This function will allocate a buffer
14375
 
+ *     for the group to read in the structures.
14376
 
+ */
14377
 
+static int read_lv( lvm_volume_group_t * group )
14378
 
+{
14379
 
+       lvm_physical_volume_t   * pv_entry = group->pv_list;
14380
 
+       unsigned char           * lv_buffer = NULL;
14381
 
+       evms_sector_t           start_sector;
14382
 
+       unsigned long           total_sectors;
14383
 
+       unsigned long           buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14384
 
+       unsigned long           lv_array_size;
14385
 
+       int                     i, rc = 1;
14386
 
+
14387
 
+       if ( group->lv_array ) {
14388
 
+               return 0;
14389
 
+       }
14390
 
+
14391
 
+       if ( ! pv_entry ) {
14392
 
+               LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n", group->vg_name);
14393
 
+               return -EINVAL;
14394
 
+       }
14395
 
+
14396
 
+       // Allocate a buffer to do the actual I/Os.
14397
 
+       lv_buffer = kmalloc(buffer_size, GFP_NOIO);
14398
 
+       if ( ! lv_buffer ) {
14399
 
+               LOG_CRITICAL("Memory error creating I/O buffer for LV structs for Group %s\n", group->vg_name);
14400
 
+               return -ENOMEM;
14401
 
+       }
14402
 
+
14403
 
+       // Read in the LV structures 4k at a time. If one PV returns errors,
14404
 
+       // start over with the next PV in the group.
14405
 
+       while (rc && pv_entry) {
14406
 
+               start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base);
14407
 
+               total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size);
14408
 
+               lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14409
 
+
14410
 
+               // Allocate the buffer for this group to hold the entire LV array.
14411
 
+               if ( group->lv_array ) {
14412
 
+                       vfree(group->lv_array);
14413
 
+                       group->lv_array = NULL;
14414
 
+               }
14415
 
+               group->lv_array = vmalloc(lv_array_size);
14416
 
+               if ( ! group->lv_array ) {
14417
 
+                       LOG_CRITICAL("Memory error creating lv_array buffer for Group %s\n", group->vg_name);
14418
 
+                       kfree(lv_buffer);
14419
 
+                       return -ENOMEM;
14420
 
+               }
14421
 
+               memset(group->lv_array, 0, lv_array_size);
14422
 
+
14423
 
+               for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14424
 
+                       rc = INIT_IO(pv_entry->logical_node, 0, start_sector + i, IO_BUFFER_SECTORS, lv_buffer);
14425
 
+                       if (rc) {
14426
 
+                               LOG_SERIOUS("Error reading LV metadata from node %s in Group %s\n",
14427
 
+                                       pv_entry->logical_node->name, group->vg_name);
14428
 
+
14429
 
+                               // Try the next PV if the current one caused any errors.
14430
 
+                               pv_entry = pv_entry->next;
14431
 
+                               break;
14432
 
+                       }
14433
 
+
14434
 
+                       // Copy the I/O buffer into the lv_array
14435
 
+                       memcpy(&(((char*)(group->lv_array))[i*EVMS_VSECTOR_SIZE]), lv_buffer, buffer_size);
14436
 
+               }
14437
 
+       }
14438
 
+
14439
 
+       if (rc) {
14440
 
+               LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n", group->vg_name);
14441
 
+               kfree(lv_buffer);
14442
 
+               vfree(group->lv_array);
14443
 
+               group->lv_array = NULL;
14444
 
+               return -EIO;
14445
 
+       }
14446
 
+
14447
 
+       // Clear out the unused portion at the end of the lv_array.
14448
 
+       memset(&(((char*)(group->lv_array))[pv_entry->pv->lv_on_disk.size]), 0, lv_array_size - pv_entry->pv->lv_on_disk.size);
14449
 
+
14450
 
+       // Endian-neutral conversion of the LV metadata.
14451
 
+       endian_convert_lvs(group);
14452
 
+
14453
 
+       kfree(lv_buffer);
14454
 
+       return 0;
14455
 
+}
14456
 
+
14457
 
+
14458
 
+/* Function: endian_convert_pe_map
14459
 
+ *
14460
 
+ *     Endian-neutral conversion for PE structures
14461
 
+ */
14462
 
+static inline void endian_convert_pe_map( lvm_physical_volume_t * pv_entry )
14463
 
+{
14464
 
+       int i;
14465
 
+       for ( i = 0; i < pv_entry->pv->pe_total; i++ ) {
14466
 
+               pv_entry->pe_map[i].lv_num = le16_to_cpu(pv_entry->pe_map[i].lv_num);
14467
 
+               pv_entry->pe_map[i].le_num = le16_to_cpu(pv_entry->pe_map[i].le_num);
14468
 
+       }
14469
 
+}
14470
 
+
14471
 
+
14472
 
+/* Function: read_pe_map
14473
 
+ *
14474
 
+ *     Read in the PE map for the specified PV. This function will allocate a
14475
 
+ *     buffer to read in the data.
14476
 
+ */
14477
 
+static int read_pe_map( lvm_physical_volume_t * pv_entry )
14478
 
+{
14479
 
+       evms_logical_node_t     * node = pv_entry->logical_node;
14480
 
+       pv_disk_t               * pv = pv_entry->pv;
14481
 
+       unsigned char           * pe_buffer;
14482
 
+       evms_sector_t           start_sector;
14483
 
+       unsigned long           total_sectors;
14484
 
+       unsigned long           buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
14485
 
+       unsigned long           pe_map_size;
14486
 
+       int                     i;
14487
 
+
14488
 
+       if ( pv_entry->pe_map ) {
14489
 
+               return 0;
14490
 
+       }
14491
 
+
14492
 
+       start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base);
14493
 
+       total_sectors = evms_cs_size_in_vsectors(pv->pe_total * sizeof(pe_disk_t));
14494
 
+       pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
14495
 
+
14496
 
+       // Allocate a buffer to hold the PE map for this PV.
14497
 
+       //pv_entry->pe_map = vmalloc(total_sectors << EVMS_VSECTOR_SIZE_SHIFT);
14498
 
+       pv_entry->pe_map = vmalloc(pe_map_size);
14499
 
+       if ( ! pv_entry->pe_map ) {
14500
 
+               LOG_CRITICAL("Memory error creating PE map for node %s\n", node->name);
14501
 
+               return -ENOMEM;
14502
 
+       }
14503
 
+       memset(pv_entry->pe_map, 0, pe_map_size);
14504
 
+
14505
 
+       // Allocate a buffer for performing the I/O.
14506
 
+       pe_buffer = kmalloc(buffer_size, GFP_NOIO);
14507
 
+       if ( ! pe_buffer ) {
14508
 
+               LOG_CRITICAL("Memory error creating I/O buffer for PE maps for node %s\n", node->name);
14509
 
+               vfree(pv_entry->pe_map);
14510
 
+               pv_entry->pe_map = NULL;
14511
 
+               return -ENOMEM;
14512
 
+       }
14513
 
+
14514
 
+       for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
14515
 
+               if ( INIT_IO(node, 0, start_sector + i, IO_BUFFER_SECTORS, pe_buffer) ) {
14516
 
+                       LOG_SERIOUS("Error reading PE maps from node %s.\n", node->name);
14517
 
+                       kfree(pe_buffer);
14518
 
+                       vfree(pv_entry->pe_map);
14519
 
+                       pv_entry->pe_map = NULL;
14520
 
+                       return -EIO;
14521
 
+               }
14522
 
+               // Copy the data to the actual PE map.
14523
 
+               memcpy(&(((char*)(pv_entry->pe_map))[i*EVMS_VSECTOR_SIZE]), pe_buffer, buffer_size);
14524
 
+       }
14525
 
+
14526
 
+       // Clear out the unused portion at the end of the PE map.
14527
 
+       memset(&(((char*)(pv_entry->pe_map))[total_sectors*EVMS_VSECTOR_SIZE]), 0, pe_map_size - total_sectors*EVMS_VSECTOR_SIZE);
14528
 
+
14529
 
+       // Endian-neutral conversion of the PE metadata.
14530
 
+       endian_convert_pe_map(pv_entry);
14531
 
+
14532
 
+       kfree(pe_buffer);
14533
 
+       return 0;
14534
 
+}
14535
 
+
14536
 
+
14537
 
+
14538
 
+/********** Snapshot Manipulation Functions **********/
14539
 
+
14540
 
+
14541
 
+/* Function: snapshot_check_quiesce_original
14542
 
+ *
14543
 
+ *     For this snapshot LV, check that both it and its original are quiesced.
14544
 
+ */
14545
 
+static int snapshot_check_quiesce_original( lvm_logical_volume_t * snap_volume )
14546
 
+{
14547
 
+       lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
14548
 
+
14549
 
+       if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) {
14550
 
+               return -EINVAL;
14551
 
+       }
14552
 
+
14553
 
+       if ( org_volume &&
14554
 
+            ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
14555
 
+               return -EINVAL;
14556
 
+       }
14557
 
+
14558
 
+       return 0;
14559
 
+}
14560
 
+
14561
 
+
14562
 
+/* Function: snapshot_check_quiesce_all
14563
 
+ *
14564
 
+ *     Go through the list of all snapshots for an original volume, and make
14565
 
+ *     sure everyone is in a quiesced state.
14566
 
+ */
14567
 
+static int snapshot_check_quiesce_all( lvm_logical_volume_t * org_volume )
14568
 
+{
14569
 
+       lvm_logical_volume_t * snap;
14570
 
+
14571
 
+       if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
14572
 
+               return -EINVAL;
14573
 
+       }
14574
 
+
14575
 
+       for ( snap = org_volume->snapshot_next; snap; snap = snap->snapshot_next ) {
14576
 
+               if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) {
14577
 
+                       return -EINVAL;
14578
 
+               }
14579
 
+       }
14580
 
+
14581
 
+       return 0;
14582
 
+}
14583
 
+
14584
 
+
14585
 
+/* Function: invalidate_snapshot_volume
14586
 
+ *
14587
 
+ *     In the event a snapshot volume becomes full or corrupted, its metadata
14588
 
+ *     must be altered in order to prevent it from being used again. Write some
14589
 
+ *     invalid data into the first entry of the COW table. If this volume is
14590
 
+ *     not fully deleted by the user/engine, this invalid COW entry will be
14591
 
+ *     detected by build_snapshot_maps(), and will cause the volume to be
14592
 
+ *     deleted before being exported to EVMS during discover. This is obviously
14593
 
+ *     a hack, but it is the same hack currently used by LVM. We're just trying
14594
 
+ *     to be compatible. :)
14595
 
+ */
14596
 
+static int invalidate_snapshot_volume( lvm_logical_volume_t * snap_volume )
14597
 
+{
14598
 
+       evms_logical_node_t tmp_node;
14599
 
+
14600
 
+       tmp_node.instance_data = snap_volume;
14601
 
+       tmp_node.total_vsectors = snap_volume->lv_size;
14602
 
+
14603
 
+       if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
14604
 
+               LOG_WARNING("Volume %s is not a snapshot. Cannot invalidate\n", snap_volume->name);
14605
 
+               return -EINVAL;
14606
 
+       }
14607
 
+
14608
 
+       LOG_WARNING("Invalidating full/corrupted snapshot volume %s\n", snap_volume->name);
14609
 
+       LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n");
14610
 
+
14611
 
+       if ( snap_volume->cow_table ) {
14612
 
+               snap_volume->cow_table[0].pv_org_rsector = cpu_to_le64(((evms_sector_t)1));
14613
 
+               if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) {
14614
 
+                       LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
14615
 
+               }
14616
 
+       }
14617
 
+       else {
14618
 
+               LOG_SERIOUS("Unable to invalidate snapshot volume %s\n", snap_volume->name);
14619
 
+       }
14620
 
+
14621
 
+       snap_volume->lv_status &= ~LV_ACTIVE;
14622
 
+
14623
 
+       return 0;
14624
 
+}
14625
 
+
14626
 
+
14627
 
+/* Function: remove_snapshot_from_chain
14628
 
+ *
14629
 
+ *     Remove a snapshot volume from its original's chain of snapshots. This
14630
 
+ *     does not delete the snapshot volume. At runtime, we cannot delete
14631
 
+ *     volumes at the region-manager level, because EVMS may have this volume
14632
 
+ *     exported, and there is no way to notify EVMS of the deletion. It will
14633
 
+ *     eventually need to be deleted in the engine, which will then tell the
14634
 
+ *     EVMS kernel services to delete the volume in the kernel.
14635
 
+ */
14636
 
+static int remove_snapshot_from_chain( lvm_logical_volume_t * snap_volume )
14637
 
+{
14638
 
+       lvm_logical_volume_t * org_volume = snap_volume->snapshot_org;
14639
 
+       lvm_logical_volume_t ** p_volume;
14640
 
+
14641
 
+       if ( org_volume ) {
14642
 
+               for ( p_volume = &org_volume->snapshot_next; *p_volume; p_volume = &(*p_volume)->snapshot_next ) {
14643
 
+                       if ( *p_volume == snap_volume ) {
14644
 
+                               *p_volume = snap_volume->snapshot_next;
14645
 
+                               break;
14646
 
+                       }
14647
 
+               }
14648
 
+       }
14649
 
+
14650
 
+       snap_volume->snapshot_org = NULL;
14651
 
+       snap_volume->snapshot_next = NULL;
14652
 
+       return 0;
14653
 
+}
14654
 
+
14655
 
+
14656
 
+/* Function: snapshot_hash
14657
 
+ *
14658
 
+ *     The snapshot hash tables are NEVER going to have 4 billion entries, so
14659
 
+ *     we can safely cast the org_sector to 32 bits and just mod it by the
14660
 
+ *     hash table size.
14661
 
+ */
14662
 
+static u_int32_t snapshot_hash(        evms_sector_t           org_sector,
14663
 
+                               lvm_logical_volume_t    * snap_volume )
14664
 
+{
14665
 
+       return( ((u_int32_t)org_sector) % snap_volume->hash_table_size);
14666
 
+}
14667
 
+
14668
 
+
14669
 
+/* Function: snapshot_search_hash_chain
14670
 
+ *
14671
 
+ *     Search the hash chain that is anchored at the specified head pointer.
14672
 
+ *     If the sector number is found, the result pointer is set to that entry
14673
 
+ *     in the chain, and a 1 is returned. If the sector is not found, the
14674
 
+ *     result pointer is set to the previous entry and 0 is returned. If the
14675
 
+ *     result pointer is NULL, this means either the list is empty, or the
14676
 
+ *     specified sector should become the first list item.
14677
 
+ */
14678
 
+static int snapshot_search_hash_chain( evms_sector_t           org_sector,
14679
 
+                                       snapshot_map_entry_t    * head,
14680
 
+                                       snapshot_map_entry_t    ** result )
14681
 
+{
14682
 
+       snapshot_map_entry_t * curr = head;
14683
 
+       snapshot_map_entry_t * prev = head;
14684
 
+       while ( curr && curr->org_sector < org_sector ) {
14685
 
+               prev = curr;
14686
 
+               curr = curr->next;
14687
 
+       }
14688
 
+       if ( ! curr ) {
14689
 
+               // Either an empty chain or went off the end of the chain.
14690
 
+               *result = prev;
14691
 
+               return 0;
14692
 
+       }
14693
 
+       else if ( curr->org_sector != org_sector ) {
14694
 
+               *result = curr->prev;
14695
 
+               return 0;
14696
 
+       }
14697
 
+       else {
14698
 
+               // Found the desired sector.
14699
 
+               *result = curr;
14700
 
+               return 1;
14701
 
+       }
14702
 
+}
14703
 
+
14704
 
+
14705
 
+/* Function: insert_snapshot_map_entry
14706
 
+ *
14707
 
+ *     Insert a new entry into a snapshot hash chain, immediately following the
14708
 
+ *     specified entry. This function should not be used to add an entry into
14709
 
+ *     an empty list, or as the first entry in an existing list. For that case,
14710
 
+ *     use insert_snapshot_map_entry_at_head().
14711
 
+ */
14712
 
+static int insert_snapshot_map_entry(  snapshot_map_entry_t * entry,
14713
 
+                                       snapshot_map_entry_t * base )
14714
 
+{
14715
 
+       entry->next = base->next;
14716
 
+       entry->prev = base;
14717
 
+       base->next = entry;
14718
 
+       if ( entry->next ) {
14719
 
+               entry->next->prev = entry;
14720
 
+       }
14721
 
+       return 0;
14722
 
+}
14723
 
+
14724
 
+
14725
 
+/* Function: insert_snapshot_map_entry_at_head
14726
 
+ *
14727
 
+ *     Insert a new entry into a snapshot chain as the first entry.
14728
 
+ */
14729
 
+static int insert_snapshot_map_entry_at_head(  snapshot_map_entry_t * entry,
14730
 
+                                               snapshot_map_entry_t ** head )
14731
 
+{
14732
 
+       entry->next = *head;
14733
 
+       entry->prev = NULL;
14734
 
+       *head = entry;
14735
 
+       if ( entry->next ) {
14736
 
+               entry->next->prev = entry;
14737
 
+       }
14738
 
+       return 0;
14739
 
+}
14740
 
+
14741
 
+
14742
 
+/* Function: add_cow_entry_to_snapshot_map
14743
 
+ *
14744
 
+ *     Convert a cow table entry (from the on-disk data) into an appropriate
14745
 
+ *     entry for the snapshot map. Insert this new entry into the appropriate
14746
 
+ *     map for the specified volume.
14747
 
+ *
14748
 
+ *     The cow_entry passed into this function must have already been
14749
 
+ *     endian-converted from disk-order to cpu-order.
14750
 
+ */
14751
 
+static int add_cow_entry_to_snapshot_map(lv_COW_table_disk_t   * cow_entry,
14752
 
+                                       lvm_logical_volume_t    * volume )
14753
 
+{
14754
 
+       snapshot_map_entry_t    * new_entry;
14755
 
+       snapshot_map_entry_t    ** hash_table;
14756
 
+       snapshot_map_entry_t    * chain_head;
14757
 
+       snapshot_map_entry_t    * target_entry;
14758
 
+       u_int32_t               hash_value;
14759
 
+
14760
 
+       if ( cow_entry->pv_org_number == 0 ) {
14761
 
+               return -EINVAL;
14762
 
+       }
14763
 
+       new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector, cow_entry->pv_snap_rsector);
14764
 
+       if ( ! new_entry ) {
14765
 
+               return -ENOMEM;
14766
 
+       }
14767
 
+       new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number, volume->group);
14768
 
+       if ( ! new_entry->snap_pv ) {
14769
 
+               return -EINVAL;
14770
 
+       }
14771
 
+
14772
 
+       hash_value = snapshot_hash(new_entry->org_sector, volume);
14773
 
+       hash_table = volume->snapshot_map[cow_entry->pv_org_number];
14774
 
+       chain_head = hash_table[hash_value];
14775
 
+       if ( snapshot_search_hash_chain(new_entry->org_sector, chain_head, &target_entry) ) {   
14776
 
+               // In general, we should not find this entry in the snapshot
14777
 
+               // map already. However, it could happen on a re-discover, but
14778
 
+               // the build_snapshot_maps function should weed out those cases.
14779
 
+               // In either event, we can simply ignore duplicates.
14780
 
+               LOG_WARNING("Detected a duplicate snapshot map entry\n");
14781
 
+               LOG_WARNING("Snap PV %Ld:%Ld, Org PV %Ld:%Ld\n", cow_entry->pv_snap_number, cow_entry->pv_snap_rsector,
14782
 
+                       cow_entry->pv_org_number, cow_entry->pv_org_rsector);
14783
 
+               kfree(new_entry);
14784
 
+       }
14785
 
+       else {
14786
 
+               if ( target_entry ) {
14787
 
+                       insert_snapshot_map_entry(new_entry, target_entry);
14788
 
+               }
14789
 
+               else {
14790
 
+                       insert_snapshot_map_entry_at_head(new_entry, &hash_table[hash_value]);
14791
 
+               }
14792
 
+       }
14793
 
+
14794
 
+       return 0;
14795
 
+}
14796
 
+
14797
 
+
14798
 
+/* Function: snapshot_remap_sector
14799
 
+ *
14800
 
+ *     Perform a sector remap on a snapshot volume. This should be called from
14801
 
+ *     the I/O read path, after the LE-to-PE translation has already been
14802
 
+ *     performed. First, determine the base sector of the chunk containing the
14803
 
+ *     specified sector, and save the remainder. Then, perform a search through
14804
 
+ *     the snapshot map for the specified volume. If an match is found, change
14805
 
+ *     the PV and sector numbers to the new values. If no match is found, leave
14806
 
+ *     the values alone, meaning the read should proceed down the original
14807
 
+ *     volume.
14808
 
+ */
14809
 
+static void snapshot_remap_sector(     lvm_logical_volume_t    * snap_volume,
14810
 
+                                       evms_sector_t           pe_start_sector,
14811
 
+                                       evms_sector_t           * sector,
14812
 
+                                       lvm_physical_volume_t   ** pv_entry )
14813
 
+{
14814
 
+       snapshot_map_entry_t    ** hash_table;
14815
 
+       snapshot_map_entry_t    * chain_head;
14816
 
+       snapshot_map_entry_t    * result;
14817
 
+       u_int32_t               hash_value;
14818
 
+       evms_sector_t           chunk_sector;
14819
 
+       evms_sector_t           remainder;
14820
 
+
14821
 
+       if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
14822
 
+               return;
14823
 
+       }
14824
 
+
14825
 
+       chunk_sector = ((*sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
14826
 
+       remainder = *sector - chunk_sector;
14827
 
+       hash_value = snapshot_hash(chunk_sector, snap_volume);
14828
 
+       hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number];
14829
 
+       chain_head = hash_table[hash_value];
14830
 
+
14831
 
+       if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) {
14832
 
+               *pv_entry       = result->snap_pv;
14833
 
+               *sector         = result->snap_sector + remainder;
14834
 
+       }
14835
 
+}
14836
 
+
14837
 
+
14838
 
+/* Function: snapshot_read_write_chunk
14839
 
+ *
14840
 
+ *     This function takes care of reading one chunk of data from the
14841
 
+ *     original, and writing it to the snapshot. Since the original now has
14842
 
+ *     a fixed sized buffer for this data, we may have to loop to get the
14843
 
+ *     whole chunk copied.
14844
 
+ */
14845
 
+static int snapshot_read_write_chunk(  lvm_logical_volume_t    * org_volume,
14846
 
+                                       lvm_physical_volume_t   * org_pv,
14847
 
+                                       evms_sector_t           chunk_sector,
14848
 
+                                       lvm_logical_volume_t    * snap_volume,
14849
 
+                                       lvm_physical_volume_t   ** snap_pv,
14850
 
+                                       evms_sector_t           * snap_sector )
14851
 
+{
14852
 
+       u_int32_t       io_size = snap_volume->chunk_size;
14853
 
+       evms_sector_t   snap_pe_start_sector;
14854
 
+       evms_sector_t   size;
14855
 
+       int             i, iterations = 1;
14856
 
+
14857
 
+       if ( org_volume->chunk_size < snap_volume->chunk_size ) {
14858
 
+               iterations = snap_volume->chunk_size / org_volume->chunk_size;
14859
 
+               io_size = org_volume->chunk_size;
14860
 
+       }
14861
 
+
14862
 
+       remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1, snap_sector, &size, &snap_pe_start_sector, snap_pv);
14863
 
+
14864
 
+       // Check for an incomplete volume
14865
 
+       if ( ! *snap_sector || ! *snap_pv ) {
14866
 
+               invalidate_snapshot_volume(snap_volume);
14867
 
+               return -1;
14868
 
+       }
14869
 
+
14870
 
+       for ( i = 0; i < iterations; i++ ) {
14871
 
+
14872
 
+               // Read the chunk from the original volume. This is a physical
14873
 
+               // read, not logical. Thus, stripe boundary considerations are
14874
 
+               // unnecessary. Also, chunks are always aligned with PEs, so PE
14875
 
+               // boundary considerations are unnecessary.
14876
 
+               if ( INIT_IO(org_pv->logical_node, 0, chunk_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
14877
 
+                       return 1;
14878
 
+               }
14879
 
+
14880
 
+               // Write this chunk to the snapshot volume. This does duplicate
14881
 
+               // the local init_io code, but we need to have the remapped
14882
 
+               // sector later on, so this is slightly more efficient. Snapshot
14883
 
+               // volumes cannot be striped, so there is no need to consider
14884
 
+               // stripe-boundary conditions. And just like the read in the
14885
 
+               // previous line, chunks are always aligned with PEs, so we
14886
 
+               // don't have to consider PE-boundary conditions.
14887
 
+               if ( INIT_IO((*snap_pv)->logical_node, 1, *snap_sector + i*io_size, io_size, org_volume->chunk_data_buffer) ) {
14888
 
+                       // An error writing the chunk to the snapshot is the
14889
 
+                       // same situation as the snapshot being full.
14890
 
+                       invalidate_snapshot_volume(snap_volume);
14891
 
+                       return -1;
14892
 
+               }
14893
 
+       }
14894
 
+
14895
 
+       return 0;
14896
 
+}
14897
 
+
14898
 
+
14899
 
+/* Function: snapshot_copy_data
14900
 
+ *
14901
 
+ *     On a write to a snapshotted volume, check all snapshots to see if the
14902
 
+ *     specified chunk has already been remapped. If it has not, read the
14903
 
+ *     original data from the volume, write the data to the next available
14904
 
+ *     chunk on the snapshot, update the COW table, write the COW table to
14905
 
+ *     the snapshot, and insert a new entry into the snapshot map.
14906
 
+ *
14907
 
+ *     Now converted to copy data to a single snapshot. The looping is left
14908
 
+ *     up to lvm_write.
14909
 
+ */
14910
 
+static int snapshot_copy_data( lvm_logical_volume_t    * org_volume,
14911
 
+                               lvm_logical_volume_t    * snap_volume,
14912
 
+                               evms_sector_t           pe_start_sector,
14913
 
+                               evms_sector_t           org_sector,
14914
 
+                               lvm_physical_volume_t   * org_pv )
14915
 
+{
14916
 
+       lvm_physical_volume_t   * snap_pv;
14917
 
+       snapshot_map_entry_t    ** hash_table;
14918
 
+       snapshot_map_entry_t    * chain_head;
14919
 
+       snapshot_map_entry_t    * target_entry;
14920
 
+       snapshot_map_entry_t    * new_map_entry;
14921
 
+       u_int32_t               hash_value;
14922
 
+       evms_sector_t           chunk_sector;
14923
 
+       evms_sector_t           snap_sector;
14924
 
+       int                     rc;
14925
 
+
14926
 
+       // Lock out this snapshot while we are remapping.
14927
 
+       down(&snap_volume->snap_semaphore);
14928
 
+
14929
 
+       // Make sure the snapshot has not been deactivated.
14930
 
+       if ( ! (snap_volume->lv_status & LV_ACTIVE) ) {
14931
 
+               up(&snap_volume->snap_semaphore);
14932
 
+               return 0;
14933
 
+       }
14934
 
+
14935
 
+       // Search the hash table to see if this sector has already been
14936
 
+       // remapped on this snapshot.
14937
 
+       chunk_sector = ((org_sector - pe_start_sector) & ((evms_sector_t)(~(snap_volume->chunk_size - 1)))) + pe_start_sector;
14938
 
+       hash_value = snapshot_hash(chunk_sector, snap_volume);
14939
 
+       hash_table = snap_volume->snapshot_map[org_pv->pv_number];
14940
 
+       chain_head = hash_table[hash_value];
14941
 
+       if ( snapshot_search_hash_chain(chunk_sector, chain_head, &target_entry) ) {
14942
 
+               // Chunk is already remapped.
14943
 
+               up(&snap_volume->snap_semaphore);
14944
 
+               return 0;
14945
 
+       }
14946
 
+       
14947
 
+       // Is there room on the snapshot to remap this chunk?
14948
 
+       if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) {
14949
 
+               // At this point, the snapshot is full. Any further
14950
 
+               // writes to the original will cause the snapshot to
14951
 
+               // become "corrupt" because they can't be remapped.
14952
 
+               // Take this snapshot permanently offline.
14953
 
+               invalidate_snapshot_volume(snap_volume);
14954
 
+               up(&snap_volume->snap_semaphore);
14955
 
+               return 0;
14956
 
+       }
14957
 
+
14958
 
+       rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector, snap_volume, &snap_pv, &snap_sector);
14959
 
+       if ( rc > 0 ) {
14960
 
+               up(&snap_volume->snap_semaphore);
14961
 
+               return -EIO;
14962
 
+       }
14963
 
+       else if ( rc < 0 ) {
14964
 
+               up(&snap_volume->snap_semaphore);
14965
 
+               return 0;
14966
 
+       }
14967
 
+
14968
 
+       // Fill in the appropriate COW table entry and write that
14969
 
+       // metadata sector back to the snapshot volume. Since we are
14970
 
+       // only writing one sector, there are no boundary conditions.
14971
 
+       // Must endian-convert each entry as it is added.
14972
 
+       snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number   = cpu_to_le64((evms_sector_t)(org_pv->pv_number));
14973
 
+       snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector  = cpu_to_le64(chunk_sector);
14974
 
+       snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number  = cpu_to_le64((evms_sector_t)(snap_pv->pv_number));
14975
 
+       snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector = cpu_to_le64(snap_sector);
14976
 
+       if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
14977
 
+               // The data was written to the snapshot, but
14978
 
+               // writing the metadata failed.
14979
 
+               invalidate_snapshot_volume(snap_volume);
14980
 
+               up(&snap_volume->snap_semaphore);
14981
 
+               return 0;
14982
 
+       }
14983
 
+       snap_volume->next_cow_entry++;
14984
 
+       if ( snap_volume->next_cow_entry >= (EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)) ) {
14985
 
+               snap_volume->next_cow_entry = 0;
14986
 
+               snap_volume->current_cow_sector++;
14987
 
+               memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
14988
 
+               if ( lvm_init_io(snap_volume->volume_node, 4, snap_volume->current_cow_sector, 1, snap_volume->cow_table) ) {
14989
 
+                       // Can't clear out the next sector of metadata.
14990
 
+                       invalidate_snapshot_volume(snap_volume);
14991
 
+                       up(&snap_volume->snap_semaphore);
14992
 
+                       return 0;
14993
 
+               }
14994
 
+       }
14995
 
+       snap_volume->next_free_chunk += snap_volume->chunk_size;
14996
 
+
14997
 
+       // Create a new snapshot map entry and add it in the appropriate
14998
 
+       // place in the map.
14999
 
+       if ( ! (new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector)) ) {
15000
 
+               invalidate_snapshot_volume(snap_volume);
15001
 
+               up(&snap_volume->snap_semaphore);
15002
 
+               return -ENOMEM;
15003
 
+       }
15004
 
+       new_map_entry->snap_pv = snap_pv;
15005
 
+       if ( target_entry ) {   
15006
 
+               insert_snapshot_map_entry(new_map_entry, target_entry);
15007
 
+       }
15008
 
+       else {
15009
 
+               insert_snapshot_map_entry_at_head(new_map_entry, &(hash_table[hash_value]));
15010
 
+       }
15011
 
+
15012
 
+       up(&snap_volume->snap_semaphore);
15013
 
+       return 0;
15014
 
+}
15015
 
+
15016
 
+
15017
 
+/* Function: get_snapshot_stats
15018
 
+ */
15019
 
+static int get_snapshot_stats( lvm_snapshot_stat_ioctl_t * snap_stats )
15020
 
+{
15021
 
+       lvm_logical_volume_t    * volume;
15022
 
+       lvm_volume_group_t      * group;
15023
 
+
15024
 
+       // Make sure the parameters are in range.       
15025
 
+       if ( snap_stats->lv_number < 1 ||
15026
 
+            snap_stats->lv_number > MAX_LV ) {
15027
 
+               return 1;
15028
 
+       }
15029
 
+
15030
 
+       // Make sure the specified group and volume exist, and that
15031
 
+       // this is a snapshot volume.
15032
 
+       find_group_by_uuid(snap_stats->vg_uuid, &group);
15033
 
+       if ( ! group ||
15034
 
+            ! (volume = group->volume_list[snap_stats->lv_number]) ||
15035
 
+            ! (volume->lv_access & LV_SNAPSHOT) ) {
15036
 
+               return 1;
15037
 
+       }
15038
 
+
15039
 
+       // Return the starting LBA of the next available chunk.
15040
 
+       snap_stats->next_free_chunk = volume->next_free_chunk;
15041
 
+       snap_stats->lv_status = volume->lv_status;
15042
 
+
15043
 
+       return 0;
15044
 
+}
15045
 
+
15046
 
+
15047
 
+/********** Memory Allocation/Deallocation Functions **********/
15048
 
+
15049
 
+
15050
 
+
15051
 
+/* Function: deallocate_physical_volume
15052
 
+ *
15053
 
+ *     Free the memory used by this physical volume. Do not delete the EVMS
15054
 
+ *     node in this function, since this could be called during an error
15055
 
+ *     path when we want to save the logical node.
15056
 
+ */
15057
 
+static int deallocate_physical_volume( lvm_physical_volume_t * pv_entry )
15058
 
+{
15059
 
+       if ( pv_entry->pv ) {
15060
 
+               kfree(pv_entry->pv);
15061
 
+               pv_entry->pv = NULL;
15062
 
+       }
15063
 
+
15064
 
+       if ( pv_entry->pe_map ) {
15065
 
+               vfree(pv_entry->pe_map);
15066
 
+               pv_entry->pe_map = NULL;
15067
 
+       }
15068
 
+
15069
 
+       kfree(pv_entry);
15070
 
+       return 0;
15071
 
+}
15072
 
+
15073
 
+
15074
 
+/* Function: allocate_physical_volume
15075
 
+ *
15076
 
+ *     Create a new lvm_physical_volume_t for the specified volume group.
15077
 
+ *     Initialize the new PV with the evms node and lvm pv information.
15078
 
+ */
15079
 
+static lvm_physical_volume_t * allocate_physical_volume(evms_logical_node_t    * node,
15080
 
+                                                       pv_disk_t               * pv )
15081
 
+{
15082
 
+       lvm_physical_volume_t * new_pv;
15083
 
+
15084
 
+       new_pv = kmalloc(sizeof(lvm_physical_volume_t), GFP_NOIO);
15085
 
+       if ( ! new_pv ) {
15086
 
+               LOG_CRITICAL("Memory error creating physical volume for node %s.\n", node->name);
15087
 
+               kfree(pv);
15088
 
+               return NULL;
15089
 
+       }
15090
 
+
15091
 
+       // Initialize the PV
15092
 
+       memset(new_pv, 0, sizeof(lvm_physical_volume_t));
15093
 
+       new_pv->logical_node    = node;
15094
 
+       new_pv->pv              = pv;
15095
 
+       new_pv->pv_number       = pv->pv_number;
15096
 
+
15097
 
+       return new_pv;
15098
 
+}
15099
 
+
15100
 
+
15101
 
+/* Function: allocate_snapshot_map_entry
15102
 
+ *
15103
 
+ *     Allocate memory for a new entry in the snapshot map and fill in the
15104
 
+ *     sector values. The PV pointer is not filled in here, but can easily
15105
 
+ *     be found by using the find_pv_by_number function.
15106
 
+ */
15107
 
+static snapshot_map_entry_t * allocate_snapshot_map_entry(evms_sector_t        org_sector,
15108
 
+                                                       evms_sector_t   snap_sector )
15109
 
+{
15110
 
+       snapshot_map_entry_t * new_entry;
15111
 
+
15112
 
+       new_entry = kmalloc(sizeof(snapshot_map_entry_t), GFP_NOIO);
15113
 
+       if ( ! new_entry ) {
15114
 
+               return NULL;
15115
 
+       }
15116
 
+       memset(new_entry, 0, sizeof(snapshot_map_entry_t));
15117
 
+       new_entry->org_sector = org_sector;
15118
 
+       new_entry->snap_sector = snap_sector;
15119
 
+       return new_entry;
15120
 
+}
15121
 
+
15122
 
+
15123
 
+/* Function: deallocate_snapshot_map
15124
 
+ *
15125
 
+ *     This function will delete one hash table, which is part of the whole
15126
 
+ *     snapshot remapping structure. Each hash table is an array of pointers
15127
 
+ *     to linked lists of snapshot_map_entry_t's.
15128
 
+ */
15129
 
+static int deallocate_snapshot_map( snapshot_map_entry_t ** table, u_int32_t table_size )
15130
 
+{
15131
 
+       snapshot_map_entry_t    * entry;
15132
 
+       snapshot_map_entry_t    * next;
15133
 
+       int                     i;
15134
 
+
15135
 
+       if ( table ) {
15136
 
+               for ( i = 0; i < table_size; i++ ) {
15137
 
+                       for ( entry = table[i]; entry; entry = next ) {
15138
 
+                               next = entry->next;
15139
 
+                               kfree(entry);
15140
 
+                       }
15141
 
+               }
15142
 
+               vfree(table);
15143
 
+       }
15144
 
+       return 0;
15145
 
+}
15146
 
+
15147
 
+
15148
 
+/* Function: deallocate_logical_volume
15149
 
+ *
15150
 
+ *     Delete the in-memory representation of a single LVM logical volume,
15151
 
+ *     including its PE map and any snapshot data. Do not alter the parent
15152
 
+ *     volume group, except to remove this volume from its volume list.
15153
 
+ */
15154
 
+static int deallocate_logical_volume( lvm_logical_volume_t * volume )
15155
 
+{
15156
 
+       lvm_volume_group_t      * group = volume->group;
15157
 
+       lvm_logical_volume_t    * org_volume;
15158
 
+       lvm_logical_volume_t    * snap_volume;
15159
 
+       int                     i;
15160
 
+
15161
 
+       // If this volume is a snapshot, remove it from the linked list of
15162
 
+       // volumes that are snapshotting the original. First, the original
15163
 
+       // volume must be quiesced.
15164
 
+       if ( volume->lv_access & LV_SNAPSHOT ) {
15165
 
+               org_volume = volume->snapshot_org;
15166
 
+
15167
 
+               if ( snapshot_check_quiesce_original(volume) ) {
15168
 
+                       return -EINVAL;
15169
 
+               }
15170
 
+
15171
 
+               remove_snapshot_from_chain(volume);
15172
 
+
15173
 
+               // If the snapshot that was just removed was the last/only
15174
 
+               // volume snapshotting the original, then mark the original
15175
 
+               // as no longer being snapshotted.
15176
 
+               if ( org_volume && ! org_volume->snapshot_next ) {
15177
 
+                       org_volume->lv_access &= ~LV_SNAPSHOT_ORG;
15178
 
+               }
15179
 
+       }
15180
 
+
15181
 
+       // If this volume is a snapshot original, all of its snapshots must also
15182
 
+       // be deleted. However, Those deletions need to be taken care of by the
15183
 
+       // engine. So just check that they have all been quiesced before
15184
 
+       // removing the original.
15185
 
+       else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15186
 
+               if ( snapshot_check_quiesce_all(volume) ) {
15187
 
+                       return -EINVAL;
15188
 
+               }
15189
 
+
15190
 
+               // In case there are any snapshots remaining, we must clear out
15191
 
+               // their pointers to this original to prevent errors when those
15192
 
+               // snapshots are accessed or deleted.
15193
 
+               for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
15194
 
+                       snap_volume->snapshot_org = NULL;
15195
 
+               }
15196
 
+       }
15197
 
+
15198
 
+       LOG_DEBUG("Deleting volume %s\n", volume->name);
15199
 
+
15200
 
+       // Free all the memory. This includes the LE-to-PE map, any snapshot
15201
 
+       // hash tables, the COW table, and chunk data buffer.
15202
 
+       if ( volume->le_map ) {
15203
 
+               vfree(volume->le_map);
15204
 
+               volume->le_map = NULL;
15205
 
+       }
15206
 
+       if ( volume->snapshot_map ) {
15207
 
+               for ( i = 1; i <= group->pv_count; i++ ) {
15208
 
+                       deallocate_snapshot_map(volume->snapshot_map[i], volume->hash_table_size);
15209
 
+               }
15210
 
+               kfree(volume->snapshot_map);
15211
 
+               volume->snapshot_map = NULL;
15212
 
+       }
15213
 
+       if ( volume->cow_table ) {
15214
 
+               kfree(volume->cow_table);
15215
 
+               volume->cow_table = NULL;
15216
 
+       }
15217
 
+       if ( volume->chunk_data_buffer ) {
15218
 
+               kfree(volume->chunk_data_buffer);
15219
 
+               volume->chunk_data_buffer = NULL;
15220
 
+       }
15221
 
+
15222
 
+       // Remove this volume from the volume-group's list.
15223
 
+       if ( group && group->volume_list[volume->lv_number] == volume ) {
15224
 
+               group->volume_list[volume->lv_number] = NULL;
15225
 
+               group->volume_count--;
15226
 
+       }
15227
 
+
15228
 
+       kfree(volume);
15229
 
+
15230
 
+       return 0;
15231
 
+}
15232
 
+
15233
 
+
15234
 
+/* Function: allocate_logical_volume
15235
 
+ *
15236
 
+ *     Allocate space for a new LVM logical volume, including space for the
15237
 
+ *     LE-to-PE map and any necessary snapshot data.
15238
 
+ */
15239
 
+static lvm_logical_volume_t * allocate_logical_volume( lv_disk_t               * lv,
15240
 
+                                                       lvm_volume_group_t      * group )
15241
 
+{
15242
 
+       lvm_logical_volume_t    * new_volume;
15243
 
+       u_int32_t               table_entries_per_chunk;
15244
 
+       u_int32_t               table_chunks;
15245
 
+       int                     i;
15246
 
+
15247
 
+       // Allocate space for the new logical volume.
15248
 
+       new_volume = kmalloc(sizeof(lvm_logical_volume_t), GFP_NOIO);
15249
 
+       if ( ! new_volume ) {
15250
 
+               LOG_CRITICAL("Memory error creating new logical volume %s\n", lv->lv_name);
15251
 
+               return NULL;
15252
 
+       }
15253
 
+       memset(new_volume, 0, sizeof(lvm_logical_volume_t));
15254
 
+
15255
 
+       // Allocate space for the LE to PE mapping table
15256
 
+       new_volume->le_map = vmalloc(lv->lv_allocated_le*sizeof(le_table_entry_t));
15257
 
+       if ( ! new_volume->le_map ) {
15258
 
+               LOG_CRITICAL("Memory error creating LE map for logical volume %s\n", lv->lv_name);
15259
 
+               kfree(new_volume);
15260
 
+               return NULL;
15261
 
+       }
15262
 
+       memset(new_volume->le_map, 0, lv->lv_allocated_le*sizeof(le_table_entry_t));
15263
 
+
15264
 
+       // Initialize the rest of the new volume.
15265
 
+       new_volume->lv_number           = lv->lv_number + 1;    // Need the +1 to match the PE Map entries on the PV
15266
 
+       new_volume->lv_size             = lv->lv_size;
15267
 
+       new_volume->lv_access           = lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED; // All volumes start new and quieseced.
15268
 
+       new_volume->lv_status           = lv->lv_status | LV_ACTIVE;    // All LVs start as active.
15269
 
+       new_volume->lv_minor            = MINOR(lv->lv_dev);
15270
 
+       new_volume->stripes             = lv->lv_stripes;
15271
 
+       new_volume->stripe_size         = lv->lv_stripesize;
15272
 
+       new_volume->stripe_size_shift   = evms_cs_log2(lv->lv_stripesize);
15273
 
+       new_volume->pe_size             = group->vg->pe_size;
15274
 
+       new_volume->pe_size_shift       = evms_cs_log2(group->vg->pe_size);
15275
 
+       new_volume->num_le              = lv->lv_allocated_le;
15276
 
+       new_volume->group               = group;
15277
 
+       // Different naming scheme for EVMS nodes.
15278
 
+       if ( translate_lv_name(lv->lv_name, new_volume->name) ) {
15279
 
+               deallocate_logical_volume(new_volume);
15280
 
+               return NULL;
15281
 
+       }
15282
 
+
15283
 
+       // If the volume is a snapshot, initialize the remaining data, and
15284
 
+       // allocate space for the remapping structures, and one sector's worth
15285
 
+       // of COW tables.
15286
 
+       if ( new_volume->lv_access & LV_SNAPSHOT ) {
15287
 
+               new_volume->chunk_size          = lv->lv_chunk_size;
15288
 
+               new_volume->num_chunks          = lv->lv_size / lv->lv_chunk_size;
15289
 
+               new_volume->snap_org_minor      = lv->lv_snapshot_minor;
15290
 
+               new_volume->next_cow_entry      = 0;
15291
 
+               new_volume->current_cow_sector  = 0;
15292
 
+               table_entries_per_chunk         = (new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT) / sizeof(lv_COW_table_disk_t);
15293
 
+               table_chunks                    = (new_volume->num_chunks + table_entries_per_chunk - 1) / table_entries_per_chunk;
15294
 
+               new_volume->next_free_chunk     = table_chunks * new_volume->chunk_size;
15295
 
+               new_volume->hash_table_size     = (lv->lv_size / lv->lv_chunk_size / MAX_HASH_CHAIN_ENTRIES) + 1;
15296
 
+
15297
 
+               new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
15298
 
+               if ( ! new_volume->cow_table ) {
15299
 
+                       LOG_CRITICAL("Memory error creating COW table for logical volume %s\n", lv->lv_name);
15300
 
+                       deallocate_logical_volume(new_volume);
15301
 
+                       return NULL;
15302
 
+               }
15303
 
+               memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
15304
 
+
15305
 
+               new_volume->snapshot_map = kmalloc((group->pv_count+1) * sizeof(snapshot_map_entry_t**), GFP_NOIO);
15306
 
+               if ( ! new_volume->snapshot_map ) {
15307
 
+                       LOG_CRITICAL("Memory error creating snapshot map for logical volume %s\n", lv->lv_name);
15308
 
+                       deallocate_logical_volume(new_volume);
15309
 
+                       return NULL;
15310
 
+               }
15311
 
+
15312
 
+               new_volume->snapshot_map[0] = NULL;
15313
 
+               for ( i = 1; i <= group->pv_count; i++ ) {
15314
 
+                       new_volume->snapshot_map[i] = vmalloc(new_volume->hash_table_size * sizeof(snapshot_map_entry_t*));
15315
 
+                       if ( ! new_volume->snapshot_map[i] ) {
15316
 
+                               LOG_CRITICAL("Memory error creating snapshot sub-map for logical volume %s\n", lv->lv_name);
15317
 
+                               deallocate_logical_volume(new_volume);
15318
 
+                               return NULL;
15319
 
+                       }
15320
 
+                       memset(new_volume->snapshot_map[i], 0, new_volume->hash_table_size*sizeof(snapshot_map_entry_t*));
15321
 
+               }
15322
 
+               init_MUTEX(&new_volume->snap_semaphore);
15323
 
+       }
15324
 
+
15325
 
+       // If the volume is a snapshot original, allocate space to use for
15326
 
+       // copying snapshot chunks. This will now be a fixed size instead of
15327
 
+       // being based on the chunk size of the snapshots.
15328
 
+       else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) {
15329
 
+               new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE;
15330
 
+               new_volume->chunk_data_buffer = kmalloc(new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
15331
 
+               if ( ! new_volume->chunk_data_buffer ) {
15332
 
+                       LOG_SERIOUS("Memory error creating snapshot chunk buffer for logical volume %s\n", lv->lv_name);
15333
 
+                       deallocate_logical_volume(new_volume);
15334
 
+                       return NULL;
15335
 
+               }
15336
 
+               memset(new_volume->chunk_data_buffer, 0, new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT);
15337
 
+       }
15338
 
+
15339
 
+       return new_volume;
15340
 
+}
15341
 
+
15342
 
+
15343
 
+/* Function: deallocate_volume_group
15344
 
+ *
15345
 
+ *     Delete the entire in-memory representation of an LVM volume group,
15346
 
+ *     including all PVs and logical volumes. If this group is on LVM's
15347
 
+ *     volume group list, remove it.
15348
 
+ */
15349
 
+static int deallocate_volume_group( lvm_volume_group_t * group )
15350
 
+{
15351
 
+       lvm_physical_volume_t   * pv_entry;
15352
 
+       lvm_physical_volume_t   * next_pv;
15353
 
+       int                     i;
15354
 
+
15355
 
+       LOG_DEBUG("Deleting volume group %s\n", group->vg_name);
15356
 
+
15357
 
+       // Remove the group from the global list.
15358
 
+       remove_group_from_list(group);
15359
 
+
15360
 
+       // Delete the LV metadata array.
15361
 
+       if ( group->lv_array ) {
15362
 
+               vfree(group->lv_array);
15363
 
+               group->lv_array = NULL;
15364
 
+       }
15365
 
+       
15366
 
+       // Delete the PV UUID list
15367
 
+       if ( group->uuid_list ) {
15368
 
+               vfree(group->uuid_list);
15369
 
+               group->uuid_list = NULL;
15370
 
+       }
15371
 
+
15372
 
+       // Delete all logical volumes.
15373
 
+       for ( i = 1; i <= MAX_LV; i++ ) {
15374
 
+               if ( group->volume_list[i] ) {
15375
 
+                       deallocate_logical_volume(group->volume_list[i]);
15376
 
+                       group->volume_list[i] = NULL;
15377
 
+               }
15378
 
+       }
15379
 
+
15380
 
+       // Delete all PVs from the group's list.
15381
 
+       for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
15382
 
+               next_pv = pv_entry->next;
15383
 
+               if ( pv_entry->logical_node ) {
15384
 
+                       // Send a delete command down to the partition manager.
15385
 
+                       LOG_DEBUG("Deleting PV %s from group %s\n", pv_entry->logical_node->name, group->vg_name);
15386
 
+                       DELETE(pv_entry->logical_node);
15387
 
+                       pv_entry->logical_node = NULL;
15388
 
+               }
15389
 
+               deallocate_physical_volume(pv_entry);
15390
 
+       }
15391
 
+
15392
 
+       // Delete the VG metadata.
15393
 
+       if ( group->vg ) {
15394
 
+               kfree(group->vg);
15395
 
+               group->vg = NULL;
15396
 
+       }
15397
 
+
15398
 
+       kfree(group);
15399
 
+
15400
 
+       return 0;
15401
 
+}
15402
 
+
15403
 
+
15404
 
+/* Function: allocate_volume_group
15405
 
+ *
15406
 
+ *     Allocate space for a new LVM volume group and all of its sub-fields.
15407
 
+ *     Initialize the appropriate fields.
15408
 
+ *     vg parameter should already have an allocate/initialized vg_disk_t.
15409
 
+ */
15410
 
+static lvm_volume_group_t * allocate_volume_group(     vg_disk_t       * vg,
15411
 
+                                                       unsigned char   * vg_name )
15412
 
+{
15413
 
+       lvm_volume_group_t * new_group;
15414
 
+
15415
 
+       // The volume group itself.
15416
 
+       new_group = kmalloc(sizeof(lvm_volume_group_t), GFP_NOIO);
15417
 
+       if ( ! new_group ) {
15418
 
+               kfree(vg);
15419
 
+               return NULL;
15420
 
+       }
15421
 
+
15422
 
+       // Initialize the new group.
15423
 
+       memset(new_group, 0, sizeof(lvm_volume_group_t));
15424
 
+       memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN);
15425
 
+       strncpy(new_group->vg_name, vg_name, NAME_LEN-1);
15426
 
+       new_group->vg                   = vg;
15427
 
+       new_group->hard_sect_size       = 512;          // Default value
15428
 
+       new_group->block_size           = 1024;         // Default value
15429
 
+       new_group->flags                = EVMS_VG_DIRTY;
15430
 
+
15431
 
+       LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name);
15432
 
+
15433
 
+       return new_group;
15434
 
+}
15435
 
+
15436
 
+
15437
 
+/* Function: remove_pv_from_group
15438
 
+ *
15439
 
+ *     In the engine, when a PV is removed from a group (on a vgreduce), that
15440
 
+ *     same PV must be removed from that group in the kernel. Otherwise, when
15441
 
+ *     the rediscover occurs, that PV will still appear in the group, and
15442
 
+ *     will cause segfaults when we try to read metadata from it.
15443
 
+ */
15444
 
+static int remove_pv_from_group(int            pv_number,
15445
 
+                               unsigned char   * vg_uuid )
15446
 
+{
15447
 
+       lvm_volume_group_t      * group;
15448
 
+       lvm_physical_volume_t   * pv_entry;
15449
 
+       lvm_physical_volume_t   ** p_pv_entry;
15450
 
+       int                     rc = 0;
15451
 
+
15452
 
+       // Make sure the numbers are in range.
15453
 
+       if ( pv_number < 0 || pv_number > MAX_PV ) {
15454
 
+               return 0;
15455
 
+       }
15456
 
+
15457
 
+       // Make sure the group exists.
15458
 
+       find_group_by_uuid(vg_uuid, &group);
15459
 
+       if ( ! group ) {
15460
 
+               return 0;
15461
 
+       }
15462
 
+
15463
 
+       // Make sure the PV is in this group.
15464
 
+       pv_entry = find_pv_by_number(pv_number, group);
15465
 
+       if ( ! pv_entry ) {
15466
 
+               LOG_WARNING("Did not find PV %d in group %s\n", pv_number, group->vg_name);
15467
 
+               return 0;
15468
 
+       }
15469
 
+
15470
 
+       // Make sure the PV is not in use by any volumes
15471
 
+       if ( check_pv_for_lv(pv_entry, group) ) {
15472
 
+               LOG_SERIOUS("PV %d in group %s still contains LVs\n", pv_number, group->vg_name);
15473
 
+               return -EINVAL;
15474
 
+       }
15475
 
+
15476
 
+       // Take this PV out of the group's list.
15477
 
+       for ( p_pv_entry = &group->pv_list; *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) {
15478
 
+               if ( *p_pv_entry == pv_entry ) {
15479
 
+                       *p_pv_entry = (*p_pv_entry)->next;
15480
 
+                       pv_entry->next = NULL;
15481
 
+                       break;
15482
 
+               }
15483
 
+       }
15484
 
+
15485
 
+       group->pv_count--;
15486
 
+
15487
 
+       // There is no way that this PV was the last from this group, so the
15488
 
+       // group never needs to be deleted at this point. The only way this
15489
 
+       // group will exist in the kernel is if there are volumes exported from
15490
 
+       // it. If this was the last PV, then those volumes must be on that PV,
15491
 
+       // and it wouldn't be allowed to be removed from the group (above).
15492
 
+
15493
 
+       // Free up the memory for this PV. Just drop the node.
15494
 
+       deallocate_physical_volume(pv_entry);
15495
 
+
15496
 
+       LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name);
15497
 
+       return rc;
15498
 
+}
15499
 
+
15500
 
+
15501
 
+
15502
 
+/********** Consistency Checking Functions **********/
15503
 
+
15504
 
+
15505
 
+/* Function: clear_le_entries_for_missing_pv
15506
 
+ */
15507
 
+static void clear_le_entries_for_missing_pv(   lvm_volume_group_t      * group,
15508
 
+                                               lvm_physical_volume_t   * pv_entry )
15509
 
+{
15510
 
+       lvm_logical_volume_t    * volume;
15511
 
+       int                     i, j;
15512
 
+
15513
 
+       for ( i = 1; i <= MAX_LV; i++ ) {
15514
 
+               if ( group->volume_list[i] ) {
15515
 
+                       volume = group->volume_list[i];
15516
 
+                       for ( j = 0; j < volume->num_le; j++ ) {
15517
 
+                               if ( volume->le_map[j].owning_pv == pv_entry ) {
15518
 
+                                       volume->le_map[j].owning_pv = NULL;
15519
 
+                                       volume->le_map[j].pe_sector_offset = 0;
15520
 
+                               }
15521
 
+                       }
15522
 
+               }
15523
 
+       }
15524
 
+}
15525
 
+
15526
 
+
15527
 
+/* Function: check_volume_groups
15528
 
+ *
15529
 
+ *     This function performs some simple consistency checks on all dirty
15530
 
+ *     volume groups. Any groups that have no PVs are deleted. If any metadata
15531
 
+ *     structures (PV or VG) are missing, they are read in from disk.
15532
 
+ */
15533
 
+static int check_volume_groups( void )
15534
 
+{
15535
 
+       lvm_volume_group_t      * group;
15536
 
+       lvm_volume_group_t      * next_group;
15537
 
+       lvm_physical_volume_t   * pv_entry;
15538
 
+       lvm_physical_volume_t   * next_pv;
15539
 
+       int                     rc = 0;
15540
 
+
15541
 
+       for ( group = lvm_group_list; group; group = next_group) {
15542
 
+               next_group = group->next_group;
15543
 
+
15544
 
+               LOG_DEBUG("Checking Group %s\n", group->vg_name);
15545
 
+
15546
 
+               // If a group has no PVs, it can be safely deleted,
15547
 
+               // because we can't find any volumes on it.
15548
 
+               if ( ! group->pv_count ) {
15549
 
+                       LOG_WARNING("No PVs found for Group %s.\n", group->vg_name);
15550
 
+                       if ( ! group->volume_count ) {
15551
 
+                               deallocate_volume_group(group);
15552
 
+                       }
15553
 
+                       continue;
15554
 
+               }
15555
 
+
15556
 
+               // Make sure all metadata for the PVs is present. On a
15557
 
+               // rediscover, it may be missing, because we delete it at the
15558
 
+               // end of discovery. If any is missing, read it in from disk.
15559
 
+               // This is only necessary in the kernel. It can't happen in
15560
 
+               // the engine.
15561
 
+               for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
15562
 
+                       next_pv = pv_entry->next;
15563
 
+                       if ( ! pv_entry->pv ) {
15564
 
+                               LOG_DEBUG("Re-reading PV metadata for node %s\n", pv_entry->logical_node->name);
15565
 
+                               rc = read_pv(pv_entry->logical_node, &pv_entry->pv);
15566
 
+                               if (rc) {
15567
 
+                                       // What happens if we can't re-read the
15568
 
+                                       // PV metadata? This PV must be removed
15569
 
+                                       // from the group. Need to also clear
15570
 
+                                       // all LE entries in all LVs that are
15571
 
+                                       // pointing to this PV before it can be
15572
 
+                                       // removed from the list.
15573
 
+                                       LOG_SERIOUS("PV metadata is missing or cannot be read from node %s\n", pv_entry->logical_node->name);
15574
 
+                                       clear_le_entries_for_missing_pv(group, pv_entry);
15575
 
+                                       remove_pv_from_group(pv_entry->pv_number, group->vg_uuid);
15576
 
+                                       continue;
15577
 
+                               }
15578
 
+                               pv_entry->pv_number = pv_entry->pv->pv_number;
15579
 
+
15580
 
+                               // Check for a "stale" PV. This case should be
15581
 
+                               // already be covered, as long as the Engine is
15582
 
+                               // calling the PV_REMOVE ioctl when it does a
15583
 
+                               // vgreduce or a pvremove. If this is the last
15584
 
+                               // PV in the group, the group will be deleted.
15585
 
+                               if ( ! pv_entry->pv_number ) {
15586
 
+                                       remove_pv_from_group(0, group->vg_uuid);
15587
 
+                                       continue;
15588
 
+                               }
15589
 
+                       }
15590
 
+
15591
 
+                       if ( ! pv_entry->pe_map ) {
15592
 
+                               LOG_DEBUG("Re-reading PE maps for node %s\n", pv_entry->logical_node->name);
15593
 
+                               rc = read_pe_map(pv_entry);
15594
 
+                               if (rc) {
15595
 
+                                       LOG_WARNING("Error reading PE maps for node %s\n", pv_entry->logical_node->name);
15596
 
+                                       LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
15597
 
+                               }
15598
 
+                       }
15599
 
+               }
15600
 
+
15601
 
+               // Make sure the metadata for the VG is present. If it's
15602
 
+               // missing, read it in from the first PV in the VG.
15603
 
+               if ( ! group->vg && group->pv_count ) {
15604
 
+                       LOG_DEBUG("Re-reading VG metadata for Group %s\n", group->vg_name);
15605
 
+                       pv_entry = group->pv_list;
15606
 
+                       rc = read_vg(pv_entry->logical_node, pv_entry->pv, &group->vg);
15607
 
+                       if (rc) {
15608
 
+                               // What happens if we can't re-read the
15609
 
+                               // VG metadata? It's definitely bad
15610
 
+                               // news. Should we delete the VG?
15611
 
+                               continue;
15612
 
+                       }
15613
 
+               }
15614
 
+
15615
 
+               // Display a warning if the number of PVs found for the group
15616
 
+               // doesn't match the number of PVs recorded for the VG.
15617
 
+               if ( group->vg && group->pv_count != group->vg->pv_cur ) {
15618
 
+                       LOG_WARNING("Group %s is incomplete.\n", group->vg_name);
15619
 
+                       LOG_WARNING("     Only %d of %d PVs found.\n", group->pv_count, group->vg->pv_cur);
15620
 
+                       LOG_WARNING("     Volumes in this group may be incomplete.\n");
15621
 
+               }
15622
 
+       }
15623
 
+
15624
 
+       return 0;
15625
 
+}
15626
 
+
15627
 
+
15628
 
+/* Function: check_le_maps
15629
 
+ *
15630
 
+ *     Make sure all volumes in this group have valid LE-to-PE maps. Any
15631
 
+ *     volume that doesn't is marked as incomplete. This is safe for
15632
 
+ *     re-discovery because only new volumes could have corrupted LE maps.
15633
 
+ */
15634
 
+static int check_le_maps( lvm_volume_group_t * group )
15635
 
+{
15636
 
+       lvm_logical_volume_t * volume;
15637
 
+       int i, j, count;
15638
 
+
15639
 
+       for ( i = 1; i <= MAX_LV; i++ ) {
15640
 
+               volume = group->volume_list[i];
15641
 
+               if ( ! volume ) {
15642
 
+                       continue;
15643
 
+               }
15644
 
+
15645
 
+               if ( ! volume->le_map ) {
15646
 
+                       // No point in keeping the volume around if it has
15647
 
+                       // no LE map at all.
15648
 
+                       LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
15649
 
+                       deallocate_logical_volume(volume);
15650
 
+                       continue;
15651
 
+               }
15652
 
+
15653
 
+               // If any entries in the LE map are missing, mark this volume
15654
 
+               // as incomplete.
15655
 
+               for ( j = 0, count = 0; j < volume->num_le; j++ ) {
15656
 
+                       if ( ! volume->le_map[j].owning_pv ||
15657
 
+                            ! volume->le_map[j].pe_sector_offset ) {
15658
 
+                               count++;
15659
 
+                       }
15660
 
+               }
15661
 
+               if ( count ) {
15662
 
+                       LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
15663
 
+                       LOG_SERIOUS("       Missing %d out of %d LEs.\n", count, volume->num_le);
15664
 
+                       volume->lv_access |= EVMS_LV_INCOMPLETE;
15665
 
+               }
15666
 
+       }
15667
 
+       return 0;
15668
 
+}
15669
 
+
15670
 
+
15671
 
+/* Function: check_snapshot_map
15672
 
+ *
15673
 
+ *     For snapshot volumes, make sure the snapshot map is intact, and that
15674
 
+ *     any existing entries in the map are in the correct order and there
15675
 
+ *     are no duplicate entries.
15676
 
+ */
15677
 
+static int check_snapshot_map( lvm_logical_volume_t * snap_volume )
15678
 
+{
15679
 
+       snapshot_map_entry_t ** table;
15680
 
+       snapshot_map_entry_t * curr;
15681
 
+       int i, j;
15682
 
+
15683
 
+       if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
15684
 
+               return 0;
15685
 
+       }
15686
 
+       if ( ! snap_volume->snapshot_map ) {
15687
 
+               snap_volume->lv_access |= EVMS_LV_INVALID;
15688
 
+               return -EINVAL;
15689
 
+       }
15690
 
+       for ( i = 1; i <= snap_volume->group->pv_count; i++ ) {
15691
 
+               if ( ! snap_volume->snapshot_map[i] ) {
15692
 
+                       snap_volume->lv_access |= EVMS_LV_INVALID;
15693
 
+                       return -EINVAL;
15694
 
+               }
15695
 
+               table = snap_volume->snapshot_map[i];
15696
 
+               for ( j = 0; j < snap_volume->hash_table_size; j++ ) {
15697
 
+                       for ( curr = table[j]; curr; curr = curr->next ) {
15698
 
+                               if ( curr->next && curr->org_sector >= curr->next->org_sector ) {
15699
 
+                                       snap_volume->lv_access |= EVMS_LV_INVALID;
15700
 
+                                       return -EINVAL;
15701
 
+                               }
15702
 
+                       }
15703
 
+               }
15704
 
+       }
15705
 
+       return 0;       
15706
 
+}
15707
 
+
15708
 
+
15709
 
+/* Function: check_logical_volumes
15710
 
+ *
15711
 
+ *     Perform a consistency check on all of the logical volumes that have been
15712
 
+ *     discovered. Any volume that has any inconsistencies will be marked as
15713
 
+ *     incomplete or invalid, depending on the severity of the problem. At the
15714
 
+ *     end, all invalid volumes are deleted. If the deleted_incompletes
15715
 
+ *     parameter is set, those will also be deleted.
15716
 
+ */
15717
 
+static int check_logical_volumes( int final_discovery )
15718
 
+{
15719
 
+       lvm_volume_group_t      * group;
15720
 
+       lvm_logical_volume_t    * volume;
15721
 
+       lvm_logical_volume_t    * snap;
15722
 
+       lvm_logical_volume_t    * next;
15723
 
+       int                     count;
15724
 
+       int                     i, j;
15725
 
+
15726
 
+       // Check every valid, dirty volume group
15727
 
+       for ( group = lvm_group_list; group; group = group->next_group ) {
15728
 
+               if ( ! (group->flags & EVMS_VG_DIRTY) ) {
15729
 
+                       continue;
15730
 
+               }
15731
 
+
15732
 
+               // Check every valid volume in this group
15733
 
+               for ( i = 1; i <= MAX_LV; i++ ) {
15734
 
+                       volume  = group->volume_list[i];
15735
 
+                       if ( ! volume ) {
15736
 
+                               continue;
15737
 
+                       }
15738
 
+
15739
 
+                       LOG_DEBUG("Checking logical volume %s\n", volume->name);
15740
 
+
15741
 
+                       if ( ! volume->group ) {
15742
 
+                               volume->group = group;
15743
 
+                       }
15744
 
+
15745
 
+                       // All LE-map entries must have valid values. The I/O
15746
 
+                       // paths now detect missing LE entries.
15747
 
+                       if ( volume->le_map ) {
15748
 
+                               for ( j = 0, count = 0; j < volume->num_le; j++ ) {
15749
 
+                                       if ( ! volume->le_map[j].owning_pv ||
15750
 
+                                            ! volume->le_map[j].pe_sector_offset ) {
15751
 
+                                               count++;
15752
 
+                                       }
15753
 
+                               }
15754
 
+                               if ( count ) {
15755
 
+                                       LOG_SERIOUS("Volume %s has incomplete LE map.\n", volume->name);
15756
 
+                                       LOG_SERIOUS("      Missing %d out of %d LEs.\n", count, volume->num_le);
15757
 
+                                       volume->lv_access |= EVMS_LV_INCOMPLETE;
15758
 
+                               }
15759
 
+                               else {
15760
 
+                                       // In case this volume was previously
15761
 
+                                       // marked incomplete.
15762
 
+                                       volume->lv_access &= ~EVMS_LV_INCOMPLETE;
15763
 
+                               }
15764
 
+                       }
15765
 
+                       else {
15766
 
+                               // This should only ever happen due to
15767
 
+                               // memory corruption.
15768
 
+                               LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
15769
 
+                               volume->lv_access |= EVMS_LV_INVALID;
15770
 
+                       }
15771
 
+               
15772
 
+                       // For a snapshot original, check all snapshots in the
15773
 
+                       // chain, to make sure they point back to the original.
15774
 
+                       // Also, make sure there is memory for the chunk buffer.
15775
 
+                       if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15776
 
+                               for ( snap = volume->snapshot_next, count = 0; snap; snap = snap->snapshot_next, count++ ) {
15777
 
+                                       if ( snap->snapshot_org != volume ) {
15778
 
+                                               LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", volume->name);
15779
 
+                                               snap->snapshot_org = NULL;
15780
 
+                                               snap->lv_access |= EVMS_LV_INVALID;
15781
 
+                                       }
15782
 
+                               }
15783
 
+                               if ( ! count ) {
15784
 
+                                       LOG_WARNING("No snapshots found for volume %s\n", volume->name);
15785
 
+                                       if ( final_discovery ) {
15786
 
+                                               volume->lv_access &= ~LV_SNAPSHOT_ORG;
15787
 
+                                       }
15788
 
+                               }
15789
 
+                               else if ( ! volume->chunk_data_buffer ) {
15790
 
+                                       volume->lv_access |= EVMS_LV_INVALID;
15791
 
+                               }
15792
 
+                       }
15793
 
+
15794
 
+                       // For a snapshot volume, make sure it points back to
15795
 
+                       // its original. Also make sure there is memory for the
15796
 
+                       // cow table, and that any existing snapshot entries in
15797
 
+                       // the snapshot map are correctly ordered.
15798
 
+                       else if ( volume->lv_access & LV_SNAPSHOT ) {
15799
 
+                               // Is there a COW table?
15800
 
+                               if ( ! volume->cow_table ) {
15801
 
+                                       LOG_SERIOUS("Snapshot volume %s has no COW table\n", volume->name);
15802
 
+                                       volume->lv_access |= EVMS_LV_INVALID;
15803
 
+                               }
15804
 
+                               // Is the snapshot map in order?
15805
 
+                               if ( check_snapshot_map(volume) ) {
15806
 
+                                       LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n", volume->name);
15807
 
+                                       volume->lv_access |= EVMS_LV_INVALID;
15808
 
+                               }
15809
 
+                               // Is there an original volume? This is only
15810
 
+                               // a real problem during final discovery.
15811
 
+                               if ( ! volume->snapshot_org ) {
15812
 
+                                       LOG_SERIOUS("Snapshot volume %s not pointing at an original\n", volume->name);
15813
 
+                                       if ( final_discovery ) {
15814
 
+                                               volume->lv_access |= EVMS_LV_INVALID;
15815
 
+                                       }
15816
 
+                               }
15817
 
+                               // Is the original the correct one?
15818
 
+                               else if ( volume->snap_org_minor != volume->snapshot_org->lv_minor ) {
15819
 
+                                       LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", volume->name);
15820
 
+                                       volume->lv_access |= EVMS_LV_INVALID;
15821
 
+                               }
15822
 
+                       }
15823
 
+
15824
 
+                       // Delete any invalid volumes from use. Delete
15825
 
+                       // incomplete volumes as well if this is not final
15826
 
+                       // discovery. If a snapshot original is bad, delete all
15827
 
+                       // of its snapshots.
15828
 
+                       if ( volume->lv_access & EVMS_LV_INVALID ||
15829
 
+                            (!final_discovery &&
15830
 
+                             (volume->lv_access & EVMS_LV_INCOMPLETE) &&
15831
 
+                             (volume->lv_access & EVMS_LV_NEW) ) ) {
15832
 
+                               if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
15833
 
+                                       for ( snap = volume->snapshot_next; snap; snap = next ) {
15834
 
+                                               next = snap->snapshot_next;
15835
 
+                                               snap->snapshot_next = NULL;
15836
 
+                                               snap->snapshot_org = NULL;
15837
 
+                                               invalidate_snapshot_volume(snap);
15838
 
+                                               deallocate_logical_volume(snap);
15839
 
+                                       }
15840
 
+                                       volume->snapshot_next = NULL;
15841
 
+                               }
15842
 
+                               else if ( volume->lv_access & LV_SNAPSHOT ) {
15843
 
+                                       invalidate_snapshot_volume(volume);
15844
 
+                               }
15845
 
+                               deallocate_logical_volume(volume);
15846
 
+                       }
15847
 
+               }
15848
 
+       }
15849
 
+
15850
 
+       return 0;
15851
 
+}
15852
 
+
15853
 
+
15854
 
+
15855
 
+/********** Volume Group Discovery Functions **********/
15856
 
+
15857
 
+
15858
 
+
15859
 
+/* Function: find_group_for_pv
15860
 
+ *
15861
 
+ *     This is a discover-time function. It reads the VG metadata info for the
15862
 
+ *     specified node, and locates the appropriate group that owns that
15863
 
+ *     node. If that group does not already exist, it is created and
15864
 
+ *     initialized.
15865
 
+ */
15866
 
+static int find_group_for_pv(  evms_logical_node_t     * node,
15867
 
+                               pv_disk_t               * pv,
15868
 
+                               lvm_volume_group_t      ** group )
15869
 
+{
15870
 
+       vg_disk_t       * vg;
15871
 
+       int             rc;
15872
 
+
15873
 
+       *group = NULL;
15874
 
+
15875
 
+       // Check for an unassigned PV.
15876
 
+       if ( pv->vg_name[0] == 0 ) {
15877
 
+               return 0;
15878
 
+       }
15879
 
+
15880
 
+       // Read the VG on-disk info for this PV. If this succeeds, it
15881
 
+       // allocates a new VG metadata structure.
15882
 
+       rc = read_vg(node, pv, &vg);
15883
 
+       if (rc) {
15884
 
+               return rc;
15885
 
+       }
15886
 
+
15887
 
+       // Use the UUID from the VG metadata to determine if this group
15888
 
+       // has already been discovered and constructed.
15889
 
+       find_group_by_uuid(vg->vg_uuid, group);
15890
 
+
15891
 
+       if ( ! *group ) {
15892
 
+               // Create a new group entry and add to the global list.
15893
 
+               *group = allocate_volume_group(vg, pv->vg_name);
15894
 
+               if ( ! *group ) {
15895
 
+                       return -ENOMEM;
15896
 
+               }
15897
 
+               add_group_to_list(*group);
15898
 
+       }
15899
 
+       else if ( ! (*group)->vg ) {
15900
 
+               // On a rediscover, the VG metadata for an existing group might
15901
 
+               // be missing. Fill it in if necessary. This check is also not
15902
 
+               // necessary in the engine, since the metadata is never deleted.
15903
 
+// Should we re-copy vg_name? (vg_uuid can not be allowed to change).
15904
 
+// Or should vg_name changes be done through direct ioctl only?
15905
 
+               (*group)->vg = vg;
15906
 
+       }
15907
 
+       else {
15908
 
+               kfree(vg);
15909
 
+       }
15910
 
+
15911
 
+       // Read in the UUID list for this group, if it isn't present.
15912
 
+       rc = read_uuid_list(node, pv, *group);
15913
 
+       if (rc) {
15914
 
+               LOG_WARNING("Error reading UUID list for group %s.\n", (*group)->vg_name);
15915
 
+               LOG_WARNING("May not be able to verify PV UUIDs for group %s\n", (*group)->vg_name);
15916
 
+       }
15917
 
+
15918
 
+       // In the kernel, any time we even see a PV for a group, that group
15919
 
+       // must be marked dirty so its volumes will be re-exported.
15920
 
+       (*group)->flags |= EVMS_VG_DIRTY;
15921
 
+
15922
 
+       return 0;
15923
 
+}
15924
 
+
15925
 
+
15926
 
+/* Function: check_for_duplicate_pv
15927
 
+ *
15928
 
+ *     Search the list of PVs in the specified volume group. If the
15929
 
+ *     specified node already exists in the list, we can discard it.
15930
 
+ */
15931
 
+static int check_for_duplicate_pv( evms_logical_node_t * node,
15932
 
+                               pv_disk_t               * pv,
15933
 
+                               lvm_volume_group_t      * group )
15934
 
+{
15935
 
+       lvm_physical_volume_t   * pv_entry;
15936
 
+
15937
 
+       // For re-discovery, we need to search all existing PVs in this VG to
15938
 
+       // make sure we didn't get a duplicate from the plugin below us. The
15939
 
+       // plugins below us should be re-exporting the same node on
15940
 
+       // re-discovery, instead of creating a new node to represent the same
15941
 
+       // objects, so just check the memory location.
15942
 
+       for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
15943
 
+               if ( pv_entry->logical_node == node ) {
15944
 
+
15945
 
+                       // We found a duplicate. Just ignore the duplicate.
15946
 
+                       LOG_DEBUG("PV %s is already in Group %s.\n", node->name, group->vg_name);
15947
 
+
15948
 
+                       // Even if the node was a duplicate, we may need to
15949
 
+                       // fill in the pv entry for this partition, since we
15950
 
+                       // always delete those at the end of discovery.
15951
 
+                       if ( ! pv_entry->pv ) {
15952
 
+                               pv_entry->pv = pv;
15953
 
+                               pv_entry->pv_number = pv->pv_number;
15954
 
+                       }
15955
 
+                       else {
15956
 
+                               kfree(pv);
15957
 
+                       }
15958
 
+
15959
 
+                       return 1;
15960
 
+               }
15961
 
+       }
15962
 
+
15963
 
+       // No duplicate was found.
15964
 
+       return 0;
15965
 
+}
15966
 
+
15967
 
+
15968
 
+/* Function: verify_pv_uuid
15969
 
+ *
15970
 
+ *     Verify that the specified PV belongs in the specified group by
15971
 
+ *     searching for the PV's UUID in the group's list.
15972
 
+ */
15973
 
+static int verify_pv_uuid(     lvm_physical_volume_t   * pv_entry,
15974
 
+                               lvm_volume_group_t      * group )
15975
 
+{
15976
 
+       int i;
15977
 
+
15978
 
+       // Obviously the UUID list must be present in order to search.
15979
 
+       if ( ! group->uuid_list ) {
15980
 
+               LOG_WARNING("UUID list is missing from group %s.\n", group->vg_name);
15981
 
+               LOG_WARNING("Cannot verify UUID for PV %s\n", pv_entry->logical_node->name);
15982
 
+               return 0;
15983
 
+       }
15984
 
+
15985
 
+       // Start with the UUID entry for this PV's number
15986
 
+       if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[(pv_entry->pv_number-1)*NAME_LEN]), UUID_LEN) ) {
15987
 
+               return 0;
15988
 
+       }
15989
 
+
15990
 
+       // If it wasn't found there, then search the entire group's list.
15991
 
+       for ( i = 0; i < group->vg->pv_cur; i++ ) {
15992
 
+               if ( ! memcmp(pv_entry->pv->pv_uuid, &(group->uuid_list[i*NAME_LEN]), UUID_LEN) ) {
15993
 
+                       // Found the UUID. 
15994
 
+                       LOG_WARNING("Detected UUID mismatch for PV %s!\n", pv_entry->logical_node->name);
15995
 
+                       LOG_WARNING("PV %s is recorded as being at index %d,\n", pv_entry->logical_node->name, pv_entry->pv_number);
15996
 
+                       LOG_WARNING(" but Group %s has it recorded at index %d.\n", group->vg_name, i+1);
15997
 
+                       LOG_WARNING("Run the EVMS Engine to correct the problem.\n");
15998
 
+                       LOG_WARNING("If you have any snapshot regions in group %s\n", group->vg_name);
15999
 
+                       LOG_WARNING(" it is recommended that you delete them immediately!\n");
16000
 
+                       return 0;
16001
 
+               }
16002
 
+       }
16003
 
+
16004
 
+       LOG_SERIOUS("Could not find UUID for PV %s in group %s\n", pv_entry->logical_node->name, group->vg_name);
16005
 
+       return -EINVAL;
16006
 
+}
16007
 
+
16008
 
+
16009
 
+/* Function:  add_pv_to_group
16010
 
+ *
16011
 
+ *     Adds the physical volume to the appropriate volume group. The PV
16012
 
+ *     passed into this function MUST be part of a valid VG.
16013
 
+ */
16014
 
+static int add_pv_to_group(    lvm_physical_volume_t   * pv_entry,
16015
 
+                               lvm_volume_group_t      * group )
16016
 
+{
16017
 
+       int rc;
16018
 
+
16019
 
+       // Make sure this PV's UUID is listed in the group.
16020
 
+       rc = verify_pv_uuid(pv_entry, group);
16021
 
+       if (rc) {
16022
 
+               LOG_SERIOUS("PV %s does not belong in group %s!\n", pv_entry->logical_node->name, group->vg_name);
16023
 
+               return rc;
16024
 
+       }
16025
 
+
16026
 
+       // Add this PV to the beginning of its group's list.
16027
 
+       pv_entry->next          = group->pv_list;
16028
 
+       group->pv_list          = pv_entry;
16029
 
+       group->pv_count++;
16030
 
+
16031
 
+       // Update the group's block and hardsector sizes as appropriate.
16032
 
+       group->block_size = max(pv_entry->logical_node->block_size, group->block_size);
16033
 
+       group->hard_sect_size = max(pv_entry->logical_node->hardsector_size, group->hard_sect_size);
16034
 
+
16035
 
+       // Check for the Partial or Removable flag on the PV.
16036
 
+       if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) {
16037
 
+               group->flags |= EVMS_VG_PARTIAL_PVS;
16038
 
+       }
16039
 
+       if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) {
16040
 
+               group->flags |= EVMS_VG_REMOVABLE_PVS;
16041
 
+       }
16042
 
+
16043
 
+       LOG_DETAILS("PV %s added to Group %s\n", pv_entry->logical_node->name, group->vg_name);
16044
 
+
16045
 
+       return 0;
16046
 
+}
16047
 
+
16048
 
+
16049
 
+/* Function: discover_volume_groups
16050
 
+ *
16051
 
+ *     Examine the list of logical nodes. Any node that contains a valid PV
16052
 
+ *     structure is consumed and added to the appropriate volume group. PVs
16053
 
+ *     which do not belong to any group are deleted. Everything else is left
16054
 
+ *     on the discovery list.
16055
 
+ */
16056
 
+static int discover_volume_groups( evms_logical_node_t ** evms_node_list )
16057
 
+{
16058
 
+       evms_logical_node_t     * node;
16059
 
+       evms_logical_node_t     * next_node;
16060
 
+       pv_disk_t               * pv;
16061
 
+       lvm_volume_group_t      * group;
16062
 
+       lvm_physical_volume_t   * pv_entry;
16063
 
+       int                     rc;
16064
 
+
16065
 
+       LOG_EXTRA("Searching for PVs in the node list.\n");
16066
 
+
16067
 
+       // Run through the discovery list
16068
 
+       for ( node = *evms_node_list; node; node = next_node ) {
16069
 
+               // Save the next node. We may remove this one from the list.
16070
 
+               next_node = node->next;
16071
 
+
16072
 
+               // Read the PV metadata. This will also create a new pv_disk_t
16073
 
+               // if it finds the correct LVM signatures.
16074
 
+               rc = read_pv(node, &pv);
16075
 
+               if (rc) {
16076
 
+                       // This node is not an LVM PV, or an error occurred.
16077
 
+                       // Just leave the node on the discovery list.
16078
 
+                       continue;
16079
 
+               }
16080
 
+
16081
 
+               rc = find_group_for_pv(node, pv, &group);
16082
 
+               if (rc) {
16083
 
+                       // Error getting the group for this PV.
16084
 
+                       kfree(pv);
16085
 
+                       continue;
16086
 
+               }
16087
 
+
16088
 
+               if ( ! group ) {
16089
 
+                       // This node is an unassigned PV.
16090
 
+                       LOG_DETAILS("PV %s is unassigned.\n", node->name);
16091
 
+                       kfree(pv);
16092
 
+                       continue;
16093
 
+               }
16094
 
+
16095
 
+               rc = check_for_duplicate_pv(node, pv, group);
16096
 
+               if (rc) {
16097
 
+                       // This node is already in the group. This check is also
16098
 
+                       // only in the kernel because the engine has no notion
16099
 
+                       // of rediscover, and thus can never get a duplicate.
16100
 
+                       evms_cs_remove_logical_node_from_list(evms_node_list, node);
16101
 
+                       continue;
16102
 
+               }
16103
 
+
16104
 
+               // Allocate a PV entry for this node.
16105
 
+               pv_entry = allocate_physical_volume(node, pv);
16106
 
+               if ( ! pv_entry ) {
16107
 
+                       continue;
16108
 
+               }
16109
 
+
16110
 
+               // Add this PV to the appropriate volume group.
16111
 
+               rc = add_pv_to_group(pv_entry, group);
16112
 
+               if (rc) {
16113
 
+                       deallocate_physical_volume(pv_entry);
16114
 
+                       continue;
16115
 
+               }
16116
 
+
16117
 
+               rc = read_pe_map(pv_entry);
16118
 
+               if (rc) {
16119
 
+                       LOG_WARNING("Error reading PE maps for node %s\n", node->name);
16120
 
+                       LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
16121
 
+               }
16122
 
+
16123
 
+               evms_cs_remove_logical_node_from_list(evms_node_list, node);
16124
 
+       }
16125
 
+
16126
 
+       LOG_EXTRA("Group discovery complete.\n");
16127
 
+       return 0;
16128
 
+}
16129
 
+
16130
 
+
16131
 
+
16132
 
+/********** Logical Volume Discovery Functions **********/
16133
 
+
16134
 
+
16135
 
+
16136
 
+/* Function: build_le_maps
16137
 
+ *
16138
 
+ *     After all logical volumes have been discovered, the mappings from
16139
 
+ *     logical extents to physical extents must be constructed. Each PV
16140
 
+ *     contains a map on-disk of its PEs. Each PE map entry contains the
16141
 
+ *     logical volume number and the logical extent number on that volume.
16142
 
+ *     Our internal map is the reverse of this map for each volume, listing
16143
 
+ *     the PV node and sector offset for every logical extent on the volume.
16144
 
+ */
16145
 
+static int build_le_maps( lvm_volume_group_t * group )
16146
 
+{
16147
 
+       lvm_logical_volume_t    ** volume_list = group->volume_list;
16148
 
+       lvm_physical_volume_t   * pv_entry;
16149
 
+       evms_logical_node_t     * node;
16150
 
+       pv_disk_t               * pv;
16151
 
+       pe_disk_t               * pe_map;
16152
 
+       evms_sector_t           offset;
16153
 
+       u_int32_t               lv_number;
16154
 
+       u_int32_t               le_number;
16155
 
+       u_int32_t               first_pe_sector;
16156
 
+       int                     i;
16157
 
+
16158
 
+       LOG_DEBUG("Building LE maps for new volumes in group %s.\n", group->vg_name);
16159
 
+
16160
 
+       // For every PV in this VG
16161
 
+       for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16162
 
+               node = pv_entry->logical_node;
16163
 
+               pv = pv_entry->pv;
16164
 
+               pe_map = pv_entry->pe_map;
16165
 
+
16166
 
+               // Version 1 metadata uses pe_on_disk.base + .size to find start
16167
 
+               // of first PE. Version 2 uses pe_start.
16168
 
+               if ( pv->version == 1 ) {
16169
 
+                       first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
16170
 
+               }
16171
 
+               else {
16172
 
+                       first_pe_sector = pv->pe_start;
16173
 
+                       if ( ! first_pe_sector ) {
16174
 
+                               first_pe_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base + pv->pe_on_disk.size);
16175
 
+                       }
16176
 
+               }
16177
 
+
16178
 
+               // For every entry in the PE map, calculate the PE's sector offset
16179
 
+               // and update the correct LV's PE map. LV number of 0 marks an unused PE.
16180
 
+               // For re-discovery, only compute entries for new volumes. If a PV
16181
 
+               // is read-only, all LVs on that PV will also be read-only.
16182
 
+               for ( i = 0; i < pv->pe_total; i++ ) {
16183
 
+                       lv_number = pe_map[i].lv_num;
16184
 
+                       if ( lv_number &&
16185
 
+                            volume_list[lv_number] &&
16186
 
+                            volume_list[lv_number]->lv_access & (EVMS_LV_NEW|EVMS_LV_INCOMPLETE) ) {
16187
 
+                               le_number = pe_map[i].le_num;
16188
 
+                               offset = i * pv->pe_size + first_pe_sector;
16189
 
+                               volume_list[lv_number]->le_map[le_number].owning_pv = pv_entry;
16190
 
+                               volume_list[lv_number]->le_map[le_number].pe_sector_offset = offset;
16191
 
+                               if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) {
16192
 
+                                       volume_list[lv_number]->lv_access &= ~LV_WRITE;
16193
 
+                               }
16194
 
+                       }
16195
 
+               }
16196
 
+       }
16197
 
+
16198
 
+       return 0;
16199
 
+}
16200
 
+
16201
 
+
16202
 
+/* Function: build_snapshot_maps
16203
 
+ *
16204
 
+ *     For every volume in this group that is a snapshot, read all of the
16205
 
+ *     existing entries in the COW table, and build up the snapshot mapping
16206
 
+ *     structures accordingly.
16207
 
+ *
16208
 
+ *     For reference, the COW tables attached to the snapshot volumes  will
16209
 
+ *     always be in disk-order (little-endian), so that it can always be
16210
 
+ *     immediately written to disk. Therefore, endian conversions are necessary
16211
 
+ *     any time the COW table is accessed. This function will make a local
16212
 
+ *     copy of each COW table sector, and convert the local copy before
16213
 
+ *     building the snapshot maps.
16214
 
+ */
16215
 
+static int build_snapshot_maps( lvm_volume_group_t * group )
16216
 
+{
16217
 
+       lvm_logical_volume_t    * volume;
16218
 
+       evms_logical_node_t     tmp_node;
16219
 
+       lv_COW_table_disk_t     cow_table[EVMS_VSECTOR_SIZE/sizeof(lv_COW_table_disk_t)];
16220
 
+       unsigned long           max_entries = EVMS_VSECTOR_SIZE / sizeof(lv_COW_table_disk_t);
16221
 
+       int                     i, j, rc = 0;
16222
 
+
16223
 
+       // Check every volume in the group to see if it is a snapshot. Also
16224
 
+       // check to make sure it is a new volume in the case of re-discovery.
16225
 
+       for ( i = 1; i <= MAX_LV; i++ ) {
16226
 
+
16227
 
+               // The volume must exist, must be new, and must be a snapshot
16228
 
+               volume = group->volume_list[i];
16229
 
+               if ( ! volume ||
16230
 
+                    ! (volume->lv_access & EVMS_LV_NEW) ||
16231
 
+                    ! (volume->lv_access & LV_SNAPSHOT) ) {
16232
 
+                       continue;
16233
 
+               }
16234
 
+
16235
 
+               // Set up a temporary EVMS node
16236
 
+               tmp_node.instance_data = volume;
16237
 
+               rc = 0;
16238
 
+
16239
 
+               LOG_DEBUG("Building snapshot map for volume %s\n", volume->name);
16240
 
+
16241
 
+               while (1) {
16242
 
+                       // Read in one sector's worth of COW tables.
16243
 
+                       if ( lvm_init_io(&tmp_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
16244
 
+                               invalidate_snapshot_volume(volume);
16245
 
+                               deallocate_logical_volume(volume);
16246
 
+                               break;
16247
 
+                       }
16248
 
+
16249
 
+                       // Endian-conversion of this COW table to a local table.
16250
 
+                       for ( j = 0; j < max_entries; j++ ) {
16251
 
+                               cow_table[j].pv_org_number   = le64_to_cpu(volume->cow_table[j].pv_org_number);
16252
 
+                               cow_table[j].pv_org_rsector  = le64_to_cpu(volume->cow_table[j].pv_org_rsector);
16253
 
+                               cow_table[j].pv_snap_number  = le64_to_cpu(volume->cow_table[j].pv_snap_number);
16254
 
+                               cow_table[j].pv_snap_rsector = le64_to_cpu(volume->cow_table[j].pv_snap_rsector);
16255
 
+                       }
16256
 
+                       
16257
 
+
16258
 
+                       // Translate every valid COW table entry into
16259
 
+                       // a snapshot map entry.
16260
 
+                       for ( volume->next_cow_entry = 0;
16261
 
+                             volume->next_cow_entry < max_entries &&
16262
 
+                             cow_table[volume->next_cow_entry].pv_org_number;
16263
 
+                             volume->next_cow_entry++ ) {
16264
 
+                               // org_rsector must be a valid sector number,
16265
 
+                               // i.e. it can't be within a PVs metadata. This
16266
 
+                               // is how we detect invalidated snapshots.
16267
 
+                               if ( (cow_table[volume->next_cow_entry].pv_org_rsector < 10) ||
16268
 
+                                    (cow_table[volume->next_cow_entry].pv_org_number > group->pv_count) ||
16269
 
+                                    (add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]),volume)) ) { 
16270
 
+                                       // This volume either has an invalid COW entry,
16271
 
+                                       // or had an error adding that COW entry to the
16272
 
+                                       // snapshot map. This snapshot is done.
16273
 
+                                       invalidate_snapshot_volume(volume);
16274
 
+                                       deallocate_logical_volume(volume);
16275
 
+                                       rc = -EINVAL;
16276
 
+                                       break;
16277
 
+                               }
16278
 
+                               volume->next_free_chunk += volume->chunk_size;
16279
 
+                       }
16280
 
+                       // Move on to the next sector if necessary.
16281
 
+                       if ( !rc && volume->next_cow_entry == max_entries ) {
16282
 
+                               volume->current_cow_sector++;
16283
 
+                       }
16284
 
+                       else {
16285
 
+                               break;
16286
 
+                       }
16287
 
+               }
16288
 
+       }
16289
 
+
16290
 
+       return 0;
16291
 
+}
16292
 
+
16293
 
+
16294
 
+/* Function: link_snapshot_volumes
16295
 
+ *
16296
 
+ *     This function examines the list of logical volumes in this group and
16297
 
+ *     sets up the necessary pointers to link snapshots and their originals.
16298
 
+ *     A singly-linked list is created starting with the original volume. Also,
16299
 
+ *     all snapshot volumes point directly back to their original. This
16300
 
+ *     function should not be run until all volumes have been discovered.
16301
 
+ *     In the case of re-discovery, all of these links/lists get rebuilt as if
16302
 
+ *     they were not already there. Currently this should not pose a problem.
16303
 
+ */
16304
 
+static int link_snapshot_volumes( lvm_volume_group_t * group )
16305
 
+{
16306
 
+       lvm_logical_volume_t    * org_volume;
16307
 
+       lvm_logical_volume_t    * snap_volume;
16308
 
+       u_int32_t               org_minor;
16309
 
+       u_int32_t               buffer_size = 0;
16310
 
+       int                     i, j;
16311
 
+
16312
 
+       for ( i = 1; i <= MAX_LV; i++ ) {
16313
 
+
16314
 
+               // Only process snapshot-originals
16315
 
+               org_volume = group->volume_list[i];
16316
 
+               if ( ! org_volume ||
16317
 
+                    ! (org_volume->lv_access & LV_SNAPSHOT_ORG) ) {
16318
 
+                       continue;
16319
 
+               }
16320
 
+
16321
 
+               // For snapshot-originals, look for all other volumes that
16322
 
+               // claim to be snapshotting it. For each one that is found,
16323
 
+               // insert it at the start of the original's list of snapshots.
16324
 
+               org_minor                       = org_volume->lv_minor;
16325
 
+               org_volume->snapshot_next       = NULL; // This is necessary for rediscovery to work properly.
16326
 
+                                                       // Could get circular snapshot lists otherwise.
16327
 
+               for ( j = 1; j <= MAX_LV; j++ ) {
16328
 
+                       snap_volume = group->volume_list[j];
16329
 
+                       if ( snap_volume &&
16330
 
+                            snap_volume->lv_access & LV_SNAPSHOT &&
16331
 
+                            (snap_volume->snap_org_minor == org_minor) ) {
16332
 
+                               snap_volume->snapshot_org       = org_volume;
16333
 
+                               snap_volume->snapshot_next      = org_volume->snapshot_next;
16334
 
+                               org_volume->snapshot_next       = snap_volume;
16335
 
+                               if ( snap_volume->chunk_size > buffer_size ) {
16336
 
+                                       buffer_size = snap_volume->chunk_size;
16337
 
+                               }
16338
 
+                               LOG_DEBUG("Linking snapshot (%s) to original (%s)\n", snap_volume->name, org_volume->name);
16339
 
+                       }
16340
 
+               }
16341
 
+
16342
 
+               // If no snapshots were found for a volume that claims to be
16343
 
+               // under snapshot, mark the group dirty. If this is final
16344
 
+               // discovery, the original will have the snapshot flag turned
16345
 
+               // off in check_logical_volumes().
16346
 
+               if ( ! org_volume->snapshot_next ) {
16347
 
+                       LOG_WARNING("No snapshots found for original (%s)\n", org_volume->name);
16348
 
+                       group->flags |= EVMS_VG_DIRTY;
16349
 
+               }
16350
 
+       }
16351
 
+       return 0;
16352
 
+}
16353
 
+
16354
 
+
16355
 
+/* Function: discover_volumes_in_group
16356
 
+ *     
16357
 
+ */
16358
 
+static int discover_volumes_in_group( lvm_volume_group_t * group )
16359
 
+{
16360
 
+       lv_disk_t               * lv_array = group->lv_array;
16361
 
+       lvm_logical_volume_t    * new_volume;
16362
 
+       int                     i;
16363
 
+
16364
 
+       // Search through the LV structs for valid LV entries
16365
 
+       for ( i = 0; i < group->vg->lv_max; i++ ) {
16366
 
+
16367
 
+               // Only discover valid, active volumes
16368
 
+               if ( ! lv_array[i].lv_name[0] ||
16369
 
+                    lv_array[i].lv_number >= MAX_LV ) {
16370
 
+                       continue;
16371
 
+               }
16372
 
+
16373
 
+               // Make sure this volume isn't already in the list.
16374
 
+               if ( group->volume_list[lv_array[i].lv_number+1] ) {
16375
 
+                       continue;
16376
 
+               }
16377
 
+
16378
 
+               // Create a new logical volume and place it in the appropriate
16379
 
+               // spot in this VG's volume list.
16380
 
+               new_volume = allocate_logical_volume(&(lv_array[i]), group);
16381
 
+               if ( ! new_volume ) {
16382
 
+                       // This volume will be missing, but other
16383
 
+                       // volumes in this group can still be built.
16384
 
+                       LOG_CRITICAL("Memory error creating LV %s in Group %s\n", lv_array[i].lv_name, group->vg_name);
16385
 
+                       continue;
16386
 
+               }
16387
 
+
16388
 
+               group->volume_list[new_volume->lv_number] = new_volume;
16389
 
+               group->volume_count++;
16390
 
+               group->flags |= EVMS_VG_DIRTY;
16391
 
+
16392
 
+               LOG_DEBUG("Discovered volume %s in group %s.\n", new_volume->name, group->vg_name);
16393
 
+       }
16394
 
+
16395
 
+       return 0;
16396
 
+}
16397
 
+
16398
 
+
16399
 
+/* Function: discover_logical_volumes
16400
 
+ *
16401
 
+ *     After all PVs have been claimed and added to the appropriate VG list,
16402
 
+ *     the volumes for each VG must be constructed. For each group, read all
16403
 
+ *     the LV structs off the first PV in the list. Search this list of
16404
 
+ *     structs for valid LVs. For each valid LV, create a new volume and add
16405
 
+ *     it to the group.
16406
 
+ */
16407
 
+static int discover_logical_volumes( void )
16408
 
+{
16409
 
+       lvm_volume_group_t      * group;
16410
 
+       int                     rc;
16411
 
+
16412
 
+       // Look for volumes in each valid VG entry. We even need to check ones
16413
 
+       // that aren't dirty - We could have deleted an incomplete volume on
16414
 
+       // the previous pass, and need to rediscover it in case this is final
16415
 
+       // discovery and we now want to export it.
16416
 
+       for ( group = lvm_group_list; group; group = group->next_group ) {
16417
 
+
16418
 
+               if ( ! group->vg ) {
16419
 
+                       continue;
16420
 
+               }
16421
 
+
16422
 
+               LOG_DEBUG("Searching for volumes in group %s\n", group->vg_name);
16423
 
+
16424
 
+               // Read in the LV array from disk if necessary.
16425
 
+               rc = read_lv(group);
16426
 
+               if (rc) {
16427
 
+                       LOG_WARNING("Unable to read LV metadata for group %s\n", group->vg_name);
16428
 
+                       LOG_WARNING("No regions can be discovered for group %s\n", group->vg_name);
16429
 
+                       continue;
16430
 
+               }
16431
 
+
16432
 
+               // Assemble each volume in the group.
16433
 
+               discover_volumes_in_group(group);
16434
 
+
16435
 
+               // Build the LE map for each LV discovered in this group. This
16436
 
+               // must be done after all LVS in the group are discovered.
16437
 
+               build_le_maps(group);
16438
 
+               check_le_maps(group);
16439
 
+
16440
 
+               // Set up all of the initial snapshot maps. Only the kernel
16441
 
+               // keeps track of the snapshot maps.
16442
 
+               build_snapshot_maps(group);
16443
 
+
16444
 
+               // Set up the pointers to link snapshot volumes
16445
 
+               // with their originals.
16446
 
+               link_snapshot_volumes(group);
16447
 
+       }
16448
 
+
16449
 
+       return 0;
16450
 
+}
16451
 
+
16452
 
+
16453
 
+/* Function: export_volumes
16454
 
+ *
16455
 
+ *     The last thing the plugin must do is take each newly constructed volume
16456
 
+ *     and place it on the evms logical node list. A zero return-code from
16457
 
+ *     this function means nothing new was added to the list, and a positive
16458
 
+ *     return code means that many new items were added to the list.
16459
 
+ */
16460
 
+static int export_volumes( evms_logical_node_t ** evms_node_list )
16461
 
+{
16462
 
+       lvm_volume_group_t      * group;
16463
 
+       evms_logical_node_t     * new_node;
16464
 
+       lvm_logical_volume_t    * volume;
16465
 
+       int                     count = 0;
16466
 
+       int                     i;
16467
 
+
16468
 
+       LOG_EXTRA("Exporting volumes\n");
16469
 
+
16470
 
+       // For every valid, dirty volume group
16471
 
+       for ( group = lvm_group_list; group; group = group->next_group ) {
16472
 
+               if ( ! (group->flags & EVMS_VG_DIRTY) ) {
16473
 
+                       continue;
16474
 
+               }
16475
 
+
16476
 
+               // Export every valid volume in the group. For re-discovery,
16477
 
+               // we re-export the same logical node.
16478
 
+               for ( i = 1; i <= MAX_LV; i++ ) {
16479
 
+                       volume = group->volume_list[i];
16480
 
+                       if ( ! volume ) {
16481
 
+                               continue;
16482
 
+                       }
16483
 
+                       
16484
 
+                       // For new volumes, create a new EVMS node and 
16485
 
+                       // initialize the appropriate fields.
16486
 
+                       if ( volume->lv_access & EVMS_LV_NEW ) {
16487
 
+                               if ( evms_cs_allocate_logical_node(&new_node) ) {
16488
 
+                                       continue;
16489
 
+                               }
16490
 
+
16491
 
+                               volume->volume_node             = new_node;
16492
 
+                               volume->lv_access               &= (~EVMS_LV_QUIESCED & ~EVMS_LV_NEW);
16493
 
+                               new_node->hardsector_size       = group->hard_sect_size;
16494
 
+                               new_node->block_size            = group->block_size;
16495
 
+                               new_node->plugin                = &lvm_plugin_header;
16496
 
+                               new_node->instance_data         = volume;
16497
 
+                               memcpy(new_node->name, volume->name, NAME_LEN);
16498
 
+
16499
 
+                               // Snapshot volumes should report the size of their original
16500
 
+                               if ( volume->lv_access & LV_SNAPSHOT ) {
16501
 
+                                       new_node->total_vsectors = volume->snapshot_org->lv_size;
16502
 
+                               }
16503
 
+                               else {
16504
 
+                                       new_node->total_vsectors = volume->lv_size;
16505
 
+                               }
16506
 
+
16507
 
+                               // Is the volume read-only?
16508
 
+                               if ( ! (volume->lv_access & LV_WRITE) ) {
16509
 
+                                       new_node->flags |= EVMS_VOLUME_READ_ONLY;
16510
 
+                                       LOG_DEBUG("LVM volume %s is read-only\n", volume->name);
16511
 
+                               }
16512
 
+
16513
 
+                               // Is the volume incomplete?
16514
 
+                               if ( volume->lv_access & EVMS_LV_INCOMPLETE ) {
16515
 
+                                       new_node->flags |= (EVMS_VOLUME_READ_ONLY | EVMS_VOLUME_PARTIAL);
16516
 
+                                       LOG_DEBUG("LVM volume %s is incomplete\n", volume->name);
16517
 
+                               }
16518
 
+
16519
 
+                               // Does the volume group contain any partial or
16520
 
+                               // removable PVs?
16521
 
+                               if ( group->flags & EVMS_VG_PARTIAL_PVS ) {
16522
 
+                                       new_node->flags |= EVMS_VOLUME_PARTIAL;
16523
 
+                               }
16524
 
+                               if ( group->flags & EVMS_VG_REMOVABLE_PVS ) {
16525
 
+                                       new_node->flags |= EVMS_DEVICE_REMOVABLE;
16526
 
+                               }
16527
 
+
16528
 
+                               MOD_INC_USE_COUNT;
16529
 
+                       }
16530
 
+
16531
 
+                       // Export the node. The add_to_list will catch it if
16532
 
+                       // we try to add the same node to the list twice.
16533
 
+                       if ( ! evms_cs_add_logical_node_to_list(evms_node_list, volume->volume_node) ) {
16534
 
+                               LOG_DETAILS("Exporting LVM volume %s\n", volume->name);
16535
 
+                               count++;
16536
 
+                       }
16537
 
+               }
16538
 
+
16539
 
+               // The group is clean now.
16540
 
+               group->flags &= ~EVMS_VG_DIRTY;
16541
 
+       }
16542
 
+
16543
 
+       return count;
16544
 
+}
16545
 
+
16546
 
+
16547
 
+/* Function: lvm_cleanup
16548
 
+ *
16549
 
+ *     This function runs through the entire lvm data structure, removing
16550
 
+ *     all items that are not needed at runtime. Currently, this is just the
16551
 
+ *     vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
16552
 
+ *     groups that don't contain any volumes are deleted. All of the other
16553
 
+ *     volume_group, logical_volume and evms_logical_node structures will be
16554
 
+ *     kept around at run-time.
16555
 
+ */
16556
 
+static int lvm_cleanup( void )
16557
 
+{
16558
 
+       lvm_volume_group_t      * group;
16559
 
+       lvm_volume_group_t      * next_group;
16560
 
+       lvm_physical_volume_t   * pv_entry;
16561
 
+
16562
 
+       for ( group = lvm_group_list; group; group = next_group ) {
16563
 
+               next_group = group->next_group;
16564
 
+
16565
 
+               // Delete groups with no volumes.
16566
 
+               if ( ! group->volume_count ) {
16567
 
+                       LOG_WARNING("Group %s contains no logical volumes. Deleting.\n", group->vg_name);
16568
 
+                       remove_group_from_list(group);
16569
 
+                       deallocate_volume_group(group);
16570
 
+                       // Need to go back to the start of the list,
16571
 
+                       // just to be safe. :)
16572
 
+                       next_group = lvm_group_list;
16573
 
+                       continue;
16574
 
+               }
16575
 
+
16576
 
+               // Delete data structures that aren't used at runtime.
16577
 
+               if ( group->vg ) {
16578
 
+                       kfree(group->vg);
16579
 
+                       group->vg = NULL;
16580
 
+               }
16581
 
+               for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16582
 
+                       if ( pv_entry->pv ) {
16583
 
+                               kfree(pv_entry->pv);
16584
 
+                               pv_entry->pv = NULL;
16585
 
+                       }
16586
 
+                       if ( pv_entry->pe_map ) {
16587
 
+                               vfree(pv_entry->pe_map);
16588
 
+                               pv_entry->pe_map = NULL;
16589
 
+                       }
16590
 
+               }
16591
 
+               if ( group->lv_array ) {
16592
 
+                       vfree(group->lv_array);
16593
 
+                       group->lv_array = NULL;
16594
 
+               }
16595
 
+               if ( group->uuid_list ) {
16596
 
+                       vfree(group->uuid_list);
16597
 
+                       group->uuid_list = NULL;
16598
 
+               }
16599
 
+       }
16600
 
+       return 0;
16601
 
+}
16602
 
+
16603
 
+
16604
 
+/* Function: lvm_get_bmap
16605
 
+ *
16606
 
+ *     Support for the BMAP ioctl used by LILO to translate filesystem blocks
16607
 
+ *     to disk blocks to map kernel images for boot time.
16608
 
+ */
16609
 
+static int lvm_get_bmap(evms_logical_node_t    * node,
16610
 
+                       evms_get_bmap_t         * bmap,
16611
 
+                       evms_logical_node_t     ** pv_node )
16612
 
+{
16613
 
+       lvm_logical_volume_t    * volume = node->instance_data;
16614
 
+       lvm_physical_volume_t   * pv_entry;
16615
 
+       evms_sector_t           new_sector = 0;
16616
 
+       evms_sector_t           new_size = 0;
16617
 
+       evms_sector_t           pe_start_sector;
16618
 
+       int                     rc = 0;
16619
 
+
16620
 
+       // No kernel images allowed on snapshot LVs.
16621
 
+       if ( volume->lv_access & LV_SNAPSHOT ) {
16622
 
+               return -EINVAL;
16623
 
+       }
16624
 
+
16625
 
+       // Range check.
16626
 
+       if ( bmap->rsector >= volume->lv_size ) {
16627
 
+               return -EINVAL;
16628
 
+       }
16629
 
+
16630
 
+       rc = remap_sector(node, bmap->rsector, 1, &new_sector, &new_size, &pe_start_sector, &pv_entry);
16631
 
+
16632
 
+       if (rc || !pv_entry || !new_sector) {
16633
 
+               return -EINVAL;
16634
 
+       }
16635
 
+
16636
 
+       bmap->rsector = new_sector;
16637
 
+       *pv_node = pv_entry->logical_node;
16638
 
+
16639
 
+       return 0;
16640
 
+}
16641
 
+
16642
 
+
16643
 
+/* Function: lvm_global_proc_read
16644
 
+ *
16645
 
+ *     A callback function for the lvm-global proc-fs entry. This will print
16646
 
+ *     general info about all LVM VGs, PVs, and LVs.
16647
 
+ */
16648
 
+static int lvm_global_proc_read(char           * page,
16649
 
+                               char            ** start,
16650
 
+                               off_t           off,
16651
 
+                               int             count,
16652
 
+                               int             * eof,
16653
 
+                               void            * data )
16654
 
+{
16655
 
+       lvm_volume_group_t      * group;
16656
 
+       lvm_physical_volume_t   * pv_entry;
16657
 
+       lvm_logical_volume_t    * volume;
16658
 
+       lvm_logical_volume_t    * snap;
16659
 
+       int                     vgs = 0;
16660
 
+       int                     lvs = 0;
16661
 
+       int                     pvs = 0;
16662
 
+       int                     sz = 0;
16663
 
+       int                     i;
16664
 
+
16665
 
+       PROCPRINT("Enterprise Volume Management System: LVM Plugin\n");
16666
 
+       PROCPRINT("Plugin ID: %x.%x.%x\n",
16667
 
+               GetPluginOEM(lvm_plugin_header.id),
16668
 
+               GetPluginType(lvm_plugin_header.id),
16669
 
+               GetPluginID(lvm_plugin_header.id));
16670
 
+       PROCPRINT("Plugin Version: %d.%d.%d\n",
16671
 
+               lvm_plugin_header.version.major,
16672
 
+               lvm_plugin_header.version.minor,
16673
 
+               lvm_plugin_header.version.patchlevel);
16674
 
+       PROCPRINT("Required EVMS Services Version: %d.%d.%d\n",
16675
 
+               lvm_plugin_header.required_common_services_version.major,
16676
 
+               lvm_plugin_header.required_common_services_version.minor,
16677
 
+               lvm_plugin_header.required_common_services_version.patchlevel);
16678
 
+
16679
 
+       // Count all existing items.
16680
 
+       for ( group = lvm_group_list; group; group = group->next_group ) {
16681
 
+               lvs += group->volume_count;
16682
 
+               pvs += group->pv_count;
16683
 
+               vgs++;
16684
 
+       }
16685
 
+
16686
 
+       PROCPRINT("\n");
16687
 
+       PROCPRINT("Total: %d VGs  %d PVs  %d LVs\n", vgs, pvs, lvs);
16688
 
+
16689
 
+       // Print out specifics about each VG.
16690
 
+       for ( group = lvm_group_list; group; group = group->next_group ) {
16691
 
+               PROCPRINT("\n");
16692
 
+               PROCPRINT("VG:  %s  [%d PV, %d LV]\n",
16693
 
+                       group->vg_name, group->pv_count, group->volume_count);
16694
 
+               PROCPRINT("PVs:\n");
16695
 
+               for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
16696
 
+                       if ( pv_entry->logical_node ) {
16697
 
+                               PROCPRINT("\t%s\t%10Ld KB\n",
16698
 
+                                       pv_entry->logical_node->name,
16699
 
+                                       pv_entry->logical_node->total_vsectors / 2);
16700
 
+                       }
16701
 
+               }
16702
 
+               PROCPRINT("LVs:\n");
16703
 
+               for ( i = 1; i <= MAX_LV; i++ ) {
16704
 
+                       if ( group->volume_list[i] ) {
16705
 
+                               volume = group->volume_list[i];
16706
 
+                               PROCPRINT("\t%s\t%10Ld KB / %5d LEs",
16707
 
+                                       volume->name,
16708
 
+                                       volume->lv_size / 2,
16709
 
+                                       volume->num_le);
16710
 
+                               if ( volume->lv_access & LV_SNAPSHOT ) {
16711
 
+                                       PROCPRINT("\tSnapshot of : ");
16712
 
+                                       if ( volume->snapshot_org ) {
16713
 
+                                               PROCPRINT("%s : ", volume->snapshot_org->name);
16714
 
+                                       }
16715
 
+                                       else {
16716
 
+                                               PROCPRINT("(unknown) : ");
16717
 
+                                       }
16718
 
+                                       PROCPRINT("%ld%% full : ", (long)(volume->next_free_chunk) * 100 / (long)(volume->lv_size));
16719
 
+                                       if ( volume->lv_status & LV_ACTIVE ) {
16720
 
+                                               PROCPRINT("active");
16721
 
+                                       }
16722
 
+                                       else {
16723
 
+                                               PROCPRINT("disabled");
16724
 
+                                       }
16725
 
+                               }
16726
 
+                               else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
16727
 
+                                       PROCPRINT("\tSnapshotted by : ");
16728
 
+                                       for ( snap = volume->snapshot_next; snap; snap = snap->snapshot_next ) {
16729
 
+                                               PROCPRINT("%s  ", snap->name);
16730
 
+                                       }
16731
 
+                               }
16732
 
+                               PROCPRINT("\n");
16733
 
+                       }
16734
 
+               }
16735
 
+       }
16736
 
+
16737
 
+       return sz;
16738
 
+}
16739
 
+
16740
 
+
16741
 
+/********** Required EVMS Plugin Functions **********/
16742
 
+
16743
 
+
16744
 
+/* Function: lvm_discover
16745
 
+ *
16746
 
+ *     This is the entry point into the LVM discovery process. It is a three
16747
 
+ *     phase process. First, the list of nodes are examined for PVs, and the
16748
 
+ *     appropriate volume groups are created. Then each volume group is
16749
 
+ *     examined to find all available logical volumes. Finally, each LVM
16750
 
+ *     logical volume has a new EVMS node created for it, and added to the
16751
 
+ *     list of nodes.
16752
 
+ */
16753
 
+static int lvm_discover( evms_logical_node_t ** evms_node_list )
16754
 
+{
16755
 
+       int rc;
16756
 
+
16757
 
+       LOG_EXTRA("Beginning discovery.\n");
16758
 
+
16759
 
+       discover_volume_groups(evms_node_list);
16760
 
+
16761
 
+       check_volume_groups();
16762
 
+
16763
 
+       discover_logical_volumes();
16764
 
+
16765
 
+       check_logical_volumes(0);
16766
 
+
16767
 
+       rc = export_volumes(evms_node_list);
16768
 
+
16769
 
+       LOG_EXTRA("Discovery complete.\n");
16770
 
+       return rc;
16771
 
+}
16772
 
+
16773
 
+
16774
 
+/* Function: lvm_discover_end
16775
 
+ *
16776
 
+ *     The discovery process at the region-manager level is now iterative,
16777
 
+ *     much like the EVMS feature level. This allows the ability to stack
16778
 
+ *     LVM on top of MD, or vice-versa. To accomplish this correctly, and
16779
 
+ *     also to accomplish partial volume discovery, a second discover
16780
 
+ *     entry point is needed, so EVMS can tell the region managers that
16781
 
+ *     discovery is over, and to finish up any discovery that is not yet
16782
 
+ *     complete. When this function is called, it should be assumed that
16783
 
+ *     the node list has had nothing new added to it since the last call
16784
 
+ *     of the regular discover function. Therefore, when this function is
16785
 
+ *     called, we do not need to try to discovery any additional volume
16786
 
+ *     groups. We will, however, look for logical volumes once more. This
16787
 
+ *     gives us the ability to export (read-only) volumes that have
16788
 
+ *     partially corrupted LE maps due to missing PVs in their VG.
16789
 
+ */
16790
 
+static int lvm_discover_end( evms_logical_node_t ** evms_node_list )
16791
 
+{
16792
 
+       int rc;
16793
 
+
16794
 
+       LOG_EXTRA("Beginning final discovery\n");
16795
 
+
16796
 
+       discover_volume_groups(evms_node_list);
16797
 
+
16798
 
+       check_volume_groups();
16799
 
+
16800
 
+       discover_logical_volumes();
16801
 
+
16802
 
+       check_logical_volumes(1);
16803
 
+
16804
 
+       rc = export_volumes(evms_node_list);
16805
 
+
16806
 
+       lvm_cleanup();
16807
 
+
16808
 
+       LOG_EXTRA("Final discovery complete.\n");
16809
 
+       return rc;
16810
 
+}
16811
 
+
16812
 
+
16813
 
+/* Function: lvm_delete_node
16814
 
+ *
16815
 
+ *     This function deletes the in-memory representation of an LVM
16816
 
+ *     logical volume.
16817
 
+ */
16818
 
+static int lvm_delete_node( evms_logical_node_t * logical_node )
16819
 
+{
16820
 
+       lvm_logical_volume_t    * volume = logical_node->instance_data;
16821
 
+       lvm_volume_group_t      * group = volume->group;
16822
 
+
16823
 
+       LOG_DEBUG("Deleting LVM node %s\n", logical_node->name);
16824
 
+
16825
 
+       if ( deallocate_logical_volume(volume) ) {
16826
 
+               return -EINVAL;
16827
 
+       }
16828
 
+
16829
 
+       // If we just removed the last volume from this group, the entire group
16830
 
+       // must also be deleted.
16831
 
+       if ( group && group->volume_count == 0 ) {
16832
 
+               remove_group_from_list(group);
16833
 
+               deallocate_volume_group(group);
16834
 
+       }
16835
 
+
16836
 
+       // Free the logical node.
16837
 
+       evms_cs_deallocate_logical_node(logical_node);
16838
 
+
16839
 
+       MOD_DEC_USE_COUNT;
16840
 
+
16841
 
+       return 0;
16842
 
+}
16843
 
+
16844
 
+
16845
 
+/* Function: lvm_read
16846
 
+ */
16847
 
+static void lvm_read(  evms_logical_node_t     * node,
16848
 
+                       eio_t                   * eio )
16849
 
+{
16850
 
+       lvm_logical_volume_t    * volume = node->instance_data;
16851
 
+       lvm_physical_volume_t   * pv_entry;
16852
 
+       evms_sector_t           pe_start_sector;
16853
 
+       evms_sector_t           new_sector;
16854
 
+       evms_sector_t           new_size;
16855
 
+
16856
 
+       // Make sure the volume is active and readable
16857
 
+       if ( ! (volume->lv_access & LV_READ && volume->lv_status & LV_ACTIVE) ) {
16858
 
+               EVMS_IO_ERROR(eio);
16859
 
+               return;
16860
 
+       }
16861
 
+
16862
 
+       // If this volume is a snapshot, lock the volume, and do
16863
 
+       // the LE-PE translation on its original volume.
16864
 
+       if ( volume->lv_access & LV_SNAPSHOT ) {
16865
 
+               down( &volume->snap_semaphore );
16866
 
+               if ( ! volume->snapshot_org ) {
16867
 
+                       EVMS_IO_ERROR(eio);
16868
 
+                       up( &volume->snap_semaphore );
16869
 
+                       return;
16870
 
+               }
16871
 
+               node = volume->snapshot_org->volume_node;
16872
 
+       }
16873
 
+
16874
 
+       // Check if I/O goes past end of logical volume. Must use the
16875
 
+       // node, not the volume, so snapshots will work correctly.
16876
 
+       if ( eio->rsector + eio->rsize > node->total_vsectors ) {
16877
 
+               if ( volume->lv_access & LV_SNAPSHOT ) {
16878
 
+                       up( &volume->snap_semaphore );
16879
 
+               }
16880
 
+               EVMS_IO_ERROR(eio);
16881
 
+               return;
16882
 
+       }
16883
 
+
16884
 
+       // Logical-to-Physical remapping. Check for incomplete volumes.
16885
 
+       if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
16886
 
+            ! pe_start_sector || ! pv_entry ) {
16887
 
+               if ( volume->lv_access & LV_SNAPSHOT ) {
16888
 
+                       up( &volume->snap_semaphore );
16889
 
+               }
16890
 
+               EVMS_IO_ERROR(eio);
16891
 
+               return;
16892
 
+       }
16893
 
+
16894
 
+       // For snapshot volumes, check if this sector's chunk has been
16895
 
+       // remapped. If it has, new_sector and pv_entry will be changed
16896
 
+       // accordingly. If not, they remain the same.
16897
 
+       if ( volume->lv_access & LV_SNAPSHOT ) {
16898
 
+               snapshot_remap_sector(volume, pe_start_sector , &new_sector, &pv_entry);
16899
 
+       }
16900
 
+
16901
 
+       eio->rsector = new_sector;
16902
 
+       eio->rsize = new_size;
16903
 
+       R_IO(pv_entry->logical_node, eio);
16904
 
+
16905
 
+       // Unlock the snapshot
16906
 
+       if ( volume->lv_access & LV_SNAPSHOT ) {
16907
 
+               up( &volume->snap_semaphore );
16908
 
+       }
16909
 
+}
16910
 
+
16911
 
+
16912
 
+/* Function: lvm_write
16913
 
+ */
16914
 
+static void lvm_write( evms_logical_node_t     * node,
16915
 
+                       eio_t                   * eio )
16916
 
+{
16917
 
+       lvm_logical_volume_t    * volume = node->instance_data;
16918
 
+       lvm_logical_volume_t    * snap_volume;
16919
 
+       lvm_physical_volume_t   * pv_entry;
16920
 
+       evms_sector_t           pe_start_sector;
16921
 
+       evms_sector_t           new_sector;
16922
 
+       evms_sector_t           new_size;
16923
 
+
16924
 
+       // Make sure the volume is active and writable
16925
 
+       if ( ! (volume->lv_access & LV_WRITE && volume->lv_status & LV_ACTIVE) ) {
16926
 
+               EVMS_IO_ERROR(eio);
16927
 
+               return;
16928
 
+       }
16929
 
+
16930
 
+       // Check if I/O goes past end of logical volume.
16931
 
+       if ( eio->rsector + eio->rsize > node->total_vsectors ) {
16932
 
+               EVMS_IO_ERROR(eio);
16933
 
+               return;
16934
 
+       }
16935
 
+
16936
 
+       // Logical-to-Physical remapping. Check for incomplete volumes.
16937
 
+       if ( remap_sector(node, eio->rsector, eio->rsize, &new_sector, &new_size, &pe_start_sector, &pv_entry) ||
16938
 
+            ! pe_start_sector || ! pv_entry ) {
16939
 
+               EVMS_IO_ERROR(eio);
16940
 
+               return;
16941
 
+       }
16942
 
+
16943
 
+       // Copy-on-write for snapshotting
16944
 
+       if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
16945
 
+               // Originals can be snapshotted multiple times
16946
 
+               for ( snap_volume = volume->snapshot_next; snap_volume; snap_volume = snap_volume->snapshot_next ) {
16947
 
+                       if ( snapshot_copy_data(volume, snap_volume, pe_start_sector, new_sector, pv_entry) ) {
16948
 
+                               EVMS_IO_ERROR(eio);
16949
 
+                               return;
16950
 
+                       }
16951
 
+               }
16952
 
+       }
16953
 
+
16954
 
+       eio->rsector = new_sector;
16955
 
+       eio->rsize = new_size;
16956
 
+               W_IO(pv_entry->logical_node, eio);
16957
 
+}
16958
 
+
16959
 
+
16960
 
+/* Function: lvm_init_io
16961
 
+ *
16962
 
+ *     Init_io on a snapshot volume treats it like a regular volume.
16963
 
+ */
16964
 
+static int lvm_init_io(        evms_logical_node_t     * node,
16965
 
+                       int                     io_flag,        // 0=read, 1=write, 4=LVM-internal-write
16966
 
+                       evms_sector_t           sect_nr,        // node LBA
16967
 
+                       evms_sector_t           num_sects,      // # of sectors
16968
 
+                       void                    * buf_addr )    // buffer address
16969
 
+{
16970
 
+       lvm_physical_volume_t   * pv_entry;
16971
 
+       lvm_logical_volume_t    * volume = node->instance_data;
16972
 
+       evms_sector_t           pe_start_sector;
16973
 
+       evms_sector_t           new_sector;
16974
 
+       evms_sector_t           new_size;
16975
 
+       int                     rc = 0;
16976
 
+
16977
 
+       // Only allow internal writes to snapshots (io_flag==4). Disallow
16978
 
+       // writes to snapshot originals.
16979
 
+       if ( io_flag == 1 &&
16980
 
+            volume->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG) ) {
16981
 
+               return -EINVAL;
16982
 
+       }
16983
 
+       // The node for a snapshot reports the size of the original. If a
16984
 
+       // request comes in in that range, just return.
16985
 
+       else if ( volume->lv_access & LV_SNAPSHOT &&
16986
 
+                 sect_nr >= volume->lv_size &&
16987
 
+                 sect_nr < node->total_vsectors ) {
16988
 
+               if ( io_flag == 0 ) {
16989
 
+                       memset( buf_addr, 0, num_sects << EVMS_VSECTOR_SIZE_SHIFT );
16990
 
+               }
16991
 
+               return 0;
16992
 
+       }
16993
 
+       // Regular range check.
16994
 
+       else if ( sect_nr + num_sects > volume->lv_size ) {
16995
 
+               return -EINVAL;
16996
 
+       }
16997
 
+
16998
 
+       if ( io_flag == 4 ) {
16999
 
+               io_flag = 1;
17000
 
+       }
17001
 
+
17002
 
+       // Init IO needs to deal with the possibility of a request that spans
17003
 
+       // PEs or stripes. This is possible because there is no limit on
17004
 
+       // num_sects. To handle this, we loop through remap_sector and
17005
 
+       // INIT_IO until num_sects reaches zero.
17006
 
+       while ( num_sects ) {
17007
 
+               if ( remap_sector(node, sect_nr, num_sects, &new_sector, &new_size, &pe_start_sector, &pv_entry) ) {
17008
 
+                       return -EIO;
17009
 
+               }
17010
 
+               // If the volume is incomplete, clear the buffer (on a read).
17011
 
+               if ( !pe_start_sector || !pv_entry ) {
17012
 
+                       if ( io_flag == 0 ) {
17013
 
+                               memset(buf_addr, 0, new_size << EVMS_VSECTOR_SIZE_SHIFT);
17014
 
+                       }
17015
 
+               }
17016
 
+               else {
17017
 
+                       rc = INIT_IO(pv_entry->logical_node, io_flag, new_sector, new_size, buf_addr);
17018
 
+               }
17019
 
+               num_sects       -= new_size;
17020
 
+               sect_nr         += new_size;
17021
 
+               buf_addr        = (void*)(((unsigned long)buf_addr) + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
17022
 
+       }
17023
 
+
17024
 
+       return rc;
17025
 
+}
17026
 
+
17027
 
+
17028
 
+/* Function: lvm_ioctl
17029
 
+ */
17030
 
+static int lvm_ioctl(  evms_logical_node_t     * logical_node,
17031
 
+                       struct inode            * inode,
17032
 
+                       struct file             * file,
17033
 
+                       unsigned int            cmd,
17034
 
+                       unsigned long           arg)
17035
 
+{
17036
 
+       lvm_logical_volume_t    * volume = logical_node->instance_data;
17037
 
+       int                     rc = 0;
17038
 
+
17039
 
+       LOG_ENTRY_EXIT("--lvm: Ioctl %d\n",cmd);
17040
 
+
17041
 
+       switch (cmd) {
17042
 
+
17043
 
+       case HDIO_GETGEO:
17044
 
+               {
17045
 
+                       // Fixed geometry for all LVM volumes 
17046
 
+                       unsigned char heads = 64;
17047
 
+                       unsigned char sectors = 32;
17048
 
+                       long start = 0;
17049
 
+                       struct hd_geometry *hd = (struct hd_geometry *)arg;
17050
 
+                       short cylinders;
17051
 
+                       cylinders = logical_node->total_vsectors;
17052
 
+                       cylinders = (cylinders / heads) / sectors;
17053
 
+
17054
 
+                       if (hd == NULL) {
17055
 
+                               return -EINVAL;
17056
 
+                       }
17057
 
+
17058
 
+                       if ( copy_to_user((char*)(&hd->heads), &heads, sizeof(heads)) != 0 ||
17059
 
+                            copy_to_user((char*)(&hd->sectors), &sectors, sizeof(sectors)) != 0 ||
17060
 
+                            copy_to_user((short*)(&hd->cylinders), &cylinders, sizeof(cylinders)) != 0 ||
17061
 
+                            copy_to_user((long*)(&hd->start), &start, sizeof(start)) != 0 ) {
17062
 
+                               return -EFAULT;
17063
 
+                       }
17064
 
+               }
17065
 
+               break;
17066
 
+
17067
 
+       case LV_SET_ACCESS:
17068
 
+               // Set access flags of a logical volume 
17069
 
+               // If we decide to make a volume read-only, how do we
17070
 
+               // tell the EVMS level?
17071
 
+               /*
17072
 
+               if (!capable(CAP_SYS_ADMIN)) return -EACCES;
17073
 
+               lv_ptr->lv_access = (ulong) arg;
17074
 
+               if ( lv_ptr->lv_access & LV_WRITE)
17075
 
+                       set_device_ro(lv_ptr->lv_dev, 0);
17076
 
+               else
17077
 
+                       set_device_ro(lv_ptr->lv_dev, 1);
17078
 
+               */
17079
 
+               rc = -EINVAL;
17080
 
+               break;
17081
 
+
17082
 
+       case LV_SET_STATUS:
17083
 
+               // Set status flags of a logical volume 
17084
 
+               /*
17085
 
+               if (!capable(CAP_SYS_ADMIN)) return -EACCES;
17086
 
+               if (!((ulong) arg & LV_ACTIVE) && lv_ptr->lv_open > 1)
17087
 
+                       return -EPERM;
17088
 
+               lv_ptr->lv_status = (ulong) arg;
17089
 
+               */
17090
 
+               rc = -EINVAL;
17091
 
+               break;
17092
 
+
17093
 
+       case EVMS_QUIESCE_VOLUME:
17094
 
+               {
17095
 
+                       evms_quiesce_volume_t * tmp = (evms_quiesce_volume_t*)arg;
17096
 
+                       if ( tmp->command ) {   // Quiesce
17097
 
+                               volume->lv_access |= EVMS_LV_QUIESCED;
17098
 
+                       }
17099
 
+                       else {                  // Un-quiesce
17100
 
+                               volume->lv_access &= ~EVMS_LV_QUIESCED;
17101
 
+                       }
17102
 
+               }
17103
 
+               break;
17104
 
+
17105
 
+       case EVMS_GET_BMAP:
17106
 
+               {
17107
 
+                       evms_get_bmap_t         * bmap = (evms_get_bmap_t*)arg;
17108
 
+                       evms_logical_node_t     * pv_node;
17109
 
+
17110
 
+                       rc = lvm_get_bmap(logical_node, bmap, &pv_node);
17111
 
+                       if (!rc) {
17112
 
+                               rc = IOCTL(pv_node, inode, file, cmd, (unsigned long)bmap);
17113
 
+                       }
17114
 
+               }
17115
 
+               break;
17116
 
+       
17117
 
+       case EVMS_GET_DISK_LIST:
17118
 
+       case EVMS_CHECK_MEDIA_CHANGE:
17119
 
+       case EVMS_REVALIDATE_DISK:
17120
 
+       case EVMS_OPEN_VOLUME:
17121
 
+       case EVMS_CLOSE_VOLUME:
17122
 
+               {
17123
 
+                       // These five ioctl all need to be broadcast to all PVs.
17124
 
+                       lvm_volume_group_t * group = volume->group;
17125
 
+                       lvm_physical_volume_t * pv_entry;
17126
 
+                       for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
17127
 
+                               rc |= IOCTL(pv_entry->logical_node, inode, file, cmd, arg);
17128
 
+                       }
17129
 
+               }
17130
 
+               break;
17131
 
+
17132
 
+       default:
17133
 
+               // Currently LVM does not send any ioctl's down to the
17134
 
+               // PVs. Which PV would they go to? What would we do with
17135
 
+               // the return codes?
17136
 
+               rc = -EINVAL;
17137
 
+       }
17138
 
+
17139
 
+       return rc;
17140
 
+}
17141
 
+
17142
 
+
17143
 
+/* Function: lvm_direct_ioctl
17144
 
+ *
17145
 
+ *     This function provides a method for user-space to communicate directly
17146
 
+ *     with a plugin in the kernel.
17147
 
+ */
17148
 
+static int lvm_direct_ioctl(   struct inode    * inode,
17149
 
+                               struct file     * file,
17150
 
+                               unsigned int    cmd,
17151
 
+                               unsigned long   args )
17152
 
+{
17153
 
+       evms_plugin_ioctl_t     argument;
17154
 
+       int                     rc = 0;
17155
 
+
17156
 
+        // Copy user's parameters to kernel space
17157
 
+        if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) ) {
17158
 
+                return -EFAULT;
17159
 
+       }
17160
 
+
17161
 
+       // Make sure this is supposed to be our ioctl.
17162
 
+       if ( argument.feature_id != lvm_plugin_header.id ) {
17163
 
+               return -EINVAL;
17164
 
+       }
17165
 
+
17166
 
+       switch(argument.feature_command) {
17167
 
+
17168
 
+       case EVMS_LVM_PV_REMOVE_IOCTL:
17169
 
+               {
17170
 
+                       lvm_pv_remove_ioctl_t pv_remove;
17171
 
+                       if ( copy_from_user(&pv_remove, (lvm_pv_remove_ioctl_t*)argument.feature_ioctl_data, sizeof(pv_remove)) ) {
17172
 
+                               rc = -EINVAL;
17173
 
+                               break;
17174
 
+                       }
17175
 
+                       rc = remove_pv_from_group(pv_remove.pv_number, pv_remove.vg_uuid);
17176
 
+               }
17177
 
+               break;
17178
 
+
17179
 
+       case EVMS_LVM_SNAPSHOT_STAT_IOCTL:
17180
 
+               {
17181
 
+                       lvm_snapshot_stat_ioctl_t snap_stats;
17182
 
+                       if ( copy_from_user(&snap_stats, (lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, sizeof(snap_stats)) ) {
17183
 
+                               rc = -EINVAL;
17184
 
+                               break;
17185
 
+                       }
17186
 
+                       rc = get_snapshot_stats(&snap_stats);
17187
 
+                       if ( copy_to_user((lvm_snapshot_stat_ioctl_t*)argument.feature_ioctl_data, &snap_stats, sizeof(snap_stats)) ) {
17188
 
+                               rc = -EINVAL;
17189
 
+                               break;
17190
 
+                       }
17191
 
+               }
17192
 
+               break;
17193
 
+
17194
 
+       default:
17195
 
+               rc = -EINVAL;
17196
 
+               break;
17197
 
+       }
17198
 
+
17199
 
+       argument.status = rc;
17200
 
+       copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
17201
 
+       return rc;
17202
 
+}
17203
 
+
17204
 
+
17205
 
+/* Function: lvm_vge_init
17206
 
+ */
17207
 
+int __init lvm_vge_init(void)
17208
 
+{
17209
 
+       struct proc_dir_entry * pde;
17210
 
+
17211
 
+       lvm_group_list = NULL;
17212
 
+       lvm_proc = NULL;
17213
 
+
17214
 
+       // Register the global proc-fs entries.
17215
 
+       pde = evms_cs_get_evms_proc_dir();
17216
 
+       if ( pde ) {
17217
 
+               lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde);
17218
 
+               if ( lvm_proc ) {
17219
 
+                       create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG, lvm_proc, lvm_global_proc_read, NULL);
17220
 
+               }
17221
 
+       }
17222
 
+
17223
 
+       // Register this plugin with EVMS.
17224
 
+       return evms_cs_register_plugin(&lvm_plugin_header);
17225
 
+}
17226
 
+
17227
 
+
17228
 
+/* Function: lvm_vge_exit
17229
 
+ */
17230
 
+void __exit lvm_vge_exit(void)
17231
 
+{
17232
 
+       lvm_volume_group_t      * group;
17233
 
+       lvm_volume_group_t      * next_group;
17234
 
+       struct proc_dir_entry   * pde;
17235
 
+       int                     i;
17236
 
+
17237
 
+       // If LVM is called for module_exit, that means the reference
17238
 
+       // count must be zero, which means there should be no volumes,
17239
 
+       // and thus no volume groups. But, check anyway and delete
17240
 
+       // any volumes and groups that are still hanging around.
17241
 
+       if ( lvm_group_list ) {
17242
 
+               LOG_SERIOUS("Called for module_exit, but group list is not empty!\n");
17243
 
+       }
17244
 
+       for ( group = lvm_group_list; group; group = next_group ) {
17245
 
+               next_group = group->next_group;
17246
 
+
17247
 
+               LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n", group->vg_name);
17248
 
+
17249
 
+               for ( i = 1; i <= MAX_LV; i++ ) {
17250
 
+                       if ( group->volume_list[i] ) {
17251
 
+                               lvm_delete_node(group->volume_list[i]->volume_node);
17252
 
+                       }
17253
 
+               }
17254
 
+       }
17255
 
+
17256
 
+       // Unregister the proc-fs entries.
17257
 
+       pde = evms_cs_get_evms_proc_dir();
17258
 
+       if (pde) {
17259
 
+               remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc);
17260
 
+               remove_proc_entry(LVM_PROC_NAME, pde);
17261
 
+       }
17262
 
+
17263
 
+       // Unregister this plugin from EVMS.
17264
 
+       evms_cs_unregister_plugin(&lvm_plugin_header);
17265
 
+}
17266
 
+
17267
 
+
17268
 
+module_init(lvm_vge_init);
17269
 
+module_exit(lvm_vge_exit);
17270
 
+#ifdef MODULE_LICENSE
17271
 
+MODULE_LICENSE("GPL");
17272
 
+#endif
17273
 
+
17274
 
diff -Naur linux-2002-03-28/drivers/evms/md_core.c evms-2002-03-28/drivers/evms/md_core.c
17275
 
--- linux-2002-03-28/drivers/evms/md_core.c     Wed Dec 31 18:00:00 1969
17276
 
+++ evms-2002-03-28/drivers/evms/md_core.c      Thu Mar 28 08:37:22 2002
17277
 
@@ -0,0 +1,3267 @@
17278
 
+/*
17279
 
+ *   Copyright (c) International Business Machines  Corp., 2000
17280
 
+ *
17281
 
+ *   This program is free software;  you can redistribute it and/or modify
17282
 
+ *   it under the terms of the GNU General Public License as published by
17283
 
+ *   the Free Software Foundation; either version 2 of the License, or
17284
 
+ *   (at your option) any later version.
17285
 
+ *
17286
 
+ *   This program is distributed in the hope that it will be useful,
17287
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
17288
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
17289
 
+ *   the GNU General Public License for more details.
17290
 
+ *
17291
 
+ *   You should have received a copy of the GNU General Public License
17292
 
+ *   along with this program;  if not, write to the Free Software
17293
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17294
 
+ *
17295
 
+ *
17296
 
+ * linux/drivers/evms/md_core.c
17297
 
+ *
17298
 
+ * EVMS Linux MD Region Manager
17299
 
+ *
17300
 
+ */
17301
 
+
17302
 
+
17303
 
+#include <linux/module.h>
17304
 
+#include <linux/kmod.h>
17305
 
+#include <linux/kernel.h>
17306
 
+#include <linux/config.h>
17307
 
+#include <linux/genhd.h>
17308
 
+#include <linux/major.h>
17309
 
+#include <linux/string.h>
17310
 
+#include <linux/blk.h>
17311
 
+#include <linux/init.h>
17312
 
+#include <linux/slab.h>
17313
 
+#include <linux/vmalloc.h>
17314
 
+#include <linux/evms/evms_kernel.h>
17315
 
+#include <linux/evms/evms_md.h>
17316
 
+#include <linux/sysctl.h>
17317
 
+#include <asm/system.h>
17318
 
+#include <asm/uaccess.h>
17319
 
+
17320
 
+#define LOG_PREFIX "md core: "
17321
 
+
17322
 
+/*
17323
 
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
17324
 
+ * is 100 KB/sec, so the extra system load does not show up that much.
17325
 
+ * Increase it if you want to have more _guaranteed_ speed. Note that
17326
 
+ * the RAID driver will use the maximum available bandwith if the IO
17327
 
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
17328
 
+ * speed limit - in case reconstruction slows down your system despite
17329
 
+ * idle IO detection.
17330
 
+ *
17331
 
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
17332
 
+ */
17333
 
+
17334
 
+static MD_LIST_HEAD(all_raid_disks);
17335
 
+static MD_LIST_HEAD(pending_raid_disks);
17336
 
+
17337
 
+static int sysctl_speed_limit_min = 100;
17338
 
+static int sysctl_speed_limit_max = 100000;
17339
 
+
17340
 
+
17341
 
+static mdk_personality_t *pers[MAX_PERSONALITY];
17342
 
+
17343
 
+static int md_blocksizes[MAX_MD_DEVS];
17344
 
+static int md_hardsect_sizes[MAX_MD_DEVS];
17345
 
+int evms_md_size[MAX_MD_DEVS];
17346
 
+static evms_thread_t *evms_md_recovery_thread;
17347
 
+
17348
 
+/*
17349
 
+ * Enables to iterate over all existing md arrays
17350
 
+ */
17351
 
+static MD_LIST_HEAD(all_mddevs);
17352
 
+
17353
 
+/*
17354
 
+ * The mapping between kdev and mddev is not necessary a simple
17355
 
+ * one! Eg. HSM uses several sub-devices to implement Logical
17356
 
+ * Volumes. All these sub-devices map to the same mddev.
17357
 
+ */
17358
 
+dev_mapping_t evms_mddev_map[MAX_MD_DEVS];
17359
 
+
17360
 
+
17361
 
+static md_spinlock_t activate_spare_list_lock = MD_SPIN_LOCK_UNLOCKED;
17362
 
+static evms_md_activate_spare_t *evms_activate_spare_list = NULL, **evms_activate_spare_tail;
17363
 
+
17364
 
+/* Support functions for discovery */
17365
 
+static int evms_md_import_device (evms_logical_node_t **discover_list,
17366
 
+                                 evms_logical_node_t *node,
17367
 
+                                 int on_disk);
17368
 
+static void evms_md_autostart_arrays(evms_logical_node_t **discover_list);
17369
 
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list,
17370
 
+                                    kdev_t countdev);
17371
 
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list,
17372
 
+                                  mddev_t *mddev);
17373
 
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
17374
 
+                                      mddev_t *mddev, uint flags);
17375
 
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev);
17376
 
+static int evms_md_analyze_sbs (mddev_t * mddev);
17377
 
+static mddev_t * alloc_mddev (kdev_t dev);
17378
 
+static void free_mddev(mddev_t * mddev);
17379
 
+static int do_md_run (mddev_t * mddev);
17380
 
+static int do_md_stop (mddev_t * mddev, int ro);
17381
 
+
17382
 
+static void kick_rdev_from_array (mdk_rdev_t * rdev);
17383
 
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev);
17384
 
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb);
17385
 
+
17386
 
+/* Plugin API prototypes */
17387
 
+static int md_discover( evms_logical_node_t ** discover_list );
17388
 
+static int md_end_discover( evms_logical_node_t ** discover_list );
17389
 
+static int md_delete( evms_logical_node_t * node);
17390
 
+static void md_read(   evms_logical_node_t     * node,
17391
 
+                       eio_t                   * eio);
17392
 
+static void md_write(  evms_logical_node_t     * node,
17393
 
+                       eio_t                   * eio);
17394
 
+static int md_init_io( evms_logical_node_t     * node,
17395
 
+                       int                     rw,
17396
 
+                       evms_sector_t           sect_nr,
17397
 
+                       evms_sector_t           num_sects,
17398
 
+                       void                    * buf_addr );
17399
 
+static int md_ioctl(   evms_logical_node_t     * node,
17400
 
+                       struct inode            * inode,
17401
 
+                       struct file             * file,
17402
 
+                       unsigned int            cmd,
17403
 
+                       unsigned long           arg);
17404
 
+static int md_ioctl_cmd_broadcast(
17405
 
+       evms_logical_node_t     *node,
17406
 
+       struct inode            *inode,
17407
 
+       struct file             *file,
17408
 
+       unsigned long           cmd,
17409
 
+       unsigned long           arg);
17410
 
+                       
17411
 
+static int md_direct_ioctl(
17412
 
+       struct inode            * inode,
17413
 
+       struct file             * file,
17414
 
+       unsigned int            cmd,
17415
 
+       unsigned long           arg);
17416
 
+
17417
 
+/* global MD data structures */
17418
 
+static evms_plugin_function_table_t md_function_table = {
17419
 
+       discover        : &md_discover,
17420
 
+       end_discover    : &md_end_discover,
17421
 
+       delete          : &md_delete,
17422
 
+       read            : &md_read,
17423
 
+       write           : &md_write,
17424
 
+       init_io         : &md_init_io,
17425
 
+       ioctl           : &md_ioctl,
17426
 
+       direct_ioctl    : &md_direct_ioctl
17427
 
+};
17428
 
+
17429
 
+static evms_plugin_header_t md_plugin_header = {
17430
 
+       id : SetPluginID(
17431
 
+               IBM_OEM_ID,
17432
 
+               EVMS_REGION_MANAGER,
17433
 
+               EVMS_MD_ID ),
17434
 
+       version : {
17435
 
+               major           : MD_MAJOR_VERSION,
17436
 
+               minor           : MD_MINOR_VERSION,
17437
 
+               patchlevel      : MD_PATCHLEVEL_VERSION
17438
 
+       },
17439
 
+       required_common_services_version: {
17440
 
+               major           : EVMS_MD_COMMON_SERVICES_MAJOR,
17441
 
+               minor           : EVMS_MD_COMMON_SERVICES_MINOR,
17442
 
+               patchlevel      : EVMS_MD_COMMON_SERVICES_PATCHLEVEL
17443
 
+       },
17444
 
+       function_table : &md_function_table
17445
 
+};
17446
 
+
17447
 
+/* local instance data structure definition */
17448
 
+typedef struct md_instance_data_s {
17449
 
+       mddev_t *mddev;
17450
 
+} md_instance_data_t;
17451
 
+
17452
 
+/* global variables */
17453
 
+static int exported_nodes;      /* total # of exported devices
17454
 
+                                 * produced during this discovery.
17455
 
+                                 */
17456
 
+static evms_logical_node_t **cur_discover_list = NULL;
17457
 
+
17458
 
+/**********************************************************/
17459
 
+/* SYSCTL - EVMS/RAID folder                             */
17460
 
+/**********************************************************/
17461
 
+
17462
 
+#ifdef CONFIG_PROC_FS
17463
 
+static struct ctl_table_header *md_table_header;
17464
 
+
17465
 
+static ctl_table md_table[] = {
17466
 
+       {DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min",
17467
 
+        &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
17468
 
+       {DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max",
17469
 
+        &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
17470
 
+       {0}
17471
 
+};
17472
 
+
17473
 
+static ctl_table md_dir_table[] = {
17474
 
+       {DEV_EVMS_MD, "md", NULL, 0, 0555, md_table},
17475
 
+       {0}
17476
 
+};
17477
 
+
17478
 
+static ctl_table evms_dir_table[] = {
17479
 
+       {DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table},
17480
 
+       {0}
17481
 
+};
17482
 
+
17483
 
+static ctl_table dev_dir_table[] = {
17484
 
+       {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
17485
 
+       {0}
17486
 
+};
17487
 
+#endif  
17488
 
+/********** Required EVMS Plugin Functions **********/
17489
 
+
17490
 
+/*
17491
 
+ * Function: md_discover
17492
 
+ *     We should only export complete MD device nodes
17493
 
+ */
17494
 
+static int md_discover( evms_logical_node_t ** discover_list )
17495
 
+{
17496
 
+        LOG_ENTRY_EXIT("md_discover() ENTRY\n");
17497
 
+
17498
 
+        /* initialize global variable */
17499
 
+        exported_nodes = 0;
17500
 
+       cur_discover_list = discover_list;
17501
 
+       evms_md_autostart_arrays(discover_list);
17502
 
+
17503
 
+       LOG_ENTRY_EXIT("md_discover() EXIT (exported nodes: %d)\n", exported_nodes);
17504
 
+       cur_discover_list = NULL;
17505
 
+        return(exported_nodes);
17506
 
+}
17507
 
+
17508
 
+
17509
 
+/*
17510
 
+ * Function: md_discover_end
17511
 
+ */
17512
 
+static int md_end_discover( evms_logical_node_t ** discover_list )
17513
 
+{
17514
 
+       int rc = 0;
17515
 
+       mddev_t *mddev;
17516
 
+       struct md_list_head *tmp;
17517
 
+       int done = FALSE;
17518
 
+
17519
 
+       rc = md_discover(discover_list);
17520
 
+       
17521
 
+       do {
17522
 
+               done = TRUE;
17523
 
+               ITERATE_MDDEV(mddev,tmp){
17524
 
+                       if (!mddev->nr_raid_disks) {
17525
 
+                               free_mddev(mddev);
17526
 
+                               done = FALSE;
17527
 
+                               break;
17528
 
+                       }
17529
 
+                       if (mddev->flag & EVMS_MD_INCOMPLETE) {
17530
 
+                               LOG_DETAILS("trying to run incomplete array md%d\n", mdidx(mddev));
17531
 
+                               evms_md_autorun_array(discover_list,mddev);
17532
 
+                               done = FALSE;
17533
 
+                               break;
17534
 
+                       }
17535
 
+               }
17536
 
+       } while (!done);
17537
 
+       
17538
 
+       return rc;
17539
 
+}
17540
 
+
17541
 
+
17542
 
+/*
17543
 
+ * Function: md_delete_node
17544
 
+ */
17545
 
+static int md_delete( evms_logical_node_t * node)
17546
 
+{
17547
 
+       md_instance_data_t *MDID;
17548
 
+       mddev_t *mddev;
17549
 
+
17550
 
+       MDID = node->instance_data;
17551
 
+       mddev = MDID->mddev;
17552
 
+
17553
 
+       LOG_DEFAULT("md_delete() name=%s\n", evms_md_partition_name(node));
17554
 
+
17555
 
+       do_md_stop(mddev,0);
17556
 
+       if (MDID)
17557
 
+               evms_cs_deallocate_memory(MDID);
17558
 
+       evms_cs_deallocate_logical_node(node);
17559
 
+       return 0;
17560
 
+}
17561
 
+
17562
 
+
17563
 
+/*
17564
 
+ * Function: md_read
17565
 
+ */
17566
 
+static void md_read(   evms_logical_node_t     * node,
17567
 
+                       eio_t * eio)
17568
 
+{
17569
 
+       md_instance_data_t *MDID;
17570
 
+       mddev_t *mddev;
17571
 
+
17572
 
+       MDID = node->instance_data;
17573
 
+       mddev = MDID->mddev;
17574
 
+       if ((eio->rsector + eio->rsize) > node->total_vsectors)
17575
 
+               EVMS_IO_ERROR(eio);
17576
 
+       else {
17577
 
+               if (mddev && mddev->pers)
17578
 
+                       mddev->pers->make_request(mddev, READ, eio);
17579
 
+       }
17580
 
+}
17581
 
+
17582
 
+
17583
 
+/*
17584
 
+ * Function: md_write
17585
 
+ */
17586
 
+static void md_write(  evms_logical_node_t     * node,
17587
 
+                       eio_t * eio)
17588
 
+{
17589
 
+       md_instance_data_t *MDID;
17590
 
+       mddev_t *mddev;
17591
 
+
17592
 
+       MDID = node->instance_data;
17593
 
+       mddev = MDID->mddev;
17594
 
+       if ((eio->rsector + eio->rsize) > node->total_vsectors)
17595
 
+               EVMS_IO_ERROR(eio);
17596
 
+       else {
17597
 
+               if (mddev && mddev->pers)
17598
 
+                       mddev->pers->make_request(mddev, WRITE, eio);
17599
 
+       }
17600
 
+}
17601
 
+
17602
 
+
17603
 
+/*
17604
 
+ * Function: md_init_io
17605
 
+ */
17606
 
+static int md_init_io( evms_logical_node_t     * node,
17607
 
+                       int                     rw,
17608
 
+                       evms_sector_t           sect_nr,
17609
 
+                       evms_sector_t           num_sects,      /* # of sectors */
17610
 
+                       void                    * buf_addr )    /* buffer address */
17611
 
+{
17612
 
+       md_instance_data_t *MDID;
17613
 
+       mddev_t *mddev;
17614
 
+       int rc = 0;
17615
 
+
17616
 
+       MDID = node->instance_data;
17617
 
+       mddev = MDID->mddev;
17618
 
+       if (sect_nr + num_sects > node->total_vsectors) {
17619
 
+               LOG_ERROR("  md_init_io() attempt to %s beyond MD device(%s) boundary(%Lu) with sect_nr(%Lu) and num_sects(%Lu)\n",
17620
 
+                          rw ? "WRITE" : "READ", evms_md_partition_name(node),node->total_vsectors,sect_nr,num_sects);
17621
 
+               rc = -EINVAL;
17622
 
+       }
17623
 
+       if (!rc && mddev && mddev->pers)
17624
 
+               rc = mddev->pers->init_io(mddev, rw, sect_nr, num_sects, buf_addr);
17625
 
+       else
17626
 
+               rc = -EINVAL;
17627
 
+       return rc;
17628
 
+}
17629
 
+
17630
 
+
17631
 
+/*
17632
 
+ * Function: md_ioctl
17633
 
+ */
17634
 
+static int md_ioctl(
17635
 
+       evms_logical_node_t     * node,
17636
 
+       struct inode            * inode,
17637
 
+       struct file             * file,
17638
 
+       unsigned int            cmd,
17639
 
+       unsigned long           arg)
17640
 
+{
17641
 
+       md_instance_data_t      * MDID = node->instance_data;
17642
 
+       mddev_t *mddev;
17643
 
+       int rc = 0;
17644
 
+
17645
 
+        if ((!inode) || (!MDID) )
17646
 
+                rc = -EINVAL;
17647
 
+
17648
 
+        if (!rc) {
17649
 
+                switch (cmd) {
17650
 
+                       /*
17651
 
+                        * We have a problem here : there is no easy way to give a CHS
17652
 
+                        * virtual geometry. We currently pretend that we have a 2 heads
17653
 
+                        * 4 sectors (with a BIG number of cylinders...). This drives
17654
 
+                        * dosfs just mad... ;-)
17655
 
+                        */
17656
 
+
17657
 
+                        case HDIO_GETGEO:
17658
 
+                       {
17659
 
+                               struct hd_geometry hdgeo;
17660
 
+                                hdgeo.heads = 2;
17661
 
+                                hdgeo.sectors = 4;
17662
 
+                                hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
17663
 
+                                        hdgeo.heads / hdgeo.sectors;
17664
 
+                                hdgeo.start = 0;
17665
 
+                                if (copy_to_user((int *)arg,
17666
 
+                                                 &hdgeo,
17667
 
+                                                 sizeof(hdgeo)))
17668
 
+                                        rc = -EFAULT;
17669
 
+                       }
17670
 
+                               break;
17671
 
+                       case EVMS_QUIESCE_VOLUME:
17672
 
+                       case EVMS_GET_DISK_LIST:
17673
 
+                       case EVMS_CHECK_MEDIA_CHANGE:
17674
 
+                       case EVMS_REVALIDATE_DISK:
17675
 
+                       case EVMS_OPEN_VOLUME:
17676
 
+                       case EVMS_CLOSE_VOLUME:
17677
 
+                                rc = md_ioctl_cmd_broadcast(
17678
 
+                                        node, inode, file, cmd, arg);
17679
 
+                                break;
17680
 
+                        case EVMS_PLUGIN_IOCTL:
17681
 
+                                rc = md_direct_ioctl(
17682
 
+                                        inode, file, cmd, arg);
17683
 
+                                break;
17684
 
+                       default:
17685
 
+                               mddev = MDID->mddev;
17686
 
+                               if (mddev == NULL) {
17687
 
+                                       rc = -ENODEV;
17688
 
+                               } else if (mddev->pers->evms_ioctl == NULL) {
17689
 
+                                       rc = -ENOSYS;
17690
 
+                               } else {
17691
 
+                                       rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg);
17692
 
+                               }
17693
 
+                }
17694
 
+        }
17695
 
+        return(rc);
17696
 
+}
17697
 
+
17698
 
+static int md_ioctl_cmd_broadcast(
17699
 
+       evms_logical_node_t     *node,
17700
 
+       struct inode            *inode,
17701
 
+       struct file             *file,
17702
 
+       unsigned long           cmd,
17703
 
+       unsigned long           arg)
17704
 
+{
17705
 
+        int rc = 0;
17706
 
+       md_instance_data_t *MDID;
17707
 
+       mddev_t *mddev;
17708
 
+       struct md_list_head *tmp;
17709
 
+       mdk_rdev_t *rdev;
17710
 
+
17711
 
+       MDID = node->instance_data;
17712
 
+       mddev = MDID->mddev;
17713
 
+
17714
 
+        /* broadcast this cmd to all children */
17715
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
17716
 
+               if (!rdev->mddev) {
17717
 
+                       MD_BUG();
17718
 
+                       continue;
17719
 
+               }
17720
 
+               if (!rdev->virtual_spare) {
17721
 
+                       rc |= IOCTL(rdev->node, inode, file, cmd, arg);
17722
 
+               }
17723
 
+       }
17724
 
+       return (rc);
17725
 
+}
17726
 
+
17727
 
+
17728
 
+static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev)
17729
 
+{
17730
 
+       mdk_rdev_t *rdev;
17731
 
+       mdp_disk_t *disk = NULL;
17732
 
+       int i;
17733
 
+
17734
 
+       if (evms_md_find_rdev(mddev,dev))
17735
 
+               return -EEXIST;
17736
 
+
17737
 
+       LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__);
17738
 
+       if( evms_cs_allocate_memory((void**)&rdev, sizeof(*rdev)))
17739
 
+               return -ENOMEM;
17740
 
+
17741
 
+       memset(rdev, 0, sizeof(*rdev));
17742
 
+
17743
 
+       for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
17744
 
+               disk = mddev->sb->disks + i;
17745
 
+               if (!disk->major && !disk->minor)
17746
 
+                       break;
17747
 
+               if (disk_removed(disk))
17748
 
+                       break;
17749
 
+       }
17750
 
+       if (i == MD_SB_DISKS) {
17751
 
+               LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev));
17752
 
+               evms_cs_deallocate_memory(rdev);
17753
 
+               return -EBUSY;
17754
 
+       }
17755
 
+
17756
 
+       if (disk_removed(disk)) {
17757
 
+               /*
17758
 
+                * reuse slot
17759
 
+                */
17760
 
+               if (disk->number != i) {
17761
 
+                       MD_BUG();
17762
 
+                       evms_cs_deallocate_memory(rdev);
17763
 
+                       return -EINVAL;
17764
 
+               }
17765
 
+       } else {
17766
 
+               disk->number = i;
17767
 
+       }
17768
 
+
17769
 
+       disk->raid_disk = disk->number;
17770
 
+       disk->major = MAJOR(dev);
17771
 
+       disk->minor = MINOR(dev);
17772
 
+
17773
 
+       mark_disk_spare(disk);
17774
 
+
17775
 
+       rdev->mddev = mddev;
17776
 
+       rdev->dev = dev;
17777
 
+       rdev->desc_nr = disk->number;
17778
 
+       rdev->virtual_spare = 1;
17779
 
+
17780
 
+       /* bind rdev to mddev array */
17781
 
+       md_list_add(&rdev->all, &all_raid_disks);
17782
 
+       md_list_add(&rdev->same_set, &mddev->disks);
17783
 
+       MD_INIT_LIST_HEAD(&rdev->pending);
17784
 
+
17785
 
+       mddev->sb->nr_disks++;
17786
 
+       mddev->sb->spare_disks++;
17787
 
+       mddev->sb->working_disks++;
17788
 
+       mddev->nb_dev++;
17789
 
+
17790
 
+       mddev->sb_dirty = 1;
17791
 
+
17792
 
+       evms_md_update_sb(mddev);
17793
 
+
17794
 
+       return 0;
17795
 
+}
17796
 
+
17797
 
+static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev)
17798
 
+{
17799
 
+       mdk_rdev_t *rdev = NULL;
17800
 
+       mdp_disk_t *disk;
17801
 
+       int rc = 0;
17802
 
+
17803
 
+       disk = evms_md_find_disk(mddev,dev);
17804
 
+       if (!disk)
17805
 
+               return -ENODEV;
17806
 
+
17807
 
+       rdev = evms_md_find_rdev(mddev,dev);
17808
 
+
17809
 
+       if (rdev && !rdev->faulty) {
17810
 
+               /*
17811
 
+                * The disk is active in the array,
17812
 
+                * must ask the personality to do it
17813
 
+                */
17814
 
+               if (mddev->pers && mddev->pers->diskop) {
17815
 
+                       /* Assume spare, try to remove it first. */
17816
 
+                       rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
17817
 
+                       if (rc)
17818
 
+                               rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
17819
 
+               } else
17820
 
+                       rc = -ENOSYS;
17821
 
+       }
17822
 
+
17823
 
+       if (!rc) {
17824
 
+               remove_descriptor(disk,mddev->sb);
17825
 
+               if (rdev)
17826
 
+                       kick_rdev_from_array(rdev);
17827
 
+               mddev->sb_dirty = 1;
17828
 
+               evms_md_update_sb(mddev);
17829
 
+
17830
 
+       }
17831
 
+       return rc;
17832
 
+}
17833
 
+
17834
 
+static int evms_md_activate_spare(mddev_t *mddev, kdev_t dev)
17835
 
+{
17836
 
+       mdk_rdev_t *rdev = NULL;
17837
 
+       evms_md_activate_spare_t activate_spare;
17838
 
+       unsigned long flags;
17839
 
+       int rc = 0;
17840
 
+       
17841
 
+       rdev = evms_md_find_rdev(mddev,dev);
17842
 
+       if (rdev) {
17843
 
+               if (mddev->recovery_running) {
17844
 
+                       rc = -EBUSY;
17845
 
+               } else {
17846
 
+                       activate_spare.mddev = mddev;
17847
 
+                       activate_spare.spare = &mddev->sb->disks[rdev->sb->this_disk.number];
17848
 
+                       md_spin_lock_irqsave(&activate_spare_list_lock, flags);
17849
 
+                       if (evms_activate_spare_list == NULL)
17850
 
+                               evms_activate_spare_tail = &evms_activate_spare_list;
17851
 
+                       *evms_activate_spare_tail = &activate_spare;
17852
 
+                       evms_activate_spare_tail = &activate_spare.next;
17853
 
+                       activate_spare.next = NULL;
17854
 
+                       md_spin_unlock_irqrestore(&activate_spare_list_lock, flags);
17855
 
+       
17856
 
+                       mddev->sb->raid_disks++;
17857
 
+                       evms_md_recover_arrays();
17858
 
+               }
17859
 
+       } else {
17860
 
+               rc = -ENODEV;
17861
 
+       }
17862
 
+       return rc;
17863
 
+}
17864
 
+
17865
 
+static int evms_md_deactivate_disk(mddev_t *mddev, kdev_t dev)
17866
 
+{
17867
 
+       mdk_rdev_t *rdev = NULL;
17868
 
+       mdp_disk_t *disk;
17869
 
+       int rc = 0;
17870
 
+
17871
 
+       disk = evms_md_find_disk(mddev,dev);
17872
 
+       rdev = evms_md_find_rdev(mddev,dev);
17873
 
+       if (!disk || !rdev || rdev->faulty)
17874
 
+               return -ENODEV;
17875
 
+
17876
 
+       /* Make sure it's not a spare */
17877
 
+       if (disk_spare(disk))
17878
 
+               return -EINVAL;
17879
 
+       /*
17880
 
+        * The disk is active in the array,
17881
 
+        * must ask the personality to do it
17882
 
+        */
17883
 
+       if (mddev->pers && mddev->pers->diskop) {
17884
 
+               rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_DEACTIVATE_DISK);
17885
 
+               if (!rc) {
17886
 
+                       mark_disk_spare(disk);
17887
 
+                       mddev->sb->active_disks--;
17888
 
+                       mddev->sb->raid_disks--;
17889
 
+                       mddev->sb->spare_disks++;
17890
 
+                       mddev->sb_dirty = 1;
17891
 
+                       evms_md_update_sb(mddev);
17892
 
+               }
17893
 
+       } else
17894
 
+               rc = -ENOSYS;
17895
 
+
17896
 
+       return rc;
17897
 
+       
17898
 
+}
17899
 
+
17900
 
+/*
17901
 
+ * Function: md_direct_ioctl
17902
 
+ *
17903
 
+ *     This function provides a method for user-space to communicate directly
17904
 
+ *     with a plugin in the kernel.
17905
 
+ */
17906
 
+static int md_direct_ioctl(
17907
 
+       struct inode            * inode,
17908
 
+       struct file             * file,
17909
 
+       unsigned int            cmd,
17910
 
+       unsigned long           args )
17911
 
+{
17912
 
+       evms_plugin_ioctl_t     argument;
17913
 
+       kdev_t                  md_kdev;
17914
 
+       mddev_t                 *mddev = NULL;
17915
 
+       evms_md_ioctl_t         ioctl_arg;
17916
 
+       evms_md_kdev_t          device;
17917
 
+       evms_md_array_info_t    array_info, *usr_array_info;
17918
 
+       int                     rc = 0;
17919
 
+
17920
 
+        // Copy user's parameters to kernel space
17921
 
+        if ( copy_from_user(&argument, (evms_plugin_ioctl_t*)args, sizeof(argument)) )
17922
 
+                return -EFAULT;
17923
 
+
17924
 
+       // Make sure this is supposed to be our ioctl.
17925
 
+       if ( argument.feature_id != md_plugin_header.id )
17926
 
+               return -EINVAL;
17927
 
+
17928
 
+       // Copy user's md ioclt parmeters to kernel space
17929
 
+       if ( copy_from_user(&ioctl_arg,
17930
 
+                           (evms_md_ioctl_t*)argument.feature_ioctl_data,
17931
 
+                           sizeof(ioctl_arg)) )
17932
 
+               rc = -EFAULT;
17933
 
+       else {
17934
 
+               if (ioctl_arg.mddev_idx < MAX_MD_DEVS) {
17935
 
+                       md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx);
17936
 
+                       mddev = kdev_to_mddev(md_kdev);
17937
 
+                       if (mddev == NULL)
17938
 
+                               rc = -ENODEV;
17939
 
+               } else
17940
 
+                       rc = -ENODEV;
17941
 
+       }
17942
 
+
17943
 
+       if (!rc) {
17944
 
+               switch(argument.feature_command) {
17945
 
+               case EVMS_MD_PERS_IOCTL_CMD:
17946
 
+                       if (mddev->pers->md_pers_ioctl == NULL)
17947
 
+                               return -ENOSYS;
17948
 
+                       rc = mddev->pers->md_pers_ioctl(mddev,
17949
 
+                                                       ioctl_arg.cmd,
17950
 
+                                                       ioctl_arg.arg);
17951
 
+                       copy_to_user((evms_md_ioctl_t*)argument.feature_ioctl_data,
17952
 
+                                    &ioctl_arg,
17953
 
+                                    sizeof(ioctl_arg));
17954
 
+                       break;
17955
 
+
17956
 
+               case EVMS_MD_ADD:
17957
 
+                       if ( copy_from_user(&device,
17958
 
+                                           (evms_md_kdev_t*)ioctl_arg.arg,
17959
 
+                                           sizeof(device)) )
17960
 
+                               rc = -EFAULT;
17961
 
+                       else
17962
 
+                               rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor));
17963
 
+                       break;
17964
 
+
17965
 
+               case EVMS_MD_REMOVE:
17966
 
+                       if ( copy_from_user(&device,
17967
 
+                                           (evms_md_kdev_t*)ioctl_arg.arg,
17968
 
+                                           sizeof(device)) )
17969
 
+                               rc = -EFAULT;
17970
 
+                       else
17971
 
+                               rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor));
17972
 
+                       break;
17973
 
+
17974
 
+               case EVMS_MD_ACTIVATE:
17975
 
+                       if ( copy_from_user(&device,
17976
 
+                                           (evms_md_kdev_t*)ioctl_arg.arg,
17977
 
+                                           sizeof(device)) )
17978
 
+                               rc = -EFAULT;
17979
 
+                       else
17980
 
+                               rc = evms_md_activate_spare(mddev,MKDEV(device.major, device.minor));
17981
 
+                       break;
17982
 
+
17983
 
+               case EVMS_MD_DEACTIVATE:
17984
 
+                       if ( copy_from_user(&device,
17985
 
+                                           (evms_md_kdev_t*)ioctl_arg.arg,
17986
 
+                                           sizeof(device)) )
17987
 
+                               rc = -EFAULT;
17988
 
+                       else
17989
 
+                               rc = evms_md_deactivate_disk(mddev,MKDEV(device.major, device.minor));
17990
 
+                       break;
17991
 
+
17992
 
+               case EVMS_MD_GET_ARRAY_INFO:
17993
 
+
17994
 
+                       usr_array_info = (evms_md_array_info_t*)ioctl_arg.arg;
17995
 
+                       if ( copy_from_user(&array_info, usr_array_info,
17996
 
+                                           sizeof(array_info)) )
17997
 
+                               rc = -EFAULT;
17998
 
+                       else {
17999
 
+                               array_info.state = 0;
18000
 
+                               if (mddev->curr_resync)
18001
 
+                                       array_info.state |= EVMS_MD_ARRAY_SYNCING;
18002
 
+                               copy_to_user(&usr_array_info->state, &array_info.state,
18003
 
+                                            sizeof(usr_array_info->state));
18004
 
+                               if (copy_to_user(array_info.sb, mddev->sb,
18005
 
+                                                sizeof(mdp_super_t)))
18006
 
+                                       rc = -EFAULT;
18007
 
+                       }
18008
 
+                       break;
18009
 
+               default:
18010
 
+                       rc = -ENOSYS;
18011
 
+                       break;
18012
 
+               }
18013
 
+       }
18014
 
+
18015
 
+       argument.status = rc;
18016
 
+       copy_to_user((evms_plugin_ioctl_t*)args, &argument, sizeof(argument));
18017
 
+       return rc;
18018
 
+}
18019
 
+
18020
 
+
18021
 
+
18022
 
+
18023
 
+void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
18024
 
+{
18025
 
+       unsigned int minor = MINOR(dev);
18026
 
+
18027
 
+       if (MAJOR(dev) != MD_MAJOR) {
18028
 
+               MD_BUG();
18029
 
+               return;
18030
 
+       }
18031
 
+       if (evms_mddev_map[minor].mddev != NULL) {
18032
 
+               MD_BUG();
18033
 
+               return;
18034
 
+       }
18035
 
+       evms_mddev_map[minor].mddev = mddev;
18036
 
+       evms_mddev_map[minor].data = data;
18037
 
+}
18038
 
+
18039
 
+void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev)
18040
 
+{
18041
 
+       unsigned int minor = MINOR(dev);
18042
 
+
18043
 
+       if (MAJOR(dev) != MD_MAJOR) {
18044
 
+               MD_BUG();
18045
 
+               return;
18046
 
+       }
18047
 
+       if (evms_mddev_map[minor].mddev != mddev) {
18048
 
+               MD_BUG();
18049
 
+               return;
18050
 
+       }
18051
 
+       evms_mddev_map[minor].mddev = NULL;
18052
 
+       evms_mddev_map[minor].data = NULL;
18053
 
+}
18054
 
+
18055
 
+static mddev_t * alloc_mddev (kdev_t dev)
18056
 
+{
18057
 
+       mddev_t *mddev;
18058
 
+
18059
 
+       if (MAJOR(dev) != MD_MAJOR) {
18060
 
+               MD_BUG();
18061
 
+               return 0;
18062
 
+       }
18063
 
+       mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
18064
 
+       if (!mddev)
18065
 
+               return NULL;
18066
 
+               
18067
 
+       memset(mddev, 0, sizeof(*mddev));
18068
 
+
18069
 
+       mddev->__minor = MINOR(dev);
18070
 
+       init_MUTEX(&mddev->reconfig_sem);
18071
 
+       init_MUTEX(&mddev->recovery_sem);
18072
 
+       init_MUTEX(&mddev->resync_sem);
18073
 
+       MD_INIT_LIST_HEAD(&mddev->disks);
18074
 
+       MD_INIT_LIST_HEAD(&mddev->all_mddevs);
18075
 
+       atomic_set(&mddev->active, 0);
18076
 
+
18077
 
+       /*
18078
 
+        * The 'base' mddev is the one with data NULL.
18079
 
+        * personalities can create additional mddevs
18080
 
+        * if necessary.
18081
 
+        */
18082
 
+       evms_md_add_mddev_mapping(mddev, dev, 0);
18083
 
+       md_list_add(&mddev->all_mddevs, &all_mddevs);
18084
 
+
18085
 
+       MOD_INC_USE_COUNT;
18086
 
+
18087
 
+       return mddev;
18088
 
+}
18089
 
+
18090
 
+mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr)
18091
 
+{
18092
 
+       mdk_rdev_t * rdev;
18093
 
+       struct md_list_head *tmp;
18094
 
+
18095
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18096
 
+               if (rdev->desc_nr == nr)
18097
 
+                       return rdev;
18098
 
+       }
18099
 
+       return NULL;
18100
 
+}
18101
 
+
18102
 
+
18103
 
+mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev)
18104
 
+{
18105
 
+       struct md_list_head *tmp;
18106
 
+       mdk_rdev_t *rdev;
18107
 
+
18108
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18109
 
+               if (rdev->dev == dev)
18110
 
+                       return rdev;
18111
 
+       }
18112
 
+       return NULL;
18113
 
+}
18114
 
+
18115
 
+mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, evms_logical_node_t * node)
18116
 
+{
18117
 
+       struct md_list_head *tmp;
18118
 
+       mdk_rdev_t *rdev;
18119
 
+
18120
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18121
 
+               if (rdev->node == node)
18122
 
+                       return rdev;
18123
 
+       }
18124
 
+       return NULL;
18125
 
+}
18126
 
+
18127
 
+static MD_LIST_HEAD(device_names);
18128
 
+
18129
 
+static char * org_partition_name (kdev_t dev)
18130
 
+{
18131
 
+       struct gendisk *hd;
18132
 
+       static char nomem [] = "<nomem>";
18133
 
+       dev_name_t *dname;
18134
 
+       struct md_list_head *tmp = device_names.next;
18135
 
+
18136
 
+       while (tmp != &device_names) {
18137
 
+               dname = md_list_entry(tmp, dev_name_t, list);
18138
 
+               if (dname->dev == dev)
18139
 
+                       return dname->name;
18140
 
+               tmp = tmp->next;
18141
 
+       }
18142
 
+
18143
 
+       dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
18144
 
+
18145
 
+       if (!dname)
18146
 
+               return nomem;
18147
 
+       /*
18148
 
+        * ok, add this new device name to the list
18149
 
+        */
18150
 
+       hd = get_gendisk (dev);
18151
 
+       dname->name = NULL;
18152
 
+       if (hd)
18153
 
+               dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
18154
 
+       if (!dname->name) {
18155
 
+               sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
18156
 
+               dname->name = dname->namebuf;
18157
 
+       }
18158
 
+
18159
 
+       dname->dev = dev;
18160
 
+       MD_INIT_LIST_HEAD(&dname->list);
18161
 
+       md_list_add(&dname->list, &device_names);
18162
 
+
18163
 
+       return dname->name;
18164
 
+}
18165
 
+
18166
 
+
18167
 
+#define EVMS_MD_NULL_PARTITION_NAME "<EVMS_NODE_NO_NAME>"
18168
 
+char * evms_md_partition_name (evms_logical_node_t *node)
18169
 
+{
18170
 
+       if (node && node->name)
18171
 
+               return node->name;
18172
 
+       else
18173
 
+               return EVMS_MD_NULL_PARTITION_NAME;
18174
 
+}
18175
 
+
18176
 
+static char * get_partition_name (mdk_rdev_t *rdev)
18177
 
+{
18178
 
+       if (rdev->node)
18179
 
+               return evms_md_partition_name(rdev->node);
18180
 
+       else
18181
 
+               return org_partition_name(rdev->dev);
18182
 
+}
18183
 
+
18184
 
+/*
18185
 
+ * Function: evms_md_calc_dev_sboffset
18186
 
+ *     return the LSN for md super block.
18187
 
+ */
18188
 
+static u_int64_t evms_md_calc_dev_sboffset (evms_logical_node_t *node,mddev_t *mddev, int persistent)
18189
 
+{
18190
 
+       u_int64_t size = 0;
18191
 
+
18192
 
+       size = node->total_vsectors;
18193
 
+       if (persistent) {
18194
 
+               size = MD_NEW_SIZE_SECTORS(size);
18195
 
+       }
18196
 
+       return size; /* size in sectors */
18197
 
+}
18198
 
+
18199
 
+/*
18200
 
+ * Function: evms_md_calc_dev_size
18201
 
+ *     return data size (in blocks) for an "extended" device.
18202
 
+ */
18203
 
+static unsigned long evms_md_calc_dev_size (evms_logical_node_t *node,
18204
 
+                                          mddev_t *mddev,
18205
 
+                                          int persistent)
18206
 
+{
18207
 
+       unsigned long size;
18208
 
+       u_int64_t size_in_sectors;
18209
 
+
18210
 
+       size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent);
18211
 
+       size = size_in_sectors >> 1;
18212
 
+       if (!mddev->sb) {
18213
 
+               MD_BUG();
18214
 
+               return size;
18215
 
+       }
18216
 
+       if (mddev->sb->chunk_size)
18217
 
+               size &= ~(mddev->sb->chunk_size/1024 - 1);
18218
 
+       return size;
18219
 
+}
18220
 
+
18221
 
+static unsigned int zoned_raid_size (mddev_t *mddev)
18222
 
+{
18223
 
+       unsigned int mask;
18224
 
+       mdk_rdev_t * rdev;
18225
 
+       struct md_list_head *tmp;
18226
 
+
18227
 
+       if (!mddev->sb) {
18228
 
+               MD_BUG();
18229
 
+               return -EINVAL;
18230
 
+       }
18231
 
+       /*
18232
 
+        * do size and offset calculations.
18233
 
+        */
18234
 
+       mask = ~(mddev->sb->chunk_size/1024 - 1);
18235
 
+
18236
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18237
 
+               rdev->size &= mask;
18238
 
+               evms_md_size[mdidx(mddev)] += rdev->size;
18239
 
+       }
18240
 
+       return 0;
18241
 
+}
18242
 
+
18243
 
+/*
18244
 
+ * We check wether all devices are numbered from 0 to nb_dev-1. The
18245
 
+ * order is guaranteed even after device name changes.
18246
 
+ *
18247
 
+ * Some personalities (raid0, linear) use this. Personalities that
18248
 
+ * provide data have to be able to deal with loss of individual
18249
 
+ * disks, so they do their checking themselves.
18250
 
+ */
18251
 
+int evms_md_check_ordering (mddev_t *mddev)
18252
 
+{
18253
 
+       int i, c;
18254
 
+       mdk_rdev_t *rdev;
18255
 
+       struct md_list_head *tmp;
18256
 
+
18257
 
+       /*
18258
 
+        * First, all devices must be fully functional
18259
 
+        */
18260
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18261
 
+               if (rdev->faulty) {
18262
 
+                       LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n",
18263
 
+                                  mdidx(mddev), get_partition_name(rdev));
18264
 
+                       goto abort;
18265
 
+               }
18266
 
+       }
18267
 
+
18268
 
+       c = 0;
18269
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18270
 
+               c++;
18271
 
+       }
18272
 
+       if (c != mddev->nb_dev) {
18273
 
+               MD_BUG();
18274
 
+               goto abort;
18275
 
+       }
18276
 
+       if (mddev->nb_dev != mddev->sb->raid_disks) {
18277
 
+               LOG_ERROR("[md%d] array needs %d disks, has %d, aborting.\n",
18278
 
+                          mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
18279
 
+               goto abort;
18280
 
+       }
18281
 
+       /*
18282
 
+        * Now the numbering check
18283
 
+        */
18284
 
+       for (i = 0; i < mddev->nb_dev; i++) {
18285
 
+               c = 0;
18286
 
+               ITERATE_RDEV(mddev,rdev,tmp) {
18287
 
+                       if (rdev->desc_nr == i)
18288
 
+                               c++;
18289
 
+               }
18290
 
+               if (!c) {
18291
 
+                       LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i);
18292
 
+                       goto abort;
18293
 
+               }
18294
 
+               if (c > 1) {
18295
 
+                       LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i);
18296
 
+                       goto abort;
18297
 
+               }
18298
 
+       }
18299
 
+       return 0;
18300
 
+abort:
18301
 
+       return 1;
18302
 
+}
18303
 
+
18304
 
+static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
18305
 
+{
18306
 
+       if (disk_active(disk)) {
18307
 
+               sb->working_disks--;
18308
 
+       } else {
18309
 
+               if (disk_spare(disk)) {
18310
 
+                       sb->spare_disks--;
18311
 
+                       sb->working_disks--;
18312
 
+               } else  {
18313
 
+                       sb->failed_disks--;
18314
 
+               }
18315
 
+       }
18316
 
+       sb->nr_disks--;
18317
 
+       disk->major = disk->minor = 0;
18318
 
+       mark_disk_removed(disk);
18319
 
+}
18320
 
+
18321
 
+#define BAD_MAGIC \
18322
 
+"invalid raid superblock magic on %s\n"
18323
 
+
18324
 
+#define BAD_MINOR \
18325
 
+"%s: invalid raid minor (%x)\n"
18326
 
+
18327
 
+#define NO_SB \
18328
 
+"disabled device %s, could not read superblock.\n"
18329
 
+
18330
 
+#define BAD_CSUM \
18331
 
+"invalid superblock checksum on %s\n"
18332
 
+
18333
 
+
18334
 
+static int alloc_array_sb (mddev_t * mddev)
18335
 
+{
18336
 
+       if (mddev->sb) {
18337
 
+               MD_BUG();
18338
 
+               return 0;
18339
 
+       }
18340
 
+
18341
 
+       mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
18342
 
+       if (!mddev->sb) {
18343
 
+               LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
18344
 
+               return -ENOMEM;
18345
 
+       }
18346
 
+       md_clear_page(mddev->sb);
18347
 
+       return 0;
18348
 
+}
18349
 
+
18350
 
+static int alloc_disk_sb (mdk_rdev_t * rdev)
18351
 
+{
18352
 
+       if (rdev->sb)
18353
 
+               MD_BUG();
18354
 
+
18355
 
+       rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
18356
 
+       if (!rdev->sb) {
18357
 
+               LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
18358
 
+               return -EINVAL;
18359
 
+       }
18360
 
+       md_clear_page(rdev->sb);
18361
 
+
18362
 
+       return 0;
18363
 
+}
18364
 
+
18365
 
+/*
18366
 
+ * Function: free_disk_sb
18367
 
+ *
18368
 
+ */
18369
 
+static void free_disk_sb (mdk_rdev_t * rdev)
18370
 
+{
18371
 
+       if (rdev->sb) {
18372
 
+               free_page((unsigned long) rdev->sb);
18373
 
+               rdev->sb = NULL;
18374
 
+               rdev->sb_offset = 0;
18375
 
+               rdev->size = 0;
18376
 
+       } else {
18377
 
+               if (!rdev->virtual_spare && !rdev->faulty)
18378
 
+                       MD_BUG();
18379
 
+       }
18380
 
+}
18381
 
+
18382
 
+/*
18383
 
+ * Function: evms_md_read_disk_sb
18384
 
+ *     Read the MD superblock.
18385
 
+ */
18386
 
+static int evms_md_read_disk_sb (mdk_rdev_t * rdev)
18387
 
+{
18388
 
+       int rc = 0;
18389
 
+       evms_logical_node_t *node = rdev->node;
18390
 
+       u_int64_t sb_offset_in_sectors;
18391
 
+
18392
 
+       if (!rdev->sb) {
18393
 
+               MD_BUG();
18394
 
+               return -EINVAL;
18395
 
+       }
18396
 
+       if (node->total_vsectors <= MD_RESERVED_SECTORS) {
18397
 
+               LOG_DETAILS("%s is too small, total_vsectors(%Lu)\n",
18398
 
+                          evms_md_partition_name(node), node->total_vsectors);
18399
 
+               return -EINVAL;
18400
 
+       }
18401
 
+       
18402
 
+       /*
18403
 
+        * Calculate the position of the superblock,
18404
 
+        * it's at the end of the disk
18405
 
+        */
18406
 
+       sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1);
18407
 
+       rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1);
18408
 
+       LOG_DEBUG("(read) %s's sb offset(%Lu) total_vsectors(%Lu)\n",
18409
 
+                  evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors);
18410
 
+
18411
 
+       /*
18412
 
+        * Read superblock
18413
 
+        */
18414
 
+       rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb);
18415
 
+
18416
 
+       if (!rc) {
18417
 
+               LOG_DEBUG(" [events: %x]\n", rdev->sb->events_lo);
18418
 
+       } else {
18419
 
+               LOG_ERROR(NO_SB, evms_md_partition_name(node));
18420
 
+       }
18421
 
+       return rc;
18422
 
+}
18423
 
+
18424
 
+static unsigned int calc_sb_csum (mdp_super_t * sb)
18425
 
+{
18426
 
+       unsigned int disk_csum, csum;
18427
 
+
18428
 
+       disk_csum = sb->sb_csum;
18429
 
+       sb->sb_csum = 0;
18430
 
+       csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
18431
 
+       sb->sb_csum = disk_csum;
18432
 
+       return csum;
18433
 
+}
18434
 
+
18435
 
+
18436
 
+
18437
 
+/*
18438
 
+ * Check one RAID superblock for generic plausibility
18439
 
+ */
18440
 
+
18441
 
+static int check_disk_sb (mdk_rdev_t * rdev)
18442
 
+{
18443
 
+       mdp_super_t *sb;
18444
 
+       int ret = -EINVAL;
18445
 
+
18446
 
+       sb = rdev->sb;
18447
 
+       if (!sb) {
18448
 
+               MD_BUG();
18449
 
+               goto abort;
18450
 
+       }
18451
 
+
18452
 
+       if (sb->md_magic != MD_SB_MAGIC) {
18453
 
+               LOG_DEBUG(BAD_MAGIC, get_partition_name(rdev));
18454
 
+               goto abort;
18455
 
+       }
18456
 
+
18457
 
+       if (sb->md_minor >= MAX_MD_DEVS) {
18458
 
+               LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor);
18459
 
+               goto abort;
18460
 
+       }
18461
 
+       if (calc_sb_csum(sb) != sb->sb_csum) {
18462
 
+               LOG_ERROR(BAD_CSUM, get_partition_name(rdev));
18463
 
+               goto abort;
18464
 
+       }
18465
 
+       ret = 0;
18466
 
+abort:
18467
 
+       return ret;
18468
 
+}
18469
 
+
18470
 
+static kdev_t dev_unit(kdev_t dev)
18471
 
+{
18472
 
+       unsigned int mask;
18473
 
+       struct gendisk *hd = get_gendisk(dev);
18474
 
+
18475
 
+       if (!hd)
18476
 
+               return 0;
18477
 
+       mask = ~((1 << hd->minor_shift) - 1);
18478
 
+
18479
 
+       return MKDEV(MAJOR(dev), MINOR(dev) & mask);
18480
 
+}
18481
 
+
18482
 
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
18483
 
+{
18484
 
+       struct md_list_head *tmp;
18485
 
+       mdk_rdev_t *rdev;
18486
 
+
18487
 
+       ITERATE_RDEV(mddev,rdev,tmp)
18488
 
+               if (dev_unit(rdev->dev) == dev_unit(dev))
18489
 
+                       return rdev;
18490
 
+
18491
 
+       return NULL;
18492
 
+}
18493
 
+
18494
 
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
18495
 
+{
18496
 
+       struct md_list_head *tmp;
18497
 
+       mdk_rdev_t *rdev;
18498
 
+
18499
 
+       ITERATE_RDEV(mddev1,rdev,tmp)
18500
 
+               if (match_dev_unit(mddev2, rdev->dev))
18501
 
+                       return 1;
18502
 
+
18503
 
+       return 0;
18504
 
+}
18505
 
+
18506
 
+
18507
 
+static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
18508
 
+{
18509
 
+       mdk_rdev_t *same_pdev;
18510
 
+
18511
 
+       if (rdev->mddev) {
18512
 
+               MD_BUG();
18513
 
+               return;
18514
 
+       }
18515
 
+
18516
 
+       same_pdev = match_dev_unit(mddev, rdev->dev);
18517
 
+       if (same_pdev)
18518
 
+               LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. True\n"
18519
 
+                           "     protection against single-disk failure might be compromised.\n",
18520
 
+                           mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev));
18521
 
+               
18522
 
+       md_list_add(&rdev->same_set, &mddev->disks);
18523
 
+       rdev->mddev = mddev;
18524
 
+       mddev->nb_dev++;
18525
 
+       if (rdev->sb && disk_active(&rdev->sb->this_disk))
18526
 
+               mddev->nr_raid_disks++;
18527
 
+       LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
18528
 
+}
18529
 
+
18530
 
+static void unbind_rdev_from_array (mdk_rdev_t * rdev)
18531
 
+{
18532
 
+       if (!rdev->mddev) {
18533
 
+               MD_BUG();
18534
 
+               return;
18535
 
+       }
18536
 
+       md_list_del(&rdev->same_set);
18537
 
+       MD_INIT_LIST_HEAD(&rdev->same_set);
18538
 
+       rdev->mddev->nb_dev--;
18539
 
+       if (rdev->sb && disk_active(&rdev->sb->this_disk))
18540
 
+               rdev->mddev->nr_raid_disks--;
18541
 
+       LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
18542
 
+       rdev->mddev = NULL;
18543
 
+}
18544
 
+
18545
 
+
18546
 
+/*
18547
 
+ * Function: evms_md_export_rdev
18548
 
+ *     EVMS MD version of export_rdev()
18549
 
+ *     Discard this MD "extended" device
18550
 
+ */
18551
 
+static void evms_md_export_rdev (mdk_rdev_t * rdev)
18552
 
+{
18553
 
+       LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev));
18554
 
+       if (rdev->mddev)
18555
 
+               MD_BUG();
18556
 
+       free_disk_sb(rdev);
18557
 
+       md_list_del(&rdev->all);
18558
 
+       MD_INIT_LIST_HEAD(&rdev->all);
18559
 
+       if (rdev->pending.next != &rdev->pending) {
18560
 
+               LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev));
18561
 
+               md_list_del(&rdev->pending);
18562
 
+               MD_INIT_LIST_HEAD(&rdev->pending);
18563
 
+       }
18564
 
+       if (rdev->node) {
18565
 
+               LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev));
18566
 
+               if (cur_discover_list) {
18567
 
+                       LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__,
18568
 
+                               get_partition_name(rdev));
18569
 
+                       evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node);
18570
 
+               }
18571
 
+               DELETE(rdev->node);
18572
 
+               rdev->node = NULL;
18573
 
+       }
18574
 
+       rdev->dev = 0;
18575
 
+       rdev->faulty = 0;
18576
 
+       kfree(rdev);
18577
 
+}
18578
 
+
18579
 
+
18580
 
+static void kick_rdev_from_array (mdk_rdev_t * rdev)
18581
 
+{
18582
 
+       LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev));
18583
 
+       unbind_rdev_from_array(rdev);
18584
 
+       evms_md_export_rdev(rdev);
18585
 
+}
18586
 
+
18587
 
+static void export_array (mddev_t *mddev)
18588
 
+{
18589
 
+       struct md_list_head *tmp;
18590
 
+       mdk_rdev_t *rdev;
18591
 
+       mdp_super_t *sb = mddev->sb;
18592
 
+
18593
 
+       LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev));
18594
 
+       if (mddev->sb) {
18595
 
+               mddev->sb = NULL;
18596
 
+               free_page((unsigned long) sb);
18597
 
+       }
18598
 
+
18599
 
+       LOG_DEBUG("%s: removing all extended devices belong to md%d\n",__FUNCTION__,mdidx(mddev));
18600
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18601
 
+               if (!rdev->mddev) {
18602
 
+                       MD_BUG();
18603
 
+                       continue;
18604
 
+               }
18605
 
+               kick_rdev_from_array(rdev);
18606
 
+       }
18607
 
+       if (mddev->nb_dev)
18608
 
+               MD_BUG();
18609
 
+}
18610
 
+
18611
 
+static void free_mddev (mddev_t *mddev)
18612
 
+{
18613
 
+       if (!mddev) {
18614
 
+               MD_BUG();
18615
 
+               return;
18616
 
+       }
18617
 
+
18618
 
+       export_array(mddev);
18619
 
+       evms_md_size[mdidx(mddev)] = 0;
18620
 
+
18621
 
+
18622
 
+       /*
18623
 
+        * Make sure nobody else is using this mddev
18624
 
+        * (careful, we rely on the global kernel lock here)
18625
 
+        */
18626
 
+       while (md_atomic_read(&mddev->resync_sem.count) != 1)
18627
 
+               schedule();
18628
 
+       while (md_atomic_read(&mddev->recovery_sem.count) != 1)
18629
 
+               schedule();
18630
 
+
18631
 
+       evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
18632
 
+       md_list_del(&mddev->all_mddevs);
18633
 
+       MD_INIT_LIST_HEAD(&mddev->all_mddevs);
18634
 
+       kfree(mddev);
18635
 
+       MOD_DEC_USE_COUNT;
18636
 
+}
18637
 
+
18638
 
+
18639
 
+static void print_desc(mdp_disk_t *desc)
18640
 
+{
18641
 
+       printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number,
18642
 
+               desc->raid_disk,desc->state);
18643
 
+}
18644
 
+
18645
 
+static void print_sb(mdp_super_t *sb)
18646
 
+{
18647
 
+       int i;
18648
 
+
18649
 
+       printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
18650
 
+               sb->major_version, sb->minor_version, sb->patch_version,
18651
 
+               sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
18652
 
+               sb->ctime);
18653
 
+       printk("    L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
18654
 
+               sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
18655
 
+               sb->layout, sb->chunk_size);
18656
 
+       printk("    UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n",
18657
 
+               sb->utime, sb->state, sb->active_disks, sb->working_disks,
18658
 
+               sb->failed_disks, sb->spare_disks,
18659
 
+               sb->sb_csum, sb->events_lo);
18660
 
+
18661
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
18662
 
+               mdp_disk_t *desc;
18663
 
+
18664
 
+               desc = sb->disks + i;
18665
 
+               if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
18666
 
+                       printk("     D %2d: ", i);
18667
 
+                       print_desc(desc);
18668
 
+               }
18669
 
+       }
18670
 
+       printk("    THIS: ");
18671
 
+       print_desc(&sb->this_disk);
18672
 
+
18673
 
+}
18674
 
+
18675
 
+static void print_rdev(mdk_rdev_t *rdev)
18676
 
+{
18677
 
+       printk("rdev %s: SZ:%08ld F:%d DN:%d ",
18678
 
+               get_partition_name(rdev),
18679
 
+               rdev->size, rdev->faulty, rdev->desc_nr);
18680
 
+       if (rdev->sb) {
18681
 
+               printk("rdev superblock:\n");
18682
 
+               print_sb(rdev->sb);
18683
 
+       } else
18684
 
+               printk("no rdev superblock!\n");
18685
 
+}
18686
 
+
18687
 
+void evms_md_print_devices (void)
18688
 
+{
18689
 
+       struct md_list_head *tmp, *tmp2;
18690
 
+       mdk_rdev_t *rdev;
18691
 
+       mddev_t *mddev;
18692
 
+
18693
 
+       printk("\n");
18694
 
+       printk(":       **********************************\n");
18695
 
+       printk(":       * <COMPLETE RAID STATE PRINTOUT> *\n");
18696
 
+       printk(":       **********************************\n");
18697
 
+       ITERATE_MDDEV(mddev,tmp) {
18698
 
+               printk("md%d: ", mdidx(mddev));
18699
 
+
18700
 
+               ITERATE_RDEV(mddev,rdev,tmp2)
18701
 
+                       printk("<%s>", get_partition_name(rdev));
18702
 
+
18703
 
+               if (mddev->sb) {
18704
 
+                       printk(" array superblock:\n");
18705
 
+                       print_sb(mddev->sb);
18706
 
+               } else
18707
 
+                       printk(" no array superblock.\n");
18708
 
+
18709
 
+               ITERATE_RDEV(mddev,rdev,tmp2)
18710
 
+                       print_rdev(rdev);
18711
 
+       }
18712
 
+       printk(":       **********************************\n");
18713
 
+       printk("\n");
18714
 
+}
18715
 
+
18716
 
+static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
18717
 
+{
18718
 
+       int ret;
18719
 
+       mdp_super_t *tmp1, *tmp2;
18720
 
+
18721
 
+       tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
18722
 
+       tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
18723
 
+
18724
 
+       if (!tmp1 || !tmp2) {
18725
 
+               ret = 0;
18726
 
+               printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
18727
 
+               goto abort;
18728
 
+       }
18729
 
+
18730
 
+       *tmp1 = *sb1;
18731
 
+       *tmp2 = *sb2;
18732
 
+
18733
 
+       /*
18734
 
+        * nr_disks is not constant
18735
 
+        */
18736
 
+       tmp1->nr_disks = 0;
18737
 
+       tmp2->nr_disks = 0;
18738
 
+
18739
 
+       if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
18740
 
+               ret = 0;
18741
 
+       else
18742
 
+               ret = 1;
18743
 
+
18744
 
+abort:
18745
 
+       if (tmp1)
18746
 
+               kfree(tmp1);
18747
 
+       if (tmp2)
18748
 
+               kfree(tmp2);
18749
 
+
18750
 
+       return ret;
18751
 
+}
18752
 
+
18753
 
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
18754
 
+{
18755
 
+       if (    (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
18756
 
+               (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
18757
 
+               (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
18758
 
+               (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
18759
 
+
18760
 
+               return 1;
18761
 
+
18762
 
+       return 0;
18763
 
+}
18764
 
+
18765
 
+/*
18766
 
+ * Function: evms_md_find_rdev_all
18767
 
+ *     EVMS MD version of find_rdev_all() above
18768
 
+ *     Search entire all_raid_disks for "node"
18769
 
+ *     Return the MD "extended" device if found.
18770
 
+ */
18771
 
+static mdk_rdev_t * evms_md_find_rdev_all (evms_logical_node_t *node)
18772
 
+{
18773
 
+       struct md_list_head *tmp;
18774
 
+       mdk_rdev_t *rdev;
18775
 
+
18776
 
+       tmp = all_raid_disks.next;
18777
 
+       while (tmp != &all_raid_disks) {
18778
 
+               rdev = md_list_entry(tmp, mdk_rdev_t, all);
18779
 
+               if (rdev->node == node)
18780
 
+                       return rdev;
18781
 
+               tmp = tmp->next;
18782
 
+       }
18783
 
+       return NULL;
18784
 
+}
18785
 
+
18786
 
+
18787
 
+/*
18788
 
+ * Function: evms_md_write_disk_sb
18789
 
+ *     EVMS MD version of write_disk_sb
18790
 
+ */
18791
 
+static int evms_md_write_disk_sb(mdk_rdev_t * rdev)
18792
 
+{
18793
 
+       unsigned long size;
18794
 
+       u_int64_t sb_offset_in_sectors;
18795
 
+
18796
 
+       if (!rdev->sb) {
18797
 
+               MD_BUG();
18798
 
+               return 1;
18799
 
+       }
18800
 
+       if (rdev->faulty) {
18801
 
+               MD_BUG();
18802
 
+               return 1;
18803
 
+       }
18804
 
+       if (rdev->sb->md_magic != MD_SB_MAGIC) {
18805
 
+               MD_BUG();
18806
 
+               return 1;
18807
 
+       }
18808
 
+
18809
 
+       sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1);
18810
 
+       if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) {
18811
 
+               LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n",
18812
 
+                          get_partition_name(rdev),
18813
 
+                          rdev->sb_offset,
18814
 
+                          (unsigned long)(sb_offset_in_sectors >> 1));
18815
 
+               goto skip;
18816
 
+       }
18817
 
+       /*
18818
 
+        * If the disk went offline meanwhile and it's just a spare, then
18819
 
+        * its size has changed to zero silently, and the MD code does
18820
 
+        * not yet know that it's faulty.
18821
 
+        */
18822
 
+       size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1);
18823
 
+       if (size != rdev->size) {
18824
 
+               LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n",
18825
 
+                          get_partition_name(rdev), rdev->size, size);
18826
 
+               goto skip;
18827
 
+       }
18828
 
+
18829
 
+       LOG_DETAILS("(write) %s's sb offset: %Lu\n",get_partition_name(rdev), sb_offset_in_sectors);
18830
 
+
18831
 
+       INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb);
18832
 
+
18833
 
+skip:
18834
 
+       return 0;
18835
 
+}
18836
 
+
18837
 
+static int evms_md_sync_sbs(mddev_t * mddev)
18838
 
+{
18839
 
+       mdk_rdev_t *rdev;
18840
 
+       struct md_list_head *tmp;
18841
 
+       mdp_disk_t * disk;
18842
 
+
18843
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18844
 
+               if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
18845
 
+                       continue;
18846
 
+                       
18847
 
+               /* copy everything from the master */
18848
 
+               *rdev->sb = *mddev->sb;
18849
 
+               
18850
 
+               /* this_disk is unique, copy it from the master */
18851
 
+//             rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr];
18852
 
+               // use the SB disk array since if update occurred on normal shutdown
18853
 
+               // the rdevs may be out of date.
18854
 
+               disk = evms_md_find_disk(mddev, rdev->dev);
18855
 
+               if (disk) {
18856
 
+                       rdev->sb->this_disk = *disk;
18857
 
+               }
18858
 
+               
18859
 
+               rdev->sb->sb_csum = calc_sb_csum(rdev->sb);
18860
 
+       }
18861
 
+       return 0;
18862
 
+}
18863
 
+
18864
 
+int evms_md_update_sb_sync(mddev_t * mddev)
18865
 
+{
18866
 
+       mdk_rdev_t *rdev;
18867
 
+       struct md_list_head *tmp;
18868
 
+
18869
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18870
 
+               if (rdev->virtual_spare || rdev->faulty || rdev->alias_device)
18871
 
+                       continue;
18872
 
+                       
18873
 
+               /* found first good device, so read the new SB */
18874
 
+               if (!evms_md_read_disk_sb(rdev)){
18875
 
+                       /* this_disk is unique, copy it from the master */
18876
 
+                       if (rdev->sb->md_magic == MD_SB_MAGIC) {
18877
 
+                               *mddev->sb = *rdev->sb;
18878
 
+                               mddev->sb->state |= 1 << MD_SB_CLEAN;
18879
 
+                               evms_md_update_sb(mddev);
18880
 
+                               break;
18881
 
+                       }
18882
 
+               }
18883
 
+               
18884
 
+       }
18885
 
+       return 0;
18886
 
+
18887
 
+}
18888
 
+int evms_md_update_sb(mddev_t * mddev)
18889
 
+{
18890
 
+       int err, count = 100;
18891
 
+       struct md_list_head *tmp;
18892
 
+       mdk_rdev_t *rdev;
18893
 
+
18894
 
+
18895
 
+repeat:
18896
 
+       mddev->sb->utime = CURRENT_TIME;
18897
 
+       if ((++mddev->sb->events_lo)==0)
18898
 
+               ++mddev->sb->events_hi;
18899
 
+
18900
 
+       if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
18901
 
+               /*
18902
 
+                * oops, this 64-bit counter should never wrap.
18903
 
+                * Either we are in around ~1 trillion A.C., assuming
18904
 
+                * 1 reboot per second, or we have a bug:
18905
 
+                */
18906
 
+               MD_BUG();
18907
 
+               mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
18908
 
+       }
18909
 
+       evms_md_sync_sbs(mddev);
18910
 
+
18911
 
+       /*
18912
 
+        * do not write anything to disk if using
18913
 
+        * nonpersistent superblocks
18914
 
+        */
18915
 
+       if (mddev->sb->not_persistent)
18916
 
+               return 0;
18917
 
+
18918
 
+       LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev));
18919
 
+
18920
 
+       err = 0;
18921
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
18922
 
+               if (!rdev->virtual_spare && !rdev->faulty && !rdev->alias_device) {
18923
 
+                       LOG_DETAILS(" %s [events: %x]",
18924
 
+                               get_partition_name(rdev),
18925
 
+                               rdev->sb->events_lo);
18926
 
+                       err += evms_md_write_disk_sb(rdev);
18927
 
+               } else {
18928
 
+                       if (rdev->faulty)
18929
 
+                               LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev));
18930
 
+                       if (rdev->alias_device)
18931
 
+                               LOG_DETAILS(" skipping alias %s\n", get_partition_name(rdev));
18932
 
+                       if (rdev->virtual_spare)
18933
 
+                               LOG_DETAILS(" skipping virtual spare.\n");
18934
 
+               }
18935
 
+       }
18936
 
+       if (err) {
18937
 
+               if (--count) {
18938
 
+                       LOG_WARNING("errors occurred during superblock update, repeating\n");
18939
 
+                       goto repeat;
18940
 
+               }
18941
 
+               LOG_ERROR("excessive errors occurred during superblock update, exiting\n");
18942
 
+       }
18943
 
+       return 0;
18944
 
+}
18945
 
+
18946
 
+/*
18947
 
+ * Function: evms_md_import_device
18948
 
+ *     Insure that node is not yet imported.
18949
 
+ *     Read and validate the MD super block on this device
18950
 
+ *     Add to the global MD "extended" devices list (all_raid_disks)
18951
 
+ *
18952
 
+ */
18953
 
+static int evms_md_import_device (evms_logical_node_t **discover_list,
18954
 
+                                 evms_logical_node_t *node,
18955
 
+                                 int on_disk)
18956
 
+{
18957
 
+       int err;
18958
 
+       mdk_rdev_t *rdev;
18959
 
+
18960
 
+       LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node));
18961
 
+
18962
 
+       if (evms_md_find_rdev_all(node)) {
18963
 
+               LOG_DEBUG("%s exists\n", evms_md_partition_name(node));
18964
 
+               return -EEXIST;
18965
 
+       }
18966
 
+
18967
 
+       rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
18968
 
+       if (!rdev) {
18969
 
+               LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node));
18970
 
+               return -ENOMEM;
18971
 
+       }
18972
 
+       memset(rdev, 0, sizeof(*rdev));
18973
 
+
18974
 
+       if ((err = alloc_disk_sb(rdev)))
18975
 
+               goto abort_free;
18976
 
+
18977
 
+       rdev->node = node; /* set this for evms_md_read_disk_sb() */
18978
 
+       
18979
 
+       rdev->desc_nr = -1;
18980
 
+       rdev->faulty = 0;
18981
 
+
18982
 
+       if (!node->total_vsectors) {
18983
 
+               LOG_ERROR("%s has zero size, marking faulty!\n", evms_md_partition_name(node));
18984
 
+               err = -EINVAL;
18985
 
+               goto abort_free;
18986
 
+       }
18987
 
+
18988
 
+       if (on_disk) {
18989
 
+               if ((err = evms_md_read_disk_sb(rdev))) {
18990
 
+                       LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node));
18991
 
+                       goto abort_free;
18992
 
+               }
18993
 
+               if ((err = check_disk_sb(rdev))) {
18994
 
+                       LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node));
18995
 
+                       goto abort_free;
18996
 
+               }
18997
 
+               if (rdev->sb->level != -4) {
18998
 
+                       rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
18999
 
+                                               rdev->sb->this_disk.minor);
19000
 
+                       rdev->desc_nr = rdev->sb->this_disk.number;
19001
 
+               } else {
19002
 
+                       rdev->old_dev = MKDEV(0, 0);
19003
 
+                       rdev->desc_nr = -1;
19004
 
+               }
19005
 
+               rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
19006
 
+               LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr);
19007
 
+       }
19008
 
+       md_list_add(&rdev->all, &all_raid_disks);
19009
 
+       MD_INIT_LIST_HEAD(&rdev->pending);
19010
 
+
19011
 
+       if (rdev->faulty && rdev->sb)
19012
 
+               free_disk_sb(rdev);
19013
 
+
19014
 
+       return 0;
19015
 
+
19016
 
+abort_free:
19017
 
+       if (rdev->sb) {
19018
 
+               free_disk_sb(rdev);
19019
 
+       }
19020
 
+       kfree(rdev);
19021
 
+       return err;
19022
 
+}
19023
 
+
19024
 
+
19025
 
+
19026
 
+/*
19027
 
+ * Function: evms_md_analyze_sbs
19028
 
+ *     EVMS MD version of analyze_sbs()
19029
 
+ */
19030
 
+static int evms_md_analyze_sbs (mddev_t * mddev)
19031
 
+{
19032
 
+       int out_of_date = 0, i;
19033
 
+       struct md_list_head *tmp, *tmp2;
19034
 
+       mdk_rdev_t *rdev, *rdev2, *freshest;
19035
 
+       mdp_super_t *sb;
19036
 
+
19037
 
+       LOG_ENTRY_EXIT("Analyzing all superblocks...\n");
19038
 
+       /*
19039
 
+        * Verify the RAID superblock on each real device
19040
 
+        */
19041
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19042
 
+               if (rdev->faulty) {
19043
 
+                       MD_BUG();
19044
 
+                       goto abort;
19045
 
+               }
19046
 
+               if (!rdev->sb) {
19047
 
+                       MD_BUG();
19048
 
+                       goto abort;
19049
 
+               }
19050
 
+               if (check_disk_sb(rdev))
19051
 
+                       goto abort;
19052
 
+       }
19053
 
+
19054
 
+       /*
19055
 
+        * The superblock constant part has to be the same
19056
 
+        * for all disks in the array.
19057
 
+        */
19058
 
+       sb = NULL;
19059
 
+
19060
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19061
 
+               if (!sb) {
19062
 
+                       sb = rdev->sb;
19063
 
+                       continue;
19064
 
+               }
19065
 
+               if (!sb_equal(sb, rdev->sb)) {
19066
 
+                       LOG_WARNING("kick out %s\n",get_partition_name(rdev));
19067
 
+                       kick_rdev_from_array(rdev);
19068
 
+                       continue;
19069
 
+               }
19070
 
+       }
19071
 
+
19072
 
+       /*
19073
 
+        * OK, we have all disks and the array is ready to run. Let's
19074
 
+        * find the freshest superblock, that one will be the superblock
19075
 
+        * that represents the whole array.
19076
 
+        */
19077
 
+       if (!mddev->sb)
19078
 
+               if (alloc_array_sb(mddev))
19079
 
+                       goto abort;
19080
 
+       sb = mddev->sb;
19081
 
+       freshest = NULL;
19082
 
+
19083
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19084
 
+               __u64 ev1, ev2;
19085
 
+               /*
19086
 
+                * if the checksum is invalid, use the superblock
19087
 
+                * only as a last resort. (decrease it's age by
19088
 
+                * one event)
19089
 
+                */
19090
 
+               if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
19091
 
+                       if (rdev->sb->events_lo || rdev->sb->events_hi)
19092
 
+                               if ((rdev->sb->events_lo--)==0)
19093
 
+                                       rdev->sb->events_hi--;
19094
 
+               }
19095
 
+               LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo);
19096
 
+
19097
 
+               if (!freshest) {
19098
 
+                       freshest = rdev;
19099
 
+                       continue;
19100
 
+               }
19101
 
+               /*
19102
 
+                * Find the newest superblock version
19103
 
+                */
19104
 
+               ev1 = md_event(rdev->sb);
19105
 
+               ev2 = md_event(freshest->sb);
19106
 
+               if (ev1 != ev2) {
19107
 
+                       out_of_date = 1;
19108
 
+                       if (ev1 > ev2)
19109
 
+                               freshest = rdev;
19110
 
+               }
19111
 
+       }
19112
 
+       if (out_of_date) {
19113
 
+               LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest));
19114
 
+       }
19115
 
+       memcpy (sb, freshest->sb, sizeof(*sb));
19116
 
+
19117
 
+       /*
19118
 
+        * at this point we have picked the 'best' superblock
19119
 
+        * from all available superblocks.
19120
 
+        * now we validate this superblock and kick out possibly
19121
 
+        * failed disks.
19122
 
+        */
19123
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19124
 
+               /*
19125
 
+                * Kick all non-fresh devices
19126
 
+                */
19127
 
+               __u64 ev1, ev2;
19128
 
+               ev1 = md_event(rdev->sb);
19129
 
+               ev2 = md_event(sb);
19130
 
+               if (ev1 < ev2) {
19131
 
+                       if (ev1) {
19132
 
+                               LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev));
19133
 
+                               kick_rdev_from_array(rdev);
19134
 
+                       continue;
19135
 
+                       } else {
19136
 
+                               LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev));
19137
 
+                       }
19138
 
+               }
19139
 
+       }
19140
 
+
19141
 
+       /*
19142
 
+        * Remove unavailable and faulty devices ...
19143
 
+        *
19144
 
+        * note that if an array becomes completely unrunnable due to
19145
 
+        * missing devices, we do not write the superblock back, so the
19146
 
+        * administrator has a chance to fix things up. The removal thus
19147
 
+        * only happens if it's nonfatal to the contents of the array.
19148
 
+        */
19149
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
19150
 
+               int found;
19151
 
+               mdp_disk_t *desc;
19152
 
+
19153
 
+               desc = sb->disks + i;
19154
 
+
19155
 
+               /*
19156
 
+                * We kick faulty devices/descriptors immediately.
19157
 
+                *
19158
 
+                * Note: multipath devices are a special case.  Since we
19159
 
+                * were able to read the superblock on the path, we don't
19160
 
+                * care if it was previously marked as faulty, it's up now
19161
 
+                * so enable it.
19162
 
+                */
19163
 
+               if (disk_faulty(desc) && mddev->sb->level != -4) {
19164
 
+                       found = 0;
19165
 
+                       ITERATE_RDEV(mddev,rdev,tmp) {
19166
 
+                               if (rdev->desc_nr != desc->number)
19167
 
+                                       continue;
19168
 
+                               LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev));
19169
 
+                               kick_rdev_from_array(rdev);
19170
 
+                               found = 1;
19171
 
+                               break;
19172
 
+                       }
19173
 
+                       if (!found) {
19174
 
+                               LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n",
19175
 
+                                           __FUNCTION__ ,mdidx(mddev), desc->number);
19176
 
+                       }
19177
 
+                       /*
19178
 
+                        * Don't call remove_descriptor(),
19179
 
+                        * let the administrator remove it from the user-land */
19180
 
+                       /* remove_descriptor(desc, sb); */
19181
 
+                       continue;
19182
 
+               } else if (disk_faulty(desc)) {
19183
 
+                       /*
19184
 
+                        * multipath entry marked as faulty, unfaulty it
19185
 
+                        */
19186
 
+                       kdev_t dev;
19187
 
+
19188
 
+                       dev = MKDEV(desc->major, desc->minor);
19189
 
+
19190
 
+                       rdev = evms_md_find_rdev(mddev, dev);
19191
 
+                       if (rdev)
19192
 
+                               mark_disk_spare(desc);
19193
 
+                       else {
19194
 
+                               LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n",
19195
 
+                                           __FUNCTION__ ,mdidx(mddev), desc->number);
19196
 
+                               /*
19197
 
+                                * Don't call remove_descriptor(),
19198
 
+                                * let the administrator remove it from the user-land */
19199
 
+                               /* remove_descriptor(desc, sb); */
19200
 
+                       }
19201
 
+               }
19202
 
+
19203
 
+               /*
19204
 
+                * Is this device present in the rdev ring?
19205
 
+                */
19206
 
+               found = 0;
19207
 
+               ITERATE_RDEV(mddev,rdev,tmp) {
19208
 
+                       /*
19209
 
+                        * Multi-path IO special-case: since we have no
19210
 
+                        * this_disk descriptor at auto-detect time,
19211
 
+                        * we cannot check rdev->number.
19212
 
+                        * We can check the device though.
19213
 
+                        */
19214
 
+                       if ((sb->level == -4) && (rdev->dev ==
19215
 
+                                       MKDEV(desc->major,desc->minor))) {
19216
 
+                               found = 1;
19217
 
+                               break;
19218
 
+                       }
19219
 
+                       if (rdev->desc_nr == desc->number) {
19220
 
+                               found = 1;
19221
 
+                               break;
19222
 
+                       }
19223
 
+               }
19224
 
+               if (found)
19225
 
+                       continue;
19226
 
+
19227
 
+               LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n",
19228
 
+                           mdidx(mddev), desc->number);
19229
 
+               /*
19230
 
+                * Don't call remove_descriptor(),
19231
 
+                * let the administrator remove it from the user-land */
19232
 
+               /* remove_descriptor(desc, sb); */
19233
 
+       }
19234
 
+
19235
 
+       /*
19236
 
+        * Kick all rdevs that are not in the
19237
 
+        * descriptor array:
19238
 
+        */
19239
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19240
 
+               if (rdev->desc_nr == -1)
19241
 
+                       kick_rdev_from_array(rdev);
19242
 
+       }
19243
 
+
19244
 
+       /*
19245
 
+        * Do a final reality check.
19246
 
+        */
19247
 
+       if (mddev->sb->level != -4) {
19248
 
+               ITERATE_RDEV(mddev,rdev,tmp) {
19249
 
+                       if (rdev->desc_nr == -1) {
19250
 
+                               MD_BUG();
19251
 
+                               goto abort;
19252
 
+                       }
19253
 
+                       /*
19254
 
+                        * is the desc_nr unique?
19255
 
+                        */
19256
 
+                       ITERATE_RDEV(mddev,rdev2,tmp2) {
19257
 
+                               if ((rdev2 != rdev) &&
19258
 
+                                               (rdev2->desc_nr == rdev->desc_nr)) {
19259
 
+                                       MD_BUG();
19260
 
+                                       goto abort;
19261
 
+                               }
19262
 
+                       }
19263
 
+               }
19264
 
+       }
19265
 
+
19266
 
+#define OLD_VERSION KERN_ALERT \
19267
 
+"md%d: unsupported raid array version %d.%d.%d\n"
19268
 
+
19269
 
+#define NOT_CLEAN_IGNORE KERN_ERR \
19270
 
+"md%d: raid array is not clean -- starting background reconstruction\n"
19271
 
+
19272
 
+       /*
19273
 
+        * Check if we can support this RAID array
19274
 
+        */
19275
 
+       if (sb->major_version != MD_MAJOR_VERSION ||
19276
 
+                       sb->minor_version > MD_MINOR_VERSION) {
19277
 
+
19278
 
+               LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n",
19279
 
+                          mdidx(mddev),
19280
 
+                          sb->major_version,
19281
 
+                          sb->minor_version,
19282
 
+                          sb->patch_version);
19283
 
+               goto abort;
19284
 
+       }
19285
 
+
19286
 
+       if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
19287
 
+                       (sb->level == 4) || (sb->level == 5)))
19288
 
+               LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n",
19289
 
+                           mdidx(mddev), sb->level);
19290
 
+
19291
 
+       LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n");
19292
 
+       return 0;
19293
 
+abort:
19294
 
+       LOG_WARNING("ABORT analyze_sbs()!!!\n");
19295
 
+       return 1;
19296
 
+}
19297
 
+
19298
 
+
19299
 
+static int device_size_calculation (mddev_t * mddev)
19300
 
+{
19301
 
+       int data_disks = 0, persistent;
19302
 
+       //unsigned int readahead;
19303
 
+       mdp_super_t *sb = mddev->sb;
19304
 
+       struct md_list_head *tmp;
19305
 
+       mdk_rdev_t *rdev;
19306
 
+
19307
 
+       /*
19308
 
+        * Do device size calculation. Bail out if too small.
19309
 
+        * (we have to do this after having validated chunk_size,
19310
 
+        * because device size has to be modulo chunk_size)
19311
 
+        */
19312
 
+       persistent = !mddev->sb->not_persistent;
19313
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19314
 
+               if (rdev->faulty)
19315
 
+                       continue;
19316
 
+               if (rdev->size) {
19317
 
+                       MD_BUG();
19318
 
+                       continue;
19319
 
+               }
19320
 
+               rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent);
19321
 
+               if (rdev->size < sb->chunk_size / 1024) {
19322
 
+                       LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n",
19323
 
+                                  get_partition_name(rdev), rdev->size, sb->chunk_size / 1024);
19324
 
+                       return -EINVAL;
19325
 
+               }
19326
 
+       }
19327
 
+
19328
 
+       switch (sb->level) {
19329
 
+               case -4:
19330
 
+                       data_disks = 1;
19331
 
+                       break;
19332
 
+               case -3:
19333
 
+                       data_disks = 1;
19334
 
+                       break;
19335
 
+               case -2:
19336
 
+                       data_disks = 1;
19337
 
+                       break;
19338
 
+               case -1:
19339
 
+                       zoned_raid_size(mddev);
19340
 
+                       data_disks = 1;
19341
 
+                       break;
19342
 
+               case 0:
19343
 
+                       zoned_raid_size(mddev);
19344
 
+                       data_disks = sb->raid_disks;
19345
 
+                       break;
19346
 
+               case 1:
19347
 
+                       data_disks = 1;
19348
 
+                       break;
19349
 
+               case 4:
19350
 
+               case 5:
19351
 
+                       data_disks = sb->raid_disks-1;
19352
 
+                       break;
19353
 
+               default:
19354
 
+                       LOG_ERROR("[md%d] unkown level %d\n", mdidx(mddev), sb->level);
19355
 
+                       goto abort;
19356
 
+       }
19357
 
+       if (!evms_md_size[mdidx(mddev)])
19358
 
+               evms_md_size[mdidx(mddev)] = sb->size * data_disks;
19359
 
+
19360
 
+       return 0;
19361
 
+abort:
19362
 
+       return 1;
19363
 
+}
19364
 
+
19365
 
+
19366
 
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
19367
 
+"too big chunk_size: %d > %d\n"
19368
 
+
19369
 
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
19370
 
+"too small chunk_size: %d < %ld\n"
19371
 
+
19372
 
+#define BAD_CHUNKSIZE KERN_ERR \
19373
 
+"no chunksize specified, see 'man raidtab'\n"
19374
 
+
19375
 
+static int do_md_run (mddev_t * mddev)
19376
 
+{
19377
 
+       int pnum, err;
19378
 
+       int chunk_size;
19379
 
+       struct md_list_head *tmp;
19380
 
+       mdk_rdev_t *rdev;
19381
 
+
19382
 
+
19383
 
+       if (!mddev->nb_dev) {
19384
 
+               MD_BUG();
19385
 
+               return -EINVAL;
19386
 
+       }
19387
 
+
19388
 
+       if (mddev->pers)
19389
 
+               return -EBUSY;
19390
 
+
19391
 
+       /*
19392
 
+        * Resize disks to align partitions size on a given
19393
 
+        * chunk size.
19394
 
+        */
19395
 
+       evms_md_size[mdidx(mddev)] = 0;
19396
 
+
19397
 
+       /*
19398
 
+        * Analyze all RAID superblock(s)
19399
 
+        */
19400
 
+       if (evms_md_analyze_sbs(mddev)) {
19401
 
+               MD_BUG();
19402
 
+               return -EINVAL;
19403
 
+       }
19404
 
+
19405
 
+       chunk_size = mddev->sb->chunk_size;
19406
 
+       pnum = level_to_pers(mddev->sb->level);
19407
 
+
19408
 
+       mddev->param.chunk_size = chunk_size;
19409
 
+       mddev->param.personality = pnum;
19410
 
+
19411
 
+       if ((pnum != MULTIPATH) && (pnum != RAID1)) {
19412
 
+               if (!chunk_size) {
19413
 
+                       /*
19414
 
+                        * 'default chunksize' in the old md code used to
19415
 
+                        * be PAGE_SIZE, baaad.
19416
 
+                        * we abort here to be on the safe side. We dont
19417
 
+                        * want to continue the bad practice.
19418
 
+                        */
19419
 
+                       printk(BAD_CHUNKSIZE);
19420
 
+                       return -EINVAL;
19421
 
+               }
19422
 
+               if (chunk_size > MAX_CHUNK_SIZE) {
19423
 
+                       printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
19424
 
+                       return -EINVAL;
19425
 
+               }
19426
 
+               /*
19427
 
+                * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
19428
 
+                */
19429
 
+               if ( (1 << ffz(~chunk_size)) != chunk_size) {
19430
 
+                       MD_BUG();
19431
 
+                       return -EINVAL;
19432
 
+               }
19433
 
+               if (chunk_size < PAGE_SIZE) {
19434
 
+                       printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
19435
 
+                       return -EINVAL;
19436
 
+               }
19437
 
+       } else
19438
 
+               if (chunk_size)
19439
 
+                       printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
19440
 
+
19441
 
+       if (pnum >= MAX_PERSONALITY) {
19442
 
+               MD_BUG();
19443
 
+               return -EINVAL;
19444
 
+       }
19445
 
+       if (!pers[pnum])
19446
 
+       {
19447
 
+#ifdef CONFIG_KMOD
19448
 
+               char module_name[80];
19449
 
+               sprintf (module_name, "md-personality-%d", pnum);
19450
 
+               request_module (module_name);
19451
 
+               if (!pers[pnum])
19452
 
+#endif
19453
 
+               {
19454
 
+                       printk(KERN_ERR "personality %d is not loaded!\n",
19455
 
+                               pnum);
19456
 
+                       return -EINVAL;
19457
 
+               }
19458
 
+       }
19459
 
+       if (device_size_calculation(mddev))
19460
 
+               return -EINVAL;
19461
 
+
19462
 
+       /*
19463
 
+        * Drop all container device buffers, from now on
19464
 
+        * the only valid external interface is through the md
19465
 
+        * device.
19466
 
+        * Also find largest hardsector size
19467
 
+        */
19468
 
+       md_hardsect_sizes[mdidx(mddev)] = 512;
19469
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19470
 
+               if (rdev->faulty)
19471
 
+                       continue;
19472
 
+               invalidate_device(rdev->dev, 1);
19473
 
+/*             if (get_hardsect_size(rdev->dev)
19474
 
+                       > md_hardsect_sizes[mdidx(mddev)])
19475
 
+                       md_hardsect_sizes[mdidx(mddev)] =
19476
 
+                               get_hardsect_size(rdev->dev); */
19477
 
+               if (rdev->node->hardsector_size  > md_hardsect_sizes[mdidx(mddev)]) {
19478
 
+                       md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size;
19479
 
+               }
19480
 
+
19481
 
+       }
19482
 
+       md_blocksizes[mdidx(mddev)] = 1024;
19483
 
+       if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
19484
 
+               md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
19485
 
+
19486
 
+       mddev->pers = pers[pnum];
19487
 
+
19488
 
+       err = mddev->pers->run(mddev);
19489
 
+       if (err) {
19490
 
+               printk("pers->run() failed ...\n");
19491
 
+               mddev->pers = NULL;
19492
 
+               return -EINVAL;
19493
 
+       }
19494
 
+       mddev->sb->state &= ~(1 << MD_SB_CLEAN);
19495
 
+
19496
 
+       evms_md_update_sb(mddev);
19497
 
+
19498
 
+       mddev->flag &= ~EVMS_MD_INCOMPLETE; /* Clear incomplete flag */
19499
 
+
19500
 
+       return (0);
19501
 
+}
19502
 
+
19503
 
+#undef TOO_BIG_CHUNKSIZE
19504
 
+#undef BAD_CHUNKSIZE
19505
 
+
19506
 
+
19507
 
+#define OUT(x) do { err = (x); goto out; } while (0)
19508
 
+
19509
 
+
19510
 
+#define STILL_MOUNTED KERN_WARNING \
19511
 
+"md%d still mounted.\n"
19512
 
+#define        STILL_IN_USE \
19513
 
+"md%d still in use.\n"
19514
 
+
19515
 
+static int do_md_stop (mddev_t * mddev, int ro)
19516
 
+{
19517
 
+       int err = 0, resync_interrupted = 0;
19518
 
+       kdev_t dev = mddev_to_kdev(mddev);
19519
 
+
19520
 
+       if (atomic_read(&mddev->active)>1) {
19521
 
+               printk(STILL_IN_USE, mdidx(mddev));
19522
 
+               OUT(-EBUSY);
19523
 
+       }
19524
 
+
19525
 
+       if (mddev->pers) {
19526
 
+               /*
19527
 
+                * It is safe to call stop here, it only frees private
19528
 
+                * data. Also, it tells us if a device is unstoppable
19529
 
+                * (eg. resyncing is in progress)
19530
 
+                */
19531
 
+               if (mddev->pers->stop_resync)
19532
 
+                       if (mddev->pers->stop_resync(mddev))
19533
 
+                               resync_interrupted = 1;
19534
 
+
19535
 
+               if (mddev->recovery_running)
19536
 
+                       evms_cs_interrupt_thread(evms_md_recovery_thread);
19537
 
+
19538
 
+               /*
19539
 
+                * This synchronizes with signal delivery to the
19540
 
+                * resync or reconstruction thread. It also nicely
19541
 
+                * hangs the process if some reconstruction has not
19542
 
+                * finished.
19543
 
+                */
19544
 
+               down(&mddev->recovery_sem);
19545
 
+               up(&mddev->recovery_sem);
19546
 
+
19547
 
+               invalidate_device(dev, 1);
19548
 
+
19549
 
+               if (ro) {
19550
 
+                       if (mddev->ro)
19551
 
+                               OUT(-ENXIO);
19552
 
+                       mddev->ro = 1;
19553
 
+               } else {
19554
 
+                       if (mddev->ro)
19555
 
+                               set_device_ro(dev, 0);
19556
 
+                       if (mddev->pers->stop(mddev)) {
19557
 
+                               if (mddev->ro)
19558
 
+                                       set_device_ro(dev, 1);
19559
 
+                               OUT(-EBUSY);
19560
 
+                       }
19561
 
+                       if (mddev->ro)
19562
 
+                               mddev->ro = 0;
19563
 
+               }
19564
 
+               if (mddev->sb) {
19565
 
+                       /*
19566
 
+                        * mark it clean only if there was no resync
19567
 
+                        * interrupted.
19568
 
+                        */
19569
 
+                       if (!mddev->recovery_running && !resync_interrupted) {
19570
 
+                               printk("marking sb clean...\n");
19571
 
+                               mddev->sb->state |= 1 << MD_SB_CLEAN;
19572
 
+                       }
19573
 
+                       evms_md_update_sb_sync(mddev);
19574
 
+               }
19575
 
+               if (ro)
19576
 
+                       set_device_ro(dev, 1);
19577
 
+       }
19578
 
+
19579
 
+       /*
19580
 
+        * Free resources if final stop
19581
 
+        */
19582
 
+       if (!ro) {
19583
 
+               printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
19584
 
+               free_mddev(mddev);
19585
 
+
19586
 
+       } else
19587
 
+               printk (KERN_INFO
19588
 
+                       "md%d switched to read-only mode.\n", mdidx(mddev));
19589
 
+out:
19590
 
+       return err;
19591
 
+}
19592
 
+
19593
 
+
19594
 
+static void evms_md_autorun_array (evms_logical_node_t ** discover_list, mddev_t *mddev)
19595
 
+{
19596
 
+       mdk_rdev_t *rdev;
19597
 
+       struct md_list_head *tmp;
19598
 
+       int err;
19599
 
+       uint flags = 0;
19600
 
+
19601
 
+       if (mddev->disks.prev == &mddev->disks) {
19602
 
+               MD_BUG();
19603
 
+               return;
19604
 
+       }
19605
 
+
19606
 
+       LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) );
19607
 
+
19608
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19609
 
+               LOG_DETAILS(" <%s>\n", get_partition_name(rdev));
19610
 
+       }
19611
 
+
19612
 
+       err = do_md_run (mddev);
19613
 
+       if (!err) {
19614
 
+               /*
19615
 
+                * remove all nodes consumed by this md device from the discover list
19616
 
+                */
19617
 
+               ITERATE_RDEV(mddev,rdev,tmp) {
19618
 
+                       LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev));
19619
 
+                       evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19620
 
+                       flags |= rdev->node->flags;
19621
 
+               }
19622
 
+               err = evms_md_create_logical_node(discover_list,mddev,flags);
19623
 
+               if (!err) {
19624
 
+                       exported_nodes++;
19625
 
+               }
19626
 
+       } else {
19627
 
+               LOG_WARNING("%s: cannot run array md%d\n",__FUNCTION__,mdidx(mddev));
19628
 
+               mddev->sb_dirty = 0;
19629
 
+               do_md_stop (mddev, 0);
19630
 
+       }
19631
 
+}
19632
 
+
19633
 
+/*
19634
 
+ * lets try to run arrays based on all disks that have arrived
19635
 
+ * until now. (those are in the ->pending list)
19636
 
+ *
19637
 
+ * the method: pick the first pending disk, collect all disks with
19638
 
+ * the same UUID, remove all from the pending list and put them into
19639
 
+ * the 'same_array' list. Then order this list based on superblock
19640
 
+ * update time (freshest comes first), kick out 'old' disks and
19641
 
+ * compare superblocks. If everything's fine then run it.
19642
 
+ *
19643
 
+ * If "unit" is allocated, then bump its reference count
19644
 
+ */
19645
 
+static void evms_md_autorun_devices (evms_logical_node_t **discover_list, kdev_t countdev)
19646
 
+{
19647
 
+       struct md_list_head candidates;
19648
 
+       struct md_list_head *tmp;
19649
 
+       mdk_rdev_t *rdev0, *rdev;
19650
 
+       mddev_t *mddev;
19651
 
+       kdev_t md_kdev;
19652
 
+
19653
 
+
19654
 
+       LOG_DETAILS("autorun ...\n");
19655
 
+       while (pending_raid_disks.next != &pending_raid_disks) {
19656
 
+               rdev0 = md_list_entry(pending_raid_disks.next,
19657
 
+                                        mdk_rdev_t, pending);
19658
 
+               LOG_DETAILS("considering %s ...\n",get_partition_name(rdev0));
19659
 
+               MD_INIT_LIST_HEAD(&candidates);
19660
 
+               ITERATE_RDEV_PENDING(rdev,tmp) {
19661
 
+                       if (uuid_equal(rdev0, rdev)) {
19662
 
+                               if (!sb_equal(rdev0->sb, rdev->sb)) {
19663
 
+                                       LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\
19664
 
+                                                   get_partition_name(rdev),get_partition_name(rdev0));
19665
 
+                                       continue;
19666
 
+                               }
19667
 
+                               LOG_DETAILS(" adding %s ...\n", get_partition_name(rdev));
19668
 
+                               md_list_del(&rdev->pending);
19669
 
+                               md_list_add(&rdev->pending, &candidates);
19670
 
+                       }
19671
 
+               }
19672
 
+
19673
 
+               /*
19674
 
+                * now we have a set of devices, with all of them having
19675
 
+                * mostly sane superblocks. It's time to allocate the
19676
 
+                * mddev.
19677
 
+                */
19678
 
+               md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
19679
 
+               mddev = kdev_to_mddev(md_kdev);
19680
 
+               if (mddev && (!(mddev->flag & EVMS_MD_INCOMPLETE))) {
19681
 
+                       LOG_DETAILS("md%d already running, cannot run %s\n",
19682
 
+                                  mdidx(mddev), get_partition_name(rdev0));
19683
 
+                       /*
19684
 
+                        * This is EVMS re-discovery!
19685
 
+                        * Remove all nodes consumed by this md device from the discover list
19686
 
+                        */
19687
 
+                       ITERATE_RDEV(mddev,rdev,tmp)
19688
 
+                               evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19689
 
+                       ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
19690
 
+                               evms_md_export_rdev(rdev);
19691
 
+                       continue;
19692
 
+               }
19693
 
+
19694
 
+               if (!mddev) {
19695
 
+                       mddev = alloc_mddev(md_kdev);
19696
 
+                       if (mddev == NULL) {
19697
 
+                               LOG_ERROR("cannot allocate memory for md drive.\n");
19698
 
+                               break;
19699
 
+                       }
19700
 
+                       LOG_DETAILS("created md%d\n", mdidx(mddev));
19701
 
+               } else {
19702
 
+                       LOG_DETAILS("found INCOMPLETE md%d\n", mdidx(mddev));
19703
 
+               }
19704
 
+
19705
 
+               if (md_kdev == countdev)
19706
 
+                       atomic_inc(&mddev->active);
19707
 
+
19708
 
+               ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
19709
 
+                       bind_rdev_to_array(rdev, mddev);
19710
 
+                       md_list_del(&rdev->pending);
19711
 
+                       MD_INIT_LIST_HEAD(&rdev->pending);
19712
 
+               }
19713
 
+
19714
 
+               if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) ||
19715
 
+                   (mddev->nb_dev == rdev0->sb->nr_disks)) {
19716
 
+                       evms_md_autorun_array(discover_list,mddev);
19717
 
+               } else {
19718
 
+                       mddev->flag |= EVMS_MD_INCOMPLETE;
19719
 
+                       LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n",
19720
 
+                                   mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks);
19721
 
+                       ITERATE_RDEV(mddev,rdev,tmp) {
19722
 
+                               evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
19723
 
+                       }
19724
 
+               }
19725
 
+       }
19726
 
+       LOG_DETAILS("... autorun DONE.\n");
19727
 
+}
19728
 
+
19729
 
+void evms_md_recover_arrays(void)
19730
 
+{
19731
 
+       if (!evms_md_recovery_thread) {
19732
 
+               MD_BUG();
19733
 
+               return;
19734
 
+       }
19735
 
+       evms_cs_wakeup_thread(evms_md_recovery_thread);
19736
 
+}
19737
 
+
19738
 
+int evms_md_error(
19739
 
+       mddev_t *mddev,
19740
 
+       evms_logical_node_t *node)
19741
 
+{
19742
 
+       mdk_rdev_t * rrdev;
19743
 
+
19744
 
+       LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n",
19745
 
+                  mdidx(mddev), node->name,
19746
 
+                  __builtin_return_address(0),__builtin_return_address(1),
19747
 
+                  __builtin_return_address(2),__builtin_return_address(3));
19748
 
+
19749
 
+       if (!mddev) {
19750
 
+               MD_BUG();
19751
 
+               return 0;
19752
 
+       }
19753
 
+       rrdev = evms_md_find_rdev_from_node(mddev, node);
19754
 
+       if (!rrdev || rrdev->faulty)
19755
 
+               return 0;
19756
 
+       if (!mddev->pers->error_handler
19757
 
+                       || mddev->pers->error_handler(mddev,node) <= 0) {
19758
 
+               free_disk_sb(rrdev);
19759
 
+               rrdev->faulty = 1;
19760
 
+       } else
19761
 
+               return 1;
19762
 
+       /*
19763
 
+        * if recovery was running, stop it now.
19764
 
+        */
19765
 
+       if (mddev->pers->stop_resync)
19766
 
+               mddev->pers->stop_resync(mddev);
19767
 
+       if (mddev->recovery_running)
19768
 
+               evms_cs_interrupt_thread(evms_md_recovery_thread);
19769
 
+       evms_md_recover_arrays();
19770
 
+
19771
 
+       return 0;
19772
 
+}
19773
 
+
19774
 
+int evms_register_md_personality (int pnum, mdk_personality_t *p)
19775
 
+{
19776
 
+       if (pnum >= MAX_PERSONALITY) {
19777
 
+               MD_BUG();
19778
 
+               return -EINVAL;
19779
 
+       }
19780
 
+
19781
 
+       if (pers[pnum]) {
19782
 
+               MD_BUG();
19783
 
+               return -EBUSY;
19784
 
+       }
19785
 
+
19786
 
+       pers[pnum] = p;
19787
 
+       LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum);
19788
 
+       return 0;
19789
 
+}
19790
 
+
19791
 
+int evms_unregister_md_personality (int pnum)
19792
 
+{
19793
 
+       if (pnum >= MAX_PERSONALITY) {
19794
 
+               MD_BUG();
19795
 
+               return -EINVAL;
19796
 
+       }
19797
 
+
19798
 
+       printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
19799
 
+       pers[pnum] = NULL;
19800
 
+       return 0;
19801
 
+}
19802
 
+
19803
 
+mdp_disk_t *evms_md_get_spare(mddev_t *mddev)
19804
 
+{
19805
 
+       mdp_super_t *sb = mddev->sb;
19806
 
+       mdp_disk_t *disk;
19807
 
+       mdk_rdev_t *rdev;
19808
 
+//     struct md_list_head *tmp;
19809
 
+       int i, j;
19810
 
+
19811
 
+       for (i = 0, j = 0; j < mddev->nb_dev; i++) {
19812
 
+                rdev = evms_md_find_rdev_nr(mddev, i);
19813
 
+               if (rdev == NULL)
19814
 
+                       continue;
19815
 
+               j++;
19816
 
+                if (rdev->faulty)
19817
 
+                       continue;
19818
 
+               if (!rdev->sb) {
19819
 
+                       if (!rdev->virtual_spare)
19820
 
+                               MD_BUG();
19821
 
+                       continue;
19822
 
+               }
19823
 
+               disk = &sb->disks[rdev->desc_nr];
19824
 
+               if (disk_faulty(disk)) {
19825
 
+                       MD_BUG();
19826
 
+                       continue;
19827
 
+               }
19828
 
+               if (disk_active(disk))
19829
 
+                       continue;
19830
 
+               return disk;
19831
 
+       }
19832
 
+       return NULL;
19833
 
+}
19834
 
+
19835
 
+static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev)
19836
 
+{
19837
 
+       mdp_super_t *sb = mddev->sb;
19838
 
+       mdp_disk_t *disk;
19839
 
+       int i;
19840
 
+
19841
 
+       for (i=0; i < MD_SB_DISKS; i++) {
19842
 
+               disk = &sb->disks[i];
19843
 
+               if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev)))
19844
 
+                       return disk;
19845
 
+       }
19846
 
+       return NULL;
19847
 
+}
19848
 
+
19849
 
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
19850
 
+void evms_md_sync_acct(
19851
 
+       kdev_t dev,
19852
 
+       unsigned long nr_sectors)
19853
 
+{
19854
 
+       unsigned int major = MAJOR(dev);
19855
 
+       unsigned int index;
19856
 
+
19857
 
+       index = disk_index(dev);
19858
 
+       if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
19859
 
+               return;
19860
 
+
19861
 
+       sync_io[major][index] += nr_sectors;
19862
 
+}
19863
 
+
19864
 
+static int is_mddev_idle(mddev_t *mddev)
19865
 
+{
19866
 
+       mdk_rdev_t * rdev;
19867
 
+       struct md_list_head *tmp;
19868
 
+       int idle;
19869
 
+       unsigned long curr_events;
19870
 
+
19871
 
+       idle = 1;
19872
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
19873
 
+               int major = MAJOR(rdev->dev);
19874
 
+               int idx = disk_index(rdev->dev);
19875
 
+
19876
 
+               if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
19877
 
+                       continue;
19878
 
+
19879
 
+               curr_events = kstat.dk_drive_rblk[major][idx] +
19880
 
+                                               kstat.dk_drive_wblk[major][idx] ;
19881
 
+               curr_events -= sync_io[major][idx];
19882
 
+               if ((curr_events - rdev->last_events) > 32) {
19883
 
+                       rdev->last_events = curr_events;
19884
 
+                       idle = 0;
19885
 
+               }
19886
 
+       }
19887
 
+       return idle;
19888
 
+}
19889
 
+
19890
 
+MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait);
19891
 
+
19892
 
+void evms_md_done_sync(mddev_t *mddev, int blocks, int ok)
19893
 
+{
19894
 
+       /* another "blocks" (512byte) blocks have been synced */
19895
 
+       atomic_sub(blocks, &mddev->recovery_active);
19896
 
+       wake_up(&mddev->recovery_wait);
19897
 
+       if (!ok) {
19898
 
+               // stop recovery, signal do_sync ....
19899
 
+       }
19900
 
+}
19901
 
+
19902
 
+#define SYNC_MARKS     10
19903
 
+#define        SYNC_MARK_STEP  (3*HZ)
19904
 
+int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
19905
 
+{
19906
 
+       mddev_t *mddev2;
19907
 
+       unsigned int max_sectors, currspeed,
19908
 
+               j, window, err, serialize;
19909
 
+       unsigned long mark[SYNC_MARKS];
19910
 
+       unsigned long mark_cnt[SYNC_MARKS];
19911
 
+       int last_mark,m;
19912
 
+       struct md_list_head *tmp;
19913
 
+       unsigned long last_check;
19914
 
+
19915
 
+
19916
 
+       err = down_interruptible(&mddev->resync_sem);
19917
 
+       if (err)
19918
 
+               goto out_nolock;
19919
 
+
19920
 
+recheck:
19921
 
+       serialize = 0;
19922
 
+       ITERATE_MDDEV(mddev2,tmp) {
19923
 
+               if (mddev2 == mddev)
19924
 
+                       continue;
19925
 
+               if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
19926
 
+                       LOG_DEFAULT("delaying resync of md%d until md%d "
19927
 
+                                  "has finished resync (they share one or more physical units)\n",
19928
 
+                                  mdidx(mddev), mdidx(mddev2));
19929
 
+                       serialize = 1;
19930
 
+                       break;
19931
 
+               }
19932
 
+       }
19933
 
+       if (serialize) {
19934
 
+               interruptible_sleep_on(&evms_resync_wait);
19935
 
+               if (md_signal_pending(current)) {
19936
 
+                       md_flush_signals();
19937
 
+                       err = -EINTR;
19938
 
+                       goto out;
19939
 
+               }
19940
 
+               goto recheck;
19941
 
+       }
19942
 
+
19943
 
+       mddev->curr_resync = 1;
19944
 
+
19945
 
+       max_sectors = mddev->sb->size<<1;
19946
 
+
19947
 
+       LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev));
19948
 
+       LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
19949
 
+                  sysctl_speed_limit_min);
19950
 
+       LOG_DEFAULT("using maximum available idle IO bandwith "
19951
 
+                  "(but not more than %d KB/sec) for reconstruction.\n",
19952
 
+                  sysctl_speed_limit_max);
19953
 
+
19954
 
+       /*
19955
 
+        * Resync has low priority.
19956
 
+        */
19957
 
+       current->nice = 19;
19958
 
+
19959
 
+       is_mddev_idle(mddev); /* this also initializes IO event counters */
19960
 
+       for (m = 0; m < SYNC_MARKS; m++) {
19961
 
+               mark[m] = jiffies;
19962
 
+               mark_cnt[m] = 0;
19963
 
+       }
19964
 
+       last_mark = 0;
19965
 
+       mddev->resync_mark = mark[last_mark];
19966
 
+       mddev->resync_mark_cnt = mark_cnt[last_mark];
19967
 
+
19968
 
+       /*
19969
 
+        * Tune reconstruction:
19970
 
+        */
19971
 
+       window = MAX_READAHEAD*(PAGE_SIZE/512);
19972
 
+       LOG_DEFAULT("using %dk window, over a total of %d blocks.\n",
19973
 
+                  window/2,max_sectors/2);
19974
 
+
19975
 
+       atomic_set(&mddev->recovery_active, 0);
19976
 
+       init_waitqueue_head(&mddev->recovery_wait);
19977
 
+       last_check = 0;
19978
 
+       for (j = 0; j < max_sectors;) {
19979
 
+               int sectors;
19980
 
+
19981
 
+               sectors = mddev->pers->sync_request(mddev, j);
19982
 
+
19983
 
+               if (sectors < 0) {
19984
 
+                       err = sectors;
19985
 
+                       goto out;
19986
 
+               }
19987
 
+               atomic_add(sectors, &mddev->recovery_active);
19988
 
+               j += sectors;
19989
 
+               mddev->curr_resync = j;
19990
 
+
19991
 
+               if (last_check + window > j)
19992
 
+                       continue;
19993
 
+
19994
 
+               last_check = j;
19995
 
+
19996
 
+               run_task_queue(&tq_disk);
19997
 
+
19998
 
+       repeat:
19999
 
+               if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
20000
 
+                       /* step marks */
20001
 
+                       int next = (last_mark+1) % SYNC_MARKS;
20002
 
+
20003
 
+                       mddev->resync_mark = mark[next];
20004
 
+                       mddev->resync_mark_cnt = mark_cnt[next];
20005
 
+                       mark[next] = jiffies;
20006
 
+                       mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
20007
 
+                       last_mark = next;
20008
 
+               }
20009
 
+
20010
 
+
20011
 
+               if (md_signal_pending(current)) {
20012
 
+                       /*
20013
 
+                        * got a signal, exit.
20014
 
+                        */
20015
 
+                       mddev->curr_resync = 0;
20016
 
+                       LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n");
20017
 
+                       md_flush_signals();
20018
 
+                       err = -EINTR;
20019
 
+                       goto out;
20020
 
+               }
20021
 
+
20022
 
+               /*
20023
 
+                * this loop exits only if either when we are slower than
20024
 
+                * the 'hard' speed limit, or the system was IO-idle for
20025
 
+                * a jiffy.
20026
 
+                * the system might be non-idle CPU-wise, but we only care
20027
 
+                * about not overloading the IO subsystem. (things like an
20028
 
+                * e2fsck being done on the RAID array should execute fast)
20029
 
+                */
20030
 
+               if (md_need_resched(current))
20031
 
+                       schedule();
20032
 
+
20033
 
+               currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
20034
 
+
20035
 
+               if (currspeed > sysctl_speed_limit_min) {
20036
 
+                       current->nice = 19;
20037
 
+
20038
 
+                       if ((currspeed > sysctl_speed_limit_max) ||
20039
 
+                                       !is_mddev_idle(mddev)) {
20040
 
+                               current->state = TASK_INTERRUPTIBLE;
20041
 
+                               md_schedule_timeout(HZ/4);
20042
 
+                               goto repeat;
20043
 
+                       }
20044
 
+               } else
20045
 
+                       current->nice = -20;
20046
 
+       }
20047
 
+       LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev));
20048
 
+       err = 0;
20049
 
+       /*
20050
 
+        * this also signals 'finished resyncing' to md_stop
20051
 
+        */
20052
 
+out:
20053
 
+       wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
20054
 
+       up(&mddev->resync_sem);
20055
 
+out_nolock:
20056
 
+       mddev->curr_resync = 0;
20057
 
+       wake_up(&evms_resync_wait);
20058
 
+       return err;
20059
 
+}
20060
 
+
20061
 
+
20062
 
+
20063
 
+/*
20064
 
+ * This is a kernel thread which syncs a spare disk with the active array
20065
 
+ *
20066
 
+ * the amount of foolproofing might seem to be a tad excessive, but an
20067
 
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
20068
 
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
20069
 
+ * i'm a bit nervous ;)
20070
 
+ */
20071
 
+void evms_md_do_recovery(void *data)
20072
 
+{
20073
 
+       int err;
20074
 
+       mddev_t *mddev;
20075
 
+       mdp_super_t *sb;
20076
 
+       mdp_disk_t *spare;
20077
 
+       struct md_list_head *tmp;
20078
 
+       unsigned long flags;
20079
 
+       evms_md_activate_spare_t *activate_spare;
20080
 
+
20081
 
+       LOG_DEFAULT("recovery thread got woken up ...\n");
20082
 
+restart:
20083
 
+       ITERATE_MDDEV(mddev,tmp) {
20084
 
+
20085
 
+               sb = mddev->sb;
20086
 
+               if (!sb)
20087
 
+                       continue;
20088
 
+               if (mddev->recovery_running)
20089
 
+                       continue;
20090
 
+               if (sb->active_disks == sb->raid_disks)
20091
 
+                       continue;
20092
 
+               if (!sb->spare_disks) {
20093
 
+                       LOG_ERROR(" [md%d] no spare disk to reconstruct array! "
20094
 
+                                  "-- continuing in degraded mode\n", mdidx(mddev));
20095
 
+                       continue;
20096
 
+               }
20097
 
+
20098
 
+               spare = NULL;
20099
 
+               activate_spare = NULL;
20100
 
+
20101
 
+               spin_lock_irqsave(&activate_spare_list_lock, flags);
20102
 
+               activate_spare = evms_activate_spare_list;
20103
 
+               if (activate_spare && (activate_spare->mddev == mddev)) {
20104
 
+                       spare = activate_spare->spare;
20105
 
+                       evms_activate_spare_list = activate_spare->next;
20106
 
+               }
20107
 
+               spin_unlock_irqrestore(&activate_spare_list_lock, flags);
20108
 
+
20109
 
+               if (!spare) {
20110
 
+                       /*
20111
 
+                        * now here we get the spare and resync it.
20112
 
+                        */
20113
 
+                       spare = evms_md_get_spare(mddev);
20114
 
+               }
20115
 
+               if (!spare)
20116
 
+                       continue;
20117
 
+
20118
 
+               LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n",
20119
 
+                          mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
20120
 
+               if (!mddev->pers->diskop)
20121
 
+                       continue;
20122
 
+
20123
 
+               if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
20124
 
+                       continue;
20125
 
+
20126
 
+               down(&mddev->recovery_sem);
20127
 
+               mddev->recovery_running = 1;
20128
 
+               err = evms_md_do_sync(mddev, spare);
20129
 
+               if (err == -EIO) {
20130
 
+                       LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n",
20131
 
+                                  mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
20132
 
+                       if (!disk_faulty(spare)) {
20133
 
+                               mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
20134
 
+                               mark_disk_faulty(spare);
20135
 
+                               mark_disk_nonsync(spare);
20136
 
+                               mark_disk_inactive(spare);
20137
 
+                               sb->spare_disks--;
20138
 
+                               sb->working_disks--;
20139
 
+                               sb->failed_disks++;
20140
 
+                       }
20141
 
+               } else
20142
 
+                       if (disk_faulty(spare))
20143
 
+                               mddev->pers->diskop(mddev, &spare,
20144
 
+                                               DISKOP_SPARE_INACTIVE);
20145
 
+               if (err == -EINTR || err == -ENOMEM) {
20146
 
+                       /*
20147
 
+                        * Recovery got interrupted, or ran out of mem ...
20148
 
+                        * signal back that we have finished using the array.
20149
 
+                        */
20150
 
+                       mddev->pers->diskop(mddev, &spare,
20151
 
+                                                        DISKOP_SPARE_INACTIVE);
20152
 
+                       up(&mddev->recovery_sem);
20153
 
+                       mddev->recovery_running = 0;
20154
 
+                       continue;
20155
 
+               } else {
20156
 
+                       mddev->recovery_running = 0;
20157
 
+                       up(&mddev->recovery_sem);
20158
 
+               }
20159
 
+               if (!disk_faulty(spare)) {
20160
 
+                       /*
20161
 
+                        * the SPARE_ACTIVE diskop possibly changes the
20162
 
+                        * pointer too
20163
 
+                        */
20164
 
+                       if (activate_spare)
20165
 
+                               mddev->pers->diskop(mddev, &spare, DISKOP_HOT_SPARE_ACTIVE);
20166
 
+                       else
20167
 
+                               mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
20168
 
+                       mark_disk_sync(spare);
20169
 
+                       mark_disk_active(spare);
20170
 
+                       sb->active_disks++;
20171
 
+                       sb->spare_disks--;
20172
 
+               }
20173
 
+               mddev->sb_dirty = 1;
20174
 
+               evms_md_update_sb(mddev);
20175
 
+               goto restart;
20176
 
+       }
20177
 
+       LOG_DEFAULT("recovery thread finished ...\n");
20178
 
+
20179
 
+}
20180
 
+
20181
 
+int evms_md_notify_reboot(struct notifier_block *this,
20182
 
+                                       unsigned long code, void *x)
20183
 
+{
20184
 
+       struct md_list_head *tmp;
20185
 
+       mddev_t *mddev;
20186
 
+
20187
 
+       if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
20188
 
+                                       || (code == MD_SYS_POWER_OFF)) {
20189
 
+
20190
 
+               LOG_DEFAULT("stopping all md devices.\n");
20191
 
+
20192
 
+               ITERATE_MDDEV(mddev,tmp)
20193
 
+                       do_md_stop (mddev, 1);
20194
 
+               /*
20195
 
+                * certain more exotic SCSI devices are known to be
20196
 
+                * volatile wrt too early system reboots. While the
20197
 
+                * right place to handle this issue is the given
20198
 
+                * driver, we do want to have a safe RAID driver ...
20199
 
+                */
20200
 
+               md_mdelay(1000*1);
20201
 
+       }
20202
 
+       return NOTIFY_DONE;
20203
 
+}
20204
 
+
20205
 
+static struct notifier_block md_notifier = {
20206
 
+       notifier_call:  evms_md_notify_reboot,
20207
 
+       next:           NULL,
20208
 
+       priority:       INT_MAX, /* before any real devices */
20209
 
+};
20210
 
+
20211
 
+
20212
 
+
20213
 
+/*
20214
 
+ * Function: evms_md_create_logical_node
20215
 
+ */
20216
 
+static int evms_md_create_logical_node(evms_logical_node_t **discover_list,
20217
 
+                                      mddev_t *mddev, uint flags)
20218
 
+{
20219
 
+       int rc;
20220
 
+       md_instance_data_t *MDID = NULL;
20221
 
+       evms_logical_node_t *newnode = NULL;
20222
 
+
20223
 
+       rc = evms_cs_allocate_logical_node(&newnode);
20224
 
+       if (!rc) {
20225
 
+               rc = evms_cs_allocate_memory((void**)&MDID,sizeof(*MDID));
20226
 
+       }
20227
 
+       if (!rc) {
20228
 
+               memset(newnode,0,sizeof(*MDID));
20229
 
+               newnode->plugin = &md_plugin_header;
20230
 
+               newnode->total_vsectors = (u_int64_t)evms_md_size[mdidx(mddev)] * 2;
20231
 
+               newnode->block_size = md_blocksizes[mdidx(mddev)];
20232
 
+               newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)];
20233
 
+               sprintf(newnode->name,"md/md%d",mdidx(mddev));
20234
 
+               MDID->mddev = mddev;
20235
 
+               newnode->instance_data = MDID;
20236
 
+               newnode->flags = flags;
20237
 
+       }
20238
 
+       if (!rc) {
20239
 
+               rc = evms_cs_add_logical_node_to_list(discover_list, newnode);
20240
 
+               if (rc) {
20241
 
+                       LOG_ERROR("could not add md node %s\n",newnode->name);
20242
 
+               } else {
20243
 
+                       LOG_DETAILS("added our md node %s to discover list (total_vsectors=%Lu, blk_size=%d, sector_size=%d)\n",
20244
 
+                                  newnode->name, newnode->total_vsectors, newnode->block_size, newnode->hardsector_size);
20245
 
+               }
20246
 
+       }
20247
 
+
20248
 
+       if (!rc) {
20249
 
+               mddev->node = newnode;
20250
 
+       } else {
20251
 
+               if (MDID)
20252
 
+                       evms_cs_deallocate_memory(MDID);
20253
 
+               if (newnode)
20254
 
+                       evms_cs_deallocate_logical_node(newnode);
20255
 
+       }
20256
 
+       return rc;
20257
 
+}
20258
 
+
20259
 
+/*
20260
 
+ * Function: evms_md_autostart_arrays
20261
 
+ *     Discover MD "extended" devices
20262
 
+ *     Add MD "extended" devices to pending list for further processing
20263
 
+ */
20264
 
+static void evms_md_autostart_arrays (evms_logical_node_t **discover_list)
20265
 
+{
20266
 
+        evms_logical_node_t *node, *next_node;
20267
 
+       mdk_rdev_t *rdev;
20268
 
+       int rc=0;
20269
 
+
20270
 
+        LOG_ENTRY_EXIT(":autostart_arrays() ENTRY\n");
20271
 
+
20272
 
+        /* examine each node on the discover list */
20273
 
+        next_node = *discover_list;
20274
 
+        while(next_node) {
20275
 
+                node = next_node;
20276
 
+                next_node = node->next;
20277
 
+
20278
 
+               rc = evms_md_import_device(discover_list, node,1);
20279
 
+               if (rc && (rc != -EEXIST)) {
20280
 
+                       LOG_EXTRA("autostart_arrrays() Not %s!\n",evms_md_partition_name(node));
20281
 
+                       continue;
20282
 
+               }
20283
 
+
20284
 
+               /*
20285
 
+                * Sanity checks:
20286
 
+                */
20287
 
+               rdev = evms_md_find_rdev_all(node);
20288
 
+               if (!rdev) {
20289
 
+                       LOG_ERROR("find_rdev_all() failed\n");
20290
 
+                       continue;
20291
 
+               }
20292
 
+               if (rdev->faulty) {
20293
 
+                       MD_BUG();
20294
 
+                       continue;
20295
 
+               }
20296
 
+
20297
 
+               if (!rc) {
20298
 
+                       md_list_add(&rdev->pending, &pending_raid_disks);
20299
 
+               } else if (rc == -EEXIST) {
20300
 
+                       evms_logical_node_t *md_node;
20301
 
+                       /*
20302
 
+                        * Must be in a re-discovery process here.
20303
 
+                        * Find the EVMS MD node that this rdev is a member of
20304
 
+                        */
20305
 
+                       if (rdev->mddev) {
20306
 
+                               md_node = rdev->mddev->node;
20307
 
+                               if (md_node) {
20308
 
+                                       rc = evms_cs_add_logical_node_to_list(discover_list,md_node);
20309
 
+                                       switch (rc) {
20310
 
+                                       case 0:
20311
 
+                                               exported_nodes++;
20312
 
+                                               LOG_DETAILS("Added MD node (%s) to discover list\n",
20313
 
+                                                       md_node->name);
20314
 
+                                               break;
20315
 
+                                       case 1: /* already on the list */
20316
 
+                                       case 2: /* already on the list */
20317
 
+                                               break;
20318
 
+                                       default:
20319
 
+                                               LOG_WARNING("could not add md node (%s), rc=%d\n",
20320
 
+                                                       md_node->name, rc);
20321
 
+                                       }
20322
 
+                               } else {
20323
 
+                                       LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n",
20324
 
+                                                  rdev->mddev->__minor);
20325
 
+                               }
20326
 
+                       } else {
20327
 
+                               LOG_ERROR("This device [%s] does not belong to any array!\n",
20328
 
+                                         get_partition_name(rdev));
20329
 
+                               evms_md_export_rdev(rdev);
20330
 
+                       }
20331
 
+                       evms_cs_remove_logical_node_from_list(discover_list,node);
20332
 
+               }
20333
 
+        }
20334
 
+
20335
 
+       evms_md_autorun_devices(discover_list, -1);
20336
 
+        LOG_DETAILS("EVMD MD:autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes);
20337
 
+}
20338
 
+
20339
 
+#ifdef CONFIG_PROC_FS
20340
 
+static int status_resync(char * page, mddev_t * mddev)
20341
 
+{
20342
 
+       int sz = 0;
20343
 
+       unsigned long max_blocks, resync, res, dt, db, rt;
20344
 
+
20345
 
+       resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
20346
 
+       max_blocks = mddev->sb->size;
20347
 
+
20348
 
+       /*
20349
 
+        * Should not happen.
20350
 
+        */
20351
 
+       if (!max_blocks) {
20352
 
+               MD_BUG();
20353
 
+               return 0;
20354
 
+       }
20355
 
+       res = (resync/1024)*1000/(max_blocks/1024 + 1);
20356
 
+       {
20357
 
+               int i, x = res/50, y = 20-x;
20358
 
+               PROCPRINT("[");
20359
 
+               for (i = 0; i < x; i++)
20360
 
+                       PROCPRINT("=");
20361
 
+               sz += sprintf(page + sz, ">");
20362
 
+               for (i = 0; i < y; i++)
20363
 
+                       PROCPRINT(".");
20364
 
+               PROCPRINT("] ");
20365
 
+       }
20366
 
+       if (!mddev->recovery_running)
20367
 
+               /*
20368
 
+                * true resync
20369
 
+                */
20370
 
+               PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)",
20371
 
+                       res/10, res % 10, resync, max_blocks);
20372
 
+       else
20373
 
+               /*
20374
 
+                * recovery ...
20375
 
+                */
20376
 
+               PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)",
20377
 
+                       res/10, res % 10, resync, max_blocks);
20378
 
+
20379
 
+       /*
20380
 
+        * We do not want to overflow, so the order of operands and
20381
 
+        * the * 100 / 100 trick are important. We do a +1 to be
20382
 
+        * safe against division by zero. We only estimate anyway.
20383
 
+        *
20384
 
+        * dt: time from mark until now
20385
 
+        * db: blocks written from mark until now
20386
 
+        * rt: remaining time
20387
 
+        */
20388
 
+       dt = ((jiffies - mddev->resync_mark) / HZ);
20389
 
+       if (!dt) dt++;
20390
 
+       db = resync - (mddev->resync_mark_cnt/2);
20391
 
+       rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
20392
 
+
20393
 
+       PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6);
20394
 
+
20395
 
+       PROCPRINT(" speed=%ldK/sec", db/dt);
20396
 
+
20397
 
+       return sz;
20398
 
+}
20399
 
+
20400
 
+static int evms_md_status_read_proc(char *page, char **start, off_t off,
20401
 
+                       int count, int *eof, void *data)
20402
 
+{
20403
 
+       int sz = 0, j, size;
20404
 
+       struct md_list_head *tmp, *tmp2;
20405
 
+       mdk_rdev_t *rdev;
20406
 
+       mddev_t *mddev;
20407
 
+
20408
 
+       PROCPRINT("Enterprise Volume Management System: MD Status\n");
20409
 
+       PROCPRINT("Personalities : ");
20410
 
+       for (j = 0; j < MAX_PERSONALITY; j++)
20411
 
+       if (pers[j])
20412
 
+               PROCPRINT("[%s] ", pers[j]->name);
20413
 
+
20414
 
+       PROCPRINT("\n");
20415
 
+
20416
 
+
20417
 
+       ITERATE_MDDEV(mddev,tmp) {
20418
 
+               PROCPRINT("md%d : %sactive", mdidx(mddev),
20419
 
+                       mddev->pers ? "" : "in");
20420
 
+               if (mddev->pers) {
20421
 
+                       if (mddev->ro)
20422
 
+                               PROCPRINT(" (read-only)");
20423
 
+                       PROCPRINT(" %s", mddev->pers->name);
20424
 
+               }
20425
 
+
20426
 
+               size = 0;
20427
 
+               ITERATE_RDEV(mddev,rdev,tmp2) {
20428
 
+                       PROCPRINT(" %s[%d]",
20429
 
+                               rdev->node->name, rdev->desc_nr);
20430
 
+                       if (rdev->faulty) {
20431
 
+                               PROCPRINT("(F)");
20432
 
+                               continue;
20433
 
+                       }
20434
 
+                       size += rdev->size;
20435
 
+               }
20436
 
+
20437
 
+               if (mddev->nb_dev) {
20438
 
+                       if (mddev->pers)
20439
 
+                               PROCPRINT("\n      %Ld blocks",
20440
 
+                                                mddev->node->total_vsectors >> 1);
20441
 
+                       else
20442
 
+                               PROCPRINT("\n      %d blocks", size);
20443
 
+               }
20444
 
+
20445
 
+               if (!mddev->pers) {
20446
 
+                       PROCPRINT("\n");
20447
 
+                       continue;
20448
 
+               }
20449
 
+
20450
 
+               sz += mddev->pers->status (page+sz, mddev);
20451
 
+               
20452
 
+               PROCPRINT("\n      ");
20453
 
+               if (mddev->curr_resync) {
20454
 
+                       sz += status_resync (page+sz, mddev);
20455
 
+               } else {
20456
 
+                       if (atomic_read(&mddev->resync_sem.count) != 1)
20457
 
+                               PROCPRINT("     resync=DELAYED");
20458
 
+               }
20459
 
+
20460
 
+               PROCPRINT("\n");
20461
 
+       }
20462
 
+
20463
 
+       return sz;
20464
 
+}
20465
 
+#endif
20466
 
+
20467
 
+/* Function: md_core_init
20468
 
+ */
20469
 
+int __init md_core_init(void)
20470
 
+{
20471
 
+       static char * name = "evms_mdrecoveryd";
20472
 
+#ifdef CONFIG_PROC_FS
20473
 
+       struct proc_dir_entry *evms_proc_dir;
20474
 
+#endif
20475
 
+
20476
 
+       // Increment the use count, so it never goes to zero.
20477
 
+       // This is necessary for now because we don't have code
20478
 
+       // to shut down the MD threads. When that is written,
20479
 
+       // this line should be removed.
20480
 
+       MOD_INC_USE_COUNT;
20481
 
+
20482
 
+#ifdef CONFIG_PROC_FS
20483
 
+       evms_proc_dir = evms_cs_get_evms_proc_dir();
20484
 
+       if (evms_proc_dir) {
20485
 
+               create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL);
20486
 
+       }
20487
 
+       md_table_header = register_sysctl_table(dev_dir_table, 1);
20488
 
+#endif
20489
 
+
20490
 
+       /* Create MD recovery thread */
20491
 
+       evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name);
20492
 
+       if (!evms_md_recovery_thread)
20493
 
+               LOG_SERIOUS("%s: evms_cs_recovery_thread failed\n", __FUNCTION__);
20494
 
+
20495
 
+       /* Register for reboot notification */
20496
 
+       md_register_reboot_notifier(&md_notifier);
20497
 
+
20498
 
+       return evms_cs_register_plugin(&md_plugin_header);
20499
 
+}
20500
 
+
20501
 
+static void __exit md_core_exit(void)
20502
 
+{
20503
 
+#ifdef CONFIG_PROC_FS
20504
 
+       struct proc_dir_entry *evms_proc_dir;
20505
 
+       
20506
 
+       evms_proc_dir = evms_cs_get_evms_proc_dir();
20507
 
+       if (evms_proc_dir) {
20508
 
+               remove_proc_entry("mdstat", evms_proc_dir);
20509
 
+       }
20510
 
+       unregister_sysctl_table(md_table_header);
20511
 
+#endif
20512
 
+       evms_cs_unregister_plugin(&md_plugin_header);
20513
 
+}
20514
 
+
20515
 
+module_init(md_core_init);
20516
 
+module_exit(md_core_exit);
20517
 
+#ifdef MODULE_LICENSE
20518
 
+MODULE_LICENSE("GPL");
20519
 
+#endif
20520
 
+
20521
 
+/*
20522
 
+ * In order to have the coexistence of this EVMS plugin and the orginal MD
20523
 
+ * module, the symbols exported by this plugin are prefixed with "evms_"
20524
 
+ */
20525
 
+
20526
 
+MD_EXPORT_SYMBOL(evms_md_size);
20527
 
+MD_EXPORT_SYMBOL(evms_register_md_personality);
20528
 
+MD_EXPORT_SYMBOL(evms_unregister_md_personality);
20529
 
+       /* Export the following function for use with rdev->node in evms_md_k.h */
20530
 
+MD_EXPORT_SYMBOL(evms_md_partition_name);
20531
 
+       /* Export the following function for use with disks[] in md_p.h */
20532
 
+//MD_EXPORT_SYMBOL(get_partition_name);
20533
 
+MD_EXPORT_SYMBOL(evms_md_error);
20534
 
+MD_EXPORT_SYMBOL(evms_md_update_sb);
20535
 
+MD_EXPORT_SYMBOL(evms_md_find_rdev_nr);
20536
 
+MD_EXPORT_SYMBOL(evms_md_print_devices);
20537
 
+MD_EXPORT_SYMBOL(evms_mddev_map);
20538
 
+MD_EXPORT_SYMBOL(evms_md_check_ordering);
20539
 
+MD_EXPORT_SYMBOL(evms_md_do_sync);
20540
 
+MD_EXPORT_SYMBOL(evms_md_sync_acct);
20541
 
+MD_EXPORT_SYMBOL(evms_md_done_sync);
20542
 
+MD_EXPORT_SYMBOL(evms_md_recover_arrays);
20543
 
+MD_EXPORT_SYMBOL(evms_md_get_spare);
20544
 
+
20545
 
diff -Naur linux-2002-03-28/drivers/evms/md_linear.c evms-2002-03-28/drivers/evms/md_linear.c
20546
 
--- linux-2002-03-28/drivers/evms/md_linear.c   Wed Dec 31 18:00:00 1969
20547
 
+++ evms-2002-03-28/drivers/evms/md_linear.c    Thu Mar 28 16:28:59 2002
20548
 
@@ -0,0 +1,284 @@
20549
 
+/*
20550
 
+   linear.c : Multiple Devices driver for Linux
20551
 
+              Copyright (C) 1994-96 Marc ZYNGIER
20552
 
+             <zyngier@ufr-info-p7.ibp.fr> or
20553
 
+             <maz@gloups.fdn.fr>
20554
 
+
20555
 
+   Linear mode management functions.
20556
 
+
20557
 
+   This program is free software; you can redistribute it and/or modify
20558
 
+   it under the terms of the GNU General Public License as published by
20559
 
+   the Free Software Foundation; either version 2, or (at your option)
20560
 
+   any later version.
20561
 
+   
20562
 
+   You should have received a copy of the GNU General Public License
20563
 
+   (for example /usr/src/linux/COPYING); if not, write to the Free
20564
 
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
20565
 
+*/
20566
 
+
20567
 
+#include <linux/module.h>
20568
 
+#include <linux/evms/evms_md.h>
20569
 
+#include <linux/evms/evms_linear.h>
20570
 
+#include <linux/slab.h>
20571
 
+
20572
 
+
20573
 
+#define MAJOR_NR MD_MAJOR
20574
 
+#define MD_DRIVER
20575
 
+#define MD_PERSONALITY
20576
 
+
20577
 
+#define LOG_PREFIX "md linear: "
20578
 
+static int linear_run (mddev_t *mddev)
20579
 
+{
20580
 
+       linear_conf_t *conf;
20581
 
+       struct linear_hash *table;
20582
 
+       mdk_rdev_t *rdev;
20583
 
+       int size, i, j, nb_zone;
20584
 
+       unsigned int curr_offset;
20585
 
+
20586
 
+       MOD_INC_USE_COUNT;
20587
 
+
20588
 
+       conf = kmalloc (sizeof (*conf), GFP_KERNEL);
20589
 
+       if (!conf)
20590
 
+               goto out;
20591
 
+       mddev->private = conf;
20592
 
+
20593
 
+       if (evms_md_check_ordering(mddev)) {
20594
 
+               printk("linear: disks are not ordered, aborting!\n");
20595
 
+               goto out;
20596
 
+       }
20597
 
+
20598
 
+       /*
20599
 
+        * Find the smallest device.
20600
 
+        */
20601
 
+
20602
 
+       conf->smallest = NULL;
20603
 
+       curr_offset = 0;
20604
 
+       ITERATE_RDEV_ORDERED(mddev,rdev,j) {
20605
 
+               dev_info_t *disk = conf->disks + j;
20606
 
+               disk->node = rdev->node;
20607
 
+               LOG_DETAILS(__FUNCTION__" is taking %s, total_vsectors=%Lu\n",
20608
 
+                          disk->node->name,disk->node->total_vsectors);
20609
 
+               disk->dev = rdev->dev;
20610
 
+               disk->size = rdev->size;
20611
 
+               disk->offset = curr_offset;
20612
 
+
20613
 
+               curr_offset += disk->size;
20614
 
+
20615
 
+               if (!conf->smallest || (disk->size < conf->smallest->size))
20616
 
+                       conf->smallest = disk;
20617
 
+       }
20618
 
+
20619
 
+       nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size + 
20620
 
+               ((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
20621
 
+  
20622
 
+       conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
20623
 
+                                       GFP_KERNEL);
20624
 
+       if (!conf->hash_table)
20625
 
+               goto out;
20626
 
+
20627
 
+       /*
20628
 
+        * Here we generate the linear hash table
20629
 
+        */
20630
 
+       table = conf->hash_table;
20631
 
+       i = 0;
20632
 
+       size = 0;
20633
 
+       for (j = 0; j < mddev->nb_dev; j++) {
20634
 
+               dev_info_t *disk = conf->disks + j;
20635
 
+
20636
 
+               if (size < 0) {
20637
 
+                       table[-1].dev1 = disk;
20638
 
+               }
20639
 
+               size += disk->size;
20640
 
+
20641
 
+               while (size>0) {
20642
 
+                       table->dev0 = disk;
20643
 
+                       table->dev1 = NULL;
20644
 
+                       size -= conf->smallest->size;
20645
 
+                       table++;
20646
 
+               }
20647
 
+       }
20648
 
+       if (table-conf->hash_table != nb_zone)
20649
 
+               BUG();
20650
 
+       LOG_DETAILS(__FUNCTION__" EXIT nr_zones=%d, smallest=%lu\n",
20651
 
+                  conf->nr_zones,conf->smallest->size);
20652
 
+       return 0;
20653
 
+
20654
 
+out:
20655
 
+       if (conf)
20656
 
+               kfree(conf);
20657
 
+       MOD_DEC_USE_COUNT;
20658
 
+       return 1;
20659
 
+}
20660
 
+
20661
 
+static int linear_stop (mddev_t *mddev)
20662
 
+{
20663
 
+       linear_conf_t *conf = mddev_to_conf(mddev);
20664
 
+  
20665
 
+       kfree(conf->hash_table);
20666
 
+       kfree(conf);
20667
 
+
20668
 
+       MOD_DEC_USE_COUNT;
20669
 
+
20670
 
+       return 0;
20671
 
+}
20672
 
+
20673
 
+/*
20674
 
+ * Function: linear_map
20675
 
+ */
20676
 
+static int linear_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN)
20677
 
+{
20678
 
+       linear_conf_t *conf = mddev_to_conf(mddev);
20679
 
+       struct linear_hash *hash;
20680
 
+       dev_info_t *tmp_dev;
20681
 
+       long block;
20682
 
+
20683
 
+       block = (long)(*LSN >> 1);
20684
 
+       hash = conf->hash_table + (block / conf->smallest->size);
20685
 
+       if (block >= (hash->dev0->size + hash->dev0->offset)) {
20686
 
+               if (!hash->dev1) {
20687
 
+                       LOG_ERROR(__FUNCTION__ " hash->dev1==NULL for block %ld\n",block);
20688
 
+                       return -EINVAL;
20689
 
+               }
20690
 
+               tmp_dev = hash->dev1;
20691
 
+       } else
20692
 
+               tmp_dev = hash->dev0;
20693
 
+    
20694
 
+       if (block >= (tmp_dev->size + tmp_dev->offset)
20695
 
+                               || block < tmp_dev->offset) {
20696
 
+               LOG_ERROR(__FUNCTION__" Block %ld out of bounds on node %s size %ld offset %ld\n",
20697
 
+                          block,
20698
 
+                          tmp_dev->node->name,
20699
 
+                          tmp_dev->size,
20700
 
+                          tmp_dev->offset);
20701
 
+               return -EINVAL;
20702
 
+       }
20703
 
+       *LSN -= (evms_sector_t)(tmp_dev->offset << 1);
20704
 
+       *node = tmp_dev->node;
20705
 
+       return 0;
20706
 
+}
20707
 
+
20708
 
+static int linear_init_io(mddev_t *mddev,
20709
 
+                         int rw,
20710
 
+                         evms_sector_t LSN,
20711
 
+                         evms_sector_t nr_sects,
20712
 
+                         void *data)
20713
 
+{
20714
 
+       int rc = 0;
20715
 
+       evms_logical_node_t *node;
20716
 
+
20717
 
+       LOG_ENTRY_EXIT(__FUNCTION__" LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
20718
 
+       rc = linear_map(mddev, &node, &LSN);
20719
 
+       if (!rc)
20720
 
+               rc = INIT_IO(node, rw, LSN, nr_sects, data);
20721
 
+       return rc;
20722
 
+}
20723
 
+
20724
 
+static int linear_make_request (mddev_t *mddev,
20725
 
+                               int rw,
20726
 
+                               eio_t *eio)
20727
 
+{
20728
 
+       evms_logical_node_t *node;
20729
 
+       int rc;
20730
 
+
20731
 
+       rc = linear_map(mddev, &node, &eio->rsector);
20732
 
+       if (!rc) {
20733
 
+
20734
 
+               if (rw == READ) {
20735
 
+                       R_IO(node, eio);
20736
 
+               } else {
20737
 
+                       W_IO(node, eio);
20738
 
+               }
20739
 
+               return 1; /* success */
20740
 
+       }
20741
 
+       LOG_ERROR(__FUNCTION__ " FAILED %s node(%s) rsector(%Lu)\n",
20742
 
+               (rw == READ) ? "READ" : "WRITE",node->name,eio->rsector);
20743
 
+
20744
 
+       EVMS_IO_ERROR(eio);
20745
 
+
20746
 
+       return 0;
20747
 
+}
20748
 
+
20749
 
+static int linear_status (char *page, mddev_t *mddev)
20750
 
+{
20751
 
+       int sz = 0;
20752
 
+
20753
 
+#undef MD_DEBUG
20754
 
+#ifdef MD_DEBUG
20755
 
+       int j;
20756
 
+       linear_conf_t *conf = mddev_to_conf(mddev);
20757
 
+  
20758
 
+       sz += sprintf(page+sz, "      ");
20759
 
+       for (j = 0; j < conf->nr_zones; j++)
20760
 
+       {
20761
 
+               sz += sprintf(page+sz, "[%s",
20762
 
+                       partition_name(conf->hash_table[j].dev0->dev));
20763
 
+
20764
 
+               if (conf->hash_table[j].dev1)
20765
 
+                       sz += sprintf(page+sz, "/%s] ",
20766
 
+                         partition_name(conf->hash_table[j].dev1->dev));
20767
 
+               else
20768
 
+                       sz += sprintf(page+sz, "] ");
20769
 
+       }
20770
 
+       sz += sprintf(page+sz, "\n");
20771
 
+#endif
20772
 
+       sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
20773
 
+       return sz;
20774
 
+}
20775
 
+
20776
 
+static int linear_evms_ioctl (
20777
 
+       mddev_t         * mddev,
20778
 
+       struct inode    * inode,
20779
 
+       struct file     * file, 
20780
 
+       unsigned int    cmd,
20781
 
+       unsigned long   arg)
20782
 
+{
20783
 
+       int rc = 0;
20784
 
+       evms_logical_node_t *node;
20785
 
+
20786
 
+       switch (cmd) {
20787
 
+               case EVMS_GET_BMAP:
20788
 
+               {
20789
 
+                       evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
20790
 
+                       rc = linear_map(mddev,&node, &bmap->rsector);
20791
 
+                       if (!rc) {
20792
 
+                               if (node)
20793
 
+                                       rc = IOCTL(node, inode, file, cmd, arg);
20794
 
+                               else
20795
 
+                                       rc = -ENODEV;
20796
 
+                       }
20797
 
+                       break;
20798
 
+               }
20799
 
+
20800
 
+               default:
20801
 
+                       rc = -EINVAL;
20802
 
+       }
20803
 
+       return rc;
20804
 
+}
20805
 
+
20806
 
+static mdk_personality_t linear_personality=
20807
 
+{
20808
 
+       name:           "evms_linear",
20809
 
+       init_io:        linear_init_io,
20810
 
+       make_request:   linear_make_request,
20811
 
+       run:            linear_run,
20812
 
+       stop:           linear_stop,
20813
 
+       status:         linear_status,
20814
 
+       evms_ioctl:     linear_evms_ioctl
20815
 
+};
20816
 
+
20817
 
+static int md__init linear_init (void)
20818
 
+{
20819
 
+       return evms_register_md_personality (LINEAR, &linear_personality);
20820
 
+}
20821
 
+
20822
 
+static void linear_exit (void)
20823
 
+{
20824
 
+       evms_unregister_md_personality (LINEAR);
20825
 
+}
20826
 
+
20827
 
+
20828
 
+module_init(linear_init);
20829
 
+module_exit(linear_exit);
20830
 
+#ifdef MODULE_LICENSE
20831
 
+MODULE_LICENSE("GPL");
20832
 
+#endif
20833
 
diff -Naur linux-2002-03-28/drivers/evms/md_raid0.c evms-2002-03-28/drivers/evms/md_raid0.c
20834
 
--- linux-2002-03-28/drivers/evms/md_raid0.c    Wed Dec 31 18:00:00 1969
20835
 
+++ evms-2002-03-28/drivers/evms/md_raid0.c     Thu Mar 28 16:28:46 2002
20836
 
@@ -0,0 +1,442 @@
20837
 
+/*
20838
 
+   raid0.c : Multiple Devices driver for Linux
20839
 
+             Copyright (C) 1994-96 Marc ZYNGIER
20840
 
+            <zyngier@ufr-info-p7.ibp.fr> or
20841
 
+            <maz@gloups.fdn.fr>
20842
 
+             Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
20843
 
+
20844
 
+
20845
 
+   RAID-0 management functions.
20846
 
+
20847
 
+   This program is free software; you can redistribute it and/or modify
20848
 
+   it under the terms of the GNU General Public License as published by
20849
 
+   the Free Software Foundation; either version 2, or (at your option)
20850
 
+   any later version.
20851
 
+   
20852
 
+   You should have received a copy of the GNU General Public License
20853
 
+   (for example /usr/src/linux/COPYING); if not, write to the Free
20854
 
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
20855
 
+*/
20856
 
+
20857
 
+#include <linux/module.h>
20858
 
+#include <linux/evms/evms_raid0.h>
20859
 
+
20860
 
+#define MAJOR_NR MD_MAJOR
20861
 
+#define MD_DRIVER
20862
 
+#define MD_PERSONALITY
20863
 
+
20864
 
+#define LOG_PREFIX "md raid0: "
20865
 
+
20866
 
+static int create_strip_zones (mddev_t *mddev)
20867
 
+{
20868
 
+       int i, c, j, j1, j2;
20869
 
+       unsigned long current_offset, curr_zone_offset;
20870
 
+       raid0_conf_t *conf = mddev_to_conf(mddev);
20871
 
+       mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
20872
 
20873
 
+       /*
20874
 
+        * The number of 'same size groups'
20875
 
+        */
20876
 
+       conf->nr_strip_zones = 0;
20877
 
20878
 
+       ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
20879
 
+               LOG_DETAILS(" looking at %s\n", evms_md_partition_name(rdev1->node));
20880
 
+               c = 0;
20881
 
+               ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
20882
 
+                       LOG_DETAILS("   comparing %s(%ld) with %s(%ld)\n",
20883
 
+                                  evms_md_partition_name(rdev1->node), rdev1->size, 
20884
 
+                                  evms_md_partition_name(rdev2->node), rdev2->size);
20885
 
+                       if (rdev2 == rdev1) {
20886
 
+                               LOG_DETAILS("   END\n");
20887
 
+                               break;
20888
 
+                       }
20889
 
+                       if (rdev2->size == rdev1->size)
20890
 
+                       {
20891
 
+                               /*
20892
 
+                                * Not unique, dont count it as a new
20893
 
+                                * group
20894
 
+                                */
20895
 
+                               LOG_DETAILS("   EQUAL\n");
20896
 
+                               c = 1;
20897
 
+                               break;
20898
 
+                       }
20899
 
+                       LOG_DETAILS("   NOT EQUAL\n");
20900
 
+               }
20901
 
+               if (!c) {
20902
 
+                       LOG_DETAILS("   ==> UNIQUE\n");
20903
 
+                       conf->nr_strip_zones++;
20904
 
+                       LOG_DETAILS(" %d zones\n",conf->nr_strip_zones);
20905
 
+               }
20906
 
+       }
20907
 
+       LOG_DETAILS(" FINAL %d zones\n",conf->nr_strip_zones);
20908
 
+
20909
 
+       conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
20910
 
+                               conf->nr_strip_zones);
20911
 
+       if (!conf->strip_zone)
20912
 
+               return 1;
20913
 
+
20914
 
+
20915
 
+       conf->smallest = NULL;
20916
 
+       current_offset = 0;
20917
 
+       curr_zone_offset = 0;
20918
 
+
20919
 
+       for (i = 0; i < conf->nr_strip_zones; i++)
20920
 
+       {
20921
 
+               struct strip_zone *zone = conf->strip_zone + i;
20922
 
+
20923
 
+               LOG_DETAILS(" zone %d\n", i);
20924
 
+               zone->dev_offset = current_offset;
20925
 
+               smallest = NULL;
20926
 
+               c = 0;
20927
 
+
20928
 
+               ITERATE_RDEV_ORDERED(mddev,rdev,j) {
20929
 
+
20930
 
+                       LOG_DETAILS(" checking %s ...",evms_md_partition_name(rdev->node));
20931
 
+                       if (rdev->size > current_offset)
20932
 
+                       {
20933
 
+                               LOG_DETAILS(" contained as device %d\n", c);
20934
 
+                               zone->dev[c] = rdev;
20935
 
+                               c++;
20936
 
+                               if (!smallest || (rdev->size <smallest->size)) {
20937
 
+                                       smallest = rdev;
20938
 
+                                       LOG_DETAILS("  (%ld) is smallest!.\n", rdev->size);
20939
 
+                               }
20940
 
+                       } else
20941
 
+                               LOG_DETAILS(" nope.\n");
20942
 
+               }
20943
 
+
20944
 
+               zone->nb_dev = c;
20945
 
+               zone->size = (smallest->size - current_offset) * c;
20946
 
+               LOG_DETAILS(" zone->nb_dev: %d, size: %ld\n",
20947
 
+                       zone->nb_dev,zone->size);
20948
 
+
20949
 
+               if (!conf->smallest || (zone->size < conf->smallest->size))
20950
 
+                       conf->smallest = zone;
20951
 
+
20952
 
+               zone->zone_offset = curr_zone_offset;
20953
 
+               curr_zone_offset += zone->size;
20954
 
+
20955
 
+               current_offset = smallest->size;
20956
 
+               LOG_DETAILS(" current zone offset: %ld\n",current_offset);
20957
 
+       }
20958
 
+       LOG_DETAILS(" done.\n");
20959
 
+       return 0;
20960
 
+}
20961
 
+
20962
 
+static int raid0_run (mddev_t *mddev)
20963
 
+{
20964
 
+       unsigned long cur=0, i=0, size, zone0_size, nb_zone;
20965
 
+       raid0_conf_t *conf;
20966
 
+
20967
 
+       MOD_INC_USE_COUNT;
20968
 
+
20969
 
+       conf = vmalloc(sizeof (raid0_conf_t));
20970
 
+       if (!conf)
20971
 
+               goto out;
20972
 
+       mddev->private = (void *)conf;
20973
 
20974
 
+       if (evms_md_check_ordering(mddev)) {
20975
 
+               LOG_ERROR("disks are not ordered, aborting!\n");
20976
 
+               goto out_free_conf;
20977
 
+       }
20978
 
+
20979
 
+       if (create_strip_zones (mddev)) 
20980
 
+               goto out_free_conf;
20981
 
+
20982
 
+       LOG_DETAILS("evms_md_size is %d blocks.\n", evms_md_size[mdidx(mddev)]);
20983
 
+       LOG_DETAILS("conf->smallest->size is %ld blocks.\n", conf->smallest->size);
20984
 
+       nb_zone = evms_md_size[mdidx(mddev)]/conf->smallest->size +
20985
 
+                       (evms_md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0);
20986
 
+       LOG_DETAILS("nb_zone is %ld.\n", nb_zone);
20987
 
+       conf->nr_zones = nb_zone;
20988
 
+
20989
 
+       LOG_DETAILS("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash));
20990
 
+
20991
 
+       conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
20992
 
+       if (!conf->hash_table)
20993
 
+               goto out_free_zone_conf;
20994
 
+       size = conf->strip_zone[cur].size;
20995
 
+
20996
 
+       i = 0;
20997
 
+       while (cur < conf->nr_strip_zones) {
20998
 
+               conf->hash_table[i].zone0 = conf->strip_zone + cur;
20999
 
+
21000
 
+               /*
21001
 
+                * If we completely fill the slot
21002
 
+                */
21003
 
+               if (size >= conf->smallest->size) {
21004
 
+                       conf->hash_table[i++].zone1 = NULL;
21005
 
+                       size -= conf->smallest->size;
21006
 
+
21007
 
+                       if (!size) {
21008
 
+                               if (++cur == conf->nr_strip_zones)
21009
 
+                                       continue;
21010
 
+                               size = conf->strip_zone[cur].size;
21011
 
+                       }
21012
 
+                       continue;
21013
 
+               }
21014
 
+               if (++cur == conf->nr_strip_zones) {
21015
 
+                       /*
21016
 
+                        * Last dev, set unit1 as NULL
21017
 
+                        */
21018
 
+                       conf->hash_table[i].zone1=NULL;
21019
 
+                       continue;
21020
 
+               }
21021
 
+
21022
 
+               /*
21023
 
+                * Here we use a 2nd dev to fill the slot
21024
 
+                */
21025
 
+               zone0_size = size;
21026
 
+               size = conf->strip_zone[cur].size;
21027
 
+               conf->hash_table[i++].zone1 = conf->strip_zone + cur;
21028
 
+               size -= (conf->smallest->size - zone0_size);
21029
 
+       }
21030
 
+       return 0;
21031
 
+
21032
 
+out_free_zone_conf:
21033
 
+       vfree(conf->strip_zone);
21034
 
+       conf->strip_zone = NULL;
21035
 
+
21036
 
+out_free_conf:
21037
 
+       vfree(conf);
21038
 
+       mddev->private = NULL;
21039
 
+out:
21040
 
+       MOD_DEC_USE_COUNT;
21041
 
+       return 1;
21042
 
+}
21043
 
+
21044
 
+static int raid0_stop (mddev_t *mddev)
21045
 
+{
21046
 
+       raid0_conf_t *conf = mddev_to_conf(mddev);
21047
 
+
21048
 
+       vfree (conf->hash_table);
21049
 
+       conf->hash_table = NULL;
21050
 
+       vfree (conf->strip_zone);
21051
 
+       conf->strip_zone = NULL;
21052
 
+       vfree (conf);
21053
 
+       mddev->private = NULL;
21054
 
+
21055
 
+       MOD_DEC_USE_COUNT;
21056
 
+       return 0;
21057
 
+}
21058
 
+
21059
 
+
21060
 
+/*
21061
 
+ * Function: raid0_map
21062
 
+ *
21063
 
+ *     Return 0 for success, else error
21064
 
+ *
21065
 
+ * Comment from original code:
21066
 
+ *
21067
 
+ * FIXME - We assume some things here :
21068
 
+ * - requested buffers NEVER bigger than chunk size,
21069
 
+ * - requested buffers NEVER cross stripes limits.
21070
 
+ * Of course, those facts may not be valid anymore (and surely won't...)
21071
 
+ * Hey guys, there's some work out there ;-)
21072
 
+ */
21073
 
+
21074
 
+static inline int raid0_map(mddev_t *mddev, evms_logical_node_t **node, evms_sector_t *LSN, evms_sector_t size)
21075
 
+{
21076
 
+       unsigned int sect_in_chunk, chunksize_bits,  chunk_size;
21077
 
+       raid0_conf_t *conf = mddev_to_conf(mddev);
21078
 
+       struct raid0_hash *hash;
21079
 
+       struct strip_zone *zone;
21080
 
+       mdk_rdev_t *tmp_dev;
21081
 
+       unsigned long chunk, block, rsect;
21082
 
+       unsigned long b_rsector;
21083
 
+       unsigned int b_size;
21084
 
+
21085
 
+       b_rsector = (unsigned long)*LSN;
21086
 
+       b_size = (unsigned int)size;
21087
 
+
21088
 
+       chunk_size = mddev->param.chunk_size >> 10;
21089
 
+       chunksize_bits = ffz(~chunk_size);
21090
 
+       block = b_rsector >> 1;
21091
 
+       hash = conf->hash_table + block / conf->smallest->size;
21092
 
+
21093
 
+       /* Sanity check */
21094
 
+       if (chunk_size < (block % chunk_size) + (b_size >> 10))
21095
 
+               goto bad_map;
21096
 
21097
 
+       if (!hash)
21098
 
+               goto bad_hash;
21099
 
+
21100
 
+       if (!hash->zone0)
21101
 
+               goto bad_zone0;
21102
 
21103
 
+       if (block >= (hash->zone0->size + hash->zone0->zone_offset)) {
21104
 
+               if (!hash->zone1)
21105
 
+                       goto bad_zone1;
21106
 
+               zone = hash->zone1;
21107
 
+       } else
21108
 
+               zone = hash->zone0;
21109
 
+    
21110
 
+       sect_in_chunk = b_rsector & ((chunk_size<<1) -1);
21111
 
+       chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
21112
 
+       tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev];
21113
 
+       rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1)
21114
 
+               + sect_in_chunk;
21115
 
21116
 
+       /*
21117
 
+        * The new BH_Lock semantics in ll_rw_blk.c guarantee that this
21118
 
+        * is the only IO operation happening on this bh.
21119
 
+        */
21120
 
+       *LSN  = (evms_sector_t)rsect;
21121
 
+       *node = tmp_dev->node;
21122
 
+       return 0;
21123
 
+
21124
 
+bad_map:
21125
 
+       LOG_ERROR(__FUNCTION__ " bug: can't convert block across chunks or bigger than %dk %ld %d\n",
21126
 
+                  chunk_size, b_rsector, b_size >> 10);
21127
 
+       goto outerr;
21128
 
+bad_hash:
21129
 
+       LOG_ERROR(__FUNCTION__ " bug: hash==NULL for block %ld\n",block);
21130
 
+       goto outerr;
21131
 
+bad_zone0:
21132
 
+       LOG_ERROR(__FUNCTION__ " bug: hash->zone0==NULL for block %ld\n", block);
21133
 
+       goto outerr;
21134
 
+bad_zone1:
21135
 
+       LOG_ERROR(__FUNCTION__ " bug: hash->zone1==NULL for block %ld\n",block);
21136
 
+outerr:
21137
 
+       return -EINVAL;
21138
 
+}
21139
 
+
21140
 
+/*
21141
 
+ * Function: raid0_init_io
21142
 
+ */
21143
 
+static int raid0_init_io(
21144
 
+       mddev_t *mddev,
21145
 
+       int rw,
21146
 
+       evms_sector_t LSN,
21147
 
+       evms_sector_t nr_sects,
21148
 
+       void *data)
21149
 
+{
21150
 
+       int rc = 0;
21151
 
+       evms_logical_node_t *node;
21152
 
+
21153
 
+       LOG_ENTRY_EXIT(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
21154
 
+       rc = raid0_map(mddev, &node, &LSN, nr_sects);
21155
 
+       if (!rc)
21156
 
+               rc = INIT_IO(node, rw, LSN, nr_sects, data);
21157
 
+       return rc;
21158
 
+}
21159
 
+
21160
 
+static int raid0_make_request (
21161
 
+       mddev_t *mddev,
21162
 
+       int rw,
21163
 
+       eio_t *eio)
21164
 
+{
21165
 
+       evms_logical_node_t *node;
21166
 
+       int rc;
21167
 
+
21168
 
+       rc = raid0_map(mddev, &node, &eio->rsector, eio->rsize);
21169
 
+       if (!rc) {
21170
 
+               if (rw == READ) {
21171
 
+                       R_IO(node, eio);
21172
 
+               } else {
21173
 
+                       W_IO(node, eio);
21174
 
+               }
21175
 
+               return 1; /* success */
21176
 
+       }
21177
 
+       LOG_ERROR(__FUNCTION__ " FAILED %s node(%s) rsector(%Lu)\n",
21178
 
+                  (rw == READ) ? "READ" : "WRITE",node->name,eio->rsector);
21179
 
+
21180
 
+       EVMS_IO_ERROR(eio);
21181
 
+
21182
 
+       return 0;
21183
 
+}
21184
 
+
21185
 
+                          
21186
 
+static int raid0_status (char *page, mddev_t *mddev)
21187
 
+{
21188
 
+       int sz = 0;
21189
 
+#undef MD_DEBUG
21190
 
+#ifdef MD_DEBUG
21191
 
+       int j, k;
21192
 
+       raid0_conf_t *conf = mddev_to_conf(mddev);
21193
 
+  
21194
 
+       sz += sprintf(page + sz, "      ");
21195
 
+       for (j = 0; j < conf->nr_zones; j++) {
21196
 
+               sz += sprintf(page + sz, "[z%d",
21197
 
+                               conf->hash_table[j].zone0 - conf->strip_zone);
21198
 
+               if (conf->hash_table[j].zone1)
21199
 
+                       sz += sprintf(page+sz, "/z%d] ",
21200
 
+                               conf->hash_table[j].zone1 - conf->strip_zone);
21201
 
+               else
21202
 
+                       sz += sprintf(page+sz, "] ");
21203
 
+       }
21204
 
+  
21205
 
+       sz += sprintf(page + sz, "\n");
21206
 
+  
21207
 
+       for (j = 0; j < conf->nr_strip_zones; j++) {
21208
 
+               sz += sprintf(page + sz, "      z%d=[", j);
21209
 
+               for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
21210
 
+                       sz += sprintf (page+sz, "%s/", partition_name(
21211
 
+                               conf->strip_zone[j].dev[k]->dev));
21212
 
+               sz--;
21213
 
+               sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
21214
 
+                               conf->strip_zone[j].zone_offset,
21215
 
+                               conf->strip_zone[j].dev_offset,
21216
 
+                               conf->strip_zone[j].size);
21217
 
+       }
21218
 
+#endif
21219
 
+       sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
21220
 
+       return sz;
21221
 
+}
21222
 
+
21223
 
+static int raid0_evms_ioctl (
21224
 
+       mddev_t         * mddev,
21225
 
+       struct inode    * inode,
21226
 
+       struct file     * file, 
21227
 
+       unsigned int    cmd,
21228
 
+       unsigned long   arg)
21229
 
+{
21230
 
+       int rc = 0;
21231
 
+       evms_logical_node_t *node;
21232
 
+
21233
 
+       switch (cmd) {
21234
 
+               case EVMS_GET_BMAP:
21235
 
+               {
21236
 
+                       evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
21237
 
+                       rc = raid0_map(mddev,&node, &bmap->rsector, mddev->node->block_size);
21238
 
+                       if (!rc) {
21239
 
+                               if (node)
21240
 
+                                       rc = IOCTL(node, inode, file, cmd, arg);
21241
 
+                               else
21242
 
+                                       rc = -ENODEV;
21243
 
+                       }
21244
 
+                       break;
21245
 
+               }
21246
 
+
21247
 
+               default:
21248
 
+                       rc = -EINVAL;
21249
 
+       }
21250
 
+       return rc;
21251
 
+}
21252
 
+
21253
 
+static mdk_personality_t raid0_personality=
21254
 
+{
21255
 
+       name:           "evms_raid0",
21256
 
+       init_io:        raid0_init_io,
21257
 
+       make_request:   raid0_make_request,
21258
 
+       run:            raid0_run,
21259
 
+       stop:           raid0_stop,
21260
 
+       status:         raid0_status,
21261
 
+       evms_ioctl:     raid0_evms_ioctl
21262
 
+};
21263
 
+
21264
 
+static int md__init raid0_init (void)
21265
 
+{
21266
 
+       return evms_register_md_personality (RAID0, &raid0_personality);
21267
 
+}
21268
 
+
21269
 
+static void raid0_exit (void)
21270
 
+{
21271
 
+       evms_unregister_md_personality (RAID0);
21272
 
+}
21273
 
+
21274
 
+module_init(raid0_init);
21275
 
+module_exit(raid0_exit);
21276
 
+#ifdef MODULE_LICENSE
21277
 
+MODULE_LICENSE("GPL");
21278
 
+#endif
21279
 
diff -Naur linux-2002-03-28/drivers/evms/md_raid1.c evms-2002-03-28/drivers/evms/md_raid1.c
21280
 
--- linux-2002-03-28/drivers/evms/md_raid1.c    Wed Dec 31 18:00:00 1969
21281
 
+++ evms-2002-03-28/drivers/evms/md_raid1.c     Wed Mar 27 09:07:59 2002
21282
 
@@ -0,0 +1,2053 @@
21283
 
+/*
21284
 
+ * md_raid1.c : Multiple Devices driver for Linux
21285
 
+ *
21286
 
+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
21287
 
+ *
21288
 
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
21289
 
+ *
21290
 
+ * RAID-1 management functions.
21291
 
+ *
21292
 
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
21293
 
+ *
21294
 
+ * Fixes to reconstruction by Jakob �stergaard" <jakob@ostenfeld.dk>
21295
 
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
21296
 
+ *
21297
 
+ * 'md_raid1.c' is an EVMS version of linux/drivers/md/raid1.c modified
21298
 
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
21299
 
+ *
21300
 
+ * This program is free software; you can redistribute it and/or modify
21301
 
+ * it under the terms of the GNU General Public License as published by
21302
 
+ * the Free Software Foundation; either version 2, or (at your option)
21303
 
+ * any later version.
21304
 
+ *
21305
 
+ * You should have received a copy of the GNU General Public License
21306
 
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
21307
 
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21308
 
+ */
21309
 
+
21310
 
+#include <linux/module.h>
21311
 
+#include <linux/slab.h>
21312
 
+#include <linux/evms/evms_raid1.h>
21313
 
+#include <asm/atomic.h>
21314
 
+
21315
 
+#define MAJOR_NR MD_MAJOR
21316
 
+#define MD_DRIVER
21317
 
+#define MD_PERSONALITY
21318
 
+
21319
 
+#define MAX_WORK_PER_DISK 128
21320
 
+
21321
 
+#define        NR_RESERVED_BUFS        32
21322
 
+
21323
 
+#define LOG_PREFIX "md raid1: "
21324
 
+/*
21325
 
+ * The following can be used to debug the driver
21326
 
+ */
21327
 
+#define RAID1_DEBUG    0
21328
 
+
21329
 
+#if RAID1_DEBUG
21330
 
+#define PRINTK(x...)   LOG_DEFAULT(x)
21331
 
+#define inline
21332
 
+#define __inline__
21333
 
+#else
21334
 
+#define PRINTK(x...)  do { } while (0)
21335
 
+#endif
21336
 
+
21337
 
+
21338
 
+static mdk_personality_t raid1_personality;
21339
 
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
21340
 
+struct raid1_bh *evms_raid1_retry_list = NULL, **evms_raid1_retry_tail;
21341
 
+
21342
 
+static inline void add_node_mapping(
21343
 
+       struct raid1_bh *r1_bh,
21344
 
+       evms_logical_node_t *node,
21345
 
+       struct buffer_head *bh)
21346
 
+{
21347
 
+       int i;
21348
 
+       for (i=0; i<MD_SB_DISKS; i++) {
21349
 
+               if (!r1_bh->mirror_node_map[i].node) {
21350
 
+                       r1_bh->mirror_node_map[i].node = node;
21351
 
+                       r1_bh->mirror_node_map[i].bh = bh;
21352
 
+                       return;
21353
 
+               }
21354
 
+       }
21355
 
+       LOG_ERROR(__FUNCTION__" Cannot create mapping for %s\n",node->name);
21356
 
+}
21357
 
+
21358
 
+static inline evms_logical_node_t * bh_to_node(
21359
 
+       struct raid1_bh *r1_bh,
21360
 
+       struct buffer_head *bh)
21361
 
+{
21362
 
+       int i;
21363
 
+       for (i=0; i<MD_SB_DISKS; i++) {
21364
 
+               if (r1_bh->mirror_node_map[i].bh == bh) {
21365
 
+                       return r1_bh->mirror_node_map[i].node;
21366
 
+               }
21367
 
+       }
21368
 
+       LOG_ERROR(__FUNCTION__" Cannot find mapping for bh(%p)\n",bh);
21369
 
+       return NULL;
21370
 
+}
21371
 
+
21372
 
+static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
21373
 
+{
21374
 
+       /* return a linked list of "cnt" struct buffer_heads.
21375
 
+        * don't take any off the free list unless we know we can
21376
 
+        * get all we need, otherwise we could deadlock
21377
 
+        */
21378
 
+       struct buffer_head *bh=NULL;
21379
 
+
21380
 
+       while(cnt) {
21381
 
+               struct buffer_head *t;
21382
 
+               md_spin_lock_irq(&conf->device_lock);
21383
 
+               if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
21384
 
+                       while (cnt) {
21385
 
+                               t = conf->freebh;
21386
 
+                               conf->freebh = t->b_next;
21387
 
+                               t->b_next = bh;
21388
 
+                               bh = t;
21389
 
+                               t->b_state = 0;
21390
 
+                               conf->freebh_cnt--;
21391
 
+                               cnt--;
21392
 
+                       }
21393
 
+               md_spin_unlock_irq(&conf->device_lock);
21394
 
+               if (cnt == 0)
21395
 
+                       break;
21396
 
+               t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
21397
 
+               if (t) {
21398
 
+                       t->b_next = bh;
21399
 
+                       bh = t;
21400
 
+                       cnt--;
21401
 
+               } else {
21402
 
+                       PRINTK("raid1: waiting for %d bh\n", cnt);
21403
 
+                       conf->freebh_blocked = 1;
21404
 
+                       wait_disk_event(conf->wait_buffer,
21405
 
+                                       !conf->freebh_blocked ||
21406
 
+                                       conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
21407
 
+                       conf->freebh_blocked = 0;
21408
 
+               }
21409
 
+       }
21410
 
+       return bh;
21411
 
+}
21412
 
+
21413
 
+static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
21414
 
+{
21415
 
+       unsigned long flags;
21416
 
+       spin_lock_irqsave(&conf->device_lock, flags);
21417
 
+       while (bh) {
21418
 
+               struct buffer_head *t = bh;
21419
 
+               bh=bh->b_next;
21420
 
+               if (t->b_pprev == NULL)
21421
 
+                       kmem_cache_free(bh_cachep, t);
21422
 
+               else {
21423
 
+                       t->b_next= conf->freebh;
21424
 
+                       conf->freebh = t;
21425
 
+                       conf->freebh_cnt++;
21426
 
+               }
21427
 
+       }
21428
 
+       spin_unlock_irqrestore(&conf->device_lock, flags);
21429
 
+       wake_up(&conf->wait_buffer);
21430
 
+}
21431
 
+
21432
 
+static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
21433
 
+{
21434
 
+       /* allocate cnt buffer_heads, possibly less if kmalloc fails */
21435
 
+       int i = 0;
21436
 
+
21437
 
+       while (i < cnt) {
21438
 
+               struct buffer_head *bh;
21439
 
+               bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
21440
 
+               if (!bh) break;
21441
 
+
21442
 
+               md_spin_lock_irq(&conf->device_lock);
21443
 
+               bh->b_pprev = &conf->freebh;
21444
 
+               bh->b_next = conf->freebh;
21445
 
+               conf->freebh = bh;
21446
 
+               conf->freebh_cnt++;
21447
 
+               md_spin_unlock_irq(&conf->device_lock);
21448
 
+
21449
 
+               i++;
21450
 
+       }
21451
 
+       return i;
21452
 
+}
21453
 
+
21454
 
+static void raid1_shrink_bh(raid1_conf_t *conf)
21455
 
+{
21456
 
+       /* discard all buffer_heads */
21457
 
+
21458
 
+       md_spin_lock_irq(&conf->device_lock);
21459
 
+       while (conf->freebh) {
21460
 
+               struct buffer_head *bh = conf->freebh;
21461
 
+               conf->freebh = bh->b_next;
21462
 
+               kmem_cache_free(bh_cachep, bh);
21463
 
+               conf->freebh_cnt--;
21464
 
+       }
21465
 
+       md_spin_unlock_irq(&conf->device_lock);
21466
 
+}
21467
 
+               
21468
 
+
21469
 
+static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
21470
 
+{
21471
 
+       struct raid1_bh *r1_bh = NULL;
21472
 
+
21473
 
+       do {
21474
 
+               md_spin_lock_irq(&conf->device_lock);
21475
 
+               if (!conf->freer1_blocked && conf->freer1) {
21476
 
+                       r1_bh = conf->freer1;
21477
 
+                       conf->freer1 = r1_bh->next_r1;
21478
 
+                       conf->freer1_cnt--;
21479
 
+                       r1_bh->next_r1 = NULL;
21480
 
+                       r1_bh->state = (1 << R1BH_PreAlloc);
21481
 
+                       r1_bh->bh_req.b_state = 0;
21482
 
+                       memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
21483
 
+               }
21484
 
+               md_spin_unlock_irq(&conf->device_lock);
21485
 
+               if (r1_bh)
21486
 
+                       return r1_bh;
21487
 
+               r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
21488
 
+               if (r1_bh) {
21489
 
+                       memset(r1_bh, 0, sizeof(*r1_bh));
21490
 
+                       return r1_bh;
21491
 
+               }
21492
 
+               conf->freer1_blocked = 1;
21493
 
+               wait_disk_event(conf->wait_buffer,
21494
 
+                               !conf->freer1_blocked ||
21495
 
+                               conf->freer1_cnt > NR_RESERVED_BUFS/2
21496
 
+                       );
21497
 
+               conf->freer1_blocked = 0;
21498
 
+       } while (1);
21499
 
+}
21500
 
+
21501
 
+static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
21502
 
+{
21503
 
+       struct buffer_head *bh = r1_bh->mirror_bh_list;
21504
 
+       raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
21505
 
+
21506
 
+       r1_bh->mirror_bh_list = NULL;
21507
 
+
21508
 
+       if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
21509
 
+               unsigned long flags;
21510
 
+               spin_lock_irqsave(&conf->device_lock, flags);
21511
 
+               r1_bh->next_r1 = conf->freer1;
21512
 
+               conf->freer1 = r1_bh;
21513
 
+               conf->freer1_cnt++;
21514
 
+               spin_unlock_irqrestore(&conf->device_lock, flags);
21515
 
+               /* don't need to wakeup wait_buffer because
21516
 
+                *  raid1_free_bh below will do that
21517
 
+                */
21518
 
+       } else {
21519
 
+               kfree(r1_bh);
21520
 
+       }
21521
 
+       raid1_free_bh(conf, bh);
21522
 
+}
21523
 
+
21524
 
+static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
21525
 
+{
21526
 
+       int i = 0;
21527
 
+
21528
 
+       while (i < cnt) {
21529
 
+               struct raid1_bh *r1_bh;
21530
 
+               r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
21531
 
+               if (!r1_bh)
21532
 
+                       break;
21533
 
+               memset(r1_bh, 0, sizeof(*r1_bh));
21534
 
+               set_bit(R1BH_PreAlloc, &r1_bh->state);
21535
 
+               r1_bh->mddev = conf->mddev;
21536
 
+
21537
 
+               raid1_free_r1bh(r1_bh);
21538
 
+               i++;
21539
 
+       }
21540
 
+       return i;
21541
 
+}
21542
 
+
21543
 
+static void raid1_shrink_r1bh(raid1_conf_t *conf)
21544
 
+{
21545
 
+       md_spin_lock_irq(&conf->device_lock);
21546
 
+       while (conf->freer1) {
21547
 
+               struct raid1_bh *r1_bh = conf->freer1;
21548
 
+               conf->freer1 = r1_bh->next_r1;
21549
 
+               conf->freer1_cnt--;
21550
 
+               kfree(r1_bh);
21551
 
+       }
21552
 
+       md_spin_unlock_irq(&conf->device_lock);
21553
 
+}
21554
 
+
21555
 
+
21556
 
+
21557
 
+static inline void raid1_free_buf(struct raid1_bh *r1_bh)
21558
 
+{
21559
 
+       unsigned long flags;
21560
 
+       struct buffer_head *bh = r1_bh->mirror_bh_list;
21561
 
+       raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
21562
 
+       r1_bh->mirror_bh_list = NULL;
21563
 
+       
21564
 
+       spin_lock_irqsave(&conf->device_lock, flags);
21565
 
+       r1_bh->next_r1 = conf->freebuf;
21566
 
+       conf->freebuf = r1_bh;
21567
 
+       spin_unlock_irqrestore(&conf->device_lock, flags);
21568
 
+       raid1_free_bh(conf, bh);
21569
 
+}
21570
 
+
21571
 
+static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
21572
 
+{
21573
 
+       struct raid1_bh *r1_bh;
21574
 
+
21575
 
+       md_spin_lock_irq(&conf->device_lock);
21576
 
+       wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
21577
 
+       r1_bh = conf->freebuf;
21578
 
+       conf->freebuf = r1_bh->next_r1;
21579
 
+       r1_bh->next_r1= NULL;
21580
 
+       md_spin_unlock_irq(&conf->device_lock);
21581
 
+       memset(r1_bh->mirror_node_map, 0, sizeof(r1_bh->mirror_node_map));
21582
 
+       return r1_bh;
21583
 
+}
21584
 
+
21585
 
+static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
21586
 
+{
21587
 
+       int i = 0;
21588
 
+
21589
 
+       md_spin_lock_irq(&conf->device_lock);
21590
 
+       while (i < cnt) {
21591
 
+               struct raid1_bh *r1_bh;
21592
 
+               struct page *page;
21593
 
+
21594
 
+               page = alloc_page(GFP_KERNEL);
21595
 
+               if (!page)
21596
 
+                       break;
21597
 
+
21598
 
+               r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
21599
 
+               if (!r1_bh) {
21600
 
+                       __free_page(page);
21601
 
+                       break;
21602
 
+               }
21603
 
+               memset(r1_bh, 0, sizeof(*r1_bh));
21604
 
+               r1_bh->bh_req.b_page = page;
21605
 
+               r1_bh->bh_req.b_data = page_address(page);
21606
 
+               r1_bh->next_r1 = conf->freebuf;
21607
 
+               conf->freebuf = r1_bh;
21608
 
+               i++;
21609
 
+       }
21610
 
+       md_spin_unlock_irq(&conf->device_lock);
21611
 
+       return i;
21612
 
+}
21613
 
+
21614
 
+static void raid1_shrink_buffers (raid1_conf_t *conf)
21615
 
+{
21616
 
+       md_spin_lock_irq(&conf->device_lock);
21617
 
+       while (conf->freebuf) {
21618
 
+               struct raid1_bh *r1_bh = conf->freebuf;
21619
 
+               conf->freebuf = r1_bh->next_r1;
21620
 
+               __free_page(r1_bh->bh_req.b_page);
21621
 
+               kfree(r1_bh);
21622
 
+       }
21623
 
+       md_spin_unlock_irq(&conf->device_lock);
21624
 
+}
21625
 
+
21626
 
+/*
21627
 
+ * evms_raid1_map
21628
 
+ *     EVMS raid1 version of raid1_map()
21629
 
+ */
21630
 
+static int evms_raid1_map (mddev_t *mddev, evms_logical_node_t **node)
21631
 
+{
21632
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
21633
 
+       int i;
21634
 
+
21635
 
+       /*
21636
 
+        * Later we do read balancing on the read side 
21637
 
+        * now we use the first available disk.
21638
 
+        */
21639
 
+
21640
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
21641
 
+               if (conf->mirrors[i].operational) {
21642
 
+                       *node = conf->mirrors[i].node;
21643
 
+                       return (0);
21644
 
+               }
21645
 
+       }
21646
 
+
21647
 
+       LOG_ERROR("huh, no more operational devices?\n");
21648
 
+       return (-1);
21649
 
+}
21650
 
+
21651
 
+
21652
 
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
21653
 
+{
21654
 
+       unsigned long flags;
21655
 
+       mddev_t *mddev = r1_bh->mddev;
21656
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
21657
 
+
21658
 
+       md_spin_lock_irqsave(&retry_list_lock, flags);
21659
 
+       if (evms_raid1_retry_list == NULL)
21660
 
+               evms_raid1_retry_tail = &evms_raid1_retry_list;
21661
 
+       *evms_raid1_retry_tail = r1_bh;
21662
 
+       evms_raid1_retry_tail = &r1_bh->next_r1;
21663
 
+       r1_bh->next_r1 = NULL;
21664
 
+       md_spin_unlock_irqrestore(&retry_list_lock, flags);
21665
 
+       evms_cs_wakeup_thread(conf->thread);
21666
 
+}
21667
 
+
21668
 
+
21669
 
+static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
21670
 
+{
21671
 
+       unsigned long flags;
21672
 
+       spin_lock_irqsave(&conf->segment_lock, flags);
21673
 
+       if (sector < conf->start_active)
21674
 
+               conf->cnt_done--;
21675
 
+       else if (sector >= conf->start_future && conf->phase == phase)
21676
 
+               conf->cnt_future--;
21677
 
+       else if (!--conf->cnt_pending)
21678
 
+               wake_up(&conf->wait_ready);
21679
 
+
21680
 
+       spin_unlock_irqrestore(&conf->segment_lock, flags);
21681
 
+}
21682
 
+
21683
 
+static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
21684
 
+{
21685
 
+       unsigned long flags;
21686
 
+       spin_lock_irqsave(&conf->segment_lock, flags);
21687
 
+       if (sector >= conf->start_ready)
21688
 
+               --conf->cnt_ready;
21689
 
+       else if (sector >= conf->start_active) {
21690
 
+               if (!--conf->cnt_active) {
21691
 
+                       conf->start_active = conf->start_ready;
21692
 
+                       wake_up(&conf->wait_done);
21693
 
+               }
21694
 
+       }
21695
 
+       spin_unlock_irqrestore(&conf->segment_lock, flags);
21696
 
+}
21697
 
+
21698
 
+/*
21699
 
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
21700
 
+ * operation and are ready to return a success/failure code to the buffer
21701
 
+ * cache layer.
21702
 
+ */
21703
 
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
21704
 
+{
21705
 
+       struct buffer_head *bh = r1_bh->master_bh;
21706
 
+       unsigned long rsector = (unsigned long)r1_bh->eio.rsector;
21707
 
+
21708
 
+       //io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
21709
 
+       io_request_done(rsector, mddev_to_conf(r1_bh->mddev),
21710
 
+                       test_bit(R1BH_SyncPhase, &r1_bh->state));
21711
 
+
21712
 
+       bh->b_end_io(bh, uptodate);
21713
 
+       raid1_free_r1bh(r1_bh);
21714
 
+}
21715
 
+
21716
 
+void evms_raid1_end_request (struct buffer_head *bh, int uptodate)
21717
 
+{
21718
 
+       struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
21719
 
+
21720
 
+       /*
21721
 
+        * this branch is our 'one mirror IO has finished' event handler:
21722
 
+        */
21723
 
+       if (!uptodate) {
21724
 
+               if (r1_bh->node)
21725
 
+                       /* READ */
21726
 
+                       evms_md_error (r1_bh->mddev, r1_bh->node);
21727
 
+               else {  /* WRITE */
21728
 
+                       evms_logical_node_t *node;
21729
 
+                       node = bh_to_node(r1_bh,bh);
21730
 
+                       if (node)
21731
 
+                               evms_md_error (r1_bh->mddev, node);
21732
 
+               }
21733
 
+       } else
21734
 
+               /*
21735
 
+                * Set R1BH_Uptodate in our master buffer_head, so that
21736
 
+                * we will return a good error code for to the higher
21737
 
+                * levels even if IO on some other mirrored buffer fails.
21738
 
+                *
21739
 
+                * The 'master' represents the complex operation to 
21740
 
+                * user-side. So if something waits for IO, then it will
21741
 
+                * wait for the 'master' buffer_head.
21742
 
+                */
21743
 
+               set_bit (R1BH_Uptodate, &r1_bh->state);
21744
 
+
21745
 
+       /*
21746
 
+        * We split up the read and write side, imho they are 
21747
 
+        * conceptually different.
21748
 
+        */
21749
 
+
21750
 
+       if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
21751
 
+               /*
21752
 
+                * we have only one buffer_head on the read side
21753
 
+                */
21754
 
+               
21755
 
+               if (uptodate) {
21756
 
+                       raid1_end_bh_io(r1_bh, uptodate);
21757
 
+                       return;
21758
 
+               }
21759
 
+               /*
21760
 
+                * oops, read error:
21761
 
+                */
21762
 
+               LOG_ERROR("rescheduling block %lu\n", bh->b_blocknr);
21763
 
+               raid1_reschedule_retry(r1_bh);
21764
 
+               return;
21765
 
+       }
21766
 
+
21767
 
+       /*
21768
 
+        * WRITE:
21769
 
+        *
21770
 
+        * Let's see if all mirrored write operations have finished 
21771
 
+        * already.
21772
 
+        */
21773
 
+
21774
 
+       if (atomic_dec_and_test(&r1_bh->remaining))
21775
 
+               raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
21776
 
+}
21777
 
+
21778
 
+/*
21779
 
+ * This routine returns the disk from which the requested read should
21780
 
+ * be done. It bookkeeps the last read position for every disk
21781
 
+ * in array and when new read requests come, the disk which last
21782
 
+ * position is nearest to the request, is chosen.
21783
 
+ *
21784
 
+ * TODO: now if there are 2 mirrors in the same 2 devices, performance
21785
 
+ * degrades dramatically because position is mirror, not device based.
21786
 
+ * This should be changed to be device based. Also atomic sequential
21787
 
+ * reads should be somehow balanced.
21788
 
+ */
21789
 
+
21790
 
+//static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
21791
 
+static int raid1_read_balance (raid1_conf_t *conf, eio_t *eio)
21792
 
+{
21793
 
+       int new_disk = conf->last_used;
21794
 
+       //const int sectors = bh->b_size >> 9;
21795
 
+       const int sectors = (int)eio->rsize;
21796
 
+       //const unsigned long this_sector = bh->b_rsector;
21797
 
+       const unsigned long this_sector = (unsigned long)eio->rsector;
21798
 
+       int disk = new_disk;
21799
 
+       unsigned long new_distance;
21800
 
+       unsigned long current_distance;
21801
 
+       
21802
 
+       /*
21803
 
+        * Check if it is sane at all to balance
21804
 
+        */
21805
 
+       
21806
 
+       if (conf->resync_mirrors)
21807
 
+               goto rb_out;
21808
 
+       
21809
 
+
21810
 
+       /* make sure that disk is operational */
21811
 
+       while( !conf->mirrors[new_disk].operational) {
21812
 
+               if (new_disk <= 0) new_disk = conf->raid_disks;
21813
 
+               new_disk--;
21814
 
+               if (new_disk == disk) {
21815
 
+                       /*
21816
 
+                        * This means no working disk was found
21817
 
+                        * Nothing much to do, lets not change anything
21818
 
+                        * and hope for the best...
21819
 
+                        */
21820
 
+                       
21821
 
+                       new_disk = conf->last_used;
21822
 
+
21823
 
+                       goto rb_out;
21824
 
+               }
21825
 
+       }
21826
 
+       disk = new_disk;
21827
 
+       /* now disk == new_disk == starting point for search */
21828
 
+       
21829
 
+       /*
21830
 
+        * Don't touch anything for sequential reads.
21831
 
+        */
21832
 
+
21833
 
+       if (this_sector == conf->mirrors[new_disk].head_position)
21834
 
+               goto rb_out;
21835
 
+       
21836
 
+       /*
21837
 
+        * If reads have been done only on a single disk
21838
 
+        * for a time, lets give another disk a change.
21839
 
+        * This is for kicking those idling disks so that
21840
 
+        * they would find work near some hotspot.
21841
 
+        */
21842
 
+       
21843
 
+       if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
21844
 
+               conf->sect_count = 0;
21845
 
+
21846
 
+               do {
21847
 
+                       if (new_disk<=0)
21848
 
+                               new_disk = conf->raid_disks;
21849
 
+                       new_disk--;
21850
 
+                       if (new_disk == disk)
21851
 
+                               break;
21852
 
+               } while ((conf->mirrors[new_disk].write_only) ||
21853
 
+                        (!conf->mirrors[new_disk].operational));
21854
 
+
21855
 
+               goto rb_out;
21856
 
+       }
21857
 
+       
21858
 
+       current_distance = abs(this_sector -
21859
 
+                               conf->mirrors[disk].head_position);
21860
 
+       
21861
 
+       /* Find the disk which is closest */
21862
 
+       
21863
 
+       do {
21864
 
+               if (disk <= 0)
21865
 
+                       disk = conf->raid_disks;
21866
 
+               disk--;
21867
 
+               
21868
 
+               if ((conf->mirrors[disk].write_only) ||
21869
 
+                               (!conf->mirrors[disk].operational))
21870
 
+                       continue;
21871
 
+               
21872
 
+               new_distance = abs(this_sector -
21873
 
+                                       conf->mirrors[disk].head_position);
21874
 
+               
21875
 
+               if (new_distance < current_distance) {
21876
 
+                       conf->sect_count = 0;
21877
 
+                       current_distance = new_distance;
21878
 
+                       new_disk = disk;
21879
 
+               }
21880
 
+       } while (disk != conf->last_used);
21881
 
+
21882
 
+rb_out:
21883
 
+       conf->mirrors[new_disk].head_position = this_sector + sectors;
21884
 
+
21885
 
+       conf->last_used = new_disk;
21886
 
+       conf->sect_count += sectors;
21887
 
+
21888
 
+       return new_disk;
21889
 
+}
21890
 
+
21891
 
+
21892
 
+static int raid1_init_io(mddev_t *mddev,
21893
 
+                        int rw,
21894
 
+                        evms_sector_t LSN,
21895
 
+                        evms_sector_t nr_sects,
21896
 
+                        void *data)
21897
 
+{
21898
 
+       int rc = 0;
21899
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
21900
 
+       struct mirror_info *mirror;
21901
 
+
21902
 
+       LOG_EXTRA(__FUNCTION__ " LSN=%Lu, nr_sects=%Lu\n", LSN, nr_sects);
21903
 
+
21904
 
+       if (rw == READ) {
21905
 
+               /*
21906
 
+                * read balancing logic:
21907
 
+                */
21908
 
+               eio_t eio;
21909
 
+               eio.rsector = LSN;
21910
 
+               eio.rsize = nr_sects;
21911
 
+               mirror = conf->mirrors + raid1_read_balance(conf, &eio);
21912
 
+
21913
 
+               return INIT_IO(mirror->node, rw, LSN, nr_sects, data);
21914
 
+       } else {
21915
 
+               int i;
21916
 
+               int saved_rc = 0;
21917
 
+               for (i=0; i< MD_SB_DISKS; i++) {
21918
 
+                       if (!conf->mirrors[i].operational)
21919
 
+                               continue;
21920
 
+                       rc = INIT_IO(conf->mirrors[i].node, rw, LSN, nr_sects, data);
21921
 
+                       if (rc) {
21922
 
+                               LOG_ERROR(__FUNCTION__ " WRITE failed on %s, rc=%d\n",
21923
 
+                                          conf->mirrors[i].node->name, rc);
21924
 
+                               saved_rc = rc;
21925
 
+                       }
21926
 
+               }
21927
 
+               if (saved_rc)
21928
 
+                       rc = saved_rc;
21929
 
+       }
21930
 
+       return rc;
21931
 
+}
21932
 
+
21933
 
+
21934
 
+static int raid1_make_request (mddev_t *mddev,
21935
 
+                              int rw,
21936
 
+                              eio_t *eio)
21937
 
+{
21938
 
+       struct buffer_head *bh = eio->bh;
21939
 
+       unsigned long rsector = (unsigned long)eio->rsector;
21940
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
21941
 
+       struct buffer_head *bh_req;
21942
 
+       struct raid1_bh * r1_bh;
21943
 
+       int disks = MD_SB_DISKS;
21944
 
+       struct buffer_head *bhl;
21945
 
+       int i, sum_bhs = 0;
21946
 
+       struct mirror_info *mirror;
21947
 
+
21948
 
+       if (!buffer_locked(bh))
21949
 
+               BUG();
21950
 
+       
21951
 
+/*
21952
 
+ * make_request() can abort the operation when READA is being
21953
 
+ * used and no empty request is available.
21954
 
+ *
21955
 
+ * Currently, just replace the command with READ/WRITE.
21956
 
+ */
21957
 
+       if (rw == READA)
21958
 
+               rw = READ;
21959
 
+
21960
 
+       r1_bh = raid1_alloc_r1bh (conf);
21961
 
+
21962
 
+       spin_lock_irq(&conf->segment_lock);
21963
 
+       wait_event_lock_irq(conf->wait_done,
21964
 
+                       rsector < conf->start_active ||
21965
 
+                       rsector >= conf->start_future,
21966
 
+                       conf->segment_lock);
21967
 
+       if (rsector < conf->start_active) 
21968
 
+               conf->cnt_done++;
21969
 
+       else {
21970
 
+               conf->cnt_future++;
21971
 
+               if (conf->phase)
21972
 
+                       set_bit(R1BH_SyncPhase, &r1_bh->state);
21973
 
+       }
21974
 
+       spin_unlock_irq(&conf->segment_lock);
21975
 
+       
21976
 
+       /*
21977
 
+        * i think the read and write branch should be separated completely,
21978
 
+        * since we want to do read balancing on the read side for example.
21979
 
+        * Alternative implementations? :) --mingo
21980
 
+        */
21981
 
+
21982
 
+       r1_bh->master_bh = bh;
21983
 
+       r1_bh->mddev = mddev;
21984
 
+       r1_bh->cmd = rw;
21985
 
+
21986
 
+       if (rw == READ) {
21987
 
+               /*
21988
 
+                * read balancing logic:
21989
 
+                */
21990
 
+               //mirror = conf->mirrors + raid1_read_balance(conf, bh);
21991
 
+               mirror = conf->mirrors + raid1_read_balance(conf, eio);
21992
 
+
21993
 
+               bh_req = &r1_bh->bh_req;
21994
 
+               memcpy(bh_req, bh, sizeof(*bh));
21995
 
+               bh_req->b_blocknr = rsector;
21996
 
+               bh_req->b_dev = mirror->dev;
21997
 
+               bh_req->b_rdev = mirror->dev;
21998
 
+       /*      bh_req->b_rsector = bh->n_rsector; */
21999
 
+               bh_req->b_end_io = evms_raid1_end_request;
22000
 
+               bh_req->b_private = r1_bh;
22001
 
+               //generic_make_request (rw, bh_req);
22002
 
+               eio->bh = bh_req;
22003
 
+               r1_bh->node = mirror->node;
22004
 
+               r1_bh->eio = *eio;
22005
 
+               R_IO(mirror->node, eio);
22006
 
+               return 0;
22007
 
+       }
22008
 
+
22009
 
+       /*
22010
 
+        * WRITE:
22011
 
+        */
22012
 
+
22013
 
+       bhl = raid1_alloc_bh(conf, conf->raid_disks);
22014
 
+       r1_bh->node = NULL;
22015
 
+       r1_bh->eio = *eio;
22016
 
+       for (i = 0; i < disks; i++) {
22017
 
+               struct buffer_head *mbh;
22018
 
+               if (!conf->mirrors[i].operational) 
22019
 
+                       continue;
22020
 
22021
 
+       /*
22022
 
+        * We should use a private pool (size depending on NR_REQUEST),
22023
 
+        * to avoid writes filling up the memory with bhs
22024
 
+        *
22025
 
+        * Such pools are much faster than kmalloc anyways (so we waste
22026
 
+        * almost nothing by not using the master bh when writing and
22027
 
+        * win alot of cleanness) but for now we are cool enough. --mingo
22028
 
+        *
22029
 
+        * It's safe to sleep here, buffer heads cannot be used in a shared
22030
 
+        * manner in the write branch. Look how we lock the buffer at the
22031
 
+        * beginning of this function to grok the difference ;)
22032
 
+        */
22033
 
+               mbh = bhl;
22034
 
+               if (mbh == NULL) {
22035
 
+                       MD_BUG();
22036
 
+                       break;
22037
 
+               }
22038
 
+               bhl = mbh->b_next;
22039
 
+               mbh->b_next = NULL;
22040
 
+               mbh->b_this_page = (struct buffer_head *)1;
22041
 
+               
22042
 
+       /*
22043
 
+        * prepare mirrored mbh (fields ordered for max mem throughput):
22044
 
+        */
22045
 
+               mbh->b_blocknr    = rsector;
22046
 
+               mbh->b_dev        = conf->mirrors[i].dev;
22047
 
+               mbh->b_rdev       = conf->mirrors[i].dev;
22048
 
+               mbh->b_rsector    = rsector;
22049
 
+               mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
22050
 
+                                               (1<<BH_Mapped) | (1<<BH_Lock);
22051
 
+
22052
 
+               atomic_set(&mbh->b_count, 1);
22053
 
+               mbh->b_size       = bh->b_size;
22054
 
+               mbh->b_page       = bh->b_page;
22055
 
+               mbh->b_data       = bh->b_data;
22056
 
+               mbh->b_list       = BUF_LOCKED;
22057
 
+               mbh->b_end_io     = evms_raid1_end_request;
22058
 
+               //mbh->b_private    = r1_bh;
22059
 
+               mbh->b_private    = conf->mirrors[i].node;
22060
 
+
22061
 
+               mbh->b_next = r1_bh->mirror_bh_list;
22062
 
+               r1_bh->mirror_bh_list = mbh;
22063
 
+               sum_bhs++;
22064
 
+       }
22065
 
+       if (bhl) raid1_free_bh(conf,bhl);
22066
 
+       if (!sum_bhs) {
22067
 
+               /* Gag - all mirrors non-operational.. */
22068
 
+               raid1_end_bh_io(r1_bh, 0);
22069
 
+               return 0;
22070
 
+       }
22071
 
+       md_atomic_set(&r1_bh->remaining, sum_bhs);
22072
 
+
22073
 
+       /*
22074
 
+        * We have to be a bit careful about the semaphore above, thats
22075
 
+        * why we start the requests separately. Since kmalloc() could
22076
 
+        * fail, sleep and make_request() can sleep too, this is the
22077
 
+        * safer solution. Imagine, end_request decreasing the semaphore
22078
 
+        * before we could have set it up ... We could play tricks with
22079
 
+        * the semaphore (presetting it and correcting at the end if
22080
 
+        * sum_bhs is not 'n' but we have to do end_request by hand if
22081
 
+        * all requests finish until we had a chance to set up the
22082
 
+        * semaphore correctly ... lots of races).
22083
 
+        */
22084
 
+       bh = r1_bh->mirror_bh_list;
22085
 
+       while(bh) {
22086
 
+               evms_logical_node_t *node;
22087
 
+               eio_t this_eio;
22088
 
+               struct buffer_head *bh2 = bh;
22089
 
+
22090
 
+               bh = bh->b_next;
22091
 
+               node = (evms_logical_node_t *)bh2->b_private;
22092
 
+               bh2->b_private = r1_bh;
22093
 
+               this_eio = r1_bh->eio;
22094
 
+               this_eio.bh = bh2;
22095
 
+               add_node_mapping(r1_bh, node, bh2);
22096
 
+               W_IO(node, &this_eio);
22097
 
+               //generic_make_request(rw, bh2);
22098
 
+       }
22099
 
+
22100
 
+       return (0);
22101
 
+}
22102
 
+
22103
 
+static int raid1_status (char *page, mddev_t *mddev)
22104
 
+{
22105
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
22106
 
+       int sz = 0, i;
22107
 
+       
22108
 
+       sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
22109
 
+                                                conf->working_disks);
22110
 
+       for (i = 0; i < conf->raid_disks; i++)
22111
 
+               sz += sprintf (page+sz, "%s",
22112
 
+                       conf->mirrors[i].operational ? "U" : "_");
22113
 
+       sz += sprintf (page+sz, "]");
22114
 
+       return sz;
22115
 
+}
22116
 
+
22117
 
+#define LAST_DISK KERN_ALERT \
22118
 
+"EVMS raid1: only one disk left and IO error.\n"
22119
 
+
22120
 
+#define NO_SPARE_DISK KERN_ALERT \
22121
 
+"EVMS raid1: no spare disk left, degrading mirror level by one.\n"
22122
 
+
22123
 
+#define DISK_FAILED KERN_ALERT \
22124
 
+"EVMS raid1: Disk failure on %s, disabling device. \n" \
22125
 
+"      Operation continuing on %d devices\n"
22126
 
+
22127
 
+#define START_SYNCING KERN_ALERT \
22128
 
+"EVMS raid1: start syncing spare disk.\n"
22129
 
+
22130
 
+#define ALREADY_SYNCING KERN_INFO \
22131
 
+"EVMS raid1: syncing already in progress.\n"
22132
 
+
22133
 
+static void mark_disk_bad (mddev_t *mddev, int failed)
22134
 
+{
22135
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
22136
 
+       struct mirror_info *mirror = conf->mirrors+failed;
22137
 
+       mdp_super_t *sb = mddev->sb;
22138
 
+
22139
 
+       mirror->operational = 0;
22140
 
+       mark_disk_faulty(sb->disks+mirror->number);
22141
 
+       mark_disk_nonsync(sb->disks+mirror->number);
22142
 
+       mark_disk_inactive(sb->disks+mirror->number);
22143
 
+       if (!mirror->write_only)
22144
 
+               sb->active_disks--;
22145
 
+       sb->working_disks--;
22146
 
+       sb->failed_disks++;
22147
 
+       mddev->sb_dirty = 1;
22148
 
+       evms_cs_wakeup_thread(conf->thread);
22149
 
+       if (!mirror->write_only)
22150
 
+               conf->working_disks--;
22151
 
+       LOG_SERIOUS(DISK_FAILED, evms_md_partition_name(mirror->node),conf->working_disks);
22152
 
+}
22153
 
+
22154
 
+static int raid1_error (
22155
 
+       mddev_t *mddev,
22156
 
+       evms_logical_node_t *node)
22157
 
+{
22158
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
22159
 
+       struct mirror_info * mirrors = conf->mirrors;
22160
 
+       int disks = MD_SB_DISKS;
22161
 
+       int i;
22162
 
+
22163
 
+       /* Find the drive.
22164
 
+        * If it is not operational, then we have already marked it as dead
22165
 
+        * else if it is the last working disks, ignore the error, let the
22166
 
+        * next level up know.
22167
 
+        * else mark the drive as failed
22168
 
+        */
22169
 
+
22170
 
+       for (i = 0; i < disks; i++)
22171
 
+               if (mirrors[i].node==node && mirrors[i].operational)
22172
 
+                       break;
22173
 
+       if (i == disks)
22174
 
+               return 0;
22175
 
+
22176
 
+       if (i < conf->raid_disks && conf->working_disks == 1) {
22177
 
+               /* Don't fail the drive, act as though we were just a
22178
 
+                * normal single drive
22179
 
+                */
22180
 
+
22181
 
+               return 1;
22182
 
+       }
22183
 
+       mark_disk_bad(mddev, i);
22184
 
+       return 0;
22185
 
+}
22186
 
+
22187
 
+#undef LAST_DISK
22188
 
+#undef NO_SPARE_DISK
22189
 
+#undef DISK_FAILED
22190
 
+#undef START_SYNCING
22191
 
+
22192
 
+
22193
 
+static void print_raid1_conf (raid1_conf_t *conf)
22194
 
+{
22195
 
+       int i;
22196
 
+       struct mirror_info *tmp;
22197
 
+
22198
 
+       LOG_DEFAULT("RAID1 conf printout:\n");
22199
 
+       if (!conf) {
22200
 
+               LOG_DEFAULT("(conf==NULL)\n");
22201
 
+               return;
22202
 
+       }
22203
 
+       LOG_DEFAULT(" --- wd:%d rd:%d nd:%d\n",
22204
 
+               conf->working_disks,conf->raid_disks, conf->nr_disks);
22205
 
+
22206
 
+       for (i = 0; i < conf->nr_disks; i++) {
22207
 
+               tmp = conf->mirrors + i;
22208
 
+               LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
22209
 
+                          i, tmp->spare,tmp->operational,
22210
 
+                          tmp->number,tmp->raid_disk,tmp->used_slot,
22211
 
+                          evms_md_partition_name(tmp->node));
22212
 
+       }
22213
 
+}
22214
 
+
22215
 
+static void close_sync(raid1_conf_t *conf)
22216
 
+{
22217
 
+       mddev_t *mddev = conf->mddev;
22218
 
+       /* If reconstruction was interrupted, we need to close the "active" and "pending"
22219
 
+        * holes.
22220
 
+        * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
22221
 
+        */
22222
 
+       /* this is really needed when recovery stops too... */
22223
 
+       spin_lock_irq(&conf->segment_lock);
22224
 
+       conf->start_active = conf->start_pending;
22225
 
+       conf->start_ready = conf->start_pending;
22226
 
+       wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
22227
 
+       conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
22228
 
+       conf->start_future = mddev->sb->size+1;
22229
 
+       conf->cnt_pending = conf->cnt_future;
22230
 
+       conf->cnt_future = 0;
22231
 
+       conf->phase = conf->phase ^1;
22232
 
+       wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
22233
 
+       conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
22234
 
+       conf->phase = 0;
22235
 
+       conf->cnt_future = conf->cnt_done;;
22236
 
+       conf->cnt_done = 0;
22237
 
+       spin_unlock_irq(&conf->segment_lock);
22238
 
+       wake_up(&conf->wait_done);
22239
 
+}
22240
 
+
22241
 
+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
22242
 
+{
22243
 
+       int err = 0;
22244
 
+       int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
22245
 
+       raid1_conf_t *conf = mddev->private;
22246
 
+       struct mirror_info *tmp, *sdisk, *fdisk, *rdisk;
22247
 
+       mdp_super_t *sb = mddev->sb;
22248
 
+       mdp_disk_t *failed_desc, *spare_desc;
22249
 
+       mdk_rdev_t *spare_rdev, *failed_rdev;
22250
 
+
22251
 
+       print_raid1_conf(conf);
22252
 
+       md_spin_lock_irq(&conf->device_lock);
22253
 
+       /*
22254
 
+        * find the disk ...
22255
 
+        */
22256
 
+       switch (state) {
22257
 
+
22258
 
+       case DISKOP_SPARE_ACTIVE:
22259
 
+
22260
 
+               /*
22261
 
+                * Find the failed disk within the RAID1 configuration ...
22262
 
+                * (this can only be in the first conf->working_disks part)
22263
 
+                */
22264
 
+               for (i = 0; i < conf->raid_disks; i++) {
22265
 
+                       tmp = conf->mirrors + i;
22266
 
+                       if ((!tmp->operational && !tmp->spare) ||
22267
 
+                                       !tmp->used_slot) {
22268
 
+                               failed_disk = i;
22269
 
+                               break;
22270
 
+                       }
22271
 
+               }
22272
 
+               /*
22273
 
+                * When we activate a spare disk we _must_ have a disk in
22274
 
+                * the lower (active) part of the array to replace. 
22275
 
+                */
22276
 
+/*             if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
22277
 
+                       MD_BUG();
22278
 
+                       err = 1;
22279
 
+                       goto abort;
22280
 
+               }
22281
 
+  */           /* fall through */
22282
 
+
22283
 
+       case DISKOP_HOT_SPARE_ACTIVE:
22284
 
+       case DISKOP_SPARE_WRITE:
22285
 
+       case DISKOP_SPARE_INACTIVE:
22286
 
+
22287
 
+               /*
22288
 
+                * Find the spare disk ... (can only be in the 'high'
22289
 
+                * area of the array)
22290
 
+                ##### Actually it can be sooner now that we have improved MD #####
22291
 
+                This support required for expanding number of active mirrors.
22292
 
+                */
22293
 
+               for (i = 0; i < MD_SB_DISKS; i++) {
22294
 
+                       tmp = conf->mirrors + i;
22295
 
+                       if (tmp->spare && tmp->number == (*d)->number) {
22296
 
+                               spare_disk = i;
22297
 
+                               break;
22298
 
+                       }
22299
 
+               }
22300
 
+               if (spare_disk == -1) {
22301
 
+                       MD_BUG();
22302
 
+                       err = 1;
22303
 
+                       goto abort;
22304
 
+               }
22305
 
+               break;
22306
 
+
22307
 
+       case DISKOP_HOT_REMOVE_SPARE:
22308
 
+
22309
 
+               for (i = 0; i < MD_SB_DISKS; i++) {
22310
 
+                       tmp = conf->mirrors + i;
22311
 
+                       if (tmp->used_slot && (tmp->number == (*d)->number)) {
22312
 
+                               if (tmp->operational) {
22313
 
+                                       err = -EBUSY;
22314
 
+                                       goto abort;
22315
 
+                               } else if (!tmp->spare){
22316
 
+                                       MD_BUG();
22317
 
+                                       err = 1;
22318
 
+                                       goto abort;
22319
 
+                               }
22320
 
+                               removed_disk = i;
22321
 
+                               break;
22322
 
+                       }
22323
 
+               }
22324
 
+               if (removed_disk == -1) {
22325
 
+                       MD_BUG();
22326
 
+                       err = 1;
22327
 
+                       goto abort;
22328
 
+               }
22329
 
+               break;
22330
 
+       
22331
 
+       case DISKOP_HOT_REMOVE_DISK:
22332
 
+               if (conf->working_disks <= 1) {
22333
 
+                       err = -EBUSY;
22334
 
+                       goto abort;
22335
 
+               }
22336
 
+               for (i = 0; i < MD_SB_DISKS; i++) {
22337
 
+                       tmp = conf->mirrors + i;
22338
 
+                       if (tmp->used_slot && (tmp->number == (*d)->number)) {
22339
 
+                               removed_disk = i;
22340
 
+                               break;
22341
 
+                       }
22342
 
+               }
22343
 
+               if (removed_disk == -1) {
22344
 
+                       MD_BUG();
22345
 
+                       err = 1;
22346
 
+                       goto abort;
22347
 
+               }
22348
 
+               break;
22349
 
+
22350
 
+       case DISKOP_HOT_ADD_DISK:
22351
 
+               err = -ENOSYS;
22352
 
+               goto abort;
22353
 
+               break;
22354
 
+       }
22355
 
+
22356
 
+       switch (state) {
22357
 
+       /*
22358
 
+        * Switch the spare disk to write-only mode:
22359
 
+        */
22360
 
+       case DISKOP_SPARE_WRITE:
22361
 
+               sdisk = conf->mirrors + spare_disk;
22362
 
+               sdisk->operational = 1;
22363
 
+               sdisk->write_only = 1;
22364
 
+               break;
22365
 
+       /*
22366
 
+        * Deactivate a spare disk:
22367
 
+        */
22368
 
+       case DISKOP_SPARE_INACTIVE:
22369
 
+               close_sync(conf);
22370
 
+               sdisk = conf->mirrors + spare_disk;
22371
 
+               sdisk->operational = 0;
22372
 
+               sdisk->write_only = 0;
22373
 
+               break;
22374
 
+       /*
22375
 
+        * Activate (mark read-write) the (now sync) spare disk,
22376
 
+        * which means we switch it's 'raid position' (->raid_disk)
22377
 
+        * with the failed disk. (only the first 'conf->nr_disks'
22378
 
+        * slots are used for 'real' disks and we must preserve this
22379
 
+        * property)
22380
 
+        */
22381
 
+       case DISKOP_SPARE_ACTIVE:
22382
 
+               close_sync(conf);
22383
 
+               sdisk = conf->mirrors + spare_disk;
22384
 
+               if (failed_disk < 0) {
22385
 
+                       // preset failed disk to itself if no failed disk.
22386
 
+                       failed_disk = spare_disk;  
22387
 
+                       // try to find spare earlier in array
22388
 
+                       for (i = conf->raid_disks; i < spare_disk; i++) {
22389
 
+                               tmp = conf->mirrors + i;
22390
 
+                               if ((tmp->spare) || !tmp->used_slot) {
22391
 
+                                       failed_disk = i;
22392
 
+                                       break;
22393
 
+                               }
22394
 
+                       }
22395
 
+               }
22396
 
+               fdisk = conf->mirrors + failed_disk;
22397
 
+
22398
 
+               spare_desc = &sb->disks[sdisk->number];
22399
 
+               failed_desc = &sb->disks[fdisk->number];
22400
 
+
22401
 
+               if (spare_desc != *d) {
22402
 
+                       MD_BUG();
22403
 
+                       err = 1;
22404
 
+                       goto abort;
22405
 
+               }
22406
 
+
22407
 
+               if (spare_desc->raid_disk != sdisk->raid_disk) {
22408
 
+                       MD_BUG();
22409
 
+                       err = 1;
22410
 
+                       goto abort;
22411
 
+               }
22412
 
+                       
22413
 
+               if (sdisk->raid_disk != spare_disk) {
22414
 
+                       MD_BUG();
22415
 
+                       err = 1;
22416
 
+                       goto abort;
22417
 
+               }
22418
 
+
22419
 
+               if (failed_desc->raid_disk != fdisk->raid_disk) {
22420
 
+                       MD_BUG();
22421
 
+                       err = 1;
22422
 
+                       goto abort;
22423
 
+               }
22424
 
+
22425
 
+               if (fdisk->raid_disk != failed_disk) {
22426
 
+                       MD_BUG();
22427
 
+                       err = 1;
22428
 
+                       goto abort;
22429
 
+               }
22430
 
+
22431
 
+               /*
22432
 
+                * do the switch finally
22433
 
+                */
22434
 
+               spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
22435
 
+               failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
22436
 
+
22437
 
+               /* There must be a spare_rdev, but there may not be a
22438
 
+                * failed_rdev.  That slot might be empty...
22439
 
+                */
22440
 
+               spare_rdev->desc_nr = failed_desc->number;
22441
 
+               if (failed_rdev)
22442
 
+                       failed_rdev->desc_nr = spare_desc->number;
22443
 
+               
22444
 
+               xchg_values(*spare_desc, *failed_desc);
22445
 
+               xchg_values(*fdisk, *sdisk);
22446
 
+
22447
 
+               /*
22448
 
+                * (careful, 'failed' and 'spare' are switched from now on)
22449
 
+                *
22450
 
+                * we want to preserve linear numbering and we want to
22451
 
+                * give the proper raid_disk number to the now activated
22452
 
+                * disk. (this means we switch back these values)
22453
 
+                */
22454
 
+       
22455
 
+               xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
22456
 
+               xchg_values(sdisk->raid_disk, fdisk->raid_disk);
22457
 
+               xchg_values(spare_desc->number, failed_desc->number);
22458
 
+               xchg_values(sdisk->number, fdisk->number);
22459
 
+
22460
 
+               *d = failed_desc;
22461
 
+
22462
 
+               if (sdisk->dev == MKDEV(0,0))
22463
 
+                       sdisk->used_slot = 0;
22464
 
+               /*
22465
 
+                * this really activates the spare.
22466
 
+                */
22467
 
+               fdisk->spare = 0;
22468
 
+               fdisk->write_only = 0;
22469
 
+
22470
 
+               /*
22471
 
+                * if we activate a spare, we definitely replace a
22472
 
+                * non-operational disk slot in the 'low' area of
22473
 
+                * the disk array.
22474
 
+                */
22475
 
+
22476
 
+               conf->working_disks++;
22477
 
+
22478
 
+               break;
22479
 
+
22480
 
+       /* Activate a spare disk without a failed disk */
22481
 
+       case DISKOP_HOT_SPARE_ACTIVE:
22482
 
+               sdisk = conf->mirrors + spare_disk;
22483
 
+               sdisk->spare = 0;
22484
 
+               sdisk->write_only = 0;
22485
 
+               conf->working_disks++;
22486
 
+               conf->raid_disks++;
22487
 
+               if (raid1_grow_bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) 
22488
 
+                       LOG_WARNING("%s: Cannot grow BH pool\n", __FUNCTION__);
22489
 
+               break;
22490
 
+
22491
 
+       case DISKOP_HOT_REMOVE_SPARE:
22492
 
+               rdisk = conf->mirrors + removed_disk;
22493
 
+
22494
 
+               if (removed_disk < conf->raid_disks) {
22495
 
+                       MD_BUG();
22496
 
+                       err = 1;
22497
 
+                       goto abort;
22498
 
+               }
22499
 
+
22500
 
+               LOG_WARNING("%s: removing spare %s, [md%d] nr_disks=%d\n", 
22501
 
+                           __FUNCTION__, evms_md_partition_name(rdisk->node), 
22502
 
+                           conf->mddev->__minor, conf->nr_disks-1);
22503
 
+
22504
 
+               rdisk->dev = MKDEV(0,0);
22505
 
+               rdisk->node = NULL;
22506
 
+               rdisk->used_slot = 0;
22507
 
+               conf->nr_disks--;
22508
 
+               break;
22509
 
+       
22510
 
+       case DISKOP_HOT_REMOVE_DISK:
22511
 
+               rdisk = conf->mirrors + removed_disk;
22512
 
+
22513
 
+               LOG_WARNING("%s: removing active disk %s, [md%d] nr_disks=%d\n", 
22514
 
+                           __FUNCTION__, evms_md_partition_name(rdisk->node), 
22515
 
+                           conf->mddev->__minor, conf->nr_disks-1);
22516
 
+
22517
 
+               rdisk->dev = MKDEV(0,0);
22518
 
+               rdisk->node = NULL;
22519
 
+               rdisk->used_slot = 0;
22520
 
+               rdisk->operational = 0;
22521
 
+               conf->working_disks--;
22522
 
+               conf->nr_disks--;
22523
 
+               sb->raid_disks--;       //decrement raid disks.  md_core now increments
22524
 
+                                       //when activating new spare, don't assume add spare here
22525
 
+               break;
22526
 
+       default:
22527
 
+               MD_BUG();       
22528
 
+               err = 1;
22529
 
+               goto abort;
22530
 
+       }
22531
 
+abort:
22532
 
+       md_spin_unlock_irq(&conf->device_lock);
22533
 
+       if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
22534
 
+               /* should move to "END_REBUILD" when such exists */
22535
 
+               raid1_shrink_buffers(conf);
22536
 
+
22537
 
+       print_raid1_conf(conf);
22538
 
+       return err;
22539
 
+}
22540
 
+
22541
 
+
22542
 
+#define IO_ERROR KERN_ALERT \
22543
 
+"EVMS raid1: %s: unrecoverable I/O read error for block %lu\n"
22544
 
+
22545
 
+#define REDIRECT_SECTOR KERN_ERR \
22546
 
+"EVMS raid1: %s: redirecting sector %lu to another mirror\n"
22547
 
+
22548
 
+/*
22549
 
+ * This is a kernel thread which:
22550
 
+ *
22551
 
+ *     1.      Retries failed read operations on working mirrors.
22552
 
+ *     2.      Updates the raid superblock when problems encounter.
22553
 
+ *     3.      Performs writes following reads for array syncronising.
22554
 
+ */
22555
 
+static void end_sync_write(struct buffer_head *bh, int uptodate);
22556
 
+static void end_sync_read(struct buffer_head *bh, int uptodate);
22557
 
+
22558
 
+static void raid1d (void *data)
22559
 
+{
22560
 
+       struct raid1_bh *r1_bh;
22561
 
+       struct buffer_head *bh;
22562
 
+       unsigned long flags;
22563
 
+       mddev_t *mddev;
22564
 
+#ifdef ORG_RAID1_CODE
22565
 
+       kdev_t dev;
22566
 
+#endif
22567
 
+
22568
 
+       for (;;) {
22569
 
+               md_spin_lock_irqsave(&retry_list_lock, flags);
22570
 
+               r1_bh = evms_raid1_retry_list;
22571
 
+               if (!r1_bh)
22572
 
+                       break;
22573
 
+               evms_raid1_retry_list = r1_bh->next_r1;
22574
 
+               md_spin_unlock_irqrestore(&retry_list_lock, flags);
22575
 
+
22576
 
+               mddev = r1_bh->mddev;
22577
 
+               if (mddev->sb_dirty) {
22578
 
+                       LOG_DEFAULT("EVMS raid1: dirty sb detected, updating.\n");
22579
 
+                       mddev->sb_dirty = 0;
22580
 
+                       evms_md_update_sb(mddev);
22581
 
+               }
22582
 
+               bh = &r1_bh->bh_req;
22583
 
+               switch(r1_bh->cmd) {
22584
 
+               case SPECIAL:
22585
 
+                       /* have to allocate lots of bh structures and
22586
 
+                        * schedule writes
22587
 
+                        */
22588
 
+                       if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
22589
 
+                               int i, sum_bhs = 0;
22590
 
+                               int disks = MD_SB_DISKS;
22591
 
+                               struct buffer_head *bhl, *mbh;
22592
 
+                               raid1_conf_t *conf;
22593
 
+                               
22594
 
+                               conf = mddev_to_conf(mddev);
22595
 
+                               bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
22596
 
+                               for (i = 0; i < disks ; i++) {
22597
 
+                                       if (!conf->mirrors[i].operational)
22598
 
+                                               continue;
22599
 
+                                       if (i==conf->last_used)
22600
 
+                                               /* we read from here, no need to write */
22601
 
+                                               continue;
22602
 
+                                       if (i < conf->raid_disks
22603
 
+                                           && !conf->resync_mirrors
22604
 
+                                           && !conf->mirrors[i].write_only)
22605
 
+                                               /* don't need to write this,
22606
 
+                                                * we are just rebuilding */
22607
 
+                                               continue;
22608
 
+                                       mbh = bhl;
22609
 
+                                       if (!mbh) {
22610
 
+                                               MD_BUG();
22611
 
+                                               break;
22612
 
+                                       }
22613
 
+                                       bhl = mbh->b_next;
22614
 
+                                       mbh->b_this_page = (struct buffer_head *)1;
22615
 
+
22616
 
+                                               
22617
 
+                               /*
22618
 
+                                * prepare mirrored bh (fields ordered for max mem throughput):
22619
 
+                                */
22620
 
+                                       mbh->b_blocknr    = bh->b_blocknr;
22621
 
+                                       mbh->b_dev        = conf->mirrors[i].dev;
22622
 
+                                       mbh->b_rdev       = conf->mirrors[i].dev;
22623
 
+                                       mbh->b_rsector    = bh->b_blocknr;
22624
 
+                                       mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
22625
 
+                                               (1<<BH_Mapped) | (1<<BH_Lock);
22626
 
+                                       atomic_set(&mbh->b_count, 1);
22627
 
+                                       mbh->b_size       = bh->b_size;
22628
 
+                                       mbh->b_page       = bh->b_page;
22629
 
+                                       mbh->b_data       = bh->b_data;
22630
 
+                                       mbh->b_list       = BUF_LOCKED;
22631
 
+                                       mbh->b_end_io     = end_sync_write;
22632
 
+                                       //mbh->b_private    = r1_bh;
22633
 
+                                       mbh->b_private    = conf->mirrors[i].node;
22634
 
+
22635
 
+                                       mbh->b_next = r1_bh->mirror_bh_list;
22636
 
+                                       r1_bh->mirror_bh_list = mbh;
22637
 
+
22638
 
+                                       sum_bhs++;
22639
 
+                               }
22640
 
+                               md_atomic_set(&r1_bh->remaining, sum_bhs);
22641
 
+                               if (bhl) raid1_free_bh(conf, bhl);
22642
 
+                               mbh = r1_bh->mirror_bh_list;
22643
 
+
22644
 
+                               if (!sum_bhs) {
22645
 
+                                       /* nowhere to write this too... I guess we
22646
 
+                                        * must be done
22647
 
+                                        */
22648
 
+                                       sync_request_done(bh->b_blocknr, conf);
22649
 
+                                       evms_md_done_sync(mddev, bh->b_size>>9, 0);
22650
 
+                                       raid1_free_buf(r1_bh);
22651
 
+                               } else
22652
 
+                               while (mbh) {
22653
 
+                                       evms_logical_node_t *node;
22654
 
+                                       eio_t eio;
22655
 
+                                       struct buffer_head *bh1 = mbh;
22656
 
+
22657
 
+                                       mbh = mbh->b_next;
22658
 
+                                       node = (evms_logical_node_t *)bh1->b_private;
22659
 
+                                       bh1->b_private = r1_bh;
22660
 
+                                       eio = r1_bh->eio;
22661
 
+                                       eio.bh = bh1;
22662
 
+                                       add_node_mapping(r1_bh, node, bh1);
22663
 
+                                       W_IO(node, &eio);
22664
 
+                                       evms_md_sync_acct(bh1->b_dev, bh1->b_size/512);
22665
 
+                               }
22666
 
+                       } else {
22667
 
+                               /* There is no point trying a read-for-reconstruct
22668
 
+                                * as reconstruct is about to be aborted
22669
 
+                                */
22670
 
+
22671
 
+                               LOG_ERROR(IO_ERROR, evms_md_partition_name(r1_bh->node), bh->b_blocknr);
22672
 
+                               evms_md_done_sync(mddev, bh->b_size>>9, 0);
22673
 
+                       }
22674
 
+
22675
 
+                       break;
22676
 
+               case READ:
22677
 
+               case READA:
22678
 
+                       {
22679
 
+                               evms_logical_node_t *node, *new_node;
22680
 
+
22681
 
+                               node = r1_bh->node;
22682
 
+                               evms_raid1_map(mddev,&new_node);
22683
 
+                               if (new_node == node) {
22684
 
+                                       LOG_ERROR(" unrecoverable read error on %s at LBA(%Lu)\n",
22685
 
+                                                  node->name, r1_bh->eio.rsector);
22686
 
+                                       raid1_end_bh_io(r1_bh, 0);
22687
 
+                               } else {
22688
 
+                                       /* retry I/O on new device */
22689
 
+                                       eio_t eio;
22690
 
+                                       eio = r1_bh->eio;
22691
 
+                                       R_IO(new_node, &eio);
22692
 
+                               }
22693
 
+                       }
22694
 
+                       break;
22695
 
+               }
22696
 
+       }
22697
 
+       md_spin_unlock_irqrestore(&retry_list_lock, flags);
22698
 
+}
22699
 
+#undef IO_ERROR
22700
 
+#undef REDIRECT_SECTOR
22701
 
+
22702
 
+/*
22703
 
+ * Private kernel thread to reconstruct mirrors after an unclean
22704
 
+ * shutdown.
22705
 
+ */
22706
 
+static void raid1syncd (void *data)
22707
 
+{
22708
 
+       raid1_conf_t *conf = data;
22709
 
+       mddev_t *mddev = conf->mddev;
22710
 
+
22711
 
+       if (!conf->resync_mirrors)
22712
 
+               return;
22713
 
+       if (conf->resync_mirrors == 2)
22714
 
+               return;
22715
 
+       down(&mddev->recovery_sem);
22716
 
+       if (!evms_md_do_sync(mddev, NULL)) {
22717
 
+               /*
22718
 
+                * Only if everything went Ok.
22719
 
+                */
22720
 
+               conf->resync_mirrors = 0;
22721
 
+       }
22722
 
+
22723
 
+       close_sync(conf);
22724
 
+
22725
 
+       up(&mddev->recovery_sem);
22726
 
+       raid1_shrink_buffers(conf);
22727
 
+}
22728
 
+
22729
 
+/*
22730
 
+ * perform a "sync" on one "block"
22731
 
+ *
22732
 
+ * We need to make sure that no normal I/O request - particularly write
22733
 
+ * requests - conflict with active sync requests.
22734
 
+ * This is achieved by conceptually dividing the device space into a
22735
 
+ * number of sections:
22736
 
+ *  DONE: 0 .. a-1     These blocks are in-sync
22737
 
+ *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
22738
 
+ *                     no normal IO requests
22739
 
+ *  READY: b .. c-1    These blocks have no normal IO requests - sync
22740
 
+ *                     request may be happening
22741
 
+ *  PENDING: c .. d-1  These blocks may have IO requests, but no new
22742
 
+ *                     ones will be added
22743
 
+ *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
22744
 
+ *                     be happening, but not sync
22745
 
+ *
22746
 
+ * We keep a
22747
 
+ *   phase    which flips (0 or 1) each time d moves and
22748
 
+ * a count of:
22749
 
+ *   z =  active io requests in FUTURE since d moved - marked with
22750
 
+ *        current phase
22751
 
+ *   y =  active io requests in FUTURE before d moved, or PENDING -
22752
 
+ *        marked with previous phase
22753
 
+ *   x =  active sync requests in READY
22754
 
+ *   w =  active sync requests in ACTIVE
22755
 
+ *   v =  active io requests in DONE
22756
 
+ *
22757
 
+ * Normally, a=b=c=d=0 and z= active io requests
22758
 
+ *   or a=b=c=d=END and v= active io requests
22759
 
+ * Allowed changes to a,b,c,d:
22760
 
+ * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
22761
 
+ * B:  y==0 -> c=d
22762
 
+ * C:   b=c, w+=x, x=0
22763
 
+ * D:  w==0 -> a=b
22764
 
+ * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
22765
 
+ *
22766
 
+ * At start of sync we apply A.
22767
 
+ * When y reaches 0, we apply B then A then being sync requests
22768
 
+ * When sync point reaches c-1, we wait for y==0, and W==0, and
22769
 
+ * then apply apply B then A then D then C.
22770
 
+ * Finally, we apply E
22771
 
+ *
22772
 
+ * The sync request simply issues a "read" against a working drive
22773
 
+ * This is marked so that on completion the raid1d thread is woken to
22774
 
+ * issue suitable write requests
22775
 
+ */
22776
 
+
22777
 
+static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
22778
 
+{
22779
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
22780
 
+       struct mirror_info *mirror;
22781
 
+       struct raid1_bh *r1_bh;
22782
 
+       struct buffer_head *bh;
22783
 
+       eio_t eio;
22784
 
+       int bsize;
22785
 
+       int disk;
22786
 
+       int block_nr;
22787
 
+
22788
 
+       spin_lock_irq(&conf->segment_lock);
22789
 
+       if (!sector_nr) {
22790
 
+               /* initialize ...*/
22791
 
+               int buffs;
22792
 
+               conf->start_active = 0;
22793
 
+               conf->start_ready = 0;
22794
 
+               conf->start_pending = 0;
22795
 
+               conf->start_future = 0;
22796
 
+               conf->phase = 0;
22797
 
+               /* we want enough buffers to hold twice the window of 128*/
22798
 
+               buffs = 128 *2 / (PAGE_SIZE>>9);
22799
 
+               buffs = raid1_grow_buffers(conf, buffs);
22800
 
+               if (buffs < 2)
22801
 
+                       goto nomem;
22802
 
+               
22803
 
+               conf->window = buffs*(PAGE_SIZE>>9)/2;
22804
 
+               conf->cnt_future += conf->cnt_done+conf->cnt_pending;
22805
 
+               conf->cnt_done = conf->cnt_pending = 0;
22806
 
+               if (conf->cnt_ready || conf->cnt_active)
22807
 
+                       MD_BUG();
22808
 
+       }
22809
 
+       while (sector_nr >= conf->start_pending) {
22810
 
+               PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
22811
 
+                       sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
22812
 
+                       conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
22813
 
+               wait_event_lock_irq(conf->wait_done,
22814
 
+                                       !conf->cnt_active,
22815
 
+                                       conf->segment_lock);
22816
 
+               wait_event_lock_irq(conf->wait_ready,
22817
 
+                                       !conf->cnt_pending,
22818
 
+                                       conf->segment_lock);
22819
 
+               conf->start_active = conf->start_ready;
22820
 
+               conf->start_ready = conf->start_pending;
22821
 
+               conf->start_pending = conf->start_future;
22822
 
+               conf->start_future = conf->start_future+conf->window;
22823
 
+               // Note: falling off the end is not a problem
22824
 
+               conf->phase = conf->phase ^1;
22825
 
+               conf->cnt_active = conf->cnt_ready;
22826
 
+               conf->cnt_ready = 0;
22827
 
+               conf->cnt_pending = conf->cnt_future;
22828
 
+               conf->cnt_future = 0;
22829
 
+               wake_up(&conf->wait_done);
22830
 
+       }
22831
 
+       conf->cnt_ready++;
22832
 
+       spin_unlock_irq(&conf->segment_lock);
22833
 
+               
22834
 
+
22835
 
+       /* If reconstructing, and >1 working disc,
22836
 
+        * could dedicate one to rebuild and others to
22837
 
+        * service read requests ..
22838
 
+        */
22839
 
+       disk = conf->last_used;
22840
 
+       /* make sure disk is operational */
22841
 
+       while (!conf->mirrors[disk].operational) {
22842
 
+               if (disk <= 0) disk = conf->raid_disks;
22843
 
+               disk--;
22844
 
+               if (disk == conf->last_used)
22845
 
+                       break;
22846
 
+       }
22847
 
+       conf->last_used = disk;
22848
 
+       
22849
 
+       mirror = conf->mirrors+conf->last_used;
22850
 
+       
22851
 
+       r1_bh = raid1_alloc_buf (conf);
22852
 
+       r1_bh->master_bh = NULL;
22853
 
+       r1_bh->mddev = mddev;
22854
 
+       r1_bh->cmd = SPECIAL;
22855
 
+       bh = &r1_bh->bh_req;
22856
 
+
22857
 
+       block_nr = sector_nr;
22858
 
+       bsize = 512;
22859
 
+       while (!(block_nr & 1) && bsize < PAGE_SIZE
22860
 
+                       && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
22861
 
+               block_nr >>= 1;
22862
 
+               bsize <<= 1;
22863
 
+       }
22864
 
+       bh->b_size = bsize;
22865
 
+       bh->b_list = BUF_LOCKED;
22866
 
+       bh->b_dev = mirror->dev;
22867
 
+       bh->b_rdev = mirror->dev;
22868
 
+       bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
22869
 
+       if (!bh->b_page)
22870
 
+               BUG();
22871
 
+       if (!bh->b_data)
22872
 
+               BUG();
22873
 
+       if (bh->b_data != page_address(bh->b_page))
22874
 
+               BUG();
22875
 
+       bh->b_end_io = end_sync_read;
22876
 
+       bh->b_private = r1_bh;
22877
 
+       bh->b_blocknr = sector_nr;
22878
 
+       bh->b_rsector = sector_nr;
22879
 
+       r1_bh->node = mirror->node;
22880
 
+       r1_bh->eio.bh = bh;
22881
 
+       r1_bh->eio.rsector = bh->b_rsector;
22882
 
+       r1_bh->eio.rsize = bh->b_size/512;
22883
 
+       eio = r1_bh->eio;
22884
 
+       init_waitqueue_head(&bh->b_wait);
22885
 
+
22886
 
+       R_IO(mirror->node,&eio);
22887
 
+       evms_md_sync_acct(bh->b_dev, bh->b_size/512);
22888
 
+
22889
 
+       return (bsize >> 9);
22890
 
+
22891
 
+nomem:
22892
 
+       raid1_shrink_buffers(conf);
22893
 
+       spin_unlock_irq(&conf->segment_lock);
22894
 
+       return -ENOMEM;
22895
 
+}
22896
 
+
22897
 
+static void end_sync_read(struct buffer_head *bh, int uptodate)
22898
 
+{
22899
 
+       struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
22900
 
+
22901
 
+       /* we have read a block, now it needs to be re-written,
22902
 
+        * or re-read if the read failed.
22903
 
+        * We don't do much here, just schedule handling by raid1d
22904
 
+        */
22905
 
+       if (!uptodate) {
22906
 
+               if (r1_bh->node)
22907
 
+                       evms_md_error (r1_bh->mddev, r1_bh->node);
22908
 
+       }
22909
 
+       else
22910
 
+               set_bit(R1BH_Uptodate, &r1_bh->state);
22911
 
+       raid1_reschedule_retry(r1_bh);
22912
 
+}
22913
 
+
22914
 
+static void end_sync_write(struct buffer_head *bh, int uptodate)
22915
 
+{
22916
 
+       struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
22917
 
+       
22918
 
+       if (!uptodate) {
22919
 
+               evms_logical_node_t *node;
22920
 
+               node = bh_to_node(r1_bh,bh);
22921
 
+               if (node)
22922
 
+                       evms_md_error (r1_bh->mddev, node);
22923
 
+       }
22924
 
+       if (atomic_dec_and_test(&r1_bh->remaining)) {
22925
 
+               mddev_t *mddev = r1_bh->mddev;
22926
 
+               unsigned long sect = bh->b_blocknr;
22927
 
+               int size = bh->b_size;
22928
 
+               raid1_free_buf(r1_bh);
22929
 
+               sync_request_done(sect, mddev_to_conf(mddev));
22930
 
+               evms_md_done_sync(mddev,size>>9, uptodate);
22931
 
+       }
22932
 
+}
22933
 
+
22934
 
+#define INVALID_LEVEL KERN_WARNING \
22935
 
+"EVMS raid1: md%d: raid level not set to mirroring (%d)\n"
22936
 
+
22937
 
+#define NO_SB KERN_ERR \
22938
 
+"EVMS raid1: disabled mirror %s (couldn't access raid superblock)\n"
22939
 
+
22940
 
+#define ERRORS KERN_ERR \
22941
 
+"EVMS raid1: disabled mirror %s (errors detected)\n"
22942
 
+
22943
 
+#define NOT_IN_SYNC KERN_ERR \
22944
 
+"EVMS raid1: disabled mirror %s (not in sync)\n"
22945
 
+
22946
 
+#define INCONSISTENT KERN_ERR \
22947
 
+"EVMS raid1: disabled mirror %s (inconsistent descriptor)\n"
22948
 
+
22949
 
+#define ALREADY_RUNNING KERN_ERR \
22950
 
+"EVMS raid1: disabled mirror %s (mirror %d already operational)\n"
22951
 
+
22952
 
+#define OPERATIONAL KERN_INFO \
22953
 
+"EVMS raid1: device %s operational as mirror %d\n"
22954
 
+
22955
 
+#define MEM_ERROR KERN_ERR \
22956
 
+"EVMS raid1: couldn't allocate memory for md%d\n"
22957
 
+
22958
 
+#define SPARE KERN_INFO \
22959
 
+"EVMS raid1: spare disk %s\n"
22960
 
+
22961
 
+#define NONE_OPERATIONAL KERN_ERR \
22962
 
+"EVMS raid1: no operational mirrors for md%d\n"
22963
 
+
22964
 
+#define ARRAY_IS_ACTIVE KERN_INFO \
22965
 
+"EVMS raid1: raid set md%d active with %d out of %d mirrors\n"
22966
 
+
22967
 
+#define THREAD_ERROR KERN_ERR \
22968
 
+"EVMS raid1: couldn't allocate thread for md%d\n"
22969
 
+
22970
 
+#define START_RESYNC KERN_WARNING \
22971
 
+"EVMS raid1: raid set md%d not clean; reconstructing mirrors\n"
22972
 
+
22973
 
+static int raid1_run (mddev_t *mddev)
22974
 
+{
22975
 
+       raid1_conf_t *conf;
22976
 
+       int i, j, disk_idx;
22977
 
+       struct mirror_info *disk;
22978
 
+       mdp_super_t *sb = mddev->sb;
22979
 
+       mdp_disk_t *descriptor;
22980
 
+       mdk_rdev_t *rdev;
22981
 
+       struct md_list_head *tmp;
22982
 
+       int start_recovery = 0;
22983
 
+
22984
 
+       MOD_INC_USE_COUNT;
22985
 
+
22986
 
+       LOG_EXTRA(__FUNCTION__" ENTRY\n");
22987
 
+       if (sb->level != 1) {
22988
 
+               LOG_ERROR(INVALID_LEVEL, mdidx(mddev), sb->level);
22989
 
+               goto out;
22990
 
+       }
22991
 
+       /*
22992
 
+        * copy the already verified devices into our private RAID1
22993
 
+        * bookkeeping area. [whatever we allocate in raid1_run(),
22994
 
+        * should be freed in raid1_stop()]
22995
 
+        */
22996
 
+
22997
 
+       conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
22998
 
+       mddev->private = conf;
22999
 
+       if (!conf) {
23000
 
+               LOG_ERROR(MEM_ERROR, mdidx(mddev));
23001
 
+               goto out;
23002
 
+       }
23003
 
+       memset(conf, 0, sizeof(*conf));
23004
 
+
23005
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
23006
 
+               if (rdev->faulty) {
23007
 
+                       LOG_ERROR(ERRORS, evms_md_partition_name(rdev->node));
23008
 
+               } else {
23009
 
+                       if (!rdev->sb) {
23010
 
+                               MD_BUG();
23011
 
+                               continue;
23012
 
+                       }
23013
 
+               }
23014
 
+               if (rdev->desc_nr == -1) {
23015
 
+                       MD_BUG();
23016
 
+                       continue;
23017
 
+               }
23018
 
+               descriptor = &sb->disks[rdev->desc_nr];
23019
 
+               disk_idx = descriptor->raid_disk;
23020
 
+               disk = conf->mirrors + disk_idx;
23021
 
+
23022
 
+               if (disk_faulty(descriptor)) {
23023
 
+                       disk->number = descriptor->number;
23024
 
+                       disk->raid_disk = disk_idx;
23025
 
+                       disk->node = rdev->node;
23026
 
+                       disk->dev = rdev->dev;
23027
 
+                       disk->sect_limit = MAX_WORK_PER_DISK;
23028
 
+                       disk->operational = 0;
23029
 
+                       disk->write_only = 0;
23030
 
+                       disk->spare = 0;
23031
 
+                       disk->used_slot = 1;
23032
 
+                       disk->head_position = 0;
23033
 
+                       continue;
23034
 
+               }
23035
 
+               if (disk_active(descriptor)) {
23036
 
+                       if (!disk_sync(descriptor)) {
23037
 
+                               LOG_ERROR(NOT_IN_SYNC, evms_md_partition_name(rdev->node));
23038
 
+                               continue;
23039
 
+                       }
23040
 
+                       if ((descriptor->number > MD_SB_DISKS) ||
23041
 
+                                        (disk_idx > sb->raid_disks)) {
23042
 
+
23043
 
+                               LOG_ERROR(INCONSISTENT,evms_md_partition_name(rdev->node));
23044
 
+                               continue;
23045
 
+                       }
23046
 
+                       if (disk->operational) {
23047
 
+                               LOG_ERROR(ALREADY_RUNNING, evms_md_partition_name(rdev->node), disk_idx);
23048
 
+                               continue;
23049
 
+                       }
23050
 
+                       LOG_DEFAULT(OPERATIONAL, evms_md_partition_name(rdev->node), disk_idx);
23051
 
+                       disk->number = descriptor->number;
23052
 
+                       disk->raid_disk = disk_idx;
23053
 
+                       disk->node = rdev->node;
23054
 
+                       disk->dev = rdev->dev;
23055
 
+                       disk->sect_limit = MAX_WORK_PER_DISK;
23056
 
+                       disk->operational = 1;
23057
 
+                       disk->write_only = 0;
23058
 
+                       disk->spare = 0;
23059
 
+                       disk->used_slot = 1;
23060
 
+                       disk->head_position = 0;
23061
 
+                       conf->working_disks++;
23062
 
+               } else {
23063
 
+               /*
23064
 
+                * Must be a spare disk ..
23065
 
+                */
23066
 
+                       LOG_DEFAULT(SPARE, evms_md_partition_name(rdev->node));
23067
 
+                       disk->number = descriptor->number;
23068
 
+                       disk->raid_disk = disk_idx;
23069
 
+                       disk->node = rdev->node;
23070
 
+                       disk->dev = rdev->dev;
23071
 
+                       disk->sect_limit = MAX_WORK_PER_DISK;
23072
 
+                       disk->operational = 0;
23073
 
+                       disk->write_only = 0;
23074
 
+                       disk->spare = 1;
23075
 
+                       disk->used_slot = 1;
23076
 
+                       disk->head_position = 0;
23077
 
+               }
23078
 
+       }
23079
 
+       conf->raid_disks = sb->raid_disks;
23080
 
+       conf->nr_disks = sb->nr_disks;
23081
 
+       conf->mddev = mddev;
23082
 
+       conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
23083
 
+
23084
 
+       conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
23085
 
+       init_waitqueue_head(&conf->wait_buffer);
23086
 
+       init_waitqueue_head(&conf->wait_done);
23087
 
+       init_waitqueue_head(&conf->wait_ready);
23088
 
+
23089
 
+       if (!conf->working_disks) {
23090
 
+               LOG_ERROR(NONE_OPERATIONAL, mdidx(mddev));
23091
 
+               goto out_free_conf;
23092
 
+       }
23093
 
+
23094
 
+
23095
 
+       /* pre-allocate some buffer_head structures.
23096
 
+        * As a minimum, 1 r1bh and raid_disks buffer_heads
23097
 
+        * would probably get us by in tight memory situations,
23098
 
+        * but a few more is probably a good idea.
23099
 
+        * For now, try NR_RESERVED_BUFS r1bh and
23100
 
+        * NR_RESERVED_BUFS*raid_disks bufferheads
23101
 
+        * This will allow at least NR_RESERVED_BUFS concurrent
23102
 
+        * reads or writes even if kmalloc starts failing
23103
 
+        */
23104
 
+       if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
23105
 
+           raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
23106
 
+                             < NR_RESERVED_BUFS*conf->raid_disks) {
23107
 
+               LOG_ERROR(MEM_ERROR, mdidx(mddev));
23108
 
+               goto out_free_conf;
23109
 
+       }
23110
 
+
23111
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
23112
 
+               
23113
 
+               descriptor = sb->disks+i;
23114
 
+               disk_idx = descriptor->raid_disk;
23115
 
+               disk = conf->mirrors + disk_idx;
23116
 
+
23117
 
+               if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
23118
 
+                               !disk->used_slot) {
23119
 
+
23120
 
+                       disk->number = descriptor->number;
23121
 
+                       disk->raid_disk = disk_idx;
23122
 
+                       disk->dev = MKDEV(0,0);
23123
 
+
23124
 
+                       disk->operational = 0;
23125
 
+                       disk->write_only = 0;
23126
 
+                       disk->spare = 0;
23127
 
+                       disk->used_slot = 1;
23128
 
+                       disk->head_position = 0;
23129
 
+               }
23130
 
+       }
23131
 
+
23132
 
+       /*
23133
 
+        * find the first working one and use it as a starting point
23134
 
+        * to read balancing.
23135
 
+        */
23136
 
+       for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
23137
 
+               /* nothing */;
23138
 
+       conf->last_used = j;
23139
 
+
23140
 
+
23141
 
+       if (conf->working_disks != sb->raid_disks) {
23142
 
+               LOG_SERIOUS(" md%d, not all disks are operational -- trying to recover array\n",
23143
 
+                       mdidx(mddev));
23144
 
+               start_recovery = 1;
23145
 
+       }
23146
 
+
23147
 
+       {
23148
 
+               const char * name = "evms_raid1d";
23149
 
+
23150
 
+               conf->thread = evms_cs_register_thread(raid1d, conf, name);
23151
 
+               if (!conf->thread) {
23152
 
+                       LOG_ERROR(THREAD_ERROR, mdidx(mddev));
23153
 
+                       goto out_free_conf;
23154
 
+               }
23155
 
+       }
23156
 
+
23157
 
+       if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
23158
 
+           (conf->working_disks > 1)) {
23159
 
+               const char * name = "evms_raid1syncd";
23160
 
+
23161
 
+               conf->resync_thread = evms_cs_register_thread(raid1syncd, conf,name);
23162
 
+               if (!conf->resync_thread) {
23163
 
+                       LOG_ERROR(THREAD_ERROR, mdidx(mddev));
23164
 
+                       goto out_free_conf;
23165
 
+               }
23166
 
+
23167
 
+               LOG_WARNING(START_RESYNC, mdidx(mddev));
23168
 
+               conf->resync_mirrors = 1;
23169
 
+               evms_cs_wakeup_thread(conf->resync_thread);
23170
 
+       }
23171
 
+
23172
 
+       /*
23173
 
+        * Regenerate the "device is in sync with the raid set" bit for
23174
 
+        * each device.
23175
 
+        */
23176
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
23177
 
+               mark_disk_nonsync(sb->disks+i);
23178
 
+               for (j = 0; j < sb->raid_disks; j++) {
23179
 
+                       if (!conf->mirrors[j].operational)
23180
 
+                               continue;
23181
 
+                       if (sb->disks[i].number == conf->mirrors[j].number)
23182
 
+                               mark_disk_sync(sb->disks+i);
23183
 
+               }
23184
 
+       }
23185
 
+       sb->active_disks = conf->working_disks;
23186
 
+
23187
 
+       if (start_recovery)
23188
 
+               evms_md_recover_arrays();
23189
 
+
23190
 
+
23191
 
+       LOG_DEFAULT(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
23192
 
+       /*
23193
 
+        * Ok, everything is just fine now
23194
 
+        */
23195
 
+       return 0;
23196
 
+
23197
 
+out_free_conf:
23198
 
+       raid1_shrink_r1bh(conf);
23199
 
+       raid1_shrink_bh(conf);
23200
 
+       raid1_shrink_buffers(conf);
23201
 
+       kfree(conf);
23202
 
+       mddev->private = NULL;
23203
 
+out:
23204
 
+       MOD_DEC_USE_COUNT;
23205
 
+       return -EIO;
23206
 
+}
23207
 
+
23208
 
+#undef INVALID_LEVEL
23209
 
+#undef NO_SB
23210
 
+#undef ERRORS
23211
 
+#undef NOT_IN_SYNC
23212
 
+#undef INCONSISTENT
23213
 
+#undef ALREADY_RUNNING
23214
 
+#undef OPERATIONAL
23215
 
+#undef SPARE
23216
 
+#undef NONE_OPERATIONAL
23217
 
+#undef ARRAY_IS_ACTIVE
23218
 
+
23219
 
+static int raid1_stop_resync (mddev_t *mddev)
23220
 
+{
23221
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
23222
 
+
23223
 
+       LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
23224
 
+       if (conf->resync_thread) {
23225
 
+               if (conf->resync_mirrors) {
23226
 
+                       conf->resync_mirrors = 2;
23227
 
+                       evms_cs_interrupt_thread(conf->resync_thread);
23228
 
+                       LOG_WARNING(" mirror resync was not fully finished, restarting next time.\n");
23229
 
+                       return 1;
23230
 
+               }
23231
 
+               return 0;
23232
 
+       }
23233
 
+       return 0;
23234
 
+}
23235
 
+
23236
 
+static int raid1_restart_resync (mddev_t *mddev)
23237
 
+{
23238
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
23239
 
+
23240
 
+       LOG_DEFAULT(__FUNCTION__" ENTRY\n");
23241
 
+       if (conf->resync_mirrors) {
23242
 
+               if (!conf->resync_thread) {
23243
 
+                       MD_BUG();
23244
 
+                       return 0;
23245
 
+               }
23246
 
+               conf->resync_mirrors = 1;
23247
 
+               evms_cs_wakeup_thread(conf->resync_thread);
23248
 
+               return 1;
23249
 
+       }
23250
 
+       return 0;
23251
 
+}
23252
 
+
23253
 
+static int raid1_stop (mddev_t *mddev)
23254
 
+{
23255
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
23256
 
+
23257
 
+       LOG_DEFAULT(__FUNCTION__ " ENTRY\n");
23258
 
+       evms_cs_unregister_thread(conf->thread);
23259
 
+       if (conf->resync_thread)
23260
 
+               evms_cs_unregister_thread(conf->resync_thread);
23261
 
+       raid1_shrink_r1bh(conf);
23262
 
+       raid1_shrink_bh(conf);
23263
 
+       raid1_shrink_buffers(conf);
23264
 
+       kfree(conf);
23265
 
+       mddev->private = NULL;
23266
 
+       MOD_DEC_USE_COUNT;
23267
 
+       return 0;
23268
 
+}
23269
 
+
23270
 
+static int raid1_evms_ioctl (
23271
 
+       mddev_t         * mddev,
23272
 
+       struct inode    * inode,
23273
 
+       struct file     * file, 
23274
 
+       unsigned int    cmd,
23275
 
+       unsigned long   arg)
23276
 
+{
23277
 
+       int i, rc = 0;
23278
 
+       evms_logical_node_t *node = NULL;
23279
 
+       raid1_conf_t *conf = mddev_to_conf(mddev);
23280
 
+
23281
 
+       switch (cmd) {
23282
 
+               case EVMS_GET_BMAP:
23283
 
+               {
23284
 
+                       for (i = 0; i < MD_SB_DISKS; i++) {
23285
 
+                               if (conf->mirrors[i].operational)  {
23286
 
+                                       node = conf->mirrors[i].node;
23287
 
+                                       break;
23288
 
+                               }
23289
 
+                       }
23290
 
+
23291
 
+                       if (node)
23292
 
+                               rc = IOCTL(node, inode, file, cmd, arg);
23293
 
+                       else
23294
 
+                               rc = -ENODEV;
23295
 
+
23296
 
+                       break;
23297
 
+               }
23298
 
+
23299
 
+               default:
23300
 
+                       rc = -EINVAL;
23301
 
+       }
23302
 
+       return rc;
23303
 
+}
23304
 
+
23305
 
+static mdk_personality_t raid1_personality=
23306
 
+{
23307
 
+       name:           "evms_raid1",
23308
 
+       init_io:        raid1_init_io,
23309
 
+       make_request:   raid1_make_request,
23310
 
+       run:            raid1_run,
23311
 
+       stop:           raid1_stop,
23312
 
+       status:         raid1_status,
23313
 
+       error_handler:  raid1_error,
23314
 
+       diskop:         raid1_diskop,
23315
 
+       stop_resync:    raid1_stop_resync,
23316
 
+       restart_resync: raid1_restart_resync,
23317
 
+       sync_request:   raid1_sync_request,
23318
 
+       evms_ioctl:     raid1_evms_ioctl
23319
 
+};
23320
 
+
23321
 
+static int md__init raid1_init (void)
23322
 
+{
23323
 
+       return evms_register_md_personality (RAID1, &raid1_personality);
23324
 
+}
23325
 
+
23326
 
+static void raid1_exit (void)
23327
 
+{
23328
 
+       evms_unregister_md_personality (RAID1);
23329
 
+}
23330
 
+
23331
 
+module_init(raid1_init);
23332
 
+module_exit(raid1_exit);
23333
 
+#ifdef MODULE_LICENSE
23334
 
+MODULE_LICENSE("GPL");
23335
 
+#endif
23336
 
diff -Naur linux-2002-03-28/drivers/evms/md_raid5.c evms-2002-03-28/drivers/evms/md_raid5.c
23337
 
--- linux-2002-03-28/drivers/evms/md_raid5.c    Wed Dec 31 18:00:00 1969
23338
 
+++ evms-2002-03-28/drivers/evms/md_raid5.c     Thu Mar 28 16:28:37 2002
23339
 
@@ -0,0 +1,2566 @@
23340
 
+/*
23341
 
+ * md_raid5.c : Multiple Devices driver for Linux
23342
 
+ *        Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
23343
 
+ *        Copyright (C) 1999, 2000 Ingo Molnar
23344
 
+ *
23345
 
+ * RAID-5 management functions.
23346
 
+ *
23347
 
+ * 'md_raid5.c' is an EVMS version of linux/drivers/md/raid5.c modified
23348
 
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
23349
 
+ *
23350
 
+ * This program is free software; you can redistribute it and/or modify
23351
 
+ * it under the terms of the GNU General Public License as published by
23352
 
+ * the Free Software Foundation; either version 2, or (at your option)
23353
 
+ * any later version.
23354
 
+ *
23355
 
+ * You should have received a copy of the GNU General Public License
23356
 
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
23357
 
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23358
 
+ */
23359
 
+
23360
 
+
23361
 
+#include <linux/config.h>
23362
 
+#include <linux/module.h>
23363
 
+#include <linux/locks.h>
23364
 
+#include <linux/slab.h>
23365
 
+#include <linux/evms/evms_raid5.h>
23366
 
+#include <asm/bitops.h>
23367
 
+#include <asm/atomic.h>
23368
 
+
23369
 
+#define LOG_PREFIX "md raid5: "
23370
 
+
23371
 
+static mdk_personality_t raid5_personality;
23372
 
+
23373
 
+/*
23374
 
+ * Stripe cache
23375
 
+ */
23376
 
+
23377
 
+#define NR_STRIPES             256
23378
 
+#define        IO_THRESHOLD            1
23379
 
+#define HASH_PAGES             1
23380
 
+#define HASH_PAGES_ORDER       0
23381
 
+#define NR_HASH                        (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
23382
 
+#define HASH_MASK              (NR_HASH - 1)
23383
 
+#define stripe_hash(conf, sect)        ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
23384
 
+
23385
 
+/*
23386
 
+ * The following can be used to debug the driver
23387
 
+ */
23388
 
+#define RAID5_DEBUG    0
23389
 
+#define RAID5_PARANOIA 1
23390
 
+#if RAID5_PARANOIA && CONFIG_SMP
23391
 
+# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
23392
 
+#else
23393
 
+# define CHECK_DEVLOCK()
23394
 
+#endif
23395
 
+
23396
 
+
23397
 
+static void print_raid5_conf (raid5_conf_t *conf);
23398
 
+
23399
 
+static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
23400
 
+{
23401
 
+       if (atomic_dec_and_test(&sh->count)) {
23402
 
+               if (!list_empty(&sh->lru))
23403
 
+                       BUG();
23404
 
+               if (atomic_read(&conf->active_stripes)==0)
23405
 
+                       BUG();
23406
 
+               if (test_bit(STRIPE_HANDLE, &sh->state)) {
23407
 
+                       if (test_bit(STRIPE_DELAYED, &sh->state))
23408
 
+                               list_add_tail(&sh->lru, &conf->delayed_list);
23409
 
+                       else
23410
 
+                               list_add_tail(&sh->lru, &conf->handle_list);
23411
 
+                       evms_cs_wakeup_thread(conf->thread);
23412
 
+               } else {
23413
 
+                       if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
23414
 
+                               atomic_dec(&conf->preread_active_stripes);
23415
 
+                               if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
23416
 
+                                       evms_cs_wakeup_thread(conf->thread);
23417
 
+                       }
23418
 
+                       list_add_tail(&sh->lru, &conf->inactive_list);
23419
 
+                       atomic_dec(&conf->active_stripes);
23420
 
+                       if (!conf->inactive_blocked ||
23421
 
+                           atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
23422
 
+                               wake_up(&conf->wait_for_stripe);
23423
 
+               }
23424
 
+       }
23425
 
+}
23426
 
+static void release_stripe(struct stripe_head *sh)
23427
 
+{
23428
 
+       raid5_conf_t *conf = sh->raid_conf;
23429
 
+       unsigned long flags;
23430
 
+       
23431
 
+       spin_lock_irqsave(&conf->device_lock, flags);
23432
 
+       __release_stripe(conf, sh);
23433
 
+       spin_unlock_irqrestore(&conf->device_lock, flags);
23434
 
+}
23435
 
+
23436
 
+static void remove_hash(struct stripe_head *sh)
23437
 
+{
23438
 
+       LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
23439
 
+
23440
 
+       if (sh->hash_pprev) {
23441
 
+               if (sh->hash_next)
23442
 
+                       sh->hash_next->hash_pprev = sh->hash_pprev;
23443
 
+               *sh->hash_pprev = sh->hash_next;
23444
 
+               sh->hash_pprev = NULL;
23445
 
+       }
23446
 
+}
23447
 
+
23448
 
+static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
23449
 
+{
23450
 
+       struct stripe_head **shp = &stripe_hash(conf, sh->sector);
23451
 
+
23452
 
+       LOG_DEBUG("%s: stripe %lu\n", __FUNCTION__, sh->sector);
23453
 
+
23454
 
+       CHECK_DEVLOCK();
23455
 
+       if ((sh->hash_next = *shp) != NULL)
23456
 
+               (*shp)->hash_pprev = &sh->hash_next;
23457
 
+       *shp = sh;
23458
 
+       sh->hash_pprev = shp;
23459
 
+}
23460
 
+
23461
 
+
23462
 
+/* find an idle stripe, make sure it is unhashed, and return it. */
23463
 
+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
23464
 
+{
23465
 
+       struct stripe_head *sh = NULL;
23466
 
+       struct list_head *first;
23467
 
+
23468
 
+       CHECK_DEVLOCK();
23469
 
+       if (list_empty(&conf->inactive_list))
23470
 
+               goto out;
23471
 
+       first = conf->inactive_list.next;
23472
 
+       sh = list_entry(first, struct stripe_head, lru);
23473
 
+       list_del_init(first);
23474
 
+       remove_hash(sh);
23475
 
+       atomic_inc(&conf->active_stripes);
23476
 
+out:
23477
 
+       return sh;
23478
 
+}
23479
 
+
23480
 
+static void shrink_buffers(struct stripe_head *sh, int num)
23481
 
+{
23482
 
+       struct buffer_head *bh;
23483
 
+       int i;
23484
 
+
23485
 
+       for (i=0; i<num ; i++) {
23486
 
+               bh = sh->bh_cache[i];
23487
 
+               if (!bh)
23488
 
+                       return;
23489
 
+               sh->bh_cache[i] = NULL;
23490
 
+               free_page((unsigned long) bh->b_data);
23491
 
+               kfree(bh);
23492
 
+       }
23493
 
+}
23494
 
+
23495
 
+static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
23496
 
+{
23497
 
+       struct buffer_head *bh;
23498
 
+       int i;
23499
 
+
23500
 
+       for (i=0; i<num; i++) {
23501
 
+               struct page *page;
23502
 
+               bh = kmalloc(sizeof(struct buffer_head), priority);
23503
 
+               if (!bh)
23504
 
+                       return 1;
23505
 
+               memset(bh, 0, sizeof (struct buffer_head));
23506
 
+               init_waitqueue_head(&bh->b_wait);
23507
 
+               if ((page = alloc_page(priority)))
23508
 
+                       bh->b_data = page_address(page);
23509
 
+               else {
23510
 
+                       kfree(bh);
23511
 
+                       return 1;
23512
 
+               }
23513
 
+               atomic_set(&bh->b_count, 0);
23514
 
+               bh->b_page = page;
23515
 
+               sh->bh_cache[i] = bh;
23516
 
+
23517
 
+       }
23518
 
+       return 0;
23519
 
+}
23520
 
+
23521
 
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
23522
 
+
23523
 
+static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
23524
 
+{
23525
 
+       raid5_conf_t *conf = sh->raid_conf;
23526
 
+       int disks = conf->raid_disks, i;
23527
 
+
23528
 
+       if (atomic_read(&sh->count) != 0)
23529
 
+               BUG();
23530
 
+       if (test_bit(STRIPE_HANDLE, &sh->state))
23531
 
+               BUG();
23532
 
+       
23533
 
+       CHECK_DEVLOCK();
23534
 
+       LOG_EXTRA("init_stripe called, stripe %lu\n", sh->sector);
23535
 
+
23536
 
+       remove_hash(sh);
23537
 
+       
23538
 
+       sh->sector = sector;
23539
 
+       sh->size = conf->buffer_size;
23540
 
+       sh->state = 0;
23541
 
+
23542
 
+       for (i=disks; i--; ) {
23543
 
+               if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
23544
 
+                   buffer_locked(sh->bh_cache[i])) {
23545
 
+                       LOG_ERROR("sector=%lx i=%d %p %p %p %d\n",
23546
 
+                              sh->sector, i, sh->bh_read[i],
23547
 
+                              sh->bh_write[i], sh->bh_written[i],
23548
 
+                              buffer_locked(sh->bh_cache[i]));
23549
 
+                       BUG();
23550
 
+               }
23551
 
+               clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
23552
 
+               raid5_build_block(sh, i);
23553
 
+       }
23554
 
+       insert_hash(conf, sh);
23555
 
+}
23556
 
+
23557
 
+/* the buffer size has changed, so unhash all stripes
23558
 
+ * as active stripes complete, they will go onto inactive list
23559
 
+ */
23560
 
+static void shrink_stripe_cache(raid5_conf_t *conf)
23561
 
+{
23562
 
+       int i;
23563
 
+       CHECK_DEVLOCK();
23564
 
+       if (atomic_read(&conf->active_stripes))
23565
 
+               BUG();
23566
 
+       for (i=0; i < NR_HASH; i++) {
23567
 
+               struct stripe_head *sh;
23568
 
+               while ((sh = conf->stripe_hashtbl[i]))
23569
 
+                       remove_hash(sh);
23570
 
+       }
23571
 
+}
23572
 
+
23573
 
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
23574
 
+{
23575
 
+       struct stripe_head *sh;
23576
 
+
23577
 
+       CHECK_DEVLOCK();
23578
 
+       LOG_DEBUG("%s: sector %lu\n", __FUNCTION__, sector);
23579
 
+       for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
23580
 
+               if (sh->sector == sector)
23581
 
+                       return sh;
23582
 
+       LOG_DEBUG("%s: %lu not in cache\n", __FUNCTION__, sector);
23583
 
+       return NULL;
23584
 
+}
23585
 
+
23586
 
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
23587
 
+{
23588
 
+       struct stripe_head *sh;
23589
 
+
23590
 
+
23591
 
+       md_spin_lock_irq(&conf->device_lock);
23592
 
+
23593
 
+       do {
23594
 
+               if (conf->buffer_size == 0 ||
23595
 
+                   (size && size != conf->buffer_size)) {
23596
 
+                       /* either the size is being changed (buffer_size==0) or
23597
 
+                        * we need to change it.
23598
 
+                        * If size==0, we can proceed as soon as buffer_size gets set.
23599
 
+                        * If size>0, we can proceed when active_stripes reaches 0, or
23600
 
+                        * when someone else sets the buffer_size to size.
23601
 
+                        * If someone sets the buffer size to something else, we will need to
23602
 
+                        * assert that we want to change it again
23603
 
+                        */
23604
 
+                       if (size==0)
23605
 
+                               wait_event_lock_irq(conf->wait_for_stripe,
23606
 
+                                                   conf->buffer_size,
23607
 
+                                                   conf->device_lock);
23608
 
+                       else {
23609
 
+                               while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
23610
 
+                                       conf->buffer_size = 0;
23611
 
+                                       wait_event_lock_irq(conf->wait_for_stripe,
23612
 
+                                                           atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
23613
 
+                                                           conf->device_lock);
23614
 
+                               }
23615
 
+
23616
 
+                               if (conf->buffer_size != size) {
23617
 
+                                       shrink_stripe_cache(conf);
23618
 
+                                       if (size==0) BUG();
23619
 
+                                       conf->buffer_size = size;
23620
 
+                               }
23621
 
+                       }
23622
 
+               }
23623
 
+               if (size == 0)
23624
 
+                       sector -= sector & ((conf->buffer_size>>9)-1);
23625
 
+
23626
 
+               sh = __find_stripe(conf, sector);
23627
 
+               if (!sh) {
23628
 
+                       if (!conf->inactive_blocked)
23629
 
+                               sh = get_free_stripe(conf);
23630
 
+                       if (noblock && sh == NULL)
23631
 
+                               break;
23632
 
+                       if (!sh) {
23633
 
+                               conf->inactive_blocked = 1;
23634
 
+                               wait_event_lock_irq(conf->wait_for_stripe,
23635
 
+                                                   !list_empty(&conf->inactive_list) &&
23636
 
+                                                   (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
23637
 
+                                                    || !conf->inactive_blocked),
23638
 
+                                                   conf->device_lock);
23639
 
+                               conf->inactive_blocked = 0;
23640
 
+                       } else
23641
 
+                               init_stripe(sh, sector);
23642
 
+               } else {
23643
 
+                       if (atomic_read(&sh->count)) {
23644
 
+                               if (!list_empty(&sh->lru))
23645
 
+                                       BUG();
23646
 
+                       } else {
23647
 
+                               if (!test_bit(STRIPE_HANDLE, &sh->state))
23648
 
+                                       atomic_inc(&conf->active_stripes);
23649
 
+                               if (list_empty(&sh->lru))
23650
 
+                                       BUG();
23651
 
+                               list_del_init(&sh->lru);
23652
 
+                       }
23653
 
+               }
23654
 
+       } while (sh == NULL);
23655
 
+
23656
 
+       if (sh)
23657
 
+               atomic_inc(&sh->count);
23658
 
+
23659
 
+       md_spin_unlock_irq(&conf->device_lock);
23660
 
+       return sh;
23661
 
+}
23662
 
+
23663
 
+static int grow_stripes(raid5_conf_t *conf, int num, int priority)
23664
 
+{
23665
 
+       struct stripe_head *sh;
23666
 
+
23667
 
+       while (num--) {
23668
 
+               sh = kmalloc(sizeof(struct stripe_head), priority);
23669
 
+               if (!sh)
23670
 
+                       return 1;
23671
 
+               memset(sh, 0, sizeof(*sh));
23672
 
+               sh->raid_conf = conf;
23673
 
+               sh->lock = SPIN_LOCK_UNLOCKED;
23674
 
+
23675
 
+               if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
23676
 
+                       shrink_buffers(sh, conf->raid_disks);
23677
 
+                       kfree(sh);
23678
 
+                       return 1;
23679
 
+               }
23680
 
+               /* we just created an active stripe so... */
23681
 
+               atomic_set(&sh->count, 1);
23682
 
+               atomic_inc(&conf->active_stripes);
23683
 
+               INIT_LIST_HEAD(&sh->lru);
23684
 
+               release_stripe(sh);
23685
 
+       }
23686
 
+       return 0;
23687
 
+}
23688
 
+
23689
 
+static void shrink_stripes(raid5_conf_t *conf, int num)
23690
 
+{
23691
 
+       struct stripe_head *sh;
23692
 
+
23693
 
+       while (num--) {
23694
 
+               spin_lock_irq(&conf->device_lock);
23695
 
+               sh = get_free_stripe(conf);
23696
 
+               spin_unlock_irq(&conf->device_lock);
23697
 
+               if (!sh)
23698
 
+                       break;
23699
 
+               if (atomic_read(&sh->count))
23700
 
+                       BUG();
23701
 
+               shrink_buffers(sh, conf->raid_disks);
23702
 
+               kfree(sh);
23703
 
+               atomic_dec(&conf->active_stripes);
23704
 
+       }
23705
 
+}
23706
 
+
23707
 
+
23708
 
+static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
23709
 
+{
23710
 
+       struct stripe_head *sh = bh->b_private;
23711
 
+       raid5_conf_t *conf = sh->raid_conf;
23712
 
+       int disks = conf->raid_disks, i;
23713
 
+       unsigned long flags;
23714
 
+
23715
 
+       for (i=0 ; i<disks; i++)
23716
 
+               if (bh == sh->bh_cache[i])
23717
 
+                       break;
23718
 
+                       
23719
 
+       if (i == disks) {
23720
 
+               BUG();
23721
 
+               return;
23722
 
+       }
23723
 
+
23724
 
+       if (uptodate) {
23725
 
+               struct buffer_head *buffer;
23726
 
+               spin_lock_irqsave(&conf->device_lock, flags);
23727
 
+               /* we can return a buffer if we bypassed the cache or
23728
 
+                * if the top buffer is not in highmem.  If there are
23729
 
+                * multiple buffers, leave the extra work to
23730
 
+                * handle_stripe
23731
 
+                */
23732
 
+               buffer = sh->bh_read[i];
23733
 
+               if (buffer &&
23734
 
+                   (!PageHighMem(buffer->b_page)
23735
 
+                    || buffer->b_page == bh->b_page )
23736
 
+                       ) {
23737
 
+                       sh->bh_read[i] = buffer->b_reqnext;
23738
 
+                       buffer->b_reqnext = NULL;
23739
 
+               } else
23740
 
+                       buffer = NULL;
23741
 
+               spin_unlock_irqrestore(&conf->device_lock, flags);
23742
 
+               if (sh->bh_page[i]==NULL)
23743
 
+                       set_bit(BH_Uptodate, &bh->b_state);
23744
 
+               if (buffer) {
23745
 
+                       if (buffer->b_page != bh->b_page)
23746
 
+                               memcpy(buffer->b_data, bh->b_data, bh->b_size);
23747
 
+                       buffer->b_end_io(buffer, 1);
23748
 
+               }
23749
 
+       } else {
23750
 
+               /* I/O error */
23751
 
+               if (sh->node[i])
23752
 
+                       evms_md_error(conf->mddev, sh->node[i]);
23753
 
+               else
23754
 
+                       LOG_WARNING("NODE was not set, skipping evms_md_error()\n");
23755
 
+               clear_bit(BH_Uptodate, &bh->b_state);
23756
 
+       }
23757
 
+       /* must restore b_page before unlocking buffer... */
23758
 
+       if (sh->bh_page[i]) {
23759
 
+               bh->b_page = sh->bh_page[i];
23760
 
+               bh->b_data = page_address(bh->b_page);
23761
 
+               sh->bh_page[i] = NULL;
23762
 
+               clear_bit(BH_Uptodate, &bh->b_state);
23763
 
+       }
23764
 
+       clear_bit(BH_Lock, &bh->b_state);
23765
 
+       set_bit(STRIPE_HANDLE, &sh->state);
23766
 
+       release_stripe(sh);
23767
 
+       if (sh->node[i]) {
23768
 
+               sh->node[i] = NULL;
23769
 
+       } else {
23770
 
+               LOG_WARNING(" evms node was not set.\n");
23771
 
+       }
23772
 
+       
23773
 
+}
23774
 
+
23775
 
+static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
23776
 
+{
23777
 
+       struct stripe_head *sh = bh->b_private;
23778
 
+       raid5_conf_t *conf = sh->raid_conf;
23779
 
+       int disks = conf->raid_disks, i;
23780
 
+       unsigned long flags;
23781
 
+
23782
 
+       for (i=0 ; i<disks; i++)
23783
 
+               if (bh == sh->bh_cache[i])
23784
 
+                       break;
23785
 
+                       
23786
 
+       if (i == disks) {
23787
 
+               BUG();
23788
 
+               return;
23789
 
+       }
23790
 
+
23791
 
+       md_spin_lock_irqsave(&conf->device_lock, flags);
23792
 
+       if (!uptodate) {
23793
 
+               /* I/O error */
23794
 
+               if (sh->node[i])
23795
 
+                       evms_md_error(conf->mddev, sh->node[i]);
23796
 
+               else
23797
 
+                       LOG_WARNING(" NODE was not set, skipping evms_md_error()\n");
23798
 
+       }
23799
 
+       clear_bit(BH_Lock, &bh->b_state);
23800
 
+       set_bit(STRIPE_HANDLE, &sh->state);
23801
 
+       __release_stripe(conf, sh);
23802
 
+       md_spin_unlock_irqrestore(&conf->device_lock, flags);
23803
 
+       if (sh->node[i]) {
23804
 
+               sh->node[i] = NULL;
23805
 
+       } else {
23806
 
+               LOG_WARNING(" evms node was not set.\n");
23807
 
+       }
23808
 
+}
23809
 
+       
23810
 
+
23811
 
+
23812
 
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
23813
 
+{
23814
 
+       raid5_conf_t *conf = sh->raid_conf;
23815
 
+       struct buffer_head *bh = sh->bh_cache[i];
23816
 
+       unsigned long block = sh->sector / (sh->size >> 9);
23817
 
+
23818
 
+       init_buffer(bh, raid5_end_read_request, sh);
23819
 
+       bh->b_dev       = conf->disks[i].dev;
23820
 
+       bh->b_blocknr   = block;
23821
 
+
23822
 
+       bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
23823
 
+       bh->b_size      = sh->size;
23824
 
+       bh->b_list      = BUF_LOCKED;
23825
 
+       return bh;
23826
 
+}
23827
 
+
23828
 
+static int raid5_error (
23829
 
+       mddev_t *mddev,
23830
 
+       evms_logical_node_t *node)
23831
 
+{
23832
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
23833
 
+       mdp_super_t *sb = mddev->sb;
23834
 
+       struct disk_info *disk;
23835
 
+       int i;
23836
 
+
23837
 
+       LOG_WARNING("%s: called\n", __FUNCTION__);
23838
 
+
23839
 
+       for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
23840
 
+               if (disk->node == node) {
23841
 
+                       if (disk->operational) {
23842
 
+                               disk->operational = 0;
23843
 
+                               mark_disk_faulty(sb->disks+disk->number);
23844
 
+                               mark_disk_nonsync(sb->disks+disk->number);
23845
 
+                               mark_disk_inactive(sb->disks+disk->number);
23846
 
+                               sb->active_disks--;
23847
 
+                               sb->working_disks--;
23848
 
+                               sb->failed_disks++;
23849
 
+                               mddev->sb_dirty = 1;
23850
 
+                               conf->working_disks--;
23851
 
+                               conf->failed_disks++;
23852
 
+                               evms_cs_wakeup_thread(conf->thread);
23853
 
+                               LOG_WARNING("Disk failure on %s, disabling device."
23854
 
+                                       " Operation continuing on %d devices\n",
23855
 
+                                       evms_md_partition_name (disk->node), conf->working_disks);
23856
 
+                       }
23857
 
+                       return 0;
23858
 
+               }
23859
 
+       }
23860
 
+       /*
23861
 
+        * handle errors in spares (during reconstruction)
23862
 
+        */
23863
 
+       if (conf->spare) {
23864
 
+               disk = conf->spare;
23865
 
+               if (disk->node == node) {
23866
 
+                       LOG_WARNING("EVMS RAID5: Disk failure on spare %s\n",
23867
 
+                                   evms_md_partition_name (disk->node));
23868
 
+                       if (!conf->spare->operational) {
23869
 
+                               /* probably a SET_DISK_FAULTY ioctl */
23870
 
+                               return -EIO;
23871
 
+                       }
23872
 
+                       disk->operational = 0;
23873
 
+                       disk->write_only = 0;
23874
 
+                       conf->spare = NULL;
23875
 
+                       mark_disk_faulty(sb->disks+disk->number);
23876
 
+                       mark_disk_nonsync(sb->disks+disk->number);
23877
 
+                       mark_disk_inactive(sb->disks+disk->number);
23878
 
+                       sb->spare_disks--;
23879
 
+                       sb->working_disks--;
23880
 
+                       sb->failed_disks++;
23881
 
+
23882
 
+                       mddev->sb_dirty = 1;
23883
 
+                       evms_cs_wakeup_thread(conf->thread);
23884
 
+
23885
 
+                       return 0;
23886
 
+               }
23887
 
+       }
23888
 
+       MD_BUG();
23889
 
+       return -EIO;
23890
 
+}      
23891
 
+
23892
 
+/*
23893
 
+ * Input: a 'big' sector number,
23894
 
+ * Output: index of the data and parity disk, and the sector # in them.
23895
 
+ */
23896
 
+static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
23897
 
+                       unsigned int data_disks, unsigned int * dd_idx,
23898
 
+                       unsigned int * pd_idx, raid5_conf_t *conf)
23899
 
+{
23900
 
+       unsigned long stripe;
23901
 
+       unsigned long chunk_number;
23902
 
+       unsigned int chunk_offset;
23903
 
+       unsigned long new_sector;
23904
 
+       int sectors_per_chunk = conf->chunk_size >> 9;
23905
 
+
23906
 
+       /* First compute the information on this sector */
23907
 
+
23908
 
+       /*
23909
 
+        * Compute the chunk number and the sector offset inside the chunk
23910
 
+        */
23911
 
+       chunk_number = r_sector / sectors_per_chunk;
23912
 
+       chunk_offset = r_sector % sectors_per_chunk;
23913
 
+
23914
 
+       /*
23915
 
+        * Compute the stripe number
23916
 
+        */
23917
 
+       stripe = chunk_number / data_disks;
23918
 
+
23919
 
+       /*
23920
 
+        * Compute the data disk and parity disk indexes inside the stripe
23921
 
+        */
23922
 
+       *dd_idx = chunk_number % data_disks;
23923
 
+
23924
 
+       /*
23925
 
+        * Select the parity disk based on the user selected algorithm.
23926
 
+        */
23927
 
+       if (conf->level == 4)
23928
 
+               *pd_idx = data_disks;
23929
 
+       else switch (conf->algorithm) {
23930
 
+               case ALGORITHM_LEFT_ASYMMETRIC:
23931
 
+                       *pd_idx = data_disks - stripe % raid_disks;
23932
 
+                       if (*dd_idx >= *pd_idx)
23933
 
+                               (*dd_idx)++;
23934
 
+                       break;
23935
 
+               case ALGORITHM_RIGHT_ASYMMETRIC:
23936
 
+                       *pd_idx = stripe % raid_disks;
23937
 
+                       if (*dd_idx >= *pd_idx)
23938
 
+                               (*dd_idx)++;
23939
 
+                       break;
23940
 
+               case ALGORITHM_LEFT_SYMMETRIC:
23941
 
+                       *pd_idx = data_disks - stripe % raid_disks;
23942
 
+                       *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
23943
 
+                       break;
23944
 
+               case ALGORITHM_RIGHT_SYMMETRIC:
23945
 
+                       *pd_idx = stripe % raid_disks;
23946
 
+                       *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
23947
 
+                       break;
23948
 
+               default:
23949
 
+                       LOG_ERROR(" unsupported algorithm %d\n", conf->algorithm);
23950
 
+       }
23951
 
+
23952
 
+       /*
23953
 
+        * Finally, compute the new sector number
23954
 
+        */
23955
 
+       new_sector = stripe * sectors_per_chunk + chunk_offset;
23956
 
+       return new_sector;
23957
 
+}
23958
 
+
23959
 
+#define check_xor()    do {                                    \
23960
 
+                          if (count == MAX_XOR_BLOCKS) {       \
23961
 
+                               evms_md_xor_block(count, bh_ptr);       \
23962
 
+                               count = 1;                      \
23963
 
+                          }                                    \
23964
 
+                       } while(0)
23965
 
+
23966
 
+
23967
 
+static void compute_block(struct stripe_head *sh, int dd_idx)
23968
 
+{
23969
 
+       raid5_conf_t *conf = sh->raid_conf;
23970
 
+       int i, count, disks = conf->raid_disks;
23971
 
+       struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
23972
 
+
23973
 
+       memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
23974
 
+       bh_ptr[0] = sh->bh_cache[dd_idx];
23975
 
+       count = 1;
23976
 
+       for (i = disks ; i--; ) {
23977
 
+               if (i == dd_idx)
23978
 
+                       continue;
23979
 
+               bh = sh->bh_cache[i];
23980
 
+               if (buffer_uptodate(bh))
23981
 
+                       bh_ptr[count++] = bh;
23982
 
+               else
23983
 
+                       LOG_ERROR("%s: %d, stripe %lu, %d not present\n",
23984
 
+                                 __FUNCTION__, dd_idx, sh->sector, i);
23985
 
+
23986
 
+               check_xor();
23987
 
+       }
23988
 
+       if (count != 1)
23989
 
+               evms_md_xor_block(count, bh_ptr);
23990
 
+       set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
23991
 
+}
23992
 
+
23993
 
+static void compute_parity(struct stripe_head *sh, int method)
23994
 
+{
23995
 
+       raid5_conf_t *conf = sh->raid_conf;
23996
 
+       int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
23997
 
+       struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
23998
 
+       struct buffer_head *chosen[MD_SB_DISKS];
23999
 
+
24000
 
+       memset(chosen, 0, sizeof(chosen));
24001
 
+
24002
 
+       count = 1;
24003
 
+       bh_ptr[0] = sh->bh_cache[pd_idx];
24004
 
+       switch(method) {
24005
 
+       case READ_MODIFY_WRITE:
24006
 
+               if (!buffer_uptodate(sh->bh_cache[pd_idx]))
24007
 
+                       BUG();
24008
 
+               for (i=disks ; i-- ;) {
24009
 
+                       if (i==pd_idx)
24010
 
+                               continue;
24011
 
+                       if (sh->bh_write[i] &&
24012
 
+                           buffer_uptodate(sh->bh_cache[i])) {
24013
 
+                               bh_ptr[count++] = sh->bh_cache[i];
24014
 
+                               chosen[i] = sh->bh_write[i];
24015
 
+                               sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
24016
 
+                               chosen[i]->b_reqnext = sh->bh_written[i];
24017
 
+                               sh->bh_written[i] = chosen[i];
24018
 
+                               check_xor();
24019
 
+                       }
24020
 
+               }
24021
 
+               break;
24022
 
+       case RECONSTRUCT_WRITE:
24023
 
+               memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
24024
 
+               for (i= disks; i-- ;)
24025
 
+                       if (i!=pd_idx && sh->bh_write[i]) {
24026
 
+                               chosen[i] = sh->bh_write[i];
24027
 
+                               sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
24028
 
+                               chosen[i]->b_reqnext = sh->bh_written[i];
24029
 
+                               sh->bh_written[i] = chosen[i];
24030
 
+                       }
24031
 
+               break;
24032
 
+       case CHECK_PARITY:
24033
 
+               break;
24034
 
+       }
24035
 
+       if (count>1) {
24036
 
+               evms_md_xor_block(count, bh_ptr);
24037
 
+               count = 1;
24038
 
+       }
24039
 
+       
24040
 
+       for (i = disks; i--;)
24041
 
+               if (chosen[i]) {
24042
 
+                       struct buffer_head *bh = sh->bh_cache[i];
24043
 
+                       char *bdata;
24044
 
+                       bdata = bh_kmap(chosen[i]);
24045
 
+                       memcpy(bh->b_data,
24046
 
+                              bdata,sh->size);
24047
 
+                       bh_kunmap(chosen[i]);
24048
 
+                       set_bit(BH_Lock, &bh->b_state);
24049
 
+                       mark_buffer_uptodate(bh, 1);
24050
 
+               }
24051
 
+
24052
 
+       switch(method) {
24053
 
+       case RECONSTRUCT_WRITE:
24054
 
+       case CHECK_PARITY:
24055
 
+               for (i=disks; i--;)
24056
 
+                       if (i != pd_idx) {
24057
 
+                               bh_ptr[count++] = sh->bh_cache[i];
24058
 
+                               check_xor();
24059
 
+                       }
24060
 
+               break;
24061
 
+       case READ_MODIFY_WRITE:
24062
 
+               for (i = disks; i--;)
24063
 
+                       if (chosen[i]) {
24064
 
+                               bh_ptr[count++] = sh->bh_cache[i];
24065
 
+                               check_xor();
24066
 
+                       }
24067
 
+       }
24068
 
+       if (count != 1)
24069
 
+               evms_md_xor_block(count, bh_ptr);
24070
 
+       
24071
 
+       if (method != CHECK_PARITY) {
24072
 
+               mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
24073
 
+               set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
24074
 
+       } else
24075
 
+               mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
24076
 
+}
24077
 
+
24078
 
+static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
24079
 
+{
24080
 
+       struct buffer_head **bhp;
24081
 
+       raid5_conf_t *conf = sh->raid_conf;
24082
 
+
24083
 
+       spin_lock(&sh->lock);
24084
 
+       spin_lock_irq(&conf->device_lock);
24085
 
+       bh->b_reqnext = NULL;
24086
 
+       if (rw == READ)
24087
 
+               bhp = &sh->bh_read[dd_idx];
24088
 
+       else
24089
 
+               bhp = &sh->bh_write[dd_idx];
24090
 
+       while (*bhp) {
24091
 
+               LOG_DEFAULT("EVMS RAID5: multiple %d requests for sector %ld\n", rw, sh->sector);
24092
 
+               bhp = & (*bhp)->b_reqnext;
24093
 
+       }
24094
 
+       *bhp = bh;
24095
 
+       spin_unlock_irq(&conf->device_lock);
24096
 
+       spin_unlock(&sh->lock);
24097
 
+
24098
 
+}
24099
 
+
24100
 
+
24101
 
+
24102
 
+
24103
 
+
24104
 
+/*
24105
 
+ * handle_stripe - do things to a stripe.
24106
 
+ *
24107
 
+ * We lock the stripe and then examine the state of various bits
24108
 
+ * to see what needs to be done.
24109
 
+ * Possible results:
24110
 
+ *    return some read request which now have data
24111
 
+ *    return some write requests which are safely on disc
24112
 
+ *    schedule a read on some buffers
24113
 
+ *    schedule a write of some buffers
24114
 
+ *    return confirmation of parity correctness
24115
 
+ *
24116
 
+ * Parity calculations are done inside the stripe lock
24117
 
+ * buffers are taken off read_list or write_list, and bh_cache buffers
24118
 
+ * get BH_Lock set before the stripe lock is released.
24119
 
+ *
24120
 
+ */
24121
 
+
24122
 
+static void handle_stripe(struct stripe_head *sh)
24123
 
+{
24124
 
+       raid5_conf_t *conf = sh->raid_conf;
24125
 
+       int disks = conf->raid_disks;
24126
 
+       struct buffer_head *return_ok= NULL, *return_fail = NULL;
24127
 
+       int action[MD_SB_DISKS];
24128
 
+       int i;
24129
 
+       int syncing;
24130
 
+       int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
24131
 
+       int failed_num=0;
24132
 
+       struct buffer_head *bh;
24133
 
+
24134
 
+       memset(action, 0, sizeof(action));
24135
 
+
24136
 
+       spin_lock(&sh->lock);
24137
 
+       clear_bit(STRIPE_HANDLE, &sh->state);
24138
 
+       clear_bit(STRIPE_DELAYED, &sh->state);
24139
 
+
24140
 
+       syncing = test_bit(STRIPE_SYNCING, &sh->state);
24141
 
+       /* Now to look around and see what can be done */
24142
 
+
24143
 
+       for (i=disks; i--; ) {
24144
 
+               bh = sh->bh_cache[i];
24145
 
+               /* maybe we can reply to a read */
24146
 
+               if (buffer_uptodate(bh) && sh->bh_read[i]) {
24147
 
+                       struct buffer_head *rbh, *rbh2;
24148
 
+                       spin_lock_irq(&conf->device_lock);
24149
 
+                       rbh = sh->bh_read[i];
24150
 
+                       sh->bh_read[i] = NULL;
24151
 
+                       spin_unlock_irq(&conf->device_lock);
24152
 
+                       while (rbh) {
24153
 
+                               char *bdata;
24154
 
+                               bdata = bh_kmap(rbh);
24155
 
+                               memcpy(bdata, bh->b_data, bh->b_size);
24156
 
+                               bh_kunmap(rbh);
24157
 
+                               rbh2 = rbh->b_reqnext;
24158
 
+                               rbh->b_reqnext = return_ok;
24159
 
+                               return_ok = rbh;
24160
 
+                               rbh = rbh2;
24161
 
+                       }
24162
 
+               }
24163
 
+
24164
 
+               /* now count some things */
24165
 
+               if (buffer_locked(bh)) locked++;
24166
 
+               if (buffer_uptodate(bh)) uptodate++;
24167
 
+
24168
 
+               
24169
 
+               if (sh->bh_read[i]) to_read++;
24170
 
+               if (sh->bh_write[i]) to_write++;
24171
 
+               if (sh->bh_written[i]) written++;
24172
 
+               if (!conf->disks[i].operational) {
24173
 
+                       failed++;
24174
 
+                       failed_num = i;
24175
 
+               }
24176
 
+       }
24177
 
+       /* check if the array has lost two devices and, if so, some requests might
24178
 
+        * need to be failed
24179
 
+        */
24180
 
+       if (failed > 1 && to_read+to_write) {
24181
 
+               for (i=disks; i--; ) {
24182
 
+                       /* fail all writes first */
24183
 
+                       if (sh->bh_write[i]) to_write--;
24184
 
+                       while ((bh = sh->bh_write[i])) {
24185
 
+                               sh->bh_write[i] = bh->b_reqnext;
24186
 
+                               bh->b_reqnext = return_fail;
24187
 
+                               return_fail = bh;
24188
 
+                       }
24189
 
+                       /* fail any reads if this device is non-operational */
24190
 
+                       if (!conf->disks[i].operational) {
24191
 
+                               spin_lock_irq(&conf->device_lock);
24192
 
+                               if (sh->bh_read[i]) to_read--;
24193
 
+                               while ((bh = sh->bh_read[i])) {
24194
 
+                                       sh->bh_read[i] = bh->b_reqnext;
24195
 
+                                       bh->b_reqnext = return_fail;
24196
 
+                                       return_fail = bh;
24197
 
+                               }
24198
 
+                               spin_unlock_irq(&conf->device_lock);
24199
 
+                       }
24200
 
+               }
24201
 
+       }
24202
 
+       if (failed > 1 && syncing) {
24203
 
+               evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
24204
 
+               clear_bit(STRIPE_SYNCING, &sh->state);
24205
 
+               syncing = 0;
24206
 
+       }
24207
 
+
24208
 
+       /* might be able to return some write requests if the parity block
24209
 
+        * is safe, or on a failed drive
24210
 
+        */
24211
 
+       bh = sh->bh_cache[sh->pd_idx];
24212
 
+       if ( written &&
24213
 
+            ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
24214
 
+              || (failed == 1 && failed_num == sh->pd_idx))
24215
 
+           ) {
24216
 
+           /* any written block on a uptodate or failed drive can be returned */
24217
 
+           for (i=disks; i--; )
24218
 
+               if (sh->bh_written[i]) {
24219
 
+                   bh = sh->bh_cache[i];
24220
 
+                   if (!conf->disks[sh->pd_idx].operational ||
24221
 
+                       (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
24222
 
+                       /* maybe we can return some write requests */
24223
 
+                       struct buffer_head *wbh, *wbh2;
24224
 
+                       wbh = sh->bh_written[i];
24225
 
+                       sh->bh_written[i] = NULL;
24226
 
+                       while (wbh) {
24227
 
+                           wbh2 = wbh->b_reqnext;
24228
 
+                           wbh->b_reqnext = return_ok;
24229
 
+                           return_ok = wbh;
24230
 
+                           wbh = wbh2;
24231
 
+                       }
24232
 
+                   }
24233
 
+               }
24234
 
+       }
24235
 
+               
24236
 
+       /* Now we might consider reading some blocks, either to check/generate
24237
 
+        * parity, or to satisfy requests
24238
 
+        */
24239
 
+       if (to_read || (syncing && (uptodate+failed < disks))) {
24240
 
+               for (i=disks; i--;) {
24241
 
+                       bh = sh->bh_cache[i];
24242
 
+                       if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
24243
 
+                           (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
24244
 
+                               /* we would like to get this block, possibly
24245
 
+                                * by computing it, but we might not be able to
24246
 
+                                */
24247
 
+                               if (uptodate == disks-1) {
24248
 
+                                       compute_block(sh, i);
24249
 
+                                       uptodate++;
24250
 
+                               } else if (conf->disks[i].operational) {
24251
 
+                                       set_bit(BH_Lock, &bh->b_state);
24252
 
+                                       action[i] = READ+1;
24253
 
+                                       /* if I am just reading this block and we don't have
24254
 
+                                          a failed drive, or any pending writes then sidestep the cache */
24255
 
+                                       if (sh->bh_page[i]) BUG();
24256
 
+                                       if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
24257
 
+                                           ! syncing && !failed && !to_write) {
24258
 
+                                               sh->bh_page[i] = sh->bh_cache[i]->b_page;
24259
 
+                                               sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
24260
 
+                                               sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
24261
 
+                                       }
24262
 
+                                       locked++;
24263
 
+                                       if (syncing)
24264
 
+                                               evms_md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
24265
 
+                               }
24266
 
+                       }
24267
 
+               }
24268
 
+               set_bit(STRIPE_HANDLE, &sh->state);
24269
 
+       }
24270
 
+
24271
 
+       /* now to consider writing and what else, if anything should be read */
24272
 
+       if (to_write) {
24273
 
+               int rmw=0, rcw=0;
24274
 
+               for (i=disks ; i--;) {
24275
 
+                       /* would I have to read this buffer for read_modify_write */
24276
 
+                       bh = sh->bh_cache[i];
24277
 
+                       if ((sh->bh_write[i] || i == sh->pd_idx) &&
24278
 
+                           (!buffer_locked(bh) || sh->bh_page[i]) &&
24279
 
+                           !buffer_uptodate(bh)) {
24280
 
+                               if (conf->disks[i].operational
24281
 
+/*                                 && !(conf->resync_parity && i == sh->pd_idx) */
24282
 
+                                       )
24283
 
+                                       rmw++;
24284
 
+                               else rmw += 2*disks;  /* cannot read it */
24285
 
+                       }
24286
 
+                       /* Would I have to read this buffer for reconstruct_write */
24287
 
+                       if (!sh->bh_write[i] && i != sh->pd_idx &&
24288
 
+                           (!buffer_locked(bh) || sh->bh_page[i]) &&
24289
 
+                           !buffer_uptodate(bh)) {
24290
 
+                               if (conf->disks[i].operational) rcw++;
24291
 
+                               else rcw += 2*disks;
24292
 
+                       }
24293
 
+               }
24294
 
+               set_bit(STRIPE_HANDLE, &sh->state);
24295
 
+               if (rmw < rcw && rmw > 0)
24296
 
+                       /* prefer read-modify-write, but need to get some data */
24297
 
+                       for (i=disks; i--;) {
24298
 
+                               bh = sh->bh_cache[i];
24299
 
+                               if ((sh->bh_write[i] || i == sh->pd_idx) &&
24300
 
+                                   !buffer_locked(bh) && !buffer_uptodate(bh) &&
24301
 
+                                   conf->disks[i].operational) {
24302
 
+                                       if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24303
 
+                                       {
24304
 
+                                               set_bit(BH_Lock, &bh->b_state);
24305
 
+                                               action[i] = READ+1;
24306
 
+                                               locked++;
24307
 
+                                       } else {
24308
 
+                                               set_bit(STRIPE_DELAYED, &sh->state);
24309
 
+                                               set_bit(STRIPE_HANDLE, &sh->state);
24310
 
+                                       }
24311
 
+                               }
24312
 
+                       }
24313
 
+               if (rcw <= rmw && rcw > 0)
24314
 
+                       /* want reconstruct write, but need to get some data */
24315
 
+                       for (i=disks; i--;) {
24316
 
+                               bh = sh->bh_cache[i];
24317
 
+                               if (!sh->bh_write[i]  && i != sh->pd_idx &&
24318
 
+                                   !buffer_locked(bh) && !buffer_uptodate(bh) &&
24319
 
+                                   conf->disks[i].operational) {
24320
 
+                                       if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24321
 
+                                       {
24322
 
+                                               set_bit(BH_Lock, &bh->b_state);
24323
 
+                                               action[i] = READ+1;
24324
 
+                                               locked++;
24325
 
+                                       } else {
24326
 
+                                               set_bit(STRIPE_DELAYED, &sh->state);
24327
 
+                                               set_bit(STRIPE_HANDLE, &sh->state);
24328
 
+                                       }
24329
 
+                               }
24330
 
+                       }
24331
 
+               /* now if nothing is locked, and if we have enough data, we can start a write request */
24332
 
+               if (locked == 0 && (rcw == 0 ||rmw == 0)) {
24333
 
+                       compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
24334
 
+                       /* now every locked buffer is ready to be written */
24335
 
+                       for (i=disks; i--;)
24336
 
+                               if (buffer_locked(sh->bh_cache[i])) {
24337
 
+                                       locked++;
24338
 
+                                       action[i] = WRITE+1;
24339
 
+                                       if (!conf->disks[i].operational
24340
 
+                                           || (i==sh->pd_idx && failed == 0))
24341
 
+                                               set_bit(STRIPE_INSYNC, &sh->state);
24342
 
+                               }
24343
 
+                       if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
24344
 
+                               atomic_dec(&conf->preread_active_stripes);
24345
 
+                               if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
24346
 
+                                       evms_cs_wakeup_thread(conf->thread);
24347
 
+                       }
24348
 
+               }
24349
 
+       }
24350
 
+
24351
 
+       /* maybe we need to check and possibly fix the parity for this stripe
24352
 
+        * Any reads will already have been scheduled, so we just see if enough data
24353
 
+        * is available
24354
 
+        */
24355
 
+       if (syncing && locked == 0 &&
24356
 
+           !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
24357
 
+               set_bit(STRIPE_HANDLE, &sh->state);
24358
 
+               if (failed == 0) {
24359
 
+                       if (uptodate != disks)
24360
 
+                               BUG();
24361
 
+                       compute_parity(sh, CHECK_PARITY);
24362
 
+                       uptodate--;
24363
 
+                       bh = sh->bh_cache[sh->pd_idx];
24364
 
+                       if ((*(u32*)bh->b_data) == 0 &&
24365
 
+                           !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
24366
 
+                               /* parity is correct (on disc, not in buffer any more) */
24367
 
+                               set_bit(STRIPE_INSYNC, &sh->state);
24368
 
+                       }
24369
 
+               }
24370
 
+               if (!test_bit(STRIPE_INSYNC, &sh->state)) {
24371
 
+                       struct disk_info *spare;
24372
 
+                       if (failed==0)
24373
 
+                               failed_num = sh->pd_idx;
24374
 
+                       /* should be able to compute the missing block and write it to spare */
24375
 
+                       if (!buffer_uptodate(sh->bh_cache[failed_num])) {
24376
 
+                               if (uptodate+1 != disks)
24377
 
+                                       BUG();
24378
 
+                               compute_block(sh, failed_num);
24379
 
+                               uptodate++;
24380
 
+                       }
24381
 
+                       if (uptodate != disks)
24382
 
+                               BUG();
24383
 
+                       bh = sh->bh_cache[failed_num];
24384
 
+                       set_bit(BH_Lock, &bh->b_state);
24385
 
+                       action[failed_num] = WRITE+1;
24386
 
+                       locked++;
24387
 
+                       set_bit(STRIPE_INSYNC, &sh->state);
24388
 
+                       if (conf->disks[failed_num].operational)
24389
 
+                               evms_md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
24390
 
+                       else if ((spare=conf->spare))
24391
 
+                               evms_md_sync_acct(spare->dev, bh->b_size>>9);
24392
 
+
24393
 
+               }
24394
 
+       }
24395
 
+       if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
24396
 
+               evms_md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
24397
 
+               clear_bit(STRIPE_SYNCING, &sh->state);
24398
 
+       }
24399
 
+       
24400
 
+       
24401
 
+       spin_unlock(&sh->lock);
24402
 
+
24403
 
+       while ((bh=return_ok)) {
24404
 
+               return_ok = bh->b_reqnext;
24405
 
+               bh->b_reqnext = NULL;
24406
 
+               bh->b_end_io(bh, 1);
24407
 
+       }
24408
 
+       while ((bh=return_fail)) {
24409
 
+               return_fail = bh->b_reqnext;
24410
 
+               bh->b_reqnext = NULL;
24411
 
+               bh->b_end_io(bh, 0);
24412
 
+       }
24413
 
+       for (i=disks; i-- ;)
24414
 
+               if (action[i]) {
24415
 
+                       struct buffer_head *bh = sh->bh_cache[i];
24416
 
+                       struct disk_info *spare = conf->spare;
24417
 
+                       evms_logical_node_t *node = NULL;
24418
 
+                       eio_t eio;
24419
 
+                       int skip = 0;
24420
 
+                       if (action[i] == READ+1)
24421
 
+                               bh->b_end_io = raid5_end_read_request;
24422
 
+                       else
24423
 
+                               bh->b_end_io = raid5_end_write_request;
24424
 
+                       if (conf->disks[i].operational) {
24425
 
+                               bh->b_dev = conf->disks[i].dev;
24426
 
+                               node = conf->disks[i].node;
24427
 
+                       } else if (spare && action[i] == WRITE+1) {
24428
 
+                               bh->b_dev = spare->dev;
24429
 
+                               node = spare->node;
24430
 
+                       } else skip=1;
24431
 
+                       if (!skip) {
24432
 
+                               atomic_inc(&sh->count);
24433
 
+                               bh->b_rdev = bh->b_dev;
24434
 
+                               bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
24435
 
+                               eio.bh = bh;
24436
 
+                               eio.rsector = bh->b_rsector;
24437
 
+                               eio.rsize = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24438
 
+                               sh->node[i] = node;
24439
 
+                               if (action[i] == READ+1)
24440
 
+                                       R_IO(node, &eio);
24441
 
+                               else
24442
 
+                                       W_IO(node, &eio);
24443
 
+                       } else {
24444
 
+                               clear_bit(BH_Lock, &bh->b_state);
24445
 
+                               set_bit(STRIPE_HANDLE, &sh->state);
24446
 
+                       }
24447
 
+               }
24448
 
+}
24449
 
+
24450
 
+static inline void raid5_activate_delayed(raid5_conf_t *conf)
24451
 
+{
24452
 
+       if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
24453
 
+               while (!list_empty(&conf->delayed_list)) {
24454
 
+                       struct list_head *l = conf->delayed_list.next;
24455
 
+                       struct stripe_head *sh;
24456
 
+                       sh = list_entry(l, struct stripe_head, lru);
24457
 
+                       list_del_init(l);
24458
 
+                       clear_bit(STRIPE_DELAYED, &sh->state);
24459
 
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
24460
 
+                               atomic_inc(&conf->preread_active_stripes);
24461
 
+                       list_add_tail(&sh->lru, &conf->handle_list);
24462
 
+               }
24463
 
+       }
24464
 
+}
24465
 
+static void raid5_unplug_device(void *data)
24466
 
+{
24467
 
+       raid5_conf_t *conf = (raid5_conf_t *)data;
24468
 
+       unsigned long flags;
24469
 
+
24470
 
+       spin_lock_irqsave(&conf->device_lock, flags);
24471
 
+
24472
 
+       raid5_activate_delayed(conf);
24473
 
+       
24474
 
+       conf->plugged = 0;
24475
 
+       evms_cs_wakeup_thread(conf->thread);
24476
 
+
24477
 
+       spin_unlock_irqrestore(&conf->device_lock, flags);
24478
 
+}
24479
 
+
24480
 
+static inline void raid5_plug_device(raid5_conf_t *conf)
24481
 
+{
24482
 
+       spin_lock_irq(&conf->device_lock);
24483
 
+       if (list_empty(&conf->delayed_list))
24484
 
+               if (!conf->plugged) {
24485
 
+                       conf->plugged = 1;
24486
 
+                       queue_task(&conf->plug_tq, &tq_disk);
24487
 
+               }
24488
 
+       spin_unlock_irq(&conf->device_lock);
24489
 
+}
24490
 
+
24491
 
+
24492
 
+static int raid5_make_request (mddev_t *mddev,
24493
 
+                              int rw,
24494
 
+                              eio_t *eio)
24495
 
+{
24496
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24497
 
+       const unsigned int raid_disks = conf->raid_disks;
24498
 
+       const unsigned int data_disks = raid_disks - 1;
24499
 
+       unsigned int dd_idx, pd_idx;
24500
 
+       unsigned long new_sector;
24501
 
+       int read_ahead = 0;
24502
 
+       struct buffer_head *bh = eio->bh;
24503
 
+       
24504
 
+       struct stripe_head *sh;
24505
 
+       
24506
 
+       /* Note: Need to add 64-bit support in the future */
24507
 
+       bh->b_size = (unsigned short)eio->rsize << EVMS_VSECTOR_SIZE_SHIFT;
24508
 
+       bh->b_rsector = (unsigned long)eio->rsector;
24509
 
+       if (rw == READA) {
24510
 
+               rw = READ;
24511
 
+               read_ahead=1;
24512
 
+       }
24513
 
+
24514
 
+       new_sector = raid5_compute_sector(bh->b_rsector,
24515
 
+                       raid_disks, data_disks, &dd_idx, &pd_idx, conf);
24516
 
+
24517
 
+       sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
24518
 
+       if (sh) {
24519
 
+               sh->pd_idx = pd_idx;
24520
 
+
24521
 
+               add_stripe_bh(sh, bh, dd_idx, rw);
24522
 
+
24523
 
+               raid5_plug_device(conf);
24524
 
+               handle_stripe(sh);
24525
 
+               release_stripe(sh);
24526
 
+       } else
24527
 
+               bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
24528
 
+       return 0;
24529
 
+}
24530
 
+
24531
 
+/*
24532
 
+ * function: allocate_bh
24533
 
+ *
24534
 
+ * This function obtains a buffer head from the private
24535
 
+ * buffer head pool (pre-allocated at EVMS initial
24536
 
+ * discovery time).
24537
 
+ *
24538
 
+ * NOTE: All access to the buffer head pool are protected
24539
 
+ * by a private spinlock.
24540
 
+ *
24541
 
+ */
24542
 
+static inline struct buffer_head *
24543
 
+allocate_bh(void)
24544
 
+{
24545
 
+       struct buffer_head *bh =
24546
 
+               evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
24547
 
+       if (bh) {
24548
 
+               init_waitqueue_head(&bh->b_wait);
24549
 
+       }
24550
 
+       return(bh);
24551
 
+}
24552
 
+
24553
 
+/*
24554
 
+ * function: deallocate_bh
24555
 
+ *
24556
 
+ * This function returns a buffer head to the private
24557
 
+ * buffer head pool (pre-allocated at EVMS initial
24558
 
+ * discovery time).
24559
 
+ *
24560
 
+ * NOTE: All access to the buffer head pool are protected
24561
 
+ * by a private spinlock.
24562
 
+ *
24563
 
+ */
24564
 
+static inline void
24565
 
+deallocate_bh(struct buffer_head *bh)
24566
 
+{
24567
 
+       evms_cs_deallocate_to_pool(evms_bh_pool, bh);
24568
 
+}
24569
 
+
24570
 
+/* this is the buffer head control block structure definition */
24571
 
+typedef struct bh_cb_s {
24572
 
+       int                 rc;
24573
 
+        atomic_t            blks_allocated;
24574
 
+        wait_queue_head_t   cb_wait;
24575
 
+} bh_cb_t;
24576
 
+
24577
 
+/*
24578
 
+ * function: __wait_on_bh_cb
24579
 
+ *
24580
 
+ * This is a worker function to wait_on_bh_cb.
24581
 
+ * This function waits for a set of private buffer heads
24582
 
+ * associated to the specified buffer head control block
24583
 
+ * to return from I/O completion. On completion of the
24584
 
+ * last buffer head, the calling function is awakened
24585
 
+ * and continues running.
24586
 
+ *
24587
 
+ * This is the worker function to the function wait_on_bh_cb.
24588
 
+ *
24589
 
+ */
24590
 
+static void
24591
 
+__wait_on_bh_cb(bh_cb_t *bh_cb)
24592
 
+{
24593
 
+        struct task_struct *tsk = current;
24594
 
+        DECLARE_WAITQUEUE(wait, tsk);
24595
 
+
24596
 
+        add_wait_queue(&bh_cb->cb_wait, &wait);
24597
 
+        do {
24598
 
+                run_task_queue(&tq_disk);
24599
 
+                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
24600
 
+                if (!atomic_read(&bh_cb->blks_allocated))
24601
 
+                        break;
24602
 
+                schedule();
24603
 
+        } while (atomic_read(&bh_cb->blks_allocated));
24604
 
+        tsk->state = TASK_RUNNING;
24605
 
+        remove_wait_queue(&bh_cb->cb_wait, &wait);
24606
 
+}
24607
 
+
24608
 
+/*
24609
 
+ * function: wait_on_bh_cb
24610
 
+ *
24611
 
+ * This function waits for a set of private buffer heads
24612
 
+ * associated to the specified buffer head control block
24613
 
+ * to return from I/O completion. On completion of the
24614
 
+ * last buffer head, the calling function is awakened
24615
 
+ * and continues running.
24616
 
+ *
24617
 
+ */
24618
 
+static void
24619
 
+wait_on_bh_cb(bh_cb_t *bh_cb)
24620
 
+{
24621
 
+        if (atomic_read(&bh_cb->blks_allocated))
24622
 
+                __wait_on_bh_cb(bh_cb);
24623
 
+       else
24624
 
+               /* if we ended up with no buffer heads on
24625
 
+                * this pass, lets wait a until a few buffer
24626
 
+                * heads have been freed and try again. This
24627
 
+                * should provide a reasonable delay.
24628
 
+                */
24629
 
+               schedule();
24630
 
+}
24631
 
+
24632
 
+/*
24633
 
+ * function: end_bh_cb_io
24634
 
+ *
24635
 
+ * This is the I/O completion function that is called for
24636
 
+ * each private buffer head obtained from the buffer head
24637
 
+ * pool. Control is return thru this routine so we can track
24638
 
+ * all outstanding requests to know when to awaken the caller,
24639
 
+ * and to regain control after all I/Os have been performed.
24640
 
+ *
24641
 
+ */
24642
 
+static void
24643
 
+end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
24644
 
+{
24645
 
+        bh_cb_t *bh_cb = (bh_cb_t *)bh->b_private;
24646
 
+
24647
 
+       /* record that errors occurred */
24648
 
+       if (!uptodate) {
24649
 
+               bh_cb->rc = -EIO;
24650
 
+       }
24651
 
+        mark_buffer_uptodate(bh, uptodate);
24652
 
+        unlock_buffer(bh);
24653
 
+
24654
 
+        deallocate_bh(bh);
24655
 
+        atomic_dec(&bh_cb->blks_allocated);
24656
 
+        if (!atomic_read(&bh_cb->blks_allocated))
24657
 
+                if (waitqueue_active(&bh_cb->cb_wait))
24658
 
+                    wake_up(&bh_cb->cb_wait);
24659
 
+}
24660
 
+
24661
 
+/*
24662
 
+ * function: md_raid5_internal_partial_sector_io
24663
 
+ *
24664
 
+ * This function is a support function for md_raid5_internal_io,
24665
 
+ * which handles the cases of performing I/O to only a part
24666
 
+ * of sector. This function is not designed to be called
24667
 
+ * directly, other than by md_raid5_internal_io.
24668
 
+ *
24669
 
+ */
24670
 
+static int
24671
 
+md_raid5_internal_partial_sector_io(
24672
 
+       mddev_t *mddev,
24673
 
+        int io_flag,
24674
 
+       bh_cb_t *bh_cb,
24675
 
+        u_int64_t next_offset,
24676
 
+        u_int64_t sector_offset,
24677
 
+       u_int64_t io_size,
24678
 
+        void *bufptr,
24679
 
+       unsigned char **sector_buf )
24680
 
+{
24681
 
+       int rc = 0;
24682
 
+        struct buffer_head *bh;
24683
 
+       eio_t eio;
24684
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24685
 
+
24686
 
+       if (*sector_buf == NULL)
24687
 
+               /* allocate buffer for incoming sector */
24688
 
+               rc = evms_cs_allocate_memory((void **)sector_buf,
24689
 
+                                            conf->buffer_size);
24690
 
+       if (!rc) {
24691
 
+               /* allocate a buffer head from the pool */
24692
 
+               while((bh = allocate_bh()) == NULL)
24693
 
+                       /* yielding the cpu is playing it
24694
 
+                        * safe. it might be wiser to just
24695
 
+                        * spin. requires more thought.
24696
 
+                        */
24697
 
+                       schedule();
24698
 
+
24699
 
+               /* set up the buffer head for this sector */
24700
 
+               bh->b_end_io = end_bh_cb_io_sync;
24701
 
+               bh->b_size = conf->buffer_size;
24702
 
+               bh->b_rdev = 0;
24703
 
+               bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
24704
 
+               bh->b_data = *sector_buf;
24705
 
+               bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24706
 
+               bh->b_state = 0;
24707
 
+               set_bit(BH_Dirty, &bh->b_state);
24708
 
+               set_bit(BH_Lock, &bh->b_state);
24709
 
+               set_bit(BH_Req, &bh->b_state);
24710
 
+               set_bit(BH_Mapped, &bh->b_state);
24711
 
+               bh->b_private = (void *)bh_cb;
24712
 
+               atomic_inc(&bh_cb->blks_allocated);
24713
 
+
24714
 
+               /* drive the buffer head down   */
24715
 
+               /* to the device                */
24716
 
+               eio.bh = bh;
24717
 
+               eio.rsector = bh->b_rsector;
24718
 
+               eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24719
 
+               raid5_make_request(mddev, READ, &eio);
24720
 
+
24721
 
+               /* wait for all bh's I/O's to end */
24722
 
+               wait_on_bh_cb(bh_cb);
24723
 
+
24724
 
+               /* copy data to/from user */
24725
 
+               if (io_flag != WRITE)
24726
 
+                       /* READ */
24727
 
+                       memcpy(bufptr,
24728
 
+                              *sector_buf + sector_offset,
24729
 
+                              io_size);
24730
 
+               else {
24731
 
+                       /* WRITE */
24732
 
+                       memcpy(*sector_buf + sector_offset,
24733
 
+                              bufptr,
24734
 
+                              io_size);
24735
 
+
24736
 
+                       /* allocate a buffer head from the pool */
24737
 
+                       while((bh = allocate_bh()) == NULL)
24738
 
+                               /* yielding the cpu is playing it
24739
 
+                                * safe. it might be wiser to just
24740
 
+                                * spin. requires more thought.
24741
 
+                                */
24742
 
+                               schedule();
24743
 
+
24744
 
+                       /* set up the buffer head for this sector */
24745
 
+                       bh->b_end_io = end_bh_cb_io_sync;
24746
 
+                       bh->b_size = conf->buffer_size;
24747
 
+                       bh->b_rdev = 0;
24748
 
+                       bh->b_rsector = (next_offset - sector_offset) >> EVMS_VSECTOR_SIZE_SHIFT;
24749
 
+                       bh->b_data = *sector_buf;
24750
 
+                       bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24751
 
+                       bh->b_state = 0;
24752
 
+                       set_bit(BH_Dirty, &bh->b_state);
24753
 
+                       set_bit(BH_Lock, &bh->b_state);
24754
 
+                       set_bit(BH_Req, &bh->b_state);
24755
 
+                       set_bit(BH_Mapped, &bh->b_state);
24756
 
+                       bh->b_private = (void *)bh_cb;
24757
 
+                       atomic_inc(&bh_cb->blks_allocated);
24758
 
+
24759
 
+                       /* drive the buffer head down   */
24760
 
+                       /* to the device                */
24761
 
+                       eio.bh = bh;
24762
 
+                       eio.rsector = bh->b_rsector;
24763
 
+                       eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24764
 
+                       raid5_make_request(mddev, WRITE, &eio);
24765
 
+
24766
 
+                       /* wait for all bh's I/O's to end */
24767
 
+                       wait_on_bh_cb(bh_cb);
24768
 
+               }
24769
 
+       }
24770
 
+       return(rc);
24771
 
+}
24772
 
+
24773
 
+/*
24774
 
+ * function: md_raid5_internal_io
24775
 
+ *
24776
 
+ * This function provides support for synchronous I/O
24777
 
+ * operations to the underlying devices. These I/O
24778
 
+ * operations are NOT buffered in any way including the
24779
 
+ * operating system's buffer cache.
24780
 
+ *
24781
 
+ * This function can work with any hardsector size that
24782
 
+ * is a power of 2.
24783
 
+ *
24784
 
+ * node           : logical node of the target logical disk
24785
 
+ * io_flag        : 0 = read, 1 = write, 2 = read-a-head
24786
 
+ * starting_offset: the 0-based (disk relative) byte offset
24787
 
+ * num_bytes      : the total number of bytes in this I/O
24788
 
+ * bufptr         : address of the memory to read/write the data
24789
 
+ *
24790
 
+ */
24791
 
+static int
24792
 
+md_raid5_internal_io(
24793
 
+       mddev_t *mddev,
24794
 
+        int io_flag,
24795
 
+        u_int64_t starting_offset,
24796
 
+       u_int64_t num_bytes,
24797
 
+        void *bufptr )
24798
 
+{
24799
 
+        int rc = 0;
24800
 
+        u_int64_t next_offset, remaining_bytes;
24801
 
+        char *cur_bufptr;
24802
 
+        bh_cb_t bh_cb;
24803
 
+       unsigned char *sector_buf = NULL;
24804
 
+       evms_logical_node_t *node = mddev->node;
24805
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24806
 
+
24807
 
+        LOG_EVERYTHING("%s: node(%s), ioflag(%u), start_offset(%Lu), num_bytes(%Lu), bufptr(0x%p)\n",
24808
 
+                  __FUNCTION__, node->name, io_flag, starting_offset, num_bytes, bufptr);
24809
 
+
24810
 
+       /* check for 0 length request */
24811
 
+        if ( num_bytes == 0 ) {
24812
 
+               LOG_ERROR("%s: error requesting 0 bytes.\n", __FUNCTION__);
24813
 
+                rc = -EINVAL;
24814
 
+       }
24815
 
+       /* check for out of bound request */
24816
 
+       if (!rc) {
24817
 
+               u64 node_total_bytes =
24818
 
+                       node->total_vsectors <<
24819
 
+                       EVMS_VSECTOR_SIZE_SHIFT;
24820
 
+               if ( (starting_offset + num_bytes) > node_total_bytes) {
24821
 
+                       LOG_ERROR("%s: attempted %s beyond boundary(%Lu bytes), requesting offset(%Lu), length(%Lu).\n",
24822
 
+                                 __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
24823
 
+                               node_total_bytes, starting_offset, num_bytes);
24824
 
+                       rc = -EINVAL;
24825
 
+               }
24826
 
+       }
24827
 
+       /* check for invalid io_flag value */
24828
 
+       if (!rc)
24829
 
+               switch( io_flag ) {
24830
 
+                       case READ:   /* read...   */
24831
 
+                       case WRITE:  /* write...  */
24832
 
+                       case READA:  /* reada...  */
24833
 
+                               break;
24834
 
+                       default:
24835
 
+                               rc = -EINVAL;
24836
 
+                               break;
24837
 
+               }
24838
 
+
24839
 
+       /* initialize the buffer head control block */
24840
 
+       memset(&bh_cb, 0, sizeof(bh_cb_t));
24841
 
+       init_waitqueue_head(&bh_cb.cb_wait);
24842
 
+
24843
 
+       /* only update the local copy of variables */
24844
 
+       cur_bufptr = bufptr;
24845
 
+       next_offset = starting_offset;
24846
 
+       remaining_bytes = num_bytes;
24847
 
+
24848
 
+       /* continue if no errors found */
24849
 
+       if (!rc) {
24850
 
+               u_int64_t sector_offset;
24851
 
+
24852
 
+               /* check for a mid-sector starting offset
24853
 
+                *
24854
 
+                * if found, perform I/O on part of that
24855
 
+                * sector
24856
 
+                */
24857
 
+               sector_offset = next_offset & (conf->buffer_size - 1);
24858
 
+               if (sector_offset) {
24859
 
+                       u_int64_t io_size;
24860
 
+
24861
 
+                       /* determine bytes in IO to this sector */
24862
 
+                       io_size = conf->buffer_size - sector_offset;
24863
 
+                       if (io_size > remaining_bytes)
24864
 
+                               io_size = remaining_bytes;
24865
 
+
24866
 
+                       /* perform the partial sector io */
24867
 
+                       rc = md_raid5_internal_partial_sector_io(
24868
 
+                               mddev,io_flag,&bh_cb,
24869
 
+                               next_offset,
24870
 
+                               sector_offset, io_size,
24871
 
+                               cur_bufptr, &sector_buf);
24872
 
+
24873
 
+                       if (!rc) {
24874
 
+                               /* update progress in local variables */
24875
 
+                               cur_bufptr += io_size;
24876
 
+                               next_offset += io_size;
24877
 
+                               remaining_bytes -= io_size;
24878
 
+                       }
24879
 
+               }
24880
 
+       }
24881
 
+
24882
 
+       /* continue if no errors found */
24883
 
+       if (!rc) {
24884
 
+               /* perform I/O on all the complete sectors
24885
 
+                * in this request.
24886
 
+                *
24887
 
+                * loop until there are no more complete sectors
24888
 
+                * to process.
24889
 
+                */
24890
 
+               while(remaining_bytes >= conf->buffer_size) {
24891
 
+                       /* this inner loop attempts to drive as many
24892
 
+                        * bytes (in sector size multiples) down to
24893
 
+                        * the device as possible using the available
24894
 
+                        * buffer heads in the pool.
24895
 
+                        */
24896
 
+                       while(remaining_bytes >= conf->buffer_size) {
24897
 
+                               struct buffer_head *bh;
24898
 
+                               eio_t eio;
24899
 
+
24900
 
+                               /* allocate a buffer head from the pool */
24901
 
+                               bh = allocate_bh();
24902
 
+                               if (bh == NULL) break;
24903
 
+
24904
 
+                               /* set up the buffer head for this I/O */
24905
 
+                               bh->b_end_io = end_bh_cb_io_sync;
24906
 
+                               bh->b_size = conf->buffer_size;
24907
 
+                               bh->b_data = cur_bufptr;
24908
 
+                               bh->b_rdev = 0;
24909
 
+                               bh->b_rsector = next_offset >> EVMS_VSECTOR_SIZE_SHIFT;
24910
 
+                               bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
24911
 
+                               bh->b_state = 0;
24912
 
+                               set_bit(BH_Dirty, &bh->b_state);
24913
 
+                               set_bit(BH_Lock, &bh->b_state);
24914
 
+                               set_bit(BH_Req, &bh->b_state);
24915
 
+                               set_bit(BH_Mapped, &bh->b_state);
24916
 
+                               bh->b_private = (void *)&bh_cb;
24917
 
+                               atomic_inc(&bh_cb.blks_allocated);
24918
 
+
24919
 
+                               /* drive the buffer head down   */
24920
 
+                               /* to the device                */
24921
 
+                               eio.bh = bh;
24922
 
+                               eio.rsector = bh->b_rsector;
24923
 
+                               eio.rsize = (u64)bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
24924
 
+                               raid5_make_request(mddev, io_flag, &eio);
24925
 
+
24926
 
+                               /* update progress in local variables */
24927
 
+                               cur_bufptr += bh->b_size;
24928
 
+                               next_offset += bh->b_size;
24929
 
+                               remaining_bytes -= bh->b_size;
24930
 
+                       }
24931
 
+                       /* wait for all bh's I/O's to end */
24932
 
+                       wait_on_bh_cb(&bh_cb);
24933
 
+               }
24934
 
+       }
24935
 
+
24936
 
+       /* continue if no errors found */
24937
 
+       if (!rc)
24938
 
+               /* check for a mid-sector ending offset
24939
 
+                *
24940
 
+                * if found, perform I/O on part of that
24941
 
+                * sector
24942
 
+                */
24943
 
+               if (remaining_bytes)
24944
 
+                       /* perform the partial sector io */
24945
 
+                       rc = md_raid5_internal_partial_sector_io(
24946
 
+                               mddev, io_flag, &bh_cb,
24947
 
+                               next_offset,
24948
 
+                               0, remaining_bytes,
24949
 
+                               cur_bufptr, &sector_buf);
24950
 
+
24951
 
+       /* free the sector buffer if it was allocated */
24952
 
+       if (sector_buf)
24953
 
+               evms_cs_deallocate_memory(sector_buf);
24954
 
+
24955
 
+       /* coalesce return codes */
24956
 
+       rc |= bh_cb.rc;
24957
 
+
24958
 
+        LOG_EVERYTHING("%s: rc(%u)\n", __FUNCTION__, rc);
24959
 
+        return( rc );
24960
 
+}
24961
 
+
24962
 
+static int
24963
 
+raid5_init_io(
24964
 
+       mddev_t *mddev,
24965
 
+        int                   io_flag,
24966
 
+        evms_sector_t         startingLSN,
24967
 
+        evms_sector_t         numLSNs,
24968
 
+        void                 *bufptr )
24969
 
+{
24970
 
+       int rc = 0;
24971
 
+       u_int64_t starting_offset, num_bytes;
24972
 
+
24973
 
+       starting_offset = startingLSN;
24974
 
+       starting_offset <<= EVMS_VSECTOR_SIZE_SHIFT;
24975
 
+       num_bytes = numLSNs;
24976
 
+       num_bytes <<= EVMS_VSECTOR_SIZE_SHIFT;
24977
 
+       rc = md_raid5_internal_io(mddev,io_flag,starting_offset,
24978
 
+                               num_bytes, bufptr);
24979
 
+       return(rc);
24980
 
+}
24981
 
+
24982
 
+static int raid5_sync_request (mddev_t *mddev, unsigned long sector_nr)
24983
 
+{
24984
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
24985
 
+       struct stripe_head *sh;
24986
 
+       int sectors_per_chunk = conf->chunk_size >> 9;
24987
 
+       unsigned long stripe = sector_nr/sectors_per_chunk;
24988
 
+       int chunk_offset = sector_nr % sectors_per_chunk;
24989
 
+       int dd_idx, pd_idx;
24990
 
+       unsigned long first_sector;
24991
 
+       int raid_disks = conf->raid_disks;
24992
 
+       int data_disks = raid_disks-1;
24993
 
+       int redone = 0;
24994
 
+       int bufsize;
24995
 
+
24996
 
+       sh = get_active_stripe(conf, sector_nr, 0, 0);
24997
 
+       bufsize = sh->size;
24998
 
+       redone = sector_nr - sh->sector;
24999
 
+       first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
25000
 
+               + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
25001
 
+       sh->pd_idx = pd_idx;
25002
 
+       spin_lock(&sh->lock);   
25003
 
+       set_bit(STRIPE_SYNCING, &sh->state);
25004
 
+       clear_bit(STRIPE_INSYNC, &sh->state);
25005
 
+       sh->sync_redone = redone;
25006
 
+       spin_unlock(&sh->lock);
25007
 
+
25008
 
+       handle_stripe(sh);
25009
 
+       release_stripe(sh);
25010
 
+
25011
 
+       return (bufsize>>9)-redone;
25012
 
+}
25013
 
+
25014
 
+/*
25015
 
+ * This is our raid5 kernel thread.
25016
 
+ *
25017
 
+ * We scan the hash table for stripes which can be handled now.
25018
 
+ * During the scan, completed stripes are saved for us by the interrupt
25019
 
+ * handler, so that they will not have to wait for our next wakeup.
25020
 
+ */
25021
 
+static void raid5d (void *data)
25022
 
+{
25023
 
+       struct stripe_head *sh;
25024
 
+       raid5_conf_t *conf = data;
25025
 
+       mddev_t *mddev = conf->mddev;
25026
 
+       int handled;
25027
 
+
25028
 
+       LOG_ENTRY_EXIT("+++ raid5d active\n");
25029
 
+
25030
 
+       handled = 0;
25031
 
+
25032
 
+       if (mddev->sb_dirty) {
25033
 
+               mddev->sb_dirty = 0;
25034
 
+               evms_md_update_sb(mddev);
25035
 
+       }
25036
 
+       md_spin_lock_irq(&conf->device_lock);
25037
 
+       while (1) {
25038
 
+               struct list_head *first;
25039
 
+
25040
 
+               if (list_empty(&conf->handle_list) &&
25041
 
+                   atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
25042
 
+                   !conf->plugged &&
25043
 
+                   !list_empty(&conf->delayed_list))
25044
 
+                       raid5_activate_delayed(conf);
25045
 
+
25046
 
+               if (list_empty(&conf->handle_list))
25047
 
+                       break;
25048
 
+
25049
 
+               first = conf->handle_list.next;
25050
 
+               sh = list_entry(first, struct stripe_head, lru);
25051
 
+
25052
 
+               list_del_init(first);
25053
 
+               atomic_inc(&sh->count);
25054
 
+               if (atomic_read(&sh->count)!= 1)
25055
 
+                       BUG();
25056
 
+               md_spin_unlock_irq(&conf->device_lock);
25057
 
+               
25058
 
+               handled++;
25059
 
+               handle_stripe(sh);
25060
 
+               release_stripe(sh);
25061
 
+
25062
 
+               md_spin_lock_irq(&conf->device_lock);
25063
 
+       }
25064
 
+       LOG_DEBUG("%d stripes handled\n", handled);
25065
 
+
25066
 
+       md_spin_unlock_irq(&conf->device_lock);
25067
 
+
25068
 
+       LOG_ENTRY_EXIT("+++ raid5d inactive\n");
25069
 
+}
25070
 
+
25071
 
+/*
25072
 
+ * Private kernel thread for parity reconstruction after an unclean
25073
 
+ * shutdown. Reconstruction on spare drives in case of a failed drive
25074
 
+ * is done by the generic mdsyncd.
25075
 
+ */
25076
 
+static void raid5syncd (void *data)
25077
 
+{
25078
 
+       raid5_conf_t *conf = data;
25079
 
+       mddev_t *mddev = conf->mddev;
25080
 
+
25081
 
+       if (!conf->resync_parity)
25082
 
+               return;
25083
 
+       if (conf->resync_parity == 2)
25084
 
+               return;
25085
 
+       down(&mddev->recovery_sem);
25086
 
+       if (evms_md_do_sync(mddev,NULL)) {
25087
 
+               up(&mddev->recovery_sem);
25088
 
+               LOG_WARNING("resync aborted!\n");
25089
 
+               return;
25090
 
+       }
25091
 
+       conf->resync_parity = 0;
25092
 
+       up(&mddev->recovery_sem);
25093
 
+       LOG_DEFAULT("resync finished.\n");
25094
 
+}
25095
 
+
25096
 
+static int raid5_run (mddev_t *mddev)
25097
 
+{
25098
 
+       raid5_conf_t *conf;
25099
 
+       int i, j, raid_disk, memory;
25100
 
+       mdp_super_t *sb = mddev->sb;
25101
 
+       mdp_disk_t *desc;
25102
 
+       mdk_rdev_t *rdev;
25103
 
+       struct disk_info *disk;
25104
 
+       struct md_list_head *tmp;
25105
 
+       int start_recovery = 0;
25106
 
+
25107
 
+       MOD_INC_USE_COUNT;
25108
 
+
25109
 
+       if (sb->level != 5 && sb->level != 4) {
25110
 
+               LOG_ERROR("%s: [md%d] raid level not set to 4/5 (%d)\n",
25111
 
+                         __FUNCTION__, mdidx(mddev), sb->level);
25112
 
+               MOD_DEC_USE_COUNT;
25113
 
+               return -EIO;
25114
 
+       }
25115
 
+
25116
 
+       mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
25117
 
+       if ((conf = mddev->private) == NULL)
25118
 
+               goto abort;
25119
 
+       memset (conf, 0, sizeof (*conf));
25120
 
+       conf->mddev = mddev;
25121
 
+
25122
 
+       if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
25123
 
+               goto abort;
25124
 
+       memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
25125
 
+
25126
 
+       conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
25127
 
+       md_init_waitqueue_head(&conf->wait_for_stripe);
25128
 
+       INIT_LIST_HEAD(&conf->handle_list);
25129
 
+       INIT_LIST_HEAD(&conf->delayed_list);
25130
 
+       INIT_LIST_HEAD(&conf->inactive_list);
25131
 
+       atomic_set(&conf->active_stripes, 0);
25132
 
+       atomic_set(&conf->preread_active_stripes, 0);
25133
 
+       conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
25134
 
+
25135
 
+       conf->plugged = 0;
25136
 
+       conf->plug_tq.sync = 0;
25137
 
+       conf->plug_tq.routine = &raid5_unplug_device;
25138
 
+       conf->plug_tq.data = conf;
25139
 
+
25140
 
+       ITERATE_RDEV(mddev,rdev,tmp) {
25141
 
+               /*
25142
 
+                * This is important -- we are using the descriptor on
25143
 
+                * the disk only to get a pointer to the descriptor on
25144
 
+                * the main superblock, which might be more recent.
25145
 
+                */
25146
 
+               desc = sb->disks + rdev->desc_nr;
25147
 
+               raid_disk = desc->raid_disk;
25148
 
+               disk = conf->disks + raid_disk;
25149
 
+
25150
 
+               if (disk_faulty(desc)) {
25151
 
+                       LOG_ERROR("%s: disabled device %s (errors detected)\n",
25152
 
+                                 __FUNCTION__, evms_md_partition_name(rdev->node));
25153
 
+                       if (!rdev->faulty) {
25154
 
+                               MD_BUG();
25155
 
+                               goto abort;
25156
 
+                       }
25157
 
+                       disk->number = desc->number;
25158
 
+                       disk->raid_disk = raid_disk;
25159
 
+                       disk->dev = rdev->dev;
25160
 
+                       disk->node = rdev->node;
25161
 
+
25162
 
+                       disk->operational = 0;
25163
 
+                       disk->write_only = 0;
25164
 
+                       disk->spare = 0;
25165
 
+                       disk->used_slot = 1;
25166
 
+                       continue;
25167
 
+               }
25168
 
+               if (disk_active(desc)) {
25169
 
+                       if (!disk_sync(desc)) {
25170
 
+                               LOG_ERROR("%s: disabled device %s (not in sync)\n",
25171
 
+                                         __FUNCTION__, evms_md_partition_name(rdev->node));
25172
 
+                               MD_BUG();
25173
 
+                               goto abort;
25174
 
+                       }
25175
 
+                       if (raid_disk > sb->raid_disks) {
25176
 
+                               LOG_ERROR("%s: disabled device %s (inconsistent descriptor)\n",
25177
 
+                                         __FUNCTION__, evms_md_partition_name(rdev->node));
25178
 
+                               continue;
25179
 
+                       }
25180
 
+                       if (disk->operational) {
25181
 
+                               LOG_ERROR("%s: disabled device %s (device %d already operational)\n",
25182
 
+                                         __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
25183
 
+                               continue;
25184
 
+                       }
25185
 
+                       LOG_DEFAULT("%s: device %s operational as raid disk %d\n",
25186
 
+                                   __FUNCTION__, evms_md_partition_name(rdev->node), raid_disk);
25187
 
+       
25188
 
+                       disk->number = desc->number;
25189
 
+                       disk->raid_disk = raid_disk;
25190
 
+                       disk->dev = rdev->dev;
25191
 
+                       disk->node = rdev->node;
25192
 
+                       disk->operational = 1;
25193
 
+                       disk->used_slot = 1;
25194
 
+
25195
 
+                       conf->working_disks++;
25196
 
+               } else {
25197
 
+                       /*
25198
 
+                        * Must be a spare disk ..
25199
 
+                        */
25200
 
+                       LOG_DEFAULT(" spare disk %s\n", evms_md_partition_name(rdev->node));
25201
 
+                       disk->number = desc->number;
25202
 
+                       disk->raid_disk = raid_disk;
25203
 
+                       disk->dev = rdev->dev;
25204
 
+                       disk->node = rdev->node;
25205
 
+
25206
 
+                       disk->operational = 0;
25207
 
+                       disk->write_only = 0;
25208
 
+                       disk->spare = 1;
25209
 
+                       disk->used_slot = 1;
25210
 
+               }
25211
 
+       }
25212
 
+
25213
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
25214
 
+               desc = sb->disks + i;
25215
 
+               raid_disk = desc->raid_disk;
25216
 
+               disk = conf->disks + raid_disk;
25217
 
+
25218
 
+               if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
25219
 
+                       !conf->disks[raid_disk].used_slot) {
25220
 
+
25221
 
+                       disk->number = desc->number;
25222
 
+                       disk->raid_disk = raid_disk;
25223
 
+                       disk->dev = MKDEV(0,0);
25224
 
+                       disk->node = NULL;
25225
 
+
25226
 
+                       disk->operational = 0;
25227
 
+                       disk->write_only = 0;
25228
 
+                       disk->spare = 0;
25229
 
+                       disk->used_slot = 1;
25230
 
+               }
25231
 
+       }
25232
 
+
25233
 
+       conf->raid_disks = sb->raid_disks;
25234
 
+       /*
25235
 
+        * faied_disks: 0 for a fully functional array, 1 for a degraded array.
25236
 
+        */
25237
 
+       conf->failed_disks = conf->raid_disks - conf->working_disks;
25238
 
+       conf->mddev = mddev;
25239
 
+       conf->chunk_size = sb->chunk_size;
25240
 
+       conf->level = sb->level;
25241
 
+       conf->algorithm = sb->layout;
25242
 
+       conf->max_nr_stripes = NR_STRIPES;
25243
 
+
25244
 
+       /*
25245
 
+        * If chunk_size is validated in md_core.c, why do it again?
25246
 
+        * And the check in md_core is:
25247
 
+        *     chunk_size has to be a power of 2 and multiples of PAGE_SIZE
25248
 
+        */
25249
 
+
25250
 
+       if (!conf->chunk_size ||
25251
 
+           ( (1 << ffz(~conf->chunk_size)) != conf->chunk_size) ||
25252
 
+           (conf->chunk_size < PAGE_SIZE)) {
25253
 
+               LOG_ERROR("%s: invalid chunk size %d for md%d\n", __FUNCTION__, conf->chunk_size, mdidx(mddev));
25254
 
+               goto abort;
25255
 
+       }
25256
 
+       if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
25257
 
+               LOG_ERROR(" unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
25258
 
+               goto abort;
25259
 
+       }
25260
 
+       if (conf->failed_disks > 1) {
25261
 
+               LOG_ERROR(" not enough operational devices for md%d (%d/%d failed)\n",
25262
 
+                         mdidx(mddev), conf->failed_disks, conf->raid_disks);
25263
 
+               goto abort;
25264
 
+       }
25265
 
+
25266
 
+       if (conf->working_disks != sb->raid_disks) {
25267
 
+               LOG_WARNING(" md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
25268
 
+               start_recovery = 1;
25269
 
+       }
25270
 
+
25271
 
+       {
25272
 
+               const char * name = "evms_raid5d";
25273
 
+
25274
 
+               conf->thread = evms_cs_register_thread(raid5d, conf, name);
25275
 
+               if (!conf->thread) {
25276
 
+                       LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
25277
 
+                       goto abort;
25278
 
+               }
25279
 
+       }
25280
 
+
25281
 
+       memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
25282
 
+                conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
25283
 
+       if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
25284
 
+               LOG_ERROR("%s: couldn't allocate %dkB for buffers\n", __FUNCTION__, memory);
25285
 
+               shrink_stripes(conf, conf->max_nr_stripes);
25286
 
+               goto abort;
25287
 
+       } else
25288
 
+               LOG_DETAILS("%s: allocated %dkB for md%d\n", __FUNCTION__, memory, mdidx(mddev));
25289
 
+
25290
 
+       /*
25291
 
+        * Regenerate the "device is in sync with the raid set" bit for
25292
 
+        * each device.
25293
 
+        */
25294
 
+       for (i = 0; i < MD_SB_DISKS ; i++) {
25295
 
+               mark_disk_nonsync(sb->disks + i);
25296
 
+               for (j = 0; j < sb->raid_disks; j++) {
25297
 
+                       if (!conf->disks[j].operational)
25298
 
+                               continue;
25299
 
+                       if (sb->disks[i].number == conf->disks[j].number)
25300
 
+                               mark_disk_sync(sb->disks + i);
25301
 
+               }
25302
 
+       }
25303
 
+       sb->active_disks = conf->working_disks;
25304
 
+
25305
 
+       if (sb->active_disks == sb->raid_disks) {
25306
 
+               LOG_DETAILS("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
25307
 
+                       __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
25308
 
+       } else {
25309
 
+               LOG_WARNING("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
25310
 
+                       __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
25311
 
+       }
25312
 
+
25313
 
+       if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
25314
 
+               const char * name = "evms_raid5syncd";
25315
 
+
25316
 
+               conf->resync_thread = evms_cs_register_thread(raid5syncd, conf,name);
25317
 
+               if (!conf->resync_thread) {
25318
 
+                       LOG_ERROR("%s: couldn't allocate thread for md%d\n", __FUNCTION__, mdidx(mddev));
25319
 
+                       goto abort;
25320
 
+               }
25321
 
+
25322
 
+               LOG_WARNING("%s: raid set md%d not clean; reconstructing parity\n", __FUNCTION__, mdidx(mddev));
25323
 
+               conf->resync_parity = 1;
25324
 
+               evms_cs_wakeup_thread(conf->resync_thread);
25325
 
+       }
25326
 
+
25327
 
+       print_raid5_conf(conf);
25328
 
+       if (start_recovery)
25329
 
+               evms_md_recover_arrays();
25330
 
+       print_raid5_conf(conf);
25331
 
+
25332
 
+       /* Ok, everything is just fine now */
25333
 
+       return (0);
25334
 
+abort:
25335
 
+       if (conf) {
25336
 
+               print_raid5_conf(conf);
25337
 
+               if (conf->stripe_hashtbl)
25338
 
+                       free_pages((unsigned long) conf->stripe_hashtbl,
25339
 
+                                                       HASH_PAGES_ORDER);
25340
 
+               kfree(conf);
25341
 
+       }
25342
 
+       mddev->private = NULL;
25343
 
+       LOG_WARNING("%s: failed to run raid set md%d\n", __FUNCTION__, mdidx(mddev));
25344
 
+       MOD_DEC_USE_COUNT;
25345
 
+       return -EIO;
25346
 
+}
25347
 
+
25348
 
+static int raid5_stop_resync (mddev_t *mddev)
25349
 
+{
25350
 
+       raid5_conf_t *conf = mddev_to_conf(mddev);
25351
 
+       evms_thread_t *thread;
25352
 
+       
25353
 
+       if (conf == NULL) {
25354
 
+               return 0;
25355
 
+       }
25356
 
+
25357
 
+       thread = conf->resync_thread;
25358
 
+
25359
 
+       if (thread) {
25360
 
+               if (conf->resync_parity) {
25361
 
+                       conf->resync_parity = 2;
25362
 
+                       evms_cs_interrupt_thread(thread);
25363
 
+                       LOG_WARNING("%s: parity resync was not fully finished, restarting next time.\n", __FUNCTION__);
25364
 
+                       return 1;
25365
 
+               }
25366
 
+               return 0;
25367
 
+       }
25368
 
+       return 0;
25369
 
+}
25370
 
+
25371
 
+static int raid5_restart_resync (mddev_t *mddev)
25372
 
+{
25373
 
+       raid5_conf_t *conf = mddev_to_conf(mddev);
25374
 
+
25375
 
+       if (conf->resync_parity) {
25376
 
+               if (!conf->resync_thread) {
25377
 
+                       MD_BUG();
25378
 
+                       return 0;
25379
 
+               }
25380
 
+               LOG_DEFAULT("%s: waking up raid5resync.\n", __FUNCTION__);
25381
 
+               conf->resync_parity = 1;
25382
 
+               evms_cs_wakeup_thread(conf->resync_thread);
25383
 
+               return 1;
25384
 
+       } else
25385
 
+               LOG_DEFAULT("%s: no restart-resync needed.\n", __FUNCTION__);
25386
 
+       return 0;
25387
 
+}
25388
 
+
25389
 
+
25390
 
+static int raid5_stop (mddev_t *mddev)
25391
 
+{
25392
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25393
 
+
25394
 
+       if (conf != NULL) {
25395
 
+               if (conf->resync_thread)
25396
 
+                       evms_cs_unregister_thread(conf->resync_thread);
25397
 
+               evms_cs_unregister_thread(conf->thread);
25398
 
+               shrink_stripes(conf, conf->max_nr_stripes);
25399
 
+               free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
25400
 
+               kfree(conf);
25401
 
+               mddev->private = NULL;
25402
 
+       }
25403
 
+       MOD_DEC_USE_COUNT;
25404
 
+       return 0;
25405
 
+}
25406
 
+
25407
 
+#if RAID5_DEBUG
25408
 
+static void print_sh (struct stripe_head *sh)
25409
 
+{
25410
 
+       int i;
25411
 
+
25412
 
+       LOG_DEFAULT("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
25413
 
+       LOG_DEFAULT("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
25414
 
+       LOG_DEFAULT("sh %lu, ", sh->sector);
25415
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
25416
 
+               if (sh->bh_cache[i])
25417
 
+                       LOG_DEFAULT("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
25418
 
+       }
25419
 
+       LOG_DEFAULT("\n");
25420
 
+}
25421
 
+
25422
 
+static void printall (raid5_conf_t *conf)
25423
 
+{
25424
 
+       struct stripe_head *sh;
25425
 
+       int i;
25426
 
+
25427
 
+       md_spin_lock_irq(&conf->device_lock);
25428
 
+       for (i = 0; i < NR_HASH; i++) {
25429
 
+               sh = conf->stripe_hashtbl[i];
25430
 
+               for (; sh; sh = sh->hash_next) {
25431
 
+                       if (sh->raid_conf != conf)
25432
 
+                               continue;
25433
 
+                       print_sh(sh);
25434
 
+               }
25435
 
+       }
25436
 
+       md_spin_unlock_irq(&conf->device_lock);
25437
 
+}
25438
 
+#endif
25439
 
+
25440
 
+static int raid5_status (char *page, mddev_t *mddev)
25441
 
+{
25442
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25443
 
+       mdp_super_t *sb = mddev->sb;
25444
 
+       int sz = 0, i;
25445
 
+
25446
 
+       sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
25447
 
+       sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
25448
 
+       for (i = 0; i < conf->raid_disks; i++)
25449
 
+               sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
25450
 
+       sz += sprintf (page+sz, "]");
25451
 
+#if RAID5_DEBUG
25452
 
+#define D(x) \
25453
 
+       sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
25454
 
+       printall(conf);
25455
 
+#endif
25456
 
+       return sz;
25457
 
+}
25458
 
+
25459
 
+static void print_raid5_conf (raid5_conf_t *conf)
25460
 
+{
25461
 
+       int i;
25462
 
+       struct disk_info *tmp;
25463
 
+
25464
 
+       LOG_DEFAULT("RAID5 conf printout:\n");
25465
 
+       if (!conf) {
25466
 
+               LOG_DEFAULT("(conf==NULL)\n");
25467
 
+               return;
25468
 
+       }
25469
 
+       LOG_DEFAULT(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
25470
 
+                conf->working_disks, conf->failed_disks);
25471
 
+
25472
 
+#if RAID5_DEBUG
25473
 
+       for (i = 0; i < MD_SB_DISKS; i++) {
25474
 
+#else
25475
 
+       for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
25476
 
+#endif
25477
 
+               tmp = conf->disks + i;
25478
 
+               LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
25479
 
+                       i, tmp->spare,tmp->operational,
25480
 
+                       tmp->number,tmp->raid_disk,tmp->used_slot,
25481
 
+                       evms_md_partition_name(tmp->node));
25482
 
+       }
25483
 
+}
25484
 
+
25485
 
+static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
25486
 
+{
25487
 
+       int err = 0;
25488
 
+       int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
25489
 
+       raid5_conf_t *conf = mddev->private;
25490
 
+       struct disk_info *tmp, *sdisk, *fdisk, *rdisk;
25491
 
+       mdp_super_t *sb = mddev->sb;
25492
 
+       mdp_disk_t *failed_desc, *spare_desc;
25493
 
+       mdk_rdev_t *spare_rdev, *failed_rdev;
25494
 
+
25495
 
+       print_raid5_conf(conf);
25496
 
+       md_spin_lock_irq(&conf->device_lock);
25497
 
+       /*
25498
 
+        * find the disk ...
25499
 
+        */
25500
 
+       switch (state) {
25501
 
+
25502
 
+       case DISKOP_SPARE_ACTIVE:
25503
 
+
25504
 
+               /*
25505
 
+                * Find the failed disk within the RAID5 configuration ...
25506
 
+                * (this can only be in the first conf->raid_disks part)
25507
 
+                */
25508
 
+               for (i = 0; i < conf->raid_disks; i++) {
25509
 
+                       tmp = conf->disks + i;
25510
 
+                       if ((!tmp->operational && !tmp->spare) ||
25511
 
+                                       !tmp->used_slot) {
25512
 
+                               failed_disk = i;
25513
 
+                               break;
25514
 
+                       }
25515
 
+               }
25516
 
+               /*
25517
 
+                * When we activate a spare disk we _must_ have a disk in
25518
 
+                * the lower (active) part of the array to replace.
25519
 
+                */
25520
 
+               if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
25521
 
+                       MD_BUG();
25522
 
+                       err = 1;
25523
 
+                       goto abort;
25524
 
+               }
25525
 
+               /* fall through */
25526
 
+
25527
 
+       case DISKOP_SPARE_WRITE:
25528
 
+       case DISKOP_SPARE_INACTIVE:
25529
 
+
25530
 
+               /*
25531
 
+                * Find the spare disk ... (can only be in the 'high'
25532
 
+                * area of the array)
25533
 
+                */
25534
 
+               for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
25535
 
+                       tmp = conf->disks + i;
25536
 
+                       if (tmp->spare && tmp->number == (*d)->number) {
25537
 
+                               spare_disk = i;
25538
 
+                               break;
25539
 
+                       }
25540
 
+               }
25541
 
+               if (spare_disk == -1) {
25542
 
+                       MD_BUG();
25543
 
+                       err = 1;
25544
 
+                       goto abort;
25545
 
+               }
25546
 
+               break;
25547
 
+
25548
 
+       case DISKOP_HOT_REMOVE_SPARE:
25549
 
+
25550
 
+               for (i = 0; i < MD_SB_DISKS; i++) {
25551
 
+                       tmp = conf->disks + i;
25552
 
+                       if (tmp->used_slot && (tmp->number == (*d)->number)) {
25553
 
+                               if (tmp->operational) {
25554
 
+                                       err = -EBUSY;
25555
 
+                                       goto abort;
25556
 
+                               } else if (!tmp->spare) {
25557
 
+                                       MD_BUG();
25558
 
+                                       err = 1;
25559
 
+                                       goto abort;
25560
 
+                               }
25561
 
+                               removed_disk = i;
25562
 
+                               break;
25563
 
+                       }
25564
 
+               }
25565
 
+               if (removed_disk == -1) {
25566
 
+                       MD_BUG();
25567
 
+                       err = 1;
25568
 
+                       goto abort;
25569
 
+               }
25570
 
+               break;
25571
 
+
25572
 
+       case DISKOP_HOT_REMOVE_DISK:
25573
 
+               for (i = 0; i < MD_SB_DISKS; i++) {
25574
 
+                       tmp = conf->disks + i;
25575
 
+                       if (tmp->used_slot && (tmp->number == (*d)->number)) {
25576
 
+                               if (i < conf->raid_disks) {
25577
 
+                                       if (conf->working_disks != conf->raid_disks) {
25578
 
+                                               /*
25579
 
+                                                * Can't remove a disk from an
25580
 
+                                                * array that is running in
25581
 
+                                                * degrade mode.
25582
 
+                                                */
25583
 
+                                               err = -EBUSY;
25584
 
+                                               goto abort;
25585
 
+                                       }
25586
 
+                                       if (sb->spare_disks == 0) {
25587
 
+                                               /*
25588
 
+                                                * Must have a spare ready
25589
 
+                                                * before removing an active
25590
 
+                                                * disk.
25591
 
+                                                */
25592
 
+                                               err = -EBUSY;
25593
 
+                                               goto abort;
25594
 
+                                       }
25595
 
+                               }
25596
 
+                               removed_disk = i;
25597
 
+                               break;
25598
 
+                       }
25599
 
+               }
25600
 
+               if (removed_disk == -1) {
25601
 
+                       MD_BUG();
25602
 
+                       err = 1;
25603
 
+                       goto abort;
25604
 
+               }
25605
 
+               break;
25606
 
+
25607
 
+       case DISKOP_HOT_ADD_DISK:
25608
 
+               err = -ENOSYS;
25609
 
+               goto abort;
25610
 
+               break;
25611
 
+       }
25612
 
+
25613
 
+       switch (state) {
25614
 
+       /*
25615
 
+        * Switch the spare disk to write-only mode:
25616
 
+        */
25617
 
+       case DISKOP_SPARE_WRITE:
25618
 
+               if (conf->spare) {
25619
 
+                       MD_BUG();
25620
 
+                       err = 1;
25621
 
+                       goto abort;
25622
 
+               }
25623
 
+               sdisk = conf->disks + spare_disk;
25624
 
+               sdisk->operational = 1;
25625
 
+               sdisk->write_only = 1;
25626
 
+               conf->spare = sdisk;
25627
 
+               break;
25628
 
+       /*
25629
 
+        * Deactivate a spare disk:
25630
 
+        */
25631
 
+       case DISKOP_SPARE_INACTIVE:
25632
 
+               sdisk = conf->disks + spare_disk;
25633
 
+               sdisk->operational = 0;
25634
 
+               sdisk->write_only = 0;
25635
 
+               /*
25636
 
+                * Was the spare being resynced?
25637
 
+                */
25638
 
+               if (conf->spare == sdisk)
25639
 
+                       conf->spare = NULL;
25640
 
+               break;
25641
 
+       /*
25642
 
+        * Activate (mark read-write) the (now sync) spare disk,
25643
 
+        * which means we switch it's 'raid position' (->raid_disk)
25644
 
+        * with the failed disk. (only the first 'conf->raid_disks'
25645
 
+        * slots are used for 'real' disks and we must preserve this
25646
 
+        * property)
25647
 
+        */
25648
 
+       case DISKOP_SPARE_ACTIVE:
25649
 
+               if (!conf->spare) {
25650
 
+                       MD_BUG();
25651
 
+                       err = 1;
25652
 
+                       goto abort;
25653
 
+               }
25654
 
+               sdisk = conf->disks + spare_disk;
25655
 
+               fdisk = conf->disks + failed_disk;
25656
 
+
25657
 
+               spare_desc = &sb->disks[sdisk->number];
25658
 
+               failed_desc = &sb->disks[fdisk->number];
25659
 
+
25660
 
+               if (spare_desc != *d) {
25661
 
+                       MD_BUG();
25662
 
+                       err = 1;
25663
 
+                       goto abort;
25664
 
+               }
25665
 
+
25666
 
+               if (spare_desc->raid_disk != sdisk->raid_disk) {
25667
 
+                       MD_BUG();
25668
 
+                       err = 1;
25669
 
+                       goto abort;
25670
 
+               }
25671
 
+                       
25672
 
+               if (sdisk->raid_disk != spare_disk) {
25673
 
+                       MD_BUG();
25674
 
+                       err = 1;
25675
 
+                       goto abort;
25676
 
+               }
25677
 
+
25678
 
+               if (failed_desc->raid_disk != fdisk->raid_disk) {
25679
 
+                       MD_BUG();
25680
 
+                       err = 1;
25681
 
+                       goto abort;
25682
 
+               }
25683
 
+
25684
 
+               if (fdisk->raid_disk != failed_disk) {
25685
 
+                       MD_BUG();
25686
 
+                       err = 1;
25687
 
+                       goto abort;
25688
 
+               }
25689
 
+
25690
 
+               /*
25691
 
+                * do the switch finally
25692
 
+                */
25693
 
+               spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
25694
 
+               failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
25695
 
+
25696
 
+               /* There must be a spare_rdev, but there may not be a
25697
 
+                * failed_rdev.  That slot might be empty...
25698
 
+                */
25699
 
+               spare_rdev->desc_nr = failed_desc->number;
25700
 
+               if (failed_rdev)
25701
 
+                       failed_rdev->desc_nr = spare_desc->number;
25702
 
+               
25703
 
+               xchg_values(*spare_desc, *failed_desc);
25704
 
+               xchg_values(*fdisk, *sdisk);
25705
 
+
25706
 
+               /*
25707
 
+                * (careful, 'failed' and 'spare' are switched from now on)
25708
 
+                *
25709
 
+                * we want to preserve linear numbering and we want to
25710
 
+                * give the proper raid_disk number to the now activated
25711
 
+                * disk. (this means we switch back these values)
25712
 
+                */
25713
 
+       
25714
 
+               xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
25715
 
+               xchg_values(sdisk->raid_disk, fdisk->raid_disk);
25716
 
+               xchg_values(spare_desc->number, failed_desc->number);
25717
 
+               xchg_values(sdisk->number, fdisk->number);
25718
 
+
25719
 
+               *d = failed_desc;
25720
 
+
25721
 
+               //if (sdisk->dev == MKDEV(0,0))
25722
 
+               if (sdisk->node == NULL)
25723
 
+                       sdisk->used_slot = 0;
25724
 
+
25725
 
+               /*
25726
 
+                * this really activates the spare.
25727
 
+                */
25728
 
+               fdisk->spare = 0;
25729
 
+               fdisk->write_only = 0;
25730
 
+
25731
 
+               /*
25732
 
+                * if we activate a spare, we definitely replace a
25733
 
+                * non-operational disk slot in the 'low' area of
25734
 
+                * the disk array.
25735
 
+                */
25736
 
+               conf->failed_disks--;
25737
 
+               conf->working_disks++;
25738
 
+               conf->spare = NULL;
25739
 
+
25740
 
+               break;
25741
 
+
25742
 
+       case DISKOP_HOT_REMOVE_SPARE:
25743
 
+               rdisk = conf->disks + removed_disk;
25744
 
+
25745
 
+               if (rdisk->spare && (removed_disk < conf->raid_disks)) {
25746
 
+                       MD_BUG();       
25747
 
+                       err = 1;
25748
 
+                       goto abort;
25749
 
+               }
25750
 
+               if (conf->spare != NULL) {
25751
 
+                       if (conf->spare->number == removed_disk) {
25752
 
+                               conf->spare = NULL;
25753
 
+                       }
25754
 
+               }
25755
 
+
25756
 
+               rdisk->dev = MKDEV(0,0);
25757
 
+               rdisk->node = NULL;
25758
 
+               rdisk->used_slot = 0;
25759
 
+
25760
 
+               break;
25761
 
+
25762
 
+       case DISKOP_HOT_REMOVE_DISK:
25763
 
+               rdisk = conf->disks + removed_disk;
25764
 
+               if (rdisk->operational) {
25765
 
+                       /* We're removing a running disk in the array. */
25766
 
+                       conf->working_disks--;
25767
 
+                       conf->failed_disks++;
25768
 
+               }
25769
 
+               rdisk->dev = MKDEV(0,0);
25770
 
+               rdisk->node = NULL;
25771
 
+               rdisk->used_slot = 0;
25772
 
+               rdisk->operational = 0;
25773
 
+               break;
25774
 
+       
25775
 
+       default:
25776
 
+               MD_BUG();       
25777
 
+               err = 1;
25778
 
+               goto abort;
25779
 
+       }
25780
 
+abort:
25781
 
+       md_spin_unlock_irq(&conf->device_lock);
25782
 
+       print_raid5_conf(conf);
25783
 
+       return err;
25784
 
+}
25785
 
+
25786
 
+static int raid5_bmap(mddev_t *mddev, evms_sector_t *rsector, evms_logical_node_t **node)
25787
 
+{
25788
 
+       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
25789
 
+       const unsigned int raid_disks = conf->raid_disks;
25790
 
+       const unsigned int data_disks = raid_disks - 1;
25791
 
+       unsigned int dd_idx, pd_idx;
25792
 
+
25793
 
+       *rsector = (evms_sector_t)raid5_compute_sector((unsigned long)*rsector,
25794
 
+                                                      raid_disks,
25795
 
+                                                      data_disks,
25796
 
+                                                      &dd_idx,
25797
 
+                                                      &pd_idx,
25798
 
+                                                      conf);
25799
 
+       *node = conf->disks[dd_idx].node;
25800
 
+       return 0; /* always successful */
25801
 
+}
25802
 
+
25803
 
+static int raid5_evms_ioctl (
25804
 
+       mddev_t         * mddev,
25805
 
+       struct inode    * inode,
25806
 
+       struct file     * file,
25807
 
+       unsigned int    cmd,
25808
 
+       unsigned long   arg)
25809
 
+{
25810
 
+       int rc = 0;
25811
 
+       evms_logical_node_t *node;
25812
 
+
25813
 
+       switch (cmd) {
25814
 
+               case EVMS_GET_BMAP:
25815
 
+               {
25816
 
+                       evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
25817
 
+                       rc = raid5_bmap(mddev,&bmap->rsector,&node);
25818
 
+                       if (!rc) {
25819
 
+                               if (node)
25820
 
+                                       rc = IOCTL(node, inode, file, cmd, arg);
25821
 
+                               else
25822
 
+                                       rc = -ENODEV;
25823
 
+                       }
25824
 
+                       break;
25825
 
+               }
25826
 
+
25827
 
+               default:
25828
 
+                       rc = -EINVAL;
25829
 
+       }
25830
 
+       return rc;
25831
 
+}
25832
 
+
25833
 
+static int raid5_pers_ioctl(mddev_t *mddev, int cmd, void * args){
25834
 
+
25835
 
+       int rc = 0;
25836
 
+       raid5_ioctl_init_io_t init_io_args;
25837
 
+       void * data;
25838
 
+
25839
 
+       LOG_DETAILS("%s: cmd == %d.\n", __FUNCTION__, cmd);
25840
 
+       switch (cmd) {
25841
 
+       case EVMS_MD_RAID5_INIT_IO:
25842
 
+
25843
 
+               if (copy_from_user(&init_io_args, (raid5_ioctl_init_io_t*)args, sizeof(init_io_args)) ) {
25844
 
+                       return -EFAULT;
25845
 
+               }
25846
 
+
25847
 
+               rc = evms_cs_allocate_memory(&data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT);
25848
 
+               if (rc != 0) {
25849
 
+                       return rc;
25850
 
+               }
25851
 
+
25852
 
+               if (copy_from_user(data, init_io_args.data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT)) {
25853
 
+                       evms_cs_deallocate_memory(data);
25854
 
+                       return -EFAULT;
25855
 
+               }
25856
 
+
25857
 
+               rc = raid5_init_io(mddev, init_io_args.rw,
25858
 
+                                  init_io_args.lsn, init_io_args.nr_sects,data);
25859
 
+
25860
 
+               copy_to_user(init_io_args.data, data, init_io_args.nr_sects << EVMS_VSECTOR_SIZE_SHIFT);
25861
 
+               evms_cs_deallocate_memory(data);
25862
 
+               
25863
 
+               copy_to_user((raid5_ioctl_init_io_t*)args, &init_io_args, sizeof(init_io_args));
25864
 
+               break;
25865
 
+
25866
 
+       default:
25867
 
+               rc = -ENOSYS;
25868
 
+       }
25869
 
+
25870
 
+       return rc;
25871
 
+}
25872
 
+
25873
 
+
25874
 
+static mdk_personality_t raid5_personality=
25875
 
+{
25876
 
+       name:           "evms_raid5",
25877
 
+       init_io:        raid5_init_io,
25878
 
+       make_request:   raid5_make_request,
25879
 
+       run:            raid5_run,
25880
 
+       stop:           raid5_stop,
25881
 
+       status:         raid5_status,
25882
 
+       error_handler:  raid5_error,
25883
 
+       diskop:         raid5_diskop,
25884
 
+       stop_resync:    raid5_stop_resync,
25885
 
+       restart_resync: raid5_restart_resync,
25886
 
+       sync_request:   raid5_sync_request,
25887
 
+       evms_ioctl:     raid5_evms_ioctl,
25888
 
+       md_pers_ioctl:  raid5_pers_ioctl
25889
 
+};
25890
 
+
25891
 
+static int md__init raid5_init (void)
25892
 
+{
25893
 
+       return evms_register_md_personality (RAID5, &raid5_personality);
25894
 
+}
25895
 
+
25896
 
+static void raid5_exit (void)
25897
 
+{
25898
 
+       evms_unregister_md_personality (RAID5);
25899
 
+}
25900
 
+
25901
 
+module_init(raid5_init);
25902
 
+module_exit(raid5_exit);
25903
 
+#ifdef MODULE_LICENSE
25904
 
+MODULE_LICENSE("GPL");
25905
 
+#endif
25906
 
diff -Naur linux-2002-03-28/drivers/evms/md_xor.c evms-2002-03-28/drivers/evms/md_xor.c
25907
 
--- linux-2002-03-28/drivers/evms/md_xor.c      Wed Dec 31 18:00:00 1969
25908
 
+++ evms-2002-03-28/drivers/evms/md_xor.c       Fri Mar  1 11:50:58 2002
25909
 
@@ -0,0 +1,149 @@
25910
 
+/*
25911
 
+ * md_xor.c : Multiple Devices driver for Linux
25912
 
+ *
25913
 
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
25914
 
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
25915
 
+ *
25916
 
+ * Dispatch optimized RAID-5 checksumming functions.
25917
 
+ *
25918
 
+ * 'md_xor.c' is an EVMS version of linux/drivers/md/xor.c modified
25919
 
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
25920
 
+ *
25921
 
+ * This program is free software; you can redistribute it and/or modify
25922
 
+ * it under the terms of the GNU General Public License as published by
25923
 
+ * the Free Software Foundation; either version 2, or (at your option)
25924
 
+ * any later version.
25925
 
+ *
25926
 
+ * You should have received a copy of the GNU General Public License
25927
 
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
25928
 
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25929
 
+ */
25930
 
+
25931
 
+#define BH_TRACE 0
25932
 
+#include <linux/module.h>
25933
 
+#include <linux/evms/evms_md.h>
25934
 
+#include <linux/evms/evms_xor.h>
25935
 
+#include <asm/xor.h>
25936
 
+
25937
 
+#define LOG_PREFIX "md raid5: "
25938
 
+/* The xor routines to use.  */
25939
 
+static struct xor_block_template *active_template;
25940
 
+
25941
 
+void
25942
 
+evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr)
25943
 
+{
25944
 
+       unsigned long *p0, *p1, *p2, *p3, *p4;
25945
 
+       unsigned long bytes = bh_ptr[0]->b_size;
25946
 
+       
25947
 
+       p0 = (unsigned long *) bh_ptr[0]->b_data;
25948
 
+       p1 = (unsigned long *) bh_ptr[1]->b_data;
25949
 
+       if (count == 2) {
25950
 
+               active_template->do_2(bytes, p0, p1);
25951
 
+               return;
25952
 
+       }
25953
 
+
25954
 
+       p2 = (unsigned long *) bh_ptr[2]->b_data;
25955
 
+       if (count == 3) {
25956
 
+               active_template->do_3(bytes, p0, p1, p2);
25957
 
+               return;
25958
 
+       }
25959
 
+
25960
 
+       p3 = (unsigned long *) bh_ptr[3]->b_data;
25961
 
+       if (count == 4) {
25962
 
+               active_template->do_4(bytes, p0, p1, p2, p3);
25963
 
+               return;
25964
 
+       }
25965
 
+
25966
 
+       p4 = (unsigned long *) bh_ptr[4]->b_data;
25967
 
+       active_template->do_5(bytes, p0, p1, p2, p3, p4);
25968
 
+}
25969
 
+
25970
 
+/* Set of all registered templates.  */
25971
 
+static struct xor_block_template *template_list;
25972
 
+
25973
 
+#define BENCH_SIZE (PAGE_SIZE)
25974
 
+
25975
 
+static void
25976
 
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
25977
 
+{
25978
 
+       int speed;
25979
 
+       unsigned long now;
25980
 
+       int i, count, max;
25981
 
+
25982
 
+       tmpl->next = template_list;
25983
 
+       template_list = tmpl;
25984
 
+
25985
 
+       /*
25986
 
+        * Count the number of XORs done during a whole jiffy, and use
25987
 
+        * this to calculate the speed of checksumming.  We use a 2-page
25988
 
+        * allocation to have guaranteed color L1-cache layout.
25989
 
+        */
25990
 
+       max = 0;
25991
 
+       for (i = 0; i < 5; i++) {
25992
 
+               now = jiffies;
25993
 
+               count = 0;
25994
 
+               while (jiffies == now) {
25995
 
+                       mb();
25996
 
+                       tmpl->do_2(BENCH_SIZE, b1, b2);
25997
 
+                       mb();
25998
 
+                       count++;
25999
 
+                       mb();
26000
 
+               }
26001
 
+               if (count > max)
26002
 
+                       max = count;
26003
 
+       }
26004
 
+
26005
 
+       speed = max * (HZ * BENCH_SIZE / 1024);
26006
 
+       tmpl->speed = speed;
26007
 
+
26008
 
+       LOG_DEFAULT("   %-10s: %5d.%03d MB/sec\n", tmpl->name,
26009
 
+              speed / 1000, speed % 1000);
26010
 
+}
26011
 
+
26012
 
+static int
26013
 
+calibrate_xor_block(void)
26014
 
+{
26015
 
+       void *b1, *b2;
26016
 
+       struct xor_block_template *f, *fastest;
26017
 
+
26018
 
+       b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
26019
 
+       if (! b1) {
26020
 
+               LOG_ERROR("Yikes!  No memory available.\n");
26021
 
+               return -ENOMEM;
26022
 
+       }
26023
 
+       b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
26024
 
+
26025
 
+       LOG_DEFAULT("measuring checksumming speed\n");
26026
 
+       sti();
26027
 
+
26028
 
+#define xor_speed(templ)       do_xor_speed((templ), b1, b2)
26029
 
+
26030
 
+       XOR_TRY_TEMPLATES;
26031
 
+
26032
 
+#undef xor_speed
26033
 
+
26034
 
+       free_pages((unsigned long)b1, 2);
26035
 
+
26036
 
+       fastest = template_list;
26037
 
+       for (f = fastest; f; f = f->next)
26038
 
+               if (f->speed > fastest->speed)
26039
 
+                       fastest = f;
26040
 
+
26041
 
+#ifdef XOR_SELECT_TEMPLATE
26042
 
+       fastest = XOR_SELECT_TEMPLATE(fastest);
26043
 
+#endif
26044
 
+
26045
 
+       active_template = fastest;
26046
 
+       LOG_DEFAULT("using function: %s (%d.%03d MB/sec)\n",
26047
 
+              fastest->name, fastest->speed / 1000, fastest->speed % 1000);
26048
 
+
26049
 
+       return 0;
26050
 
+}
26051
 
+
26052
 
+MD_EXPORT_SYMBOL(evms_md_xor_block);
26053
 
+
26054
 
+#ifdef MODULE_LICENSE
26055
 
+MODULE_LICENSE("GPL");
26056
 
+#endif
26057
 
+
26058
 
+module_init(calibrate_xor_block);
26059
 
diff -Naur linux-2002-03-28/drivers/evms/os2lvm_vge.c evms-2002-03-28/drivers/evms/os2lvm_vge.c
26060
 
--- linux-2002-03-28/drivers/evms/os2lvm_vge.c  Wed Dec 31 18:00:00 1969
26061
 
+++ evms-2002-03-28/drivers/evms/os2lvm_vge.c   Thu Mar 28 12:50:56 2002
26062
 
@@ -0,0 +1,2207 @@
26063
 
+/*
26064
 
+ *
26065
 
+ *   Copyright (c) International Business Machines Corp., 2001
26066
 
+ *
26067
 
+ *   This program is free software;  you can redistribute it and/or modify
26068
 
+ *   it under the terms of the GNU General Public License as published by
26069
 
+ *   the Free Software Foundation; either version 2 of the License, or
26070
 
+ *   (at your option) any later version.
26071
 
+ *
26072
 
+ *   This program is distributed in the hope that it will be useful,
26073
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
26074
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
26075
 
+ *   the GNU General Public License for more details.
26076
 
+ *
26077
 
+ *   You should have received a copy of the GNU General Public License
26078
 
+ *   along with this program;  if not, write to the Free Software
26079
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26080
 
+ *
26081
 
+ */
26082
 
+
26083
 
+/*
26084
 
+ * linux/drivers/evms/os2lvm_vge.c
26085
 
+ *
26086
 
+ * EVMS OS/2 LVM Emulator
26087
 
+ *
26088
 
+ * This Volume Group Emulator will take the type 0x35 partitions created by
26089
 
+ *  OS/2 versions 4.5 and later and build them into volumes.  It emulates
26090
 
+ *  the Drive Linking and Bad Block Relocation features and therefore
26091
 
+ *  provides binary compatibility with the OS/2 version.  Of course, if
26092
 
+ *  you select to mkfs a file system OS/2 doesn't support, you're on your
26093
 
+ *  own...
26094
 
+ *
26095
 
+ * Since OS/2 LVM volumes can only exist on DOS-style partitioned disks,
26096
 
+ *  this VGE has a dependency on dospart.c to report a list of the
26097
 
+ *  candidate partitions.  This module will then take the appropriate partitions
26098
 
+ *  from the list and use them to build the OS/2-style volumes.
26099
 
+ *
26100
 
+ * Change Activity:
26101
 
+ *
26102
 
+ *   7/01/2001  John Stiles  getting started.
26103
 
+ *   9/14/2001  John Stiles  original version.
26104
 
+ *  11/01/2001  John Stiles  new naming scheme.
26105
 
+ *  11/21/2001  John Stiles  i/o path changes.
26106
 
+ */
26107
 
+
26108
 
+#define EVMS_DEBUG 1
26109
 
+#define EVMS_OS2_DEBUG 1
26110
 
+
26111
 
+#include <linux/module.h>
26112
 
+#include <linux/kernel.h>
26113
 
+#include <linux/config.h>
26114
 
+#include <linux/genhd.h>
26115
 
+#include <linux/major.h>
26116
 
+#include <linux/string.h>
26117
 
+#include <linux/blk.h>
26118
 
+#include <linux/init.h>
26119
 
+#include <linux/evms/evms_kernel.h>
26120
 
+#include <linux/evms/evms_os2.h>
26121
 
+#include <asm/uaccess.h>
26122
 
+
26123
 
+#define LOG_PREFIX "os2lvm: "
26124
 
+
26125
 
+// Global Structure and Type definitions
26126
 
+typedef struct BBR_IO_Transfer_Record_s{
26127
 
+                                        int                               Write_Flag;   /* 0 = read, 1 = write */
26128
 
+                                        os2_drivelink_runtime_entry_t  *  Partition_Data;
26129
 
+                                        eio_t                             eio;
26130
 
+                                        struct BBR_IO_Transfer_Record_s * Next;
26131
 
+} BBR_IO_Transfer_Record_t;
26132
 
+
26133
 
+typedef struct DL_IO_Tracking_Record_s{ /* structure used to track IO requests that must be broken into two pieces due to drive linking */
26134
 
+                                       unsigned int                      IO_In_Progress;
26135
 
+                                       int                               Up_To_Date;
26136
 
+                                       eio_t                             Original;   /* Original IO */
26137
 
+                                       eio_t                             Link1;      /* First child. */
26138
 
+                                       os2_drivelink_runtime_entry_t  *  Link1_Partition_Data;
26139
 
+                                       BBR_IO_Transfer_Record_t *        Link1_Transfer_Record;
26140
 
+                                       int                               Link1_BBR_Attempted;
26141
 
+                                       eio_t                             Link2;      /* Second child */
26142
 
+                                       os2_drivelink_runtime_entry_t  *  Link2_Partition_Data;
26143
 
+                                       BBR_IO_Transfer_Record_t *        Link2_Transfer_Record;
26144
 
+                                       int                               Link2_BBR_Attempted;
26145
 
+} DL_IO_Tracking_Record_t;
26146
 
+
26147
 
+// Prototypes for local VGE functions
26148
 
+static int discover_os2lvm_partitions( evms_logical_node_t ** );
26149
 
+static evms_logical_node_t  * find_os2_volume( u_int32_t );
26150
 
+static int add_os2link( os2_drivelink_runtime_entry_t  *, evms_logical_node_t  * );
26151
 
+static os2_drivelink_runtime_entry_t  * find_link_data( os2_drivelink_runtime_entry_t  **, u_int32_t );
26152
 
+static int find_drive_link( evms_logical_node_t  *, os2_drivelink_runtime_entry_t  **, evms_sector_t *, evms_sector_t * );
26153
 
+static int validate_signaturesector( evms_logical_node_t *, LVM_Signature_Sector *, u_int32_t );
26154
 
+static int validate_drivelinksector( void *, int, u_int32_t);
26155
 
+static int validate_bbrtablesector( void *, int, u_int32_t );
26156
 
+static u_int32_t check_for_os2_bbr_relocations( char  * );
26157
 
+static int check_os2_volumes( evms_logical_node_t ** );
26158
 
+static int OS2_ioctl_cmd_broadcast(
26159
 
+                evms_logical_node_t *node,
26160
 
+                struct inode *inode, struct file *file,
26161
 
+                unsigned long cmd, unsigned long arg);
26162
 
+static int os2_ioctl_cmd_plugin_ioctl(
26163
 
+                evms_logical_node_t *node, 
26164
 
+                struct inode *inode, struct file *file,
26165
 
+                unsigned long cmd, unsigned long arg);
26166
 
+static void BBR_Worker( void *);
26167
 
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
26168
 
+                                   struct buffer_head       * bh,
26169
 
+                                   int                        uptodate,
26170
 
+                                   int                      * redrive );
26171
 
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record);
26172
 
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate);
26173
 
+static int  Sector_Is_Remapped(os2_drivelink_runtime_entry_t  * io_dlentry, 
26174
 
+                               evms_sector_t                    Source_Sector, 
26175
 
+                               evms_sector_t *                  Replacement_Sector);
26176
 
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t  * io_dlentry, 
26177
 
+                               evms_sector_t                    Source_Sector,
26178
 
+                               int                              Replacement_Sector_Is_Bad);
26179
 
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t  * io_dlentry,
26180
 
+                                      evms_sector_t                    starting_lsn, 
26181
 
+                                      unsigned int                     count, 
26182
 
+                                      void *                           buffer);
26183
 
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child);
26184
 
+
26185
 
+
26186
 
+// Prototypes for local memory allocation/deallocation functions
26187
 
+static os2_drivelink_runtime_entry_t  * new_os2_drive_link( LVM_Signature_Sector *, evms_logical_node_t  * );
26188
 
+static char  * new_os2_link_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t  * );
26189
 
+static char  * new_os2_bbr_data( u_int32_t, u_int32_t, u_int32_t, evms_logical_node_t  * );
26190
 
+static evms_logical_node_t  * new_os2volume( u_int32_t, char  * );
26191
 
+static int delete_os2lvm_volume( evms_logical_node_t * );
26192
 
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t  *, int );
26193
 
+
26194
 
+
26195
 
+// Prototypes for Function Table interface
26196
 
+static int  discover_os2lvm( evms_logical_node_t ** );
26197
 
+static int  delete_os2lvm( evms_logical_node_t * );
26198
 
+static void read_os2lvm( evms_logical_node_t *, eio_t * );
26199
 
+static void write_os2lvm( evms_logical_node_t *, eio_t * );
26200
 
+static int  init_io_os2lvm( evms_logical_node_t *, int, evms_sector_t, evms_sector_t, void * );
26201
 
+static int  ioctl_os2lvm( evms_logical_node_t *, struct inode *, struct file *, unsigned int, unsigned long );
26202
 
+static int  do_os2_bbr_io( os2_drivelink_runtime_entry_t  *, int, evms_sector_t, evms_sector_t, void * );
26203
 
+
26204
 
+
26205
 
+// Global data structures
26206
 
+static evms_logical_node_t *    os2lvm_nodes = NULL;
26207
 
+static evms_thread_t   *       BBR_Worker_Thread = NULL;
26208
 
+static spinlock_t               BBR_Queue_Lock = SPIN_LOCK_UNLOCKED;
26209
 
+static const char  *            BBR_Worker_Name = "evms_os2_bbr_io";
26210
 
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Head = NULL;
26211
 
+static BBR_IO_Transfer_Record_t *BBR_IO_List_Tail = NULL;
26212
 
+static evms_pool_mgmt_t *       BBR_Transfer_Pool = NULL;
26213
 
+static char *             BBR_Transfer_Pool_Name = "OS-2 Transfer Pool";
26214
 
+static char *             DL_Tracking_Pool_Name = "OS-2 Tracking Pool";
26215
 
+static evms_pool_mgmt_t *       DL_Tracking_Pool = NULL;
26216
 
+
26217
 
+
26218
 
+// Required plug-in Function Table definition
26219
 
+static evms_plugin_function_table_t function_table = {
26220
 
+        discover: &discover_os2lvm,
26221
 
+        delete  : &delete_os2lvm,
26222
 
+        read    : &read_os2lvm,
26223
 
+        write   : &write_os2lvm,
26224
 
+        init_io : &init_io_os2lvm,
26225
 
+        ioctl   : &ioctl_os2lvm
26226
 
+};
26227
 
+
26228
 
+
26229
 
+// Required plug-in Header definition
26230
 
+static evms_plugin_header_t plugin_header = {
26231
 
+        id : SetPluginID(
26232
 
+                        IBM_OEM_ID,
26233
 
+                        EVMS_REGION_MANAGER,            // Region Manger class
26234
 
+                        2 ),                            // Unique ID within VGEs
26235
 
+        version : {
26236
 
+                major           : 1,
26237
 
+                minor           : 0,
26238
 
+                patchlevel      : 0
26239
 
+        },
26240
 
+        required_common_services_version: {
26241
 
+                major           : EVMS_COMMON_SERVICES_MAJOR,
26242
 
+                minor           : EVMS_COMMON_SERVICES_MINOR,
26243
 
+                patchlevel      : EVMS_COMMON_SERVICES_PATCHLEVEL
26244
 
+        },
26245
 
+        function_table  : &function_table               // Function table for this plugin
26246
 
+};
26247
 
+
26248
 
+
26249
 
+//  Required Plugin Functions
26250
 
+
26251
 
+
26252
 
+/*
26253
 
+ * Function:  discover_os2lvm
26254
 
+ *
26255
 
+ *      This is the entry point into the discovery process.
26256
 
+ */
26257
 
+static int discover_os2lvm( evms_logical_node_t ** evms_partition_list )
26258
 
+{
26259
 
+        int rc;
26260
 
+
26261
 
+        if ( ! BBR_Transfer_Pool ) {
26262
 
+                BBR_Transfer_Pool = evms_cs_create_pool( sizeof(BBR_IO_Transfer_Record_t), BBR_Transfer_Pool_Name, NULL, NULL);
26263
 
+                if ( ! BBR_Transfer_Pool ) {
26264
 
+                        return -ENOMEM;
26265
 
+                }
26266
 
+        }
26267
 
+
26268
 
+        if ( ! DL_Tracking_Pool ) {
26269
 
+                DL_Tracking_Pool = evms_cs_create_pool( sizeof(DL_IO_Tracking_Record_t), DL_Tracking_Pool_Name, NULL, NULL);
26270
 
+                if ( ! DL_Tracking_Pool ) {
26271
 
+                        return -ENOMEM;
26272
 
+                }
26273
 
+        }
26274
 
+
26275
 
+        rc = discover_os2lvm_partitions( evms_partition_list );
26276
 
+
26277
 
+        if (!rc) {
26278
 
+                rc = check_os2_volumes( evms_partition_list );
26279
 
+        }
26280
 
+
26281
 
+        return rc;
26282
 
+}
26283
 
+
26284
 
+
26285
 
+/*
26286
 
+ * Function:  delete_os2lvm
26287
 
+ *
26288
 
+ *      This is the entry point for deleting a node.
26289
 
+ */
26290
 
+static int delete_os2lvm( evms_logical_node_t * logical_node )
26291
 
+{
26292
 
+        LOG_EXTRA("Deleting volume: %s\n", logical_node->name );
26293
 
+
26294
 
+        return delete_os2lvm_volume( logical_node );
26295
 
+}
26296
 
+
26297
 
+
26298
 
+/*
26299
 
+ * Function:  read_os2lvm
26300
 
+ */
26301
 
+static void read_os2lvm( evms_logical_node_t  * node,
26302
 
+                         eio_t                * eio )
26303
 
+{
26304
 
+        int                              rc;
26305
 
+        evms_sector_t                    sector_count;
26306
 
+        struct buffer_head     *         Link1 = NULL;
26307
 
+        struct buffer_head     *         Link2 = NULL;
26308
 
+        DL_IO_Tracking_Record_t *        Tracking_Record = NULL;
26309
 
+        os2_drivelink_runtime_entry_t  * cur_dlentry = NULL;
26310
 
+        BBR_IO_Transfer_Record_t *       Transfer_Record;
26311
 
+
26312
 
+        sector_count = eio->rsize;
26313
 
+        rc = find_drive_link( node, &cur_dlentry, &eio->rsector, &sector_count );
26314
 
+        switch ( rc ) {
26315
 
+                case 1 :
26316
 
+                        if ( cur_dlentry->bbr_is_active )   {
26317
 
+                                Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
26318
 
+                                /* Transfer the IO to the BBR Worker Thread. */
26319
 
+                                Transfer_Record->Write_Flag = 0;
26320
 
+                                Transfer_Record->Partition_Data = cur_dlentry;
26321
 
+                                Transfer_Record->eio = *eio;
26322
 
+                                Transfer_Record->Next = NULL;
26323
 
+                                BBR_Transfer_IO(Transfer_Record);
26324
 
+                        }
26325
 
+                        else
26326
 
+                                R_IO( cur_dlentry->link_partition, eio );
26327
 
+                        break;
26328
 
+                case 2 :
26329
 
+                        /* We must split the IO.  Duplicate the buffer head twice and allocate the tracking record. */
26330
 
+                        Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1);  /* Block until we get a tracking record. */
26331
 
+                        Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26332
 
+                        Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26333
 
+
26334
 
+                        /* Initialize the tracking record so we can associate the two new I/Os with the original. */
26335
 
+                        Tracking_Record->IO_In_Progress = 2;
26336
 
+                        Tracking_Record->Up_To_Date = 0;
26337
 
+                        Tracking_Record->Original = *eio;
26338
 
+
26339
 
+                        /* Create the I/O to the first link. */
26340
 
+                        Clone_Bufferhead(eio->bh,Link1);
26341
 
+                        Link1->b_private = Tracking_Record;
26342
 
+                        Link1->b_end_io = OS2_DL_Callback;
26343
 
+                        Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26344
 
+                        Tracking_Record->Link1.rsector = eio->rsector;
26345
 
+                        Tracking_Record->Link1.rsize = sector_count;
26346
 
+                        Tracking_Record->Link1.bh = Link1;
26347
 
+                        Tracking_Record->Link1_Partition_Data = cur_dlentry;
26348
 
+                        Tracking_Record->Link1_BBR_Attempted = 0;
26349
 
+                        Tracking_Record->Link1_Transfer_Record = NULL;
26350
 
+
26351
 
+                        /* Create the I/O to the second link */
26352
 
+                        Clone_Bufferhead(eio->bh,Link2);
26353
 
+                        Link2->b_private = Tracking_Record;
26354
 
+                        Link2->b_end_io = OS2_DL_Callback;
26355
 
+                        Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26356
 
+                        Tracking_Record->Link2.bh = Link2;
26357
 
+                        Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
26358
 
+                        Link2->b_rsector = 0;
26359
 
+                        Tracking_Record->Link2.rsector = 0;
26360
 
+                        Tracking_Record->Link2.rsize = eio->rsize - sector_count;
26361
 
+                        Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
26362
 
+                        Tracking_Record->Link2_BBR_Attempted = 0;
26363
 
+                        Tracking_Record->Link2_Transfer_Record = NULL;
26364
 
+
26365
 
+                        /* Process the I/O to the first link. */
26366
 
+                        if ( cur_dlentry->bbr_is_active )   {
26367
 
+                                Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
26368
 
+                                /* Transfer the IO to the BBR Worker Thread. */
26369
 
+                                Transfer_Record->Write_Flag = 0;
26370
 
+                                Transfer_Record->Partition_Data = cur_dlentry;
26371
 
+                                Transfer_Record->eio = Tracking_Record->Link1;
26372
 
+                                Transfer_Record->Next = NULL;
26373
 
+                                BBR_Transfer_IO(Transfer_Record);
26374
 
+                        }
26375
 
+                        else
26376
 
+                                R_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
26377
 
+
26378
 
+                        /* Process the I/O to the second link. */
26379
 
+                        cur_dlentry = cur_dlentry->next;
26380
 
+                        if ( cur_dlentry->bbr_is_active )   {
26381
 
+                                Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
26382
 
+                                /* Transfer the IO to the BBR Worker Thread. */
26383
 
+                                Transfer_Record->Write_Flag = 0;
26384
 
+                                Transfer_Record->Partition_Data = cur_dlentry;
26385
 
+                                Transfer_Record->eio = Tracking_Record->Link2;
26386
 
+                                Transfer_Record->Next = NULL;
26387
 
+                                BBR_Transfer_IO(Transfer_Record);
26388
 
+                        }
26389
 
+                        else
26390
 
+                                R_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
26391
 
+
26392
 
+                        break;
26393
 
+                default:
26394
 
+                        LOG_SERIOUS("READ error, request exceeds volume size.\n" );
26395
 
+                        EVMS_IO_ERROR(eio);
26396
 
+                        break;
26397
 
+        }
26398
 
+}
26399
 
+
26400
 
+
26401
 
+/*
26402
 
+ * Function:  write_os2lvm
26403
 
+ */
26404
 
+static void write_os2lvm( evms_logical_node_t  * node,
26405
 
+                         eio_t                * eio )
26406
 
+{
26407
 
+        int                              rc;
26408
 
+        evms_sector_t                    sector_count;
26409
 
+        struct buffer_head     *         Link1 = NULL;
26410
 
+        struct buffer_head     *         Link2 = NULL;
26411
 
+        DL_IO_Tracking_Record_t *        Tracking_Record = NULL;
26412
 
+        os2_drivelink_runtime_entry_t  * cur_dlentry = NULL;
26413
 
+        BBR_IO_Transfer_Record_t *       Transfer_Record;
26414
 
+
26415
 
+        sector_count = eio->rsize;
26416
 
+        rc = find_drive_link( node, &cur_dlentry, &eio->rsector, &sector_count );
26417
 
+        switch ( rc ) {
26418
 
+                case 1 :
26419
 
+                        /* Set up a Transfer Record.  If there are Bad Blocks on the partition that this I/O is 
26420
 
+                           directed to, then we will need the Transfer Record to put the I/O in the queue for the 
26421
 
+                           BBR Worker Thread.  If there are no bad blocks, then we will need the Transfer Record 
26422
 
+                           for the OS2_BBR_Write_Callback function.  This function expects the Transfer Record to 
26423
 
+                           be pre-allocated and available because it is running on an interrupt thread and should
26424
 
+                           not do memory allocation.  If there is an error during the write, then the 
26425
 
+                           OS2_BBR_Write_Callback function will use the Transfer Record to transfer the I/O
26426
 
+                           to the BBR worker thread for further processing.  If there are no errors during the I/O,
26427
 
+                           then the OS2_BBR_Write_Callback will deallocate the Transfer Record.                     */
26428
 
+                        Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
26429
 
+                        Transfer_Record->Write_Flag = 1;
26430
 
+                        Transfer_Record->Partition_Data = cur_dlentry;
26431
 
+                        Transfer_Record->eio = *eio;
26432
 
+                        Transfer_Record->Next = NULL;
26433
 
+                        if ( cur_dlentry->bbr_is_active )   {
26434
 
+                                /* Transfer the IO to the BBR Worker Thread. */
26435
 
+                                BBR_Transfer_IO(Transfer_Record);
26436
 
+                        }
26437
 
+                        else {
26438
 
+                                evms_cs_register_for_end_io_notification(Transfer_Record,eio->bh,OS2_BBR_Write_Callback);
26439
 
+                                W_IO( cur_dlentry->link_partition, eio );
26440
 
+                        }
26441
 
+                        break;
26442
 
+                case 2 :
26443
 
+                        /* We must split the IO.  Duplicate the buffer head twice and allocate the tracking record. */
26444
 
+                        Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool,1);  /* Block until we get a tracking record. */
26445
 
+                        Link1 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26446
 
+                        Link2 = evms_cs_allocate_from_pool(evms_bh_pool,1);
26447
 
+
26448
 
+                        /* Initialize the tracking record so we can associate the two new I/Os with the original. */
26449
 
+                        Tracking_Record->IO_In_Progress = 2;
26450
 
+                        Tracking_Record->Up_To_Date = 0;
26451
 
+                        Tracking_Record->Original = *eio;
26452
 
+
26453
 
+                        /* Create the I/O to the first link. */
26454
 
+                        Clone_Bufferhead(eio->bh,Link1);
26455
 
+                        Link1->b_private = Tracking_Record;
26456
 
+                        Link1->b_end_io = OS2_DL_Callback;
26457
 
+                        Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26458
 
+                        Tracking_Record->Link1.rsector = eio->rsector;
26459
 
+                        Tracking_Record->Link1.rsize = sector_count;
26460
 
+                        Tracking_Record->Link1.bh = Link1;
26461
 
+                        Tracking_Record->Link1_Partition_Data = cur_dlentry;
26462
 
+
26463
 
+                        /* Create the I/O to the second link */
26464
 
+                        Clone_Bufferhead(eio->bh,Link2);
26465
 
+                        Link2->b_private = Tracking_Record;
26466
 
+                        Link2->b_end_io = OS2_DL_Callback;
26467
 
+                        Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
26468
 
+                        Tracking_Record->Link2.bh = Link2;
26469
 
+                        Tracking_Record->Link2_Partition_Data = cur_dlentry->next;
26470
 
+                        Link2->b_rsector = 0;
26471
 
+                        Tracking_Record->Link2.rsector = 0;
26472
 
+                        Tracking_Record->Link2.rsize = eio->rsize - sector_count;
26473
 
+                        Link2->b_size = Tracking_Record->Link2.rsize << EVMS_VSECTOR_SIZE_SHIFT;
26474
 
+
26475
 
+                        Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
26476
 
+                        Transfer_Record->Write_Flag = 1;
26477
 
+                        Transfer_Record->Partition_Data = cur_dlentry;
26478
 
+                        Transfer_Record->eio = Tracking_Record->Link1;
26479
 
+                        Transfer_Record->Next = NULL;
26480
 
+                        Tracking_Record->Link1_Transfer_Record = Transfer_Record;
26481
 
+                        /* Process the I/O to the first link. */
26482
 
+                        if ( cur_dlentry->bbr_is_active )   {
26483
 
+                                /* Transfer the IO to the BBR Worker Thread. */
26484
 
+                                Tracking_Record->Link1_BBR_Attempted = 1;
26485
 
+                                BBR_Transfer_IO(Transfer_Record);
26486
 
+                        }
26487
 
+                        else {
26488
 
+                                Tracking_Record->Link1_BBR_Attempted = 0;
26489
 
+                                W_IO( cur_dlentry->link_partition, &Tracking_Record->Link1 );
26490
 
+                        }
26491
 
+
26492
 
+                        /* Process the I/O to the second link. */
26493
 
+                        cur_dlentry = cur_dlentry->next;
26494
 
+                        Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool,1);  /* Block until we get a transfer record. */
26495
 
+                        Transfer_Record->Write_Flag = 1;
26496
 
+                        Transfer_Record->Partition_Data = cur_dlentry;
26497
 
+                        Transfer_Record->eio = Tracking_Record->Link2;
26498
 
+                        Transfer_Record->Next = NULL;
26499
 
+                        Tracking_Record->Link2_Transfer_Record= Transfer_Record;
26500
 
+                        if ( cur_dlentry->bbr_is_active )   {
26501
 
+                                /* Transfer the IO to the BBR Worker Thread. */
26502
 
+                                Tracking_Record->Link2_BBR_Attempted = 1;
26503
 
+                                BBR_Transfer_IO(Transfer_Record);
26504
 
+                        }
26505
 
+                        else {
26506
 
+                                Tracking_Record->Link2_BBR_Attempted = 0;
26507
 
+                                W_IO( cur_dlentry->link_partition, &Tracking_Record->Link2 );
26508
 
+                        }
26509
 
+
26510
 
+                        break;
26511
 
+                default:
26512
 
+                        LOG_SERIOUS("WRITE error, request exceeds volume size.\n" );
26513
 
+                        EVMS_IO_ERROR(eio);
26514
 
+                        break;
26515
 
+        }
26516
 
+}
26517
 
+
26518
 
+
26519
 
+static int os2_ioctl_cmd_plugin_ioctl(  evms_logical_node_t *node, 
26520
 
+                                        struct inode *inode, 
26521
 
+                                        struct file *file,
26522
 
+                                        unsigned long cmd, 
26523
 
+                                        unsigned long arg)
26524
 
+{
26525
 
+        int rc = 0;
26526
 
+        os2_volume_runtime_entry_t * Node_Data;
26527
 
+        os2_drivelink_runtime_entry_t * curlink, * nextlink;
26528
 
+        evms_plugin_ioctl_t tmp, *user_parms;
26529
 
+
26530
 
+        user_parms = (evms_plugin_ioctl_t *)arg;
26531
 
+        /* copy user's parameters to kernel space */
26532
 
+        if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
26533
 
+                rc = -EFAULT;
26534
 
+
26535
 
+        if (!rc) {
26536
 
+                Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
26537
 
+                /* is this cmd targetted at this feature ? */
26538
 
+                if (tmp.feature_id == node->plugin->id) {
26539
 
+                        switch(tmp.feature_command) {
26540
 
+                                default:
26541
 
+                                        break;
26542
 
+                        }
26543
 
+                } else { /* broadcast this cmd to all children */
26544
 
+                        curlink = Node_Data->drive_link;
26545
 
+
26546
 
+                        /* broadcast this cmd to all children */
26547
 
+                        while ( curlink ) {
26548
 
+                                nextlink = curlink->next;
26549
 
+
26550
 
+                                rc = IOCTL(curlink->link_partition,inode,file,cmd,arg);
26551
 
+
26552
 
+                                if (rc) {
26553
 
+                                      break;  
26554
 
+                                }
26555
 
+                                curlink = nextlink;
26556
 
+                        }
26557
 
+
26558
 
+                }
26559
 
+                /* copy info to userspace */
26560
 
+                if (copy_to_user(user_parms, &tmp, sizeof(tmp)))
26561
 
+                        rc = -EFAULT;
26562
 
+        }
26563
 
+        return(rc);
26564
 
+}
26565
 
+
26566
 
+
26567
 
+static int OS2_ioctl_cmd_broadcast( evms_logical_node_t *node,
26568
 
+                                    struct inode *inode, 
26569
 
+                                    struct file *file,
26570
 
+                                    unsigned long cmd, 
26571
 
+                                    unsigned long arg)
26572
 
+{
26573
 
+        int rc = 0;
26574
 
+        os2_volume_runtime_entry_t * Node_Data;
26575
 
+        os2_drivelink_runtime_entry_t * curlink, * nextlink;
26576
 
+
26577
 
+        Node_Data = (os2_volume_runtime_entry_t *)node->instance_data;
26578
 
+        curlink = Node_Data->drive_link;
26579
 
+        
26580
 
+        /* broadcast this cmd to all children */
26581
 
+        while ( curlink ) {
26582
 
+                nextlink = curlink->next;
26583
 
+
26584
 
+                rc |= IOCTL(curlink->link_partition,inode,file,cmd,arg);
26585
 
+
26586
 
+                curlink = nextlink;
26587
 
+        }
26588
 
+
26589
 
+        return(rc);
26590
 
+}
26591
 
+
26592
 
+
26593
 
+/*
26594
 
+ * Function:  ioctl_os2lvm
26595
 
+ */
26596
 
+static int ioctl_os2lvm( evms_logical_node_t  * logical_node,
26597
 
+                         struct inode         * inode,
26598
 
+                         struct file          * file,
26599
 
+                         unsigned int         cmd,
26600
 
+                         unsigned long        arg )
26601
 
+{
26602
 
+        int   rc = 0;
26603
 
+        evms_sector_t          Sectors_Per_Cylinder;
26604
 
+        evms_sector_t          Total_Sectors;
26605
 
+        evms_logical_node_t  * partition_node;
26606
 
+
26607
 
+        partition_node = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link->link_partition;
26608
 
+
26609
 
+        if ( !inode )
26610
 
+                return -EINVAL;
26611
 
+
26612
 
+        LOG_EVERYTHING("Ioctl %d\n", cmd );
26613
 
+
26614
 
+        switch ( cmd ) {
26615
 
+                case HDIO_GETGEO:
26616
 
+                        {
26617
 
+                                // Return fake geometry
26618
 
+                                struct hd_geometry *hd = ( struct hd_geometry * )arg;
26619
 
+                                short cylinders;
26620
 
+                                unsigned char heads = 255;
26621
 
+                                unsigned char sectors = OS2LVM_SYNTHETIC_SECTORS_PER_TRACK;
26622
 
+                                long start = 0;
26623
 
+
26624
 
+                                /* OS/2 always created a fake geometry using the maximum cylinder size. */
26625
 
+                                Sectors_Per_Cylinder = heads * sectors;
26626
 
+                                for ( cylinders = 0, Total_Sectors = 0; Total_Sectors < ( ( os2_volume_runtime_entry_t * )logical_node->instance_data )->size_in_sectors; cylinders++ )
26627
 
+                                        Total_Sectors += Sectors_Per_Cylinder;
26628
 
+
26629
 
+                                cylinders--;
26630
 
+
26631
 
+                                if ( copy_to_user(( short * )( &hd->cylinders ), &cylinders, sizeof( cylinders )) ||
26632
 
+                                     copy_to_user(( char * )( &hd->heads ), &heads, sizeof( heads )) ||
26633
 
+                                     copy_to_user(( char * )( &hd->sectors ), &sectors, sizeof( sectors )) ||
26634
 
+                                     copy_to_user(( long * )( &hd->start ), &start, sizeof( start )) ) {
26635
 
+                                        return -EFAULT;
26636
 
+                                }
26637
 
+                        }
26638
 
+                        break;
26639
 
+
26640
 
+                case EVMS_GET_BMAP:
26641
 
+                        // No kernel images allowed on OS/2 volumes right now.
26642
 
+                        rc = -EINVAL;
26643
 
+                        break;
26644
 
+
26645
 
+                case EVMS_QUIESCE_VOLUME:
26646
 
+                case EVMS_GET_DISK_LIST:
26647
 
+                case EVMS_CHECK_MEDIA_CHANGE:
26648
 
+                case EVMS_REVALIDATE_DISK:
26649
 
+                case EVMS_OPEN_VOLUME:
26650
 
+                case EVMS_CLOSE_VOLUME:
26651
 
+                        rc = OS2_ioctl_cmd_broadcast(logical_node, inode, file, cmd, arg);
26652
 
+                        break;
26653
 
+                case EVMS_PLUGIN_IOCTL:
26654
 
+                        rc = os2_ioctl_cmd_plugin_ioctl( logical_node, inode, file, cmd, arg);
26655
 
+                        break;
26656
 
+                default:
26657
 
+                        rc = -EINVAL;
26658
 
+                        break;                        
26659
 
+        }
26660
 
+
26661
 
+        return rc;
26662
 
+}
26663
 
+
26664
 
+
26665
 
+/*
26666
 
+ * Function:  init_io_os2lvm
26667
 
+ */
26668
 
+static int init_io_os2lvm( evms_logical_node_t  * node,
26669
 
+                           int                  io_flag,      /* 0=read, 1=write   */
26670
 
+                           evms_sector_t        sect_nr,      /* disk LBA          */
26671
 
+                           evms_sector_t        num_sects,    /* # of sectors      */
26672
 
+                           void                 * buf_addr )  /* buffer address    */
26673
 
+{
26674
 
+        int   rc = 0;
26675
 
+        evms_sector_t  sector_count;
26676
 
+        evms_logical_node_t  * partition_node;
26677
 
+        os2_drivelink_runtime_entry_t  * cur_dlentry = NULL;
26678
 
+
26679
 
+        sector_count = num_sects;
26680
 
+        rc = find_drive_link( node, &cur_dlentry, &sect_nr, &sector_count );
26681
 
+        switch ( rc ) {
26682
 
+                case 1 :
26683
 
+                        partition_node = cur_dlentry->link_partition;
26684
 
+                        if ( cur_dlentry->bbr_is_active )
26685
 
+                                rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
26686
 
+                        else {
26687
 
+                                rc = INIT_IO( partition_node, io_flag, sect_nr, num_sects, buf_addr );
26688
 
+                                if ( rc && io_flag ) {
26689
 
+                                        cur_dlentry->bbr_is_active = 1;
26690
 
+                                        rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, num_sects, buf_addr );
26691
 
+                                }
26692
 
+                        }
26693
 
+                        break;
26694
 
+                case 2 :
26695
 
+                        partition_node = cur_dlentry->link_partition;
26696
 
+                        if ( cur_dlentry->bbr_is_active )
26697
 
+                                rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
26698
 
+                        else {
26699
 
+                                rc = INIT_IO( partition_node, io_flag, sect_nr, sector_count, buf_addr );
26700
 
+                                if ( rc && io_flag) {
26701
 
+                                        cur_dlentry->bbr_is_active = 1;
26702
 
+                                        rc = do_os2_bbr_io( cur_dlentry, io_flag, sect_nr, sector_count, buf_addr );
26703
 
+                                }
26704
 
+                        }
26705
 
+
26706
 
+                        if ( !rc ) {
26707
 
+                                cur_dlentry = cur_dlentry->next;
26708
 
+                                partition_node = cur_dlentry->link_partition;
26709
 
+                                num_sects -= sector_count;
26710
 
+                                buf_addr += sector_count << OS2_SECTOR_SHIFT;
26711
 
+                                rc = 1;
26712
 
+                                if ( cur_dlentry->bbr_is_active )
26713
 
+                                        rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
26714
 
+                                else {
26715
 
+                                        rc = INIT_IO( partition_node, io_flag, 0, num_sects, buf_addr );
26716
 
+                                        if ( rc && io_flag ) {
26717
 
+                                                cur_dlentry->bbr_is_active = 1;
26718
 
+                                                rc = do_os2_bbr_io( cur_dlentry, io_flag, 0, num_sects, buf_addr );
26719
 
+                                        }
26720
 
+
26721
 
+                                }
26722
 
+                        }
26723
 
+                        break;
26724
 
+                default:
26725
 
+                        LOG_SERIOUS("INITIO error, request exceeds volume size.\n" );
26726
 
+                        break;
26727
 
+        }
26728
 
+
26729
 
+        return rc;
26730
 
+}
26731
 
+
26732
 
+
26733
 
+/*
26734
 
+ * Function:  do_os2_bbr_io
26735
 
+ *
26736
 
+ *      Check the Bad Block Relocation list for relocated sectors.  If any are found,
26737
 
+ *       this function will do the i/o directly.
26738
 
+ *      Return values:  0 == i/o done,  1 == unable to complete i/o
26739
 
+ */
26740
 
+static int do_os2_bbr_io( os2_drivelink_runtime_entry_t  * io_dlentry,
26741
 
+                          int                  rw,      /* 0=read, 1=write  */
26742
 
+                          evms_sector_t        starting_lsn, /* disk LBA         */
26743
 
+                          evms_sector_t        count,        /* # of sectors     */
26744
 
+                          void               * buffer )      /* buffer address   */
26745
 
+{
26746
 
+       evms_sector_t   lsn, remapped_lsn;
26747
 
+        int             rc;
26748
 
+
26749
 
+       // For each sector in this request, check if this sector has already
26750
 
+       // been remapped. If so, process all previous sectors in this request,
26751
 
+       // followed by the remapped sector. Then reset the starting lsn and
26752
 
+       // count and keep going with the rest of the request as if it were
26753
 
+       // a whole new request.
26754
 
+       for ( lsn = 0; lsn < count; lsn++ ) {
26755
 
+               remapped_lsn = starting_lsn + lsn;
26756
 
+               rc = Sector_Is_Remapped(io_dlentry,remapped_lsn, &remapped_lsn);
26757
 
+               if (rc) {
26758
 
+                       // Process all sectors in the request up to this one.
26759
 
+                       if (lsn > 0) {
26760
 
+                               rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, lsn, buffer);
26761
 
+                               if (rc) {
26762
 
+                                        /* If this is a read, then we are done. */
26763
 
+                                        if (! rw) {
26764
 
+                                                return 1;
26765
 
+                                        }
26766
 
+
26767
 
+                                        /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
26768
 
+                                        if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, lsn, buffer) ) {
26769
 
+                                                /* We were unable to remap the bad sector(s) in the I/O.  We can not complete the I/O. */
26770
 
+                                                return 1;
26771
 
+                                        }
26772
 
+                               }
26773
 
+                               buffer += (lsn * OS2_BYTES_PER_SECTOR);
26774
 
+                       }
26775
 
+
26776
 
+                       // Process the remapped sector.
26777
 
+                       rc = INIT_IO(io_dlentry->link_partition, rw, remapped_lsn, 1, buffer);
26778
 
+                       if (rc) {
26779
 
+                                /* If this is a read, then we are done. */
26780
 
+                                if (! rw) {
26781
 
+                                        return 1;
26782
 
+                                }
26783
 
+
26784
 
+                                /* Get the original sector that was remapped. */
26785
 
+                                remapped_lsn = starting_lsn + lsn;
26786
 
+
26787
 
+                                /* Invalidate the current remapping. */
26788
 
+                                Invalidate_Mapping(io_dlentry,remapped_lsn,1);
26789
 
+
26790
 
+                                /* Try to remap the bad sector to another replacement sector. */
26791
 
+                                if ( !Create_New_BBR_Table_Entry(io_dlentry, remapped_lsn, 1, buffer) ) {
26792
 
+                                        /* We were unable to remap the bad sector(s) in the I/O.  We can not complete the I/O. */
26793
 
+                                        return 1;
26794
 
+                                }
26795
 
+
26796
 
+                       }
26797
 
+
26798
 
+                        buffer += OS2_BYTES_PER_SECTOR;
26799
 
+
26800
 
+                       starting_lsn += (lsn + 1);
26801
 
+                       count -= (lsn + 1);
26802
 
+                       lsn = -1;
26803
 
+               }
26804
 
+
26805
 
+       }
26806
 
+
26807
 
+        /* Are there any sectors left to process? */
26808
 
+        if ( count > 0 ) {
26809
 
+                rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, count, buffer);
26810
 
+               if (rc) {
26811
 
+                        /* If this is a read, then we are done. */
26812
 
+                        if (! rw) {
26813
 
+                                return 1;
26814
 
+                        }
26815
 
+
26816
 
+                        /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
26817
 
+                        if ( !Create_New_BBR_Table_Entry(io_dlentry, starting_lsn, count, buffer) ) {
26818
 
+                                /* We were unable to remap the bad sector(s) in the I/O.  We can not complete the I/O. */
26819
 
+                                return 1;
26820
 
+                        }
26821
 
+
26822
 
+               }
26823
 
+
26824
 
+        }
26825
 
+
26826
 
+        return 0;
26827
 
+}
26828
 
+
26829
 
+
26830
 
+/*
26831
 
+ * Function: os2lvm_vge_init
26832
 
+ */
26833
 
+int __init os2lvm_vge_init( void )
26834
 
+{
26835
 
+        /* Should I be allocating the pools and BBR Worker Thread here? */
26836
 
+        return evms_cs_register_plugin( &plugin_header );/* register with EVMS*/
26837
 
+}
26838
 
+
26839
 
+void __exit os2lvm_vge_exit( void )
26840
 
+{
26841
 
+        /* BUGBUG - Is there where I need to kill the BBR Worker Thread and free any memory I am still holding? */
26842
 
+
26843
 
+        evms_cs_unregister_plugin(&plugin_header);
26844
 
+}
26845
 
+
26846
 
+module_init(os2lvm_vge_init);
26847
 
+module_exit(os2lvm_vge_exit);
26848
 
+#ifdef MODULE_LICENSE
26849
 
+MODULE_LICENSE("GPL");
26850
 
+#endif
26851
 
+
26852
 
+
26853
 
+
26854
 
+
26855
 
+// Local VGE Functions
26856
 
+
26857
 
+
26858
 
+/*
26859
 
+ * Function:  discover_os2lvm_partitions
26860
 
+ *
26861
 
+ *     Examine the list of logical partitions.  Any type 0x35 partition that contains
26862
 
+ *      a valid OS/2 signature sector is consumed and added to the appropriate logical
26863
 
+ *      volume.
26864
 
+ */
26865
 
+static int discover_os2lvm_partitions( evms_logical_node_t ** evms_partition_list )
26866
 
+{
26867
 
+        evms_logical_node_t  *          evms_partition;
26868
 
+        evms_logical_node_t  *          next_partition;
26869
 
+        evms_logical_node_t  *          new_volume;
26870
 
+        evms_sector_t                   sectornum = 0;
26871
 
+        u_int32_t                       volumeserial;
26872
 
+        char                 *          sigsect;
26873
 
+        char                 *          volumename;
26874
 
+        char                            driveletter[8];
26875
 
+        LVM_Signature_Sector  *         sigsector;
26876
 
+        os2_drivelink_runtime_entry_t * new_dlentry;
26877
 
+
26878
 
+        LOG_ENTRY_EXIT("Discovering OS/2 Logical Volumes\n" );
26879
 
+        if ( evms_cs_allocate_memory(( void** )&sigsect, OS2_BYTES_PER_SECTOR ) ) {
26880
 
+                LOG_SERIOUS("Could not allocate Signature sector data\n" );
26881
 
+                return -ENOMEM;
26882
 
+        }
26883
 
+
26884
 
+        for ( evms_partition = *evms_partition_list; evms_partition; evms_partition = next_partition ) {
26885
 
+                // Save the next node. We may remove this one from the list.
26886
 
+                next_partition = evms_partition->next;
26887
 
+
26888
 
+                // The node must not have the OS/2 vge id.
26889
 
+                if ( evms_partition->plugin->id == plugin_header.id ) {
26890
 
+                        continue;
26891
 
+                }
26892
 
+
26893
 
+                LOG_EXTRA("Examining partition serial %s\n", evms_partition->name );
26894
 
+
26895
 
+                // Have to go to the last accessible sector of the partition and
26896
 
+                //  read it in.  It should be the LVM Signature Sector.
26897
 
+                sectornum = evms_partition->total_vsectors - 1;
26898
 
+                if ( INIT_IO( evms_partition, 0, sectornum, 1, sigsect ) ) {
26899
 
+                        // On an I/O error, continue on to the next partition.
26900
 
+                        // This means that the volume it belongs to will be incomplete
26901
 
+                        //  and later deleted in the completeness check.
26902
 
+                        LOG_SERIOUS("I/O error on Signature sector read\n" );
26903
 
+                        continue;
26904
 
+                }
26905
 
+                sigsector = ( LVM_Signature_Sector * )sigsect;
26906
 
+
26907
 
+                // Validate the Signature Sector
26908
 
+                if ( validate_signaturesector( evms_partition, sigsector, OS2_BYTES_PER_SECTOR )) {
26909
 
+                        LOG_EXTRA("Signature sector is not valid\n" );
26910
 
+                        continue;
26911
 
+                }
26912
 
+// Bugbug - At this point, we have validated an OS/2 LVM Signature Sector.  However, if the partition
26913
 
+// is not marked as a type 0x35, then this Signature Sector may be erroneous.  The problem here is that
26914
 
+// there is currently no way to find out if this partition was marked as a type 0x35.  Also, if we 
26915
 
+// should reject this partition due to some problem with the drive linking or BBR metadata, should we
26916
 
+// leave the partition in the evms partition list or not?  If the partition was marked as a type 0x35
26917
 
+// and the Signature Sector was valid, then I would say that we should remove it from the evms partition
26918
 
+// partition list.  If the partition is not marked as a type 0x35 but the Signature Sector is valid, then
26919
 
+// we could have a stray Signature Sector, in which case the partition should remain in the evms partition
26920
 
+// list.  The OS/2 LVM Signature Sector does have additional information that could be used to resolve
26921
 
+// this issue, such as the starting LBA of the partition that the Signature Sector belongs to, but
26922
 
+// we can not get the starting LBA of the partition to compare against.  If we leave the partition in
26923
 
+// the evms partition list when we should not, then an extraneous compatibility volume could result.
26924
 
+                // Build the Metadata for this partition
26925
 
+                if ( !( new_dlentry = new_os2_drive_link( sigsector, evms_partition )) ) {
26926
 
+                        continue;
26927
 
+                }
26928
 
+
26929
 
+                // Search for the parent Volume for this partition
26930
 
+                volumeserial = sigsector->Volume_Serial_Number;
26931
 
+                if ( !( new_volume = find_os2_volume( volumeserial )) ) {
26932
 
+
26933
 
+                        // If not found, allocate a new Volume
26934
 
+                        LOG_EVERYTHING("Parent not found, allocate new.\n" );
26935
 
+                        if ( sigsector->Drive_Letter != '\0' ) {
26936
 
+                                driveletter[0] = sigsector->Drive_Letter;
26937
 
+                                driveletter[1] = '\0';
26938
 
+                                volumename = driveletter;
26939
 
+                        }
26940
 
+                        else
26941
 
+                                volumename = sigsector->Volume_Name;
26942
 
+
26943
 
+                        if ( !( new_volume = new_os2volume( volumeserial, volumename )) ) {
26944
 
+                                delete_os2_drive_link( new_dlentry, 0 );
26945
 
+                                new_dlentry = NULL;
26946
 
+                                continue;
26947
 
+                        }
26948
 
+                }
26949
 
+
26950
 
+                // Now remove the partition from the List
26951
 
+                evms_cs_remove_logical_node_from_list( evms_partition_list, evms_partition );
26952
 
+
26953
 
+                if ( (( os2_volume_runtime_entry_t  * )new_volume->instance_data )->complete ) {
26954
 
+                        // Volume is complete, delete this duplicate
26955
 
+                        delete_os2_drive_link( new_dlentry, 0 );
26956
 
+                        LOG_EVERYTHING("Deleting duplicate node.\n" );
26957
 
+                        (( os2_volume_runtime_entry_t  * )new_volume->instance_data )->Export_Needed = 1;   //We must export this volume again!
26958
 
+                }
26959
 
+                else  /* Add this partition to its parent Volume */
26960
 
+                        add_os2link( new_dlentry, new_volume );
26961
 
+
26962
 
+        }
26963
 
+
26964
 
+        evms_cs_deallocate_memory(( void* )sigsect );
26965
 
+        LOG_ENTRY_EXIT("Finished Discovering OS/2 Logical Volumes\n" );
26966
 
+
26967
 
+        return 0;
26968
 
+}
26969
 
+
26970
 
+
26971
 
+/*
26972
 
+ * Function:  find_os2_volume
26973
 
+ *
26974
 
+ *      Search for the OS/2 volume that matches the volume serial.
26975
 
+ */
26976
 
+static evms_logical_node_t  * find_os2_volume( u_int32_t volumeserial )
26977
 
+{
26978
 
+        os2_volume_runtime_entry_t  * cur_volume;
26979
 
+        evms_logical_node_t         * cur_node;
26980
 
+
26981
 
+        cur_node = os2lvm_nodes;
26982
 
+
26983
 
+        while ( cur_node ) {
26984
 
+                cur_volume = ( os2_volume_runtime_entry_t  * )cur_node->instance_data;
26985
 
+                if ( cur_volume->Volume_Serial_Number == volumeserial ) {
26986
 
+                        LOG_EVERYTHING("%s: found volser match.\n", __FUNCTION__ );
26987
 
+                        return  cur_node;
26988
 
+                }
26989
 
+                LOG_EVERYTHING("%s: volser does not match.\n", __FUNCTION__ );
26990
 
+                cur_node = cur_volume->next_os2lvm_node;
26991
 
+        }
26992
 
+
26993
 
+        return NULL;
26994
 
+}
26995
 
+
26996
 
+
26997
 
+/*
26998
 
+ * Function:  add_os2link
26999
 
+ *
27000
 
+ *      Add the Drive Link metadata to the parent OS/2 volume.
27001
 
+ */
27002
 
+static int add_os2link( os2_drivelink_runtime_entry_t  * newlink,
27003
 
+                        evms_logical_node_t  * parent_volume )
27004
 
+{
27005
 
+        os2_volume_runtime_entry_t  * parent_metadata = ( os2_volume_runtime_entry_t * )parent_volume->instance_data;
27006
 
+        os2_drivelink_runtime_entry_t  * curlink = parent_metadata->drive_link, * nextlink;
27007
 
+
27008
 
+        if ( curlink ) {
27009
 
+                nextlink = curlink->next;
27010
 
+                while ( nextlink ) {
27011
 
+                        curlink = nextlink;
27012
 
+                        nextlink = curlink->next;
27013
 
+                }
27014
 
+                curlink->next = newlink;
27015
 
+        }
27016
 
+        else {
27017
 
+                parent_metadata->drive_link = newlink;
27018
 
+        }
27019
 
+        parent_metadata->drive_link_count++;
27020
 
+        parent_metadata->size_in_sectors += newlink->sector_count;
27021
 
+        parent_volume->total_vsectors += newlink->sector_count;
27022
 
+        return 0;
27023
 
+}
27024
 
+
27025
 
+
27026
 
+/*
27027
 
+ * Function:  find_link_data
27028
 
+ *
27029
 
+ *      Find the Drive Link metadata that matches the partition serial number.
27030
 
+ *       Remove it from the link_list passed in.
27031
 
+ */
27032
 
+static os2_drivelink_runtime_entry_t  * find_link_data( os2_drivelink_runtime_entry_t  ** link_list,
27033
 
+                                                        u_int32_t partitionser )
27034
 
+{
27035
 
+        os2_drivelink_runtime_entry_t  * curlink = *link_list, * prevlink = NULL;
27036
 
+
27037
 
+        while ( curlink ) {
27038
 
+                if ( curlink->Partition_Serial_Number == partitionser ) {
27039
 
+                        if ( prevlink ) {
27040
 
+                                prevlink->next = curlink->next;
27041
 
+                        }
27042
 
+                        else {
27043
 
+                                *link_list = curlink->next;
27044
 
+                        }
27045
 
+                        curlink->next = NULL;
27046
 
+                        return curlink;
27047
 
+                }
27048
 
+                prevlink = curlink;
27049
 
+                curlink = prevlink->next;
27050
 
+        }
27051
 
+
27052
 
+        return NULL;
27053
 
+}
27054
 
+
27055
 
+
27056
 
+/*
27057
 
+ * Function:  find_drive_link
27058
 
+ *
27059
 
+ *      Walk the linked list of drive links to find the proper
27060
 
+ *       target partition.  Returns the metadata associated with
27061
 
+ *       the drive link.
27062
 
+ *      Return values:  1 == data contained in 1 partition, 2 == data crosses 2 partitions,
27063
 
+ *                      0 == target partition not found
27064
 
+ */
27065
 
+static int find_drive_link( evms_logical_node_t  * node,
27066
 
+                            os2_drivelink_runtime_entry_t  ** dlentry,
27067
 
+                            evms_sector_t  * sector,
27068
 
+                            evms_sector_t  * num_sectors )
27069
 
+{
27070
 
+        evms_sector_t last_link_sector, cur_last_sector;
27071
 
+        os2_drivelink_runtime_entry_t  * curlink = (( os2_volume_runtime_entry_t * )node->instance_data )->drive_link, * nextlink;
27072
 
+
27073
 
+        while ( curlink ) {
27074
 
+                nextlink = curlink->next;
27075
 
+                last_link_sector = curlink->start_sector + curlink->sector_count;
27076
 
+                if ( *sector < last_link_sector ) {
27077
 
+                        *dlentry = curlink;
27078
 
+                        cur_last_sector = *sector + *num_sectors;
27079
 
+                        *sector -= curlink->start_sector;
27080
 
+                        LOG_EVERYTHING("I/O start_RBA == %Ld , sector_count == %Ld\n", *sector, *num_sectors );
27081
 
+                        if ( cur_last_sector <= last_link_sector )
27082
 
+                                return 1;
27083
 
+                        else {
27084
 
+                                if ( (*dlentry)->next )
27085
 
+                                        *num_sectors -= cur_last_sector - last_link_sector;
27086
 
+                                else
27087
 
+                                        return 0;
27088
 
+                        }
27089
 
+                        return 2;
27090
 
+                }
27091
 
+
27092
 
+                curlink = nextlink;
27093
 
+        }
27094
 
+
27095
 
+        return 0;
27096
 
+}
27097
 
+
27098
 
+
27099
 
+
27100
 
+// Allocation/Deallocation Functions
27101
 
+
27102
 
+
27103
 
+/*
27104
 
+ * Function:  new_os2_drive_link
27105
 
+ *
27106
 
+ *      Allocate space for a new OS/2 drive link structure.
27107
 
+ *        Initialize the appropriate fields.
27108
 
+ *        Note:  since the BBR info applies to each link, the BBR structures
27109
 
+ *               are also initialized here.
27110
 
+ */
27111
 
+static os2_drivelink_runtime_entry_t  * new_os2_drive_link( LVM_Signature_Sector * signature_sector,
27112
 
+                                                            evms_logical_node_t  * evms_partition )
27113
 
+{
27114
 
+        int i;
27115
 
+        u_int32_t feature, feature_size, sectoroffset;
27116
 
+        os2_drivelink_runtime_entry_t  * new_dlentry;
27117
 
+
27118
 
+        if ( evms_cs_allocate_memory(( void** )&new_dlentry, sizeof( os2_drivelink_runtime_entry_t )) ) {
27119
 
+                LOG_SERIOUS("Could not allocate drivelink metadata\n" );
27120
 
+                return NULL;
27121
 
+        }
27122
 
+        new_dlentry->sector_count = signature_sector->Partition_Size_To_Report_To_User;
27123
 
+        new_dlentry->Partition_Serial_Number = signature_sector->Partition_Serial_Number;
27124
 
+        new_dlentry->bbr_is_active = 0;  // initialize to not active
27125
 
+        new_dlentry->link_partition = evms_partition;
27126
 
+        init_MUTEX( &(new_dlentry->BBR_Table_Lock) );
27127
 
+
27128
 
+        sectoroffset = signature_sector->Partition_Start;
27129
 
+        LOG_EVERYTHING("Partition Start is at LBA %i\n", sectoroffset );
27130
 
+        for ( i = 0 ; i < OS2LVM_MAX_FEATURES_PER_VOLUME ; i++ ) {
27131
 
+                feature = signature_sector->LVM_Feature_Array[i].Feature_ID;
27132
 
+                if ( feature ) {
27133
 
+                        feature_size = signature_sector->LVM_Feature_Array[i].Feature_Data_Size;
27134
 
+                        LOG_EVERYTHING("Entry %d in Feature Table is valid,\n", i+1 );
27135
 
+                        LOG_EVERYTHING("Feature Data size is %i sectors.\n", feature_size );
27136
 
+                        if ( feature == DRIVE_LINKING_FEATURE_ID ) {
27137
 
+                                if ( !new_dlentry->link_data ) {
27138
 
+                                        new_dlentry->Drive_Link_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data - sectoroffset;
27139
 
+                                        new_dlentry->Drive_Link_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data - sectoroffset;
27140
 
+                                        new_dlentry->link_data = new_os2_link_data( new_dlentry->Drive_Link_Data_Copy1, new_dlentry->Drive_Link_Data_Copy2, feature_size, evms_partition );
27141
 
+                                        if ( new_dlentry->link_data == NULL) {
27142
 
+                                                delete_os2_drive_link(new_dlentry,0);
27143
 
+                                                new_dlentry = NULL;                                                
27144
 
+                                        }
27145
 
+                                }
27146
 
+                                else {
27147
 
+                                        LOG_WARNING("os2lvm_vge: Drive Linking Feature encountered twice in the same Feature Array!\n");
27148
 
+                                        delete_os2_drive_link(new_dlentry,0);
27149
 
+                                        new_dlentry = NULL;
27150
 
+                                }
27151
 
+                        }
27152
 
+                        else if ( feature == BBR_FEATURE_ID ) {
27153
 
+                                if ( !new_dlentry->bbr_data ) {
27154
 
+                                        new_dlentry->BBR_Data_Copy1 = signature_sector->LVM_Feature_Array[i].Location_Of_Primary_Feature_Data;
27155
 
+                                        new_dlentry->BBR_Data_Copy2 = signature_sector->LVM_Feature_Array[i].Location_Of_Secondary_Feature_Data;
27156
 
+                                        new_dlentry->BBR_Feature_Size = feature_size;
27157
 
+                                        new_dlentry->bbr_data = new_os2_bbr_data( new_dlentry->BBR_Data_Copy1, new_dlentry->BBR_Data_Copy2, feature_size, evms_partition );
27158
 
+                                        if ( new_dlentry->bbr_data == NULL) {
27159
 
+                                                delete_os2_drive_link(new_dlentry,0);
27160
 
+                                                new_dlentry = NULL;                                                
27161
 
+                                        }
27162
 
+                                        else if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
27163
 
+                                                new_dlentry->bbr_is_active = check_for_os2_bbr_relocations( new_dlentry->bbr_data );
27164
 
+                                        }
27165
 
+                                }
27166
 
+                                else {
27167
 
+                                        LOG_WARNING("os2lvm_vge: BBR Feature encountered twice in the same Feature Array!\n");
27168
 
+                                        delete_os2_drive_link(new_dlentry,0);
27169
 
+                                        new_dlentry = NULL;
27170
 
+                                }
27171
 
+                        }
27172
 
+                        else {
27173
 
+                                LOG_WARNING("os2lvm_vge: Unknown Feature entry %d found.\n", feature );
27174
 
+                                delete_os2_drive_link(new_dlentry,0);
27175
 
+                                new_dlentry = NULL;
27176
 
+                        }
27177
 
+
27178
 
+                        if ( signature_sector->LVM_Feature_Array[i].Feature_Active ) {
27179
 
+                                LOG_EVERYTHING("Feature is active.\n" );
27180
 
+                        }
27181
 
+                }
27182
 
+        }
27183
 
+
27184
 
+        if ( new_dlentry && 
27185
 
+             ( ( ! new_dlentry->bbr_data ) || ( ! new_dlentry->link_data ) )
27186
 
+           ) {
27187
 
+                LOG_WARNING("os2lvm_vge: Incomplete Feature Data found.\n" );
27188
 
+                delete_os2_drive_link(new_dlentry,0);
27189
 
+                new_dlentry = NULL;               
27190
 
+        }
27191
 
+        return new_dlentry;
27192
 
+}
27193
 
+
27194
 
+
27195
 
+/*
27196
 
+ * Function:  new_os2_link_data
27197
 
+ *
27198
 
+ *      Allocate space for OS/2 drive link information.
27199
 
+ *      Read in and validate the information from disk.
27200
 
+ *      Note:  assumes 512 byte sectors.
27201
 
+ */
27202
 
+static char  * new_os2_link_data( u_int32_t linksector1,
27203
 
+                                  u_int32_t linksector2,
27204
 
+                                  u_int32_t linknumsectors,
27205
 
+                                  evms_logical_node_t  * link_partition )
27206
 
+{
27207
 
+        char *    new_data1;  /* Buffer used to hold the primary copy of the drive linking data. */
27208
 
+        char *    new_data2;  /* Buffer used to hold the secondary copy of the drive linking data. */
27209
 
+        char *    p1;         /* Used to access individual sectors of data within new_data1. */
27210
 
+        char *    p2;         /* Used to access individual sectors of data within new_data2. */
27211
 
+        int       memsize = linknumsectors * OS2_BYTES_PER_SECTOR;
27212
 
+        u_int32_t i, seq1, seq2;
27213
 
+
27214
 
+        /* Allocate Memory for the buffers to hold the drive linking data. */
27215
 
+        LOG_EVERYTHING("Drive Linking Feature entry found.\n" );
27216
 
+        if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
27217
 
+                LOG_SERIOUS("Could not allocate Primary Link data\n" );
27218
 
+                return NULL;
27219
 
+        }
27220
 
+        if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
27221
 
+                LOG_SERIOUS("Could not allocate Secondary Link data\n" );
27222
 
+                evms_cs_deallocate_memory(( void* )new_data1 );
27223
 
+                return NULL;
27224
 
+        }
27225
 
+
27226
 
+        LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", linksector1 );
27227
 
+        LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", linksector2 );
27228
 
+
27229
 
+        /* Read the drive linking data into memory. */
27230
 
+        if ( INIT_IO( link_partition, 0, linksector1, linknumsectors, new_data1 ) ) {
27231
 
+                LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
27232
 
+                seq1 = 0;
27233
 
+                p1 = NULL;
27234
 
+        }
27235
 
+        else {
27236
 
+                /* Set up access to the buffer.  Extract the Master Sequence Number from the buffer. */
27237
 
+                p1 = new_data1;
27238
 
+                seq1 = (( LVM_Link_Table_First_Sector * )p1 )->Sequence_Number;
27239
 
+        }
27240
 
+
27241
 
+        if ( INIT_IO( link_partition, 0, linksector2, linknumsectors, new_data2 ) ) {
27242
 
+                LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
27243
 
+                seq2 = 0;
27244
 
+                p2 = NULL;
27245
 
+        }
27246
 
+        else {
27247
 
+                /* Set up access to the second buffer.  Extract its copy of the Master Sequence Number. */
27248
 
+                p2 = new_data2;
27249
 
+                seq2 = (( LVM_Link_Table_Sector * )p2 )->Sequence_Number;
27250
 
+        }
27251
 
+
27252
 
+        /* Validate both copies of the drive linking data one sector at a time. */
27253
 
+        for ( i = 0; i < linknumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
27254
 
+                if ( (seq1 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p1, i, seq1 )) {
27255
 
+                        LOG_SERIOUS("The primary copy of the drive link data is invalid!  Sector %i is not valid\n", i );
27256
 
+                        seq1 = 0;
27257
 
+                }
27258
 
+
27259
 
+                if ( (seq2 > 0) && validate_drivelinksector( ( LVM_Link_Table_Sector * )p2, i, seq2 )) {
27260
 
+                        LOG_SERIOUS("The secondary copy of the drive link data is invalid!  Sector %i is not valid\n", i );
27261
 
+                        seq2 = 0;
27262
 
+                }
27263
 
+
27264
 
+        }
27265
 
+
27266
 
+        LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
27267
 
+        LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
27268
 
+
27269
 
+        /* Choose which copy of the drive linking data to use.  If both sequence numbers are 0, then both copies
27270
 
+           of the drive linking data are bad.  If both are equal and non-zero, then both copies are good and it
27271
 
+           really doesn't matter which one you choose.  Otherwise, choose the copy with the highest sequence number. */
27272
 
+        if ( seq2 > seq1 ) {
27273
 
+                evms_cs_deallocate_memory(( void* )new_data1 );
27274
 
+                return  new_data2;
27275
 
+        }
27276
 
+        else {
27277
 
+                evms_cs_deallocate_memory(( void* )new_data2 );
27278
 
+                if ( !seq1 ) {
27279
 
+                        evms_cs_deallocate_memory(( void* )new_data1 );
27280
 
+                        new_data1 = NULL;
27281
 
+                }
27282
 
+        }
27283
 
+        return  new_data1;
27284
 
+}
27285
 
+
27286
 
+
27287
 
+/*
27288
 
+ * Function:  new_os2_bbr_data
27289
 
+ *
27290
 
+ *      Allocate space for OS/2 bad block relocation information.
27291
 
+ *      Read in and validate the information from disk.
27292
 
+ *      Note:  assumes 512 byte sectors.
27293
 
+ */
27294
 
+static char  * new_os2_bbr_data( u_int32_t bbrsector1,
27295
 
+                                 u_int32_t bbrsector2,
27296
 
+                                 u_int32_t bbrnumsectors,
27297
 
+                                 evms_logical_node_t  * bbr_partition )
27298
 
+{
27299
 
+        char *    new_data1;  /* Buffer to hold the primary copy of the BBR data. */
27300
 
+        char *    new_data2;  /* Buffer to hold the secondary copy of the BBR data. */
27301
 
+        char *    p1;         /* Used to examine the individual sectors of BBR data within new_data1. */
27302
 
+        char *    p2;         /* Used to examine the individual sectors of BBR data within new_data2. */
27303
 
+        int       memsize = bbrnumsectors * OS2_BYTES_PER_SECTOR;
27304
 
+        u_int32_t i, seq1, seq2;
27305
 
+
27306
 
+        LOG_EVERYTHING("BBR Feature entry found.\n" );
27307
 
+
27308
 
+        /* Allocate memory for the buffers. */
27309
 
+        if ( evms_cs_allocate_memory(( void** )&new_data1, memsize ) ) {
27310
 
+                LOG_SERIOUS("Could not allocate Primary BBR data\n" );
27311
 
+                return NULL;
27312
 
+        }
27313
 
+        if ( evms_cs_allocate_memory(( void** )&new_data2, memsize ) ) {
27314
 
+                LOG_SERIOUS("Could not allocate Secondary BBR data\n" );
27315
 
+                evms_cs_deallocate_memory(( void* )new_data1 );
27316
 
+                return NULL;
27317
 
+        }
27318
 
+
27319
 
+        LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", bbrsector1 );
27320
 
+        LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", bbrsector2 );
27321
 
+
27322
 
+        /* Read in both copies of the BBR data. */
27323
 
+        if ( INIT_IO( bbr_partition, 0, bbrsector1, bbrnumsectors, new_data1 ) ) {
27324
 
+                LOG_SERIOUS("I/O error reading Primary Feature Data.\n" );
27325
 
+                seq1 = 0;
27326
 
+                p1 = NULL;
27327
 
+        }
27328
 
+        else {
27329
 
+                /* Establish access to the first sector of the BBR data.  Extract the Master Sequence Number
27330
 
+                   for this copy of the BBR data.                                                             */
27331
 
+                p1 = new_data1;
27332
 
+                seq1 = (( LVM_BBR_Table_First_Sector * )p1 )->Sequence_Number;
27333
 
+        }
27334
 
+
27335
 
+        if ( INIT_IO( bbr_partition, 0, bbrsector2, bbrnumsectors, new_data2 ) ) {
27336
 
+                LOG_SERIOUS("I/O error reading Secondary Feature Data.\n" );
27337
 
+                seq2 = 0;
27338
 
+                p2 = NULL;
27339
 
+        }
27340
 
+        else {
27341
 
+                /* Establish access to the first sector of the second copy of the BBR data.  Extract the 
27342
 
+                  Master Sequence Number for this copy of the BBR data.                                   */
27343
 
+                p2 = new_data2;
27344
 
+                seq2 = (( LVM_BBR_Table_Sector * )p2 )->Sequence_Number;
27345
 
+        }
27346
 
+
27347
 
+        /* Validate both copies of the BBR Data, one sector at a time. */
27348
 
+        for ( i = 0; i < bbrnumsectors; i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR ) {
27349
 
+                if ( (seq1 > 0) && validate_bbrtablesector( p1, i, seq1 )) {
27350
 
+                        LOG_SERIOUS("The primary BBR data is invalid!  Sector %i is not valid\n", i );
27351
 
+                        seq1 = 0;
27352
 
+                }
27353
 
+
27354
 
+                if ( (seq2 > 0) && validate_bbrtablesector( p2, i, seq2 )) {
27355
 
+                        LOG_SERIOUS("The secondary BBR data is invalid!  Sector %i is not valid\n", i );
27356
 
+                        seq2 = 0;
27357
 
+                }
27358
 
+
27359
 
+        }
27360
 
+
27361
 
+        LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1 );
27362
 
+        LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2 );
27363
 
+
27364
 
+        /* Choose which copy of the BBR Data to use based upon the sequence number.  If both sequence numbers
27365
 
+           are 0, then there is no valid BBR data.  If both are non-zero and equal, then it really doesn't 
27366
 
+           matter which copy is used.  Otherwise, choose the copy with the highest sequence number.            */
27367
 
+        if ( seq2 > seq1 ) {
27368
 
+                evms_cs_deallocate_memory(( void* )new_data1 );
27369
 
+                return  new_data2;
27370
 
+        }
27371
 
+        else {
27372
 
+                evms_cs_deallocate_memory(( void* )new_data2 );
27373
 
+                if ( !seq1 ) {
27374
 
+                        evms_cs_deallocate_memory(( void* )new_data1 );
27375
 
+                        new_data1 = NULL;
27376
 
+                }
27377
 
+        }
27378
 
+        return  new_data1;
27379
 
+}
27380
 
+
27381
 
+
27382
 
+/*
27383
 
+ * Function:  new_os2volume
27384
 
+ *
27385
 
+ *      Allocate space for a new OS/2 logical volume.
27386
 
+ *      Initialize the appropriate fields.
27387
 
+ */
27388
 
+static evms_logical_node_t  * new_os2volume( u_int32_t volumeserial,
27389
 
+                                             char  * volume_name )
27390
 
+{
27391
 
+        evms_logical_node_t  * new_node;
27392
 
+        os2_volume_runtime_entry_t  * cur_volume;
27393
 
+
27394
 
+        if ( evms_cs_allocate_logical_node( &new_node ) ) {
27395
 
+                LOG_SERIOUS("Could not allocate new volume\n" );
27396
 
+                return NULL;
27397
 
+        }
27398
 
+        if ( evms_cs_allocate_memory( &new_node->instance_data, sizeof( os2_volume_runtime_entry_t )) ) {
27399
 
+                LOG_SERIOUS("Could not allocate volume metadata\n" );
27400
 
+                evms_cs_deallocate_logical_node( new_node );
27401
 
+                return NULL;
27402
 
+        }
27403
 
+        new_node->plugin = &plugin_header;
27404
 
+        new_node->system_id = LVM_PARTITION_INDICATOR;
27405
 
+        sprintf( new_node->name, "os2/%s", volume_name );
27406
 
+        cur_volume = ( os2_volume_runtime_entry_t * )new_node->instance_data;
27407
 
+        cur_volume->Volume_Serial_Number = volumeserial;
27408
 
+        cur_volume->Export_Needed = 1;
27409
 
+
27410
 
+        if ( os2lvm_nodes == NULL )
27411
 
+                os2lvm_nodes = new_node;
27412
 
+
27413
 
+               // This is the first node discovered. Start the BBR thread.
27414
 
+               if ( ! BBR_Worker_Thread ) {
27415
 
+                       BBR_Worker_Thread = evms_cs_register_thread(BBR_Worker, NULL, BBR_Worker_Name);
27416
 
+                       if ( ! BBR_Worker_Thread ) {
27417
 
+                               evms_cs_deallocate_memory(new_node->instance_data);
27418
 
+                               evms_cs_deallocate_logical_node(new_node);
27419
 
+                               os2lvm_nodes = NULL;
27420
 
+                               return NULL;
27421
 
+                       }
27422
 
+               }
27423
 
+        else {
27424
 
+                cur_volume = ( os2_volume_runtime_entry_t  * )os2lvm_nodes->instance_data;
27425
 
+                while ( cur_volume->next_os2lvm_node )
27426
 
+                        cur_volume = ( os2_volume_runtime_entry_t  * )cur_volume->next_os2lvm_node->instance_data;
27427
 
+                cur_volume->next_os2lvm_node = new_node;
27428
 
+        }
27429
 
+
27430
 
+        MOD_INC_USE_COUNT;
27431
 
+
27432
 
+        return new_node;
27433
 
+}
27434
 
+
27435
 
+
27436
 
+/*
27437
 
+ * Function:  delete_os2lvm_volume
27438
 
+ *
27439
 
+ *      This function deletes the in-memory representation of an OS/2
27440
 
+ *      logical volume.
27441
 
+ */
27442
 
+static int delete_os2lvm_volume( evms_logical_node_t * logical_node )
27443
 
+{
27444
 
+        os2_drivelink_runtime_entry_t  * curdrvlink = (( os2_volume_runtime_entry_t * )logical_node->instance_data )->drive_link, * nextdrvlink;
27445
 
+        os2_volume_runtime_entry_t  * cur_volume, * next_volume;
27446
 
+
27447
 
+        while ( curdrvlink ) {
27448
 
+                nextdrvlink = curdrvlink->next;
27449
 
+                delete_os2_drive_link( curdrvlink, 1 );
27450
 
+                curdrvlink = nextdrvlink;
27451
 
+        }
27452
 
+
27453
 
+        cur_volume = ( os2_volume_runtime_entry_t  * )os2lvm_nodes->instance_data;
27454
 
+        if ( os2lvm_nodes == logical_node )
27455
 
+                os2lvm_nodes = cur_volume->next_os2lvm_node;
27456
 
+        else {
27457
 
+                while ( cur_volume->next_os2lvm_node ) {
27458
 
+                        next_volume = ( os2_volume_runtime_entry_t  * )cur_volume->next_os2lvm_node->instance_data;
27459
 
+                        if ( cur_volume->next_os2lvm_node == logical_node ) {
27460
 
+                                cur_volume->next_os2lvm_node = next_volume->next_os2lvm_node;
27461
 
+                                break;
27462
 
+                        }
27463
 
+                }
27464
 
+        }
27465
 
+
27466
 
+       if ( os2lvm_nodes == NULL ) {
27467
 
+               // Just deleted the last os2 node. Stop the BBR thread.
27468
 
+               if ( BBR_Worker_Thread ) {
27469
 
+                       evms_cs_unregister_thread(BBR_Worker_Thread);
27470
 
+                       BBR_Worker_Thread = NULL;
27471
 
+               }
27472
 
+       }
27473
 
+
27474
 
+        evms_cs_deallocate_memory( logical_node->instance_data );
27475
 
+        evms_cs_deallocate_logical_node( logical_node );
27476
 
+
27477
 
+        MOD_DEC_USE_COUNT;
27478
 
+
27479
 
+        return 0;
27480
 
+}
27481
 
+
27482
 
+
27483
 
+/*
27484
 
+ * Function:  delete_os2_drive_link
27485
 
+ *
27486
 
+ *      This function deletes the drive link runtime structure and any
27487
 
+ *       other structures it points to.
27488
 
+ */
27489
 
+static int delete_os2_drive_link( os2_drivelink_runtime_entry_t  * drive_link,
27490
 
+                                  int delete_link_partition )
27491
 
+{
27492
 
+        if ( drive_link->link_data )
27493
 
+                evms_cs_deallocate_memory( drive_link->link_data );
27494
 
+        if ( drive_link->bbr_data )
27495
 
+                evms_cs_deallocate_memory( drive_link->bbr_data );
27496
 
+        if ( delete_link_partition )
27497
 
+                DELETE( drive_link->link_partition );
27498
 
+        evms_cs_deallocate_memory( drive_link );
27499
 
+
27500
 
+        return 0;
27501
 
+}
27502
 
+
27503
 
+
27504
 
+
27505
 
+// Consistency Checking Functions
27506
 
+
27507
 
+
27508
 
+/*
27509
 
+ * Function:  validate_signaturesector
27510
 
+ *
27511
 
+ *      This function checks the OS/2 LVM Signature Sector
27512
 
+ */
27513
 
+static int validate_signaturesector(evms_logical_node_t * evms_partition, 
27514
 
+                                    LVM_Signature_Sector * signature_sector,
27515
 
+                                    u_int32_t sectorsize )
27516
 
+{
27517
 
+        u_int32_t  crc_hold, crc_new;
27518
 
+
27519
 
+        /* In order for a signature sector to be considered valid, its signature and CRC must
27520
 
+           be correct.  Also, OS/2 stores the starting LBA of the partition and the size of
27521
 
+           the partition that this signature sector corresponds to.  These should be checked
27522
 
+           as well.  However, since the starting LBA of the partition that this belongs to is
27523
 
+           not available to us as part of an evms_logical_node_t, we can only check the size
27524
 
+           of the partition against what is stored in the signature sector.                    */
27525
 
+
27526
 
+        /* The signature used is in two parts.  Test the first part. */
27527
 
+        if ( signature_sector->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE ) {
27528
 
+                LOG_EVERYTHING("Primary LVM Signature failed.\n" );
27529
 
+                return 1;
27530
 
+        }
27531
 
+
27532
 
+        /* Test the second part of the signature. */
27533
 
+        if ( signature_sector->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE ) {
27534
 
+                LOG_EVERYTHING("Secondary LVM Signature failed.\n" );
27535
 
+                return 1;
27536
 
+        }
27537
 
+
27538
 
+        /* Calculate the CRC and compare it against the stored CRC. */
27539
 
+        crc_hold = signature_sector->Signature_Sector_CRC;
27540
 
+        signature_sector->Signature_Sector_CRC = 0;
27541
 
+        crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, ( void * )signature_sector, sectorsize );
27542
 
+        if ( crc_hold != crc_new ) {
27543
 
+                LOG_EVERYTHING("Signature sector crc failed.\n" );
27544
 
+                LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27545
 
+                return 1;
27546
 
+        }
27547
 
+
27548
 
+        // The partition size must == that found in the Signature Sector
27549
 
+        if ( evms_partition->total_vsectors != signature_sector->Partition_Sector_Count ) {
27550
 
+                LOG_EXTRA("Partition size is not valid\n" );
27551
 
+                return 1;
27552
 
+        }
27553
 
+
27554
 
+        return 0;
27555
 
+}
27556
 
+
27557
 
+
27558
 
+/*
27559
 
+ * Function:  validate_drivelinksector
27560
 
+ *
27561
 
+ *      This function checks the OS/2 LVM Drivelink Feature Sector
27562
 
+ */
27563
 
+static int validate_drivelinksector( void *    Sector_To_Validate,
27564
 
+                                     int       Sector_Index,
27565
 
+                                     u_int32_t Master_Sequence_Number )
27566
 
+{
27567
 
+        u_int32_t  crc_hold, crc_new;
27568
 
+        LVM_Link_Table_First_Sector * First_Sector = (LVM_Link_Table_First_Sector * ) Sector_To_Validate;
27569
 
+        LVM_Link_Table_Sector *       Link_Sector = (LVM_Link_Table_Sector  * ) Sector_To_Validate;
27570
 
+
27571
 
+        /* The OS/2 drive linking data covers several sectors.  The format of the first sector is slightly
27572
 
+           different from the following sectors because it contains additional information about how many
27573
 
+           drive links are actually in use.  The following sectors just contain portions of the drive link
27574
 
+           table.  Each sector of OS/2 drive linking data contains a signature, crc, and sequence number
27575
 
+           which must be validated.                                                                         */
27576
 
+
27577
 
+        if ( Sector_Index == 0 ) {
27578
 
+
27579
 
+                /* Link Table Master Signature Check */
27580
 
+                if ( LINK_TABLE_MASTER_SIGNATURE != First_Sector->Link_Table_Signature ) {
27581
 
+                        LOG_EVERYTHING("Link Table Master Signature Test failed.\n" );
27582
 
+                        return 1;
27583
 
+                }
27584
 
+
27585
 
+                /* We will NOT check the sequence number here as the first sector of drive link data is the
27586
 
+                   source of the Master_Sequence_Number which was passed in to us.                           */
27587
 
+
27588
 
+                /* Set up for the CRC Check */
27589
 
+                crc_hold = First_Sector->Link_Table_CRC;
27590
 
+                First_Sector->Link_Table_CRC = 0;
27591
 
+        }
27592
 
+        else {
27593
 
+                /* Link Table Internal Signature Check */
27594
 
+                if ( LINK_TABLE_SIGNATURE != Link_Sector->Link_Table_Signature ) {
27595
 
+                        LOG_EVERYTHING("Link Table Internal Signature Test failed.\n" );
27596
 
+                        return 1;
27597
 
+                }
27598
 
+
27599
 
+                /* Check the sequence number. */
27600
 
+                if ( Master_Sequence_Number != Link_Sector->Sequence_Number ) {
27601
 
+                        LOG_EVERYTHING("Link Table Internal Sequence Number Test failed.\n" );
27602
 
+                        return 1;                        
27603
 
+                }
27604
 
+
27605
 
+                /* Set up for the CRC Check */
27606
 
+                crc_hold = Link_Sector->Link_Table_CRC;
27607
 
+                Link_Sector->Link_Table_CRC = 0;
27608
 
+        }
27609
 
+
27610
 
+        crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
27611
 
+        if ( crc_hold != crc_new ) {
27612
 
+                LOG_EVERYTHING("Link Table crc failed.\n" );
27613
 
+                LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27614
 
+                return 1;
27615
 
+        }
27616
 
+
27617
 
+        return 0;
27618
 
+}
27619
 
+
27620
 
+
27621
 
+/*
27622
 
+ * Function:  validate_bbrtablesector
27623
 
+ *
27624
 
+ *      This function checks the OS/2 LVM Bad Block Relocation Feature Sector
27625
 
+ */
27626
 
+static int validate_bbrtablesector(  void *    Sector_To_Validate,
27627
 
+                                     int       Sector_Index,
27628
 
+                                     u_int32_t Master_Sequence_Number )
27629
 
+{
27630
 
+        u_int32_t                       crc_hold, crc_new;
27631
 
+        LVM_BBR_Table_First_Sector *    First_Sector = (LVM_BBR_Table_First_Sector * ) Sector_To_Validate;
27632
 
+        LVM_BBR_Table_Sector *          BBR_Sector = (LVM_BBR_Table_Sector  * ) Sector_To_Validate;
27633
 
+
27634
 
+        /* The OS/2 bad block relocation (BBR) data covers several sectors.  The format of the first sector 
27635
 
+           is different from the following sectors because it contains additional information about how many
27636
 
+           relocations are actually in use and the size and location of the block of replacement sectors.  
27637
 
+           The following sectors just contain portions of the BBR remap table.  Each sector of OS/2 BBR data 
27638
 
+           contains a signature, crc, and sequence number which must be validated.                             */
27639
 
+
27640
 
+        if ( Sector_Index == 0 ) {
27641
 
+
27642
 
+                /* BBR Table Master Signature Check */
27643
 
+                if ( BBR_TABLE_MASTER_SIGNATURE != First_Sector->Signature ) {
27644
 
+                        LOG_EVERYTHING("BBR Table Master Signature Test failed.\n" );
27645
 
+                        return 1;
27646
 
+                }
27647
 
+
27648
 
+                /* We will NOT check the sequence number here as the first sector of BBR data is the
27649
 
+                   source of the Master_Sequence_Number which was passed in to us.                      */
27650
 
+
27651
 
+                /* Set up for the CRC Check */
27652
 
+                crc_hold = First_Sector->CRC;
27653
 
+                First_Sector->CRC = 0;
27654
 
+
27655
 
+        }
27656
 
+        else {
27657
 
+                /* BBR Table Internal Signature Check */
27658
 
+                if ( BBR_TABLE_SIGNATURE != BBR_Sector->Signature ) {
27659
 
+                        LOG_EVERYTHING("BBR Table Internal Signature Test failed.\n" );
27660
 
+                        return 1;
27661
 
+                }
27662
 
+
27663
 
+                /* Check the sequence number. */
27664
 
+                if ( Master_Sequence_Number != BBR_Sector->Sequence_Number ) {
27665
 
+                        LOG_EVERYTHING("BBR Table Internal Sequence Number Test failed.\n" );
27666
 
+                        return 1;                        
27667
 
+                }
27668
 
+
27669
 
+                /* Set up for the CRC Check */
27670
 
+                crc_hold = BBR_Sector->CRC;
27671
 
+                BBR_Sector->CRC = 0;
27672
 
+        }
27673
 
+
27674
 
+        crc_new = evms_cs_calculate_crc( EVMS_INITIAL_CRC, Sector_To_Validate, OS2_BYTES_PER_SECTOR );
27675
 
+        if ( crc_hold != crc_new ) {
27676
 
+                LOG_EVERYTHING("BBRTable crc failed.\n" );
27677
 
+                LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, crc_new );
27678
 
+                return 1;
27679
 
+        }
27680
 
+
27681
 
+        return 0;
27682
 
+}
27683
 
+
27684
 
+
27685
 
+/*
27686
 
+ * Function:  check_for_os2_bbr_relocations
27687
 
+ *
27688
 
+ *      This function checks the OS/2 LVM Bad Block Relocation Tables
27689
 
+ *       for any active relocation sectors.  The bbr table is reformatted in memory
27690
 
+ *       to make searches faster.
27691
 
+ *      Return values:  0 == no active relocations, 1 == contains active relocations
27692
 
+ */
27693
 
+static u_int32_t check_for_os2_bbr_relocations( char  * bbr_data_ptr )
27694
 
+{
27695
 
+        LVM_BBR_Feature *  feature_data = ( LVM_BBR_Feature * )bbr_data_ptr;
27696
 
+
27697
 
+        if ( feature_data->control.Table_Entries_In_Use ) {
27698
 
+                LOG_EVERYTHING("There are %d active relocations.\n", feature_data->control.Table_Entries_In_Use );
27699
 
+                return 1;
27700
 
+        }
27701
 
+
27702
 
+        return 0;
27703
 
+}
27704
 
+
27705
 
+
27706
 
+/*
27707
 
+ * Function:  check_os2_volumes
27708
 
+ *
27709
 
+ *      This function performs a consistency check on all existing OS/2
27710
 
+ *        Logical Volumes.  The list of constituent partitions ( links )
27711
 
+ *        is checked and ordered according to the Link Table.  If any link
27712
 
+ *        is missing or inconsistent, the entire volume will be deleted.
27713
 
+ */
27714
 
+static int check_os2_volumes( evms_logical_node_t ** node_list )
27715
 
+{
27716
 
+        os2_volume_runtime_entry_t  * cur_volume;
27717
 
+        os2_volume_runtime_entry_t  * previous_volume;
27718
 
+        evms_logical_node_t         * cur_node;
27719
 
+        evms_logical_node_t         * previous_node = NULL;
27720
 
+        os2_drivelink_runtime_entry_t  * link_list, * link_hold;
27721
 
+        LVM_Link_Table_First_Sector  * psector1;
27722
 
+        int i, rc = 0;
27723
 
+        u_int32_t  numlinks, countlinks, linkser;
27724
 
+        u_int32_t  Master_Sequence_Number;  /* Used to check whether or not all of the copies of Drive Linking data match. */
27725
 
+        evms_sector_t   partition_offset;
27726
 
+        char  * sect_ptr;
27727
 
+
27728
 
+        LOG_ENTRY_EXIT("Checking OS/2 Logical Volumes\n" );
27729
 
+
27730
 
+        cur_node = os2lvm_nodes;
27731
 
+
27732
 
+        while ( cur_node ) {
27733
 
+                cur_volume = ( os2_volume_runtime_entry_t  * )cur_node->instance_data;
27734
 
+                link_list = NULL;
27735
 
+                if ( !cur_volume->complete ) {  /* need to verify this one  */
27736
 
+                        cur_volume->complete = 1;
27737
 
+                        LOG_EVERYTHING("Checking volume %s\n", cur_node->name );
27738
 
+
27739
 
+                        // Reset fields for sort operation
27740
 
+                        cur_volume->size_in_sectors = 0;
27741
 
+                        numlinks = cur_volume->drive_link_count;
27742
 
+                        cur_volume->drive_link_count = 0;
27743
 
+                        cur_node->total_vsectors = 0;
27744
 
+                        link_list = cur_volume->drive_link;
27745
 
+                        cur_volume->drive_link = NULL;
27746
 
+
27747
 
+                        // Access the link data to order the drive links
27748
 
+                        psector1 = ( LVM_Link_Table_First_Sector * )link_list->link_data;
27749
 
+                        Master_Sequence_Number = psector1->Sequence_Number;
27750
 
+
27751
 
+                        if ( numlinks != psector1->Links_In_Use ) {
27752
 
+                                LOG_SERIOUS("Link Count mismatch vol=%i, table=%i\n", numlinks, psector1->Links_In_Use );
27753
 
+                                cur_volume->complete = 0;
27754
 
+                                countlinks = 0;
27755
 
+                        }
27756
 
+                        else{
27757
 
+                                if ( numlinks > LINKS_IN_FIRST_SECTOR ) {
27758
 
+                                        countlinks = LINKS_IN_FIRST_SECTOR;
27759
 
+                                        numlinks -= LINKS_IN_FIRST_SECTOR;
27760
 
+                                }
27761
 
+                                else {
27762
 
+                                        countlinks = numlinks;
27763
 
+                                        numlinks = 0;
27764
 
+                                }
27765
 
+
27766
 
+                        }
27767
 
+
27768
 
+                        partition_offset = 0;
27769
 
+                        for ( i = 0; (i < countlinks) && (cur_volume->complete == 1); i++ ) {
27770
 
+                                linkser = psector1->Link_Table[i].Partition_Serial_Number;
27771
 
+                                if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
27772
 
+                                        // Add this partition to its parent Volume
27773
 
+                                        add_os2link( link_hold, cur_node );
27774
 
+                                        LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
27775
 
+                                                       partition_offset, link_hold->sector_count );
27776
 
+                                        link_hold->start_sector = partition_offset;
27777
 
+                                        partition_offset += link_hold->sector_count;
27778
 
+                                }
27779
 
+                                else {
27780
 
+                                        LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
27781
 
+                                        cur_volume->complete = 0;
27782
 
+                                        break;
27783
 
+                                }
27784
 
+                        }
27785
 
+
27786
 
+                        sect_ptr = ( char * )psector1;
27787
 
+
27788
 
+                        while ( numlinks && (cur_volume->complete == 1) ) {
27789
 
+                                if ( numlinks > LINKS_IN_NEXT_SECTOR ) {
27790
 
+                                        countlinks = LINKS_IN_NEXT_SECTOR;
27791
 
+                                        numlinks -= LINKS_IN_NEXT_SECTOR;
27792
 
+                                }
27793
 
+                                else {
27794
 
+                                        countlinks = numlinks;
27795
 
+                                        numlinks = 0;
27796
 
+                                }
27797
 
+                                sect_ptr += OS2_BYTES_PER_SECTOR;
27798
 
+                                if ( Master_Sequence_Number != (( LVM_Link_Table_Sector  * )sect_ptr )->Sequence_Number ) {
27799
 
+                                        cur_volume->complete = 0;
27800
 
+                                        LOG_SERIOUS("Bad Sequence Number for Drive Linking Metadata!\n");
27801
 
+                                }
27802
 
+                                else {
27803
 
+                                        for ( i = 0; i < countlinks; i++ ) {
27804
 
+                                                linkser = (( LVM_Link_Table_Sector  * )sect_ptr )->Link_Table[i].Partition_Serial_Number;
27805
 
+                                                if ( ( link_hold = find_link_data( &link_list, linkser ) ) ) {
27806
 
+                                                        // Add this partition to its parent Volume
27807
 
+                                                        add_os2link( link_hold, cur_node );
27808
 
+                                                        LOG_EVERYTHING("Link start_RBA == %Ld , sector_count == %Ld\n",
27809
 
+                                                                       partition_offset, link_hold->sector_count );
27810
 
+                                                        link_hold->start_sector = partition_offset;
27811
 
+                                                        partition_offset += link_hold->sector_count;
27812
 
+                                                }
27813
 
+                                                else {
27814
 
+                                                        LOG_SERIOUS("Link Table entry %i metadata missing\n", i );
27815
 
+                                                        cur_volume->complete = 0;
27816
 
+                                                        break;
27817
 
+                                                }
27818
 
+                                        }
27819
 
+                                }
27820
 
+                        }
27821
 
+                }
27822
 
+
27823
 
+                /* If the volume is complete we can export it for use. */
27824
 
+                if ( cur_volume->complete && (link_list == NULL) ) {                      
27825
 
+                        
27826
 
+                        // Link new volume into the node list
27827
 
+                        if ( cur_volume->Export_Needed &&
27828
 
+                             ( !evms_cs_add_logical_node_to_list( node_list, cur_node ) )
27829
 
+                           ) {
27830
 
+                                rc++;
27831
 
+                                cur_volume->Export_Needed = 0;
27832
 
+                        }
27833
 
+
27834
 
+                        previous_node = cur_node;
27835
 
+                        cur_node = cur_volume->next_os2lvm_node;
27836
 
+                }
27837
 
+                else {
27838
 
+                        /* Remove the volume from os2lvm_nodes list and delete it. */
27839
 
+                        if ( previous_node != NULL ) {
27840
 
+                                
27841
 
+                                previous_volume = ( os2_volume_runtime_entry_t  * )previous_node->instance_data;
27842
 
+                                previous_volume->next_os2lvm_node = cur_volume->next_os2lvm_node;
27843
 
+                                cur_volume->next_os2lvm_node = NULL;
27844
 
+
27845
 
+                                delete_os2lvm_volume(cur_node);
27846
 
+
27847
 
+                                cur_node = previous_volume->next_os2lvm_node;
27848
 
+                        }
27849
 
+                        else {
27850
 
+                                previous_node = cur_volume->next_os2lvm_node;
27851
 
+                                delete_os2lvm_volume(cur_node);
27852
 
+                                cur_node = previous_node;
27853
 
+                                previous_node = NULL;
27854
 
+                                os2lvm_nodes = cur_node;
27855
 
+                        }
27856
 
+
27857
 
+                        /* If any items remain in link_list, delete those as well. */
27858
 
+                        while (link_list) {
27859
 
+                                link_hold = link_list->next;
27860
 
+                                delete_os2_drive_link(link_list,1);
27861
 
+                                link_list = link_hold;
27862
 
+                        }
27863
 
+
27864
 
+                }
27865
 
+
27866
 
+        }
27867
 
+
27868
 
+        LOG_ENTRY_EXIT("Finished Checking OS/2 Logical Volumes\n" );
27869
 
+
27870
 
+        return rc;
27871
 
+}
27872
 
+
27873
 
+
27874
 
+
27875
 
+/* BBR_Transfer_IO
27876
 
+ *
27877
 
+ *     Transfer the responsibility for completing the specified IO from
27878
 
+ *      the thread that requested it to the BBR Worker Thread
27879
 
+ */
27880
 
+static void BBR_Transfer_IO(BBR_IO_Transfer_Record_t * Transfer_Record)
27881
 
+{
27882
 
+       unsigned long           flags;
27883
 
+        int                     Wake_Worker_Thread = 0;  /* Assume that the worker is already awake. */
27884
 
+
27885
 
+       spin_lock_irqsave(&BBR_Queue_Lock, flags);
27886
 
+
27887
 
+        /* The BBR IO List is a singly linked list.  BBR_IO_List_Head points
27888
 
+           to the first item in the list, and BBR_IO_List_Tail points to the
27889
 
+           last item in the list.                                            */
27890
 
+        Transfer_Record->Next = NULL;
27891
 
+        if ( !BBR_IO_List_Tail ) {    /* Empty list */
27892
 
+                BBR_IO_List_Head = Transfer_Record;
27893
 
+                Wake_Worker_Thread = 1;             /* Wake up the worker thread. */
27894
 
+        }
27895
 
+        else /* Items already in the list. */
27896
 
+                BBR_IO_List_Tail->Next = Transfer_Record;
27897
 
+
27898
 
+        BBR_IO_List_Tail = Transfer_Record;
27899
 
+
27900
 
+       spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
27901
 
+        if ( Wake_Worker_Thread )
27902
 
+         evms_cs_wakeup_thread(BBR_Worker_Thread);
27903
 
+
27904
 
+        return;
27905
 
+}
27906
 
+
27907
 
+
27908
 
+/* OS2_DL_Callback
27909
 
+ * 
27910
 
+ * This is the callback function used when an I/O request has to be broken 
27911
 
+ * into two parts because it crosses a drive link boundary.
27912
 
+ *
27913
 
+ */
27914
 
+static void OS2_DL_Callback(struct buffer_head *bh, int uptodate)
27915
 
+{
27916
 
+
27917
 
+        DL_IO_Tracking_Record_t * Tracking_Record;
27918
 
+        struct buffer_head *      Original;
27919
 
+
27920
 
+        Tracking_Record = bh->b_private;
27921
 
+
27922
 
+        /* Is this a read or a write? */
27923
 
+        if ( Tracking_Record->Link1_Transfer_Record || 
27924
 
+             Tracking_Record->Link2_Transfer_Record ) {
27925
 
+                /* We have a write here.  Was it successful? */
27926
 
+                if ( ! uptodate) {
27927
 
+                        /* Have we tried BBR yet? */
27928
 
+                        if ( ( bh == Tracking_Record->Link1.bh ) &&
27929
 
+                             ( ! Tracking_Record->Link1_BBR_Attempted ) ){
27930
 
+                                 /* Attempt BBR. */
27931
 
+                                BBR_Transfer_IO(Tracking_Record->Link1_Transfer_Record);
27932
 
+                                Tracking_Record->Link1_BBR_Attempted = 1;
27933
 
+                                return;
27934
 
+                        }
27935
 
+                        else if ( ( bh == Tracking_Record->Link2.bh ) &&
27936
 
+                                  ( ! Tracking_Record->Link2_BBR_Attempted ) ) {
27937
 
+                                 /* Attempt BBR. */
27938
 
+                                BBR_Transfer_IO(Tracking_Record->Link2_Transfer_Record);
27939
 
+                                Tracking_Record->Link2_BBR_Attempted = 1;
27940
 
+                                return;
27941
 
+                        }
27942
 
+
27943
 
+                }
27944
 
+
27945
 
+        }
27946
 
+
27947
 
+        Tracking_Record->IO_In_Progress -= 1;
27948
 
+        if ( Tracking_Record->IO_In_Progress) {
27949
 
+                Tracking_Record->Up_To_Date = uptodate;
27950
 
+        }
27951
 
+        Original = Tracking_Record->Original.bh;
27952
 
+
27953
 
+        if ( ! Tracking_Record->IO_In_Progress ) {
27954
 
+                uptodate &= Tracking_Record->Up_To_Date;
27955
 
+                /* If this is a write, then Transfer Records will have been set up for both Link1 and Link2.
27956
 
+                   If the transfer records were used because of BBR, then the BBR worker thread will have
27957
 
+                   disposed of the transfer records.  If the transfer records were not used, then we must
27958
 
+                   dispose of them here to prevent memory leaks.                                             */
27959
 
+                if ( Tracking_Record->Link1_Transfer_Record &&
27960
 
+                     ( ! Tracking_Record->Link1_BBR_Attempted) ) {
27961
 
+                        evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link1_Transfer_Record);
27962
 
+                }
27963
 
+                if ( Tracking_Record->Link2_Transfer_Record &&
27964
 
+                     ( ! Tracking_Record->Link2_BBR_Attempted) ) {
27965
 
+                        evms_cs_deallocate_to_pool( BBR_Transfer_Pool,Tracking_Record->Link2_Transfer_Record);
27966
 
+                }
27967
 
+                evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link1.bh);
27968
 
+                evms_cs_deallocate_to_pool(evms_bh_pool,Tracking_Record->Link2.bh);
27969
 
+                evms_cs_deallocate_to_pool(DL_Tracking_Pool,Tracking_Record);
27970
 
+                Original->b_end_io(Original,uptodate);
27971
 
+        }
27972
 
+
27973
 
+        return;
27974
 
+}
27975
 
+
27976
 
+/* OS2_BBR_Write_Callback
27977
 
+ *
27978
 
+ *     This is the callback for normal write requests. Check for an error
27979
 
+ *     during the I/O, and send to the worker thread for processing if necessary.
27980
 
+ */
27981
 
+static void OS2_BBR_Write_Callback( BBR_IO_Transfer_Record_t * Transfer_Record,
27982
 
+                                   struct buffer_head       * bh,
27983
 
+                                   int                        uptodate,
27984
 
+                                   int                      * redrive )
27985
 
+{
27986
 
+       if ( ! uptodate ) {
27987
 
+               BBR_Transfer_IO(Transfer_Record);
27988
 
+               *redrive = TRUE;
27989
 
+       }
27990
 
+       else {
27991
 
+               evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Transfer_Record);
27992
 
+       }
27993
 
+
27994
 
+        return;
27995
 
+}
27996
 
+
27997
 
+
27998
 
+
27999
 
+
28000
 
+/* Worker thread to handle:
28001
 
+
28002
 
+   I/O to drive/partitions/objects where bad blocks are known to exist
28003
 
+   I/O to drive/partition/object where a new bad block has been discovered and the I/O must be redriven.
28004
 
+   
28005
 
+*/
28006
 
+static void BBR_Worker( void * Not_Used)
28007
 
+{
28008
 
+       unsigned long              flags;
28009
 
+        BBR_IO_Transfer_Record_t * Current_IO;
28010
 
+        int                        complete;
28011
 
+
28012
 
+       for (;;) {
28013
 
+               // Process bbr_io_list, one entry at a time.
28014
 
+               spin_lock_irqsave(&BBR_Queue_Lock, flags);
28015
 
+
28016
 
+                /* Is there any work for us? */
28017
 
+                if ( ! BBR_IO_List_Head ) {
28018
 
+                        spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
28019
 
+                        break;  /* List empty - nothing to do. */
28020
 
+                }
28021
 
+
28022
 
+                /* Get the IO to perform. */
28023
 
+                Current_IO = BBR_IO_List_Head;
28024
 
+                BBR_IO_List_Head = Current_IO->Next;
28025
 
+                if (! BBR_IO_List_Head ) 
28026
 
+                        BBR_IO_List_Tail = BBR_IO_List_Head;
28027
 
+               
28028
 
+                spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
28029
 
+
28030
 
+                /* Now lets process the I/O request. */
28031
 
+                complete = do_os2_bbr_io(Current_IO->Partition_Data,Current_IO->Write_Flag, Current_IO->eio.rsector, Current_IO->eio.rsize, Current_IO->eio.bh->b_data);
28032
 
+
28033
 
+                /* We need to do the callback. */
28034
 
+                Current_IO->eio.bh->b_end_io(Current_IO->eio.bh, (complete == 0) );
28035
 
+
28036
 
+                /* Now cleanup */
28037
 
+                evms_cs_deallocate_to_pool(BBR_Transfer_Pool,Current_IO);
28038
 
+       }
28039
 
+
28040
 
+        return;  /* Go to sleep. */
28041
 
+
28042
 
+}
28043
 
+
28044
 
+
28045
 
+/* 
28046
 
+ * Sector_Is_Remapped
28047
 
+ *
28048
 
+ * This function returns 1 if the specified sector has been remapped, 0 if it has not
28049
 
+ *
28050
 
+ * If the sector has been remapped, then the new sector is returned in Replacement_Sector
28051
 
+ *
28052
 
+ */
28053
 
+static int Sector_Is_Remapped(os2_drivelink_runtime_entry_t  * io_dlentry, evms_sector_t Source_Sector, evms_sector_t * Replacement_Sector)
28054
 
+{
28055
 
+        LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )io_dlentry->bbr_data;
28056
 
+        unsigned int      Sector_Index;    /* The BBR Table is spread across several sectors.  This tracks which sector we are looking at. */
28057
 
+        unsigned int      BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
28058
 
+        unsigned int      BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
28059
 
+        BBR_Table_Entry * BBR_Table_Entry;
28060
 
+        unsigned int      Guard1;
28061
 
+
28062
 
+        /* Default value is no remap. */
28063
 
+        *Replacement_Sector = Source_Sector;
28064
 
+
28065
 
+        do {
28066
 
+                Guard1 = io_dlentry->Guard1;  /* Lamport's Theorem */
28067
 
+
28068
 
+                for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
28069
 
+                        Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
28070
 
+                        BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
28071
 
+                        if ( BBR_Table_Entry->BadSector == Source_Sector ){
28072
 
+                                *Replacement_Sector = BBR_Table_Entry->ReplacementSector;
28073
 
+                                break;
28074
 
+                        }
28075
 
+                }
28076
 
+
28077
 
+        } while ( Guard1 != io_dlentry->Guard2 );  /* Lamport's Theorem */
28078
 
+
28079
 
+        if ( *Replacement_Sector != Source_Sector )
28080
 
+                return 1;
28081
 
+        else
28082
 
+                return 0;
28083
 
+}
28084
 
+
28085
 
+
28086
 
+/*
28087
 
+ * Invalidate_Mapping
28088
 
+ *
28089
 
+ * This function either frees a replacement sector to be reused, or it 
28090
 
+ * marks the replacement sector as bad.
28091
 
+ *
28092
 
+ */
28093
 
+static void Invalidate_Mapping(os2_drivelink_runtime_entry_t  * dlentry, 
28094
 
+                               evms_sector_t                    Source_Sector,
28095
 
+                               int                              Replacement_Sector_Is_Bad)
28096
 
+{
28097
 
+        LVM_BBR_Feature * Feature_Data = ( LVM_BBR_Feature * )dlentry->bbr_data;
28098
 
+        unsigned int      Sector_Index;    /* The BBR Table is spread across several sectors.  This tracks which sector we are looking at. */
28099
 
+        unsigned int      BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
28100
 
+        unsigned int      BBR_Table_Entries_In_Use = Feature_Data->control.Table_Entries_In_Use;
28101
 
+        BBR_Table_Entry * BBR_Table_Entry = NULL;
28102
 
+
28103
 
+        /* Lock for the BBR Table. */
28104
 
+        down( &(dlentry->BBR_Table_Lock) );
28105
 
+
28106
 
+        /* Find the entry to invalidate. */
28107
 
+        for ( BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; BBR_Table_Index++) {
28108
 
+                Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
28109
 
+                BBR_Table_Entry = &(Feature_Data->remap[Sector_Index].BBR_Table[BBR_Table_Index - (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
28110
 
+                if ( BBR_Table_Entry->BadSector == Source_Sector ){
28111
 
+                        break;
28112
 
+                }
28113
 
+        }
28114
 
+
28115
 
+        /* Now that we have found the entry, we must invalidate it. */
28116
 
+        if ( Replacement_Sector_Is_Bad ) {
28117
 
+                BBR_Table_Entry->BadSector = (u_int32_t) -1;
28118
 
+        }
28119
 
+        /* OS/2 supported a method for clearing out bad block remappings if the filesystem on the volume supported
28120
 
+           the tracking of bad blocks.  We don't support that under Linux, so there is no else case here.           */
28121
 
+        
28122
 
+        /* Unlock the BBR Table */
28123
 
+        up( &(dlentry->BBR_Table_Lock) );
28124
 
+
28125
 
+        return;
28126
 
+}
28127
 
+
28128
 
+/*
28129
 
+ * Create_New_BBR_Table_Entry
28130
 
+ *
28131
 
+ * Finds bad blocks within the range specified, allocates replacement sectors,
28132
 
+ * writes the data to the replacement sectors, and updates the BBR metadata on
28133
 
+ * disk to reflect the new mapping.  Returns 1 if successful, 0 otherwise.
28134
 
+ *
28135
 
+ */
28136
 
+static int Create_New_BBR_Table_Entry(os2_drivelink_runtime_entry_t  * dlentry,
28137
 
+                                      evms_sector_t                    starting_lsn, 
28138
 
+                                      unsigned int                     count, 
28139
 
+                                      void *                           buffer)
28140
 
+{
28141
 
+        evms_sector_t    lsn;
28142
 
+        BBR_Table_Entry *Table_Entry;
28143
 
+        unsigned int     Sector_Index;
28144
 
+        unsigned int     Table_Index;
28145
 
+        int              rc;
28146
 
+        int              rc2;
28147
 
+        u_int32_t        New_Sequence_Number;
28148
 
+        LVM_BBR_Feature *BBR_Data = (LVM_BBR_Feature*) dlentry->bbr_data;
28149
 
+
28150
 
+        for ( lsn = starting_lsn; lsn < (starting_lsn + count); lsn++) {
28151
 
+                rc = INIT_IO(dlentry->link_partition, 1, lsn, 1, buffer);
28152
 
+                while (rc) {
28153
 
+                        
28154
 
+                        /* Lock for the BBR Table. */
28155
 
+                        down( &(dlentry->BBR_Table_Lock) );
28156
 
+
28157
 
+                        /* Increment the second guard value. This will cause those reading the BBR Table to spin.*/
28158
 
+                        dlentry->Guard2++;
28159
 
+
28160
 
+                        /* Ensure that the bbr active flag is set. */
28161
 
+                        dlentry->bbr_is_active = 1;
28162
 
+
28163
 
+                        /* Allocate a replacement sector */
28164
 
+                        if ( BBR_Data->control.Table_Entries_In_Use < BBR_Data->control.Table_Size ) {
28165
 
+                                Sector_Index = BBR_Data->control.Table_Entries_In_Use / BBR_TABLE_ENTRIES_PER_SECTOR;
28166
 
+                                Table_Index = BBR_Data->control.Table_Entries_In_Use % BBR_TABLE_ENTRIES_PER_SECTOR;
28167
 
+                                BBR_Data->control.Table_Entries_In_Use = BBR_Data->control.Table_Entries_In_Use + 1;
28168
 
+                                Table_Entry = (BBR_Table_Entry *) &(BBR_Data->remap[Sector_Index].BBR_Table[Table_Index]);
28169
 
+                                Table_Entry->BadSector = lsn;
28170
 
+                        }
28171
 
+                        else {
28172
 
+                                /* There are no more replacement sectors available!  Time to bail ... */
28173
 
+                                up( &(dlentry->BBR_Table_Lock) );
28174
 
+                                return 0;
28175
 
+                        }
28176
 
+
28177
 
+                        /* Now that we have a replacement sector, increment the first guard value.  This will free any 
28178
 
+                           threads reading the BBR Table.                                                                */
28179
 
+                        dlentry->Guard1++;
28180
 
+
28181
 
+                        /* Release the lock now that we have a replacement sector. */
28182
 
+                        up( &(dlentry->BBR_Table_Lock) );
28183
 
+
28184
 
+                        /* Test the replacement sector. */
28185
 
+                        rc = INIT_IO(dlentry->link_partition, 1, Table_Entry->ReplacementSector, 1, buffer);
28186
 
+                        if (rc) {
28187
 
+                                /* The replacement sector was bad.  Lets mark it bad in the table and try again. */
28188
 
+                                Table_Entry->BadSector = (u_int32_t) -1;
28189
 
+                        }
28190
 
+
28191
 
+                }  /* End of processing for the current sector. */
28192
 
+
28193
 
+        } /* end of loop to test each sector in the I/O and remap any bad ones found. */
28194
 
+
28195
 
+        /* Need to write the modified BBR Table back to disk.  This includes updating the sequence numbers and CRCs. */
28196
 
+
28197
 
+        /* Lock for the BBR Table. */
28198
 
+        down( &(dlentry->BBR_Table_Lock) );
28199
 
+
28200
 
+        /* Increment the sequence numbers. */
28201
 
+        New_Sequence_Number = BBR_Data->control.Sequence_Number + 1;
28202
 
+        BBR_Data->control.Sequence_Number = New_Sequence_Number;
28203
 
+        for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
28204
 
+                BBR_Data->remap[Sector_Index].Sequence_Number = New_Sequence_Number;
28205
 
+        }
28206
 
+
28207
 
+        /* Calculate the new CRC values. */
28208
 
+        BBR_Data->control.CRC = 0;
28209
 
+        BBR_Data->control.CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->control),OS2_BYTES_PER_SECTOR);
28210
 
+        for ( Sector_Index = 0; Sector_Index < BBR_Data->control.Sectors_Per_Table; Sector_Index++) {
28211
 
+                BBR_Data->remap[Sector_Index].CRC = 0;
28212
 
+                BBR_Data->remap[Sector_Index].CRC = evms_cs_calculate_crc(EVMS_INITIAL_CRC,&(BBR_Data->remap[Sector_Index]),OS2_BYTES_PER_SECTOR);
28213
 
+        }
28214
 
+
28215
 
+        /* Now we must write the table back to the partition from whence it came. */
28216
 
+
28217
 
+        /* Write the first copy. */
28218
 
+        rc = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy1,dlentry->BBR_Feature_Size,BBR_Data);
28219
 
+
28220
 
+        /* Write the second copy. */
28221
 
+        rc2 = INIT_IO(dlentry->link_partition,1,dlentry->BBR_Data_Copy2,dlentry->BBR_Feature_Size,BBR_Data);
28222
 
+
28223
 
+        /* If both copies failed to reach the disk, then fail the I/O. */
28224
 
+        if ( rc && rc2 ) {
28225
 
+                rc = 0;
28226
 
+        }
28227
 
+        else
28228
 
+                rc = 1;
28229
 
+
28230
 
+        /* Unlock the BBR Table */
28231
 
+        up( &(dlentry->BBR_Table_Lock) );
28232
 
+
28233
 
+        /* Indicate success. */
28234
 
+        return rc;
28235
 
+}
28236
 
+
28237
 
+
28238
 
+/*
28239
 
+ * Clone_Bufferhead
28240
 
+ *
28241
 
+ * Prepares a usable copy of an existing bufferhead.
28242
 
+ *
28243
 
+ */
28244
 
+static void Clone_Bufferhead(struct buffer_head * Source, struct buffer_head * Child)
28245
 
+{
28246
 
+        Child->b_next = NULL;
28247
 
+        Child->b_blocknr = Source->b_blocknr;
28248
 
+        Child->b_size = Source->b_size;
28249
 
+        Child->b_list = 0;
28250
 
+        Child->b_dev = Source->b_dev;
28251
 
+        Child->b_count = Source->b_count;
28252
 
+        Child->b_rdev = Source->b_rdev;
28253
 
+        Child->b_state = Source->b_state;
28254
 
+        Child->b_flushtime = 0;
28255
 
+        Child->b_next_free = NULL;
28256
 
+        Child->b_prev_free = NULL;
28257
 
+        Child->b_this_page = NULL;
28258
 
+        Child->b_reqnext = NULL;
28259
 
+        Child->b_pprev = NULL;
28260
 
+        Child->b_data = Source->b_data;
28261
 
+        Child->b_page = Source->b_page;
28262
 
+        Child->b_end_io = Source->b_end_io;
28263
 
+        Child->b_private = Source->b_private;
28264
 
+        Child->b_rsector = Source->b_rsector;
28265
 
+        Child->b_inode = NULL;
28266
 
+        Child->b_inode_buffers.next = NULL;
28267
 
+        Child->b_inode_buffers.prev = NULL;
28268
 
+        return;
28269
 
+}
28270
 
diff -Naur linux-2002-03-28/drivers/evms/s390_part.c evms-2002-03-28/drivers/evms/s390_part.c
28271
 
--- linux-2002-03-28/drivers/evms/s390_part.c   Wed Dec 31 18:00:00 1969
28272
 
+++ evms-2002-03-28/drivers/evms/s390_part.c    Tue Mar 26 14:28:49 2002
28273
 
@@ -0,0 +1,836 @@
28274
 
+/* -*- linux-c -*- */
28275
 
+/*
28276
 
+ *
28277
 
+ *
28278
 
+ *   Copyright (c) International Business Machines  Corp., 2000
28279
 
+ *
28280
 
+ *   This program is free software;  you can redistribute it and/or modify
28281
 
+ *   it under the terms of the GNU General Public License as published by
28282
 
+ *   the Free Software Foundation; either version 2 of the License, or
28283
 
+ *   (at your option) any later version.
28284
 
+ *
28285
 
+ *   This program is distributed in the hope that it will be useful,
28286
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
28287
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
28288
 
+ *   the GNU General Public License for more details.
28289
 
+ *
28290
 
+ *   You should have received a copy of the GNU General Public License
28291
 
+ *   along with this program;  if not, write to the Free Software
28292
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28293
 
+ *
28294
 
+ *
28295
 
+ */
28296
 
+/*
28297
 
+ * linux/drivers/evms/s390_part.c
28298
 
+ *
28299
 
+ * EVMS S/390 partition manager
28300
 
+ *
28301
 
+ * Partial code extracted from
28302
 
+ *
28303
 
+ *  linux/fs/partitions/ibm.c
28304
 
+ *
28305
 
+ */
28306
 
+
28307
 
+#include <linux/config.h>
28308
 
+#include <linux/module.h>
28309
 
+#include <linux/kernel.h>
28310
 
+#include <linux/config.h>
28311
 
+#include <linux/string.h>
28312
 
+#include <linux/blk.h>
28313
 
+#include <asm/ebcdic.h>
28314
 
+#include <asm/uaccess.h>
28315
 
+#include <asm/dasd.h>
28316
 
+#include <asm/vtoc.h>
28317
 
+#include <linux/evms/evms_kernel.h>
28318
 
+
28319
 
+/* prefix used in logging messages */
28320
 
+#define LOG_PREFIX "s390_part: "
28321
 
+
28322
 
+/* Private instance data structure for node we produced */
28323
 
+typedef struct local_instance_data_s {
28324
 
+        evms_logical_node_t     * source_disk;
28325
 
+        evms_sector_t           start_sect;     /* starting LBA */
28326
 
+        evms_sector_t           nr_sects;       /* number of sectors */
28327
 
+        unsigned char           type;           /* partition type or filesystem format indicator, can be set to 0 */
28328
 
+} local_instance_data_t;
28329
 
+
28330
 
+static int exported_nodes;      /* total # of exported segments
28331
 
+                                 * produced during this discovery.
28332
 
+                                 */
28333
 
+
28334
 
+/* Prototypes */
28335
 
+static int  s390_partition_discover(evms_logical_node_t **);
28336
 
+static int  s390_partition_delete(evms_logical_node_t *);
28337
 
+static void s390_partition_read(evms_logical_node_t *,
28338
 
+                                   eio_t *);
28339
 
+static void s390_partition_write(evms_logical_node_t *,
28340
 
+                                    eio_t *);
28341
 
+static int  s390_partition_ioctl(evms_logical_node_t *,
28342
 
+                                    struct inode *,
28343
 
+                                    struct file *,
28344
 
+                                    unsigned int,
28345
 
+                                    unsigned long);
28346
 
+static int  s390_partition_init_io(evms_logical_node_t *,
28347
 
+                                      int,
28348
 
+                                      evms_sector_t,
28349
 
+                                      evms_sector_t,
28350
 
+                                      void *);
28351
 
+
28352
 
+static evms_plugin_function_table_t function_table = {
28353
 
+        discover: &s390_partition_discover,
28354
 
+        delete  : &s390_partition_delete,
28355
 
+        read    : &s390_partition_read,
28356
 
+        write   : &s390_partition_write,
28357
 
+        init_io : &s390_partition_init_io,
28358
 
+        ioctl   : &s390_partition_ioctl
28359
 
+};
28360
 
+
28361
 
+#define EVMS_S390_PARTITION_MANAGER_ID 2
28362
 
+
28363
 
+static evms_plugin_header_t plugin_header = {
28364
 
+        id              : SetPluginID(
28365
 
+                IBM_OEM_ID,
28366
 
+                EVMS_SEGMENT_MANAGER,
28367
 
+                EVMS_S390_PARTITION_MANAGER_ID),
28368
 
+        version         : {
28369
 
+                major      : 1,
28370
 
+                minor      : 0,
28371
 
+                patchlevel : 0
28372
 
+        },
28373
 
+        required_common_services_version : {
28374
 
+                major      : 0,
28375
 
+                minor      : 5,
28376
 
+                patchlevel : 0
28377
 
+        },
28378
 
+        function_table  : &function_table
28379
 
+};
28380
 
+
28381
 
+/***************************************************/
28382
 
+/* List Support - Typedefs, Variables, & Functions */
28383
 
+/***************************************************/
28384
 
+
28385
 
+/* Typedefs */
28386
 
+
28387
 
+typedef struct local_segment_list_node_s {
28388
 
+        evms_logical_node_t              *segment;
28389
 
+        struct local_segment_list_node_s *next;
28390
 
+} local_segment_list_node_t;
28391
 
+
28392
 
+typedef struct local_disk_list_node_s {
28393
 
+        evms_logical_node_t           *disk;
28394
 
+        local_segment_list_node_t     *segment_list;
28395
 
+        struct local_disk_list_node_s *next;
28396
 
+} local_disk_list_node_t;
28397
 
+
28398
 
+/* Variables */
28399
 
+
28400
 
+static local_disk_list_node_t *my_disk_list;
28401
 
+
28402
 
+/* Functions */
28403
 
+
28404
 
+static local_disk_list_node_t **
28405
 
+lookup_disk(
28406
 
+        evms_logical_node_t *disk)
28407
 
+{
28408
 
+        local_disk_list_node_t **ldln;
28409
 
+
28410
 
+        ldln = &my_disk_list;
28411
 
+        while(*ldln) {
28412
 
+                if ((*ldln)->disk == disk)
28413
 
+                        break;
28414
 
+                ldln = &(*ldln)->next;
28415
 
+        }
28416
 
+        return(ldln);
28417
 
+}
28418
 
+
28419
 
+static local_segment_list_node_t **
28420
 
+lookup_segment(
28421
 
+        local_disk_list_node_t *disk,
28422
 
+        evms_logical_node_t    *segment)
28423
 
+{
28424
 
+        local_segment_list_node_t **lsln;
28425
 
+
28426
 
+        lsln = &disk->segment_list;
28427
 
+        while(*lsln) {
28428
 
+                if ((*lsln)->segment == segment)
28429
 
+                        break;
28430
 
+                lsln = &(*lsln)->next;
28431
 
+        }
28432
 
+        return(lsln);
28433
 
+}
28434
 
+
28435
 
+static evms_logical_node_t *
28436
 
+find_segment_on_disk(
28437
 
+        evms_logical_node_t *disk,
28438
 
+        u_int64_t start_sect,
28439
 
+        u_int64_t nr_sects)
28440
 
+{
28441
 
+        evms_logical_node_t *rc = NULL;
28442
 
+        local_disk_list_node_t **ldln;
28443
 
+        local_segment_list_node_t **lsln;
28444
 
+        local_instance_data_t *lid;
28445
 
+
28446
 
+        ldln = lookup_disk(disk);
28447
 
+        if (*ldln) {
28448
 
+                /* disk found in list */
28449
 
+                /* attempt to find segment */
28450
 
+
28451
 
+                lsln = &(*ldln)->segment_list;
28452
 
+                while(*lsln) {
28453
 
+                        lid = (*lsln)->segment->instance_data;
28454
 
+                        if (lid->start_sect == start_sect)
28455
 
+                                if (lid->nr_sects == nr_sects)
28456
 
+                                        break;
28457
 
+                        lsln = &(*lsln)->next;
28458
 
+                }
28459
 
+                if (*lsln)
28460
 
+                        rc = (*lsln)->segment;
28461
 
+        }
28462
 
+        return(rc);
28463
 
+}
28464
 
+
28465
 
+/* function description: add_segment_to_disk
28466
 
+ *
28467
 
+ * this function attempts to add a segment to the segment
28468
 
+ * list of a disk. if the specified disk is not found, it
28469
 
+ * will be added to the global disk list. this function will
28470
 
+ * return a pointer to the matching segment in the disk's
28471
 
+ * segment list. the caller must compare the returned pointer
28472
 
+ * to the specified segment to see if the
28473
 
+ * specified segment was already present in the disk's segment
28474
 
+ * list. if the return pointer matches the specified segment,
28475
 
+ * then the specified segment was added to the list. if the
28476
 
+ * return segment pointer to does not match the specified
28477
 
+ * segment pointer, then the specified segment pointer was
28478
 
+ * a duplicate and can be thrown away.
28479
 
+ */
28480
 
+static int
28481
 
+add_segment_to_disk(
28482
 
+        evms_logical_node_t *disk,
28483
 
+        evms_logical_node_t *segment)
28484
 
+{
28485
 
+        int rc = 0;
28486
 
+        local_disk_list_node_t **ldln, *new_disk;
28487
 
+        local_segment_list_node_t **lsln, *new_segment;
28488
 
+
28489
 
+        ldln = lookup_disk(disk);
28490
 
+        if (*ldln == NULL) {
28491
 
+                /* disk not in list, add disk */
28492
 
+                rc = evms_cs_allocate_memory((void **)&new_disk,
28493
 
+                                             sizeof(*new_disk));
28494
 
+                if (!rc) {
28495
 
+                        new_disk->disk = disk;
28496
 
+                        *ldln = new_disk;
28497
 
+                }
28498
 
+        }
28499
 
+        if (!rc) {
28500
 
+                /* attempt to add segment */
28501
 
+                lsln = lookup_segment(*ldln, segment);
28502
 
+                if (*lsln == NULL) {
28503
 
+                        /* segment not in list, add segment */
28504
 
+                        rc = evms_cs_allocate_memory((void **)&new_segment,
28505
 
+                                                     sizeof(*new_segment));
28506
 
+                        if (!rc) {
28507
 
+                                new_segment->segment = segment;
28508
 
+                                *lsln = new_segment;
28509
 
+                        }
28510
 
+                } else
28511
 
+                        rc = -1;
28512
 
+        }
28513
 
+        return(rc);
28514
 
+}
28515
 
+
28516
 
+static int
28517
 
+remove_segment_from_disk(
28518
 
+        evms_logical_node_t *disk,
28519
 
+        evms_logical_node_t *segment,
28520
 
+        evms_logical_node_t **empty_disk)
28521
 
+{
28522
 
+        int rc = 0;
28523
 
+        local_disk_list_node_t **ldln, *tmp_disk_node;
28524
 
+        local_segment_list_node_t **lsln, *tmp_segment_node;
28525
 
+
28526
 
+        *empty_disk = NULL;
28527
 
+        ldln = lookup_disk(disk);
28528
 
+        if (*ldln == NULL) {
28529
 
+                rc = -1;
28530
 
+        } else {
28531
 
+                /* disk found in list */
28532
 
+                /* attempt to add segment */
28533
 
+                lsln = lookup_segment(*ldln, segment);
28534
 
+                if (*lsln == NULL) {
28535
 
+                        rc = -2;
28536
 
+                } else {
28537
 
+                        tmp_segment_node = *lsln;
28538
 
+                        /* remove segment from list */
28539
 
+                        *lsln = (*lsln)->next;
28540
 
+                        /* free the segment list node */
28541
 
+                        evms_cs_deallocate_memory(tmp_segment_node);
28542
 
+
28543
 
+                        if ((*ldln)->segment_list == NULL) {
28544
 
+                                tmp_disk_node = *ldln;
28545
 
+                                *empty_disk = tmp_disk_node->disk;
28546
 
+                                /* remove disk from list */
28547
 
+                                *ldln = (*ldln)->next;
28548
 
+                                /* free the disk list node */
28549
 
+                                evms_cs_deallocate_memory(tmp_disk_node);
28550
 
+                        }
28551
 
+                }
28552
 
+        }
28553
 
+        return(rc);
28554
 
+}
28555
 
+
28556
 
+/*
28557
 
+ * Function:  add_segment
28558
 
+ */
28559
 
+static int
28560
 
+s390_process_segment(
28561
 
+        evms_logical_node_t **discover_list,
28562
 
+        evms_logical_node_t *node,
28563
 
+        u_int64_t            start_sect,
28564
 
+        u_int64_t            nr_sects,
28565
 
+        unsigned char        type,
28566
 
+        int                  part_num)
28567
 
+{
28568
 
+        local_instance_data_t *InstData = NULL;
28569
 
+        evms_logical_node_t *segment;
28570
 
+        int rc = 0;
28571
 
+
28572
 
+        segment = find_segment_on_disk(node, start_sect, nr_sects);
28573
 
+        if (segment) {
28574
 
+               LOG_DETAILS("exporting segment '%s'.\n",
28575
 
+                           segment->name);
28576
 
+       } else {
28577
 
+                rc = evms_cs_allocate_memory((void **)&InstData,sizeof(*InstData));
28578
 
+                if (!rc) {
28579
 
+                        InstData->source_disk = node;
28580
 
+                        InstData->start_sect = start_sect;
28581
 
+                        InstData->nr_sects = nr_sects;
28582
 
+                        InstData->type = type;
28583
 
+                        rc = evms_cs_allocate_logical_node(&segment);
28584
 
+                }
28585
 
+                if (!rc) {
28586
 
+                        segment->plugin = &plugin_header;
28587
 
+                        segment->system_id = (unsigned int)type;
28588
 
+                        segment->total_vsectors = nr_sects;
28589
 
+                        segment->block_size = node->block_size;
28590
 
+                        segment->hardsector_size = node->hardsector_size;
28591
 
+                        segment->instance_data = InstData;
28592
 
+                       segment->flags = node->flags;
28593
 
+                        strcpy(segment->name, node->name);
28594
 
+                        sprintf(segment->name + strlen(segment->name), "%d", part_num);
28595
 
+                        LOG_DETAILS("creating segment '%s'.\n",
28596
 
+                                segment->name);
28597
 
+                        rc = add_segment_to_disk(node, segment);
28598
 
+                        if (rc) {
28599
 
+                                LOG_ERROR("%s: error(%d) adding segment '%s'!\n",
28600
 
+                                        __FUNCTION__, rc, segment->name);
28601
 
+                                rc = 0;
28602
 
+                        } else {
28603
 
+                               MOD_INC_USE_COUNT;
28604
 
+                       }
28605
 
+                }
28606
 
+                if (rc) {
28607
 
+                        if (InstData)
28608
 
+                                evms_cs_deallocate_memory(InstData);
28609
 
+                        if (segment)
28610
 
+                                evms_cs_deallocate_logical_node(segment);
28611
 
+                }
28612
 
+        }
28613
 
+        if (!rc) {
28614
 
+                evms_cs_add_logical_node_to_list(discover_list, segment);
28615
 
+                exported_nodes++;
28616
 
+        }
28617
 
+        return rc;
28618
 
+}
28619
 
+
28620
 
+typedef enum {
28621
 
+       ibm_partition_lnx1 = 0,
28622
 
+       ibm_partition_vol1 = 1,
28623
 
+       ibm_partition_cms1 = 2,
28624
 
+       ibm_partition_none = 3
28625
 
+} ibm_partition_t;
28626
 
+
28627
 
+static char* part_names[] = {
28628
 
+       [ibm_partition_lnx1] = "LNX1",
28629
 
+       [ibm_partition_vol1] = "VOL1",
28630
 
+       [ibm_partition_cms1] = "CMS1",
28631
 
+       [ibm_partition_none] = "(nonl)"
28632
 
+};
28633
 
+
28634
 
+static ibm_partition_t
28635
 
+get_partition_type ( char * type )
28636
 
+{
28637
 
+       int i;
28638
 
+       for ( i = 0; i < 3; i ++) {
28639
 
+               if ( ! strncmp (type,part_names[i],4) ) 
28640
 
+                       break;
28641
 
+       }
28642
 
+        return i;
28643
 
+}
28644
 
+
28645
 
+/*
28646
 
+ * compute the block number from a 
28647
 
+ * cyl-cyl-head-head structure
28648
 
+ */
28649
 
+static inline int
28650
 
+cchh2blk (cchh_t *ptr, struct hd_geometry *geo) {
28651
 
+        return ptr->cc * geo->heads * geo->sectors +
28652
 
+              ptr->hh * geo->sectors;
28653
 
+}
28654
 
+
28655
 
+
28656
 
+/*
28657
 
+ * compute the block number from a 
28658
 
+ * cyl-cyl-head-head-block structure
28659
 
+ */
28660
 
+static inline int
28661
 
+cchhb2blk (cchhb_t *ptr, struct hd_geometry *geo) {
28662
 
+        return ptr->cc * geo->heads * geo->sectors +
28663
 
+               ptr->hh * geo->sectors +
28664
 
+               ptr->b;
28665
 
+}
28666
 
+                             
28667
 
+void print_mem( void *buffer, int length )
28668
 
+{
28669
 
+        int i, done;
28670
 
+        unsigned char *bufptr;
28671
 
+
28672
 
+        bufptr = (unsigned char *)buffer;
28673
 
+        i = done = 0;
28674
 
+        while( !done ) {
28675
 
+                if ( (i % 16) == 0 )
28676
 
+                        printk(KERN_INFO "\n0x%p->", buffer + i);
28677
 
+                printk(KERN_INFO "%02x ", bufptr[i]);
28678
 
+                if ( ++i >= length )
28679
 
+                        done++;
28680
 
+        }
28681
 
+        printk(KERN_INFO "\n");
28682
 
+}
28683
 
+
28684
 
+static int 
28685
 
+s390_probe_for_segments(
28686
 
+       evms_logical_node_t **discover_list,
28687
 
+       evms_logical_node_t *disk)
28688
 
+{
28689
 
+       char type[5] = {0,}, name[7] = {0,};
28690
 
+       int rc, vsects_per_hardsect = 0;
28691
 
+       unsigned int blk;
28692
 
+       u64 io_start;
28693
 
+       dasd_information_t *info = NULL;
28694
 
+       struct hd_geometry *geo = NULL;
28695
 
+       unchar *data = NULL;
28696
 
+       
28697
 
+       /* allocate space for DASD ioctl packet
28698
 
+        */
28699
 
+       rc = evms_cs_allocate_memory((void **)&info, sizeof(dasd_information_t));
28700
 
+       if (!rc) {
28701
 
+               LOG_DEBUG("probing '%s' for 390 DASD info...\n",
28702
 
+                           disk->name);
28703
 
+               /* issue DASD info ioctl
28704
 
+                */
28705
 
+               rc = evms_cs_kernel_ioctl(disk, BIODASDINFO, (unsigned long)info);
28706
 
+               if (rc) {
28707
 
+                       LOG_DEBUG("error(%d) from BIODASDINFO ioctl.\n", rc);
28708
 
+                       LOG_DEBUG("assuming '%s' is not a valid 390 device!\n",
28709
 
+                                   disk->name);
28710
 
+               }
28711
 
+       }
28712
 
+       if (!rc) {
28713
 
+               /* if we successfully completed the previous
28714
 
+                * get DASD info ioctl, we will assume that
28715
 
+                * the device is a valid 390 disk.
28716
 
+                *
28717
 
+                * remove it from the discover list.
28718
 
+                */
28719
 
+               rc = evms_cs_remove_logical_node_from_list(
28720
 
+                       discover_list, disk);
28721
 
+               if (rc) {
28722
 
+                       LOG_ERROR("error(%d) removing disk(%s) from discover list.\n",
28723
 
+                                 rc, disk->name);
28724
 
+               }
28725
 
+       }
28726
 
+       if (!rc)
28727
 
+               /* allocate space for the geometry packet
28728
 
+                */
28729
 
+               rc = evms_cs_allocate_memory((void **)&geo, sizeof(struct hd_geometry));
28730
 
+       if (!rc) {
28731
 
+               /* issue the Get GEO ioctl
28732
 
+                */
28733
 
+               rc = evms_cs_kernel_ioctl(disk, HDIO_GETGEO, (unsigned long)geo);
28734
 
+               if (rc) {
28735
 
+                       LOG_ERROR("error(%d) from HDIO_GETGEO ioctl.\n", rc);
28736
 
+               }
28737
 
+       }
28738
 
+       if (!rc) {
28739
 
+               /* retrieve the vsects_per_hardsect (hardsector size)
28740
 
+                */
28741
 
+               vsects_per_hardsect = disk->hardsector_size;
28742
 
+               vsects_per_hardsect >>= EVMS_VSECTOR_SIZE_SHIFT;
28743
 
+               rc = evms_cs_allocate_memory((void **)&data, EVMS_VSECTOR_SIZE);
28744
 
+       }
28745
 
+       if (!rc) {
28746
 
+               /* go read the 1st block on the disk
28747
 
+                */
28748
 
+               io_start = info->label_block * vsects_per_hardsect;
28749
 
+               rc = INIT_IO(disk, READ, io_start, 1, data);
28750
 
+               if (rc) {
28751
 
+                       LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28752
 
+                                 rc, io_start, disk->name);
28753
 
+               } else {
28754
 
+//                     print_mem(data, EVMS_VSECTOR_SIZE);
28755
 
+               }
28756
 
+       }
28757
 
+       if (!rc) {
28758
 
+               int offset, size, psize, counter = 0;
28759
 
+               format1_label_t f1;
28760
 
+               volume_label_t vlabel;
28761
 
+               ibm_partition_t partition_type;
28762
 
+
28763
 
+               /* determine the format type
28764
 
+                */
28765
 
+
28766
 
+               strncpy (type, data, 4);
28767
 
+               if ((!info->FBA_layout) && (!strcmp(info->type,"ECKD"))) {
28768
 
+                       strncpy ( name, data + 8, 6);
28769
 
+               } else {
28770
 
+                       strncpy ( name, data + 4, 6);
28771
 
+               }
28772
 
+               memcpy (&vlabel, data, sizeof(volume_label_t));
28773
 
+
28774
 
+               EBCASC(type,4);
28775
 
+               EBCASC(name,6);
28776
 
+               partition_type = get_partition_type(type);
28777
 
+               LOG_DETAILS("disk: raw type(%s), type(%s), name(%s)\n",
28778
 
+                           type, part_names[partition_type], name);
28779
 
+               switch ( partition_type ) {
28780
 
+               case ibm_partition_cms1:
28781
 
+                       if (*((long *)data + 13) != 0) {
28782
 
+                               /* disk is reserved minidisk */
28783
 
+                               long *label=(long*)data;
28784
 
+                               vsects_per_hardsect = label[3] >> EVMS_VSECTOR_SIZE_SHIFT;
28785
 
+                               offset = label[13];
28786
 
+                               size = (label[7] - 1) * vsects_per_hardsect; 
28787
 
+                               LOG_DEBUG("(MDSK)");
28788
 
+                       } else {
28789
 
+                               offset = info->label_block + 1;
28790
 
+                               size = disk->total_vsectors;
28791
 
+                       }
28792
 
+                       offset *= vsects_per_hardsect;
28793
 
+                       /* adjust for 0 thru label block offset
28794
 
+                        */
28795
 
+                       size -= offset;
28796
 
+                       rc = s390_process_segment(discover_list,
28797
 
+                                            disk,
28798
 
+                                            offset,
28799
 
+                                            size,
28800
 
+                                            0,
28801
 
+                                            1);
28802
 
+                       break;
28803
 
+               case ibm_partition_lnx1: 
28804
 
+               case ibm_partition_none:
28805
 
+                       offset = info->label_block + 1;
28806
 
+                       offset *= vsects_per_hardsect;
28807
 
+                       size = disk->total_vsectors;
28808
 
+                       /* adjust for 0 thru label block offset
28809
 
+                        */
28810
 
+                       size -= offset;
28811
 
+                       rc = s390_process_segment(discover_list,
28812
 
+                                            disk,
28813
 
+                                            offset,
28814
 
+                                            size,
28815
 
+                                            0,
28816
 
+                                            1);
28817
 
+                       break;
28818
 
+               case ibm_partition_vol1: 
28819
 
+                       /* get block number and read then first format1 label */
28820
 
+                       blk = cchhb2blk(&vlabel.vtoc, geo) + 1;
28821
 
+                       io_start = blk * vsects_per_hardsect;
28822
 
+                       rc = INIT_IO(disk, READ, io_start, 1, data);
28823
 
+                       if (rc) {
28824
 
+                               LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28825
 
+                                         rc, io_start, disk->name);
28826
 
+                               break;
28827
 
+                       } else {
28828
 
+//                             print_mem(data, EVMS_VSECTOR_SIZE);
28829
 
+                       }
28830
 
+                       memcpy (&f1, data, sizeof(format1_label_t));
28831
 
+
28832
 
+                       while (f1.DS1FMTID == _ascebc['1']) {
28833
 
+                               offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
28834
 
+                               psize  = cchh2blk(&f1.DS1EXT1.ulimit, geo) - 
28835
 
+                                       offset + geo->sectors;
28836
 
+
28837
 
+                               counter++;
28838
 
+                               rc = s390_process_segment(discover_list,
28839
 
+                                                    disk,
28840
 
+                                                    offset * vsects_per_hardsect,
28841
 
+                                                    psize * vsects_per_hardsect,
28842
 
+                                                    0,
28843
 
+                                                    counter);
28844
 
+
28845
 
+                               blk++;
28846
 
+                               io_start = blk * vsects_per_hardsect;
28847
 
+                               rc = INIT_IO(disk, READ, io_start, 1, data);
28848
 
+                               if (rc) {
28849
 
+                                       LOG_ERROR("error(%d) reading sector(%Ld) from '%s'.\n",
28850
 
+                                                 rc, io_start, disk->name);
28851
 
+                                       break;
28852
 
+                               } else {
28853
 
+//                                     print_mem(data, EVMS_VSECTOR_SIZE);
28854
 
+                               }
28855
 
+                               memcpy (&f1, data, sizeof(format1_label_t));
28856
 
+                       }
28857
 
+                       break;
28858
 
+               default:
28859
 
+                       rc = s390_process_segment(discover_list,
28860
 
+                                            disk, 0, 0, 0, 1);
28861
 
+                       break;
28862
 
+               }
28863
 
+       }
28864
 
+       if (info) {
28865
 
+               evms_cs_deallocate_memory(info);
28866
 
+       }
28867
 
+       if (geo) {
28868
 
+               evms_cs_deallocate_memory(geo);
28869
 
+       }
28870
 
+       if (data)
28871
 
+               evms_cs_deallocate_memory(data);
28872
 
+       
28873
 
+       return(rc);
28874
 
+}
28875
 
+
28876
 
+/*
28877
 
+ * Function: s390_partition_discover
28878
 
+ *
28879
 
+ */
28880
 
+static int
28881
 
+s390_partition_discover(evms_logical_node_t **discover_list)
28882
 
+{
28883
 
+        int rc = 0;
28884
 
+        evms_logical_node_t *node, *next_node;
28885
 
+
28886
 
+        LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
28887
 
+
28888
 
+        /* initialize global variable */
28889
 
+        exported_nodes = 0;
28890
 
+
28891
 
+        /* examine each node on the discover list */
28892
 
+        next_node = *discover_list;
28893
 
+        while(next_node) {
28894
 
+                node = next_node;
28895
 
+                next_node = node->next;
28896
 
+               if (GetPluginType(node->plugin->id) != EVMS_DEVICE_MANAGER)
28897
 
+                       /* only process disk nodes
28898
 
+                        */
28899
 
+                       continue;
28900
 
+                s390_probe_for_segments(discover_list, node);
28901
 
+        }
28902
 
+
28903
 
+        LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
28904
 
+                        __FUNCTION__, exported_nodes, rc);
28905
 
+        if (exported_nodes)
28906
 
+                rc = exported_nodes;
28907
 
+        return(rc);
28908
 
+}
28909
 
+
28910
 
+/*
28911
 
+ * Function: s390_partition_delete
28912
 
+ *
28913
 
+ */
28914
 
+static int
28915
 
+s390_partition_delete(evms_logical_node_t *segment)
28916
 
+{
28917
 
+        int rc = 0;
28918
 
+        local_instance_data_t *LID;
28919
 
+        evms_logical_node_t *empty_disk = NULL;
28920
 
+
28921
 
+        LOG_DETAILS("deleting segment '%s'.\n",segment->name);
28922
 
+
28923
 
+        if (!segment) {
28924
 
+                rc = -ENODEV;
28925
 
+        } else {
28926
 
+                LID = segment->instance_data;
28927
 
+                if (LID) {
28928
 
+                        /* remove the segment from the
28929
 
+                         * disk's segment list
28930
 
+                         */
28931
 
+                        rc = remove_segment_from_disk(
28932
 
+                                LID->source_disk,
28933
 
+                                segment,
28934
 
+                                &empty_disk);
28935
 
+                        /* free the local instance data */
28936
 
+                        evms_cs_deallocate_memory(LID);
28937
 
+                }
28938
 
+                /* free the segment node */
28939
 
+                evms_cs_deallocate_logical_node(segment);
28940
 
+                MOD_DEC_USE_COUNT;
28941
 
+                /* if the last segment on the disk was
28942
 
+                 * deleted, delete the disk node too
28943
 
+                 */
28944
 
+                if (empty_disk)
28945
 
+                        DELETE(empty_disk);
28946
 
+        }
28947
 
+        return(rc);
28948
 
+}
28949
 
+
28950
 
+/*
28951
 
+ * function: s390_partition_io_error
28952
 
+ *
28953
 
+ * this function was primarily created because the function
28954
 
+ * buffer_IO_error is inline and kgdb doesn't allow breakpoints
28955
 
+ * to be set on inline functions. Since this was an error path
28956
 
+ * and not mainline, I decided to add a trace statement to help
28957
 
+ * report on the failing condition.
28958
 
+ *
28959
 
+ */
28960
 
+static void
28961
 
+s390_partition_io_error(
28962
 
+        evms_logical_node_t *node,
28963
 
+        int io_flag,
28964
 
+        eio_t *eio)
28965
 
+{
28966
 
+        LOG_SERIOUS("attempt to %s beyond partition boundary(%Ld) on (%s), rsector(%Ld).\n",
28967
 
+                (io_flag) ? "WRITE" : "READ",
28968
 
+                node->total_vsectors - 1,
28969
 
+                node->name,
28970
 
+                eio->rsector);
28971
 
+
28972
 
+        EVMS_IO_ERROR(eio);
28973
 
+}
28974
 
+
28975
 
+/*
28976
 
+ * Function: s390_partition_read
28977
 
+ *
28978
 
+ */
28979
 
+static void
28980
 
+s390_partition_read(
28981
 
+        evms_logical_node_t *partition,
28982
 
+        eio_t *eio)
28983
 
+{
28984
 
+        local_instance_data_t *LID = partition->instance_data;
28985
 
+
28986
 
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
28987
 
+                eio->rsector += LID->start_sect;
28988
 
+                R_IO(LID->source_disk, eio);
28989
 
+        } else
28990
 
+                s390_partition_io_error(partition, READ, eio);
28991
 
+}
28992
 
+
28993
 
+/*
28994
 
+ * Function: s390_partition_write
28995
 
+ *
28996
 
+ */
28997
 
+static void
28998
 
+s390_partition_write(
28999
 
+        evms_logical_node_t *partition,
29000
 
+        eio_t *eio)
29001
 
+{
29002
 
+        local_instance_data_t *LID = partition->instance_data;
29003
 
+
29004
 
+        if ((eio->rsector + eio->rsize) <= partition->total_vsectors) {
29005
 
+                eio->rsector += LID->start_sect;
29006
 
+                W_IO(LID->source_disk, eio);
29007
 
+        } else
29008
 
+                s390_partition_io_error(partition, WRITE, eio);
29009
 
+}
29010
 
+
29011
 
+/*
29012
 
+ * Function: s390_partition_init_io
29013
 
+ *
29014
 
+ */
29015
 
+static int
29016
 
+s390_partition_init_io(
29017
 
+        evms_logical_node_t *partition,
29018
 
+        int                  io_flag,        /* 0=read, 1=write*/
29019
 
+        evms_sector_t        sect_nr,        /* disk LBA */
29020
 
+        evms_sector_t        num_sects,      /* # of sectors */
29021
 
+        void                *buf_addr)       /* buffer address */
29022
 
+{
29023
 
+        int rc;
29024
 
+        local_instance_data_t *LID = partition->instance_data;
29025
 
+
29026
 
+        if ((sect_nr + num_sects) <= partition->total_vsectors) {
29027
 
+                rc = INIT_IO(LID->source_disk, io_flag, sect_nr + LID->start_sect, num_sects, buf_addr);
29028
 
+        } else {
29029
 
+                LOG_SERIOUS("init_io: attempt to %s beyond partition(%s) boundary(%Ld) at sector(%Ld) for count(%Ld).\n",
29030
 
+                        (io_flag) ? "WRITE" : "READ",
29031
 
+                       partition->name,
29032
 
+                        (LID->nr_sects - 1),
29033
 
+                        sect_nr, num_sects);
29034
 
+                rc = -EINVAL;
29035
 
+        }
29036
 
+
29037
 
+        return(rc);
29038
 
+}
29039
 
+
29040
 
+/*
29041
 
+ * Function: s390_partition_ioctl
29042
 
+ *
29043
 
+ */
29044
 
+static int
29045
 
+s390_partition_ioctl (
29046
 
+        evms_logical_node_t *partition,
29047
 
+        struct inode        *inode,
29048
 
+        struct file         *file,
29049
 
+        unsigned int         cmd,
29050
 
+        unsigned long        arg)
29051
 
+{
29052
 
+        local_instance_data_t *LID;
29053
 
+        struct hd_geometry hd_geo;
29054
 
+        int rc;
29055
 
+
29056
 
+        rc = 0;
29057
 
+        LID = partition->instance_data;
29058
 
+        if (!inode)
29059
 
+                return -EINVAL;
29060
 
+        switch (cmd) {
29061
 
+                case HDIO_GETGEO:
29062
 
+                {
29063
 
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
29064
 
+                        if (rc) break;
29065
 
+                        if (copy_from_user(&hd_geo, (void *)arg, sizeof(struct hd_geometry)))
29066
 
+                                rc = -EFAULT;
29067
 
+                        if (rc) break;
29068
 
+                        hd_geo.start = LID->start_sect;
29069
 
+                        if (copy_to_user((void *)arg, &hd_geo, sizeof(struct hd_geometry)))
29070
 
+                                rc = -EFAULT;
29071
 
+                }
29072
 
+                break;
29073
 
+               case EVMS_GET_BMAP:
29074
 
+                       {
29075
 
+                               evms_get_bmap_t *bmap = (evms_get_bmap_t *)arg;
29076
 
+                               bmap->rsector += LID->start_sect;
29077
 
+                               /* intentionally fall thru to
29078
 
+                                * default ioctl down to device
29079
 
+                                * manager.
29080
 
+                                */
29081
 
+                       }
29082
 
+                default:
29083
 
+                        rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
29084
 
+        }
29085
 
+        return rc;
29086
 
+}
29087
 
+
29088
 
+/*
29089
 
+ * Function: s390_part_init
29090
 
+ *
29091
 
+ */
29092
 
+static int __init
29093
 
+s390_part_init(void)
29094
 
+{
29095
 
+        return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
29096
 
+}
29097
 
+
29098
 
+static void __exit
29099
 
+s390_part_exit(void)
29100
 
+{
29101
 
+        evms_cs_unregister_plugin(&plugin_header);
29102
 
+}
29103
 
+
29104
 
+module_init(s390_part_init);
29105
 
+module_exit(s390_part_exit);
29106
 
+#ifdef MODULE_LICENSE
29107
 
+MODULE_LICENSE("GPL");
29108
 
+#endif
29109
 
+
29110
 
diff -Naur linux-2002-03-28/drivers/evms/snapshot.c evms-2002-03-28/drivers/evms/snapshot.c
29111
 
--- linux-2002-03-28/drivers/evms/snapshot.c    Wed Dec 31 18:00:00 1969
29112
 
+++ evms-2002-03-28/drivers/evms/snapshot.c     Thu Mar 21 16:17:47 2002
29113
 
@@ -0,0 +1,1212 @@
29114
 
+/* -*- linux-c -*- */
29115
 
+
29116
 
+/*
29117
 
+ *
29118
 
+ *
29119
 
+ *   Copyright (c) International Business Machines  Corp., 2000
29120
 
+ *
29121
 
+ *   This program is free software;  you can redistribute it and/or modify
29122
 
+ *   it under the terms of the GNU General Public License as published by
29123
 
+ *   the Free Software Foundation; either version 2 of the License, or
29124
 
+ *   (at your option) any later version.
29125
 
+ *
29126
 
+ *   This program is distributed in the hope that it will be useful,
29127
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
29128
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
29129
 
+ *   the GNU General Public License for more details.
29130
 
+ *
29131
 
+ *   You should have received a copy of the GNU General Public License
29132
 
+ *   along with this program;  if not, write to the Free Software
29133
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29134
 
+ *
29135
 
+ *
29136
 
+ */
29137
 
+/*
29138
 
+ * linux/drivers/evms/snapshot.c
29139
 
+
29140
 
+ *
29141
 
+ * EVMS SnapShot Feature.
29142
 
+ *
29143
 
+ * This feature provides the ability to Snapshot ANY existing EVMS volume(including compatibility)
29144
 
+ * to a new EVMS volume that is created when the SnapShot is enabled.
29145
 
+ *
29146
 
+ * This feature will appear in the call stack for both the original and the snapshot volume.
29147
 
+ */
29148
 
+
29149
 
+#include <linux/module.h>
29150
 
+#include <linux/kernel.h>
29151
 
+#include <linux/config.h>
29152
 
+#include <linux/genhd.h>
29153
 
+#include <linux/major.h>
29154
 
+#include <linux/string.h>
29155
 
+#include <linux/blk.h>
29156
 
+#include <linux/init.h>
29157
 
+#include <linux/slab.h>
29158
 
+#include <linux/vmalloc.h>
29159
 
+#include <linux/evms/evms_kernel.h>
29160
 
+#include <linux/evms/evms_snapshot.h>
29161
 
+#include <asm/system.h>
29162
 
+#include <asm/uaccess.h>
29163
 
+
29164
 
+#define LOG_PREFIX "snapshot: "
29165
 
+
29166
 
+static struct proc_dir_entry * snap_proc = NULL;
29167
 
+
29168
 
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list );
29169
 
+static int delete_snapshot_volume( evms_logical_node_t * node );
29170
 
+static void read_snap( evms_logical_node_t     * node,
29171
 
+                       eio_t   * eio );
29172
 
+static void write_snap(        evms_logical_node_t     * node,
29173
 
+                       eio_t * eio );
29174
 
+static int init_io_snap( evms_logical_node_t   * node,
29175
 
+                       int                     io_flag,
29176
 
+                       evms_sector_t           sect_nr,
29177
 
+                       evms_sector_t           num_sects,
29178
 
+                       void                    * buf_addr );
29179
 
+static int ioctl_snap( evms_logical_node_t     * node,
29180
 
+                       struct inode            * inode,
29181
 
+                       struct file             * file,
29182
 
+                       unsigned int            cmd,
29183
 
+                       unsigned long           arg );
29184
 
+static int add_snapshot(evms_logical_node_t * node,
29185
 
+                       snapshot_metadata_t * metadata,
29186
 
+                       evms_logical_node_t ** evms_node_list );
29187
 
+static int snap_proc_read(char         * page,
29188
 
+                       char            ** start,
29189
 
+                       off_t           off,
29190
 
+                       int             count,
29191
 
+                       int             * eof,
29192
 
+                       void            * data );
29193
 
+
29194
 
+
29195
 
+/********** Required Plugin Functions **********/
29196
 
+
29197
 
+
29198
 
+static evms_plugin_function_table_t function_table = {
29199
 
+       discover: &discover_snapshot_volumes,
29200
 
+       delete  : &delete_snapshot_volume,
29201
 
+       read    : &read_snap,
29202
 
+       write   : &write_snap,
29203
 
+       init_io : &init_io_snap,
29204
 
+       ioctl   : &ioctl_snap
29205
 
+};
29206
 
+
29207
 
+
29208
 
+static evms_plugin_header_t plugin_header = {
29209
 
+       id : SetPluginID(
29210
 
+               IBM_OEM_ID,
29211
 
+               EVMS_ASSOCIATIVE_FEATURE,       // Feature class
29212
 
+               EVMS_SNAPSHOT_FEATURE_ID ),     // Unique ID within features
29213
 
+       version : {
29214
 
+               major           : 2,
29215
 
+               minor           : 0,
29216
 
+               patchlevel      : 0
29217
 
+       },
29218
 
+       required_common_services_version : {
29219
 
+               major           : EVMS_COMMON_SERVICES_MAJOR,
29220
 
+               minor           : EVMS_COMMON_SERVICES_MINOR,
29221
 
+               patchlevel      : EVMS_COMMON_SERVICES_PATCHLEVEL
29222
 
+       },
29223
 
+       function_table   : &function_table              // function table for this plugin
29224
 
+};
29225
 
+
29226
 
+/*
29227
 
+ * Function: convert_metadata
29228
 
+ *
29229
 
+ *     Performs endian conversion on metadata sector.                  
29230
 
+ */
29231
 
+static int convert_metadata( snapshot_metadata_t * metadata ){
29232
 
+
29233
 
+       metadata->chunk_size = le32_to_cpu(metadata->chunk_size);
29234
 
+       metadata->flags = le32_to_cpu(metadata->flags);
29235
 
+       metadata->lba_of_COW_table = le64_to_cpu(metadata->lba_of_COW_table);
29236
 
+       metadata->lba_of_first_chunk = le64_to_cpu(metadata->lba_of_first_chunk);
29237
 
+       metadata->original_size = le64_to_cpu(metadata->original_size);
29238
 
+        metadata->signature = le32_to_cpu(metadata->signature);
29239
 
+       metadata->total_chunks = le32_to_cpu(metadata->total_chunks);
29240
 
+        metadata->version.major = le32_to_cpu(metadata->version.major);
29241
 
+        metadata->version.minor = le32_to_cpu(metadata->version.minor);
29242
 
+        metadata->version.patchlevel = le32_to_cpu(metadata->version.patchlevel);
29243
 
+       metadata->CRC = le32_to_cpu(metadata->CRC);
29244
 
+
29245
 
+       return(0);
29246
 
+}
29247
 
+
29248
 
+/*
29249
 
+ * Function: insert_snapshot_hash_entry
29250
 
+ *
29251
 
+ *     This function inserts a new entry into a snapshot hash chain, immediately
29252
 
+ *     following the specified entry. This function should not be used to add an
29253
 
+ *     entry into an empty list, or as the first entry in an existing list. For
29254
 
+ *     that case, use insert_snapshot_map_entry_at_head().
29255
 
+ */
29256
 
+static int insert_snapshot_hash_entry( snapshot_hash_entry_t * entry,
29257
 
+                                       snapshot_hash_entry_t * base )
29258
 
+{
29259
 
+       entry->next = base->next;
29260
 
+       entry->prev = base;
29261
 
+       base->next = entry;
29262
 
+       if ( entry->next ) {
29263
 
+               entry->next->prev = entry;
29264
 
+       }
29265
 
+       return 0;
29266
 
+}
29267
 
+
29268
 
+/*
29269
 
+ * Function: insert_snapshot_hash_entry_at_head
29270
 
+ *
29271
 
+ *     This function inserts a new entry into a snapshot chain as the first
29272
 
+ *     entry in the chain.
29273
 
+ */
29274
 
+static int insert_snapshot_hash_entry_at_head( snapshot_hash_entry_t * entry,
29275
 
+                                               snapshot_hash_entry_t ** head )
29276
 
+{
29277
 
+       entry->next = *head;
29278
 
+       entry->prev = NULL;
29279
 
+       *head = entry;
29280
 
+       if ( entry->next ) {
29281
 
+               entry->next->prev = entry;
29282
 
+       }
29283
 
+       return 0;
29284
 
+}
29285
 
+
29286
 
+
29287
 
+/*
29288
 
+ * Function: set_snapshot_flags
29289
 
+ *
29290
 
+ *     Set a bit in the flags field of the metadata to mark the snapshot node
29291
 
+ *     as either disabled or full, and write the metadata sector to the 
29292
 
+ *     snapshot volume. The node passed in to this function should be the
29293
 
+ *     "lower" of the snapshot nodes, meaning the one passed into the snapshot
29294
 
+ *     plugin, not the one exported from the plugin. Currently, appropriate
29295
 
+ *     values for "flag" are EVMS_SNAPSHOT_DISABLED and EVMS_SNAPSHOT_FULL.
29296
 
+ */
29297
 
+static int set_snapshot_flags( evms_logical_node_t     * snap_node,
29298
 
+                               unsigned long           flag )
29299
 
+{
29300
 
+       unsigned char data[EVMS_VSECTOR_SIZE] = {0};
29301
 
+       snapshot_metadata_t * metadata  = (snapshot_metadata_t*)data;
29302
 
+
29303
 
+       // Read the metadata sector
29304
 
+       if ( INIT_IO( snap_node, 0, snap_node->total_vsectors-3, 1, data ) ) {
29305
 
+               return -EIO;
29306
 
+       }
29307
 
+       // Set the appropriate flag.
29308
 
+       // do endian conversion on the fly
29309
 
+       metadata->flags |= cpu_to_le32(flag);
29310
 
+       metadata->CRC = 0;
29311
 
+       metadata->CRC = evms_cs_calculate_crc(
29312
 
+               EVMS_INITIAL_CRC,
29313
 
+               metadata, sizeof(snapshot_metadata_t));
29314
 
+       // Write the metadata sector back to the volume
29315
 
+       if ( INIT_IO( snap_node, 1, snap_node->total_vsectors-3, 1, data ) ) {
29316
 
+               return -EIO;
29317
 
+       }
29318
 
+       return 0;
29319
 
+}
29320
 
+
29321
 
+
29322
 
+/*
29323
 
+ * Function: discover_snapshot_volumes
29324
 
+ *
29325
 
+ *     Inspect the global node list, looking for volumes with a valid
29326
 
+ *     snapshot metadata sector.
29327
 
+ */
29328
 
+static int discover_snapshot_volumes( evms_logical_node_t ** evms_node_list )
29329
 
+{
29330
 
+       evms_logical_node_t     * node;
29331
 
+       evms_logical_node_t     * next_node;
29332
 
+       snapshot_metadata_t     * metadata = NULL;  
29333
 
+       int                     rc = 0;
29334
 
+       int org_crc, final_crc;
29335
 
+
29336
 
+       if ( evms_cs_allocate_memory( (void**)&metadata, EVMS_VSECTOR_SIZE )) {
29337
 
+               return -ENOMEM;
29338
 
+       }
29339
 
+
29340
 
+       for ( node = *evms_node_list; node && (rc == 0); node = next_node) {
29341
 
+               next_node = node->next;
29342
 
+               // if the id of this node is ours, skip to next node because this 
29343
 
+               // must be one we put back on the list
29344
 
+               if (node->plugin->id == plugin_header.id) {
29345
 
+                       continue;
29346
 
+               }
29347
 
+               if (node->feature_header && node->feature_header->feature_id == plugin_header.id) {
29348
 
+                       // Read next to last sector for the snapshot metadata. Check for
29349
 
+                       // a valid snapshot signature.
29350
 
+                       if ( INIT_IO(node, 0, node->total_vsectors-3, 1, metadata) ) {
29351
 
+                               LOG_ERROR("IO error on  '%s' sector %Ld.\n",
29352
 
+                                       node->name, node->total_vsectors-3);
29353
 
+                               rc =  -EVMS_FEATURE_FATAL_ERROR;
29354
 
+                               evms_cs_remove_logical_node_from_list(evms_node_list,node);
29355
 
+                               DELETE(node);
29356
 
+                               break;
29357
 
+                       }
29358
 
+                       if ( le32_to_cpu(metadata->signature) == EVMS_SNAPSHOT_SIGNATURE ) {
29359
 
+                               org_crc = le32_to_cpu(metadata->CRC);
29360
 
+                               metadata->CRC = 0;
29361
 
+                               final_crc = evms_cs_calculate_crc(
29362
 
+                                       EVMS_INITIAL_CRC,
29363
 
+                                       metadata, sizeof(snapshot_metadata_t));
29364
 
+                               if (final_crc != org_crc) {
29365
 
+                                       LOG_ERROR("CRC error in feature data on '%s'.\n", node->name);
29366
 
+                                       rc = -EVMS_FEATURE_FATAL_ERROR;
29367
 
+                                       evms_cs_remove_logical_node_from_list(evms_node_list,node);
29368
 
+                                       DELETE(node);
29369
 
+                               } else{
29370
 
+                                       convert_metadata(metadata);
29371
 
+                                       if (metadata->version.major > plugin_header.version.major) {
29372
 
+                                               LOG_ERROR("ERROR: unsuppoprted version of feature in meta data on '%s'.\n",
29373
 
+                                                       node->name);
29374
 
+                                               rc = -EVMS_FEATURE_FATAL_ERROR;
29375
 
+                                               evms_cs_remove_logical_node_from_list(evms_node_list,node);
29376
 
+                                               DELETE(node);
29377
 
+                                       }else {
29378
 
+                                               rc = add_snapshot(node, metadata, evms_node_list);
29379
 
+                                       }
29380
 
+                               }
29381
 
+                       }
29382
 
+               }
29383
 
+       }
29384
 
+       if (metadata) {
29385
 
+               evms_cs_deallocate_memory(metadata);
29386
 
+       }
29387
 
+       return rc;
29388
 
+}
29389
 
+
29390
 
+
29391
 
+/*
29392
 
+ * Function: check_quiesce
29393
 
+ *
29394
 
+ *     Make sure a snapshot and it's original volume quiesced.
29395
 
+ */
29396
 
+static int check_quiesce( snapshot_volume_t * org_volume )
29397
 
+{
29398
 
+       snapshot_volume_t * next_vol;
29399
 
+       for ( next_vol = org_volume; next_vol; next_vol = next_vol->snapshot_next ) {
29400
 
+               if ( ! (next_vol->flags & EVMS_SNAPSHOT_QUIESCED) ) {
29401
 
+                       LOG_ERROR("Can't delete snapshot, volume '%s' not quiesced.\n",
29402
 
+                               next_vol->logical_node->name);
29403
 
+                       return -EBUSY;
29404
 
+               }
29405
 
+       }
29406
 
+       return 0;
29407
 
+}
29408
 
+
29409
 
+
29410
 
+/*
29411
 
+ * Function: remove_snapshot_from_chain
29412
 
+ *
29413
 
+ *     Remove the specified snapshot volume from its original's chain of
29414
 
+ *     snapshots.
29415
 
+ */
29416
 
+static int remove_snapshot_from_chain( snapshot_volume_t * snap_volume )
29417
 
+{
29418
 
+       snapshot_volume_t * org_volume = snap_volume->snapshot_org;
29419
 
+
29420
 
+       if ( org_volume ) {
29421
 
+               while ( org_volume->snapshot_next && org_volume->snapshot_next != snap_volume ) {
29422
 
+                       org_volume = org_volume->snapshot_next;
29423
 
+               }
29424
 
+               if ( org_volume->snapshot_next ) {
29425
 
+                       org_volume->snapshot_next = org_volume->snapshot_next->snapshot_next;
29426
 
+               }
29427
 
+       }
29428
 
+       snap_volume->snapshot_org = NULL;
29429
 
+       snap_volume->snapshot_next = NULL;
29430
 
+       return 0;
29431
 
+}
29432
 
+
29433
 
+
29434
 
+/*
29435
 
+ * Function: delete_snapshot_hash_chain
29436
 
+ *
29437
 
+ *     Delete all items in a single chain in the hash table.
29438
 
+ */
29439
 
+static int delete_snapshot_hash_chain( snapshot_hash_entry_t * head )
29440
 
+{
29441
 
+       snapshot_hash_entry_t * next;
29442
 
+
29443
 
+       while ( head ) {
29444
 
+               next = head->next;
29445
 
+               evms_cs_deallocate_memory(head);
29446
 
+               head = next;
29447
 
+       }
29448
 
+       return 0;
29449
 
+}
29450
 
+
29451
 
+
29452
 
+/*
29453
 
+ * Function: delete_snapshot_volume
29454
 
+ *
29455
 
+ *     Delete the in-memory representation of a volume. The specified node
29456
 
+ *     can actually be either a snapshot or an original. Deleting a snapshot
29457
 
+ *     causes it to be removed from its original's chain of snapshots.
29458
 
+ */
29459
 
+static int delete_snapshot_volume(evms_logical_node_t * node)
29460
 
+{
29461
 
+       snapshot_volume_t       * volume = (snapshot_volume_t *) node->instance_data;
29462
 
+       snapshot_volume_t       * org_volume = volume->snapshot_org;
29463
 
+       snapshot_volume_t * next_vol;
29464
 
+       int                     rc = 0;
29465
 
+       int                     i;
29466
 
+
29467
 
+       // Delete the instance data                    
29468
 
+       if ( volume ) {
29469
 
+               if (volume->flags & EVMS_SNAPSHOT) {
29470
 
+                       // This node is a snapshot. Remove it from the 
29471
 
+                       // original's list. Check all snapshots in the chain
29472
 
+                       // for quiesce before this is done.
29473
 
+                       if ( !(volume->flags & EVMS_SNAPSHOT_QUIESCED) ){
29474
 
+                               return(-EBUSY);
29475
 
+                       }
29476
 
+                       if ( volume->snapshot_org && 
29477
 
+                               !(org_volume->flags & EVMS_SNAPSHOT_QUIESCED)) {
29478
 
+                               return(-EBUSY);
29479
 
+                       }
29480
 
+
29481
 
+                       remove_snapshot_from_chain( volume );
29482
 
+
29483
 
+                       // If we just deleted the only/last snapshot for this
29484
 
+                       // original, the original will not be modified. It is
29485
 
+                       // the engine's responsibility to delete the original
29486
 
+                       // and rediscover in order to clear it of its snapshot
29487
 
+                       // information. Even if that doesn't happen, the state
29488
 
+                       // of the kernel will still be safe. I/O's coming into
29489
 
+                       // this plugin for the original will just be passed
29490
 
+                       // down without any other action or modification.
29491
 
+
29492
 
+                       // Unregister the proc-fs entry for this node.
29493
 
+                       if ( snap_proc ) {
29494
 
+                               remove_proc_entry(node->volume_info->volume_name, snap_proc);
29495
 
+                       }
29496
 
+               }
29497
 
+               else {
29498
 
+                       // This is an original. It's the engine's responsibility
29499
 
+                       // to delete all snapshots before deleting an original.
29500
 
+                       // Otherwise, a snapshot could be left pointing to an
29501
 
+                       // original that no longer exists. Thus, we just need to
29502
 
+                       // make sure there are no snapshots in the chain.
29503
 
+                       if ( (rc = check_quiesce(volume)) ) {
29504
 
+//                     if ( volume->snapshot_next ) {
29505
 
+                               return -EBUSY;
29506
 
+                       }
29507
 
+                       // loop through all snapshots left on this original, and 
29508
 
+                       // NULL out their org pointer and mark disabled, in case they don't get deleted.
29509
 
+                       for ( next_vol = volume->snapshot_next;
29510
 
+                               next_vol; next_vol = next_vol->snapshot_next ) {
29511
 
+                               next_vol->snapshot_org = NULL;
29512
 
+                               next_vol->flags |= EVMS_SNAPSHOT_DISABLED; // disable in memory only.
29513
 
+                       }
29514
 
+               }
29515
 
+
29516
 
+               // Free up all memory used by the instance data, including
29517
 
+               // the underlying node, the hash table, and the data buffer.
29518
 
+               if (volume->logical_node) {
29519
 
+                       if ( (rc = DELETE(volume->logical_node)) ) {
29520
 
+                               return(rc);
29521
 
+                       }
29522
 
+               }
29523
 
+               if (volume->snapshot_map) {
29524
 
+                       // Delete all of the hash chains, then the actual table.
29525
 
+                       for ( i = 0; i < volume->hash_table_size; i++ ) {
29526
 
+                               delete_snapshot_hash_chain( volume->snapshot_map[i] );
29527
 
+                       }
29528
 
+                       vfree(volume->snapshot_map);
29529
 
+               }
29530
 
+               if (volume->chunk_data_buffer) {
29531
 
+                       evms_cs_deallocate_memory(volume->chunk_data_buffer);
29532
 
+               }
29533
 
+
29534
 
+               evms_cs_deallocate_memory(volume);
29535
 
+       }
29536
 
+
29537
 
+       evms_cs_deallocate_logical_node(node);
29538
 
+
29539
 
+       MOD_DEC_USE_COUNT;
29540
 
+
29541
 
+       return 0;
29542
 
+}
29543
 
+
29544
 
+/*
29545
 
+ * Function: search_snapshot_hash_chain
29546
 
+ *
29547
 
+ *     This function will search the hash chain that is anchored at the
29548
 
+ *     specified head pointer. If the sector number is found, a pointer to that
29549
 
+ *     entry in the chain is set, and a 1 is returned. If the sector is not
29550
 
+ *     found, a pointer to the previous entry is set and 0 is returned. If the
29551
 
+ *     return pointer is NULL, this means either the list is empty, or the
29552
 
+ *     specified sector should become the first list item.
29553
 
+ */
29554
 
+static int search_snapshot_hash_chain( u_int64_t       chunk,
29555
 
+                                       snapshot_hash_entry_t   * head,
29556
 
+                                       snapshot_hash_entry_t   ** result )
29557
 
+{
29558
 
+       snapshot_hash_entry_t * curr = head;
29559
 
+       snapshot_hash_entry_t * prev = head;
29560
 
+       while ( curr && curr->org_chunk < chunk ) {
29561
 
+               prev = curr;
29562
 
+               curr = curr->next;
29563
 
+       }
29564
 
+       if (!curr) {    // Either an empty chain or went off the end of the chain.
29565
 
+               *result = prev;
29566
 
+               return 0;
29567
 
+       }
29568
 
+       else if ( curr->org_chunk != chunk ) {
29569
 
+               *result = curr->prev;
29570
 
+               return 0;
29571
 
+       }
29572
 
+       else {
29573
 
+               *result = curr;
29574
 
+               return 1;
29575
 
+       }
29576
 
+}
29577
 
+
29578
 
+
29579
 
+/*
29580
 
+ * Function: snapshot_remap_chunk
29581
 
+ *
29582
 
+ *     This function performs a sector remap on a snapshot volume. This should
29583
 
+ *     be called from the I/O read path, It first determines the base sector of
29584
 
+ *     the chunk containing the specified sector, and saves the remainder. Then
29585
 
+ *     it performs a search through the snapshot map for the specified volume.
29586
 
+ *     If a match is found, the sector number is changed to the new value. If
29587
 
+ *     no match is found, the value is left the same, meaning the read should
29588
 
+ *     proceed down the original volume.
29589
 
+ */
29590
 
+static int snapshot_remap_chunk(snapshot_volume_t      * snap_volume,
29591
 
+                               evms_sector_t           * sector )
29592
 
+{
29593
 
+       snapshot_hash_entry_t   * result;
29594
 
+       unsigned long           hash_value;
29595
 
+       u_int64_t       chunk;
29596
 
+       unsigned long           remainder;
29597
 
+
29598
 
+        remainder = *sector & (u_int64_t)( snap_volume->chunk_size -1);
29599
 
+        chunk = *sector >> snap_volume->chunk_shift;
29600
 
+       hash_value      = ((unsigned long)chunk) % snap_volume->hash_table_size;
29601
 
+
29602
 
+       if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &result ) ) {
29603
 
+               *sector = (result->snap_chunk << snap_volume->chunk_shift) + remainder;
29604
 
+               return 0;
29605
 
+       }
29606
 
+       return 1;
29607
 
+}
29608
 
+
29609
 
+
29610
 
+/*
29611
 
+ * Function: read_snap
29612
 
+ */
29613
 
+static void read_snap( evms_logical_node_t     * node, eio_t *eio)
29614
 
+{
29615
 
+       snapshot_volume_t       * volume = (snapshot_volume_t * ) node->instance_data;
29616
 
+
29617
 
+               // Size check
29618
 
+               if ( (eio->rsector + eio->rsize) > node->total_vsectors ) {
29619
 
+                       EVMS_IO_ERROR(eio);
29620
 
+                       return;
29621
 
+               }
29622
 
+
29623
 
+       // On a read to the original, we can just pass it through completely
29624
 
+       // untouched. Only reads to the snapshot can be broken up.
29625
 
+       if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
29626
 
+               R_IO(volume->logical_node,eio);
29627
 
+               return;
29628
 
+       }
29629
 
+
29630
 
+       // Lock the snapshot before processing the request.
29631
 
+       down(&volume->snap_semaphore);
29632
 
+
29633
 
+               // Make sure the snapshot is not full/disabled, and that
29634
 
+               // the original is present.
29635
 
+               if ( (volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL)) ||
29636
 
+                    (! volume->snapshot_org) ) {
29637
 
+                       EVMS_IO_ERROR(eio);
29638
 
+               up(&volume->snap_semaphore);
29639
 
+                       return;
29640
 
+               }
29641
 
+
29642
 
+
29643
 
+               // Check if this sector has been remapped
29644
 
+               if ( snapshot_remap_chunk(volume, &eio->rsector)){
29645
 
+                       // Has not been remapped. Send IO to the original.
29646
 
+                       R_IO(volume->snapshot_org->logical_node,eio);
29647
 
+               } else {
29648
 
+                       // Sector was remapped. Send IO to the snapshot.
29649
 
+                       R_IO(volume->logical_node,eio);
29650
 
+               }
29651
 
+
29652
 
+       up(&volume->snap_semaphore);
29653
 
+}
29654
 
+
29655
 
+
29656
 
+static int snapshot_copy_1( snapshot_volume_t * snap_volume, evms_sector_t org_sector,
29657
 
+                           u_int64_t * remap_chunk) {
29658
 
+
29659
 
+       snapshot_hash_entry_t   * target_entry;
29660
 
+       snapshot_hash_entry_t   * new_map_entry;
29661
 
+       snapshot_volume_t       * org_volume = snap_volume->snapshot_org;
29662
 
+       unsigned long           hash_value;
29663
 
+       u_int64_t               chunk;
29664
 
+       u_int32_t       io_size = snap_volume->chunk_size;
29665
 
+       int             i, iterations = 1;
29666
 
+
29667
 
+       if ( SNAPSHOT_CHUNK_BUFFER_SIZE < snap_volume->chunk_size ) {
29668
 
+               iterations = snap_volume->chunk_size / org_volume->chunk_size;
29669
 
+               io_size = org_volume->chunk_size;
29670
 
+       }
29671
 
+
29672
 
+               // Lock out this snapshot while we are remapping.
29673
 
+               down(&snap_volume->snap_semaphore);
29674
 
+
29675
 
+               // Make sure the snapshot has not been disabled.
29676
 
+               if ( snap_volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL) ) {
29677
 
+                       up(&snap_volume->snap_semaphore);
29678
 
+                       return -ENOSPC;
29679
 
+               }
29680
 
+
29681
 
+               // Search the hash table to see if this sector has already been
29682
 
+               // remapped on this snapshot.
29683
 
+               chunk = org_sector >> snap_volume->chunk_shift;
29684
 
+               hash_value = (long)chunk % snap_volume->hash_table_size;
29685
 
+               if ( search_snapshot_hash_chain( chunk, snap_volume->snapshot_map[hash_value], &target_entry ) ) {
29686
 
+                       // Chunk is already remapped.
29687
 
+                       up(&snap_volume->snap_semaphore);
29688
 
+                       *remap_chunk = target_entry->snap_chunk;
29689
 
+                       return 0;
29690
 
+               }
29691
 
+       
29692
 
+               // Is there enough room remaining on the snapshot to
29693
 
+               // remap this chunk?
29694
 
+               if ( snap_volume->next_free_chunk >= snap_volume->num_chunks ) {
29695
 
+                       // Once the snapshot becomes full, further writes to the
29696
 
+                       // original can't be remapped, and thus this snapshot
29697
 
+                       // will become "corrupted".
29698
 
+                       set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_FULL);
29699
 
+                       snap_volume->flags |= EVMS_SNAPSHOT_FULL;
29700
 
+                       up(&snap_volume->snap_semaphore);
29701
 
+                       return -ENOSPC;
29702
 
+               }
29703
 
+
29704
 
+
29705
 
+       for ( i = 0; i < iterations; i++ ) {
29706
 
+               // Read the part of all chunk from the original volume.
29707
 
+               if ( INIT_IO( org_volume->logical_node, 0, chunk * snap_volume->chunk_size + i*io_size, io_size, org_volume->chunk_data_buffer ) ) {
29708
 
+                       // An error reading from the original volume is very bad.
29709
 
+                       // If the read fails, the original write will likely fail
29710
 
+                       // as well, so let's just return an error.
29711
 
+                       up(&snap_volume->snap_semaphore);
29712
 
+                       return -EIO;
29713
 
+               }
29714
 
+
29715
 
+               // save of chunk number of the destination in snapshot of where this remap is going.
29716
 
+               *remap_chunk = snap_volume->next_free_chunk;
29717
 
+               // Write this chunk to the snapshot volume. 
29718
 
+               if ( INIT_IO( snap_volume->logical_node, 1, (snap_volume->next_free_chunk * snap_volume->chunk_size + i*io_size), io_size, org_volume->chunk_data_buffer) ) {
29719
 
+                       // An error writing to the snapshot is the same
29720
 
+                       // situation as a full snapshot.
29721
 
+                       set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29722
 
+                       snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29723
 
+                       up(&snap_volume->snap_semaphore);
29724
 
+                       LOG_ERROR("I/O error on COW on '%s' disabling snapshot.\n",
29725
 
+                               snap_volume->logical_node->name);
29726
 
+                       return -ENOSPC;
29727
 
+               }
29728
 
+       }
29729
 
+               // Fill in the appropriate COW table entry and write that
29730
 
+               // metadata sector back to the snapshot volume.
29731
 
+       // convert to little endian on disk
29732
 
+               snap_volume->cow_table[snap_volume->next_cow_entry] = cpu_to_le64(chunk);
29733
 
+               if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
29734
 
+                       // The data was written to the snapshot, but writing the
29735
 
+                       // metadata failed.
29736
 
+                       set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29737
 
+                       snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29738
 
+                       up(&snap_volume->snap_semaphore);
29739
 
+               LOG_ERROR("I/O error on COW table on '%s' disabling snapshot.\n",
29740
 
+                       snap_volume->logical_node->name);
29741
 
+                       return -ENOSPC;
29742
 
+               }
29743
 
+               snap_volume->next_cow_entry++;
29744
 
+               if ( snap_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u_int64_t)) ) {
29745
 
+                       snap_volume->next_cow_entry = 0;
29746
 
+                       snap_volume->current_cow_sector++;
29747
 
+                       memset( snap_volume->cow_table, 0xff, SECTOR_SIZE );
29748
 
+                       if ( INIT_IO( snap_volume->logical_node, 1, snap_volume->current_cow_sector, 1, snap_volume->cow_table ) ) {
29749
 
+                               // Can't clear out the next sector of metadata. This
29750
 
+                               // is bad and would kill us on a new discover, so 
29751
 
+                               // disable the snapshot now before we really screw up.
29752
 
+                               set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29753
 
+                               snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29754
 
+                               up(&snap_volume->snap_semaphore);
29755
 
+                       LOG_ERROR("I/O error on COW table init on '%s' disabling snapshot.\n",
29756
 
+                                       snap_volume->logical_node->name);
29757
 
+                               return -ENOSPC;
29758
 
+                       }
29759
 
+               }
29760
 
+
29761
 
+               // Create a new snapshot map entry and add it in the appropriate
29762
 
+               // place in the map.
29763
 
+               if ( evms_cs_allocate_memory((void **)&new_map_entry, sizeof(snapshot_hash_entry_t)) ) {
29764
 
+                       set_snapshot_flags(snap_volume->logical_node,EVMS_SNAPSHOT_DISABLED);
29765
 
+                       snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
29766
 
+                       up(&snap_volume->snap_semaphore);
29767
 
+               LOG_ERROR("no memory for remap entry, on '%s' disabling snapshot.\n",
29768
 
+                       snap_volume->logical_node->name);
29769
 
+                       return -ENOMEM;
29770
 
+               }
29771
 
+               new_map_entry->org_chunk = chunk;
29772
 
+               new_map_entry->snap_chunk = snap_volume->next_free_chunk;
29773
 
+
29774
 
+               if ( target_entry ) {   
29775
 
+                       insert_snapshot_hash_entry( new_map_entry, target_entry );
29776
 
+               }
29777
 
+               else {
29778
 
+                       insert_snapshot_hash_entry_at_head( new_map_entry, &(snap_volume->snapshot_map[hash_value]) );
29779
 
+               }
29780
 
+               snap_volume->next_free_chunk++; 
29781
 
+
29782
 
+               up(&snap_volume->snap_semaphore);
29783
 
+
29784
 
+       return 0;
29785
 
+}
29786
 
+/*
29787
 
+ * Function: snapshot_copy_data
29788
 
+ *
29789
 
+ *     On a write to a snapshotted volume, check all snapshots to see if the
29790
 
+ *     specified chunk has already been remapped. If it has not, read the
29791
 
+ *     original data from the volume, write the data to the next available
29792
 
+ *     chunk on the snapshot, update the COW table, write the COW table to
29793
 
+ *     the snapshot, and insert a new entry into the snapshot map.
29794
 
+ */
29795
 
+static int snapshot_copy_data( snapshot_volume_t       * org_volume,
29796
 
+                               evms_sector_t           org_sector)
29797
 
+{
29798
 
+       snapshot_volume_t       * snap_volume;
29799
 
+       snapshot_volume_t       * next_volume;
29800
 
+       u_int64_t               remap_chunk;  // unused here, needed for call to copy1
29801
 
+
29802
 
+       // Volumes can be snapshotted multiple times. Check every snapshot.
29803
 
+       for ( snap_volume = org_volume->snapshot_next; snap_volume; snap_volume = next_volume ) {
29804
 
+               next_volume = snap_volume->snapshot_next;
29805
 
+               snapshot_copy_1(snap_volume, org_sector, &remap_chunk);
29806
 
+
29807
 
+       }
29808
 
+
29809
 
+       return 0;
29810
 
+}
29811
 
+
29812
 
+
29813
 
+/*
29814
 
+ * Function: write_snap
29815
 
+ */
29816
 
+static void write_snap(        evms_logical_node_t     * node, eio_t * eio)
29817
 
+{
29818
 
+       snapshot_volume_t       * volume = (snapshot_volume_t *) node->instance_data;
29819
 
+       int                     rc = 0;         
29820
 
+       u_int64_t               remap_chunk;
29821
 
+       u_int64_t               remainder;
29822
 
+
29823
 
+
29824
 
+               // Size check.
29825
 
+               if ( eio->rsector + eio->rsize > node->total_vsectors) {
29826
 
+                       EVMS_IO_ERROR(eio);
29827
 
+                       return;
29828
 
+               }
29829
 
+
29830
 
+               // if this is a snapshot
29831
 
+               if ( volume->flags & EVMS_SNAPSHOT ) {
29832
 
+                       if ( volume->flags & EVMS_SNAPSHOT_WRITEABLE) { 
29833
 
+                               if (snapshot_copy_1(volume, eio->rsector, &remap_chunk)){
29834
 
+                                       EVMS_IO_ERROR(eio);
29835
 
+                               } else{
29836
 
+                                       remainder = eio->rsector & (u_int64_t)(volume->chunk_size -1);
29837
 
+                                       eio->rsector = (remap_chunk * volume->chunk_size) + remainder;
29838
 
+                                       W_IO(volume->logical_node,eio);
29839
 
+                               }
29840
 
+                       } else{
29841
 
+                               EVMS_IO_ERROR(eio);
29842
 
+                       }
29843
 
+
29844
 
+                       return;
29845
 
+               } else{ // write to original
29846
 
+                       // Remap this sector if necessary.
29847
 
+                       if ( (rc = snapshot_copy_data(volume, eio->rsector)) ) {
29848
 
+                               return;
29849
 
+                       }
29850
 
+                       W_IO(volume->logical_node,eio);
29851
 
+               }
29852
 
+       return;
29853
 
+}
29854
 
+
29855
 
+
29856
 
+/*
29857
 
+ * Function: ioctl_snap
29858
 
+ *
29859
 
+ */
29860
 
+static int ioctl_snap( evms_logical_node_t     * logical_node,
29861
 
+                       struct inode            * inode,
29862
 
+                       struct file             * file,
29863
 
+                       unsigned int            cmd,
29864
 
+                       unsigned long           arg)
29865
 
+{
29866
 
+       int rc=0;
29867
 
+       snapshot_volume_t * volume = (snapshot_volume_t*)logical_node->instance_data;
29868
 
+
29869
 
+       if (!inode || !logical_node) {
29870
 
+               return -EINVAL;
29871
 
+       }
29872
 
+       switch (cmd) {
29873
 
+       case EVMS_QUIESCE_VOLUME:
29874
 
+               {
29875
 
+                       evms_quiesce_volume_t *tmp = (evms_quiesce_volume_t*)arg;
29876
 
+                       if ( tmp->command ) {   // Quiesce
29877
 
+                               volume->flags |= EVMS_SNAPSHOT_QUIESCED;
29878
 
+                       }
29879
 
+                       else {                  // Un-quiesce
29880
 
+                               volume->flags &= ~EVMS_SNAPSHOT_QUIESCED;
29881
 
+                       }
29882
 
+               }
29883
 
+               break;
29884
 
+
29885
 
+       case EVMS_GET_BMAP:
29886
 
+               {
29887
 
+                       if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
29888
 
+                               rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29889
 
+                       }
29890
 
+                       else {
29891
 
+                               rc = -EINVAL;
29892
 
+                       }
29893
 
+               }
29894
 
+               break;
29895
 
+
29896
 
+       case EVMS_PLUGIN_IOCTL:
29897
 
+               {
29898
 
+                       evms_plugin_ioctl_t tmp, *user_parms;
29899
 
+                       int percent_full;
29900
 
+                       user_parms = (evms_plugin_ioctl_t *)arg;
29901
 
+
29902
 
+                       /* copy user's parameters to kernel space */
29903
 
+                       if (copy_from_user(&tmp, user_parms, sizeof(tmp)))
29904
 
+                               rc = -EFAULT;
29905
 
+
29906
 
+                       if (!rc) {
29907
 
+                               /* is this cmd targetted at this feature ? */
29908
 
+                               if (tmp.feature_id == logical_node->plugin->id) {
29909
 
+                                       switch(tmp.feature_command) {
29910
 
+                                       case SNAPSHOT_QUERY_PERCENT_FULL:
29911
 
+                                               if (volume->flags & EVMS_SNAPSHOT_FULL) {
29912
 
+                                                       percent_full = -1;
29913
 
+                                               } else if (volume->flags & EVMS_SNAPSHOT_DISABLED) {
29914
 
+                                                       percent_full = -2;
29915
 
+                                               } else {
29916
 
+                                                       percent_full = (volume->next_free_chunk * 100) / volume->num_chunks;
29917
 
+                                               }
29918
 
+                                               rc = copy_to_user(tmp.feature_ioctl_data, &percent_full, sizeof(percent_full));
29919
 
+                                               default:
29920
 
+                                                       break;
29921
 
+                                       }
29922
 
+                               } else { /* broadcast this cmd to all children */
29923
 
+                                               rc = IOCTL(logical_node,inode, file, cmd, arg);
29924
 
+                                               break;
29925
 
+                               }
29926
 
+                       }
29927
 
+               }
29928
 
+               break;
29929
 
+       case EVMS_CHECK_MEDIA_CHANGE:
29930
 
+       case EVMS_REVALIDATE_DISK:
29931
 
+       case EVMS_GET_DISK_LIST:
29932
 
+
29933
 
+               if (!(volume->flags & EVMS_SNAPSHOT_ORG)) {
29934
 
+                       volume = volume->snapshot_org;
29935
 
+               }
29936
 
+                       while ( volume ) {
29937
 
+                       rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29938
 
+                               volume = volume->snapshot_next;
29939
 
+               }
29940
 
+               break;
29941
 
+
29942
 
+       default:
29943
 
+               rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
29944
 
+
29945
 
+       }
29946
 
+       return rc;
29947
 
+}
29948
 
+
29949
 
+
29950
 
+static int init_io_snap(evms_logical_node_t    * node,
29951
 
+                       int                     io_flag,        /* 0=read, 1=write*/
29952
 
+                       evms_sector_t           sect_nr,        /* disk LBA */
29953
 
+                       evms_sector_t           num_sects,      /* # of sectors */
29954
 
+                       void                    * buf_addr )    /* buffer address */
29955
 
+{
29956
 
+       snapshot_volume_t * volume = (snapshot_volume_t *)(node->instance_data);
29957
 
+
29958
 
+       // no init io access to snapshot, and no writes allowed to original
29959
 
+       // since they would not be snapshotted.
29960
 
+       if (io_flag || (volume->flags & EVMS_SNAPSHOT)) {
29961
 
+               return(-EINVAL);
29962
 
+       }
29963
 
+       return INIT_IO(volume->logical_node, io_flag, sect_nr, num_sects, buf_addr);
29964
 
+}
29965
 
+
29966
 
+
29967
 
+
29968
 
+/*
29969
 
+ * Function: snapshot_init
29970
 
+ *
29971
 
+ */
29972
 
+int __init snapshot_init(void)
29973
 
+{
29974
 
+       struct proc_dir_entry * pde;
29975
 
+
29976
 
+       // Register a directory in proc-fs.
29977
 
+       pde = evms_cs_get_evms_proc_dir();
29978
 
+       if (pde) {
29979
 
+               snap_proc = create_proc_entry("snapshot", S_IFDIR, pde);
29980
 
+       }
29981
 
+
29982
 
+       return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
29983
 
+}
29984
 
+
29985
 
+/*
29986
 
+ * Function: snapshot_exit
29987
 
+ */
29988
 
+void __exit snapshot_exit(void)
29989
 
+{
29990
 
+       struct proc_dir_entry * pde;
29991
 
+
29992
 
+       // Unregister the directory in proc-fs.
29993
 
+       pde = evms_cs_get_evms_proc_dir();
29994
 
+       if (pde) {
29995
 
+               remove_proc_entry("snapshot", pde);
29996
 
+       }
29997
 
+       
29998
 
+       evms_cs_unregister_plugin(&plugin_header);
29999
 
+}
30000
 
+
30001
 
+module_init(snapshot_init);
30002
 
+module_exit(snapshot_exit);
30003
 
+#ifdef MODULE_LICENSE
30004
 
+MODULE_LICENSE("GPL");
30005
 
+#endif
30006
 
+
30007
 
+
30008
 
+/********** SnapShot Functions **********/
30009
 
+
30010
 
+
30011
 
+
30012
 
+/*
30013
 
+ * Function: add_cow_entry_to_snapshot_map
30014
 
+ *
30015
 
+ *     This function takes a cow table entry (from the on-disk data), and
30016
 
+ *     converts it into an appropriate entry for the snapshot map, and
30017
 
+ *     inserts it into the appropriate map for the specified volume.
30018
 
+ */
30019
 
+static int add_cow_entry_to_snapshot_map( u_int64_t    org_chunk,
30020
 
+                                       u_int64_t       snap_chunk,
30021
 
+                                       snapshot_volume_t       * volume )
30022
 
+{
30023
 
+       snapshot_hash_entry_t   * new_entry;
30024
 
+       snapshot_hash_entry_t   * target_entry;
30025
 
+       unsigned long           hash_value;
30026
 
+
30027
 
+       evms_cs_allocate_memory((void **)&new_entry,sizeof (snapshot_hash_entry_t));
30028
 
+       if (!new_entry) {
30029
 
+               return -ENOMEM;
30030
 
+       }
30031
 
+       new_entry->org_chunk = org_chunk;
30032
 
+       new_entry->snap_chunk = snap_chunk;
30033
 
+
30034
 
+       hash_value = (long)org_chunk % volume->hash_table_size;
30035
 
+       if ( search_snapshot_hash_chain( org_chunk, volume->snapshot_map[hash_value], &target_entry ) ) {       
30036
 
+               // This means a duplicate mapping was found. This should not happen.
30037
 
+       }
30038
 
+       else {
30039
 
+               if ( target_entry ) {
30040
 
+                       insert_snapshot_hash_entry( new_entry, target_entry );
30041
 
+               }
30042
 
+               else {
30043
 
+                       insert_snapshot_hash_entry_at_head( new_entry, &(volume->snapshot_map[hash_value]) );
30044
 
+               }
30045
 
+       }
30046
 
+       return 0;
30047
 
+}
30048
 
+
30049
 
+
30050
 
+/*
30051
 
+ * Function: build_snapshot_maps
30052
 
+ *
30053
 
+ *     Construct the initial hash table state based on 
30054
 
+ *     existing COW tables on the disk.
30055
 
+ */
30056
 
+static int build_snapshot_maps(snapshot_volume_t * volume)
30057
 
+{
30058
 
+       int rc = 0;
30059
 
+       int done = 0;
30060
 
+               while (!done) {
30061
 
+
30062
 
+                       // Read in one sector's worth of COW tables.
30063
 
+                       if ( INIT_IO(volume->logical_node, 0, volume->current_cow_sector, 1, volume->cow_table) ) {
30064
 
+                               return -EIO;
30065
 
+                       }
30066
 
+                       // Translate every valid COW table entry into
30067
 
+                       // a snapshot map entry.
30068
 
+                       for ( volume->next_cow_entry = 0;
30069
 
+                             volume->next_cow_entry < (SECTOR_SIZE/sizeof(u_int64_t)) &&
30070
 
+                             volume->cow_table[volume->next_cow_entry] != 0xffffffffffffffff;
30071
 
+                             volume->next_cow_entry++, volume->next_free_chunk++ ) {
30072
 
+                               if ( (rc = add_cow_entry_to_snapshot_map( le64_to_cpu(volume->cow_table[volume->next_cow_entry]),
30073
 
+                                                                 volume->next_free_chunk, volume ))) {
30074
 
+                                       return(rc);
30075
 
+                               }
30076
 
+                       }
30077
 
+                       // Move on to the next sector if necessary.
30078
 
+                       if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u_int64_t)) ) {
30079
 
+                               volume->current_cow_sector++;
30080
 
+                       }
30081
 
+                       else {
30082
 
+                               done = 1;
30083
 
+                       }
30084
 
+               }
30085
 
+       return 0;
30086
 
+}
30087
 
+
30088
 
+
30089
 
+/*
30090
 
+ * Function:  add_snapshot
30091
 
+ *
30092
 
+ *     Initializes a snapshot instance and exports an evms_logical_node to
30093
 
+ *     the global list.
30094
 
+ */
30095
 
+static int add_snapshot(evms_logical_node_t * snap_node,
30096
 
+                       snapshot_metadata_t * metadata,
30097
 
+                       evms_logical_node_t ** evms_node_list )
30098
 
+{
30099
 
+       evms_logical_node_t     * new_snap_node;
30100
 
+       evms_logical_node_t     * new_org_node;
30101
 
+       evms_logical_node_t     * org_node;
30102
 
+       snapshot_volume_t       * snap_volume;
30103
 
+       snapshot_volume_t       * org_volume;
30104
 
+       snapshot_volume_t       * tmp_volume;
30105
 
+       int                     rc = 0;
30106
 
+
30107
 
+       evms_cs_remove_logical_node_from_list(evms_node_list,snap_node);
30108
 
+
30109
 
+       // Make sure the snapshot is not full or disabled.
30110
 
+       if ( metadata->flags & (EVMS_SNAPSHOT_DISABLED | EVMS_SNAPSHOT_FULL) ) {
30111
 
+               LOG_WARNING("Error: Snapshot %s discovered as disabled/full.\n", snap_node->name);
30112
 
+               LOG_WARNING("       Deleting from further use.\n");
30113
 
+               DELETE(snap_node);
30114
 
+               return -ENOSPC;
30115
 
+       }
30116
 
+
30117
 
+       // Inspect the global list until a node is found with the name of
30118
 
+       // this snapshot's original. There can only be one original for
30119
 
+       // each snapshot.
30120
 
+       for ( org_node = *evms_node_list;
30121
 
+             org_node &&
30122
 
+             strncmp(EVMS_GET_NODE_NAME(org_node), metadata->original_volume, EVMS_VOLUME_NAME_SIZE);
30123
 
+             org_node = org_node->next ) {
30124
 
+               ;
30125
 
+       }
30126
 
+       if (!org_node) {
30127
 
+               // No original was found. Disable and delete the snapshot.
30128
 
+               LOG_WARNING("Error: No original found for snapshot %s, looking for %s\n", snap_node->name,metadata->original_volume);
30129
 
+               set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30130
 
+               DELETE(snap_node);
30131
 
+               return -ENODEV;
30132
 
+       }
30133
 
+
30134
 
+       LOG_EXTRA("Adding snapshot for volume %s\n",org_node->name);
30135
 
+
30136
 
+       // ok, we found the original on the list.
30137
 
+       // verify the size to be sure the name didn't change for compatibility
30138
 
+       if (org_node->total_vsectors != metadata->original_size) {
30139
 
+               LOG_WARNING("Error: Original volume size does not match\n");
30140
 
+               LOG_WARNING("         vol=%s: org_size=%d, current size=%d\n",
30141
 
+                       org_node->name, (int)(metadata->original_size), (int)(org_node->total_vsectors));
30142
 
+               // The snapshot no longer points at a valid original.
30143
 
+               // Disable and delete the snapshot.
30144
 
+               set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30145
 
+               DELETE(snap_node);
30146
 
+               return -ENODEV;
30147
 
+       }
30148
 
+
30149
 
+       // New EVMS node for the snapshot
30150
 
+       if ( evms_cs_allocate_logical_node( &new_snap_node ) ) {
30151
 
+               set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30152
 
+               DELETE( snap_node );
30153
 
+               return -ENOMEM;
30154
 
+       }
30155
 
+
30156
 
+       MOD_INC_USE_COUNT;
30157
 
+
30158
 
+       // Instance data for the snapshot
30159
 
+       if ( evms_cs_allocate_memory( (void**)&snap_volume, sizeof(snapshot_volume_t) )) {
30160
 
+               delete_snapshot_volume( new_snap_node );
30161
 
+               set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30162
 
+               DELETE( snap_node );
30163
 
+               return -ENOMEM;
30164
 
+       }
30165
 
+
30166
 
+       // Initialize the snapshot node
30167
 
+       if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
30168
 
+               new_snap_node->flags            = snap_node->flags;
30169
 
+       }else { // if not writeable, set read only
30170
 
+               new_snap_node->flags            = snap_node->flags | EVMS_VOLUME_SET_READ_ONLY;
30171
 
+       }
30172
 
+       new_snap_node->flags            = new_snap_node->flags | 
30173
 
+               (org_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30174
 
+       new_snap_node->system_id        = 0x536e4170;                   // SnAp 
30175
 
+       new_snap_node->total_vsectors   = org_node->total_vsectors;     // Lying about the size.
30176
 
+       new_snap_node->block_size       = snap_node->block_size;
30177
 
+       new_snap_node->hardsector_size  = snap_node->hardsector_size;
30178
 
+       new_snap_node->plugin           = &plugin_header;
30179
 
+       new_snap_node->instance_data    = (void*)snap_volume;   
30180
 
+       // Get the new node's name from the consumed node's feature
30181
 
+       // header.
30182
 
+       strcpy(new_snap_node->name, snap_node->feature_header->object_name);
30183
 
+       // No problem with propagating the volume name up.
30184
 
+       new_snap_node->volume_info = snap_node->volume_info;
30185
 
+
30186
 
+       // Initialize the instance data
30187
 
+       snap_volume->logical_node       = snap_node;
30188
 
+       snap_volume->chunk_size         = metadata->chunk_size;
30189
 
+       snap_volume->chunk_shift        = evms_cs_log2((u_int64_t)metadata->chunk_size);
30190
 
+       snap_volume->num_chunks         = metadata->total_chunks;
30191
 
+       snap_volume->current_cow_sector = metadata->lba_of_COW_table;
30192
 
+       snap_volume->hash_table_size    = (metadata->total_chunks)/MAX_HASH_CHAIN_ENTRIES + 1;
30193
 
+       snap_volume->flags              = EVMS_SNAPSHOT;
30194
 
+       if (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) {
30195
 
+               snap_volume->flags |= EVMS_SNAPSHOT_WRITEABLE;
30196
 
+       }
30197
 
+
30198
 
+       // Snapshot hash table
30199
 
+               snap_volume->snapshot_map = vmalloc(snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
30200
 
+       if ( !snap_volume->snapshot_map) {
30201
 
+               set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30202
 
+                       delete_snapshot_volume( new_snap_node );
30203
 
+                       return -ENOMEM;
30204
 
+       }
30205
 
+
30206
 
+       memset(snap_volume->snapshot_map, 0, snap_volume->hash_table_size * sizeof(snapshot_hash_entry_t*));
30207
 
+
30208
 
+       if ( (rc = build_snapshot_maps(snap_volume)) ){
30209
 
+               set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30210
 
+               delete_snapshot_volume( new_snap_node );
30211
 
+               return(rc);
30212
 
+       }
30213
 
+
30214
 
+       // check to see if the node we found is one we put back on the list due to 
30215
 
+       // another snapshot of the original, if so then don't allocate a new 
30216
 
+       // node and volume info, just get the old
30217
 
+       if (org_node->plugin->id != plugin_header.id) {
30218
 
+
30219
 
+               // New EVMS node for the original
30220
 
+               if ( evms_cs_allocate_logical_node( &new_org_node ) ) {
30221
 
+                       set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30222
 
+                       delete_snapshot_volume( new_snap_node );
30223
 
+                       return -ENOMEM;
30224
 
+               }
30225
 
+
30226
 
+               MOD_INC_USE_COUNT;
30227
 
+
30228
 
+               // Instance data for the original
30229
 
+               if ( evms_cs_allocate_memory( (void**)&org_volume, sizeof(snapshot_volume_t) )) {
30230
 
+                       set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30231
 
+                       delete_snapshot_volume( new_snap_node );
30232
 
+                       delete_snapshot_volume( new_org_node );
30233
 
+                       return -ENOMEM;
30234
 
+               }
30235
 
+
30236
 
+               // Initialize the new node
30237
 
+               new_org_node->flags             = org_node->flags | 
30238
 
+                       (snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30239
 
+               new_org_node->system_id         = 0x4f724967;   // OrIg 
30240
 
+               new_org_node->total_vsectors    = org_node->total_vsectors;
30241
 
+               new_org_node->block_size        = org_node->block_size;
30242
 
+               new_org_node->hardsector_size   = org_node->hardsector_size;
30243
 
+               new_org_node->plugin            = &plugin_header;
30244
 
+               new_org_node->instance_data     = (void*)org_volume;    
30245
 
+               // Must reuse the original node's name
30246
 
+               strcpy(new_org_node->name, org_node->name);
30247
 
+               new_org_node->volume_info = org_node->volume_info;
30248
 
+
30249
 
+               // Initialize the instance data
30250
 
+               org_volume->chunk_size          = SNAPSHOT_CHUNK_BUFFER_SIZE;
30251
 
+               org_volume->num_chunks          = 0;
30252
 
+               org_volume->current_cow_sector  = 0;
30253
 
+               org_volume->flags               = EVMS_SNAPSHOT_ORG;
30254
 
+               org_volume->snapshot_next       = snap_volume;
30255
 
+               snap_volume->snapshot_next      = NULL;
30256
 
+
30257
 
+               // Buffer for copying data from the original to the snapshot
30258
 
+               if ( evms_cs_allocate_memory( (void**)(&org_volume->chunk_data_buffer), SNAPSHOT_CHUNK_BUFFER_SIZE * SECTOR_SIZE)) {
30259
 
+                       set_snapshot_flags(snap_node,EVMS_SNAPSHOT_DISABLED);
30260
 
+                       delete_snapshot_volume( new_snap_node );
30261
 
+                       delete_snapshot_volume( new_org_node );
30262
 
+                       return -ENOMEM;
30263
 
+               }
30264
 
+
30265
 
+               // remove the original volume from the global list, then
30266
 
+               // add the new version of the original to the global list.
30267
 
+               evms_cs_remove_logical_node_from_list(evms_node_list,org_node);
30268
 
+               org_volume->logical_node = org_node;
30269
 
+               evms_cs_add_logical_node_to_list(evms_node_list,new_org_node);
30270
 
+
30271
 
+       } else {
30272
 
+               // There is already at least one snapshot for this original.
30273
 
+               new_org_node    = org_node;
30274
 
+               org_volume      = (snapshot_volume_t*)org_node->instance_data;
30275
 
+
30276
 
+               // propagate the flags from the new snapshot node to the original, and then to every other snapshot
30277
 
+               for (tmp_volume=org_volume; tmp_volume;tmp_volume=tmp_volume->snapshot_next) {
30278
 
+                       tmp_volume->logical_node->flags         = org_node->flags | 
30279
 
+                               (snap_node->flags &(EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
30280
 
+               }
30281
 
+               // Insert the new snapshot at the start of the original's chain.
30282
 
+               snap_volume->snapshot_next      = org_volume->snapshot_next;
30283
 
+               org_volume->snapshot_next       = snap_volume;
30284
 
+       }
30285
 
+
30286
 
+       if ( snap_proc ) {
30287
 
+               create_proc_read_entry(snap_node->feature_header->volume_name, S_IFREG, snap_proc, snap_proc_read, new_snap_node);
30288
 
+       }
30289
 
+
30290
 
+       init_MUTEX( &snap_volume->snap_semaphore );
30291
 
+       snap_volume->snapshot_org = org_volume;
30292
 
+       evms_cs_add_logical_node_to_list(evms_node_list,new_snap_node);
30293
 
+
30294
 
+       return 0;
30295
 
+}
30296
 
+
30297
 
+
30298
 
+
30299
 
+/* Function: snap_proc_read
30300
 
+ *
30301
 
+ *     Callback function for the proc-fs entry for each snapshot node.
30302
 
+ *     Print out pertinent information about this snapshot. The "data"
30303
 
+ *     parameter is a pointer to an EVMS logical node.
30304
 
+ */
30305
 
+static int snap_proc_read(char         * page,
30306
 
+                       char            ** start,
30307
 
+                       off_t           off,
30308
 
+                       int             count,
30309
 
+                       int             * eof,
30310
 
+                       void            * data )
30311
 
+{
30312
 
+       evms_logical_node_t     * snap_node = data;
30313
 
+       snapshot_volume_t       * snap_volume = snap_node->instance_data;
30314
 
+       int                     sz = 0;
30315
 
+
30316
 
+       PROCPRINT("Snapshot of    : %s\n",      (snap_volume->snapshot_org) ? EVMS_GET_NODE_NAME(snap_volume->snapshot_org->logical_node) : "Unknown");
30317
 
+       PROCPRINT("Size (KB)      : %ld\n",     (snap_volume->num_chunks * snap_volume->chunk_size)/2);
30318
 
+       PROCPRINT("Chunk Size (KB): %ld\n",     (snap_volume->chunk_size)/2);
30319
 
+       PROCPRINT("Writeable      : %s\n",      (snap_volume->flags & EVMS_SNAPSHOT_WRITEABLE) ? "True" : "False");
30320
 
+       PROCPRINT("Usage          : %ld%%\n",   (snap_volume->next_free_chunk * 100) / snap_volume->num_chunks);
30321
 
+       PROCPRINT("Status         : %s\n",      (snap_volume->flags & EVMS_SNAPSHOT_FULL) ? "Full / Disabled" : (snap_volume->flags & EVMS_SNAPSHOT_DISABLED) ? "Disabled" : "Active");
30322
 
+
30323
 
+       return sz;
30324
 
+}
30325
 
+
30326
 
diff -Naur linux-2002-03-28/include/linux/evms/evms.h evms-2002-03-28/include/linux/evms/evms.h
30327
 
--- linux-2002-03-28/include/linux/evms/evms.h  Wed Dec 31 18:00:00 1969
30328
 
+++ evms-2002-03-28/include/linux/evms/evms.h   Mon Mar 25 15:51:13 2002
30329
 
@@ -0,0 +1,279 @@
30330
 
+/* -*- linux-c -*- */
30331
 
+/*
30332
 
+ *
30333
 
+ *   Copyright (c) International Business Machines  Corp., 2000
30334
 
+ *
30335
 
+ *   This program is free software;  you can redistribute it and/or modify
30336
 
+ *   it under the terms of the GNU General Public License as published by
30337
 
+ *   the Free Software Foundation; either version 2 of the License, or 
30338
 
+ *   (at your option) any later version.
30339
 
+ * 
30340
 
+ *   This program is distributed in the hope that it will be useful,
30341
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
30342
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
30343
 
+ *   the GNU General Public License for more details.
30344
 
+ *
30345
 
+ *   You should have received a copy of the GNU General Public License
30346
 
+ *   along with this program;  if not, write to the Free Software 
30347
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30348
 
+ */
30349
 
+/*
30350
 
+ * linux/include/linux/evms/evms.h
30351
 
+ *
30352
 
+ * EVMS public kernel header file
30353
 
+ *
30354
 
+ */
30355
 
+
30356
 
+#ifndef __EVMS_INCLUDED__
30357
 
+#define __EVMS_INCLUDED__
30358
 
+
30359
 
+#include <linux/genhd.h>
30360
 
+#include <linux/fs.h>
30361
 
+#include <linux/iobuf.h>
30362
 
+#include <linux/kdev_t.h>
30363
 
+#include <linux/hdreg.h>
30364
 
+#include <linux/slab.h>
30365
 
+#include <linux/proc_fs.h>
30366
 
+
30367
 
+#define FALSE                           0
30368
 
+#define TRUE                            1
30369
 
+
30370
 
+/* tracing info */
30371
 
+#define EVMS_INFO_CRITICAL              0
30372
 
+#define EVMS_INFO_SERIOUS               1
30373
 
+#define EVMS_INFO_ERROR                 2
30374
 
+#define EVMS_INFO_WARNING               3
30375
 
+#define EVMS_INFO_DEFAULT               5
30376
 
+#define EVMS_INFO_DETAILS               6
30377
 
+#define EVMS_INFO_DEBUG                 7
30378
 
+#define EVMS_INFO_EXTRA                 8
30379
 
+#define EVMS_INFO_ENTRY_EXIT            9
30380
 
+#define EVMS_INFO_EVERYTHING            10
30381
 
+
30382
 
+extern int evms_info_level;
30383
 
+/* information message: e.g., configuration, major event */
30384
 
+#define evmsTRACE(info_level,prspec) { if (evms_info_level >= info_level) printk prspec; }
30385
 
+#define evmsTRACE2(info_level,statement) { if (evms_info_level >= info_level) statement; }
30386
 
+// sample - be sure to use enclose "prspec" or "statement" with parens ()
30387
 
+// evmsTRACE(info_level,(KERN_INFO "evms_myfunction: name = %s\n", name));
30388
 
+// evmsTRACE2(info_level,(print_mem( buffer_address, buffer_length)));
30389
 
+                                                            
30390
 
+/* LOG MACROS to make evms log messages look much 
30391
 
+ * cleaner in the source.
30392
 
+ */
30393
 
+#define EVMS_LOG_PREFIX "evms: "
30394
 
+#define LOG_CRITICAL(msg, args...)     evmsTRACE(EVMS_INFO_CRITICAL,   (KERN_CRIT    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30395
 
+#define LOG_SERIOUS(msg, args...)      evmsTRACE(EVMS_INFO_SERIOUS,    (KERN_ERR     EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30396
 
+#define LOG_ERROR(msg, args...)                evmsTRACE(EVMS_INFO_ERROR,      (KERN_ERR     EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30397
 
+#define LOG_WARNING(msg, args...)      evmsTRACE(EVMS_INFO_WARNING,    (KERN_WARNING EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30398
 
+#define LOG_DEFAULT(msg, args...)      evmsTRACE(EVMS_INFO_DEFAULT,    (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30399
 
+#define LOG_DETAILS(msg, args...)      evmsTRACE(EVMS_INFO_DETAILS,    (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30400
 
+#define LOG_DEBUG(msg, args...)                evmsTRACE(EVMS_INFO_DEBUG,      (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30401
 
+#define LOG_EXTRA(msg, args...)                evmsTRACE(EVMS_INFO_EXTRA,      (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30402
 
+#define LOG_ENTRY_EXIT(msg, args...)   evmsTRACE(EVMS_INFO_ENTRY_EXIT, (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30403
 
+#define LOG_EVERYTHING(msg, args...)   evmsTRACE(EVMS_INFO_EVERYTHING, (KERN_INFO    EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
30404
 
+
30405
 
+#ifdef CONFIG_PROC_FS
30406
 
+#define PROCPRINT(msg, args...) (sz += sprintf(page + sz, msg, ## args))
30407
 
+#endif
30408
 
+
30409
 
+#define EVMS_HANDLE_KEY         0x89ABCDEF
30410
 
+
30411
 
+/* Plugin structure definitions */
30412
 
+
30413
 
+typedef struct evms_plugin_header_s {
30414
 
+        u_int32_t                 id;
30415
 
+        evms_version_t            version;
30416
 
+        evms_version_t            required_common_services_version;
30417
 
+        struct evms_plugin_function_table_s  *function_table;
30418
 
+} evms_plugin_header_t;
30419
 
+
30420
 
+typedef struct evms_volume_info_s {
30421
 
+/*  0*/ u_int64_t               volume_serial_number;
30422
 
+/*  8*/ u_int32_t               volume_system_id;       /* the minor is stored here */
30423
 
+/* 12*/ char                    volume_name[EVMS_VOLUME_NAME_SIZE+1];
30424
 
+/*140*/
30425
 
+} evms_volume_info_t;
30426
 
+
30427
 
+/* flags field bit definitions in evms_common.h */
30428
 
+/* iflags field used internally by the kernel only */
30429
 
+#define EVMS_FEATURE_BOTTOM                    (1<<0)
30430
 
+typedef struct evms_logical_node_s {
30431
 
+/*  0*/ evms_sector_t                    total_vsectors;
30432
 
+/*  8*/ evms_plugin_header_t           * plugin;
30433
 
+/* 12*/ void                           * instance_data;         /* ptr to private instance data */
30434
 
+/* 16*/ unsigned int                     flags;
30435
 
+/* 20*/        unsigned int                     iflags;
30436
 
+/* 24*/        int                              hardsector_size;
30437
 
+/* 28*/        int                              block_size;
30438
 
+/* 32*/ unsigned int                    system_id;
30439
 
+/* 36*/        evms_volume_info_t             * volume_info;
30440
 
+/* 40*/ evms_feature_header_t          * feature_header;
30441
 
+/* 44*/ struct evms_logical_node_s     * next;
30442
 
+/* 48*/        char                             name[EVMS_VOLUME_NAME_SIZE+1];
30443
 
+/*176*/
30444
 
+} evms_logical_node_t;
30445
 
+
30446
 
+/* this macro will retrieve the appropriate kernel node name
30447
 
+ * based on the node type.
30448
 
+ */
30449
 
+#define EVMS_GET_NODE_NAME(node)                               \
30450
 
+       ((node->flags & EVMS_VOLUME_FLAG) ?                     \
30451
 
+               node->volume_info->volume_name :                \
30452
 
+               node->name)
30453
 
+
30454
 
+/* bit definitions of FLAGS field in logical volume struct */
30455
 
+/* NOTE: these bit field definitions can be found in 
30456
 
+ * evms_ioctl.h above the evms_volume_data_t structure
30457
 
+ */
30458
 
+typedef struct evms_logical_volume_s {
30459
 
+        char                           * name;                  /* devfs name if any */
30460
 
+        evms_logical_node_t            * node;                  /* ptr to top logical node */
30461
 
+       int                              flags;
30462
 
+        int                              quiesced;
30463
 
+       int                              vfs_quiesced;
30464
 
+        atomic_t                         requests_in_progress;
30465
 
+        wait_queue_head_t                wait_queue;
30466
 
+        devfs_handle_t                   devfs_handle;
30467
 
+#ifdef CONFIG_SMP
30468
 
+       request_queue_t                  request_queue;
30469
 
+#endif
30470
 
+} evms_logical_volume_t;
30471
 
+
30472
 
+/* EVMS generic I/O structure */
30473
 
+typedef struct eio_s {
30474
 
+       evms_sector_t           rsector;
30475
 
+       evms_sector_t           rsize;
30476
 
+       struct buffer_head     *bh;
30477
 
+} eio_t;
30478
 
+
30479
 
+/* Abstraction MACROs */
30480
 
+#define EVMS_IO_ERROR(eio) (buffer_IO_error(eio->bh))
30481
 
+
30482
 
+/*
30483
 
+ * The following function table is used for all plugins.
30484
 
+ */
30485
 
+typedef struct evms_plugin_function_table_s {
30486
 
+        int  (* discover)(evms_logical_node_t **);
30487
 
+        int  (* end_discover)(evms_logical_node_t **);
30488
 
+        int  (* delete)  (evms_logical_node_t *);
30489
 
+        void (* read)    (evms_logical_node_t *, eio_t *);
30490
 
+        void (* write)   (evms_logical_node_t *, eio_t *);
30491
 
+        int  (* init_io) (evms_logical_node_t *, int, evms_sector_t, 
30492
 
+                          evms_sector_t, void *);
30493
 
+        int  (* ioctl)   (evms_logical_node_t *, struct inode *, 
30494
 
+                          struct file *, unsigned int, unsigned long);
30495
 
+       int  (* direct_ioctl)(struct inode *, struct file *,
30496
 
+                             unsigned int, unsigned long);
30497
 
+} evms_plugin_function_table_t;
30498
 
+
30499
 
+/* 
30500
 
+ * These macros facilitate easier use of the 
30501
 
+ * entry points in the function table
30502
 
+ */
30503
 
+#define DISCOVER(node, list) ((node)->plugin->function_table->discover(list))
30504
 
+#define END_DISCOVER(node, list) ((node)->plugin->function_table->end_discover(list))
30505
 
+#define DELETE(node) ((node)->plugin->function_table->delete(node))
30506
 
+#define R_IO(node, eio)  ((node)->plugin->function_table->read(node, eio))
30507
 
+#define W_IO(node, eio)  ((node)->plugin->function_table->write(node, eio))
30508
 
+#define INIT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->init_io(node, rw_flag, start_sec, num_secs, buf_addr))
30509
 
+#define INT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->function_table->int_io(node, rw_flag, start_sec, num_secs, buf_addr))
30510
 
+#define IOCTL(node, inode, file, cmd, arg)    ((node)->plugin->function_table->ioctl(node, inode, file, cmd, arg))
30511
 
+#define DIRECT_IOCTL(reg_record, inode, file, cmd, arg)   ((reg_record)->plugin->function_table->direct_ioctl(inode, file, cmd, arg))
30512
 
+
30513
 
+typedef struct evms_list_node_s {
30514
 
+       void                    *item;
30515
 
+       struct evms_list_node_s *next;
30516
 
+} evms_list_node_t;
30517
 
+
30518
 
+/* pool management structure */
30519
 
+typedef struct evms_pool_mgmt_s {
30520
 
+       kmem_cache_t    *cachep;
30521
 
+       int              member_size;
30522
 
+       void            *head;
30523
 
+       atomic_t         waiters;
30524
 
+       wait_queue_head_t wait_queue;
30525
 
+       /* WARNING!!! pool name MUST be less than 20 chars */
30526
 
+       char            *name;
30527
 
+} evms_pool_mgmt_t;
30528
 
+
30529
 
+/*
30530
 
+ * Notes:  
30531
 
+ *     All of the following kernel thread functions belong to EVMS base.
30532
 
+ *     These functions were copied from md_core.c
30533
 
+ */
30534
 
+#define EVMS_THREAD_WAKEUP 0
30535
 
+typedef struct evms_thread_s {
30536
 
+       void                    (*run) (void *data);
30537
 
+       void                    *data;
30538
 
+       wait_queue_head_t       wqueue;
30539
 
+       unsigned long           flags;
30540
 
+       struct completion       *event;
30541
 
+       struct task_struct      *tsk;
30542
 
+       const char              *name;
30543
 
+} evms_thread_t;
30544
 
+
30545
 
+/* EVMS (common services) exported functions prototypes */
30546
 
+#define EVMS_COMMON_SERVICES_MAJOR              0
30547
 
+#define EVMS_COMMON_SERVICES_MINOR              6
30548
 
+#define EVMS_COMMON_SERVICES_PATCHLEVEL         0
30549
 
+
30550
 
+void evms_cs_get_version(int *, int *);
30551
 
+int evms_cs_check_version(evms_version_t *, evms_version_t *);
30552
 
+int evms_cs_register_plugin(evms_plugin_header_t *);
30553
 
+int evms_cs_unregister_plugin(evms_plugin_header_t *);
30554
 
+#ifdef EVMS_MEM_DEBUG
30555
 
+int evms_cs_verify_memory_integrity(int);
30556
 
+#endif
30557
 
+int evms_cs_allocate_memory(void **, int);
30558
 
+int evms_cs_deallocate_memory(void *);
30559
 
+int evms_cs_allocate_logical_node(evms_logical_node_t **);
30560
 
+void evms_cs_deallocate_volume_info(evms_logical_node_t *);
30561
 
+int evms_cs_deallocate_logical_node(evms_logical_node_t *);
30562
 
+int evms_cs_add_logical_node_to_list(evms_logical_node_t **, 
30563
 
+                                     evms_logical_node_t *);
30564
 
+int evms_cs_remove_logical_node_from_list(evms_logical_node_t **,
30565
 
+                                          evms_logical_node_t *);
30566
 
+int evms_cs_kernel_ioctl(evms_logical_node_t *, unsigned int, 
30567
 
+                         unsigned long);
30568
 
+int evms_cs_get_hardsect_size(evms_logical_node_t *, int *);
30569
 
+int evms_cs_get_blocksize_size(evms_logical_node_t *, int *);
30570
 
+unsigned long evms_cs_size_in_sectors(unsigned long, unsigned long);
30571
 
+unsigned long evms_cs_size_in_vsectors(long long);
30572
 
+int evms_cs_log2(long long);
30573
 
+u_int32_t evms_cs_calculate_crc(u_int32_t, void *, u_int32_t);
30574
 
+int evms_cs_register_for_end_io_notification(void *,
30575
 
+                                             struct buffer_head *,
30576
 
+                                             void *callback_function);
30577
 
+evms_pool_mgmt_t * evms_cs_create_pool(
30578
 
+       int,
30579
 
+       char *, 
30580
 
+        void (*ctor)(void*, kmem_cache_t *, unsigned long),
30581
 
+       void (*dtor)(void*, kmem_cache_t *, unsigned long));
30582
 
+#define EVMS_BLOCKABLE TRUE
30583
 
+void * evms_cs_allocate_from_pool(evms_pool_mgmt_t *, int);
30584
 
+void   evms_cs_deallocate_to_pool(evms_pool_mgmt_t *, void *);
30585
 
+void   evms_cs_destroy_pool(evms_pool_mgmt_t *);
30586
 
+int evms_cs_add_item_to_list(evms_list_node_t **, void *);
30587
 
+int evms_cs_remove_item_from_list(evms_list_node_t **, void *);
30588
 
+int evms_cs_register_device(evms_logical_node_t *);
30589
 
+int evms_cs_unregister_device(evms_logical_node_t *);
30590
 
+int evms_cs_find_next_device(evms_logical_node_t *, 
30591
 
+                            evms_logical_node_t **);
30592
 
+void evms_cs_signal_event(int);
30593
 
+evms_thread_t * evms_cs_register_thread (
30594
 
+       void (*run) (void *), 
30595
 
+       void *data, 
30596
 
+       const char *name);
30597
 
+void evms_cs_unregister_thread (evms_thread_t *thread);
30598
 
+void evms_cs_wakeup_thread(evms_thread_t *thread);
30599
 
+void evms_cs_interrupt_thread (evms_thread_t *thread);
30600
 
+struct proc_dir_entry *evms_cs_get_evms_proc_dir(void);
30601
 
+int evms_cs_volume_request_in_progress(kdev_t, int, int *);
30602
 
+
30603
 
+
30604
 
+/* EVMS exported global variables */
30605
 
+extern evms_pool_mgmt_t *evms_bh_pool;
30606
 
+extern char *evms_primary_string;
30607
 
+extern char *evms_secondary_string;
30608
 
+#endif
30609
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_aix.h evms-2002-03-28/include/linux/evms/evms_aix.h
30610
 
--- linux-2002-03-28/include/linux/evms/evms_aix.h      Wed Dec 31 18:00:00 1969
30611
 
+++ evms-2002-03-28/include/linux/evms/evms_aix.h       Wed Mar 27 19:27:56 2002
30612
 
@@ -0,0 +1,401 @@
30613
 
+/*                                                                           
30614
 
+* The following structures are nested within the structures used by the    
30615
 
+* system management routines. These structures and sizes were pulled from the AIX
30616
 
+* src tree.
30617
 
+*/                                                                        
30618
 
+#define LVM_MAXLPS      65535       /* max number of logical partitions allowed */
30619
 
+#define LVM_NAMESIZ     64          /* maximum size for the logical volume name */
30620
 
+#define LVM_NUMCOPIES   3           /* max number of copies allowed of a logical partition */
30621
 
+#define LVM_MAXVGS      255
30622
 
+#define LVM_MAXPVS      32
30623
 
+#define LVM_MAXLVS      256
30624
 
+#define AIX_MIN_BLOCK_SIZE 4096
30625
 
+#define VGSA_BT_PV      127
30626
 
+#define NBPI            32
30627
 
+#define TRUE             1
30628
 
+#define OFFSET_CONSTANT     144
30629
 
+#define SLEEP_TIME            0
30630
 
+#define MAXLVS_OFFSET        16
30631
 
+#define PHYS_VOL_OFFSET      34
30632
 
+#define AIX_PVHPP_LENGTH     PHYS_VOL_OFFSET
30633
 
+#define MAX_SECTORS_NAMELIST 32
30634
 
+#define AIX_DEFAULT_MIRRORING 1
30635
 
+#define AIX_FIRST_MIRROR      2
30636
 
+#define AIX_MAX_MIRRORS       3  // AIX defines ALL copies as mirrors - 3 mirrors MAX - 1 orig and 2 copies
30637
 
+
30638
 
+#define PSN_LVM_REC      7
30639
 
+#define PSN_VGSA_REC     128
30640
 
+#define PSN_NAMELIST_REC 2065
30641
 
+#define PSN_VGT_TRAILER  135
30642
 
+#define PSN_LVE_REC        1
30643
 
+#define PSN_PPH_OFFSET    17
30644
 
+#define PSN_PVH_INCREMENT 34
30645
 
+#define AIX_SECTOR_SIZE  512
30646
 
+#define MAX_PPENT_SECTOR  16
30647
 
+#define        NAME_LEN                 128    /* don't change!!! */
30648
 
+#define        UUID_LEN                  32    /* don't change!!! */
30649
 
+#define MAX_SECTORS_LV_ENTRIES 16
30650
 
+#define AIX_MIN_MIRROR_POOL    10
30651
 
+#define AIX_MIRROR_POOL_CHANGE 10
30652
 
+
30653
 
+#define LV_SET_ACCESS           _IOW ( 0xfe, 0x28, 1)
30654
 
+#define LV_SET_ALLOCATION       _IOW ( 0xfe, 0x29, 1)
30655
 
+#define LV_SET_STATUS           _IOW ( 0xfe, 0x2a, 1)
30656
 
+#define        LV_BMAP                 _IOWR ( 0xfe, 0x30, 1)
30657
 
+
30658
 
+#define        LV_ACTIVE            0x01       /* lv_status */
30659
 
+#define        LV_SPINDOWN          0x02       /*     "     */
30660
 
+#define LV_ERROR             0x99   /*     "     */ 
30661
 
+
30662
 
+#define        VG_ACTIVE            0x01       /* vg_status */
30663
 
+
30664
 
+#define        AIX_LV_READ              0x00   /* lv_access */
30665
 
+#define        AIX_LV_WRITE         0x01       /*     "     */
30666
 
+#define EVMS_LV_NEW             0x10   // volume was created during the current discovery pass
30667
 
+#define EVMS_LV_INCOMPLETE      0x20   // volume has an incomplete LE map
30668
 
+#define EVMS_LV_INVALID                 0x40   // volume has a memory-corruption problem
30669
 
+#define EVMS_VG_DIRTY           0x01   // group has had a new PV added during this discovery
30670
 
+#define AIX_VG_INCOMPLETE       0x20   // volume group is incomplete 
30671
 
+
30672
 
+
30673
 
+#define LOG_PREFIX             "--AIXlvm: "
30674
 
+
30675
 
+// Entries in the list of physical volumes (PV)
30676
 
+// in a volume group (VG)
30677
 
+
30678
 
+typedef struct unique_id_s {
30679
 
+    uint32_t  word1;
30680
 
+    uint32_t  word2;
30681
 
+    uint32_t  word3;
30682
 
+    uint32_t  word4;
30683
 
+} unique_id;
30684
 
+
30685
 
+typedef struct _partition_list_entry {
30686
 
+       evms_logical_node_t             * logical_node;
30687
 
+       u_int32_t                         pv_number;
30688
 
+       u_int32_t                 block_size;   // bytes
30689
 
+       u_int32_t                             hard_sect_size;   // bytes
30690
 
+       struct _partition_list_entry    * next;
30691
 
+
30692
 
+} partition_list_entry_t;
30693
 
+
30694
 
+// Table for mapping logical extents (LE) to physical extents (PE)
30695
 
+typedef struct _pe_table_entry {
30696
 
+       partition_list_entry_t  * owning_pv;
30697
 
+       u_int64_t               pe_sector_offset;
30698
 
+} pe_table_entry_t;
30699
 
+
30700
 
+// Logical volumes (LV) in a volume group (VG)
30701
 
+typedef struct _aix_logical_volume {
30702
 
+       u_int32_t               lv_number;
30703
 
+       u_int64_t               lv_size;                // Sectors
30704
 
+       u_int32_t               lv_access;              // Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_QUIESCE
30705
 
+       u_int32_t               lv_status;              // Flags: LV_ACTIVE, LV_SPINDOWN
30706
 
+       u_int32_t               lv_minor;               // Device minor number
30707
 
+       u_int32_t               mirror_copies;  // Do we have mirroring and how many  ?
30708
 
+       u_int32_t               mirror_number;  // mirror number - which copy is this ?
30709
 
+       u_int32_t               mirror_iterations;      // Which mirror should we be writing to ?
30710
 
+       u_int32_t               stripes;
30711
 
+       u_int32_t               stripe_size;        // Sectors
30712
 
+       u_int32_t               stripe_size_shift;  // Number of bits to shift right instead of dividing by stripe_size
30713
 
+       u_int32_t               pe_size;                // Sectors
30714
 
+       u_int32_t               pe_size_shift;      // Number of bits to shift right instead of dividing by pe_size
30715
 
+       u_int32_t               num_le;                 // Number of entries in the le_to_pe_map
30716
 
+       u_int32_t               new_volume;             // Flag to indicate if this volume needs to be exported
30717
 
+       struct _aix_volume_group  * group;              // Pointer back to parent volume group
30718
 
+       unsigned char           name[EVMS_VOLUME_NAME_SIZE+1];  // Dev-tree volume name (eg: /dev/group0/vol0)
30719
 
+       pe_table_entry_t        * le_to_pe_map; // Mapping of logical to physical extents
30720
 
+       pe_table_entry_t        * le_to_pe_map_mir1;    // Mapping of logical to physical extents for mirror 1
30721
 
+       pe_table_entry_t        * le_to_pe_map_mir2;    // Mapping of logical to physical extents for mirror 2
30722
 
+       evms_logical_node_t     * volume_node;  // Pointer to the parent EVMS node representing this volume
30723
 
+
30724
 
+} aix_logical_volume_t;
30725
 
+
30726
 
+// Volume groups (VG)
30727
 
+typedef struct _aix_volume_group {
30728
 
+       unique_id               vg_id;                  // volume group number */
30729
 
+       u_int32_t               numpvs;                         // Number of PVs found on this VG.
30730
 
+       u_int32_t               numlvs;                         // Number of LVs found on this VG.
30731
 
+       u_int32_t               hard_sect_size;                 // The largest hard_sect_size and block_size
30732
 
+       u_int32_t               block_size;                         // values of all partitions in this group.
30733
 
+       u_int32_t               flags;                      //
30734
 
+       u_int32_t               lv_max;                 // maximum logical volumes */
30735
 
+       u_int32_t               pe_size;                            // physical extent size in sectors */
30736
 
+       partition_list_entry_t  * partition_list;       // List of partitions/segments/PVs that make up this VG
30737
 
+       u_int32_t               partition_count;
30738
 
+       struct _aix_logical_volume      ** volume_list;         // Array of volumes found in this VG.
30739
 
+       struct _aix_volume_group * next;            // Pointer to the next VG
30740
 
+    u_int32_t                 CleanVGInfo;      // Do we have a clean VG Info to work with ?
30741
 
+       daddr_t                   vgda_psn;         // Which VGDA we should use
30742
 
+       long                      vgda_len;         // length of the volume group descriptor area */
30743
 
+       struct _vg_header       * AIXvgh;           // Pointer to valid data area on disk for the VG
30744
 
+} aix_volume_group_t;
30745
 
+
30746
 
+typedef struct _aix_mirror_bh {
30747
 
+    atomic_t                     remaining;
30748
 
+       int                                  iteration;     // 'have we finished' count, used from IRQ handlers
30749
 
+       int                                      cmd;
30750
 
+    u_int64_t                    mir_sector1;
30751
 
+    u_int64_t                    mir_sector2;
30752
 
+       struct buffer_head              *master_bh;
30753
 
+       struct buffer_head               bh_req;
30754
 
+       struct _aix_mirror_bh       *mirror_bh_list;
30755
 
+       evms_logical_node_t             *node;          // map to evms node (READ only)
30756
 
+       evms_logical_node_t             *mir_node1;             // 
30757
 
+       evms_logical_node_t             *mir_node2;             // 
30758
 
+       eio_t                                eio;
30759
 
+       struct _aix_mirror_bh           *next_r1;           // next for retry or in free list 
30760
 
+} aix_mirror_bh_t;
30761
 
+
30762
 
+typedef struct _timestruc_t 
30763
 
+{
30764
 
+  int tv_sec;
30765
 
+  int tv_nsec;
30766
 
+
30767
 
+} timestruc_t;
30768
 
+
30769
 
+typedef struct ipl_rec_area
30770
 
+{
30771
 
+    unsigned int      IPL_record_id;    /* This physical volume contains a   */
30772
 
+                                        /* valid IPL record if and only if   */
30773
 
+                                        /* this field contains IPLRECID      */
30774
 
+
30775
 
+#define IPLRECID 0xc9c2d4c1             /* Value is EBCIDIC 'IBMA'           */
30776
 
+
30777
 
+    char              reserved1[20];
30778
 
+    unsigned int      formatted_cap;    /* Formatted capacity. The number of */
30779
 
+                                        /* sectors available after formatting*/
30780
 
+                                        /* The presence or absence of bad    */
30781
 
+                                        /* blocks does not alter this value. */
30782
 
+
30783
 
+    char              last_head;        /* THIS IS DISKETTE INFORMATION      */
30784
 
+                                        /* The number of heads minus 1. Heads*/
30785
 
+                                        /* are number from 0 to last_head.   */
30786
 
+
30787
 
+    char              last_sector;      /* THIS IS DISKETTE INFORMATION      */
30788
 
+                                        /* The number of sectors per track.  */
30789
 
+                                        /* Sectors are numbered from 1 to    */
30790
 
+                                        /* last_sector.                      */
30791
 
+
30792
 
+    char              reserved2[6];
30793
 
+
30794
 
+    unsigned int      boot_code_length; /* Boot code length in sectors. A 0  */
30795
 
+                                        /* value implies no boot code present*/
30796
 
+
30797
 
+    unsigned int      boot_code_offset; /* Boot code offset. Must be 0 if no */
30798
 
+                                        /* boot code present, else contains  */
30799
 
+                                        /* byte offset from start of boot    */
30800
 
+                                        /* code to first instruction.        */
30801
 
+
30802
 
+    unsigned int      boot_lv_start;    /* Contains the PSN of the start of  */
30803
 
+                                        /* the BLV.                          */
30804
 
+
30805
 
+    unsigned int      boot_prg_start;   /* Boot code start. Must be 0 if no  */
30806
 
+                                        /* boot code present, else contains  */
30807
 
+                                        /* the PSN of the start of boot code.*/
30808
 
+
30809
 
+    unsigned int      boot_lv_length;   /* BLV length in sectors.            */
30810
 
+
30811
 
+    unsigned int      boot_load_add;    /* 512 byte boundary load address for*/
30812
 
+                                        /* boot code.                        */
30813
 
+
30814
 
+    char              boot_frag;        /* Boot code fragmentation flag. Must*/
30815
 
+                                        /* be 0 if no fragmentation allowed, */
30816
 
+                                        /* else must be 0x01.                */
30817
 
+
30818
 
+    char             boot_emulation;   /* ROS network emulation flag */
30819
 
+                                       /* 0x0 => not an emul support image   */
30820
 
+                                       /* 0x1 => ROS network emulation code  */
30821
 
+                                       /* 0x2 => AIX code supporting ROS emul*/
30822
 
+
30823
 
+    char              reserved3[2];
30824
 
+
30825
 
+    ushort            basecn_length;    /* Number of sectors for base        */
30826
 
+                                        /* customization. Normal mode.       */
30827
 
+
30828
 
+    ushort            basecs_length;    /* Number of sectors for base        */
30829
 
+                                        /* customization. Service mode.      */
30830
 
+
30831
 
+    unsigned int      basecn_start;     /* Starting PSN value for base       */
30832
 
+                                        /* customization. Normal mode.       */
30833
 
+
30834
 
+    unsigned int      basecs_start;     /* Starting PSN value for base       */
30835
 
+                                        /* customization. Service mode.      */
30836
 
+
30837
 
+    char              reserved4[24];
30838
 
+
30839
 
+    unsigned int      ser_code_length;  /* Service code length in sectors.   */
30840
 
+                                        /* A 0 value implies no service code */
30841
 
+                                        /* present.                          */
30842
 
+
30843
 
+    unsigned int      ser_code_offset;  /* Service code offset. Must be 0 if */
30844
 
+                                        /* no service code is present, else  */
30845
 
+                                        /* contains byte offset from start of*/
30846
 
+                                        /* service code to first instruction.*/
30847
 
+
30848
 
+    unsigned int      ser_lv_start;     /* Contains the PSN of the start of  */
30849
 
+                                        /* the SLV.                          */
30850
 
+
30851
 
+    unsigned int      ser_prg_start;    /* Service code start. Must be 0 if  */
30852
 
+                                        /* service code is not present, else */
30853
 
+                                        /* contains the PSN of the start of  */
30854
 
+                                        /* service code.                     */
30855
 
+
30856
 
+    unsigned int      ser_lv_length;    /* SLV length in sectors.            */
30857
 
+
30858
 
+    unsigned int      ser_load_add;     /* 512 byte boundary load address for*/
30859
 
+                                        /* service code.                     */
30860
 
+
30861
 
+    char              ser_frag;         /* Service code fragmentation flag.  */
30862
 
+                                        /* Must be 0 if no fragmentation     */
30863
 
+                                        /* allowed, else must be 0x01.       */
30864
 
+
30865
 
+    char             ser_emulation;    /* ROS network emulation flag */
30866
 
+                                       /* 0x0 => not an emul support image   */
30867
 
+                                       /* 0x1 => ROS network emulation code  */
30868
 
+                                       /* 0x2 => AIX code supporting ROS emul*/
30869
 
+
30870
 
+    char              reserved5[2];
30871
 
+
30872
 
+    unique_id         pv_id;            /* The unique identifier for this    */
30873
 
+                                        /* physical volume.                  */
30874
 
+    char              dummy[512 - 128 - sizeof(unique_id)];
30875
 
+}AIXIPL_REC, *AIXIPL_REC_PTR;
30876
 
+
30877
 
+
30878
 
+typedef struct AIXlvm_rec_s
30879
 
+        /* structure which describes the physical volume LVM record */
30880
 
+       {
30881
 
+       long lvm_id;                        /* LVM id field which identifies whether the PV is a member of a volume group */
30882
 
+
30883
 
+#define AIX_LVM_LVMID     0x5F4C564D            /* LVM id field of ASCII "_LVM" */
30884
 
+
30885
 
+       unique_id          vg_id;           /* the id of the volume group to which this physical volume belongs */
30886
 
+       long               lvmarea_len;     /* the length of the LVM reserved area */
30887
 
+       long               vgda_len;        /* length of the volume group descriptor area */
30888
 
+       daddr_t            vgda_psn [2];    /* the physical sector numbers of the beginning of the volume group descriptor area copies on this disk */
30889
 
+       daddr_t            reloc_psn;       /* the physical sector number of the beginning of a pool of blocks  */
30890
 
+                                           /* (located at the end of the PV) which are reserved for the relocation of bad blocks */
30891
 
+       long               reloc_len;       /* the length in number of sectors of the pool of bad block relocation blocks */
30892
 
+       short int          pv_num;          /* the physical volume number within the volume group of this physical volume */
30893
 
+       short int          pp_size;         /* the size in bytes for the partition, expressed as a power of 2 (i.e., the partition size is 2 to the power pp_size) */
30894
 
+       long               vgsa_len;        /* length of the volume group status area */
30895
 
+       daddr_t            vgsa_psn [2];    /* the physical sector numbers of the beginning of the volume group status area copies on this disk */
30896
 
+       short int          version;         /* the version number of this volume group descriptor and status area */
30897
 
+
30898
 
+#define  LVM_VERSION_1         1              /* first version - AIX 3.0 */
30899
 
+#define  LVM_STRIPE_ENHANCE    2              /* version with striped lv's - AIX 4.1 */
30900
 
+#define  LVM_1024_PPSIZE       3              /* ppsizes of 512 and 1024 */
30901
 
+#define  LVM_GT_1016           4              /* version with support for > 1016 pps/pv */
30902
 
+#define  LVM_MAX_VERSION       LVM_GT_1016    /* max version # */
30903
 
+
30904
 
+       char res1 [450];                    /* reserved area */
30905
 
+
30906
 
+ } AIXlvm_rec_t;
30907
 
+
30908
 
+
30909
 
+
30910
 
+/*  II.Volume Group Descriptor Area  */
30911
 
+
30912
 
+typedef struct _vgsa_area
30913
 
+{
30914
 
+      timestruc_t           b_tmstamp;    /* Beginning timestamp */
30915
 
+      unsigned int          pv_missing [(LVM_MAXPVS + (NBPI -1)) / NBPI];  /* Bit per PV */
30916
 
+      unsigned char         stalepp    [LVM_MAXPVS] [VGSA_BT_PV];
30917
 
+      short                 factor;
30918
 
+      char                  resv[10];     /* Padding */
30919
 
+      timestruc_t           e_tmstamp;    /* Ending timestamp */
30920
 
+
30921
 
+} vgsa_area;
30922
 
+
30923
 
+typedef struct _vg_header
30924
 
+{
30925
 
+      timestruc_t           vg_timestamp; /* time of last update */
30926
 
+      unique_id             vg_id;        /* unique id for volume group */ 
30927
 
+      short                 numlvs;       /* number of lvs in vg */
30928
 
+      short                 maxlvs;       /* max number of lvs allowed in vg */
30929
 
+      short                 pp_size;      /* size of pps in the vg */
30930
 
+      short                 numpvs;       /* number of pvs in the vg */
30931
 
+      short                 total_vgdas;  /* number of copies of vg */
30932
 
+                                                             /* descriptor area on disk */
30933
 
+      short                 vgda_size;    /* size of volume group descriptor */
30934
 
+      short                bigvg;
30935
 
+      short                quorum;
30936
 
+      short                auto_varyon;
30937
 
+      int                  checksum;
30938
 
+      int                  bigda_size;
30939
 
+   } vg_header;
30940
 
30941
 
+typedef struct _lv_entries
30942
 
+   {
30943
 
+      short       lvname;            /* name of LV */
30944
 
+      short       res1;                      /* reserved area */
30945
 
+      int        maxsize;        /* maximum number of partitions allowed */
30946
 
+      char        lv_state;      /* state of logical volume */
30947
 
+      char        mirror;                /* none,single, or double */
30948
 
+      short       mirror_policy;  /* type of writing used to write */
30949
 
+      int        num_lps;            /* number of logical partitions on the lv */
30950
 
+                                     /* base 1 */
30951
 
+      char        permissions;           /* read write or read only */
30952
 
+      char        bb_relocation;  /* specifies if bad block */
30953
 
+                                  /* relocation is desired */
30954
 
+      char        write_verify;   /* verify all writes to the LV */
30955
 
+      char        mirwrt_consist; /* mirror write consistency flag */
30956
 
+      unsigned short  stripe_exp;  /* stripe size in exponent value */
30957
 
+      unsigned short  striping_width;   /* stripe width */
30958
 
+      unsigned short  lv_avoid;
30959
 
+      unsigned short  child_minor_num;
30960
 
+      char      res4[4];           /* reserved area on disk */
30961
 
+   } lv_entries;
30962
 
+
30963
 
30964
 
+typedef struct _pv_header
30965
 
+   {
30966
 
+      unique_id             pv_id;      /* unique identifier of PV */
30967
 
+      unsigned short        pp_count;   /* number of physical partitions */
30968
 
+                                        /* on PV */
30969
 
+      char                  pv_state;   /* state of physical volume */
30970
 
+      char                  res1;       /* reserved area on disk */
30971
 
+      daddr_t               psn_part1;  /* physical sector number of 1st pp */
30972
 
+      short                 pvnum_vgdas;/* number of vg descriptor areas */
30973
 
+                                        /* on the physical volume */
30974
 
+      short                 pv_num;     /* PV number */
30975
 
+      long                  res2;     /* reserved area on disk */
30976
 
+
30977
 
+    } pv_header;
30978
 
30979
 
+typedef struct _pp_entries
30980
 
+    {
30981
 
+       short        lv_index;     /* index to lv pp is on */
30982
 
+       short        res_1;        /* reserved area on disk */
30983
 
+       long         lp_num;       /* log. part. number */
30984
 
+       char         copy;         /* the copy of the logical partition */
30985
 
+                                                 /* that this pp is allocated for */
30986
 
+       char         pp_state;     /* current state of pp */
30987
 
+       char         fst_alt_vol;  /* pv where partition allocation for*/
30988
 
+                                  /* first mirror begins */
30989
 
+       char         snd_alt_vol;  /* pv where partition allocation for*/
30990
 
+                                  /* second mirror begins */ 
30991
 
+       short        fst_alt_part; /* partition to begin first mirror */
30992
 
+       short        snd_alt_part; /*partition to begin second mirror */
30993
 
+       double       res_3;        /* reserved area  on disk */
30994
 
+       double       res_4;        /* reserved area on disk */
30995
 
+    } pp_entries;
30996
 
+
30997
 
+typedef struct _namelist
30998
 
+{
30999
 
+   char       name[LVM_MAXLVS][LVM_NAMESIZ];
31000
 
+} namelist;
31001
 
31002
 
+typedef struct _vg_trailer
31003
 
+{
31004
 
+       timestruc_t         timestamp; /*  time of last update */
31005
 
+       short                   concurrency;
31006
 
+       /* MS Nibble = concurrent capable                       */
31007
 
+       /* LS Nibble = concurrent auto-varyon                   */
31008
 
+       short                   res_2;
31009
 
+       int                         res_3;      /* reserved area on disk */
31010
 
+       double                  res_4;  /* reserved area on disk */
31011
 
+       double                  res_5;  /* reserved area on disk */
31012
 
+} vg_trailer;
31013
 
+
31014
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr.h evms-2002-03-28/include/linux/evms/evms_bbr.h
31015
 
--- linux-2002-03-28/include/linux/evms/evms_bbr.h      Wed Dec 31 18:00:00 1969
31016
 
+++ evms-2002-03-28/include/linux/evms/evms_bbr.h       Tue Mar 26 16:04:31 2002
31017
 
@@ -0,0 +1,96 @@
31018
 
+/*
31019
 
+ *
31020
 
+ *   Copyright (c) International Business Machines  Corp., 2000
31021
 
+ *
31022
 
+ *   This program is free software;  you can redistribute it and/or modify
31023
 
+ *   it under the terms of the GNU General Public License as published by
31024
 
+ *   the Free Software Foundation; either version 2 of the License, or
31025
 
+ *   (at your option) any later version.
31026
 
+ *
31027
 
+ *   This program is distributed in the hope that it will be useful,
31028
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
31029
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
31030
 
+ *   the GNU General Public License for more details.
31031
 
+ *
31032
 
+ *   You should have received a copy of the GNU General Public License
31033
 
+ *   along with this program;  if not, write to the Free Software
31034
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31035
 
+ *
31036
 
+ */
31037
 
+/*
31038
 
+ * linux/include/linux/evms_bbr.h
31039
 
+ *
31040
 
+ * EVMS Bad Block Relocation Feature kernel header file
31041
 
+ *
31042
 
+ */
31043
 
+
31044
 
+#ifndef EVMS_BBR_INCLUDED
31045
 
+
31046
 
+#define EVMS_BBR_INCLUDED
31047
 
+
31048
 
+#define EVMS_BBR_VERSION_MAJOR            1
31049
 
+#define EVMS_BBR_VERSION_MINOR            0
31050
 
+#define EVMS_BBR_VERSION_PATCHLEVEL       0
31051
 
+
31052
 
+#define EVMS_BBR_FEATURE_ID       6
31053
 
+#define EVMS_BBR_SIGNATURE        0x42627246   /* BbrF */
31054
 
+
31055
 
+/* The following defines establish the minimum and maximum number of
31056
 
+ * replacement sectors which can be allocated for Bad Block Relocation.
31057
 
+ * Otherwise, 1 replacement sector per MB of disk space is allocated. */
31058
 
+#define EVMS_BBR_ENTRIES_PER_SECT    31 /* Assume sector size is 512 bytes*/
31059
 
+#define EVMS_BBR_LIMIT  4096
31060
 
+
31061
 
+#define EVMS_BBR_TABLE_SIGNATURE         0x42627254 /* BbrT */
31062
 
+
31063
 
+typedef struct evms_bbr_table_entry_s {
31064
 
+    u_int64_t bad_sect;
31065
 
+    u_int64_t replacement_sect;
31066
 
+} evms_bbr_table_entry_t;
31067
 
+
31068
 
+typedef struct evms_bbr_table_s {
31069
 
+    u_int32_t signature;                /* Signature for a sector of the bbr table (EVMS_BBR_TABLE_SIGNATURE) */
31070
 
+    u_int32_t crc;                      /* CRC for this sector of the BBR Table. */
31071
 
+    u_int32_t sequence_number;          /* Used to resolve conflicts when the primary and secondary tables do not match. */
31072
 
+    u_int32_t in_use_cnt;               /* number of in-use entries */
31073
 
+    evms_bbr_table_entry_t entries[EVMS_BBR_ENTRIES_PER_SECT];   /* BBR table entries available for this sector of the BBR table */
31074
 
+} evms_bbr_table_t;
31075
 
+
31076
 
+/* description of on disk meta data sector for bbr feature */
31077
 
+typedef struct evms_bbr_metadata_s {
31078
 
+/* 0*/        u_int32_t signature;                /* EVMS_BBR_SIGNATURE */
31079
 
+/* 4*/        u_int32_t crc;
31080
 
+/* 8*/        u_int32_t block_size;               /* block size in bytes */
31081
 
+/*12*/        u_int32_t flags;                    /* Global flag used by BBR */
31082
 
+/*16*/        u_int64_t sequence_number;
31083
 
+/*24*/        u_int64_t start_sect_bbr_table;     /* start 64-bit LBA of the BBR table */
31084
 
+/*32*/        u_int64_t nr_sects_bbr_table;       /* number of sectors to hold the BBR table */
31085
 
+/*40*/        u_int64_t start_replacement_sect;   /* start 64-bit LBA of the replacement sectors */
31086
 
+/*48*/        u_int64_t nr_replacement_blks;      /* number of replacement blocks. */
31087
 
+/*56*/        char      pads[456];                /* padding for 512-byte sector alignment */
31088
 
+} evms_bbr_metadata_t;
31089
 
+
31090
 
+
31091
 
+// BBR direct ioctl commands.
31092
 
+#define BBR_GET_INFO_CMD       1       // Return the total number of sectors
31093
 
+                                       // that are currently remapped for the
31094
 
+                                       // bbr object.
31095
 
+#define BBR_STOP_REMAP_CMD     2       // Stop ... do not remap any new sectors
31096
 
+                                       // or even honor any existing remaps for
31097
 
+                                       // the bbr object until after the next
31098
 
+                                       // rediscover command is received.
31099
 
+#define BBR_SECTOR_IO_CMD      3       // Process an I/O from the engine directly
31100
 
+                                       // through the bbr object.
31101
 
+
31102
 
+typedef struct evms_notify_bbr_s {
31103
 
+       char            object_name[EVMS_VOLUME_NAME_SIZE+1];   // Input  - Name of bbr object from feature header
31104
 
+       u_int64_t       count;          // Output - Count of remapped sectors
31105
 
+       u_int64_t       start_sect;     // Input - Starting sector for sector_io
31106
 
+       u_int64_t       nr_sect;        // Input - Number of sectors for sector_io
31107
 
+       unsigned long   buffer;         // Input - Pointer to buffer for sector_io
31108
 
+       int             rw;             // Input - READ or WRITE for sector_io
31109
 
+} evms_notify_bbr_t;
31110
 
+
31111
 
+
31112
 
+
31113
 
+#endif
31114
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_bbr_k.h evms-2002-03-28/include/linux/evms/evms_bbr_k.h
31115
 
--- linux-2002-03-28/include/linux/evms/evms_bbr_k.h    Wed Dec 31 18:00:00 1969
31116
 
+++ evms-2002-03-28/include/linux/evms/evms_bbr_k.h     Wed Mar 27 16:08:55 2002
31117
 
@@ -0,0 +1,207 @@
31118
 
+#ifndef __EVMS_BBR_K__
31119
 
+#define __EVMS_BBR_K__
31120
 
+
31121
 
+/*
31122
 
+ *
31123
 
+ *   Copyright (c) International Business Machines  Corp., 2000
31124
 
+ *
31125
 
+ *   This program is free software;  you can redistribute it and/or modify
31126
 
+ *   it under the terms of the GNU General Public License as published by
31127
 
+ *   the Free Software Foundation; either version 2 of the License, or
31128
 
+ *   (at your option) any later version.
31129
 
+ *
31130
 
+ *   This program is distributed in the hope that it will be useful,
31131
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
31132
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
31133
 
+ *   the GNU General Public License for more details.
31134
 
+ *
31135
 
+ *   You should have received a copy of the GNU General Public License
31136
 
+ *   along with this program;  if not, write to the Free Software
31137
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31138
 
+ */
31139
 
+
31140
 
+/* linux/include/linux/evms/evms_bbr_k.h
31141
 
+ *
31142
 
+ * Kernel header file for Bad Block Relocation (BBR) Feature
31143
 
+ *
31144
 
+ * BBR feature is designed to remap I/O write failures to another safe location on disk.
31145
 
+ * Note that most disk drives have BBR built into them, this means that our software BBR
31146
 
+ * will be only activated when all hardware BBR replacement sectors have been used.
31147
 
+ */
31148
 
+
31149
 
+#include <linux/config.h>
31150
 
+#include <linux/module.h>
31151
 
+#include <linux/kernel.h>
31152
 
+#include <linux/sched.h>
31153
 
+#include <linux/smp_lock.h>
31154
 
+#include <linux/locks.h>
31155
 
+#include <linux/delay.h>
31156
 
+#include <linux/reboot.h>
31157
 
+#include <linux/completion.h>
31158
 
+#include <linux/vmalloc.h>
31159
 
+#include <asm/uaccess.h>
31160
 
+#include <linux/blk.h>
31161
 
+
31162
 
+#include <linux/evms/evms_kernel.h>
31163
 
+#include <linux/evms/evms_bbr.h>
31164
 
+
31165
 
+#define BBR_POOL_NAME_LENGTH   20
31166
 
+
31167
 
+/* Required common services version */
31168
 
+#define EVMS_BBR_COMMON_SERVICES_MAJOR         0
31169
 
+#define EVMS_BBR_COMMON_SERVICES_MINOR         6
31170
 
+#define EVMS_BBR_COMMON_SERVICES_PATCHLEVEL    0
31171
 
+
31172
 
+
31173
 
+static int bbr_notify_reboot(
31174
 
+       struct notifier_block *this,
31175
 
+       unsigned long code, 
31176
 
+       void *x);
31177
 
+
31178
 
+typedef struct bbr_runtime_remap_s {
31179
 
+       evms_bbr_table_entry_t           remap;
31180
 
+       struct bbr_runtime_remap_s      *left;  /** for binary tree */
31181
 
+       struct bbr_runtime_remap_s      *right; /** for binary tree */
31182
 
+}bbr_runtime_remap_t;
31183
 
+
31184
 
+
31185
 
+/* local instance data structure definition */
31186
 
+
31187
 
+#define BBR_STOP_REMAP (1<<0)
31188
 
+
31189
 
+typedef struct bbr_instance_data_s {
31190
 
+       struct bbr_instance_data_s *next;       /* link all bbr_instances */
31191
 
+       evms_logical_node_t     *node;          /* bbr_node */
31192
 
+       evms_logical_node_t     *source;        /* consumed node */
31193
 
+       evms_bbr_table_t        *bbr_table;
31194
 
+       u_int64_t               lba_table1;
31195
 
+       u_int64_t               lba_table2;
31196
 
+       u_int64_t               nr_sects_bbr_table;
31197
 
+       u_int64_t               nr_replacement_blks;
31198
 
+       u_int64_t               start_replacement_sect;
31199
 
+       u_int32_t               blksize_in_sects;
31200
 
+       evms_pool_mgmt_t        *bbr_bh_pool;
31201
 
+       char                    bh_pool_name[BBR_POOL_NAME_LENGTH+1];
31202
 
+       evms_pool_mgmt_t        *remap_pool;
31203
 
+       char                    remap_pool_name[BBR_POOL_NAME_LENGTH+1];
31204
 
+       atomic_t                in_use_replacement_blks;
31205
 
+       bbr_runtime_remap_t     *remap_root;            /* for binary tree */
31206
 
+       spinlock_t              bbr_id_lock;            /* lock for runtime remap table */
31207
 
+       u_int32_t               flag;
31208
 
+       evms_sector_t           total_vsectors;
31209
 
+} bbr_instance_data_t;
31210
 
+
31211
 
+#define BBR_BH_USE_EVMS_CALLBACK (1<<0)                // Set if an EVMS callback was registered for this I/O
31212
 
+
31213
 
+typedef struct bbr_bh_s {
31214
 
+       struct bbr_bh_s         *next;          // Used by bbr_io_list.
31215
 
+       bbr_instance_data_t     *BBRID;         // Object for this request.
31216
 
+       eio_t                   eio;            // Original eio.
31217
 
+       atomic_t                waiters;        // Used by bbr_init_io.
31218
 
+       int                     rw;             // READ or WRITE
31219
 
+       int                     rc;             // Return code from bbr_io_handler.
31220
 
+       unsigned long           flag;
31221
 
+}bbr_bh_t;
31222
 
+
31223
 
+
31224
 
+/*   --- discovery support functions ---  */
31225
 
+static int load_feature_data(
31226
 
+       evms_logical_node_t *node,
31227
 
+       bbr_instance_data_t **ID);
31228
 
+
31229
 
+static int load_meta_data(
31230
 
+       evms_logical_node_t *node,
31231
 
+       evms_sector_t LSN,
31232
 
+       evms_bbr_metadata_t **md,
31233
 
+       evms_bbr_table_t **bbr_table);
31234
 
+
31235
 
+static int validate_meta_data(evms_bbr_metadata_t *md);
31236
 
+static int validate_bbr_table_sector(evms_bbr_table_t *p);
31237
 
+static u_int32_t validate_bbr_table(
31238
 
+       evms_bbr_metadata_t *md,
31239
 
+       evms_bbr_table_t *p);
31240
 
+static u_int32_t validate_bbr_tables(
31241
 
+       evms_logical_node_t *node,
31242
 
+       evms_bbr_metadata_t *MD1,
31243
 
+       evms_bbr_metadata_t *MD2,
31244
 
+       evms_bbr_table_t *p1,
31245
 
+       evms_bbr_table_t *p2);
31246
 
+void update_invalid_bbr_table_sector(
31247
 
+       evms_logical_node_t *node,
31248
 
+       evms_bbr_table_t *valid,
31249
 
+       evms_bbr_table_t *invalid,
31250
 
+       evms_sector_t LSN);
31251
 
+
31252
 
+static u_int32_t bbr_table_to_remap_list(bbr_instance_data_t *BBRID);
31253
 
+
31254
 
+static int bbr_create_pools(bbr_instance_data_t *BBRID);
31255
 
+static void bbr_destroy_pools(bbr_instance_data_t *BBRID);
31256
 
+
31257
 
+#ifdef EVMS_BBR_DEBUG
31258
 
+static void print_meta_data(evms_bbr_metadata_t *md);
31259
 
+static void print_bbr_table_sector(evms_bbr_table_t *bbr_table);
31260
 
+static void print_remap_list(bbr_instance_data_t *BBRID);
31261
 
+#define BBR_DEBUG_PRINT_META_DATA(md) print_meta_data(md)
31262
 
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) print_bbr_table_sector(table)
31263
 
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID) print_remap_list(BBRID)
31264
 
+#else
31265
 
+#define BBR_DEBUG_PRINT_META_DATA(md)
31266
 
+#define BBR_DEBUG_PRINT_TABLE_SECTOR(table)
31267
 
+#define BBR_DEBUG_PRINT_REMAP_LIST(BBRID)
31268
 
+#endif
31269
 
+
31270
 
+#define BBR_BUG(msg) LOG_SERIOUS(__FUNCTION__ msg "\n")
31271
 
+
31272
 
+/* -- Mapping functions -- */
31273
 
+void bbr_binary_tree_insert(
31274
 
+       bbr_runtime_remap_t **node, 
31275
 
+       bbr_runtime_remap_t *newnode);
31276
 
+bbr_runtime_remap_t * bbr_binary_search(
31277
 
+       bbr_runtime_remap_t *node, 
31278
 
+       evms_sector_t bad_sect);
31279
 
+static int bbr_insert_remap_entry(
31280
 
+       bbr_instance_data_t *BBRID,
31281
 
+       evms_bbr_table_entry_t *new_bbr_entry);
31282
 
+static evms_bbr_table_entry_t * bbr_search_remap_entry(
31283
 
+       bbr_instance_data_t *BBRID,
31284
 
+       evms_sector_t sect);
31285
 
+static inline int bbr_remap(
31286
 
+       bbr_instance_data_t *BBRID,
31287
 
+       evms_sector_t *lsn);
31288
 
+static void bbr_free_remap(bbr_instance_data_t *BBRID);
31289
 
+static void bbr_free_instance_data(bbr_instance_data_t *BBRID);
31290
 
+static inline void bbr_list_add(bbr_instance_data_t *BBRID);
31291
 
+static void bbr_list_remove(bbr_instance_data_t *BBRID);
31292
 
+static bbr_instance_data_t *bbr_find_instance_data (char * object_name);
31293
 
+
31294
 
+/*   --- runtime support functions ---  */
31295
 
+static bbr_bh_t * allocate_bbr_bh(
31296
 
+       bbr_instance_data_t *BBRID,
31297
 
+       int rw);
31298
 
+static void bbr_io_handler( void * void_data );
31299
 
+
31300
 
+/* -- EVMS Plugin interface functions -- */
31301
 
+static int  bbr_discover(evms_logical_node_t **);
31302
 
+static int  bbr_delete(evms_logical_node_t *);
31303
 
+static void bbr_read(evms_logical_node_t *, eio_t *);
31304
 
+static void bbr_write(evms_logical_node_t *, eio_t *);
31305
 
+static int bbr_ioctl (
31306
 
+       evms_logical_node_t *bbr_node,
31307
 
+       struct inode *inode,
31308
 
+       struct file *file,
31309
 
+       unsigned int cmd,
31310
 
+       unsigned long arg);
31311
 
+static int bbr_direct_ioctl (
31312
 
+       struct inode *inode,
31313
 
+       struct file *file,
31314
 
+       unsigned int cmd,
31315
 
+       unsigned long arg);
31316
 
+
31317
 
+static int bbr_init_io(
31318
 
+       evms_logical_node_t * bbr_node,
31319
 
+       int io_flag,
31320
 
+       evms_sector_t startLSN,
31321
 
+       evms_sector_t nr_sects,
31322
 
+       void *bufptr );
31323
 
+
31324
 
+#endif
31325
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_common.h evms-2002-03-28/include/linux/evms/evms_common.h
31326
 
--- linux-2002-03-28/include/linux/evms/evms_common.h   Wed Dec 31 18:00:00 1969
31327
 
+++ evms-2002-03-28/include/linux/evms/evms_common.h    Wed Mar 27 15:51:36 2002
31328
 
@@ -0,0 +1,158 @@
31329
 
+/* -*- linux-c -*- */
31330
 
+/*
31331
 
+ *
31332
 
+ *   Copyright (c) International Business Machines  Corp., 2000
31333
 
+ *
31334
 
+ *   This program is free software;  you can redistribute it and/or modify
31335
 
+ *   it under the terms of the GNU General Public License as published by
31336
 
+ *   the Free Software Foundation; either version 2 of the License, or
31337
 
+ *   (at your option) any later version.
31338
 
+ *
31339
 
+ *   This program is distributed in the hope that it will be useful,
31340
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
31341
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
31342
 
+ *   the GNU General Public License for more details.
31343
 
+ *
31344
 
+ *   You should have received a copy of the GNU General Public License
31345
 
+ *   along with this program;  if not, write to the Free Software
31346
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31347
 
+ */
31348
 
+/*
31349
 
+ * linux/include/linux/evms/evms_common.h
31350
 
+ *
31351
 
+ * EVMS common (kernel and user) header file
31352
 
+ *
31353
 
+ */
31354
 
+
31355
 
+#ifndef __EVMS_COMMON_INCLUDED__
31356
 
+#define __EVMS_COMMON_INCLUDED__
31357
 
+
31358
 
+/* version info */
31359
 
+#define EVMS_MAJOR                      63      /* use experimental major 63 for now */
31360
 
+#define EVMS_MAJOR_VERSION              1
31361
 
+#define EVMS_MINOR_VERSION              0
31362
 
+#define EVMS_PATCHLEVEL_VERSION         0
31363
 
+
31364
 
+#define MAX_EVMS_VOLUMES                256 /* There are 256 minors */
31365
 
+#define EVMS_VOLUME_NAME_SIZE           127
31366
 
+
31367
 
+#define IBM_OEM_ID                      8112    // could be anything, but used
31368
 
+                                                // I=8, B=1, M=12
31369
 
+// this one going away as well.
31370
 
+#define EVMS_OEM_IBM    IBM_OEM_ID
31371
 
+
31372
 
+#define EVMS_INITIAL_CRC                0xFFFFFFFF
31373
 
+#define EVMS_MAGIC_CRC                 0x31415926
31374
 
+
31375
 
+#define EVMS_VSECTOR_SIZE               512
31376
 
+#define EVMS_VSECTOR_SIZE_SHIFT         9
31377
 
+
31378
 
+#define DEV_PATH                       "/dev"
31379
 
+#define EVMS_DIR_NAME                  "evms"
31380
 
+#define EVMS_DEV_NAME                  "block_device"
31381
 
+#define EVMS_DEV_NODE_PATH             DEV_PATH "/" EVMS_DIR_NAME "/"
31382
 
+#define EVMS_DEVICE_NAME               DEV_PATH "/" EVMS_DIR_NAME "/" EVMS_DEV_NAME
31383
 
+
31384
 
+/* EVMS will always use 64-bit fields */
31385
 
+typedef u_int64_t evms_sector_t;
31386
 
+
31387
 
+typedef struct evms_version_s {
31388
 
+        /* major changes when incompatible differences are introduced */
31389
 
+        u_int32_t    major;
31390
 
+        /* minor changes when additions are made */
31391
 
+        u_int32_t    minor;
31392
 
+        /* patchlevel changes when bugs are fixed */
31393
 
+        u_int32_t    patchlevel;
31394
 
+} evms_version_t;
31395
 
+
31396
 
+typedef enum evms_plugin_code_s {
31397
 
+        EVMS_NO_PLUGIN,                                // 0
31398
 
+        EVMS_DEVICE_MANAGER,                           // 1
31399
 
+        EVMS_SEGMENT_MANAGER,                          // 2
31400
 
+        EVMS_REGION_MANAGER,                           // 3
31401
 
+        EVMS_FEATURE,                                  // 4
31402
 
+        EVMS_ASSOCIATIVE_FEATURE,                      // 5
31403
 
+        EVMS_FILESYSTEM_INTERFACE_MODULE,              // 6
31404
 
+        EVMS_CLUSTER_MANAGER_INTERFACE_MODULE,         // 7
31405
 
+        EVMS_DISTRIBUTED_LOCK_MANAGER_INTERFACE_MODULE // 8
31406
 
+} evms_plugin_code_t;
31407
 
+
31408
 
+#define SetPluginID(oem, type, id) ((oem << 16) | (type << 12) | id)
31409
 
+#define GetPluginOEM(pluginid) (pluginid >> 16)
31410
 
+#define GetPluginType(pluginid) ((pluginid >> 12) & 0xf)
31411
 
+#define GetPluginID(pluginid) (pluginid & 0xfff)
31412
 
+
31413
 
+/* bit definitions for the flags field in
31414
 
+ * the EVMS LOGICAL NODE (kernel) and
31415
 
+ * the EVMS LOGICAL VOLUME (user) structures.
31416
 
+ */
31417
 
+#define EVMS_FLAGS_WIDTH                       32
31418
 
+#define EVMS_VOLUME_FLAG                        (1<<0)
31419
 
+#define EVMS_VOLUME_PARTIAL_FLAG                (1<<1)
31420
 
+#define EVMS_VOLUME_PARTIAL                    (1<<1)
31421
 
+#define EVMS_VOLUME_SET_READ_ONLY               (1<<2)
31422
 
+#define EVMS_VOLUME_READ_ONLY                  (1<<2)
31423
 
+/* queued flags bits */
31424
 
+#define EVMS_REQUESTED_DELETE                  (1<<5)
31425
 
+#define EVMS_REQUESTED_QUIESCE                 (1<<6)
31426
 
+#define EVMS_REQUESTED_VFS_QUIESCE             (1<<7)
31427
 
+/* this bit indicates corruption */
31428
 
+#define EVMS_VOLUME_CORRUPT                    (1<<8)
31429
 
+/* these bits define the source of the corruption */
31430
 
+#define EVMS_VOLUME_SOFT_DELETED                       (1<<9)
31431
 
+#define EVMS_VOLUME_GENDISK_GONE               (1<<10)
31432
 
+/* these bits define volume status */
31433
 
+#define EVMS_MEDIA_CHANGED                     (1<<20)
31434
 
+#define EVMS_DEVICE_UNPLUGGED                  (1<<21)
31435
 
+/* these bits used for removable status */
31436
 
+#define EVMS_DEVICE_MEDIA_PRESENT              (1<<24)
31437
 
+#define EVMS_DEVICE_PRESENT                    (1<<25)
31438
 
+#define EVMS_DEVICE_LOCKABLE                   (1<<26)
31439
 
+#define EVMS_DEVICE_REMOVABLE                  (1<<27)
31440
 
+
31441
 
+/* version info for evms_feature_header_t */
31442
 
+#define EVMS_FEATURE_HEADER_MAJOR      3
31443
 
+#define EVMS_FEATURE_HEADER_MINOR      0
31444
 
+#define EVMS_FEATURE_HEADER_PATCHLEVEL 0
31445
 
+
31446
 
+/* bit definitions of FEATURE HEADER bits in the FLAGS field  */
31447
 
+#define EVMS_FEATURE_ACTIVE                     (1<<0)
31448
 
+#define EVMS_FEATURE_VOLUME_COMPLETE            (1<<1)
31449
 
+/* bit definitions for VOLUME bits in the FLAGS field */
31450
 
+#define EVMS_VOLUME_DATA_OBJECT                        (1<<16)
31451
 
+#define EVMS_VOLUME_DATA_STOP                  (1<<17)
31452
 
+
31453
 
+#define EVMS_FEATURE_HEADER_SIGNATURE           0x54414546 //FEAT
31454
 
+typedef struct evms_feature_header_s {
31455
 
+/*  0*/ u_int32_t               signature;
31456
 
+/*  4*/ u_int32_t               crc;
31457
 
+/*  8*/ evms_version_t          version;               /* structure version */
31458
 
+/* 20*/ evms_version_t          engine_version;                /* version of the Engine that */
31459
 
+                                                       /* wrote this feature header  */
31460
 
+/* 32*/ u_int32_t               flags;
31461
 
+/* 36*/ u_int32_t               feature_id;
31462
 
+/* 40*/ u_int64_t              sequence_number;
31463
 
+/* 48*/ u_int64_t              alignment_padding;
31464
 
+        //required: starting lsn to 1st copy of feature's metadata.
31465
 
+/* 56*/ evms_sector_t           feature_data1_start_lsn;
31466
 
+/* 64*/        evms_sector_t           feature_data1_size; //in 512 byte units
31467
 
+       //optional: starting lsn to 2nd copy of feature's metadata.
31468
 
+       //          if unused set size field to 0.
31469
 
+/* 72*/ evms_sector_t           feature_data2_start_lsn;
31470
 
+/* 80*/        evms_sector_t           feature_data2_size; //in 512 byte units
31471
 
+/* 88*/ u_int64_t               volume_serial_number;
31472
 
+/* 96*/ u_int32_t               volume_system_id;       /* the minor is stored here */
31473
 
+/*100*/ u_int32_t               object_depth;  /* depth of object in the volume tree */
31474
 
+/*104*/ char                    object_name[EVMS_VOLUME_NAME_SIZE+1];
31475
 
+/*232*/ char                    volume_name[EVMS_VOLUME_NAME_SIZE+1];
31476
 
+/*360*/ unsigned char          pad[152];
31477
 
+/*512*/
31478
 
+} evms_feature_header_t;
31479
 
+
31480
 
+/* EVMS specific error codes */
31481
 
+#define EVMS_FEATURE_FATAL_ERROR                257
31482
 
+#define EVMS_VOLUME_FATAL_ERROR                 258
31483
 
+
31484
 
+#define EVMS_FEATURE_INCOMPLETE_ERROR          259
31485
 
+
31486
 
+#endif
31487
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_drivelink.h evms-2002-03-28/include/linux/evms/evms_drivelink.h
31488
 
--- linux-2002-03-28/include/linux/evms/evms_drivelink.h        Wed Dec 31 18:00:00 1969
31489
 
+++ evms-2002-03-28/include/linux/evms/evms_drivelink.h Wed Dec 12 09:37:43 2001
31490
 
@@ -0,0 +1,78 @@
31491
 
+/* -*- linux-c -*- */
31492
 
+/*
31493
 
+ *
31494
 
+ *   Copyright (c) International Business Machines  Corp., 2000
31495
 
+ *
31496
 
+ *   This program is free software;  you can redistribute it and/or modify
31497
 
+ *   it under the terms of the GNU General Public License as published by
31498
 
+ *   the Free Software Foundation; either version 2 of the License, or 
31499
 
+ *   (at your option) any later version.
31500
 
+ * 
31501
 
+ *   This program is distributed in the hope that it will be useful,
31502
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
31503
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
31504
 
+ *   the GNU General Public License for more details.
31505
 
+ *
31506
 
+ *   You should have received a copy of the GNU General Public License
31507
 
+ *   along with this program;  if not, write to the Free Software 
31508
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31509
 
+ */
31510
 
+/*
31511
 
+ * linux/include/linux/evms_drvlink.h
31512
 
+ *
31513
 
+ * EVMS DriveLink Feature kernel header file
31514
 
+ *
31515
 
+ */
31516
 
+
31517
 
+#ifndef __EVMS_DRIVELINK_INCLUDED__
31518
 
+#define __EVMS_DRIVELINK_INCLUDED__
31519
 
+
31520
 
+#define EVMS_DRIVELINK_VERSION_MAJOR            2
31521
 
+#define EVMS_DRIVELINK_VERSION_MINOR            0
31522
 
+#define EVMS_DRIVELINK_VERSION_PATCHLEVEL       0
31523
 
+
31524
 
+#define EVMS_DRIVELINK_FEATURE_ID       1
31525
 
+#define EVMS_DRIVELINK_SIGNATURE        0x4C767244   //DrvL
31526
 
+#define EVMS_DRIVELINK_MAX_ENTRIES      60
31527
 
+
31528
 
+// description of on disk meta data sector for drivelink feature
31529
 
+
31530
 
+typedef struct evms_dl_ordering_table_entry_s {
31531
 
+       u_int64_t                       child_serial_number;
31532
 
+       evms_sector_t                   child_vsize;
31533
 
+} evms_dl_ordering_table_entry_t;
31534
 
+
31535
 
+typedef struct evms_drivelink_metadata_s {
31536
 
+/*  0*/ u_int32_t                       signature;
31537
 
+/*  4*/ u_int32_t                       crc;
31538
 
+/*  8*/        evms_version_t                  version;
31539
 
+/* 20*/ u_int32_t                      flags;
31540
 
+/* 24*/ u_int64_t                      sequence_number;
31541
 
+/* 32*/ u_int64_t                       child_serial_number;
31542
 
+/* 40*/ u_int64_t                       parent_serial_number;
31543
 
+/* 48*/ u_int64_t                       child_count;
31544
 
+/* 56*/ u_int64_t                      pad;
31545
 
+/* 64*/ evms_dl_ordering_table_entry_t  ordering_table[EVMS_DRIVELINK_MAX_ENTRIES];
31546
 
+/*1024*/
31547
 
+} evms_drivelink_metadata_t;
31548
 
+
31549
 
+#ifdef __KERNEL__
31550
 
+// description of in memory meta data for drivelink feature
31551
 
+typedef struct evms_drivelink_runtime_entry_s {
31552
 
+        u_int64_t                       block_size;
31553
 
+        evms_sector_t                   voffset;
31554
 
+        evms_sector_t                   vsize;
31555
 
+        evms_logical_node_t            *child_node;
31556
 
+        evms_drivelink_metadata_t      *child_metadata;
31557
 
+} evms_drivelink_runtime_entry_t;
31558
 
+
31559
 
+typedef struct evms_drivelink_runtime_data_s {
31560
 
+        u_int64_t                       block_size;
31561
 
+       // keep the fields below this point in order
31562
 
+        u_int64_t                       parent_serial_number;
31563
 
+        u_int64_t                       child_count;
31564
 
+        evms_drivelink_runtime_entry_t *child_table;
31565
 
+} evms_drivelink_runtime_data_t;
31566
 
+#endif
31567
 
+
31568
 
+#endif
31569
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_ecr.h evms-2002-03-28/include/linux/evms/evms_ecr.h
31570
 
--- linux-2002-03-28/include/linux/evms/evms_ecr.h      Wed Dec 31 18:00:00 1969
31571
 
+++ evms-2002-03-28/include/linux/evms/evms_ecr.h       Wed Nov  7 14:32:21 2001
31572
 
@@ -0,0 +1,107 @@
31573
 
+/*
31574
 
+ *
31575
 
+ *   Copyright (c) International Business Machines  Corp., 2000
31576
 
+ *
31577
 
+ *   This program is free software;  you can redistribute it and/or modify
31578
 
+ *   it under the terms of the GNU General Public License as published by
31579
 
+ *   the Free Software Foundation; either version 2 of the License, or 
31580
 
+ *   (at your option) any later version.
31581
 
+ * 
31582
 
+ *   This program is distributed in the hope that it will be useful,
31583
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
31584
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
31585
 
+ *   the GNU General Public License for more details.
31586
 
+ *
31587
 
+ *   You should have received a copy of the GNU General Public License
31588
 
+ *   along with this program;  if not, write to the Free Software 
31589
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31590
 
+ *
31591
 
+ */
31592
 
+/*
31593
 
+ * linux/include/linux/evms_ecr.h
31594
 
+ *
31595
 
+ * EVMS Cluster enablement kernel header file
31596
 
+ *
31597
 
+ */
31598
 
+
31599
 
+#ifndef __EVMS_ECR__
31600
 
+
31601
 
+#define __EVMS_ECR__
31602
 
+
31603
 
+#define ECR_SUCCESS 0
31604
 
+#define ECR_FAIL   -1
31605
 
+
31606
 
+/* 
31607
 
+ * Beginning of group messaging API
31608
 
+ */
31609
 
+typedef int            ecr_group_t;
31610
 
+typedef int            ecr_nodeid_t;
31611
 
+typedef void           ecr_cred_t;
31612
 
+typedef void           ecr_instance_t;
31613
 
+typedef void           ecr_message_t;
31614
 
+
31615
 
+typedef enum ecr_type_s {
31616
 
+       ECR_GROUP_START,        /* 0th entry is reserved */
31617
 
+       ECR_P2P,                /* Point to Point message type */
31618
 
+       ECR_BROADCAST,          /* Broadcast message type */
31619
 
+       ECR_ATOMIC_EXECUTE,     /* Atomic execute type */
31620
 
+       ECR_GROUP_LAST          /* Just a last enum type, not a message type */
31621
 
+} ecr_type_t;
31622
 
+
31623
 
+typedef struct ecr_table_s {
31624
 
+       void  (*join) (ecr_nodeid_t, uint,  ecr_nodeid_t *,  ecr_instance_t *);
31625
 
+       int   (*can_join)(ecr_nodeid_t, ecr_cred_t *, size_t, ecr_instance_t *);
31626
 
+       void  (*leave) (ecr_nodeid_t, ecr_instance_t *);
31627
 
+       void  (*recover)(ecr_nodeid_t, ecr_instance_t *);
31628
 
+       void  (*message)(ecr_message_t *, ecr_type_t, ecr_nodeid_t, 
31629
 
+                               void *, size_t,  ecr_instance_t *);
31630
 
+       void  (*vol_leave)(ecr_nodeid_t, ecr_instance_t *);
31631
 
+} ecr_table_t;
31632
 
+
31633
 
+
31634
 
+#define ECR_GROUPNAME_MAX_SIZE  NAME_SIZE /* maximum size of a group name */
31635
 
+
31636
 
+ecr_group_t  ecr_group_join(char *,  ecr_table_t *, ecr_cred_t *, size_t, 
31637
 
+                                       ecr_instance_t *);
31638
 
+void        ecr_group_leave(ecr_group_t);
31639
 
+int         ecr_group_send(ecr_group_t, ecr_nodeid_t, void *, size_t, 
31640
 
+                               ecr_instance_t *, 
31641
 
+                               void callback(int, ecr_instance_t *));
31642
 
+int         ecr_group_send_wait(ecr_group_t, ecr_nodeid_t, void *, size_t, 
31643
 
+                               int *);
31644
 
+int         ecr_group_broadcast(ecr_group_t, void *, size_t, ecr_instance_t *,
31645
 
+                               void callback(u_char, ecr_instance_t *));
31646
 
+int         ecr_group_broadcast_wait(ecr_group_t, void *, size_t, u_char *);
31647
 
+int         ecr_group_atomic_execute(ecr_group_t, void *, size_t, 
31648
 
+                               ecr_instance_t *,
31649
 
+                               void callback(ecr_instance_t *));
31650
 
+int         ecr_group_atomic_execute_wait(ecr_group_t, void *, size_t);
31651
 
+void        ecr_group_success_response(ecr_message_t *);
31652
 
+void        ecr_group_failure_response(ecr_message_t *, int);
31653
 
+
31654
 
+
31655
 
+
31656
 
+/* 
31657
 
+ * Beginning of distributed lock API
31658
 
+ */
31659
 
+
31660
 
+typedef int            ecr_lock_t;
31661
 
+typedef enum ecr_lock_mode_s {
31662
 
+       ECR_LOCK_START,         /* 0th entry is reserved */
31663
 
+       ECR_LOCK_CONCURRENT,    /* concurrent access */
31664
 
+       ECR_LOCK_EXCLUSIVE,     /* exclusive access */
31665
 
+       ECR_LOCK_LAST           /* Just a last enum type, not a lock type */
31666
 
+} ecr_lock_mode_t;
31667
 
+
31668
 
+typedef u_char         ecr_mode_t;
31669
 
+
31670
 
+
31671
 
+#define ECR_LOCKNAME_MAX_SIZE  NAME_SIZE /* maximum size of a lock name */
31672
 
+#define ECR_BLOCK 1 /* waitflag set */
31673
 
+
31674
 
+ecr_lock_t   ecr_lock_create(char *  /* lock name */);
31675
 
+int         ecr_lock(ecr_lock_t, u_int64_t, u_int64_t, ecr_lock_mode_t, 
31676
 
+                               u_char /*waitflag*/);
31677
 
+int         ecr_unlock(ecr_lock_t, u_int64_t, u_int64_t);
31678
 
+
31679
 
+#endif /* __EVMS_ECR__ */
31680
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_ioctl.h evms-2002-03-28/include/linux/evms/evms_ioctl.h
31681
 
--- linux-2002-03-28/include/linux/evms/evms_ioctl.h    Wed Dec 31 18:00:00 1969
31682
 
+++ evms-2002-03-28/include/linux/evms/evms_ioctl.h     Thu Mar 21 14:08:50 2002
31683
 
@@ -0,0 +1,293 @@
31684
 
+/* -*- linux-c -*- */
31685
 
+/*
31686
 
+ *
31687
 
+ *   Copyright (c) International Business Machines  Corp., 2000
31688
 
+ *
31689
 
+ *   This program is free software;  you can redistribute it and/or modify
31690
 
+ *   it under the terms of the GNU General Public License as published by
31691
 
+ *   the Free Software Foundation; either version 2 of the License, or
31692
 
+ *   (at your option) any later version.
31693
 
+ *
31694
 
+ *   This program is distributed in the hope that it will be useful,
31695
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
31696
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
31697
 
+ *   the GNU General Public License for more details.
31698
 
+ *
31699
 
+ *   You should have received a copy of the GNU General Public License
31700
 
+ *   along with this program;  if not, write to the Free Software
31701
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31702
 
+ */
31703
 
+/*
31704
 
+ * linux/include/linux/evms.h
31705
 
+ *
31706
 
+ * EVMS public kernel header file
31707
 
+ *
31708
 
+ */
31709
 
+
31710
 
+#ifndef __EVMS_IOCTL_INCLUDED__
31711
 
+#define __EVMS_IOCTL_INCLUDED__
31712
 
+
31713
 
+#include <linux/hdreg.h>
31714
 
+
31715
 
+/* IOCTL interface version definitions */
31716
 
+#define EVMS_IOCTL_INTERFACE_MAJOR           10
31717
 
+#define EVMS_IOCTL_INTERFACE_MINOR           0
31718
 
+#define EVMS_IOCTL_INTERFACE_PATCHLEVEL      0
31719
 
+
31720
 
+/* IOCTL definitions */
31721
 
+typedef enum evms_ioctl_cmds_s {
31722
 
+       /* version commands */
31723
 
+       EVMS_GET_IOCTL_VERSION_NUMBER = 0,
31724
 
+       EVMS_GET_VERSION_NUMBER,
31725
 
+#ifdef __KERNEL__
31726
 
+       /* EVMS internal commands */
31727
 
+       EVMS_GET_DISK_LIST_NUMBER = 0x40,
31728
 
+       EVMS_CHECK_MEDIA_CHANGE_NUMBER,
31729
 
+       EVMS_REVALIDATE_DISK_NUMBER,
31730
 
+       EVMS_OPEN_VOLUME_NUMBER,
31731
 
+       EVMS_CLOSE_VOLUME_NUMBER,
31732
 
+       EVMS_QUIESCE_VOLUME_NUMBER,
31733
 
+#endif
31734
 
+       /* configuration commands */
31735
 
+       EVMS_GET_INFO_LEVEL_NUMBER = 0x80,
31736
 
+       EVMS_SET_INFO_LEVEL_NUMBER,
31737
 
+       EVMS_REDISCOVER_VOLUMES_NUMBER,
31738
 
+       EVMS_DELETE_VOLUME_NUMBER,
31739
 
+       EVMS_PLUGIN_IOCTL_NUMBER,
31740
 
+       EVMS_PROCESS_NOTIFY_EVENT_NUMBER,
31741
 
+       /* query info commands */
31742
 
+       EVMS_GET_LOGICAL_DISK_NUMBER = 0xC0,
31743
 
+       EVMS_GET_LOGICAL_DISK_INFO_NUMBER,
31744
 
+       EVMS_SECTOR_IO_NUMBER,
31745
 
+       EVMS_GET_MINOR_NUMBER,
31746
 
+       EVMS_GET_VOLUME_DATA_NUMBER,
31747
 
+       EVMS_GET_PLUGIN_NUMBER,
31748
 
+       EVMS_COMPUTE_CSUM_NUMBER,
31749
 
+       EVMS_GET_BMAP_NUMBER,
31750
 
+} evms_ioctl_cmds_t;
31751
 
+
31752
 
+/* version commands */
31753
 
+#define EVMS_GET_IOCTL_VERSION_STRING   "EVMS_GET_IOCTL_VERSION"
31754
 
+#define EVMS_GET_IOCTL_VERSION          _IOR(EVMS_MAJOR, EVMS_GET_IOCTL_VERSION_NUMBER, evms_version_t)
31755
 
+
31756
 
+#define EVMS_GET_VERSION_STRING         "EVMS_GET_VERSION"
31757
 
+#define EVMS_GET_VERSION                _IOR(EVMS_MAJOR, EVMS_GET_VERSION_NUMBER, evms_version_t)
31758
 
+
31759
 
+#ifdef __KERNEL__
31760
 
+
31761
 
+/* EVMS internal commands */
31762
 
+#define EVMS_GET_DISK_LIST_STRING       "EVMS_GET_DISK_LIST"
31763
 
+#define EVMS_GET_DISK_LIST              _IOWR(EVMS_MAJOR, EVMS_GET_DISK_LIST_NUMBER, evms_list_node_t **)
31764
 
+
31765
 
+#define EVMS_CHECK_MEDIA_CHANGE_STRING  "EVMS_CHECK_MEDIA_CHANGE"
31766
 
+#define EVMS_CHECK_MEDIA_CHANGE         _IO(EVMS_MAJOR, EVMS_CHECK_MEDIA_CHANGE_NUMBER)
31767
 
+
31768
 
+#define EVMS_REVALIDATE_DISK_STRING     "EVMS_REVALIDATE_DISK"
31769
 
+#define EVMS_REVALIDATE_DISK            _IO(EVMS_MAJOR, EVMS_REVALIDATE_DISK_NUMBER)
31770
 
+
31771
 
+#define EVMS_OPEN_VOLUME_STRING         "EVMS_OPEN_VOLUME"
31772
 
+#define EVMS_OPEN_VOLUME                _IO(EVMS_MAJOR, EVMS_OPEN_VOLUME_NUMBER)
31773
 
+
31774
 
+#define EVMS_CLOSE_VOLUME_STRING        "EVMS_CLOSE_VOLUME"
31775
 
+#define EVMS_CLOSE_VOLUME               _IO(EVMS_MAJOR, EVMS_CLOSE_VOLUME_NUMBER)
31776
 
+
31777
 
+/* field: command: defines */
31778
 
+#define EVMS_UNQUIESCE          0
31779
 
+#define EVMS_QUIESCE            1
31780
 
+
31781
 
+/* field: do_vfs: defines */
31782
 
+/* see evms_delete_volume */
31783
 
+typedef struct evms_quiesce_volume_s {
31784
 
+       int             command;                /* 0 = unquiesce, 1 = quiesce */
31785
 
+       int             minor;                  /* minor device number of target volume */
31786
 
+       int             do_vfs;                 /* 0 = do nothing, 1 = also perform equivalent VFS operation */
31787
 
+       int             status;                 /* 0 = success */
31788
 
+} evms_quiesce_volume_t;
31789
 
+
31790
 
+#define EVMS_QUIESCE_VOLUME_STRING      "EVMS_QUIESCE_VOLUME"
31791
 
+#define EVMS_QUIESCE_VOLUME             _IOR(EVMS_MAJOR, EVMS_QUIESCE_VOLUME_NUMBER, evms_quiesce_volume_t)
31792
 
+
31793
 
+#endif
31794
 
+
31795
 
+/* configuration commands */
31796
 
+#define EVMS_GET_INFO_LEVEL_STRING      "EVMS_GET_INFO_LEVEL"
31797
 
+#define EVMS_GET_INFO_LEVEL             _IOR(EVMS_MAJOR, EVMS_GET_INFO_LEVEL_NUMBER, int)
31798
 
+
31799
 
+#define EVMS_SET_INFO_LEVEL_STRING      "EVMS_SET_INFO_LEVEL"
31800
 
+#define EVMS_SET_INFO_LEVEL             _IOW(EVMS_MAJOR, EVMS_SET_INFO_LEVEL_NUMBER, int)
31801
 
+
31802
 
+/* field: drive_count: defines */
31803
 
+#define REDISCOVER_ALL_DEVICES          0xFFFFFFFF
31804
 
+typedef struct evms_rediscover_s {
31805
 
+       int             status;
31806
 
+       unsigned int    drive_count;            /* 0xffffffff = rediscover all known disks */
31807
 
+       unsigned long  *drive_array;
31808
 
+} evms_rediscover_t;
31809
 
+
31810
 
+#define EVMS_REDISCOVER_VOLUMES_STRING  "EVMS_REDISCOVER_VOLUMES"
31811
 
+#define EVMS_REDISCOVER_VOLUMES         _IOWR(EVMS_MAJOR, EVMS_REDISCOVER_VOLUMES_NUMBER, evms_rediscover_t)
31812
 
+
31813
 
+/* field: command: defines */
31814
 
+#define EVMS_SOFT_DELETE        0
31815
 
+#define EVMS_HARD_DELETE        1
31816
 
+
31817
 
+/* field: do_vfs: defines */
31818
 
+#define EVMS_VFS_DO_NOTHING     0
31819
 
+#define EVMS_VFS_DO             1
31820
 
+typedef struct evms_delete_volume_s {
31821
 
+       int             command;                /* 0 = "temp", 1 = "permanent" */
31822
 
+       int             minor;                  /* minor device number of target volume */
31823
 
+       int             do_vfs;                 /* 0 = do nothing, 1 = perform VFS operations */
31824
 
+       int             associative_minor;      /* optional minor of associative volume */
31825
 
+                                               /* must be 0 when not in use */
31826
 
+       int             status;                 /* 0 = success, other is error */
31827
 
+} evms_delete_volume_t;
31828
 
+
31829
 
+#define EVMS_DELETE_VOLUME_STRING       "EVMS_DELETE_VOLUME"
31830
 
+#define EVMS_DELETE_VOLUME              _IOR(EVMS_MAJOR, EVMS_DELETE_VOLUME_NUMBER, evms_delete_volume_t)
31831
 
+
31832
 
+typedef struct evms_plugin_ioctl_s {
31833
 
+       unsigned long   feature_id;             /* ID of feature to receive this ioctl */
31834
 
+       int             feature_command;        /* feature specific ioctl command      */
31835
 
+       int             status;                 /* 0 = completed, non-0 = error        */
31836
 
+       void           *feature_ioctl_data;     /* ptr to feature specific struct      */
31837
 
+} evms_plugin_ioctl_t;
31838
 
+
31839
 
+#define EVMS_PLUGIN_IOCTL_STRING        "EVMS_PLUGIN_IOCTL"
31840
 
+#define EVMS_PLUGIN_IOCTL               _IOR(EVMS_MAJOR, EVMS_PLUGIN_IOCTL_NUMBER, evms_plugin_ioctl_t)
31841
 
+
31842
 
+/* field: eventid: defines */
31843
 
+#define EVMS_EVENT_END_OF_DISCOVERY     0
31844
 
+typedef struct evms_event_s {
31845
 
+       int     pid;                            /* PID to act on */
31846
 
+       int     eventid;                        /* event id to respond to */
31847
 
+       int     signo;                          /* signal # to send when event occurs */
31848
 
+} evms_event_t;
31849
 
+
31850
 
+/* field: command: defines */
31851
 
+#define EVMS_EVENT_UNREGISTER   0
31852
 
+#define EVMS_EVENT_REGISTER     1
31853
 
+typedef struct evms_notify_s {
31854
 
+       int             command;                /* 0 = unregister, 1 = register */
31855
 
+       evms_event_t    eventry;                /* event structure */
31856
 
+       int             status;                 /* return status */
31857
 
+} evms_notify_t;
31858
 
+
31859
 
+#define EVMS_PROCESS_NOTIFY_EVENT_STRING "EVMS_PROCESS_NOTIFY_EVENT"
31860
 
+#define EVMS_PROCESS_NOTIFY_EVENT       _IOWR(EVMS_MAJOR, EVMS_PROCESS_NOTIFY_EVENT_NUMBER, evms_notify_t)
31861
 
+
31862
 
+/* query info commands */
31863
 
+
31864
 
+/* field: command: defines */
31865
 
+#define EVMS_FIRST_DISK         0
31866
 
+#define EVMS_NEXT_DISK          1
31867
 
+
31868
 
+/* field: status: defines */
31869
 
+#define EVMS_DISK_INVALID       0
31870
 
+#define EVMS_DISK_VALID         1
31871
 
+typedef struct evms_user_disk_s {
31872
 
+       int             command;                /* 0 = first disk, 1 = next disk */
31873
 
+       int             status;                 /* 0 = no more disks, 1 = valid disk info */
31874
 
+       unsigned long   disk_handle;            /* only valid when status == 1 */
31875
 
+} evms_user_disk_t;
31876
 
+
31877
 
+#define EVMS_GET_LOGICAL_DISK_STRING    "EVMS_GET_LOGICAL_DISK"
31878
 
+#define EVMS_GET_LOGICAL_DISK           _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_NUMBER, evms_user_disk_t)
31879
 
+
31880
 
+/* flags fields described in evms_common.h */
31881
 
+typedef struct evms_user_disk_info_s {
31882
 
+       unsigned int    status;
31883
 
+       unsigned int    flags;
31884
 
+       unsigned long   disk_handle;
31885
 
+       unsigned int    disk_dev;
31886
 
+       struct hd_geometry geometry;
31887
 
+       unsigned int    block_size;
31888
 
+       unsigned int    hardsect_size;
31889
 
+       u_int64_t       total_sectors;
31890
 
+       char            disk_name[EVMS_VOLUME_NAME_SIZE];
31891
 
+} evms_user_disk_info_t;
31892
 
+
31893
 
+#define EVMS_GET_LOGICAL_DISK_INFO_STRING "EVMS_GET_LOGICAL_DISK_INFO"
31894
 
+#define EVMS_GET_LOGICAL_DISK_INFO      _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_INFO_NUMBER, evms_user_disk_info_t)
31895
 
+
31896
 
+/* field: io_flag: defines */
31897
 
+#define EVMS_SECTOR_IO_READ    0
31898
 
+#define EVMS_SECTOR_IO_WRITE   1
31899
 
+typedef struct evms_sector_io_s {
31900
 
+       unsigned long   disk_handle;            /* valid disk handle */
31901
 
+       int             io_flag;                /* 0 = READ, 1 = WRITE */
31902
 
+       evms_sector_t   starting_sector;        /* disk relative LBA */
31903
 
+       evms_sector_t   sector_count;           /* number of sectors in IO */
31904
 
+       unsigned char  *buffer_address;         /* IO address */
31905
 
+       int             status;                 /* 0 = success, not 0 = error */
31906
 
+} evms_sector_io_t;
31907
 
+
31908
 
+#define EVMS_SECTOR_IO_STRING           "EVMS_SECTOR_IO"
31909
 
+#define EVMS_SECTOR_IO                  _IOWR(EVMS_MAJOR, EVMS_SECTOR_IO_NUMBER, evms_sector_io_t)
31910
 
+
31911
 
+/* field: command: defines */
31912
 
+#define EVMS_FIRST_VOLUME       0
31913
 
+#define EVMS_NEXT_VOLUME        1
31914
 
+
31915
 
+/* field: status: defines */
31916
 
+#define EVMS_VOLUME_INVALID     0
31917
 
+#define EVMS_VOLUME_VALID       1
31918
 
+typedef struct evms_user_minor_s {
31919
 
+       int             command;                /* 0 = first volume, 1 = next volume */
31920
 
+       int             status;                 /* 0 = no more, 1 = valid info */
31921
 
+       int             minor;                  /* only valid when status == 1 */
31922
 
+} evms_user_minor_t;
31923
 
+
31924
 
+#define EVMS_GET_MINOR_STRING           "EVMS_GET_MINOR"
31925
 
+#define EVMS_GET_MINOR                  _IOWR(EVMS_MAJOR, EVMS_GET_MINOR_NUMBER, evms_user_minor_t)
31926
 
+
31927
 
+/* flags field described in evms_common.h */
31928
 
+typedef struct evms_volume_data_s {
31929
 
+       int             minor;                  /* minor of target volume */
31930
 
+       int             flags;
31931
 
+       char            volume_name[EVMS_VOLUME_NAME_SIZE + 1];
31932
 
+       int             status;
31933
 
+} evms_volume_data_t;
31934
 
+
31935
 
+#define EVMS_GET_VOLUME_DATA_STRING     "EVMS_GET_VOLUME_DATA"
31936
 
+#define EVMS_GET_VOLUME_DATA            _IOWR(EVMS_MAJOR, EVMS_GET_VOLUME_DATA_NUMBER, evms_volume_data_t)
31937
 
+
31938
 
+/* field: command: defines */
31939
 
+#define EVMS_FIRST_PLUGIN       0
31940
 
+#define EVMS_NEXT_PLUGIN        1
31941
 
+
31942
 
+/* field: status: defines */
31943
 
+#define EVMS_PLUGIN_INVALID     0
31944
 
+#define EVMS_PLUGIN_VALID       1
31945
 
+typedef struct evms_kernel_plugin_s {
31946
 
+       int             command;                /* 0 = first item, 1 = next item */
31947
 
+       u_int32_t       id;                     /* returned plugin id */
31948
 
+       evms_version_t  version;                /* maj,min,patch of plugin */
31949
 
+       int             status;                 /* 0 = no more, 1 = valid info */
31950
 
+} evms_kernel_plugin_t;
31951
 
+
31952
 
+#define EVMS_GET_PLUGIN_STRING          "EVMS_GET_PLUGIN"
31953
 
+#define EVMS_GET_PLUGIN                 _IOWR(EVMS_MAJOR, EVMS_GET_PLUGIN_NUMBER, evms_kernel_plugin_t)
31954
 
+
31955
 
+typedef struct evms_compute_csum_s {
31956
 
+       unsigned char  *buffer_address;         /* IO address */
31957
 
+       int             buffer_size;            /* byte size of buffer */
31958
 
+       unsigned int    insum;                  /* previous csum to be factored in */
31959
 
+       unsigned int    outsum;                 /* resulting csum value of buffer */
31960
 
+       int             status;                 /* 0 = success, not 0 = error */
31961
 
+} evms_compute_csum_t;
31962
 
+
31963
 
+#define EVMS_COMPUTE_CSUM_STRING        "EVMS_COMPUTE_CSUM"
31964
 
+#define EVMS_COMPUTE_CSUM               _IOWR(EVMS_MAJOR, EVMS_COMPUTE_CSUM_NUMBER, evms_compute_csum_t)
31965
 
+
31966
 
+typedef struct evms_get_bmap_s {
31967
 
+       u_int64_t       rsector;                /* input: volume relative rsector value */
31968
 
+                                               /* output: disk relative rsector value */
31969
 
+       u_int32_t       dev;                    /* output = physical device */
31970
 
+       int             status;                 /* 0 = success, not 0 = error */
31971
 
+} evms_get_bmap_t;
31972
 
+
31973
 
+#define EVMS_GET_BMAP_STRING            "EVMS_GET_BMAP"
31974
 
+#define EVMS_GET_BMAP                   _IOWR(EVMS_MAJOR, EVMS_GET_BMAP_NUMBER, evms_get_bmap_t)
31975
 
+
31976
 
+#endif
31977
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_kernel.h evms-2002-03-28/include/linux/evms/evms_kernel.h
31978
 
--- linux-2002-03-28/include/linux/evms/evms_kernel.h   Wed Dec 31 18:00:00 1969
31979
 
+++ evms-2002-03-28/include/linux/evms/evms_kernel.h    Wed May 16 13:40:56 2001
31980
 
@@ -0,0 +1,29 @@
31981
 
+/* -*- linux-c -*- */
31982
 
+/*
31983
 
+ *
31984
 
+ *   Copyright (c) International Business Machines  Corp., 2000
31985
 
+ *
31986
 
+ *   This program is free software;  you can redistribute it and/or modify
31987
 
+ *   it under the terms of the GNU General Public License as published by
31988
 
+ *   the Free Software Foundation; either version 2 of the License, or 
31989
 
+ *   (at your option) any later version.
31990
 
+ * 
31991
 
+ *   This program is distributed in the hope that it will be useful,
31992
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
31993
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
31994
 
+ *   the GNU General Public License for more details.
31995
 
+ *
31996
 
+ *   You should have received a copy of the GNU General Public License
31997
 
+ *   along with this program;  if not, write to the Free Software 
31998
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31999
 
+ */
32000
 
+/*
32001
 
+ * linux/include/linux/evms_kernel.h
32002
 
+ *
32003
 
+ * EVMS (master) kernel header file
32004
 
+ *
32005
 
+ */
32006
 
+
32007
 
+#include <linux/evms/evms_common.h>
32008
 
+#include <linux/evms/evms.h>
32009
 
+#include <linux/evms/evms_ioctl.h>
32010
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_linear.h evms-2002-03-28/include/linux/evms/evms_linear.h
32011
 
--- linux-2002-03-28/include/linux/evms/evms_linear.h   Wed Dec 31 18:00:00 1969
32012
 
+++ evms-2002-03-28/include/linux/evms/evms_linear.h    Thu Jan 10 12:51:50 2002
32013
 
@@ -0,0 +1,33 @@
32014
 
+#ifndef __EVMS_LINEAR_H
32015
 
+#define __EVMS_LINEAR_H
32016
 
+
32017
 
+#include <linux/evms/evms_md.h>
32018
 
+
32019
 
+struct dev_info {
32020
 
+       evms_logical_node_t *node;
32021
 
+       kdev_t          dev;
32022
 
+       unsigned long   size;
32023
 
+       unsigned long   offset;
32024
 
+};
32025
 
+
32026
 
+typedef struct dev_info dev_info_t;
32027
 
+
32028
 
+struct linear_hash
32029
 
+{
32030
 
+       dev_info_t *dev0, *dev1;
32031
 
+};
32032
 
+
32033
 
+struct linear_private_data
32034
 
+{
32035
 
+       struct linear_hash      *hash_table;
32036
 
+       dev_info_t              disks[MD_SB_DISKS];
32037
 
+       dev_info_t              *smallest;
32038
 
+       int                     nr_zones;
32039
 
+};
32040
 
+
32041
 
+
32042
 
+typedef struct linear_private_data linear_conf_t;
32043
 
+
32044
 
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
32045
 
+
32046
 
+#endif
32047
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_lvm.h evms-2002-03-28/include/linux/evms/evms_lvm.h
32048
 
--- linux-2002-03-28/include/linux/evms/evms_lvm.h      Wed Dec 31 18:00:00 1969
32049
 
+++ evms-2002-03-28/include/linux/evms/evms_lvm.h       Thu Mar 21 16:30:34 2002
32050
 
@@ -0,0 +1,300 @@
32051
 
+/* -*- linux-c -*- */
32052
 
+/*
32053
 
+ *   Copyright (c) International Business Machines  Corp., 2000
32054
 
+ *
32055
 
+ *   This program is free software;  you can redistribute it and/or modify
32056
 
+ *   it under the terms of the GNU General Public License as published by
32057
 
+ *   the Free Software Foundation; either version 2 of the License, or 
32058
 
+ *   (at your option) any later version.
32059
 
+ * 
32060
 
+ *   This program is distributed in the hope that it will be useful,
32061
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
32062
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
32063
 
+ *   the GNU General Public License for more details.
32064
 
+ *
32065
 
+ *   You should have received a copy of the GNU General Public License
32066
 
+ *   along with this program;  if not, write to the Free Software 
32067
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32068
 
+ */
32069
 
+/*
32070
 
+ * linux/include/linux/evms_lvm.h
32071
 
+ *
32072
 
+ * EVMS LVM VGE kernel header file
32073
 
+ */
32074
 
+
32075
 
+
32076
 
+#ifndef __EVMS_LVM_H__
32077
 
+#define __EVMS_LVM_H__
32078
 
+
32079
 
+#define EVMS_LVM_VERSION_MAJOR 1
32080
 
+#define EVMS_LVM_VERSION_MINOR 0
32081
 
+#define EVMS_LVM_VERSION_PATCH 0
32082
 
+
32083
 
+// The following definitions and data structures are copied from lvm.h and
32084
 
+// liblvm.h from the LVM 0.9.1beta8 distribution. Since the metadata format
32085
 
+// changed in beta8, lvm.h changed significantly enough that this module would
32086
 
+// no longer compile. Instead of requiring evms users to install the latest lvm 
32087
 
+// release, the required definitions and data structures will now be included
32088
 
+// in this header file.
32089
 
+
32090
 
+#ifndef        SECTOR_SIZE
32091
 
+#define SECTOR_SIZE            512
32092
 
+#endif
32093
 
+#define MAX_VG                 99
32094
 
+#define MAX_LV                 256
32095
 
+#define        MAX_PV                  256                     /* caused by 8 bit minor */
32096
 
+#define        NAME_LEN                128                     /* don't change!!! */
32097
 
+#define        UUID_LEN                32                      /* don't change!!! */
32098
 
+#define LV_SET_ACCESS           _IOW ( 0xfe, 0x28, 1)
32099
 
+#define LV_SET_ALLOCATION       _IOW ( 0xfe, 0x29, 1)
32100
 
+#define LV_SET_STATUS           _IOW ( 0xfe, 0x2a, 1)
32101
 
+#define LV_SNAPSHOT_USE_RATE    _IOWR ( 0xfe, 0x2c, 1)
32102
 
+#define        LV_BMAP                 _IOWR ( 0xfe, 0x30, 1)
32103
 
+#define LVM_VGDA_ALIGN         4096UL                  /* some metadata on the disk need to be aligned */
32104
 
+#define        LVM_PV_DISK_BASE        0L                      /* base of PV structure in disk partition */
32105
 
+#define        LVM_PV_DISK_SIZE        1024L                   /* size reserved for PV structure on disk */
32106
 
+#define        LVM_VG_DISK_BASE        round_up(LVM_PV_DISK_BASE + LVM_PV_DISK_SIZE, LVM_VGDA_ALIGN)
32107
 
+                                                       /* base of VG structure in disk partition */
32108
 
+#define        LVM_VG_DISK_SIZE        (8*512L)                /* size reserved for VG structure */
32109
 
+
32110
 
+/*
32111
 
+ * Status flags
32112
 
+ */
32113
 
+/* logical volume */
32114
 
+#define        LV_ACTIVE            0x01       /* lv_status */
32115
 
+#define        LV_READ              0x01       /* lv_access */
32116
 
+#define        LV_WRITE             0x02       /*     "     */
32117
 
+#define        LV_SNAPSHOT          0x04       /*     "     */
32118
 
+#define        LV_SNAPSHOT_ORG      0x08       /*     "     */
32119
 
+
32120
 
+/* copy on write tables in disk format */
32121
 
+typedef struct lv_COW_table_disk_v1 {
32122
 
+       uint64_t pv_org_number;
32123
 
+       uint64_t pv_org_rsector;
32124
 
+       uint64_t pv_snap_number;
32125
 
+       uint64_t pv_snap_rsector;
32126
 
+} lv_COW_table_disk_t;
32127
 
+
32128
 
+/* disk stored pe information */
32129
 
+typedef struct {
32130
 
+       uint16_t lv_num;
32131
 
+       uint16_t le_num;
32132
 
+} pe_disk_t;
32133
 
+
32134
 
+/* disk stored PV, VG, LV and PE size and offset information */
32135
 
+typedef struct {
32136
 
+       uint32_t base;
32137
 
+       uint32_t size;
32138
 
+} lvm_disk_data_t;
32139
 
+
32140
 
+/* disk */
32141
 
+typedef struct pv_disk_v2 {
32142
 
+       uint8_t id[2];          /* Identifier */
32143
 
+       uint16_t version;               /* HM lvm version */
32144
 
+       lvm_disk_data_t pv_on_disk;
32145
 
+       lvm_disk_data_t vg_on_disk;
32146
 
+       lvm_disk_data_t pv_uuidlist_on_disk;
32147
 
+       lvm_disk_data_t lv_on_disk;
32148
 
+       lvm_disk_data_t pe_on_disk;
32149
 
+       uint8_t pv_uuid[NAME_LEN];
32150
 
+       uint8_t vg_name[NAME_LEN];
32151
 
+       uint8_t system_id[NAME_LEN];    /* for vgexport/vgimport */
32152
 
+       uint32_t pv_major;
32153
 
+       uint32_t pv_number;
32154
 
+       uint32_t pv_status;
32155
 
+       uint32_t pv_allocatable;
32156
 
+       uint32_t pv_size;               /* HM */
32157
 
+       uint32_t lv_cur;
32158
 
+       uint32_t pe_size;
32159
 
+       uint32_t pe_total;
32160
 
+       uint32_t pe_allocated;
32161
 
+       
32162
 
+       /* new in struct version 2 */
32163
 
+       uint32_t pe_start;              /* in sectors */
32164
 
+
32165
 
+} pv_disk_t;
32166
 
+
32167
 
+/* disk */
32168
 
+typedef struct lv_disk_v3 {
32169
 
+       uint8_t lv_name[NAME_LEN];
32170
 
+       uint8_t vg_name[NAME_LEN];
32171
 
+       uint32_t lv_access;
32172
 
+       uint32_t lv_status;
32173
 
+       uint32_t lv_open;               /* HM */
32174
 
+       uint32_t lv_dev;                /* HM */
32175
 
+       uint32_t lv_number;     /* HM */
32176
 
+       uint32_t lv_mirror_copies;      /* for future use */
32177
 
+       uint32_t lv_recovery;   /*       "        */
32178
 
+       uint32_t lv_schedule;   /*       "        */
32179
 
+       uint32_t lv_size;
32180
 
+       uint32_t lv_snapshot_minor;/* minor number of original */
32181
 
+       uint16_t lv_chunk_size; /* chunk size of snapshot */
32182
 
+       uint16_t dummy;
32183
 
+       uint32_t lv_allocated_le;
32184
 
+       uint32_t lv_stripes;
32185
 
+       uint32_t lv_stripesize;
32186
 
+       uint32_t lv_badblock;   /* for future use */
32187
 
+       uint32_t lv_allocation;
32188
 
+       uint32_t lv_io_timeout; /* for future use */
32189
 
+       uint32_t lv_read_ahead; /* HM */
32190
 
+} lv_disk_t;
32191
 
+
32192
 
+/* disk */
32193
 
+typedef struct vg_disk_v2 {
32194
 
+       uint8_t vg_uuid[UUID_LEN];      /* volume group UUID */
32195
 
+       uint8_t vg_name_dummy[NAME_LEN-UUID_LEN];       /* rest of v1 VG name */
32196
 
+       uint32_t vg_number;     /* volume group number */
32197
 
+       uint32_t vg_access;     /* read/write */
32198
 
+       uint32_t vg_status;     /* active or not */
32199
 
+       uint32_t lv_max;                /* maximum logical volumes */
32200
 
+       uint32_t lv_cur;                /* current logical volumes */
32201
 
+       uint32_t lv_open;               /* open    logical volumes */
32202
 
+       uint32_t pv_max;                /* maximum physical volumes */
32203
 
+       uint32_t pv_cur;                /* current physical volumes FU */
32204
 
+       uint32_t pv_act;                /* active physical volumes */
32205
 
+       uint32_t dummy;
32206
 
+       uint32_t vgda;          /* volume group descriptor arrays FU */
32207
 
+       uint32_t pe_size;               /* physical extent size in sectors */
32208
 
+       uint32_t pe_total;              /* total of physical extents */
32209
 
+       uint32_t pe_allocated;  /* allocated physical extents */
32210
 
+       uint32_t pvg_total;     /* physical volume groups FU */
32211
 
+} vg_disk_t;
32212
 
+
32213
 
+/* useful inlines */
32214
 
+static inline ulong round_up(ulong n, ulong size) {
32215
 
+       size--;
32216
 
+       return (n + size) & ~size;
32217
 
+}
32218
 
+
32219
 
+static inline ulong div_up(ulong n, ulong size) {
32220
 
+       return round_up(n, size) / size;
32221
 
+}
32222
 
+
32223
 
+// End of lvm.h imported data structures
32224
 
+
32225
 
+
32226
 
+#define DEV_DIRECTORY          "/dev/"
32227
 
+#define LVM_DEV_DIRECTORY      "lvm/"
32228
 
+#define LVM_PROC_NAME          "lvm"
32229
 
+#define LVM_PROC_VG_NAME       "VGs"
32230
 
+#define LVM_PROC_LV_NAME       "LVs"
32231
 
+#define LVM_PROC_PV_NAME       "PVs"
32232
 
+#define LVM_PROC_GLOBAL_NAME   "global"
32233
 
+#define IO_BUFFER_SECTORS      8
32234
 
+
32235
 
+// Structure for doing PV remove ioctls
32236
 
+
32237
 
+#define EVMS_LVM_PV_REMOVE_IOCTL       0x01
32238
 
+#define EVMS_LVM_SNAPSHOT_STAT_IOCTL   0x02
32239
 
+
32240
 
+typedef struct lvm_pv_remove_ioctl_s {
32241
 
+       unsigned char                   vg_uuid[UUID_LEN];
32242
 
+       int                             pv_number;
32243
 
+       struct lvm_pv_remove_ioctl_s    * next;
32244
 
+} lvm_pv_remove_ioctl_t;
32245
 
+
32246
 
+
32247
 
+// Structure for doing snapshot stat ioctls
32248
 
+typedef struct lvm_snapshot_stat_ioctl_s {
32249
 
+       unsigned char   vg_uuid[UUID_LEN];
32250
 
+       int             lv_number;
32251
 
+       evms_sector_t   next_free_chunk;
32252
 
+       u_int32_t       lv_status;
32253
 
+} lvm_snapshot_stat_ioctl_t;
32254
 
+
32255
 
+
32256
 
+// Entries in the list of physical volumes (PV)
32257
 
+// in a volume group (VG)
32258
 
+typedef struct lvm_physical_volume_s {
32259
 
+       evms_logical_node_t             * logical_node;
32260
 
+       pv_disk_t                       * pv;           // Copy of on-disk PV struct
32261
 
+       pe_disk_t                       * pe_map;
32262
 
+       u_int32_t                       pv_number;
32263
 
+       struct lvm_physical_volume_s    * next;
32264
 
+} lvm_physical_volume_t;
32265
 
+
32266
 
+
32267
 
+// Table for mapping logical extents (LE) to physical extents (PE)
32268
 
+typedef struct le_table_entry_s {
32269
 
+       lvm_physical_volume_t   * owning_pv;
32270
 
+       evms_sector_t           pe_sector_offset;
32271
 
+} le_table_entry_t;
32272
 
+
32273
 
+
32274
 
+// Entries in the snapshot remapping structure
32275
 
+typedef struct snapshot_map_entry_s {
32276
 
+       evms_sector_t                   org_sector;
32277
 
+       evms_sector_t                   snap_sector;
32278
 
+       lvm_physical_volume_t           * snap_pv;
32279
 
+       struct snapshot_map_entry_s     * next;
32280
 
+       struct snapshot_map_entry_s     * prev;
32281
 
+} snapshot_map_entry_t;
32282
 
+
32283
 
+
32284
 
+// Logical volumes (LV) in a volume group (VG)
32285
 
+#define EVMS_LV_NEW            0x10    // volume was created during the current discovery pass
32286
 
+#define EVMS_LV_INCOMPLETE     0x20    // volume has an incomplete LE map
32287
 
+#define EVMS_LV_INVALID                0x40    // volume has a memory-corruption problem
32288
 
+#define EVMS_LV_QUIESCED       0x80    // volume is in quiesced state
32289
 
+#define MAX_HASH_CHAIN_ENTRIES 10
32290
 
+#define CHUNK_DATA_BUFFER_SIZE 64      // 32k in sectors. Feel free to change, but must be power of 2!
32291
 
+
32292
 
+typedef struct lvm_logical_volume_s {
32293
 
+       u_int32_t               lv_number;
32294
 
+       evms_sector_t           lv_size;        // Sectors
32295
 
+       u_int32_t               lv_access;      // Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, LV_SNAPSHOT_ORG, EVMS_LV_*
32296
 
+       u_int32_t               lv_status;      // Flags: LV_ACTIVE, LV_SPINDOWN
32297
 
+       u_int32_t               lv_minor;       // Device minor number
32298
 
+       u_int32_t               stripes;
32299
 
+       u_int32_t               stripe_size;    // Sectors
32300
 
+       u_int32_t               stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
32301
 
+       u_int32_t               pe_size;        // Sectors
32302
 
+       u_int32_t               pe_size_shift;  // Number of bits to shift right instead of dividing by pe_size
32303
 
+       u_int32_t               num_le;         // Number of entries in the le_to_pe_map
32304
 
+       struct lvm_volume_group_s * group;      // Pointer back to parent volume group
32305
 
+       unsigned char           name[NAME_LEN]; // Dev-tree volume name (eg: /dev/group0/vol0)
32306
 
+       le_table_entry_t        * le_map;       // Mapping of logical to physical extents
32307
 
+       evms_logical_node_t     * volume_node;  // Pointer to the parent EVMS node representing this volume
32308
 
+
32309
 
+       // Snapshotting information
32310
 
+       u_int32_t               chunk_size;             // Sectors
32311
 
+       u_int32_t               num_chunks;             // lv_size/chunk_size
32312
 
+       u_int32_t               snap_org_minor;         // Minor number of snapshot original
32313
 
+       u_int32_t               next_cow_entry;         // Index into current COW table
32314
 
+       evms_sector_t           current_cow_sector;     // LOGICAL sector of current COW table
32315
 
+       evms_sector_t           next_free_chunk;        // Starting LOGICAL sector of next free chunk
32316
 
+       u_int32_t               hash_table_size;        // Number of pointers in each hash table
32317
 
+       lv_COW_table_disk_t     * cow_table;            // Pointer to one sector's worth of COW tables
32318
 
+       unsigned char           * chunk_data_buffer;    // Buffer for reading data when doing a copy-on-write
32319
 
+       struct semaphore        snap_semaphore;         // For locking during snapshot I/O operations
32320
 
+       snapshot_map_entry_t    *** snapshot_map;       // Pointer to the remapping hash tables
32321
 
+       struct lvm_logical_volume_s * snapshot_next;    // Linked list of volumes snapshotting the original
32322
 
+       struct lvm_logical_volume_s * snapshot_org;     // Pointer to volume being snapshotted
32323
 
+} lvm_logical_volume_t;
32324
 
+
32325
 
+
32326
 
+// Volume groups (VG)
32327
 
+
32328
 
+#define EVMS_VG_DIRTY                  (1 << 0)        // group is new or has had a PV added during this discovery
32329
 
+#define EVMS_VG_PARTIAL_PVS            (1 << 1)        // group contains at least one partial PV.
32330
 
+#define EVMS_VG_REMOVABLE_PVS          (1 << 2)        // group contains at least one removeable PV.
32331
 
+
32332
 
+typedef struct lvm_volume_group_s {
32333
 
+       vg_disk_t               * vg;                   // Copy of on-disk VG metadata
32334
 
+       lvm_physical_volume_t   * pv_list;              // List of PVs that make up this group
32335
 
+       lvm_logical_volume_t    * volume_list[MAX_LV+1]; // Array of volumes
32336
 
+       lv_disk_t               * lv_array;             // Array of LV metadata
32337
 
+       unsigned char           * uuid_list;            // List of PV UUIDs
32338
 
+       unsigned char           vg_uuid[UUID_LEN];      // UUID from the VG metadata
32339
 
+       char                    vg_name[NAME_LEN];      // Name from the PV metadata
32340
 
+       u_int32_t               pv_count;               // Number of PVs found in this group
32341
 
+       u_int32_t               volume_count;           // Number of LVs found in this group
32342
 
+       int                     hard_sect_size;         // The largest hard_sect_size and block_size
32343
 
+       int                     block_size;             //   values of all PVs in this group.
32344
 
+       u_int32_t               flags;                  // EVMS_VG_?
32345
 
+       struct lvm_volume_group_s * next_group;
32346
 
+} lvm_volume_group_t;
32347
 
+
32348
 
+
32349
 
+#endif
32350
 
+
32351
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_md.h evms-2002-03-28/include/linux/evms/evms_md.h
32352
 
--- linux-2002-03-28/include/linux/evms/evms_md.h       Wed Dec 31 18:00:00 1969
32353
 
+++ evms-2002-03-28/include/linux/evms/evms_md.h        Thu Mar 14 17:01:39 2002
32354
 
@@ -0,0 +1,107 @@
32355
 
+/*
32356
 
+ *   Copyright (c) International Business Machines  Corp., 2000
32357
 
+ *
32358
 
+ *   This program is free software;  you can redistribute it and/or modify
32359
 
+ *   it under the terms of the GNU General Public License as published by
32360
 
+ *   the Free Software Foundation; either version 2 of the License, or
32361
 
+ *   (at your option) any later version.
32362
 
+ *
32363
 
+ *   This program is distributed in the hope that it will be useful,
32364
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
32365
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
32366
 
+ *   the GNU General Public License for more details.
32367
 
+ *
32368
 
+ *   You should have received a copy of the GNU General Public License
32369
 
+ *   along with this program;  if not, write to the Free Software
32370
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32371
 
+ *
32372
 
+ * linux/include/linux/evms/evms_md.h
32373
 
+ *
32374
 
+ * EVMS Linux MD Region Manager Public Header File
32375
 
+ *
32376
 
+ * 'evms_md.h' is an EVMS version of linux/include/linux/raid/md.h modified
32377
 
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
32378
 
+ *
32379
 
+ */
32380
 
+
32381
 
+#ifndef __EVMS_MD_INCLUDED
32382
 
+#define __EVMS_MD_INCLUDED
32383
 
+
32384
 
+#include <linux/mm.h>
32385
 
+#include <linux/fs.h>
32386
 
+#include <linux/blkdev.h>
32387
 
+#include <asm/semaphore.h>
32388
 
+#include <linux/major.h>
32389
 
+#include <linux/ioctl.h>
32390
 
+#include <linux/types.h>
32391
 
+#include <asm/bitops.h>
32392
 
+#include <linux/module.h>
32393
 
+#include <linux/hdreg.h>
32394
 
+#include <linux/proc_fs.h>
32395
 
+#include <linux/smp_lock.h>
32396
 
+#include <linux/delay.h>
32397
 
+#include <net/checksum.h>
32398
 
+#include <linux/random.h>
32399
 
+#include <linux/locks.h>
32400
 
+#include <linux/kernel_stat.h>
32401
 
+#include <asm/io.h>
32402
 
+#include <linux/completion.h>
32403
 
+
32404
 
+#include <linux/evms/evms_kernel.h>
32405
 
+
32406
 
+#include <linux/raid/md_compatible.h>
32407
 
+/*
32408
 
+ * 'md_p.h' holds the 'physical' layout of RAID devices
32409
 
+ * 'md_u.h' holds the user <=> kernel API
32410
 
+ *
32411
 
+ * 'md_k.h' holds kernel internal definitions
32412
 
+ */
32413
 
+
32414
 
+#include <linux/evms/evms_md_p.h>
32415
 
+#include <linux/evms/evms_md_u.h>
32416
 
+#include <linux/evms/evms_md_k.h>
32417
 
+
32418
 
+#ifndef MAX_READAHEAD  /* The following #defines were removed as of 2.4.16 kernel */
32419
 
+
32420
 
+#define MAX_READAHEAD  31
32421
 
+#define MIN_READAHEAD  3
32422
 
+
32423
 
+#endif
32424
 
+
32425
 
+/*
32426
 
+ * Different major versions are not compatible.
32427
 
+ * Different minor versions are only downward compatible.
32428
 
+ * Different patchlevel versions are downward and upward compatible.
32429
 
+ */
32430
 
+#define MD_MAJOR_VERSION                0
32431
 
+#define MD_MINOR_VERSION                90
32432
 
+#define MD_PATCHLEVEL_VERSION           0
32433
 
+
32434
 
+#define EVMS_MD_COMMON_SERVICES_MAJOR          0
32435
 
+#define EVMS_MD_COMMON_SERVICES_MINOR          5
32436
 
+#define EVMS_MD_COMMON_SERVICES_PATCHLEVEL     0
32437
 
+
32438
 
+
32439
 
+extern int evms_md_size[MAX_MD_DEVS];
32440
 
+
32441
 
+extern void evms_md_add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
32442
 
+extern void evms_md_del_mddev_mapping (mddev_t *mddev, kdev_t dev);
32443
 
+extern char * evms_md_partition_name (evms_logical_node_t *node);
32444
 
+extern int evms_register_md_personality (int p_num, mdk_personality_t *p);
32445
 
+extern int evms_unregister_md_personality (int p_num);
32446
 
+
32447
 
+extern int evms_md_update_sb (mddev_t *mddev);
32448
 
+extern int evms_md_check_ordering (mddev_t *mddev);
32449
 
+extern void evms_md_print_devices (void);
32450
 
+
32451
 
+extern int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
32452
 
+extern void evms_md_done_sync(mddev_t *mddev, int blocks, int ok);
32453
 
+extern void evms_md_sync_acct(kdev_t dev, unsigned long nr_sectors);
32454
 
+extern void evms_md_recover_arrays (void);
32455
 
+extern int evms_md_error (mddev_t *mddev, evms_logical_node_t *node);
32456
 
+
32457
 
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); evms_md_print_devices(); }
32458
 
+
32459
 
+
32460
 
+#endif 
32461
 
+
32462
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_k.h evms-2002-03-28/include/linux/evms/evms_md_k.h
32463
 
--- linux-2002-03-28/include/linux/evms/evms_md_k.h     Wed Dec 31 18:00:00 1969
32464
 
+++ evms-2002-03-28/include/linux/evms/evms_md_k.h      Mon Mar 11 22:58:16 2002
32465
 
@@ -0,0 +1,419 @@
32466
 
+/*
32467
 
+ *   Copyright (c) International Business Machines  Corp., 2000
32468
 
+ *
32469
 
+ *   This program is free software;  you can redistribute it and/or modify
32470
 
+ *   it under the terms of the GNU General Public License as published by
32471
 
+ *   the Free Software Foundation; either version 2 of the License, or
32472
 
+ *   (at your option) any later version.
32473
 
+ *
32474
 
+ *   This program is distributed in the hope that it will be useful,
32475
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
32476
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
32477
 
+ *   the GNU General Public License for more details.
32478
 
+ *
32479
 
+ *   You should have received a copy of the GNU General Public License
32480
 
+ *   along with this program;  if not, write to the Free Software
32481
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32482
 
+ */
32483
 
+/*
32484
 
+ * linux/include/linux/evms/evms_md_k.h
32485
 
+ *
32486
 
+ * EVMS Linux MD Region Manager Public Header File
32487
 
+ *
32488
 
+ * 'evms_md_k.h' is an EVMS version of linux/include/linux/raid/md_k.h modified
32489
 
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
32490
 
+ *
32491
 
+ */
32492
 
+
32493
 
+#ifndef __EVMS_MD_K_INC__
32494
 
+#define __EVMS_MD_K_INC__
32495
 
+
32496
 
+#define MD_RESERVED       0UL
32497
 
+#define LINEAR            1UL
32498
 
+#define RAID0             2UL
32499
 
+#define RAID1             3UL
32500
 
+#define RAID5             4UL
32501
 
+#define TRANSLUCENT       5UL
32502
 
+#define HSM               6UL
32503
 
+#define MULTIPATH         7UL
32504
 
+#define MAX_PERSONALITY   8UL
32505
 
+
32506
 
+static inline int pers_to_level (int pers)
32507
 
+{
32508
 
+       switch (pers) {
32509
 
+               case MULTIPATH:         return -4;
32510
 
+               case HSM:               return -3;
32511
 
+               case TRANSLUCENT:       return -2;
32512
 
+               case LINEAR:            return -1;
32513
 
+               case RAID0:             return 0;
32514
 
+               case RAID1:             return 1;
32515
 
+               case RAID5:             return 5;
32516
 
+       }
32517
 
+       BUG();
32518
 
+       return MD_RESERVED;
32519
 
+}
32520
 
+
32521
 
+static inline int level_to_pers (int level)
32522
 
+{
32523
 
+       switch (level) {
32524
 
+               case -3: return HSM;
32525
 
+               case -2: return TRANSLUCENT;
32526
 
+               case -1: return LINEAR;
32527
 
+               case 0: return RAID0;
32528
 
+               case 1: return RAID1;
32529
 
+               case 4:
32530
 
+               case 5: return RAID5;
32531
 
+       }
32532
 
+       return MD_RESERVED;
32533
 
+}
32534
 
+
32535
 
+typedef struct mddev_s mddev_t;
32536
 
+typedef struct mdk_rdev_s mdk_rdev_t;
32537
 
+
32538
 
+#if (MINORBITS != 8)
32539
 
+#error MD doesnt handle bigger kdev yet
32540
 
+#endif
32541
 
+
32542
 
+#define MAX_MD_DEVS  (1<<MINORBITS)    /* Max number of md dev */
32543
 
+
32544
 
+/*
32545
 
+ * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
32546
 
+ * the personality. (eg. HSM uses this to identify individual LVs)
32547
 
+ */
32548
 
+typedef struct dev_mapping_s {
32549
 
+       mddev_t *mddev;
32550
 
+       void *data;
32551
 
+} dev_mapping_t;
32552
 
+
32553
 
+
32554
 
+extern dev_mapping_t evms_mddev_map [MAX_MD_DEVS];
32555
 
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
32556
 
+{
32557
 
+       if (MAJOR(dev) != MD_MAJOR)
32558
 
+               BUG();
32559
 
+        return evms_mddev_map[MINOR(dev)].mddev;
32560
 
+}
32561
 
+
32562
 
+/*
32563
 
+ * options passed in raidrun:
32564
 
+ */
32565
 
+
32566
 
+#define MAX_CHUNK_SIZE (4096*1024)
32567
 
+
32568
 
+/*
32569
 
+ * default readahead
32570
 
+ */
32571
 
+#define MD_READAHEAD   MAX_READAHEAD
32572
 
+
32573
 
+static inline int disk_faulty(mdp_disk_t * d)
32574
 
+{
32575
 
+       return d->state & (1 << MD_DISK_FAULTY);
32576
 
+}
32577
 
+
32578
 
+static inline int disk_active(mdp_disk_t * d)
32579
 
+{
32580
 
+       return d->state & (1 << MD_DISK_ACTIVE);
32581
 
+}
32582
 
+
32583
 
+static inline int disk_sync(mdp_disk_t * d)
32584
 
+{
32585
 
+       return d->state & (1 << MD_DISK_SYNC);
32586
 
+}
32587
 
+
32588
 
+static inline int disk_spare(mdp_disk_t * d)
32589
 
+{
32590
 
+       return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
32591
 
+}
32592
 
+
32593
 
+static inline int disk_removed(mdp_disk_t * d)
32594
 
+{
32595
 
+       return d->state & (1 << MD_DISK_REMOVED);
32596
 
+}
32597
 
+
32598
 
+static inline void mark_disk_faulty(mdp_disk_t * d)
32599
 
+{
32600
 
+       d->state |= (1 << MD_DISK_FAULTY);
32601
 
+}
32602
 
+
32603
 
+static inline void mark_disk_active(mdp_disk_t * d)
32604
 
+{
32605
 
+       d->state |= (1 << MD_DISK_ACTIVE);
32606
 
+       d->state &= ~(1 << MD_DISK_PENDING_ACTIVE);
32607
 
+}
32608
 
+
32609
 
+static inline void mark_disk_sync(mdp_disk_t * d)
32610
 
+{
32611
 
+       d->state |= (1 << MD_DISK_SYNC);
32612
 
+}
32613
 
+
32614
 
+static inline void mark_disk_spare(mdp_disk_t * d)
32615
 
+{
32616
 
+       d->state = 0;
32617
 
+}
32618
 
+
32619
 
+static inline void mark_disk_removed(mdp_disk_t * d)
32620
 
+{
32621
 
+       d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
32622
 
+}
32623
 
+
32624
 
+static inline void mark_disk_inactive(mdp_disk_t * d)
32625
 
+{
32626
 
+       d->state &= ~(1 << MD_DISK_ACTIVE);
32627
 
+}
32628
 
+
32629
 
+static inline void mark_disk_nonsync(mdp_disk_t * d)
32630
 
+{
32631
 
+       d->state &= ~(1 << MD_DISK_SYNC);
32632
 
+}
32633
 
+
32634
 
+/*
32635
 
+ * MD's 'extended' device
32636
 
+ */
32637
 
+struct mdk_rdev_s
32638
 
+{
32639
 
+       struct md_list_head same_set;   /* RAID devices within the same set */
32640
 
+       struct md_list_head all;        /* all RAID devices */
32641
 
+       struct md_list_head pending;    /* undetected RAID devices */
32642
 
+       evms_logical_node_t *node;      /* EVMS device node */
32643
 
+       kdev_t dev;                     /* Device number */
32644
 
+       kdev_t old_dev;                 /*  "" when it was last imported */
32645
 
+       unsigned long size;             /* Device size (in blocks) */
32646
 
+       mddev_t *mddev;                 /* RAID array if running */
32647
 
+       unsigned long last_events;      /* IO event timestamp */
32648
 
+
32649
 
+       struct block_device *bdev;      /* block device handle */
32650
 
+
32651
 
+       mdp_super_t *sb;
32652
 
+       unsigned long sb_offset;        /* in blocks */
32653
 
+
32654
 
+       int virtual_spare;              /* "virtual" spare added via IOCTL */
32655
 
+       int alias_device;               /* device alias to the same disk */
32656
 
+       int faulty;                     /* if faulty do not issue IO requests */
32657
 
+       int desc_nr;                    /* descriptor index in the superblock */
32658
 
+};
32659
 
+
32660
 
+
32661
 
+/*
32662
 
+ * disk operations in a working array:
32663
 
+ */
32664
 
+#define DISKOP_SPARE_INACTIVE          0
32665
 
+#define DISKOP_SPARE_WRITE             1
32666
 
+#define DISKOP_SPARE_ACTIVE            2
32667
 
+#define DISKOP_HOT_SPARE_ACTIVE                3
32668
 
+#define DISKOP_HOT_REMOVE_SPARE                4
32669
 
+#define DISKOP_HOT_REMOVE_DISK         5
32670
 
+#define DISKOP_HOT_ADD_DISK            6
32671
 
+#define DISKOP_HOT_DEACTIVATE_DISK     7
32672
 
+
32673
 
+typedef struct mdk_personality_s mdk_personality_t;
32674
 
+
32675
 
+#define EVMS_MD_INCOMPLETE             (1<<0)
32676
 
+
32677
 
+struct mddev_s
32678
 
+{
32679
 
+       void                            *private;
32680
 
+       mdk_personality_t               *pers;
32681
 
+       evms_logical_node_t             *node;          /* evms node */
32682
 
+       unsigned long                   flag;
32683
 
+       int                             nr_raid_disks;
32684
 
+       int                             __minor;
32685
 
+       mdp_super_t                     *sb;
32686
 
+       int                             nb_dev;
32687
 
+       struct md_list_head             disks;
32688
 
+       int                             sb_dirty;
32689
 
+       mdu_param_t                     param;
32690
 
+       int                             ro;
32691
 
+       unsigned long                   curr_resync;    /* blocks scheduled */
32692
 
+       unsigned long                   resync_mark;    /* a recent timestamp */
32693
 
+       unsigned long                   resync_mark_cnt;/* blocks written at resync_mark */
32694
 
+       char                            *name;
32695
 
+       int                             recovery_running;
32696
 
+       struct semaphore                reconfig_sem;
32697
 
+       struct semaphore                recovery_sem;
32698
 
+       struct semaphore                resync_sem;
32699
 
+       atomic_t                        active;
32700
 
+
32701
 
+       atomic_t                        recovery_active; /* blocks scheduled, but not written */
32702
 
+       md_wait_queue_head_t            recovery_wait;
32703
 
+
32704
 
+       struct md_list_head             all_mddevs;
32705
 
+};
32706
 
+
32707
 
+struct mdk_personality_s
32708
 
+{
32709
 
+       char *name;
32710
 
+       int  (* init_io) (mddev_t *mddev, int rw, evms_sector_t LSN, evms_sector_t nr_sects, void *data);
32711
 
+       int (*make_request)(mddev_t *mddev, int rw, eio_t *eio);
32712
 
+       int (*run)(mddev_t *mddev);
32713
 
+       int (*stop)(mddev_t *mddev);
32714
 
+       int (*status)(char *page, mddev_t *mddev);
32715
 
+       int (*error_handler)(mddev_t *mddev, evms_logical_node_t *node);
32716
 
+
32717
 
+/*
32718
 
+ * Some personalities (RAID-1, RAID-5) can have disks hot-added and
32719
 
+ * hot-removed. Hot removal is different from failure. (failure marks
32720
 
+ * a disk inactive, but the disk is still part of the array) The interface
32721
 
+ * to such operations is the 'pers->diskop()' function, can be NULL.
32722
 
+ *
32723
 
+ * the diskop function can change the pointer pointing to the incoming
32724
 
+ * descriptor, but must do so very carefully. (currently only
32725
 
+ * SPARE_ACTIVE expects such a change)
32726
 
+ */
32727
 
+       int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
32728
 
+
32729
 
+       int (*stop_resync)(mddev_t *mddev);
32730
 
+       int (*restart_resync)(mddev_t *mddev);
32731
 
+       int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
32732
 
+       int (*evms_ioctl)(mddev_t *mddev, struct inode *inode, struct file *file,
32733
 
+                         unsigned int cmd, unsigned long arg);
32734
 
+       int (*md_pers_ioctl)(mddev_t *mddev, int cmd, void* pers_arg);
32735
 
+};
32736
 
+
32737
 
+/* This structure is required for activating a spare device */
32738
 
+typedef struct evms_md_activate_spare_s {
32739
 
+       struct evms_md_activate_spare_s *next;          /* next entry */
32740
 
+       mddev_t                         *mddev;         /* target mddev */
32741
 
+       mdp_disk_t                      *spare;         /* spare to activate */
32742
 
+} evms_md_activate_spare_t;
32743
 
+
32744
 
+/*
32745
 
+ * Currently we index md_array directly, based on the minor
32746
 
+ * number. This will have to change to dynamic allocation
32747
 
+ * once we start supporting partitioning of md devices.
32748
 
+ */
32749
 
+static inline int mdidx (mddev_t * mddev)
32750
 
+{
32751
 
+       return mddev->__minor;
32752
 
+}
32753
 
+
32754
 
+static inline kdev_t mddev_to_kdev(mddev_t * mddev)
32755
 
+{
32756
 
+       return MKDEV(MD_MAJOR, mdidx(mddev));
32757
 
+}
32758
 
+
32759
 
+extern mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev);
32760
 
+extern mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr);
32761
 
+extern mdp_disk_t *get_spare(mddev_t *mddev);
32762
 
+
32763
 
+/*
32764
 
+ * iterates through some rdev ringlist. It's safe to remove the
32765
 
+ * current 'rdev'. Dont touch 'tmp' though.
32766
 
+ */
32767
 
+#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp)                      \
32768
 
+                                                                       \
32769
 
+       for (tmp = head.next;                                           \
32770
 
+               rdev = md_list_entry(tmp, mdk_rdev_t, field),           \
32771
 
+                       tmp = tmp->next, tmp->prev != &head             \
32772
 
+               ; )
32773
 
+/*
32774
 
+ * iterates through the 'same array disks' ringlist
32775
 
+ */
32776
 
+#define ITERATE_RDEV(mddev,rdev,tmp)                                   \
32777
 
+       ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
32778
 
+
32779
 
+/*
32780
 
+ * Same as above, but assumes that the device has rdev->desc_nr numbered
32781
 
+ * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
32782
 
+ */
32783
 
+#define ITERATE_RDEV_ORDERED(mddev,rdev,i)                             \
32784
 
+       for (i = 0; rdev = evms_md_find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
32785
 
+
32786
 
+
32787
 
+/*
32788
 
+ * Iterates through all 'RAID managed disks'
32789
 
+ */
32790
 
+#define ITERATE_RDEV_ALL(rdev,tmp)                                     \
32791
 
+       ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
32792
 
+
32793
 
+/*
32794
 
+ * Iterates through 'pending RAID disks'
32795
 
+ */
32796
 
+#define ITERATE_RDEV_PENDING(rdev,tmp)                                 \
32797
 
+       ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
32798
 
+
32799
 
+/*
32800
 
+ * iterates through all used mddevs in the system.
32801
 
+ */
32802
 
+#define ITERATE_MDDEV(mddev,tmp)                                       \
32803
 
+                                                                       \
32804
 
+       for (tmp = all_mddevs.next;                                     \
32805
 
+               mddev = md_list_entry(tmp, mddev_t, all_mddevs),        \
32806
 
+                       tmp = tmp->next, tmp->prev != &all_mddevs       \
32807
 
+               ; )
32808
 
+
32809
 
+static inline int lock_mddev (mddev_t * mddev)
32810
 
+{
32811
 
+       return down_interruptible(&mddev->reconfig_sem);
32812
 
+}
32813
 
+
32814
 
+static inline void unlock_mddev (mddev_t * mddev)
32815
 
+{
32816
 
+       up(&mddev->reconfig_sem);
32817
 
+}
32818
 
+
32819
 
+#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
32820
 
+                               x = y; y = __tmp; } while (0)
32821
 
+
32822
 
+#define MAX_DISKNAME_LEN 64
32823
 
+
32824
 
+typedef struct dev_name_s {
32825
 
+       struct md_list_head list;
32826
 
+       kdev_t dev;
32827
 
+       char namebuf [MAX_DISKNAME_LEN];
32828
 
+       char *name;
32829
 
+} dev_name_t;
32830
 
+
32831
 
+
32832
 
+#define __wait_event_lock_irq(wq, condition, lock)                     \
32833
 
+do {                                                                   \
32834
 
+       wait_queue_t __wait;                                            \
32835
 
+       init_waitqueue_entry(&__wait, current);                         \
32836
 
+                                                                       \
32837
 
+       add_wait_queue(&wq, &__wait);                                   \
32838
 
+       for (;;) {                                                      \
32839
 
+               set_current_state(TASK_UNINTERRUPTIBLE);                \
32840
 
+               if (condition)                                          \
32841
 
+                       break;                                          \
32842
 
+               spin_unlock_irq(&lock);                                 \
32843
 
+               run_task_queue(&tq_disk);                               \
32844
 
+               schedule();                                             \
32845
 
+               spin_lock_irq(&lock);                                   \
32846
 
+       }                                                               \
32847
 
+       current->state = TASK_RUNNING;                                  \
32848
 
+       remove_wait_queue(&wq, &__wait);                                \
32849
 
+} while (0)
32850
 
+
32851
 
+#define wait_event_lock_irq(wq, condition, lock)                       \
32852
 
+do {                                                                   \
32853
 
+       if (condition)                                                  \
32854
 
+               break;                                                  \
32855
 
+       __wait_event_lock_irq(wq, condition, lock);                     \
32856
 
+} while (0)
32857
 
+
32858
 
+
32859
 
+#define __wait_disk_event(wq, condition)                               \
32860
 
+do {                                                                   \
32861
 
+       wait_queue_t __wait;                                            \
32862
 
+       init_waitqueue_entry(&__wait, current);                         \
32863
 
+                                                                       \
32864
 
+       add_wait_queue(&wq, &__wait);                                   \
32865
 
+       for (;;) {                                                      \
32866
 
+               set_current_state(TASK_UNINTERRUPTIBLE);                \
32867
 
+               if (condition)                                          \
32868
 
+                       break;                                          \
32869
 
+               run_task_queue(&tq_disk);                               \
32870
 
+               schedule();                                             \
32871
 
+       }                                                               \
32872
 
+       current->state = TASK_RUNNING;                                  \
32873
 
+       remove_wait_queue(&wq, &__wait);                                \
32874
 
+} while (0)
32875
 
+
32876
 
+#define wait_disk_event(wq, condition)                                         \
32877
 
+do {                                                                   \
32878
 
+       if (condition)                                                  \
32879
 
+               break;                                                  \
32880
 
+       __wait_disk_event(wq, condition);                               \
32881
 
+} while (0)
32882
 
+
32883
 
+#endif
32884
 
+
32885
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_p.h evms-2002-03-28/include/linux/evms/evms_md_p.h
32886
 
--- linux-2002-03-28/include/linux/evms/evms_md_p.h     Wed Dec 31 18:00:00 1969
32887
 
+++ evms-2002-03-28/include/linux/evms/evms_md_p.h      Tue Mar 26 18:58:57 2002
32888
 
@@ -0,0 +1,197 @@
32889
 
+/*
32890
 
+ *   Copyright (c) International Business Machines  Corp., 2000
32891
 
+ *
32892
 
+ *   This program is free software;  you can redistribute it and/or modify
32893
 
+ *   it under the terms of the GNU General Public License as published by
32894
 
+ *   the Free Software Foundation; either version 2 of the License, or
32895
 
+ *   (at your option) any later version.
32896
 
+ *
32897
 
+ *   This program is distributed in the hope that it will be useful,
32898
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
32899
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
32900
 
+ *   the GNU General Public License for more details.
32901
 
+ *
32902
 
+ *   You should have received a copy of the GNU General Public License
32903
 
+ *   along with this program;  if not, write to the Free Software
32904
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32905
 
+ */
32906
 
+/*
32907
 
+ * linux/include/linux/evms/evms_md_p.h
32908
 
+ *
32909
 
+ * EVMS Linux MD Region Manager Public Header File
32910
 
+ *
32911
 
+ * 'evms_md_p.h' is an EVMS version of linux/include/linux/raid/md_p.h modified
32912
 
+ * by Cuong (Mike) Tran <miketran@us.ibm.com>, March 2002.
32913
 
+ *
32914
 
+ */
32915
 
+
32916
 
+#ifndef __EVMS_MD_P_INC__
32917
 
+#define __EVMS_MD_P_INC__
32918
 
+
32919
 
+/*
32920
 
+ * RAID superblock.
32921
 
+ *
32922
 
+ * The RAID superblock maintains some statistics on each RAID configuration.
32923
 
+ * Each real device in the RAID set contains it near the end of the device.
32924
 
+ * Some of the ideas are copied from the ext2fs implementation.
32925
 
+ *
32926
 
+ * We currently use 4096 bytes as follows:
32927
 
+ *
32928
 
+ *     word offset     function
32929
 
+ *
32930
 
+ *        0  -    31   Constant generic RAID device information.
32931
 
+ *        32  -    63   Generic state information.
32932
 
+ *       64  -   127   Personality specific information.
32933
 
+ *      128  -   511   12 32-words descriptors of the disks in the raid set.
32934
 
+ *      512  -   911   Reserved.
32935
 
+ *      912  -  1023   Disk specific descriptor.
32936
 
+ */
32937
 
+
32938
 
+/*
32939
 
+ * If x is the real device size in bytes, we return an apparent size of:
32940
 
+ *
32941
 
+ *     y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
32942
 
+ *
32943
 
+ * and place the 4kB superblock at offset y.
32944
 
+ */
32945
 
+#define MD_RESERVED_BYTES              (64 * 1024)
32946
 
+#define MD_RESERVED_SECTORS            (MD_RESERVED_BYTES / 512)
32947
 
+#define MD_RESERVED_BLOCKS             (MD_RESERVED_BYTES / BLOCK_SIZE)
32948
 
+
32949
 
+#define MD_NEW_SIZE_SECTORS(x)         ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
32950
 
+#define MD_NEW_SIZE_BLOCKS(x)          ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
32951
 
+
32952
 
+#define MD_SB_BYTES                    4096
32953
 
+#define MD_SB_WORDS                    (MD_SB_BYTES / 4)
32954
 
+#define MD_SB_BLOCKS                   (MD_SB_BYTES / BLOCK_SIZE)
32955
 
+#define MD_SB_SECTORS                  (MD_SB_BYTES / 512)
32956
 
+
32957
 
+/*
32958
 
+ * The following are counted in 32-bit words
32959
 
+ */
32960
 
+#define        MD_SB_GENERIC_OFFSET            0
32961
 
+#define MD_SB_PERSONALITY_OFFSET       64
32962
 
+#define MD_SB_DISKS_OFFSET             128
32963
 
+#define MD_SB_DESCRIPTOR_OFFSET                992
32964
 
+
32965
 
+#define MD_SB_GENERIC_CONSTANT_WORDS   32
32966
 
+#define MD_SB_GENERIC_STATE_WORDS      32
32967
 
+#define MD_SB_GENERIC_WORDS            (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
32968
 
+#define MD_SB_PERSONALITY_WORDS                64
32969
 
+#define MD_SB_DESCRIPTOR_WORDS         32
32970
 
+#define MD_SB_DISKS                    27
32971
 
+#define MD_SB_DISKS_WORDS              (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
32972
 
+#define MD_SB_RESERVED_WORDS           (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
32973
 
+#define MD_SB_EQUAL_WORDS              (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
32974
 
+
32975
 
+/*
32976
 
+ * Device "operational" state bits
32977
 
+ */
32978
 
+#define MD_DISK_FAULTY         0 /* disk is faulty / operational */
32979
 
+#define MD_DISK_ACTIVE         1 /* disk is running or spare disk */
32980
 
+#define MD_DISK_SYNC           2 /* disk is in sync with the raid set */
32981
 
+#define MD_DISK_REMOVED                3 /* disk has kind of been removed, but not really or it would not be here */
32982
 
+#define MD_DISK_NEW            4 /* disk has just been added to the raid set */
32983
 
+#define MD_DISK_PENDING_ACTIVE 5 /* disk was spare, but should be activated */
32984
 
+
32985
 
+typedef struct mdp_device_descriptor_s {
32986
 
+       __u32 number;           /* 0 Device number in the entire set          */
32987
 
+       __u32 major;            /* 1 Device major number                      */
32988
 
+       __u32 minor;            /* 2 Device minor number                      */
32989
 
+       __u32 raid_disk;        /* 3 The role of the device in the raid set   */
32990
 
+       __u32 state;            /* 4 Operational state                        */
32991
 
+       __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
32992
 
+} mdp_disk_t;
32993
 
+
32994
 
+#define MD_SB_MAGIC            0xa92b4efc
32995
 
+
32996
 
+/*
32997
 
+ * Superblock state bits
32998
 
+ */
32999
 
+#define MD_SB_CLEAN            0
33000
 
+#define MD_SB_ERRORS           1
33001
 
+
33002
 
+typedef struct mdp_superblock_s {
33003
 
+       /*
33004
 
+        * Constant generic information
33005
 
+        */
33006
 
+       __u32 md_magic;         /*  0 MD identifier                           */
33007
 
+       __u32 major_version;    /*  1 major version to which the set conforms */
33008
 
+       __u32 minor_version;    /*  2 minor version ...                       */
33009
 
+       __u32 patch_version;    /*  3 patchlevel version ...                  */
33010
 
+       __u32 gvalid_words;     /*  4 Number of used words in this section    */
33011
 
+       __u32 set_uuid0;        /*  5 Raid set identifier                     */
33012
 
+       __u32 ctime;            /*  6 Creation time                           */
33013
 
+       __u32 level;            /*  7 Raid personality                        */
33014
 
+       __u32 size;             /*  8 Apparent size of each individual disk   */
33015
 
+       __u32 nr_disks;         /*  9 total disks in the raid set             */
33016
 
+       __u32 raid_disks;       /* 10 disks in a fully functional raid set    */
33017
 
+       __u32 md_minor;         /* 11 preferred MD minor device number        */
33018
 
+       __u32 not_persistent;   /* 12 does it have a persistent superblock    */
33019
 
+       __u32 set_uuid1;        /* 13 Raid set identifier #2                  */
33020
 
+       __u32 set_uuid2;        /* 14 Raid set identifier #3                  */
33021
 
+       __u32 set_uuid3;        /* 15 Raid set identifier #4                  */
33022
 
+       __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
33023
 
+
33024
 
+       /*
33025
 
+        * Generic state information
33026
 
+        */
33027
 
+       __u32 utime;            /*  0 Superblock update time                  */
33028
 
+       __u32 state;            /*  1 State bits (clean, ...)                 */
33029
 
+       __u32 active_disks;     /*  2 Number of currently active disks        */
33030
 
+       __u32 working_disks;    /*  3 Number of working disks                 */
33031
 
+       __u32 failed_disks;     /*  4 Number of failed disks                  */
33032
 
+       __u32 spare_disks;      /*  5 Number of spare disks                   */
33033
 
+       __u32 sb_csum;          /*  6 checksum of the whole superblock        */
33034
 
+#ifdef __KERNEL__
33035
 
+#ifdef __BIG_ENDIAN
33036
 
+       __u32 events_hi;        /*  7 high-order of superblock update count   */
33037
 
+       __u32 events_lo;        /*  8 low-order of superblock update count    */
33038
 
+#else
33039
 
+       __u32 events_lo;        /*  7 low-order of superblock update count    */
33040
 
+       __u32 events_hi;        /*  8 high-order of superblock update count   */
33041
 
+#endif
33042
 
+#else   
33043
 
+#if __BYTE_ORDER == __BIG_ENDIAN
33044
 
+       __u32 events_hi;        /*  7 high-order of superblock update count   */
33045
 
+       __u32 events_lo;        /*  8 low-order of superblock update count    */
33046
 
+#else
33047
 
+       __u32 events_lo;        /*  7 low-order of superblock update count    */
33048
 
+       __u32 events_hi;        /*  8 high-order of superblock update count   */
33049
 
+#endif
33050
 
+#endif
33051
 
+       __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
33052
 
+
33053
 
+       /*
33054
 
+        * Personality information
33055
 
+        */
33056
 
+       __u32 layout;           /*  0 the array's physical layout             */
33057
 
+       __u32 chunk_size;       /*  1 chunk size in bytes                     */
33058
 
+       __u32 root_pv;          /*  2 LV root PV */
33059
 
+       __u32 root_block;       /*  3 LV root block */
33060
 
+       __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
33061
 
+
33062
 
+       /*
33063
 
+        * Disks information
33064
 
+        */
33065
 
+       mdp_disk_t disks[MD_SB_DISKS];
33066
 
+
33067
 
+       /*
33068
 
+        * Reserved
33069
 
+        */
33070
 
+       __u32 reserved[MD_SB_RESERVED_WORDS];
33071
 
+
33072
 
+       /*
33073
 
+        * Active descriptor
33074
 
+        */
33075
 
+       mdp_disk_t this_disk;
33076
 
+
33077
 
+}mdp_super_t;
33078
 
+
33079
 
+static inline __u64 md_event(mdp_super_t *sb) {
33080
 
+       __u64 ev = sb->events_hi;
33081
 
+       return (ev<<32)| sb->events_lo;
33082
 
+}
33083
 
+
33084
 
+#endif 
33085
 
+
33086
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_md_u.h evms-2002-03-28/include/linux/evms/evms_md_u.h
33087
 
--- linux-2002-03-28/include/linux/evms/evms_md_u.h     Wed Dec 31 18:00:00 1969
33088
 
+++ evms-2002-03-28/include/linux/evms/evms_md_u.h      Wed Mar  6 17:08:40 2002
33089
 
@@ -0,0 +1,68 @@
33090
 
+/*
33091
 
+ *   Copyright (c) International Business Machines  Corp., 2000
33092
 
+ *
33093
 
+ *   This program is free software;  you can redistribute it and/or modify
33094
 
+ *   it under the terms of the GNU General Public License as published by
33095
 
+ *   the Free Software Foundation; either version 2 of the License, or
33096
 
+ *   (at your option) any later version.
33097
 
+ *
33098
 
+ *   This program is distributed in the hope that it will be useful,
33099
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
33100
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
33101
 
+ *   the GNU General Public License for more details.
33102
 
+ *
33103
 
+ *   You should have received a copy of the GNU General Public License
33104
 
+ *   along with this program;  if not, write to the Free Software
33105
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33106
 
+ *
33107
 
+ *
33108
 
+ * linux/include/linux/evms/evms_md_h.c
33109
 
+ *
33110
 
+ * EVMS MD Region Manager, User <-> Kernel common file
33111
 
+ *
33112
 
+ */
33113
 
+
33114
 
+#ifndef _EVMS_MD_U_INC_
33115
 
+#define _EVMS_MD_U_INC_
33116
 
+
33117
 
+#define EVMS_MD_ID     4
33118
 
+#define MD_SET_PLUGIN_ID SetPluginID(IBM_OEM_ID,EVMS_REGION_MANAGER,EVMS_MD_ID)
33119
 
+
33120
 
+#define EVMS_MD_PERS_IOCTL_CMD         1       /* personality specific ioctl command */
33121
 
+#define EVMS_MD_ADD            2
33122
 
+#define EVMS_MD_REMOVE         3
33123
 
+#define EVMS_MD_ACTIVATE       4
33124
 
+#define EVMS_MD_DEACTIVATE     5
33125
 
+#define EVMS_MD_GET_ARRAY_INFO  6
33126
 
+
33127
 
+/* structure definition to use with MD_ADD, MD_REMOVE, MD_ACTIVATE */
33128
 
+typedef struct evms_md_kdev_s {
33129
 
+       u_int32_t major;                /* 1 Device major number */
33130
 
+       u_int32_t minor;                /* 2 Device minor number */
33131
 
+} evms_md_kdev_t;
33132
 
+
33133
 
+/* structure definition to use with MD_GET_ARRAY_INFO */
33134
 
+#define EVMS_MD_ARRAY_DEGRADED  (1<<0)
33135
 
+#define EVMS_MD_ARRAY_SYNCING   (1<<1)
33136
 
+typedef struct evms_md_array_info_s {
33137
 
+        unsigned long   state; /* degraded mode, syncing,...*/
33138
 
+        mdp_super_t     *sb;   /* array super block */
33139
 
+} evms_md_array_info_t;
33140
 
+
33141
 
+typedef struct evms_md_ioctl_s {
33142
 
+       int     mddev_idx;      /* same as __minor in mddev_s struct */
33143
 
+       int     cmd;            /* Command for personality */
33144
 
+       void    *arg;           /* Command specific ioctl command structure */
33145
 
+} evms_md_ioctl_t;
33146
 
+
33147
 
+/* Needed by mddev_s structure in evms_md_k.h */
33148
 
+typedef struct mdu_param_s
33149
 
+{
33150
 
+       int                     personality;    /* 1,2,3,4 */
33151
 
+       int                     chunk_size;     /* in bytes */
33152
 
+       int                     max_fault;      /* unused for now */
33153
 
+} mdu_param_t;
33154
 
+
33155
 
+
33156
 
+#endif
33157
 
+
33158
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_os2.h evms-2002-03-28/include/linux/evms/evms_os2.h
33159
 
--- linux-2002-03-28/include/linux/evms/evms_os2.h      Wed Dec 31 18:00:00 1969
33160
 
+++ evms-2002-03-28/include/linux/evms/evms_os2.h       Wed Mar 27 23:55:42 2002
33161
 
@@ -0,0 +1,407 @@
33162
 
+/*
33163
 
+ *
33164
 
+ *   Copyright (c) International Business Machines  Corp., 2000
33165
 
+ *
33166
 
+ *   This program is free software;  you can redistribute it and/or modify
33167
 
+ *   it under the terms of the GNU General Public License as published by
33168
 
+ *   the Free Software Foundation; either version 2 of the License, or
33169
 
+ *   (at your option) any later version.
33170
 
+ *
33171
 
+ *   This program is distributed in the hope that it will be useful,
33172
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
33173
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
33174
 
+ *   the GNU General Public License for more details.
33175
 
+ *
33176
 
+ *   You should have received a copy of the GNU General Public License
33177
 
+ *   along with this program;  if not, write to the Free Software
33178
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33179
 
+ *
33180
 
+ * Module: linux/include/linux/evms_os2.h
33181
 
+ */
33182
 
+
33183
 
+/*
33184
 
+ * Change History:
33185
 
+ *
33186
 
+ */
33187
 
+
33188
 
+/*
33189
 
+ * Description:  This module defines the disk structures used by the OS/2
33190
 
+ *               Logical Volume Manager, including that of the Master
33191
 
+ *               Boot Record (MBR) and Extended Boot Records (EBR).
33192
 
+ *
33193
 
+ * Notes: LVM Drive Letter Assignment Tables (DLA_Tables) appear on the
33194
 
+ *        last sector of each track containing a valid MBR or EBR.  Since
33195
 
+ *        partitions must be track aligned, any track containing an MBR or
33196
 
+ *        EBR will be almost all empty sectors.  We will grab the last
33197
 
+ *        of these empty sectors for our DLT_Tables.
33198
 
+ *
33199
 
+ */
33200
 
+
33201
 
+
33202
 
+#ifndef OS2LVM_INCLUDED__
33203
 
+#define OS2LVM_INCLUDED__
33204
 
+
33205
 
+/* The following define the values used to indicate that a partition table entry is for an EBR, not a partition. */
33206
 
+#define EBR_BOOT_INDICATOR     0
33207
 
+#define EBR_FORMAT_INDICATOR   5
33208
 
+
33209
 
+/* The following define is used as the default Format_Indicator for new non-primary partitions. */
33210
 
+#define NEW_LOGICAL_DRIVE_FORMAT_INDICATOR   0x6
33211
 
+
33212
 
+/* The following define is used as the default Format_Indicator for a new non-active primary partitions. */
33213
 
+#define NEW_PRIMARY_PARTITION_FORMAT_INDICATOR   0x16
33214
 
+
33215
 
+/* The following define is used as the default Format_Indicator for a new active primary partition. */
33216
 
+#define NEW_ACTIVE_PRIMARY_PARTITION_FORMAT_INDICATOR  0x06
33217
 
+
33218
 
+/* The following define is used to hold the value of the Boot_Indicator for active partitions. */
33219
 
+#define ACTIVE_PARTITION   0x80
33220
 
+
33221
 
+/* Define the size of a Partition Name.  Partition Names are user defined names given to a partition. */
33222
 
+#define PARTITION_NAME_SIZE  20
33223
 
+
33224
 
+/* Define the size of a volume name.  Volume Names are user defined names given to a volume. */
33225
 
+#define VOLUME_NAME_SIZE  20
33226
 
+
33227
 
+/* Define the size of a disk name.  Disk Names are user defined names given to physical disk drives in the system. */
33228
 
+#define DISK_NAME_SIZE    20
33229
 
+
33230
 
+/* The name of the filesystem in use on a partition.  This name may be up to 12 ( + NULL terminator) characters long. */
33231
 
+#define FILESYSTEM_NAME_SIZE 20
33232
 
+
33233
 
+/* The comment field is reserved but is not currently used.  This is for future expansion and use. */
33234
 
+#define COMMENT_SIZE 81
33235
 
+
33236
 
+
33237
 
+/* Define the minimum number of sectors to reserve on the disk for Boot Manager. */
33238
 
+#define BOOT_MANAGER_SIZE     2048
33239
 
+
33240
 
+#define OS2_BYTES_PER_SECTOR  512
33241
 
+#define OS2_SECTOR_SHIFT      9
33242
 
+
33243
 
+
33244
 
+/*--------------------------------------------------
33245
 
+ * Type definitions
33246
 
+ --------------------------------------------------*/
33247
 
+
33248
 
+/* The following definitions define the drive letter assignment table used by LVM.
33249
 
+   For each partition table on the disk, there will be a drive letter assignment table in the last sector
33250
 
+   of the track containing the partition table. */
33251
 
+
33252
 
+/* NOTE: DLA stands for Drive Letter Assignment. */
33253
 
+
33254
 
+#define DLA_TABLE_SIGNATURE1  0x424D5202L
33255
 
+#define DLA_TABLE_SIGNATURE2  0x44464D50L
33256
 
+
33257
 
+
33258
 
+typedef struct _DLA_Entry { /* DE */
33259
 
+        u_int32_t      Volume_Serial_Number;                 /* The serial number of the volume that this partition belongs to. */
33260
 
+        u_int32_t      Partition_Serial_Number;              /* The serial number of this partition. */
33261
 
+        u_int32_t      Partition_Size;                       /* The size of the partition, in sectors. */
33262
 
+        u_int32_t      Partition_Start;                      /* The starting sector of the partition. */
33263
 
+        unsigned char  On_Boot_Manager_Menu;                 /* Set to TRUE if this volume/partition is on the Boot Manager Menu. */
33264
 
+        unsigned char  Installable;                          /* Set to TRUE if this volume is the one to install the operating system on. */
33265
 
+        char           Drive_Letter;                         /* The drive letter assigned to the partition. */
33266
 
+        unsigned char  Reserved;
33267
 
+        char           Volume_Name[VOLUME_NAME_SIZE];        /* The name assigned to the volume by the user. */
33268
 
+        char           Partition_Name[PARTITION_NAME_SIZE];  /* The name assigned to the partition. */
33269
 
+} DLA_Entry;
33270
 
+
33271
 
+typedef struct _DLA_Table_Sector { /* DTS */
33272
 
+        u_int32_t  DLA_Signature1;             /* The magic signature (part 1) of a Drive Letter Assignment Table. */
33273
 
+        u_int32_t  DLA_Signature2;             /* The magic signature (part 2) of a Drive Letter Assignment Table. */
33274
 
+        u_int32_t  DLA_CRC;                    /* The 32 bit CRC for this sector.  Calculated assuming that this field and all unused space in the sector is 0. */
33275
 
+        u_int32_t  Disk_Serial_Number;         /* The serial number assigned to this disk. */
33276
 
+        u_int32_t  Boot_Disk_Serial_Number;    /* The serial number of the disk used to boot the system.  This is for conflict resolution when multiple volumes
33277
 
+                                                  want the same drive letter.  Since LVM.EXE will not let this situation happen, the only way to get this situation
33278
 
+                                                  is for the disk to have been altered by something other than LVM.EXE, or if a disk drive has been moved from one
33279
 
+                                                  machine to another.  If the drive has been moved, then it should have a different Boot_Disk_Serial_Number.  Thus,
33280
 
+                                                  we can tell which disk drive is the "foreign" drive and therefore reject its claim for the drive letter in question.
33281
 
+                                                  If we find that all of the claimaints have the same Boot_Disk_Serial_Number, then we must assign drive letters on
33282
 
+                                                  a first come, first serve basis.*/
33283
 
+        u_int32_t  Install_Flags;              /* Used by the Install program. */
33284
 
+        u_int32_t  Cylinders;
33285
 
+        u_int32_t  Heads_Per_Cylinder;
33286
 
+        u_int32_t  Sectors_Per_Track;
33287
 
+        char           Disk_Name[DISK_NAME_SIZE];  /* The name assigned to the disk containing this sector. */
33288
 
+        unsigned char  Reboot;                     /* For use by Install.  Used to keep track of reboots initiated by install. */
33289
 
+        unsigned char  Reserved[3];                /* Alignment. */
33290
 
+        DLA_Entry      DLA_Array[4];               /* These are the four entries which correspond to the entries in the partition table. */
33291
 
+} DLA_Table_Sector;
33292
 
+
33293
 
+
33294
 
+/* The following definitions define the LVM signature sector which will appear as the last sector in an LVM partition. */
33295
 
+
33296
 
+
33297
 
+#define  OS2LVM_PRIMARY_SIGNATURE   0x4A435332L
33298
 
+#define  OS2LVM_SECONDARY_SIGNATURE 0x4252444BL
33299
 
+
33300
 
+
33301
 
+#define  CURRENT_OS2LVM_MAJOR_VERSION_NUMBER   2        /* Define as appropriate. */
33302
 
+#define  CURRENT_OS2LVM_MINOR_VERSION_NUMBER   0        /* Define as appropriate. */
33303
 
+
33304
 
+
33305
 
+/* The following definitions limit the number of LVM features that can be applied to a volume, as well as defining a "NULL" feature for use in feature table entries that are not being used. */
33306
 
+#define  OS2LVM_MAX_FEATURES_PER_VOLUME  10     /* The maximum number of LVM features that can be applied to a volume. */
33307
 
+#define  OS2LVM_NULL_FEATURE              0     /* No feature.  Used in all unused entries of the feature array in the LVM Signature sector. */
33308
 
+
33309
 
+
33310
 
+/* The following structure is used to hold the location of the feature specific data for LVM features. */
33311
 
+typedef struct _LVM_Feature_Data { /* LFD */
33312
 
+        u_int32_t      Feature_ID;                            /* The ID of the feature. */
33313
 
+        u_int32_t      Location_Of_Primary_Feature_Data;      /* The u_int32_t of the starting sector of the private data for this feature. */
33314
 
+        u_int32_t      Location_Of_Secondary_Feature_Data;    /* The u_int32_t of the starting sector of the backup copy of the private data for this feature. */
33315
 
+        u_int32_t      Feature_Data_Size;                     /* The number of sectors used by this feature for its private data. */
33316
 
+        u_int16_t      Feature_Major_Version_Number;          /* The integer portion of the version number of this feature. */
33317
 
+        u_int16_t      Feature_Minor_Version_Number;          /* The decimal portion of the version number of this feature. */
33318
 
+        unsigned char  Feature_Active;                        /* TRUE if this feature is active on this partition/volume, FALSE otherwise. */
33319
 
+        unsigned char  Reserved[3];                           /* Alignment. */
33320
 
+} LVM_Feature_Data;
33321
 
+
33322
 
+
33323
 
+/* The following structure defines the LVM Signature Sector.  This is the last sector of every partition which is part of an LVM volume.  It gives vital
33324
 
+   information about the version of LVM used to create the LVM volume that it is a part of, as well as which LVM features (BBR, drive linking, etc.) are
33325
 
+   active on the volume that this partition is a part of.                                                                                                   */
33326
 
+typedef struct _LVM_Signature_Sector { /* LSS */
33327
 
+        u_int32_t         LVM_Signature1;                       /* The first part of the magic LVM signature. */
33328
 
+        u_int32_t         LVM_Signature2;                       /* The second part of the magic LVM signature. */
33329
 
+        u_int32_t         Signature_Sector_CRC;                 /* 32 bit CRC for this sector.  Calculated using 0 for this field. */
33330
 
+        u_int32_t         Partition_Serial_Number;              /* The LVM assigned serial number for this partition.  */
33331
 
+        u_int32_t         Partition_Start;                      /* u_int32_t of the first sector of this partition. */
33332
 
+        u_int32_t         Partition_End;                        /* u_int32_t of the last sector of this partition. */
33333
 
+        u_int32_t         Partition_Sector_Count;               /* The number of sectors in this partition. */
33334
 
+        u_int32_t         LVM_Reserved_Sector_Count;            /* The number of sectors reserved for use by LVM. */
33335
 
+        u_int32_t         Partition_Size_To_Report_To_User;     /* The size of the partition as the user sees it - i.e. (the actual size of the partition - LVM reserved sectors) rounded to a track boundary. */
33336
 
+        u_int32_t         Boot_Disk_Serial_Number;              /* The serial number of the boot disk for the system.  If the system contains Boot Manager, then this is the serial number of the disk containing the active copy of Boot Manager. */
33337
 
+        u_int32_t         Volume_Serial_Number;                 /* The serial number of the volume that this partition belongs to. */
33338
 
+        u_int32_t         Fake_EBR_Location;                    /* The location, on disk, of a Fake EBR, if one has been allocated. */
33339
 
+        u_int16_t         LVM_Major_Version_Number;             /* Major version number of the LVM that created this partition. */
33340
 
+        u_int16_t         LVM_Minor_Version_Number;             /* Minor version number of the LVM that created this partition. */
33341
 
+        char              Partition_Name[PARTITION_NAME_SIZE];  /* User defined partition name. */
33342
 
+        char              Volume_Name[VOLUME_NAME_SIZE];        /* The name of the volume that this partition belongs to. */
33343
 
+        LVM_Feature_Data  LVM_Feature_Array[OS2LVM_MAX_FEATURES_PER_VOLUME]; /* The feature array.  This indicates which LVM features, if any, are active on this volume
33344
 
+                                                                         and what order they should be applied in.                                                  */
33345
 
+        char              Drive_Letter;                         /* The drive letter assigned to the volume that this partition is part of. */
33346
 
+        unsigned char     Fake_EBR_Allocated;                   /* If TRUE, then a fake EBR has been allocated. */
33347
 
+        char              Comment[COMMENT_SIZE];                /* User comment. */
33348
 
+        char              Disk_Name[DISK_NAME_SIZE];            /* Added to allow BBR to report the name of a disk when bad sectors are encountered on that disk. */
33349
 
+        u_int32_t         Sequence_Number;                      /* This indicates the order that partitions within a volume are used.  This number is 1 based.  A 0 here indicates that the volume was made by LVM Ver. 1. */
33350
 
+        u_int32_t         Next_Aggregate_Number;                /* Used during volume creation and expansion when creating unique names for aggregates. */
33351
 
+        /* The remainder of the sector is reserved for future use and should be all zero or else the CRC will not come out correctly. */
33352
 
+} LVM_Signature_Sector;
33353
 
+
33354
 
+
33355
 
+/* The following definitions define the format of a partition table and the Master Boot Record (MBR). */
33356
 
+typedef struct _Partition_Record { /* PR */
33357
 
+        unsigned char  Boot_Indicator;    /* 80h = active partition. */
33358
 
+        unsigned char  Starting_Head;
33359
 
+        unsigned char  Starting_Sector;   /* Bits 0-5 are the sector.  Bits 6 and 7 are the high order bits of the starting cylinder. */
33360
 
+        unsigned char  Starting_Cylinder; /* The cylinder number is a 10 bit value.  The high order bits of the 10 bit value come from bits 6 & 7 of the Starting_Sector field. */
33361
 
+        unsigned char  Format_Indicator;  /* An indicator of the format/operation system on this partition. */
33362
 
+        unsigned char  Ending_Head;
33363
 
+        unsigned char  Ending_Sector;
33364
 
+        unsigned char  Ending_Cylinder;
33365
 
+        u_int32_t      Sector_Offset;     /* The number of sectors on the disk which are prior to the start of this partition. */
33366
 
+        u_int32_t      Sector_Count;      /* The number of sectors in this partition. */
33367
 
+} Partition_Record;
33368
 
+
33369
 
+typedef struct _Master_Boot_Record { /* MBR */
33370
 
+        unsigned char     Reserved[446];
33371
 
+        Partition_Record  Partition_Table[4];
33372
 
+        u_int16_t    Signature;            /* AA55h in this field indicates that this is a valid partition table/MBR. */
33373
 
+} Master_Boot_Record;
33374
 
+
33375
 
+typedef Master_Boot_Record  Extended_Boot_Record;
33376
 
+
33377
 
+/* The following definition covers the Boot Manager Alias Table in the EBR.
33378
 
+
33379
 
+   The Alias Table in the EBR has 2 entries in it, although only the first one is actually used.  */
33380
 
+#define ALIAS_NAME_SIZE  8
33381
 
+typedef struct _AliasTableEntry { /* ATE */
33382
 
+        unsigned char  On_Boot_Manager_Menu;
33383
 
+        char           Name[ALIAS_NAME_SIZE];
33384
 
+} AliasTableEntry;
33385
 
+
33386
 
+#define ALIAS_TABLE_OFFSET  0x18A
33387
 
+
33388
 
+/* XLATOFF */
33389
 
+/* The following text is used for the Boot Manager Alias for items that were placed on the Boot Manager Menu by FDISK and
33390
 
+   which have since been migrated to the new LVM format.  This text is put into the Name field of an AliasTableEntry so
33391
 
+   that, if FDISK ( or another program which understands the old Boot Manager Menu format) is run, it will display
33392
 
+   something for those partitions/volumes which are on the Boot Manager Menu.
33393
 
+
33394
 
+   NOTE: This text must be exactly ALIAS_NAME_SIZE characters in length!                                                     */
33395
 
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT       "--> LVM "
33396
 
+#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT2      "--> LVM*"
33397
 
+
33398
 
+/* XLATON */
33399
 
+
33400
 
+/* The following is the signature used for an Master Boot Record, an Extended Boot Record, and a Boot Sector. */
33401
 
+#define MBR_EBR_SIGNATURE  0xAA55
33402
 
+
33403
 
+
33404
 
+/* The following list of definitions defines the values of interest for the Format_Indicator in a Partition_Record. */
33405
 
+#define EBR_INDICATOR                          0x5
33406
 
+#define WINDOZE_EBR_INDICATOR                  0xF
33407
 
+#define UNUSED_INDICATOR                       0x0
33408
 
+#define IFS_INDICATOR                          0x7
33409
 
+#define FAT12_INDICATOR                        0x1
33410
 
+#define FAT16_SMALL_PARTITION_INDICATOR        0x4
33411
 
+#define FAT16_LARGE_PARTITION_INDICATOR        0x6
33412
 
+#define BOOT_MANAGER_HIDDEN_PARTITION_FLAG     0x10
33413
 
+#define LVM_PARTITION_INDICATOR                0x35
33414
 
+#define BOOT_MANAGER_INDICATOR                 0x0A
33415
 
+
33416
 
+
33417
 
+/* The following is the signature used in the Boot Sector for Boot Manager. */
33418
 
+#define OS2LVM_BOOT_MANAGER_SIGNATURE       "APJ&WN"
33419
 
+
33420
 
+
33421
 
+/* The following is used for determining the synthetic geometry reported for Volumes employing drive linking. */
33422
 
+#define OS2LVM_SYNTHETIC_SECTORS_PER_TRACK  63
33423
 
+
33424
 
+
33425
 
+/*--------------------------------------------------
33426
 
+ * Declares for Drive Linking feature:
33427
 
+ *--------------------------------------------------*/
33428
 
+
33429
 
+/* The following defines uniquely identify Drive Linking. */
33430
 
+#define DRIVE_LINKING_FEATURE_ID     100
33431
 
+#define DRIVE_LINKING_MAJOR_VERSION  1
33432
 
+#define DRIVE_LINKING_MINOR_VERSION  0
33433
 
+
33434
 
+/* The following definitions are used for the disk structures supporting drive linking. */
33435
 
+
33436
 
+#define LINK_TABLE_MASTER_SIGNATURE  0x434E4157L
33437
 
+#define LINK_TABLE_SIGNATURE         0X4D4D5652L
33438
 
+
33439
 
+#define MAXIMUM_LINKS   246
33440
 
+
33441
 
+#define DRIVE_LINKING_RESERVED_SECTOR_COUNT 4
33442
 
+
33443
 
+#define LINKS_IN_FIRST_SECTOR 60
33444
 
+
33445
 
+#define LINKS_IN_NEXT_SECTOR  62
33446
 
+
33447
 
+typedef struct _Drive_Link {
33448
 
+        u_int32_t   Drive_Serial_Number;
33449
 
+        u_int32_t   Partition_Serial_Number;
33450
 
+} Drive_Link;
33451
 
+
33452
 
+typedef struct _LVM_Link_Table_First_Sector {
33453
 
+        u_int32_t   Link_Table_Signature;  /* Use the LINK_TABLE_MASTER_SIGNATURE here. */
33454
 
+        u_int32_t   Link_Table_CRC;
33455
 
+        u_int32_t   Sequence_Number;       /* Used to resolve conflicts when the primary and secondary tables do not match. */
33456
 
+        u_int32_t   Links_In_Use;
33457
 
+        Drive_Link  Link_Table[LINKS_IN_FIRST_SECTOR];
33458
 
+} LVM_Link_Table_First_Sector;
33459
 
+
33460
 
+typedef struct _LVM_Link_Table_Sector {
33461
 
+        u_int32_t   Link_Table_Signature;  /* Use LINK_TABLE_SIGNATURE here. */
33462
 
+        u_int32_t   Link_Table_CRC;
33463
 
+        u_int32_t   Sequence_Number;       /* Used to resolve conflicts when the primary and secondary tables do not match. */
33464
 
+        Drive_Link  Link_Table[LINKS_IN_NEXT_SECTOR];
33465
 
+} LVM_Link_Table_Sector;
33466
 
+
33467
 
+
33468
 
+/*--------------------------------------------------
33469
 
+ * Declares for Bad Block Relocation feature:
33470
 
+ *--------------------------------------------------*/
33471
 
+
33472
 
+/* The following definition is the numeric ID for Bad Block Relocation.  */
33473
 
+#define BBR_FEATURE_ID  101
33474
 
+
33475
 
+#define BBR_FEATURE_MAJOR_VERSION       0x0001
33476
 
+#define BBR_FEATURE_MINOR_VERSION       0x0000
33477
 
+
33478
 
+/* The following definitions are used for the disk structures supporting bad block relocation. */
33479
 
+
33480
 
+/* NOTE: BBR stands for Bad Block Relocation. */
33481
 
+
33482
 
+#define BBR_TABLE_MASTER_SIGNATURE  0x00726D62
33483
 
+#define BBR_TABLE_SIGNATURE         0x01726276
33484
 
+
33485
 
+
33486
 
+typedef struct _BBR_Table_Entry {
33487
 
+        u_int32_t    BadSector;
33488
 
+        u_int32_t    ReplacementSector;
33489
 
+} BBR_Table_Entry;
33490
 
+
33491
 
+typedef struct _LVM_BBR_Table_First_Sector {
33492
 
+        u_int32_t    Signature;   /* Signature for the first sector of the BBR Table. Use BBR_TABLE_MASTER_SIGNATURE here.*/
33493
 
+        u_int32_t    CRC;/* CRC for this sector.*/
33494
 
+        u_int32_t    Sequence_Number;     /* Used to resolve conflicts when the primary and secondary tables do not match.*/
33495
 
+        u_int32_t    Table_Size;  /* The number of BBR_Table_Entries in the BBR Table.*/
33496
 
+        u_int32_t    Table_Entries_In_Use;/* The number of BBR Table entries which are in use.*/
33497
 
+        u_int32_t    Sectors_Per_Table;   /* The number of LVM_BBR_Table_Sectors used to hold the BBR Table.*/
33498
 
+        u_int32_t    First_Replacement_Sector;    /* The location of the first replacement sector.*/
33499
 
+        u_int32_t    Last_Replacement_Sector;     /* The location of the last replacement sector.*/
33500
 
+        u_int32_t    Replacement_Sector_Count;    /* The number of replacement sectors.*/
33501
 
+        u_int32_t    Flags;       /* Flags global to the Bad Block Relocation Feature.*/
33502
 
+} LVM_BBR_Table_First_Sector;
33503
 
+
33504
 
+/*  Flags for LVM_BBR_Table_First_Sector  */
33505
 
+#define BBR_Flag_Write_Verify    0x00000001/* Indicate convert Write I/O to Write/Verify*/
33506
 
+
33507
 
+#define BBR_TABLE_ENTRIES_PER_SECTOR   62
33508
 
+
33509
 
+typedef struct _LVM_BBR_Table_Sector {
33510
 
+        u_int32_t    Signature;/* Signature for a sector of the BBR_Table which is not the first sector of the BBR Table. Use BBR_TABLE_SIGNATURE here.*/
33511
 
+        u_int32_t    CRC;/* CRC for this sector of the BBR Table.*/
33512
 
+        u_int32_t    Sequence_Number;   /* Used to resolve conflicts when the primary and secondary tables do not match.*/
33513
 
+        BBR_Table_Entry  BBR_Table[BBR_TABLE_ENTRIES_PER_SECTOR];
33514
 
+        u_int32_t    reserved1;/* for block alignment*/
33515
 
+} LVM_BBR_Table_Sector;
33516
 
+
33517
 
+//
33518
 
+// Combined structure to hold entire BBR feature data as it exists on disk.
33519
 
+typedef struct _LVM_BBR_Feature
33520
 
+{
33521
 
+        LVM_BBR_Table_First_Sector  control;
33522
 
+        char                  reserved1[OS2_BYTES_PER_SECTOR - sizeof(LVM_BBR_Table_First_Sector)];
33523
 
+        LVM_BBR_Table_Sector  remap[1];
33524
 
+} LVM_BBR_Feature;
33525
 
+
33526
 
+/* The following defines establish the minimum and maximum number of replacement sectors which can be allocated for
33527
 
+   Bad Block Relocation.  Otherwise, 1 replacement sector per MB of disk space is allocated.                          */
33528
 
+#define BBR_FLOOR    62
33529
 
+#define BBR_LIMIT  4096
33530
 
+
33531
 
+
33532
 
+#ifdef __KERNEL__
33533
 
+// In-memory Meta Data for Bad Block Relocation
33534
 
+// In-memory Meta Data for Drive Linking
33535
 
+typedef struct os2_drivelink_runtime_entry_s {
33536
 
+        evms_sector_t                   start_sector;
33537
 
+        evms_sector_t                   sector_count;
33538
 
+        evms_sector_t                   Drive_Link_Data_Copy1;    /* LSN of first on-disk copy of drive linking data. */
33539
 
+        evms_sector_t                   Drive_Link_Data_Copy2;    /* LSN of the second on-disk copy of drive linking data. */
33540
 
+        char                           *link_data;
33541
 
+        u_int32_t                       Partition_Serial_Number;
33542
 
+        evms_sector_t                   BBR_Data_Copy1;           /* LSN of the first on-disk copy of the BBR data.*/
33543
 
+        evms_sector_t                   BBR_Data_Copy2;           /* LSN of the second on-disk copy of the BBR data.*/
33544
 
+        u_int32_t                       BBR_Feature_Size;         /* # of sectors of BBR data. */
33545
 
+        u_int32_t                       bbr_is_active;
33546
 
+        struct semaphore                BBR_Table_Lock;            /* Used to serialize writers */
33547
 
+        unsigned int                    Guard1;                    /* Lamport's Theorem for mutual exclusion */
33548
 
+        char                           *bbr_data;
33549
 
+        unsigned int                    Guard2;                    /* Lamport's Theorem for mutual exclusion */
33550
 
+        evms_logical_node_t            *link_partition;
33551
 
+        struct os2_drivelink_runtime_entry_s  *next;
33552
 
+} os2_drivelink_runtime_entry_t;
33553
 
+
33554
 
+// In-memory Meta Data for each OS/2 LVM Volume:
33555
 
+typedef struct os2_volume_runtime_entry_s {
33556
 
+        int                             complete;
33557
 
+        u_int32_t                       Export_Needed;
33558
 
+        evms_sector_t                   size_in_sectors;
33559
 
+        u_int32_t                       Volume_Serial_Number;
33560
 
+        u_int32_t                       drive_link_count;
33561
 
+        os2_drivelink_runtime_entry_t  *drive_link;
33562
 
+        evms_logical_node_t            *next_os2lvm_node;
33563
 
+} os2_volume_runtime_entry_t;
33564
 
+#endif
33565
 
+
33566
 
+
33567
 
+#endif
33568
 
+
33569
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid0.h evms-2002-03-28/include/linux/evms/evms_raid0.h
33570
 
--- linux-2002-03-28/include/linux/evms/evms_raid0.h    Wed Dec 31 18:00:00 1969
33571
 
+++ evms-2002-03-28/include/linux/evms/evms_raid0.h     Thu Jan  3 13:15:19 2002
33572
 
@@ -0,0 +1,33 @@
33573
 
+#ifndef _RAID0_H
33574
 
+#define _RAID0_H
33575
 
+
33576
 
+#include <linux/evms/evms_md.h>
33577
 
+
33578
 
+struct strip_zone
33579
 
+{
33580
 
+       unsigned long zone_offset;      /* Zone offset in md_dev */
33581
 
+       unsigned long dev_offset;       /* Zone offset in real dev */
33582
 
+       unsigned long size;             /* Zone size */
33583
 
+       int nb_dev;                     /* # of devices attached to the zone */
33584
 
+       mdk_rdev_t *dev[MD_SB_DISKS]; /* Devices attached to the zone */
33585
 
+};
33586
 
+
33587
 
+struct raid0_hash
33588
 
+{
33589
 
+       struct strip_zone *zone0, *zone1;
33590
 
+};
33591
 
+
33592
 
+struct raid0_private_data
33593
 
+{
33594
 
+       struct raid0_hash *hash_table; /* Dynamically allocated */
33595
 
+       struct strip_zone *strip_zone; /* This one too */
33596
 
+       int nr_strip_zones;
33597
 
+       struct strip_zone *smallest;
33598
 
+       int nr_zones;
33599
 
+};
33600
 
+
33601
 
+typedef struct raid0_private_data raid0_conf_t;
33602
 
+
33603
 
+#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
33604
 
+
33605
 
+#endif
33606
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid1.h evms-2002-03-28/include/linux/evms/evms_raid1.h
33607
 
--- linux-2002-03-28/include/linux/evms/evms_raid1.h    Wed Dec 31 18:00:00 1969
33608
 
+++ evms-2002-03-28/include/linux/evms/evms_raid1.h     Mon Mar 11 22:58:26 2002
33609
 
@@ -0,0 +1,104 @@
33610
 
+#ifndef _EVMS_RAID1_H
33611
 
+#define _EVMS_RAID1_H
33612
 
+
33613
 
+#include <linux/evms/evms_md.h>
33614
 
+
33615
 
+struct mirror_info {
33616
 
+       int             number;
33617
 
+       int             raid_disk;
33618
 
+       evms_logical_node_t *node;
33619
 
+       kdev_t          dev;
33620
 
+       int             sect_limit;
33621
 
+       int             head_position;
33622
 
+
33623
 
+       /*
33624
 
+        * State bits:
33625
 
+        */
33626
 
+       int             operational;
33627
 
+       int             write_only;
33628
 
+       int             spare;
33629
 
+
33630
 
+       int             used_slot;
33631
 
+};
33632
 
+
33633
 
+struct raid1_private_data {
33634
 
+       mddev_t                 *mddev;
33635
 
+       struct mirror_info      mirrors[MD_SB_DISKS];
33636
 
+       int                     nr_disks;
33637
 
+       int                     raid_disks;
33638
 
+       int                     working_disks;
33639
 
+       int                     last_used;
33640
 
+       unsigned long           next_sect;
33641
 
+       int                     sect_count;
33642
 
+       evms_thread_t           *thread, *resync_thread;
33643
 
+       int                     resync_mirrors;
33644
 
+       struct mirror_info      *spare;
33645
 
+       md_spinlock_t           device_lock;
33646
 
+
33647
 
+       /* buffer pool */
33648
 
+       /* buffer_heads that we have pre-allocated have b_pprev -> &freebh
33649
 
+        * and are linked into a stack using b_next
33650
 
+        * raid1_bh that are pre-allocated have R1BH_PreAlloc set.
33651
 
+        * All these variable are protected by device_lock
33652
 
+        */
33653
 
+       struct buffer_head      *freebh;
33654
 
+       int                     freebh_cnt;     /* how many are on the list */
33655
 
+       int                     freebh_blocked;
33656
 
+       struct raid1_bh         *freer1;
33657
 
+       int                     freer1_blocked;
33658
 
+       int                     freer1_cnt;
33659
 
+       struct raid1_bh         *freebuf;       /* each bh_req has a page allocated */
33660
 
+       md_wait_queue_head_t    wait_buffer;
33661
 
+
33662
 
+       /* for use when syncing mirrors: */
33663
 
+       unsigned long   start_active, start_ready,
33664
 
+               start_pending, start_future;
33665
 
+       int     cnt_done, cnt_active, cnt_ready,
33666
 
+               cnt_pending, cnt_future;
33667
 
+       int     phase;
33668
 
+       int     window;
33669
 
+       md_wait_queue_head_t    wait_done;
33670
 
+       md_wait_queue_head_t    wait_ready;
33671
 
+       md_spinlock_t           segment_lock;
33672
 
+};
33673
 
+
33674
 
+typedef struct raid1_private_data raid1_conf_t;
33675
 
+
33676
 
+/*
33677
 
+ * this is the only point in the RAID code where we violate
33678
 
+ * C type safety. mddev->private is an 'opaque' pointer.
33679
 
+ */
33680
 
+#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
33681
 
+
33682
 
+/*
33683
 
+ * this is our 'private' 'collective' RAID1 buffer head.
33684
 
+ * it contains information about what kind of IO operations were started
33685
 
+ * for this RAID1 operation, and about their status:
33686
 
+ */
33687
 
+
33688
 
+/* This structure is used to map a buffer head to a evms logical node */
33689
 
+typedef struct raid1_node_map_s {
33690
 
+       evms_logical_node_t     *node;
33691
 
+       struct buffer_head      *bh;
33692
 
+} raid1_node_map_t;
33693
 
+
33694
 
+struct raid1_bh {
33695
 
+       atomic_t                remaining; /* 'have we finished' count,
33696
 
+                                           * used from IRQ handlers
33697
 
+                                           */
33698
 
+       int                     cmd;
33699
 
+       unsigned long           state;
33700
 
+       mddev_t                 *mddev;
33701
 
+       struct buffer_head      *master_bh;
33702
 
+       struct buffer_head      *mirror_bh_list;
33703
 
+       raid1_node_map_t        mirror_node_map[MD_SB_DISKS];
33704
 
+       struct buffer_head      bh_req;
33705
 
+       evms_logical_node_t     *node;          /* map to evms node (READ only) */
33706
 
+       eio_t                   eio;
33707
 
+       struct raid1_bh         *next_r1;       /* next for retry or in free list */
33708
 
+};
33709
 
+/* bits for raid1_bh.state */
33710
 
+#define        R1BH_Uptodate   1
33711
 
+#define        R1BH_SyncPhase  2
33712
 
+#define        R1BH_PreAlloc   3       /* this was pre-allocated, add to free list */
33713
 
+#endif
33714
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_raid5.h evms-2002-03-28/include/linux/evms/evms_raid5.h
33715
 
--- linux-2002-03-28/include/linux/evms/evms_raid5.h    Wed Dec 31 18:00:00 1969
33716
 
+++ evms-2002-03-28/include/linux/evms/evms_raid5.h     Mon Mar 11 22:58:36 2002
33717
 
@@ -0,0 +1,251 @@
33718
 
+#ifndef _RAID5_H
33719
 
+#define _RAID5_H
33720
 
+
33721
 
+#include <linux/evms/evms_md.h>
33722
 
+#include <linux/evms/evms_xor.h>
33723
 
+
33724
 
+/*
33725
 
+ *
33726
 
+ * Each stripe contains one buffer per disc.  Each buffer can be in
33727
 
+ * one of a number of states determined by bh_state.  Changes between
33728
 
+ * these states happen *almost* exclusively under a per-stripe
33729
 
+ * spinlock.  Some very specific changes can happen in b_end_io, and
33730
 
+ * these are not protected by the spin lock.
33731
 
+ *
33732
 
+ * The bh_state bits that are used to represent these states are:
33733
 
+ *   BH_Uptodate, BH_Lock
33734
 
+ *
33735
 
+ * State Empty == !Uptodate, !Lock
33736
 
+ *        We have no data, and there is no active request
33737
 
+ * State Want == !Uptodate, Lock
33738
 
+ *        A read request is being submitted for this block
33739
 
+ * State Dirty == Uptodate, Lock
33740
 
+ *        Some new data is in this buffer, and it is being written out
33741
 
+ * State Clean == Uptodate, !Lock
33742
 
+ *        We have valid data which is the same as on disc
33743
 
+ *
33744
 
+ * The possible state transitions are:
33745
 
+ *
33746
 
+ *  Empty -> Want   - on read or write to get old data for  parity calc
33747
 
+ *  Empty -> Dirty  - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
33748
 
+ *  Empty -> Clean  - on compute_block when computing a block for failed drive
33749
 
+ *  Want  -> Empty  - on failed read
33750
 
+ *  Want  -> Clean  - on successful completion of read request
33751
 
+ *  Dirty -> Clean  - on successful completion of write request
33752
 
+ *  Dirty -> Clean  - on failed write
33753
 
+ *  Clean -> Dirty  - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
33754
 
+ *
33755
 
+ * The Want->Empty, Want->Clean, Dirty->Clean, transitions
33756
 
+ * all happen in b_end_io at interrupt time.
33757
 
+ * Each sets the Uptodate bit before releasing the Lock bit.
33758
 
+ * This leaves one multi-stage transition:
33759
 
+ *    Want->Dirty->Clean
33760
 
+ * This is safe because thinking that a Clean buffer is actually dirty
33761
 
+ * will at worst delay some action, and the stripe will be scheduled
33762
 
+ * for attention after the transition is complete.
33763
 
+ *
33764
 
+ * There is one possibility that is not covered by these states.  That
33765
 
+ * is if one drive has failed and there is a spare being rebuilt.  We
33766
 
+ * can't distinguish between a clean block that has been generated
33767
 
+ * from parity calculations, and a clean block that has been
33768
 
+ * successfully written to the spare ( or to parity when resyncing).
33769
 
+ * To distingush these states we have a stripe bit STRIPE_INSYNC that
33770
 
+ * is set whenever a write is scheduled to the spare, or to the parity
33771
 
+ * disc if there is no spare.  A sync request clears this bit, and
33772
 
+ * when we find it set with no buffers locked, we know the sync is
33773
 
+ * complete.
33774
 
+ *
33775
 
+ * Buffers for the md device that arrive via make_request are attached
33776
 
+ * to the appropriate stripe in one of two lists linked on b_reqnext.
33777
 
+ * One list (bh_read) for read requests, one (bh_write) for write.
33778
 
+ * There should never be more than one buffer on the two lists
33779
 
+ * together, but we are not guaranteed of that so we allow for more.
33780
 
+ *
33781
 
+ * If a buffer is on the read list when the associated cache buffer is
33782
 
+ * Uptodate, the data is copied into the read buffer and it's b_end_io
33783
 
+ * routine is called.  This may happen in the end_request routine only
33784
 
+ * if the buffer has just successfully been read.  end_request should
33785
 
+ * remove the buffers from the list and then set the Uptodate bit on
33786
 
+ * the buffer.  Other threads may do this only if they first check
33787
 
+ * that the Uptodate bit is set.  Once they have checked that they may
33788
 
+ * take buffers off the read queue.
33789
 
+ *
33790
 
+ * When a buffer on the write list is committed for write is it copied
33791
 
+ * into the cache buffer, which is then marked dirty, and moved onto a
33792
 
+ * third list, the written list (bh_written).  Once both the parity
33793
 
+ * block and the cached buffer are successfully written, any buffer on
33794
 
+ * a written list can be returned with b_end_io.
33795
 
+ *
33796
 
+ * The write list and read list both act as fifos.  The read list is
33797
 
+ * protected by the device_lock.  The write and written lists are
33798
 
+ * protected by the stripe lock.  The device_lock, which can be
33799
 
+ * claimed while the stipe lock is held, is only for list
33800
 
+ * manipulations and will only be held for a very short time.  It can
33801
 
+ * be claimed from interrupts.
33802
 
+ *
33803
 
+ *
33804
 
+ * Stripes in the stripe cache can be on one of two lists (or on
33805
 
+ * neither).  The "inactive_list" contains stripes which are not
33806
 
+ * currently being used for any request.  They can freely be reused
33807
 
+ * for another stripe.  The "handle_list" contains stripes that need
33808
 
+ * to be handled in some way.  Both of these are fifo queues.  Each
33809
 
+ * stripe is also (potentially) linked to a hash bucket in the hash
33810
 
+ * table so that it can be found by sector number.  Stripes that are
33811
 
+ * not hashed must be on the inactive_list, and will normally be at
33812
 
+ * the front.  All stripes start life this way.
33813
 
+ *
33814
 
+ * The inactive_list, handle_list and hash bucket lists are all protected by the
33815
 
+ * device_lock.
33816
 
+ *  - stripes on the inactive_list never have their stripe_lock held.
33817
 
+ *  - stripes have a reference counter. If count==0, they are on a list.
33818
 
+ *  - If a stripe might need handling, STRIPE_HANDLE is set.
33819
 
+ *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
33820
 
+ *    handle_list else inactive_list
33821
 
+ *
33822
 
+ * This, combined with the fact that STRIPE_HANDLE is only ever
33823
 
+ * cleared while a stripe has a non-zero count means that if the
33824
 
+ * refcount is 0 and STRIPE_HANDLE is set, then it is on the
33825
 
+ * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
33826
 
+ * the stripe is on inactive_list.
33827
 
+ *
33828
 
+ * The possible transitions are:
33829
 
+ *  activate an unhashed/inactive stripe (get_active_stripe())
33830
 
+ *     lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
33831
 
+ *  activate a hashed, possibly active stripe (get_active_stripe())
33832
 
+ *     lockdev check-hash if(!cnt++)unlink-stripe unlockdev
33833
 
+ *  attach a request to an active stripe (add_stripe_bh())
33834
 
+ *     lockdev attach-buffer unlockdev
33835
 
+ *  handle a stripe (handle_stripe())
33836
 
+ *     lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
33837
 
+ *  release an active stripe (release_stripe())
33838
 
+ *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
33839
 
+ *
33840
 
+ * The refcount counts each thread that have activated the stripe,
33841
 
+ * plus raid5d if it is handling it, plus one for each active request
33842
 
+ * on a cached buffer.
33843
 
+ */
33844
 
+struct stripe_head {
33845
 
+       struct stripe_head      *hash_next, **hash_pprev; /* hash pointers */
33846
 
+       struct list_head        lru;                    /* inactive_list or handle_list */
33847
 
+       struct raid5_private_data       *raid_conf;
33848
 
+       struct buffer_head      *bh_cache[MD_SB_DISKS]; /* buffered copy */
33849
 
+       struct buffer_head      *bh_read[MD_SB_DISKS];  /* read request buffers of the MD device */
33850
 
+       struct buffer_head      *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */
33851
 
+       struct buffer_head      *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
33852
 
+       struct page             *bh_page[MD_SB_DISKS];  /* saved bh_cache[n]->b_page when reading around the cache */
33853
 
+       evms_logical_node_t     *node[MD_SB_DISKS];     /* the target device node */
33854
 
+       unsigned long           sector;                 /* sector of this row */
33855
 
+       int                     size;                   /* buffers size */
33856
 
+       int                     pd_idx;                 /* parity disk index */
33857
 
+       unsigned long           state;                  /* state flags */
33858
 
+       atomic_t                count;                  /* nr of active thread/requests */
33859
 
+       spinlock_t              lock;
33860
 
+       int                     sync_redone;
33861
 
+};
33862
 
+
33863
 
+
33864
 
+/*
33865
 
+ * Write method
33866
 
+ */
33867
 
+#define RECONSTRUCT_WRITE      1
33868
 
+#define READ_MODIFY_WRITE      2
33869
 
+/* not a write method, but a compute_parity mode */
33870
 
+#define        CHECK_PARITY            3
33871
 
+
33872
 
+/*
33873
 
+ * Stripe state
33874
 
+ */
33875
 
+#define STRIPE_ERROR           1
33876
 
+#define STRIPE_HANDLE          2
33877
 
+#define        STRIPE_SYNCING          3
33878
 
+#define        STRIPE_INSYNC           4
33879
 
+#define        STRIPE_PREREAD_ACTIVE   5
33880
 
+#define        STRIPE_DELAYED          6
33881
 
+
33882
 
+/*
33883
 
+ * Plugging:
33884
 
+ *
33885
 
+ * To improve write throughput, we need to delay the handling of some
33886
 
+ * stripes until there has been a chance that several write requests
33887
 
+ * for the one stripe have all been collected.
33888
 
+ * In particular, any write request that would require pre-reading
33889
 
+ * is put on a "delayed" queue until there are no stripes currently
33890
 
+ * in a pre-read phase.  Further, if the "delayed" queue is empty when
33891
 
+ * a stripe is put on it then we "plug" the queue and do not process it
33892
 
+ * until an unplg call is made. (the tq_disk list is run).
33893
 
+ *
33894
 
+ * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
33895
 
+ * it to the count of prereading stripes.
33896
 
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we
33897
 
+ * clear the PREREAD_ACTIVE flag and decrement the count
33898
 
+ * Whenever the delayed queue is empty and the device is not plugged, we
33899
 
+ * move any strips from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
33900
 
+ * In stripe_handle, if we find pre-reading is necessary, we do it if
33901
 
+ * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
33902
 
+ * HANDLE gets cleared if stripe_handle leave nothing locked.
33903
 
+ */
33904
 
+
33905
 
+
33906
 
+struct disk_info {
33907
 
+       kdev_t  dev;
33908
 
+       evms_logical_node_t *node;
33909
 
+       int     operational;
33910
 
+       int     number;
33911
 
+       int     raid_disk;
33912
 
+       int     write_only;
33913
 
+       int     spare;
33914
 
+       int     used_slot;
33915
 
+};
33916
 
+
33917
 
+struct raid5_private_data {
33918
 
+       struct stripe_head      **stripe_hashtbl;
33919
 
+       mddev_t                 *mddev;
33920
 
+       evms_thread_t           *thread, *resync_thread;
33921
 
+       struct disk_info        disks[MD_SB_DISKS];
33922
 
+       struct disk_info        *spare;
33923
 
+       int                     buffer_size;
33924
 
+       int                     chunk_size, level, algorithm;
33925
 
+       int                     raid_disks, working_disks, failed_disks;
33926
 
+       int                     resync_parity;
33927
 
+       int                     max_nr_stripes;
33928
 
+
33929
 
+       struct list_head        handle_list; /* stripes needing handling */
33930
 
+       struct list_head        delayed_list; /* stripes that have plugged requests */
33931
 
+       atomic_t                preread_active_stripes; /* stripes with scheduled io */
33932
 
+       /*
33933
 
+        * Free stripes pool
33934
 
+        */
33935
 
+       atomic_t                active_stripes;
33936
 
+       struct list_head        inactive_list;
33937
 
+       md_wait_queue_head_t    wait_for_stripe;
33938
 
+       int                     inactive_blocked;       /* release of inactive stripes blocked,
33939
 
+                                                        * waiting for 25% to be free
33940
 
+                                                        */
33941
 
+       md_spinlock_t           device_lock;
33942
 
+
33943
 
+       int                     plugged;
33944
 
+       struct tq_struct        plug_tq;
33945
 
+};
33946
 
+
33947
 
+typedef struct raid5_private_data raid5_conf_t;
33948
 
+
33949
 
+#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
33950
 
+
33951
 
+/*
33952
 
+ * Our supported algorithms
33953
 
+ */
33954
 
+#define ALGORITHM_LEFT_ASYMMETRIC      0
33955
 
+#define ALGORITHM_RIGHT_ASYMMETRIC     1
33956
 
+#define ALGORITHM_LEFT_SYMMETRIC       2
33957
 
+#define ALGORITHM_RIGHT_SYMMETRIC      3
33958
 
+
33959
 
+
33960
 
+#define EVMS_MD_RAID5_INIT_IO          1
33961
 
+
33962
 
+typedef struct raid5_ioctl_init_io_s {
33963
 
+       int           rw;
33964
 
+       evms_sector_t lsn;
33965
 
+       evms_sector_t nr_sects;
33966
 
+       void          *data;
33967
 
+} raid5_ioctl_init_io_t;
33968
 
+#endif
33969
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_snapshot.h evms-2002-03-28/include/linux/evms/evms_snapshot.h
33970
 
--- linux-2002-03-28/include/linux/evms/evms_snapshot.h Wed Dec 31 18:00:00 1969
33971
 
+++ evms-2002-03-28/include/linux/evms/evms_snapshot.h  Thu Dec  6 18:42:08 2001
33972
 
@@ -0,0 +1,131 @@
33973
 
+/* -*- linux-c -*- */
33974
 
+/*
33975
 
+ *
33976
 
+ *   Copyright (c) International Business Machines  Corp., 2000
33977
 
+ *
33978
 
+ *   This program is free software;  you can redistribute it and/or modify
33979
 
+ *   it under the terms of the GNU General Public License as published by
33980
 
+ *   the Free Software Foundation; either version 2 of the License, or 
33981
 
+ *   (at your option) any later version.
33982
 
+ * 
33983
 
+ *   This program is distributed in the hope that it will be useful,
33984
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
33985
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
33986
 
+ *   the GNU General Public License for more details.
33987
 
+ *
33988
 
+ *   You should have received a copy of the GNU General Public License
33989
 
+ *   along with this program;  if not, write to the Free Software 
33990
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33991
 
+ */
33992
 
+/*
33993
 
+ * linux/include/linux/evms_snapshot.h
33994
 
+ *
33995
 
+ * EVMS Snapshot Feature kernel header file
33996
 
+ *
33997
 
+ */
33998
 
+
33999
 
+#ifndef __EVMS_SNAPSHOT_INCLUDED__
34000
 
+#define __EVMS_SNAPSHOT_INCLUDED__
34001
 
+
34002
 
+#define        EVMS_SNAPSHOT_VERSION_MAJOR             2
34003
 
+#define EVMS_SNAPSHOT_VERSION_MINOR            0
34004
 
+#define EVMS_SNAPSHOT_VERSION_PATCHLEVEL       0
34005
 
+
34006
 
+#define EVMS_SNAPSHOT_FEATURE_ID               104
34007
 
+
34008
 
+#define EVMS_SNAPSHOT_SIGNATURE        0x536e4170      // SnAp
34009
 
+#define MAX_HASH_CHAIN_ENTRIES 10
34010
 
+
34011
 
+#define EVMS_SNAPSHOT          0x01            // Status flags
34012
 
+#define EVMS_SNAPSHOT_ORG      0x02
34013
 
+#define EVMS_SNAPSHOT_DISABLED 0x04
34014
 
+#define EVMS_SNAPSHOT_FULL     0x08
34015
 
+#define EVMS_SNAPSHOT_QUIESCED 0x10
34016
 
+#define EVMS_SNAPSHOT_WRITEABLE 0x20
34017
 
+
34018
 
+                                               // option definitions
34019
 
+#define SNAP_OPTION_ORG_VOLUME_NAME     "original"   // original volume   
34020
 
+#define SNAP_OPTION_ORG_VOLUME_INDEX    0            // original volume   
34021
 
+#define SNAP_OPTION_SNAPSHOT_NAME       "snapshot"   // snapshot volume   
34022
 
+#define SNAP_OPTION_SNAPSHOT_INDEX      1            // snapshot volume   
34023
 
+#define SNAP_OPTION_CHUNKSIZE_NAME      "chunksize"  // chunksize   
34024
 
+#define SNAP_OPTION_CHUNKSIZE_INDEX     2            // chunksize  
34025
 
+#define SNAP_OPTION_WRITEABLE_NAME      "writeable"  // writeable snapshot
34026
 
+#define SNAP_OPTION_WRITEABLE_INDEX     3            // writeable snapshot
34027
 
+
34028
 
+#define SNAPSHOT_DEFAULT_CHUNK_SIZE 128             //sectors
34029
 
+#define SNAPSHOT_MIN_CHUNK_SIZE     16              // 8k
34030
 
+#define SNAPSHOT_MAX_CHUNK_SIZE     2048            // = 1Meg
34031
 
+#define SNAPSHOT_CHUNK_BUFFER_SIZE  128             // copy buffer
34032
 
+
34033
 
+#define SNAPSHOT_QUERY_PERCENT_FULL 1                // ioctl internal command to query percent full
34034
 
+
34035
 
+#define SECTOR_SIZE 512
34036
 
+
34037
 
+// description of on disk meta data sector for snapshot feature
34038
 
+typedef struct _snapshot_metadata {
34039
 
+/* 0*/ u_int32_t                       signature;
34040
 
+/* 4*/ u_int32_t                       CRC;
34041
 
+/* 8*/ evms_version_t                  version;                /* structure version */
34042
 
+/*12*/ u_int32_t                       flags;
34043
 
+/*16*/ char                            original_volume[128];
34044
 
+/*144*/        u_int64_t                       original_size;
34045
 
+/*152*/        u_int64_t                       lba_of_COW_table;
34046
 
+/*160*/        u_int64_t                       lba_of_first_chunk;
34047
 
+/*168*/        u_int32_t                       chunk_size;          // in sectors
34048
 
+/*172*/        u_int32_t                       total_chunks;        
34049
 
+} snapshot_metadata_t;
34050
 
+
34051
 
+
34052
 
+#ifdef __KERNEL__
34053
 
+
34054
 
+// Entries in the snapshot remapping structure
34055
 
+typedef struct _snapshot_hash_entry {
34056
 
+       unsigned long long              org_chunk;
34057
 
+       unsigned long long              snap_chunk;
34058
 
+       struct _snapshot_hash_entry     * next;
34059
 
+       struct _snapshot_hash_entry     * prev;
34060
 
+} snapshot_hash_entry_t;
34061
 
+
34062
 
+
34063
 
+typedef struct _snapshot_volume {
34064
 
+       evms_logical_node_t *   logical_node;           // node below us
34065
 
+       unsigned long           chunk_size;             // Sectors
34066
 
+       unsigned long           chunk_shift;            // shift value for chunk size
34067
 
+       unsigned long           num_chunks;             // in this volume
34068
 
+       unsigned long           next_cow_entry;         // Index into current COW table
34069
 
+       unsigned long long      current_cow_sector;     // LOGICAL sector of current COW table
34070
 
+       unsigned long           next_free_chunk;        // index of next free chunk (not LBA!)
34071
 
+       u_int64_t               cow_table[64];          // Pointer to one sector's worth of COW tables
34072
 
+       unsigned long           hash_table_size;        // size of the hash table for the remap
34073
 
+       unsigned long           flags;                  // status flags
34074
 
+       snapshot_hash_entry_t   ** snapshot_map;        // array of remapped chunks
34075
 
+       struct _snapshot_volume * snapshot_next;        // Linked list of volumes snapshotting this original
34076
 
+       struct _snapshot_volume * snapshot_org;         // Pointer to volume being snapshotted
34077
 
+       struct semaphore        snap_semaphore;         // Semaphore for locking of snapshots
34078
 
+       unsigned char           * chunk_data_buffer;    // Buffer for reading data when doing a copy-on-write
34079
 
+} snapshot_volume_t;
34080
 
+
34081
 
+#else
34082
 
+typedef struct _snapshot_volume {
34083
 
+       storage_object_t *      object;                 // our exported object
34084
 
+       storage_object_t *      child_object;           // our child object
34085
 
+       unsigned long           chunk_size;             // Sectors
34086
 
+       unsigned long           num_chunks;             // in this volume
34087
 
+       unsigned long           next_cow_entry;         // Index into current COW table
34088
 
+       unsigned long long      current_cow_sector;     // LOGICAL sector of current COW table
34089
 
+       unsigned long           next_free_chunk;        // index of next free chunk (not LBA!)
34090
 
+       u_int64_t               cow_table[64];          // Pointer to one sector's worth of COW tables
34091
 
+       unsigned long           hash_table_size;        // size of the hash table for the remap
34092
 
+       unsigned long           flags;                  // status flags
34093
 
+//     snapshot_hash_entry_t   ** snapshot_map;        // array of remapped chunks
34094
 
+       struct _snapshot_volume * snapshot_next;        // Linked list of volumes snapshotting this original
34095
 
+       struct _snapshot_volume * snapshot_org;         // Pointer to volume being snapshotted
34096
 
+//     struct semaphore        snap_semaphore;         // Semaphore for locking of snapshots
34097
 
+//     unsigned char           * chunk_data_buffer;    // Buffer for reading data when doing a copy-on-write
34098
 
+       snapshot_metadata_t     meta_data;              // copy of metadata if not original
34099
 
+} snapshot_volume_t;
34100
 
+
34101
 
+#endif
34102
 
+#endif
34103
 
+
34104
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_user.h evms-2002-03-28/include/linux/evms/evms_user.h
34105
 
--- linux-2002-03-28/include/linux/evms/evms_user.h     Wed Dec 31 18:00:00 1969
34106
 
+++ evms-2002-03-28/include/linux/evms/evms_user.h      Wed May 16 13:40:56 2001
34107
 
@@ -0,0 +1,28 @@
34108
 
+/* -*- linux-c -*- */
34109
 
+/*
34110
 
+ *
34111
 
+ *   Copyright (c) International Business Machines  Corp., 2000
34112
 
+ *
34113
 
+ *   This program is free software;  you can redistribute it and/or modify
34114
 
+ *   it under the terms of the GNU General Public License as published by
34115
 
+ *   the Free Software Foundation; either version 2 of the License, or 
34116
 
+ *   (at your option) any later version.
34117
 
+ * 
34118
 
+ *   This program is distributed in the hope that it will be useful,
34119
 
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
34120
 
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
34121
 
+ *   the GNU General Public License for more details.
34122
 
+ *
34123
 
+ *   You should have received a copy of the GNU General Public License
34124
 
+ *   along with this program;  if not, write to the Free Software 
34125
 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
34126
 
+ */
34127
 
+/*
34128
 
+ * linux/include/linux/evms_user.h
34129
 
+ *
34130
 
+ * EVMS (master) user header file
34131
 
+ *
34132
 
+ */
34133
 
+
34134
 
+#include <linux/evms/evms_common.h>
34135
 
+#include <linux/evms/evms_ioctl.h>
34136
 
diff -Naur linux-2002-03-28/include/linux/evms/evms_xor.h evms-2002-03-28/include/linux/evms/evms_xor.h
34137
 
--- linux-2002-03-28/include/linux/evms/evms_xor.h      Wed Dec 31 18:00:00 1969
34138
 
+++ evms-2002-03-28/include/linux/evms/evms_xor.h       Mon Feb  4 09:58:43 2002
34139
 
@@ -0,0 +1,23 @@
34140
 
+#ifndef _XOR_H
34141
 
+#define _XOR_H
34142
 
+
34143
 
+#include <linux/evms/evms_md.h>
34144
 
+
34145
 
+#define MAX_XOR_BLOCKS 5
34146
 
+
34147
 
+extern void evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr);
34148
 
+
34149
 
+struct xor_block_template {
34150
 
+        struct xor_block_template *next;
34151
 
+        const char *name;
34152
 
+        int speed;
34153
 
+       void (*do_2)(unsigned long, unsigned long *, unsigned long *);
34154
 
+       void (*do_3)(unsigned long, unsigned long *, unsigned long *,
34155
 
+                    unsigned long *);
34156
 
+       void (*do_4)(unsigned long, unsigned long *, unsigned long *,
34157
 
+                    unsigned long *, unsigned long *);
34158
 
+       void (*do_5)(unsigned long, unsigned long *, unsigned long *,
34159
 
+                    unsigned long *, unsigned long *, unsigned long *);
34160
 
+};
34161
 
+
34162
 
+#endif