/*
 * VMware Disk format implementation.
 *
 * (c) 2006 Andrew Warfield and Julian Chesterfield
 *
 * This is largely the same as the vmdk driver in Qemu, I've just twisted it
 * to match our interfaces. The original (BSDish) Copyright message appears
 * below:
 *
 * Block driver for the VMDK format
 *
 * Copyright (c) 2004 Fabrice Bellard
 * Copyright (c) 2005 Filip Navara
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
42
#include <sys/statvfs.h>
44
#include <sys/ioctl.h>
49
/* *BSD has no O_LARGEFILE */
54
/*
 * safer_free: free-and-clear helper macro.  NOTE(review): this chunk is a
 * corrupted extraction -- stray line numbers are interleaved with the code
 * and the macro body after the continuation backslash is missing; code is
 * left byte-identical.
 *
 * VMDK3_MAGIC / VMDK4_MAGIC: the 4-byte on-disk magics "COWD" (VMDK3 /
 * COWDisk) and "KDMV" (VMDK4), composed big-endian to match the
 * be32_to_cpu() conversion applied to the magic read in tdvmdk_open().
 */
#define safer_free(_x) \
62
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
63
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
68
/*
 * On-disk header fields (multi-byte values are converted with
 * le32_to_cpu/le64_to_cpu in tdvmdk_open, i.e. little-endian on disk;
 * the closing declarator shows the struct is packed).
 *
 * NOTE(review): this span is a corrupted extraction.  The field list
 * appears to merge what were originally TWO header structs -- VMDK3-style
 * fields (disk_sectors, l1dir_offset, file_sectors, sectors_per_track)
 * followed by a VMDK4 field (num_gtes_per_gte) and the VMDK4Header
 * closing line.  The struct openings and several members are missing;
 * code left byte-identical.
 */
uint32_t disk_sectors;
70
uint32_t l1dir_offset;
72
uint32_t file_sectors;
75
uint32_t sectors_per_track;
85
int32_t num_gtes_per_gte;
91
} __attribute__((packed)) VMDK4Header;
93
/* Number of L2 tables kept in the in-memory cache (see get_cluster_offset). */
#define L2_CACHE_SIZE 16
97
/*
 * NOTE(review): the fields below belong to struct tdvmdk_state (per the
 * casts of dd->private elsewhere in this file), but the struct's opening
 * and closing lines and several members (fd, l1_size, l1_table, l2_cache)
 * are missing from this corrupted extraction; code left byte-identical.
 */
int poll_pipe[2]; /* dummy fd for polling on */
100
/* Byte offsets of the primary and backup L1 (grain directory) tables. */
int64_t l1_table_offset;
101
int64_t l1_backup_table_offset;
102
/* Sectors of virtual disk covered by one L1 entry (l2_size * cluster_sectors). */
uint32_t l1_entry_sectors;
103
unsigned int l2_size;
106
uint32_t *l1_backup_table;
108
/* Per-slot tag (L2 table file offset) and LFU-style hit counter for the cache. */
uint32_t l2_cache_offsets[L2_CACHE_SIZE];
109
uint32_t l2_cache_counts[L2_CACHE_SIZE];
111
unsigned int cluster_sectors;
114
/*
 * init_fds: hand tapdisk a set of pollable fds.  Loops over the MAX_IOFD
 * entries of dd->io_fd (loop body not visible in this fragment -- presumably
 * zeroing them; confirm against upstream) and installs the read end of the
 * driver's dummy pipe as io_fd[0] so callers have an fd that never fires.
 * NOTE(review): corrupted extraction -- stray line numbers interleave the
 * code and the braces/declarations are missing; code left byte-identical.
 */
static inline void init_fds(struct disk_driver *dd)
117
struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
119
for (i = 0; i < MAX_IOFD; i++)
122
dd->io_fd[0] = prv->poll_pipe[0];
125
/*
 * tdvmdk_open: open a VMDK image file and populate the driver state.
 *
 * Visible behaviour in this fragment:
 *  - creates a dummy pipe so tapdisk gets a poll fd that never fires;
 *  - opens the image with O_DIRECT|O_LARGEFILE (read-only iff TD_RDONLY),
 *    retrying without O_DIRECT when open() fails with EINVAL;
 *  - reads the 4-byte magic (big-endian) and dispatches:
 *      VMDK3: fixed l2_size (1<<9) / l1_size (1<<6), size and L1 offset
 *             taken from 32-bit header fields;
 *      VMDK4: geometry from capacity/granularity/num_gtes_per_gte, L1
 *             table at rgd_offset and a backup L1 at gd_offset (64-bit);
 *  - loads the L1 table (and backup L1 if present), byte-swapping each
 *    entry with le32_to_cpus, then allocates the L2 cache.
 *
 * NOTE(review): corrupted extraction -- stray line numbers interleave the
 * code and many original lines (braces, goto fail paths, variable
 * declarations for ret/fd/magic/header) are missing.  Code below is left
 * byte-identical; the trailing DPRINTF/safer_free lines are the failure
 * cleanup path.
 */
/* Open the disk file and initialize aio state. */
126
static int tdvmdk_open (struct disk_driver *dd,
127
const char *name, td_flag_t flags)
130
int l1_size, i, o_flags;
132
struct td_state *s = dd->td_state;
133
struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
135
/* set up a pipe so that we can hand back a poll fd that won't fire.*/
136
ret = pipe(prv->poll_pipe);
141
o_flags = O_DIRECT | O_LARGEFILE |
142
((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
143
fd = open(name, o_flags);
145
if ( (fd == -1) && (errno == EINVAL) ) {
147
/* Maybe O_DIRECT isn't supported. */
148
o_flags &= ~O_DIRECT;
149
fd = open(name, o_flags);
150
if (fd != -1) DPRINTF("WARNING: Accessing image without"
151
"O_DIRECT! (%s)\n", name);
153
} else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name);
156
DPRINTF("Unable to open [%s]!\n",name);
163
/* Grok the vmdk header. */
164
if ((ret = read(fd, &magic, sizeof(magic))) != sizeof(magic))
166
magic = be32_to_cpu(magic);
167
if (magic == VMDK3_MAGIC) {
169
if (read(fd, &header, sizeof(header)) !=
172
prv->cluster_sectors = le32_to_cpu(header.granularity);
173
prv->l2_size = 1 << 9;
174
prv->l1_size = 1 << 6;
175
s->size = le32_to_cpu(header.disk_sectors);
176
prv->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
177
prv->l1_backup_table_offset = 0;
178
prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
179
} else if (magic == VMDK4_MAGIC) {
182
if (read(fd, &header, sizeof(header)) != sizeof(header))
184
s->size = le32_to_cpu(header.capacity);
185
prv->cluster_sectors = le32_to_cpu(header.granularity);
186
prv->l2_size = le32_to_cpu(header.num_gtes_per_gte);
187
prv->l1_entry_sectors = prv->l2_size * prv->cluster_sectors;
188
if (prv->l1_entry_sectors <= 0)
190
prv->l1_size = (s->size + prv->l1_entry_sectors - 1)
191
/ prv->l1_entry_sectors;
192
prv->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
193
prv->l1_backup_table_offset =
194
le64_to_cpu(header.gd_offset) << 9;
198
/* read the L1 table */
199
l1_size = prv->l1_size * sizeof(uint32_t);
200
prv->l1_table = malloc(l1_size);
203
if (lseek(fd, prv->l1_table_offset, SEEK_SET) == -1)
205
if (read(fd, prv->l1_table, l1_size) != l1_size)
207
for (i = 0; i < prv->l1_size; i++) {
208
le32_to_cpus(&prv->l1_table[i]);
211
if (prv->l1_backup_table_offset) {
212
prv->l1_backup_table = malloc(l1_size);
213
if (!prv->l1_backup_table)
215
if (lseek(fd, prv->l1_backup_table_offset, SEEK_SET) == -1)
217
if (read(fd, prv->l1_backup_table, l1_size) != l1_size)
219
for(i = 0; i < prv->l1_size; i++) {
220
le32_to_cpus(&prv->l1_backup_table[i]);
224
prv->l2_cache = malloc(prv->l2_size * L2_CACHE_SIZE *sizeof(uint32_t));
229
DPRINTF("VMDK File opened successfully\n");
233
DPRINTF("VMDK File open failed.\n");
234
safer_free(prv->l1_backup_table);
241
/*
 * get_cluster_offset: translate a virtual byte offset into a byte offset
 * within the image file via the two-level L1/L2 (grain directory/table)
 * lookup.
 *
 * Visible behaviour in this fragment:
 *  - L1 index = sector / l1_entry_sectors; out-of-range indices bail out
 *    (early-return line missing from this extraction);
 *  - the L2 table for that L1 entry is looked up in a small cache keyed
 *    by file offset, with saturating hit counters halved when one would
 *    wrap (LFU-style aging); on miss, the least-used slot is reloaded
 *    from disk;
 *  - a zero L2 entry means "unallocated".  When allocating (the guard on
 *    `allocate` is among the missing lines -- confirm against upstream),
 *    the file is grown by one cluster with ftruncate at EOF and the new
 *    sector number is written back (little-endian) to the on-disk L2
 *    table, and to the backup L2 table when one exists;
 *  - the sector-granular offset is shifted by 9 to return bytes.
 *
 * Returns the byte offset of the cluster, or 0 for unallocated/failed
 * lookups (per the !cluster_offset checks in the callers).
 *
 * NOTE(review): corrupted extraction -- stray line numbers interleave the
 * code and declarations (i, j, min_index), braces and early returns are
 * missing; code left byte-identical.
 */
static uint64_t get_cluster_offset(struct tdvmdk_state *prv,
242
uint64_t offset, int allocate)
244
unsigned int l1_index, l2_offset, l2_index;
246
uint32_t min_count, *l2_table, tmp;
247
uint64_t cluster_offset;
249
l1_index = (offset >> 9) / prv->l1_entry_sectors;
250
if (l1_index >= prv->l1_size)
252
l2_offset = prv->l1_table[l1_index];
255
for (i = 0; i < L2_CACHE_SIZE; i++) {
256
if (l2_offset == prv->l2_cache_offsets[i]) {
257
/* increment the hit count */
258
if (++prv->l2_cache_counts[i] == 0xffffffff) {
259
for(j = 0; j < L2_CACHE_SIZE; j++) {
260
prv->l2_cache_counts[j] >>= 1;
263
l2_table = prv->l2_cache + (i * prv->l2_size);
267
/* not found: load a new entry in the least used one */
269
min_count = 0xffffffff;
270
for (i = 0; i < L2_CACHE_SIZE; i++) {
271
if (prv->l2_cache_counts[i] < min_count) {
272
min_count = prv->l2_cache_counts[i];
276
l2_table = prv->l2_cache + (min_index * prv->l2_size);
277
lseek(prv->fd, (int64_t)l2_offset * 512, SEEK_SET);
278
if (read(prv->fd, l2_table, prv->l2_size * sizeof(uint32_t)) !=
279
prv->l2_size * sizeof(uint32_t))
281
prv->l2_cache_offsets[min_index] = l2_offset;
282
prv->l2_cache_counts[min_index] = 1;
284
l2_index = ((offset >> 9) / prv->cluster_sectors) % prv->l2_size;
285
cluster_offset = le32_to_cpu(l2_table[l2_index]);
286
if (!cluster_offset) {
289
cluster_offset = lseek(prv->fd, 0, SEEK_END);
290
if (ftruncate(prv->fd, cluster_offset +
291
(prv->cluster_sectors << 9)))
293
cluster_offset >>= 9;
294
/* update L2 table */
295
tmp = cpu_to_le32(cluster_offset);
296
l2_table[l2_index] = tmp;
297
lseek(prv->fd, ((int64_t)l2_offset * 512) +
298
(l2_index * sizeof(tmp)), SEEK_SET);
299
if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp))
301
/* update backup L2 table */
302
if (prv->l1_backup_table_offset != 0) {
303
l2_offset = prv->l1_backup_table[l1_index];
304
lseek(prv->fd, ((int64_t)l2_offset * 512) +
305
(l2_index * sizeof(tmp)), SEEK_SET);
306
if (write(prv->fd, &tmp, sizeof(tmp)) != sizeof(tmp))
310
cluster_offset <<= 9;
311
return cluster_offset;
314
/*
 * tdvmdk_queue_read: synchronous read of nb_sectors starting at `sector`.
 * Walks the request cluster by cluster: unallocated clusters
 * (get_cluster_offset(..., 0) == 0) are zero-filled into the caller's
 * buffer; allocated clusters are read with lseek+read.  Completion is
 * reported immediately through the callback: -1 when the last read
 * returned -1, 0 otherwise.
 *
 * NOTE(review): corrupted extraction -- stray line numbers interleave the
 * code; braces, the per-iteration advance of sector/buf/nb_sectors and
 * the declaration of `ret` are among the missing lines.  Code left
 * byte-identical.
 */
static int tdvmdk_queue_read(struct disk_driver *dd, uint64_t sector,
315
int nb_sectors, char *buf, td_callback_t cb,
316
int id, void *private)
318
struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
319
int index_in_cluster, n;
320
uint64_t cluster_offset;
323
while (nb_sectors > 0) {
324
cluster_offset = get_cluster_offset(prv, sector << 9, 0);
325
/* NOTE(review): read path uses `%` here while the write path uses
 * `& (cluster_sectors - 1)`; equivalent only if cluster_sectors is a
 * power of two -- worth confirming upstream. */
index_in_cluster = sector % prv->cluster_sectors;
326
n = prv->cluster_sectors - index_in_cluster;
329
if (!cluster_offset) {
330
memset(buf, 0, 512 * n);
332
lseek(prv->fd, cluster_offset + index_in_cluster * 512,
334
ret = read(prv->fd, buf, n * 512);
335
if (ret != n * 512) {
345
return cb(dd, ret == -1 ? -1 : 0, sector, nb_sectors, id, private);
348
/*
 * tdvmdk_queue_write: synchronous write of nb_sectors starting at
 * `sector`.  Walks the request cluster by cluster, allocating clusters
 * on demand via get_cluster_offset(..., 1); a zero cluster offset (the
 * allocation failed) is handled by the missing branch after the
 * !cluster_offset test.  Data is written with lseek+write and completion
 * is reported immediately through the callback: -1 when the last write
 * returned -1, 0 otherwise.
 *
 * NOTE(review): corrupted extraction -- stray line numbers interleave the
 * code; braces, the per-iteration advance of sector/buf/nb_sectors and
 * the declaration of `ret` are among the missing lines.  Code left
 * byte-identical.
 */
static int tdvmdk_queue_write(struct disk_driver *dd, uint64_t sector,
349
int nb_sectors, char *buf, td_callback_t cb,
350
int id, void *private)
352
struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
353
int index_in_cluster, n;
354
uint64_t cluster_offset;
357
while (nb_sectors > 0) {
358
/* Assumes cluster_sectors is a power of two (mask instead of `%`). */
index_in_cluster = sector & (prv->cluster_sectors - 1);
359
n = prv->cluster_sectors - index_in_cluster;
362
cluster_offset = get_cluster_offset(prv, sector << 9, 1);
363
if (!cluster_offset) {
367
lseek(prv->fd, cluster_offset + index_in_cluster * 512,
369
ret = write(prv->fd, buf, n * 512);
370
if (ret != n * 512) {
379
return cb(dd, ret == -1 ? -1 : 0, sector, nb_sectors, id, private);
382
/* tdvmdk_submit: td_submit hook.  Body not visible in this corrupted
 * extraction; presumably a no-op since reads/writes above complete
 * synchronously -- confirm against upstream. */
static int tdvmdk_submit(struct disk_driver *dd)
387
/*
 * tdvmdk_close: release all driver-held resources -- the L1 table, the
 * backup L1 table and the L2 cache (via safer_free) and both ends of the
 * dummy poll pipe.  NOTE(review): corrupted extraction -- braces, the
 * close of prv->fd (if any) and the return are among the missing lines;
 * code left byte-identical.
 */
static int tdvmdk_close(struct disk_driver *dd)
389
struct tdvmdk_state *prv = (struct tdvmdk_state *)dd->private;
391
safer_free(prv->l1_table);
392
safer_free(prv->l1_backup_table);
393
safer_free(prv->l2_cache);
395
close(prv->poll_pipe[0]);
396
close(prv->poll_pipe[1]);
400
/* tdvmdk_do_callbacks: td_do_callbacks hook.  Body not visible in this
 * corrupted extraction beyond the "always ask for a kick" comment --
 * confirm the return value against upstream. */
static int tdvmdk_do_callbacks(struct disk_driver *dd, int sid)
402
/* always ask for a kick */
406
/* tdvmdk_get_parent_id: td_get_parent_id hook.  Body not visible in this
 * corrupted extraction; VMDK chains are not handled elsewhere in this
 * fragment, so presumably reports "no parent" -- confirm upstream. */
static int tdvmdk_get_parent_id(struct disk_driver *dd, struct disk_id *id)
411
/* tdvmdk_validate_parent: td_validate_parent hook.  Body not visible in
 * this corrupted extraction -- confirm behaviour against upstream. */
static int tdvmdk_validate_parent(struct disk_driver *dd,
412
struct disk_driver *parent, td_flag_t flags)
417
/*
 * tapdisk_vmdk: the tap_disk dispatch table exported to the blktap core,
 * wiring every td_* hook to the tdvmdk_* implementations above.
 * NOTE(review): the closing "};" falls outside this corrupted extraction;
 * visible initializers left byte-identical.
 */
struct tap_disk tapdisk_vmdk = {
418
.disk_type = "tapdisk_vmdk",
419
.private_data_size = sizeof(struct tdvmdk_state),
420
.td_open = tdvmdk_open,
421
.td_queue_read = tdvmdk_queue_read,
422
.td_queue_write = tdvmdk_queue_write,
423
.td_submit = tdvmdk_submit,
424
.td_close = tdvmdk_close,
425
.td_do_callbacks = tdvmdk_do_callbacks,
426
.td_get_parent_id = tdvmdk_get_parent_id,
427
.td_validate_parent = tdvmdk_validate_parent