 * Asynchronous Qemu copy-on-write disk implementation.
 *
 * Code based on the Qemu implementation
 * (see copyright notice below)
 *
 * (c) 2006 Andrew Warfield and Julian Chesterfield
 *
 * Block driver for the QCOW format
 *
 * Copyright (c) 2004 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
29
#include <sys/statvfs.h>
31
#include <sys/ioctl.h>
36
#include <openssl/md5.h>
42
#include "tapdisk-driver.h"
43
#include "tapdisk-interface.h"
48
/* *BSD has no O_LARGEFILE */
55
if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
56
__LINE__, __FILE__); *(int*)0=0; }
58
#define ASSERT(_p) ((void)0)
71
/* Index of AIO request _io within state _s's iocb_list array (pointer difference). */
#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)
73
/*
 * Identity on the byte value (OR with zero); forces evaluation of _b.
 * Fix: parenthesize the macro argument so low-precedence expressions
 * (e.g. a ternary) passed as _b cannot rebind against the `|` operator.
 */
#define ZERO_TEST(_b) ((_b) | 0x00)
78
struct tdqcow_state *state;
81
/* Forward declaration: inflates the compressed cluster at cluster_offset
 * into s->cluster_cache; returns < 0 on failure (defined later in this file). */
static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
87
uint32_t gen_cksum(char *ptr, int len)
92
/* Generate checksum */
93
gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
98
#else /* use libcrypto */
100
#include <openssl/md5.h>
102
uint32_t gen_cksum(char *ptr, int len)
108
md = malloc(MD5_DIGEST_LENGTH);
111
/* Generate checksum */
112
if (MD5((unsigned char *)ptr, len, md) != md)
115
memcpy(&ret, md, sizeof(uint32_t));
124
static void free_aio_state(struct tdqcow_state* s)
126
free(s->aio_requests);
127
free(s->aio_free_list);
130
static int init_aio_state(td_driver_t *driver)
133
td_disk_info_t *bs = &(driver->info);
134
struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
136
// A segment (i.e. a page) can span multiple clusters
137
s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
138
MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;
140
s->aio_free_count = s->max_aio_reqs;
142
if (!(s->aio_requests = calloc(s->max_aio_reqs, sizeof(struct qcow_request))) ||
143
!(s->aio_free_list = calloc(s->max_aio_reqs, sizeof(struct qcow_request)))) {
144
DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
149
for (i = 0; i < s->max_aio_reqs; i++)
150
s->aio_free_list[i] = &s->aio_requests[i];
152
DPRINTF("AIO state initialised\n");
159
int get_filesize(char *filename, uint64_t *size, struct stat *st)
164
/*Set to the backing file size*/
165
fd = open(filename, O_RDONLY);
168
if (read(fd, &header, sizeof(header)) < sizeof(header)) {
174
be32_to_cpus(&header.magic);
175
be64_to_cpus(&header.size);
176
if (header.magic == QCOW_MAGIC) {
177
*size = header.size >> SECTOR_SHIFT;
181
if(S_ISBLK(st->st_mode)) {
182
fd = open(filename, O_RDONLY);
185
if (blk_getimagesize(fd, size) != 0) {
186
printf("Unable to get Block device size\n");
191
} else *size = (st->st_size >> SECTOR_SHIFT);
195
static int qcow_set_key(struct tdqcow_state *s, const char *key)
200
memset(keybuf, 0, 16);
204
/* XXX: we could compress the chars to 7 bits to increase
206
for (i = 0; i < len; i++) {
209
s->crypt_method = s->crypt_method_header;
211
if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
213
if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
223
AES_encrypt(in, tmp, &s->aes_encrypt_key);
224
AES_decrypt(tmp, out, &s->aes_decrypt_key);
225
for (i = 0; i < 16; i++)
226
DPRINTF(" %02x", tmp[i]);
228
for (i = 0; i < 16; i++)
229
DPRINTF(" %02x", out[i]);
236
void tdqcow_complete(void *arg, struct tiocb *tiocb, int err)
238
struct qcow_request *aio = (struct qcow_request *)arg;
239
struct tdqcow_state *s = aio->state;
241
td_complete_request(aio->treq, err);
243
s->aio_free_list[s->aio_free_count++] = aio;
246
static void async_read(td_driver_t *driver, td_request_t treq)
250
struct qcow_request *aio;
251
struct tdqcow_state *prv;
253
prv = (struct tdqcow_state *)driver->data;
254
size = treq.secs * driver->info.sector_size;
255
offset = treq.sec * (uint64_t)driver->info.sector_size;
257
if (prv->aio_free_count == 0)
260
aio = prv->aio_free_list[--prv->aio_free_count];
264
td_prep_read(&aio->tiocb, prv->fd, treq.buf,
265
size, offset, tdqcow_complete, aio);
266
td_queue_tiocb(driver, &aio->tiocb);
271
td_complete_request(treq, -EBUSY);
274
static void async_write(td_driver_t *driver, td_request_t treq)
278
struct qcow_request *aio;
279
struct tdqcow_state *prv;
281
prv = (struct tdqcow_state *)driver->data;
282
size = treq.secs * driver->info.sector_size;
283
offset = treq.sec * (uint64_t)driver->info.sector_size;
285
if (prv->aio_free_count == 0)
288
aio = prv->aio_free_list[--prv->aio_free_count];
292
td_prep_write(&aio->tiocb, prv->fd, treq.buf,
293
size, offset, tdqcow_complete, aio);
294
td_queue_tiocb(driver, &aio->tiocb);
299
td_complete_request(treq, -EBUSY);
303
* The crypt function is compatible with the linux cryptoloop
304
* algorithm for < 4 GB images. NOTE: out_buf == in_buf is
307
static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
308
uint8_t *out_buf, const uint8_t *in_buf,
309
int nb_sectors, int enc,
318
for (i = 0; i < nb_sectors; i++) {
319
ivec.ll[0] = cpu_to_le64(sector_num);
321
AES_cbc_encrypt(in_buf, out_buf, 512, key,
329
int qtruncate(int fd, off_t length, int sparse)
332
int current = 0, rem = 0;
337
/* If length is greater than the current file len
338
* we synchronously write zeroes to the end of the
339
* file, otherwise we truncate the length down
341
ret = fstat(fd, &st);
344
if (S_ISBLK(st.st_mode))
347
sectors = (length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
348
current = (st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
349
rem = st.st_size % DEFAULT_SECTOR_SIZE;
351
/* If we are extending this file, we write zeros to the end --
352
* this tries to ensure that the extents allocated wind up being
353
* contiguous on disk.
355
if(st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
356
/*We are extending the file*/
357
if ((ret = posix_memalign((void **)&buf,
358
512, DEFAULT_SECTOR_SIZE))) {
359
DPRINTF("posix_memalign failed: %d\n", ret);
362
memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
363
if (lseek(fd, 0, SEEK_END)==-1) {
364
DPRINTF("Lseek EOF failed (%d), internal error\n",
370
ret = write(fd, buf, rem);
372
DPRINTF("write failed: ret = %d, err = %s\n",
373
ret, strerror(errno));
378
for (i = current; i < sectors; i++ ) {
379
ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
380
if (ret != DEFAULT_SECTOR_SIZE) {
381
DPRINTF("write failed: ret = %d, err = %s\n",
382
ret, strerror(errno));
388
} else if(sparse && (st.st_size > sectors * DEFAULT_SECTOR_SIZE))
389
if (ftruncate(fd, (off_t)sectors * DEFAULT_SECTOR_SIZE)==-1) {
390
DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
400
* 1 to allocate a normal cluster (for sector indexes 'n_start' to
403
* 2 to allocate a compressed cluster of size
404
* 'compressed_size'. 'compressed_size' must be > 0 and <
407
* return 0 if not allocated.
409
static uint64_t get_cluster_offset(struct tdqcow_state *s,
410
uint64_t offset, int allocate,
412
int n_start, int n_end)
414
int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
415
char *tmp_ptr2, *l2_ptr, *l1_ptr;
417
uint64_t l2_offset, *l2_table, cluster_offset, tmp;
421
/*Check L1 table for the extent offset*/
422
l1_index = offset >> (s->l2_bits + s->cluster_bits);
423
l2_offset = s->l1_table[l1_index];
429
* allocating a new l2 entry + extent
430
* at the end of the file, we must also
431
* update the L1 entry safely.
433
l2_offset = s->fd_end;
435
/* round to cluster size */
436
l2_offset = (l2_offset + s->cluster_size - 1)
437
& ~(s->cluster_size - 1);
439
/* update the L1 entry */
440
s->l1_table[l1_index] = l2_offset;
442
/*Truncate file for L2 table
443
*(initialised to zero in case we crash)*/
445
l2_offset + (s->l2_size * sizeof(uint64_t)),
447
DPRINTF("ERROR truncating file\n");
450
s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
452
/*Update the L1 table entry on disk
453
* (for O_DIRECT we write 4KByte blocks)*/
454
l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
455
l1_ptr = (char *)s->l1_table + (l1_sector << 12);
457
if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
458
DPRINTF("ERROR allocating memory for L1 table\n");
460
memcpy(tmp_ptr, l1_ptr, 4096);
462
/* Convert block to write to big endian */
463
for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
464
cpu_to_be64s(&tmp_ptr[i]);
468
* Issue non-asynchronous L1 write.
469
* For safety, we must ensure that
470
* entry is written before blocks.
472
lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
473
if (write(s->fd, tmp_ptr, 4096) != 4096) {
481
} else if (s->min_cluster_alloc == s->l2_size) {
482
/*Fast-track the request*/
483
cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
484
l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
485
return cluster_offset + (l2_index * s->cluster_size);
488
/*Check to see if L2 entry is already cached*/
489
for (i = 0; i < L2_CACHE_SIZE; i++) {
490
if (l2_offset == s->l2_cache_offsets[i]) {
491
/* increment the hit count */
492
if (++s->l2_cache_counts[i] == 0xffffffff) {
493
for (j = 0; j < L2_CACHE_SIZE; j++) {
494
s->l2_cache_counts[j] >>= 1;
497
l2_table = s->l2_cache + (i << s->l2_bits);
503
/* not found: load a new entry in the least used one */
505
min_count = 0xffffffff;
506
for (i = 0; i < L2_CACHE_SIZE; i++) {
507
if (s->l2_cache_counts[i] < min_count) {
508
min_count = s->l2_cache_counts[i];
512
l2_table = s->l2_cache + (min_index << s->l2_bits);
514
/*If extent pre-allocated, read table from disk,
515
*otherwise write new table to disk*/
517
/*Should we allocate the whole extent? Adjustable parameter.*/
518
if (s->cluster_alloc == s->l2_size) {
519
cluster_offset = l2_offset +
520
(s->l2_size * sizeof(uint64_t));
521
cluster_offset = (cluster_offset + s->cluster_size - 1)
522
& ~(s->cluster_size - 1);
523
if (qtruncate(s->fd, cluster_offset +
524
(s->cluster_size * s->l2_size),
526
DPRINTF("ERROR truncating file\n");
529
s->fd_end = cluster_offset +
530
(s->cluster_size * s->l2_size);
531
for (i = 0; i < s->l2_size; i++) {
532
l2_table[i] = cpu_to_be64(cluster_offset +
533
(i*s->cluster_size));
535
} else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
537
lseek(s->fd, l2_offset, SEEK_SET);
538
if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
539
s->l2_size * sizeof(uint64_t))
542
lseek(s->fd, l2_offset, SEEK_SET);
543
if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
544
s->l2_size * sizeof(uint64_t))
548
/*Update the cache entries*/
549
s->l2_cache_offsets[min_index] = l2_offset;
550
s->l2_cache_counts[min_index] = 1;
553
/*The extent is split into 's->l2_size' blocks of
554
*size 's->cluster_size'*/
555
l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
556
cluster_offset = be64_to_cpu(l2_table[l2_index]);
558
if (!cluster_offset ||
559
((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
563
if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
564
(n_end - n_start) < s->cluster_sectors) {
565
/* cluster is already allocated but compressed, we must
566
decompress it in the case it is not completely
568
if (decompress_cluster(s, cluster_offset) < 0)
570
cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
571
cluster_offset = (cluster_offset + s->cluster_size - 1)
572
& ~(s->cluster_size - 1);
573
/* write the cluster content - not asynchronous */
574
lseek(s->fd, cluster_offset, SEEK_SET);
575
if (write(s->fd, s->cluster_cache, s->cluster_size) !=
579
/* allocate a new cluster */
580
cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
582
/* round to cluster size */
584
(cluster_offset + s->cluster_size - 1)
585
& ~(s->cluster_size - 1);
586
if (qtruncate(s->fd, cluster_offset +
587
s->cluster_size, s->sparse)!=0) {
588
DPRINTF("ERROR truncating file\n");
591
s->fd_end = (cluster_offset + s->cluster_size);
592
/* if encrypted, we must initialize the cluster
593
content which won't be written */
594
if (s->crypt_method &&
595
(n_end - n_start) < s->cluster_sectors) {
597
start_sect = (offset &
598
~(s->cluster_size - 1))
600
memset(s->cluster_data + 512,
602
for (i = 0; i < s->cluster_sectors;i++)
604
if (i < n_start || i >= n_end)
606
encrypt_sectors(s, start_sect + i,
608
s->cluster_data + 512, 1, 1,
609
&s->aes_encrypt_key);
610
lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
611
if (write(s->fd, s->cluster_data, 512) != 512)
617
cluster_offset |= QCOW_OFLAG_COMPRESSED |
618
(uint64_t)compressed_size
619
<< (63 - s->cluster_bits);
622
/* update L2 table */
623
tmp = cpu_to_be64(cluster_offset);
624
l2_table[l2_index] = tmp;
626
/*For IO_DIRECT we write 4KByte blocks*/
627
l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
628
l2_ptr = (char *)l2_table + (l2_sector << 12);
630
if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
631
DPRINTF("ERROR allocating memory for L1 table\n");
633
memcpy(tmp_ptr2, l2_ptr, 4096);
634
lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
635
if (write(s->fd, tmp_ptr2, 4096) != 4096) {
641
return cluster_offset;
644
static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
645
int nb_sectors, int *pnum)
647
int index_in_cluster, n;
648
uint64_t cluster_offset;
650
cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
651
index_in_cluster = sector_num & (s->cluster_sectors - 1);
652
n = s->cluster_sectors - index_in_cluster;
656
return (cluster_offset != 0);
659
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
660
const uint8_t *buf, int buf_size)
662
z_stream strm1, *strm = &strm1;
665
memset(strm, 0, sizeof(*strm));
667
strm->next_in = (uint8_t *)buf;
668
strm->avail_in = buf_size;
669
strm->next_out = out_buf;
670
strm->avail_out = out_buf_size;
672
ret = inflateInit2(strm, -12);
675
ret = inflate(strm, Z_FINISH);
676
out_len = strm->next_out - out_buf;
677
if ( (ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
678
(out_len != out_buf_size) ) {
686
static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
691
coffset = cluster_offset & s->cluster_offset_mask;
692
if (s->cluster_cache_offset != coffset) {
693
csize = cluster_offset >> (63 - s->cluster_bits);
694
csize &= (s->cluster_size - 1);
695
lseek(s->fd, coffset, SEEK_SET);
696
ret = read(s->fd, s->cluster_data, csize);
699
if (decompress_buffer(s->cluster_cache, s->cluster_size,
700
s->cluster_data, csize) < 0) {
703
s->cluster_cache_offset = coffset;
709
tdqcow_read_header(int fd, QCowHeader *header)
714
size_t size, expected;
716
memset(header, 0, sizeof(*header));
718
err = fstat(fd, &st);
722
err = lseek(fd, 0, SEEK_SET);
723
if (err == (off_t)-1)
726
size = (sizeof(*header) + 511) & ~511;
727
err = posix_memalign((void **)&buf, 512, size);
732
if (st.st_size < size)
733
expected = st.st_size;
736
err = read(fd, buf, size);
737
if (err != expected) {
738
err = (errno ? -errno : -EIO);
742
memcpy(header, buf, sizeof(*header));
743
be32_to_cpus(&header->magic);
744
be32_to_cpus(&header->version);
745
be64_to_cpus(&header->backing_file_offset);
746
be32_to_cpus(&header->backing_file_size);
747
be32_to_cpus(&header->mtime);
748
be64_to_cpus(&header->size);
749
be32_to_cpus(&header->crypt_method);
750
be64_to_cpus(&header->l1_table_offset);
760
tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header)
766
QCowHeader_ext *exthdr;
767
uint32_t l1_table_bytes, l1_table_block, l1_table_size;
772
shift = s->cluster_bits + s->l2_bits;
774
s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
775
s->l1_table_offset = header->l1_table_offset;
777
s->min_cluster_alloc = 1; /* default */
779
l1_table_bytes = s->l1_size * sizeof(uint64_t);
780
l1_table_size = (l1_table_bytes + 4095) & ~4095;
781
l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095;
783
DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n",
784
(uint64_t)s->l1_table_offset,
785
(int) (s->l1_size * sizeof(uint64_t)),
788
err = fstat(s->fd, &st);
794
err = lseek(s->fd, 0, SEEK_SET);
795
if (err == (off_t)-1) {
800
err = posix_memalign((void **)&buf, 512, l1_table_block);
806
err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
812
memset(buf, 0, l1_table_block);
813
memset(s->l1_table, 0, l1_table_size);
815
expected = l1_table_block;
816
if (st.st_size < l1_table_block)
817
expected = st.st_size;
820
err = read(s->fd, buf, l1_table_block);
821
if (err != expected) {
822
err = (errno ? -errno : -EIO);
826
memcpy(s->l1_table, buf + s->l1_table_offset, l1_table_size);
827
exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));
829
/* check for xen extended header */
830
if (s->l1_table_offset % 4096 == 0 &&
831
be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) {
832
uint32_t flags = be32_to_cpu(exthdr->flags);
833
uint32_t cksum = be32_to_cpu(exthdr->cksum);
836
* Try to detect old tapdisk images. They have to be fixed
837
* because they use big endian rather than native endian for
838
* the L1 table. After this block, the l1 table will
839
* definitely be in BIG endian.
841
if (!(flags & EXTHDR_L1_BIG_ENDIAN)) {
842
DPRINTF("qcow: converting to big endian L1 table\n");
844
/* convert to big endian */
845
for (i = 0; i < s->l1_size; i++)
846
cpu_to_be64s(&s->l1_table[i]);
848
flags |= EXTHDR_L1_BIG_ENDIAN;
849
exthdr->flags = cpu_to_be32(flags);
851
memcpy(buf + s->l1_table_offset,
852
s->l1_table, l1_table_size);
854
err = lseek(s->fd, 0, SEEK_SET);
855
if (err == (off_t)-1) {
860
err = atomicio(vwrite, s->fd, buf, l1_table_block);
861
if (err != l1_table_block) {
867
/* check the L1 table checksum */
868
if (cksum != gen_cksum((char *)s->l1_table,
869
s->l1_size * sizeof(uint64_t)))
870
DPRINTF("qcow: bad L1 checksum\n");
873
s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE);
874
s->min_cluster_alloc =
875
be32_to_cpu(exthdr->min_cluster_alloc);
879
/* convert L1 table to native endian for operation */
880
for (i = 0; i < s->l1_size; i++)
881
be64_to_cpus(&s->l1_table[i]);
894
/* Open the disk file and initialize qcow state. */
895
int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags)
897
int fd, len, i, ret, size, o_flags;
898
td_disk_info_t *bs = &(driver->info);
899
struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
901
uint64_t final_cluster = 0;
903
DPRINTF("QCOW: Opening %s\n", name);
905
o_flags = O_DIRECT | O_LARGEFILE |
906
((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
907
fd = open(name, o_flags);
909
DPRINTF("Unable to open %s (%d)\n", name, -errno);
914
s->name = strdup(name);
918
if (tdqcow_read_header(fd, &header))
921
if (header.magic != QCOW_MAGIC)
924
switch (header.version) {
928
//TODO: Port qcow2 to new blktap framework.
930
// dd->drv = &tapdisk_qcow2;
931
// return dd->drv->td_open(dd, name, flags);
937
if (header.size <= 1 || header.cluster_bits < 9)
939
if (header.crypt_method > QCOW_CRYPT_AES)
941
s->crypt_method_header = header.crypt_method;
942
if (s->crypt_method_header)
944
s->cluster_bits = header.cluster_bits;
945
s->cluster_size = 1 << s->cluster_bits;
946
s->cluster_sectors = 1 << (s->cluster_bits - 9);
947
s->l2_bits = header.l2_bits;
948
s->l2_size = 1 << s->l2_bits;
949
s->cluster_alloc = s->l2_size;
950
bs->size = header.size / 512;
951
s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
952
s->backing_file_offset = header.backing_file_offset;
953
s->backing_file_size = header.backing_file_size;
955
/* allocate and load l1 table */
956
if (tdqcow_load_l1_table(s, &header))
960
size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
961
ret = posix_memalign((void **)&s->l2_cache, 4096, size);
962
if(ret != 0) goto fail;
964
size = s->cluster_size;
965
ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
966
if(ret != 0) goto fail;
968
ret = posix_memalign((void **)&s->cluster_data, 4096, size);
969
if(ret != 0) goto fail;
970
s->cluster_cache_offset = -1;
972
if (s->backing_file_offset != 0)
973
s->cluster_alloc = 1; /*Cannot use pre-alloc*/
975
bs->sector_size = 512;
978
for(i = 0; i < s->l1_size; i++)
979
if (s->l1_table[i] > final_cluster)
980
final_cluster = s->l1_table[i];
982
if (init_aio_state(driver)!=0) {
983
DPRINTF("Unable to initialise AIO state\n");
989
s->fd_end = s->l1_table_offset +
990
((s->l1_size * sizeof(uint64_t) + 4095) & ~4095);
992
s->fd_end = lseek(fd, 0, SEEK_END);
993
if (s->fd_end == (off_t)-1)
1000
DPRINTF("QCOW Open failed\n");
1005
free(s->cluster_cache);
1006
free(s->cluster_data);
1011
void tdqcow_queue_read(td_driver_t *driver, td_request_t treq)
1013
struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
1014
int ret = 0, index_in_cluster, n, i;
1015
uint64_t cluster_offset, sector, nb_sectors;
1016
struct qcow_prv* prv;
1017
td_request_t clone = treq;
1018
char* buf = treq.buf;
1021
nb_sectors = treq.secs;
1023
/*We store a local record of the request*/
1024
while (nb_sectors > 0) {
1026
get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
1027
index_in_cluster = sector & (s->cluster_sectors - 1);
1028
n = s->cluster_sectors - index_in_cluster;
1032
if (s->aio_free_count == 0) {
1033
td_complete_request(treq, -EBUSY);
1037
if(!cluster_offset) {
1039
/* Forward entire request if possible. */
1040
for(i=0; i<nb_sectors; i++)
1041
if(get_cluster_offset(s, (sector+i) << 9, 0, 0, 0, 0))
1042
goto coalesce_failed;
1045
treq.secs = nb_sectors;
1046
td_forward_request(treq);
1052
td_forward_request(treq);
1054
} else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
1055
if (decompress_cluster(s, cluster_offset) < 0) {
1056
td_complete_request(treq, -EIO);
1059
memcpy(buf, s->cluster_cache + index_in_cluster * 512,
1065
td_complete_request(treq, 0);
1068
clone.sec = (cluster_offset>>9)+index_in_cluster;
1070
async_read(driver, clone);
1080
void tdqcow_queue_write(td_driver_t *driver, td_request_t treq)
1082
struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
1083
int ret = 0, index_in_cluster, n, i;
1084
uint64_t cluster_offset, sector, nb_sectors;
1086
struct qcow_prv* prv;
1087
char* buf = treq.buf;
1088
td_request_t clone=treq;
1091
nb_sectors = treq.secs;
1093
/*We store a local record of the request*/
1094
while (nb_sectors > 0) {
1095
index_in_cluster = sector & (s->cluster_sectors - 1);
1096
n = s->cluster_sectors - index_in_cluster;
1100
if (s->aio_free_count == 0) {
1101
td_complete_request(treq, -EBUSY);
1105
cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
1107
index_in_cluster+n);
1108
if (!cluster_offset) {
1109
DPRINTF("Ooops, no write cluster offset!\n");
1110
td_complete_request(treq, -EIO);
1114
if (s->crypt_method) {
1115
encrypt_sectors(s, sector, s->cluster_data,
1116
(unsigned char *)buf, n, 1,
1117
&s->aes_encrypt_key);
1120
clone.sec = (cluster_offset>>9) + index_in_cluster;
1122
async_write(driver, clone);
1125
clone.sec = (cluster_offset>>9) + index_in_cluster;
1128
async_write(driver, clone);
1135
s->cluster_cache_offset = -1; /* disable compressed cache */
1141
tdqcow_update_checksum(struct tdqcow_state *s)
1144
uint32_t offset, cksum, out;
1149
fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */
1155
offset = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum);
1156
if (lseek(fd, offset, SEEK_SET) == (off_t)-1) {
1161
/* convert to big endian for checksum */
1162
for (i = 0; i < s->l1_size; i++)
1163
cpu_to_be64s(&s->l1_table[i]);
1165
cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
1167
/* and back again... */
1168
for (i = 0; i < s->l1_size; i++)
1169
be64_to_cpus(&s->l1_table[i]);
1171
DPRINTF("Writing cksum: %d", cksum);
1173
out = cpu_to_be32(cksum);
1174
if (write(fd, &out, sizeof(out)) != sizeof(out)) {
1183
DPRINTF("failed to update checksum: %d\n", err);
1189
int tdqcow_close(td_driver_t *driver)
1191
struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
1193
/*Update the hdr cksum*/
1194
tdqcow_update_checksum(s);
1200
free(s->cluster_cache);
1201
free(s->cluster_data);
1206
int qcow_create(const char *filename, uint64_t total_size,
1207
const char *backing_file, int sparse)
1209
int fd, header_size, backing_filename_len, l1_size, i;
1210
int shift, length, adjust, flags = 0, ret = 0;
1212
QCowHeader_ext exthdr;
1213
char backing_filename[PATH_MAX], *ptr;
1214
uint64_t tmp, size, total_length;
1217
DPRINTF("Qcow_create: size %"PRIu64"\n",total_size);
1220
O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1225
memset(&header, 0, sizeof(header));
1226
header.magic = cpu_to_be32(QCOW_MAGIC);
1227
header.version = cpu_to_be32(QCOW_VERSION);
1229
/*Create extended header fields*/
1230
exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
1232
header_size = sizeof(header) + sizeof(QCowHeader_ext);
1233
backing_filename_len = 0;
1234
size = (total_size >> SECTOR_SHIFT);
1236
if (strcmp(backing_file, "fat:")) {
1238
/* XXX: this is a hack: we do not attempt to
1239
*check for URL like syntax */
1240
p = strchr(backing_file, ':');
1241
if (p && (p - backing_file) >= 2) {
1242
/* URL like but exclude "c:" like filenames */
1243
strncpy(backing_filename, backing_file,
1244
sizeof(backing_filename));
1246
if (realpath(backing_file, backing_filename) == NULL ||
1247
stat(backing_filename, &st) != 0) {
1251
header.backing_file_offset = cpu_to_be64(header_size);
1252
backing_filename_len = strlen(backing_filename);
1253
header.backing_file_size = cpu_to_be32(
1254
backing_filename_len);
1255
header_size += backing_filename_len;
1257
/*Set to the backing file size*/
1258
if(get_filesize(backing_filename, &size, &st)) {
1261
DPRINTF("Backing file size detected: %"PRId64" sectors"
1262
"(total %"PRId64" [%"PRId64" MB])\n",
1264
(uint64_t)(size << SECTOR_SHIFT),
1265
(uint64_t)(size >> 11));
1267
backing_file = NULL;
1268
DPRINTF("Setting file size: %"PRId64" (total %"PRId64")\n",
1270
(uint64_t) (total_size << SECTOR_SHIFT));
1272
header.mtime = cpu_to_be32(st.st_mtime);
1273
header.cluster_bits = 9; /* 512 byte cluster to avoid copying
1274
unmodifyed sectors */
1275
header.l2_bits = 12; /* 32 KB L2 tables */
1276
exthdr.min_cluster_alloc = cpu_to_be32(1);
1278
DPRINTF("Setting file size: %"PRId64" sectors"
1279
"(total %"PRId64" [%"PRId64" MB])\n",
1281
(uint64_t) (size << SECTOR_SHIFT),
1282
(uint64_t) (size >> 11));
1283
header.cluster_bits = 12; /* 4 KB clusters */
1284
header.l2_bits = 9; /* 4 KB L2 tables */
1285
exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
1287
/*Set the header size value*/
1288
header.size = cpu_to_be64(size * 512);
1290
header_size = (header_size + 7) & ~7;
1291
if (header_size % 4096 > 0) {
1292
header_size = ((header_size >> 12) + 1) << 12;
1295
shift = header.cluster_bits + header.l2_bits;
1296
l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
1298
header.l1_table_offset = cpu_to_be64(header_size);
1299
DPRINTF("L1 Table offset: %d, size %d\n",
1301
(int)(l1_size * sizeof(uint64_t)));
1302
header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1304
ptr = calloc(1, l1_size * sizeof(uint64_t));
1305
exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
1306
printf("Created cksum: %d\n",exthdr.cksum);
1309
/*adjust file length to system page size boundary*/
1310
length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
1312
if (qtruncate(fd, length, 0)!=0) {
1313
DPRINTF("ERROR truncating file\n");
1318
/*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
1319
total_length = length + (l1_size * (1 << 9)) + (size * 512);
1320
if (qtruncate(fd, total_length, 0)!=0) {
1321
DPRINTF("ERROR truncating file\n");
1324
printf("File truncated to length %"PRIu64"\n",total_length);
1326
flags = SPARSE_FILE;
1328
flags |= EXTHDR_L1_BIG_ENDIAN;
1329
exthdr.flags = cpu_to_be32(flags);
1331
/* write all the data */
1332
lseek(fd, 0, SEEK_SET);
1333
ret += write(fd, &header, sizeof(header));
1334
ret += write(fd, &exthdr, sizeof(exthdr));
1336
ret += write(fd, backing_filename, backing_filename_len);
1338
lseek(fd, header_size, SEEK_SET);
1340
for (i = 0;i < l1_size; i++) {
1341
ret += write(fd, &tmp, sizeof(tmp));
1349
static int qcow_make_empty(struct tdqcow_state *s)
1351
uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1353
memset(s->l1_table, 0, l1_length);
1354
lseek(s->fd, s->l1_table_offset, SEEK_SET);
1355
if (write(s->fd, s->l1_table, l1_length) < 0)
1357
if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
1358
DPRINTF("ERROR truncating file\n");
1362
memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
1363
memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
1364
memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
1369
static int qcow_get_cluster_size(struct tdqcow_state *s)
1371
return s->cluster_size;
1374
/* XXX: put compressed sectors first, then all the cluster aligned
1375
tables to avoid losing bytes in alignment */
1376
static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
1382
uint64_t cluster_offset;
1384
out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1388
/* best compression, small window, no zlib header */
1389
memset(&strm, 0, sizeof(strm));
1390
ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1392
9, Z_DEFAULT_STRATEGY);
1398
strm.avail_in = s->cluster_size;
1399
strm.next_in = (uint8_t *)buf;
1400
strm.avail_out = s->cluster_size;
1401
strm.next_out = out_buf;
1403
ret = deflate(&strm, Z_FINISH);
1404
if (ret != Z_STREAM_END && ret != Z_OK) {
1409
out_len = strm.next_out - out_buf;
1413
if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1414
/* could not compress: write normal cluster */
1415
//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
1417
cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
1419
cluster_offset &= s->cluster_offset_mask;
1420
lseek(s->fd, cluster_offset, SEEK_SET);
1421
if (write(s->fd, out_buf, out_len) != out_len) {
1432
tdqcow_get_image_type(const char *file, int *type)
1438
fd = open(file, O_RDONLY);
1442
size = read(fd, &header, sizeof(header));
1444
if (size != sizeof(header))
1445
return (errno ? -errno : -EIO);
1447
be32_to_cpus(&header.magic);
1448
if (header.magic == QCOW_MAGIC)
1449
*type = DISK_TYPE_QCOW;
1451
*type = DISK_TYPE_AIO;
1456
int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
1459
char *buf, *filename;
1460
int len, secs, type, err = -EINVAL;
1461
struct tdqcow_state *child = (struct tdqcow_state *)driver->data;
1463
if (!child->backing_file_offset)
1464
return TD_NO_PARENT;
1466
/* read the backing file name */
1467
len = child->backing_file_size;
1468
off = child->backing_file_offset - (child->backing_file_offset % 512);
1469
secs = (len + (child->backing_file_offset - off) + 511) >> 9;
1471
if (posix_memalign((void **)&buf, 512, secs << 9))
1474
if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
1477
if (read(child->fd, buf, secs << 9) != secs << 9)
1479
filename = buf + (child->backing_file_offset - off);
1480
filename[len] = '\0';
1482
if (tdqcow_get_image_type(filename, &type))
1485
id->name = strdup(filename);
1486
id->drivertype = type;
1493
int tdqcow_validate_parent(td_driver_t *driver,
1494
td_driver_t *pdriver, td_flag_t flags)
1497
uint64_t psize, csize;
1498
struct tdqcow_state *c = (struct tdqcow_state *)driver->data;
1499
struct tdqcow_state *p = (struct tdqcow_state *)pdriver->data;
1501
if (stat(p->name, &stats))
1503
if (get_filesize(p->name, &psize, &stats))
1506
if (stat(c->name, &stats))
1508
if (get_filesize(c->name, &csize, &stats))
1517
struct tap_disk tapdisk_qcow = {
1518
.disk_type = "tapdisk_qcow",
1520
.private_data_size = sizeof(struct tdqcow_state),
1521
.td_open = tdqcow_open,
1522
.td_close = tdqcow_close,
1523
.td_queue_read = tdqcow_queue_read,
1524
.td_queue_write = tdqcow_queue_write,
1525
.td_get_parent_id = tdqcow_get_parent_id,
1526
.td_validate_parent = tdqcow_validate_parent,