1
/*****************************************************************************
3
Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15
Place, Suite 330, Boston, MA 02111-1307 USA
17
*****************************************************************************/
19
/**************************************************//**
21
New index creation routines using a merge sort
23
Created 12/4/2005 Jan Lindstrom
24
Completed by Sunny Bains and Marko Makela
25
*******************************************************/
27
#include "row0merge.h"
33
#include "dict0dict.h"
35
#include "dict0boot.h"
36
#include "dict0crea.h"
37
#include "dict0load.h"
39
#include "mach0data.h"
44
#include "trx0purge.h"
48
#include "read0read.h"
50
#include "lock0lock.h"
51
#include "data0data.h"
52
#include "data0type.h"
54
#include "pars0pars.h"
58
#include "handler0alter.h"
62
/** Set these in order to enable debug printout. */
64
static ibool row_merge_print_cmp;
65
static ibool row_merge_print_read;
66
static ibool row_merge_print_write;
68
#endif /* UNIV_DEBUG */
70
/** @brief Block size for I/O operations in merge sort.
72
The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
73
rounded to a power of 2.
75
When not creating a PRIMARY KEY that contains column prefixes, this
76
can be set as small as UNIV_PAGE_SIZE / 2. See the comment above
77
ut_ad(data_size < sizeof(row_merge_block_t)). */
78
typedef byte row_merge_block_t[1048576];
80
/** @brief Secondary buffer for I/O operations of merge records.
82
This buffer is used for writing or reading a record that spans two
83
row_merge_block_t. Thus, it must be able to hold one merge record,
84
whose maximum size is the same as the minimum size of
row_merge_block_t. */
86
typedef byte mrec_buf_t[UNIV_PAGE_SIZE];
88
/** @brief Merge record in row_merge_block_t.
90
The format is the same as a record in ROW_FORMAT=COMPACT with the
91
exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
94
/** Buffer for sorting in main memory. */
95
struct row_merge_buf_struct {
96
mem_heap_t* heap; /*!< memory heap where allocated */
97
dict_index_t* index; /*!< the index the tuples belong to */
98
ulint total_size; /*!< total amount of data bytes */
99
ulint n_tuples; /*!< number of data tuples */
100
ulint max_tuples; /*!< maximum number of data tuples */
101
const dfield_t**tuples; /*!< array of pointers to
102
arrays of fields that form
104
const dfield_t**tmp_tuples; /*!< temporary copy of tuples,
108
/** Buffer for sorting in main memory. */
109
typedef struct row_merge_buf_struct row_merge_buf_t;
111
/** Information about temporary files used in merge sort */
112
struct merge_file_struct {
113
int fd; /*!< file descriptor */
114
ulint offset; /*!< file offset */
117
/** Information about temporary files used in merge sort */
118
typedef struct merge_file_struct merge_file_t;
121
/******************************************************//**
122
Display a merge tuple. */
125
row_merge_tuple_print(
126
/*==================*/
127
FILE* f, /*!< in: output stream */
128
const dfield_t* entry, /*!< in: tuple to print */
129
ulint n_fields)/*!< in: number of fields in the tuple */
133
for (j = 0; j < n_fields; j++) {
134
const dfield_t* field = &entry[j];
136
if (dfield_is_null(field)) {
137
fputs("\n NULL;", f);
139
ulint field_len = dfield_get_len(field);
140
ulint len = ut_min(field_len, 20);
141
if (dfield_is_ext(field)) {
146
ut_print_buf(f, dfield_get_data(field), len);
147
if (len != field_len) {
148
fprintf(f, " (total %lu bytes)", field_len);
154
#endif /* UNIV_DEBUG */
156
/******************************************************//**
157
Allocate a sort buffer.
158
@return own: sort buffer */
161
row_merge_buf_create_low(
162
/*=====================*/
163
mem_heap_t* heap, /*!< in: heap where allocated */
164
dict_index_t* index, /*!< in: secondary index */
165
ulint max_tuples, /*!< in: maximum number of data tuples */
166
ulint buf_size) /*!< in: size of the buffer, in bytes */
168
row_merge_buf_t* buf;
170
ut_ad(max_tuples > 0);
171
ut_ad(max_tuples <= sizeof(row_merge_block_t));
172
ut_ad(max_tuples < buf_size);
174
buf = mem_heap_zalloc(heap, buf_size);
177
buf->max_tuples = max_tuples;
178
buf->tuples = mem_heap_alloc(heap,
179
2 * max_tuples * sizeof *buf->tuples);
180
buf->tmp_tuples = buf->tuples + max_tuples;
185
/******************************************************//**
186
Allocate a sort buffer.
187
@return own: sort buffer */
190
row_merge_buf_create(
191
/*=================*/
192
dict_index_t* index) /*!< in: secondary index */
194
row_merge_buf_t* buf;
199
max_tuples = sizeof(row_merge_block_t)
200
/ ut_max(1, dict_index_get_min_size(index));
202
buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
204
heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
206
buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
211
/******************************************************//**
213
@return sort buffer */
218
row_merge_buf_t* buf) /*!< in,own: sort buffer */
221
ulint max_tuples = buf->max_tuples;
222
mem_heap_t* heap = buf->heap;
223
dict_index_t* index = buf->index;
225
buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
227
mem_heap_empty(heap);
229
return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
232
/******************************************************//**
233
Deallocate a sort buffer. */
238
row_merge_buf_t* buf) /*!< in,own: sort buffer, to be freed */
240
mem_heap_free(buf->heap);
243
/******************************************************//**
244
Insert a data tuple into a sort buffer.
245
@return TRUE if added, FALSE if out of space */
250
row_merge_buf_t* buf, /*!< in/out: sort buffer */
251
const dtuple_t* row, /*!< in: row in clustered index */
252
const row_ext_t* ext) /*!< in: cache of externally stored
253
column prefixes, or NULL */
259
const dict_index_t* index;
263
if (buf->n_tuples >= buf->max_tuples) {
267
UNIV_PREFETCH_R(row->fields);
271
n_fields = dict_index_get_n_fields(index);
273
entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
274
buf->tuples[buf->n_tuples] = entry;
278
extra_size = UT_BITS_IN_BYTES(index->n_nullable);
280
for (i = 0; i < n_fields; i++, field++) {
281
const dict_field_t* ifield;
282
const dict_col_t* col;
284
const dfield_t* row_field;
287
ifield = dict_index_get_nth_field(index, i);
289
col_no = dict_col_get_no(col);
290
row_field = dtuple_get_nth_field(row, col_no);
291
dfield_copy(field, row_field);
292
len = dfield_get_len(field);
294
if (dfield_is_null(field)) {
295
ut_ad(!(col->prtype & DATA_NOT_NULL));
297
} else if (UNIV_LIKELY(!ext)) {
298
} else if (dict_index_is_clust(index)) {
299
/* Flag externally stored fields. */
300
const byte* buf = row_ext_lookup(ext, col_no,
302
if (UNIV_LIKELY_NULL(buf)) {
303
ut_a(buf != field_ref_zero);
304
if (i < dict_index_get_n_unique(index)) {
305
dfield_set_data(field, buf, len);
307
dfield_set_ext(field);
308
len = dfield_get_len(field);
312
const byte* buf = row_ext_lookup(ext, col_no,
314
if (UNIV_LIKELY_NULL(buf)) {
315
ut_a(buf != field_ref_zero);
316
dfield_set_data(field, buf, len);
320
/* If a column prefix index, take only the prefix */
322
if (ifield->prefix_len) {
323
len = dtype_get_at_most_n_mbchars(
325
col->mbminlen, col->mbmaxlen,
327
len, dfield_get_data(field));
328
dfield_set_len(field, len);
331
ut_ad(len <= col->len || col->mtype == DATA_BLOB);
333
if (ifield->fixed_len) {
334
ut_ad(len == ifield->fixed_len);
335
ut_ad(!dfield_is_ext(field));
336
} else if (dfield_is_ext(field)) {
339
|| (col->len < 256 && col->mtype != DATA_BLOB)) {
342
/* For variable-length columns, we look up the
343
maximum length from the column itself. If this
344
is a prefix index column shorter than 256 bytes,
345
this will waste one byte. */
356
size = rec_get_converted_size_comp(index,
358
entry, n_fields, &extra);
360
ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
361
ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
363
#endif /* UNIV_DEBUG */
365
/* Add to the total size of the record in row_merge_block_t
366
the encoded length of extra_size and the extra bytes (extra_size).
367
See row_merge_buf_write() for the variable-length encoding
369
data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
371
/* The following assertion may fail if row_merge_block_t is
372
declared very small and a PRIMARY KEY is being created with
373
many prefix columns. In that case, the record may exceed the
374
page_zip_rec_needs_ext() limit. However, no further columns
375
will be moved to external storage until the record is inserted
376
to the clustered index B-tree. */
377
ut_ad(data_size < sizeof(row_merge_block_t));
379
/* Reserve one byte for the end marker of row_merge_block_t. */
380
if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
384
buf->total_size += data_size;
389
/* Copy the data fields. */
392
dfield_dup(field++, buf->heap);
393
} while (--n_fields);
398
/** Structure for reporting duplicate records. */
399
struct row_merge_dup_struct {
400
const dict_index_t* index; /*!< index being sorted */
401
TABLE* table; /*!< MySQL table object */
402
ulint n_dup; /*!< number of duplicates */
405
/** Structure for reporting duplicate records. */
406
typedef struct row_merge_dup_struct row_merge_dup_t;
408
/*************************************************************//**
409
Report a duplicate key. */
412
row_merge_dup_report(
413
/*=================*/
414
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
415
const dfield_t* entry) /*!< in: duplicate index entry */
418
const dtuple_t* tuple;
419
dtuple_t tuple_store;
421
const dict_index_t* index = dup->index;
422
ulint n_fields= dict_index_get_n_fields(index);
423
mem_heap_t* heap = NULL;
424
ulint offsets_[REC_OFFS_NORMAL_SIZE];
429
/* Only report the first duplicate record,
430
but count all duplicate records. */
434
rec_offs_init(offsets_);
436
/* Convert the tuple to a record and then to MySQL format. */
438
tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
439
n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
441
rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
442
offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
445
innobase_rec_to_mysql(dup->table, rec, index, offsets);
447
if (UNIV_LIKELY_NULL(heap)) {
452
/*************************************************************//**
454
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
459
ulint n_field,/*!< in: number of fields */
460
const dfield_t* a, /*!< in: first tuple to be compared */
461
const dfield_t* b, /*!< in: second tuple to be compared */
462
row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
465
const dfield_t* field = a;
467
/* Compare the fields of the tuples until a difference is
468
found or we run out of fields to compare. If !cmp at the
469
end, the tuples are equal. */
471
cmp = cmp_dfield_dfield(a++, b++);
472
} while (!cmp && --n_field);
474
if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
475
/* Report a duplicate value error if the tuples are
476
logically equal. NULL columns are logically inequal,
477
although they are equal in the sorting order. Find
478
out if any of the fields are NULL. */
479
for (b = field; b != a; b++) {
480
if (dfield_is_null(b)) {
486
row_merge_dup_report(dup, field);
493
/** Wrapper for row_merge_tuple_sort() to inject some more context to
494
UT_SORT_FUNCTION_BODY().
495
@param a array of tuples that being sorted
496
@param b aux (work area), same size as tuples[]
497
@param c lower bound of the sorting area, inclusive
498
@param d upper bound of the sorting area, inclusive */
499
#define row_merge_tuple_sort_ctx(a,b,c,d) \
500
row_merge_tuple_sort(n_field, dup, a, b, c, d)
501
/** Wrapper for row_merge_tuple_cmp() to inject some more context to
502
UT_SORT_FUNCTION_BODY().
503
@param a first tuple to be compared
504
@param b second tuple to be compared
505
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
506
#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
508
/**********************************************************************//**
509
Merge sort the tuple buffer in main memory. */
512
row_merge_tuple_sort(
513
/*=================*/
514
ulint n_field,/*!< in: number of fields */
515
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
516
const dfield_t** tuples, /*!< in/out: tuples */
517
const dfield_t** aux, /*!< in/out: work area */
518
ulint low, /*!< in: lower bound of the
519
sorting area, inclusive */
520
ulint high) /*!< in: upper bound of the
521
sorting area, exclusive */
523
UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
524
tuples, aux, low, high, row_merge_tuple_cmp_ctx);
527
/******************************************************//**
533
row_merge_buf_t* buf, /*!< in/out: sort buffer */
534
row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
536
row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
537
buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
540
/******************************************************//**
541
Write a buffer to a block. */
546
const row_merge_buf_t* buf, /*!< in: sorted buffer */
548
const merge_file_t* of, /*!< in: output file */
549
#endif /* UNIV_DEBUG */
550
row_merge_block_t* block) /*!< out: buffer for writing to file */
552
# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
553
#endif /* !UNIV_DEBUG */
555
const dict_index_t* index = buf->index;
556
ulint n_fields= dict_index_get_n_fields(index);
557
byte* b = &(*block)[0];
561
for (i = 0; i < buf->n_tuples; i++) {
564
const dfield_t* entry = buf->tuples[i];
566
size = rec_get_converted_size_comp(index,
570
ut_ad(size > extra_size);
571
ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
572
extra_size -= REC_N_NEW_EXTRA_BYTES;
573
size -= REC_N_NEW_EXTRA_BYTES;
575
/* Encode extra_size + 1 */
576
if (extra_size + 1 < 0x80) {
577
*b++ = (byte) (extra_size + 1);
579
ut_ad((extra_size + 1) < 0x8000);
580
*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
581
*b++ = (byte) (extra_size + 1);
584
ut_ad(b + size < block[1]);
586
rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
593
if (row_merge_print_write) {
594
fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
595
(void*) b, of->fd, (ulong) of->offset,
597
row_merge_tuple_print(stderr, entry, n_fields);
599
#endif /* UNIV_DEBUG */
602
/* Write an "end-of-chunk" marker. */
604
ut_a(b == block[0] + buf->total_size);
606
#ifdef UNIV_DEBUG_VALGRIND
607
/* The rest of the block is uninitialized. Initialize it
608
to avoid bogus warnings. */
609
memset(b, 0xff, block[1] - b);
610
#endif /* UNIV_DEBUG_VALGRIND */
612
if (row_merge_print_write) {
613
fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
614
(void*) b, of->fd, (ulong) of->offset);
616
#endif /* UNIV_DEBUG */
619
/******************************************************//**
620
Create a memory heap and allocate space for row_merge_rec_offsets().
621
@return memory heap */
624
row_merge_heap_create(
625
/*==================*/
626
const dict_index_t* index, /*!< in: record descriptor */
627
ulint** offsets1, /*!< out: offsets */
628
ulint** offsets2) /*!< out: offsets */
630
ulint i = 1 + REC_OFFS_HEADER_SIZE
631
+ dict_index_get_n_fields(index);
632
mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1);
634
*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
635
*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
637
(*offsets1)[0] = (*offsets2)[0] = i;
638
(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
643
/**********************************************************************//**
644
Search an index object by name and column names. If several indexes match,
645
return the index with the max id.
646
@return matching index, NULL if not found */
649
row_merge_dict_table_get_index(
650
/*===========================*/
651
dict_table_t* table, /*!< in: table */
652
const merge_index_def_t*index_def) /*!< in: index definition */
656
const char** column_names;
658
column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
660
for (i = 0; i < index_def->n_fields; ++i) {
661
column_names[i] = index_def->fields[i].field_name;
664
index = dict_table_get_index_by_max_id(
665
table, index_def->name, column_names, index_def->n_fields);
667
mem_free((void*) column_names);
672
/********************************************************************//**
673
Read a merge block from the file system.
674
@return TRUE if request was successful, FALSE if fail */
679
int fd, /*!< in: file descriptor */
680
ulint offset, /*!< in: offset where to read */
681
row_merge_block_t* buf) /*!< out: data */
683
ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf;
686
success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
687
(ulint) (ofs & 0xFFFFFFFF),
690
if (UNIV_UNLIKELY(!success)) {
691
ut_print_timestamp(stderr);
693
" InnoDB: failed to read merge block at %"PRIu64"\n", ofs);
696
return(UNIV_LIKELY(success));
699
/********************************************************************//**
700
Read a merge block from the file system.
701
@return TRUE if request was successful, FALSE if fail */
706
int fd, /*!< in: file descriptor */
707
ulint offset, /*!< in: offset where to write */
708
const void* buf) /*!< in: data */
710
ib_uint64_t ofs = ((ib_uint64_t) offset)
711
* sizeof(row_merge_block_t);
713
return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
714
(ulint) (ofs & 0xFFFFFFFF),
716
sizeof(row_merge_block_t))));
719
/********************************************************************//**
721
@return pointer to next record, or NULL on I/O error or end of list */
726
row_merge_block_t* block, /*!< in/out: file buffer */
727
mrec_buf_t* buf, /*!< in/out: secondary buffer */
728
const byte* b, /*!< in: pointer to record */
729
const dict_index_t* index, /*!< in: index of the record */
730
int fd, /*!< in: file descriptor */
731
ulint* foffs, /*!< in/out: file offset */
732
const mrec_t** mrec, /*!< out: pointer to merge record,
733
or NULL on end of list
734
(non-NULL on I/O error) */
735
ulint* offsets)/*!< out: offsets of mrec */
743
ut_ad(b >= block[0]);
750
ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
751
+ dict_index_get_n_fields(index));
755
if (UNIV_UNLIKELY(!extra_size)) {
759
if (row_merge_print_read) {
760
fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
761
(const void*) b, (const void*) block,
764
#endif /* UNIV_DEBUG */
768
if (extra_size >= 0x80) {
769
/* Read another byte of extra_size. */
771
if (UNIV_UNLIKELY(b >= block[1])) {
772
if (!row_merge_read(fd, ++(*foffs), block)) {
774
/* Signal I/O error. */
779
/* Wrap around to the beginning of the buffer. */
783
extra_size = (extra_size & 0x7f) << 8;
787
/* Normalize extra_size. Above, value 0 signals "end of list". */
790
/* Read the extra bytes. */
792
if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
793
/* The record spans two blocks. Copy the entire record
794
to the auxiliary buffer and handle this as a special
797
avail_size = block[1] - b;
799
memcpy(*buf, b, avail_size);
801
if (!row_merge_read(fd, ++(*foffs), block)) {
806
/* Wrap around to the beginning of the buffer. */
809
/* Copy the record. */
810
memcpy(*buf + avail_size, b, extra_size - avail_size);
811
b += extra_size - avail_size;
813
*mrec = *buf + extra_size;
815
rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
817
data_size = rec_offs_data_size(offsets);
819
/* These overflows should be impossible given that
820
records are much smaller than either buffer, and
821
the record starts near the beginning of each buffer. */
822
ut_a(extra_size + data_size < sizeof *buf);
823
ut_a(b + data_size < block[1]);
825
/* Copy the data bytes. */
826
memcpy(*buf + extra_size, b, data_size);
832
*mrec = b + extra_size;
834
rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
836
data_size = rec_offs_data_size(offsets);
837
ut_ad(extra_size + data_size < sizeof *buf);
839
b += extra_size + data_size;
841
if (UNIV_LIKELY(b < block[1])) {
842
/* The record fits entirely in the block.
843
This is the normal case. */
847
/* The record spans two blocks. Copy it to buf. */
849
b -= extra_size + data_size;
850
avail_size = block[1] - b;
851
memcpy(*buf, b, avail_size);
852
*mrec = *buf + extra_size;
854
/* We cannot invoke rec_offs_make_valid() here, because there
855
are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
856
Similarly, rec_offs_validate() would fail, because it invokes
858
offsets[2] = (ulint) *mrec;
859
offsets[3] = (ulint) index;
860
#endif /* UNIV_DEBUG */
862
if (!row_merge_read(fd, ++(*foffs), block)) {
867
/* Wrap around to the beginning of the buffer. */
870
/* Copy the rest of the record. */
871
memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
872
b += extra_size + data_size - avail_size;
876
if (row_merge_print_read) {
877
fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
878
(const void*) b, (const void*) block,
880
rec_print_comp(stderr, *mrec, offsets);
883
#endif /* UNIV_DEBUG */
888
/********************************************************************//**
889
Write a merge record. */
892
row_merge_write_rec_low(
893
/*====================*/
894
byte* b, /*!< out: buffer */
895
ulint e, /*!< in: encoded extra_size */
897
ulint size, /*!< in: total size to write */
898
int fd, /*!< in: file descriptor */
899
ulint foffs, /*!< in: file offset */
900
#endif /* UNIV_DEBUG */
901
const mrec_t* mrec, /*!< in: record to write */
902
const ulint* offsets)/*!< in: offsets of mrec */
904
# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
905
row_merge_write_rec_low(b, e, mrec, offsets)
906
#endif /* !UNIV_DEBUG */
909
const byte* const end = b + size;
910
ut_ad(e == rec_offs_extra_size(offsets) + 1);
912
if (row_merge_print_write) {
913
fprintf(stderr, "row_merge_write %p,%d,%lu ",
914
(void*) b, fd, (ulong) foffs);
915
rec_print_comp(stderr, mrec, offsets);
918
#endif /* UNIV_DEBUG */
923
*b++ = (byte) (0x80 | (e >> 8));
927
memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
928
ut_ad(b + rec_offs_size(offsets) == end);
931
/********************************************************************//**
932
Write a merge record.
933
@return pointer to end of block, or NULL on error */
938
row_merge_block_t* block, /*!< in/out: file buffer */
939
mrec_buf_t* buf, /*!< in/out: secondary buffer */
940
byte* b, /*!< in: pointer to end of block */
941
int fd, /*!< in: file descriptor */
942
ulint* foffs, /*!< in/out: file offset */
943
const mrec_t* mrec, /*!< in: record to write */
944
const ulint* offsets)/*!< in: offsets of mrec */
952
ut_ad(b >= block[0]);
956
ut_ad(mrec < block[0] || mrec > block[1]);
957
ut_ad(mrec < buf[0] || mrec > buf[1]);
959
/* Normalize extra_size. Value 0 signals "end of list". */
960
extra_size = rec_offs_extra_size(offsets) + 1;
962
size = extra_size + (extra_size >= 0x80)
963
+ rec_offs_data_size(offsets);
965
if (UNIV_UNLIKELY(b + size >= block[1])) {
966
/* The record spans two blocks.
967
Copy it to the temporary buffer first. */
968
avail_size = block[1] - b;
970
row_merge_write_rec_low(buf[0],
971
extra_size, size, fd, *foffs,
974
/* Copy the head of the temporary buffer, write
975
the completed block, and copy the tail of the
976
record to the head of the new block. */
977
memcpy(b, buf[0], avail_size);
979
if (!row_merge_write(fd, (*foffs)++, block)) {
983
UNIV_MEM_INVALID(block[0], sizeof block[0]);
987
memcpy(b, buf[0] + avail_size, size - avail_size);
988
b += size - avail_size;
990
row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
998
/********************************************************************//**
999
Write an end-of-list marker.
1000
@return pointer to end of block, or NULL on error */
1003
row_merge_write_eof(
1004
/*================*/
1005
row_merge_block_t* block, /*!< in/out: file buffer */
1006
byte* b, /*!< in: pointer to end of block */
1007
int fd, /*!< in: file descriptor */
1008
ulint* foffs) /*!< in/out: file offset */
1011
ut_ad(b >= block[0]);
1012
ut_ad(b < block[1]);
1015
if (row_merge_print_write) {
1016
fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
1017
(void*) b, (void*) block, fd, (ulong) *foffs);
1019
#endif /* UNIV_DEBUG */
1022
UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
1023
UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
1024
#ifdef UNIV_DEBUG_VALGRIND
1025
/* The rest of the block is uninitialized. Initialize it
1026
to avoid bogus warnings. */
1027
memset(b, 0xff, block[1] - b);
1028
#endif /* UNIV_DEBUG_VALGRIND */
1030
if (!row_merge_write(fd, (*foffs)++, block)) {
1034
UNIV_MEM_INVALID(block[0], sizeof block[0]);
1038
/*************************************************************//**
1039
Compare two merge records.
1040
@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
1045
const mrec_t* mrec1, /*!< in: first merge
1046
record to be compared */
1047
const mrec_t* mrec2, /*!< in: second merge
1048
record to be compared */
1049
const ulint* offsets1, /*!< in: first record offsets */
1050
const ulint* offsets2, /*!< in: second record offsets */
1051
const dict_index_t* index) /*!< in: index */
1055
cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
1058
if (row_merge_print_cmp) {
1059
fputs("row_merge_cmp1 ", stderr);
1060
rec_print_comp(stderr, mrec1, offsets1);
1061
fputs("\nrow_merge_cmp2 ", stderr);
1062
rec_print_comp(stderr, mrec2, offsets2);
1063
fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
1065
#endif /* UNIV_DEBUG */
1070
/********************************************************************//**
1071
Reads clustered index of the table and create temporary files
1072
containing the index entries for the indexes to be built.
1073
@return DB_SUCCESS or error */
1076
row_merge_read_clustered_index(
1077
/*===========================*/
1078
trx_t* trx, /*!< in: transaction */
1079
TABLE* table, /*!< in/out: MySQL table object,
1080
for reporting erroneous records */
1081
const dict_table_t* old_table,/*!< in: table where rows are
1083
const dict_table_t* new_table,/*!< in: table where indexes are
1084
created; identical to old_table
1085
unless creating a PRIMARY KEY */
1086
dict_index_t** index, /*!< in: indexes to be created */
1087
merge_file_t* files, /*!< in: temporary files */
1088
ulint n_index,/*!< in: number of indexes to create */
1089
row_merge_block_t* block) /*!< in/out: file buffer */
1091
dict_index_t* clust_index; /* Clustered index */
1092
mem_heap_t* row_heap; /* Heap memory to create
1093
clustered index records */
1094
row_merge_buf_t** merge_buf; /* Temporary list for records*/
1095
btr_pcur_t pcur; /* Persistent cursor on the
1097
mtr_t mtr; /* Mini transaction */
1098
ulint err = DB_SUCCESS;/* Return code */
1100
ulint n_nonnull = 0; /* number of columns
1101
changed to NOT NULL */
1102
ulint* nonnull = NULL; /* NOT NULL columns */
1104
trx->op_info = "reading clustered index";
1112
/* Create and initialize memory for record buffers */
1114
merge_buf = mem_alloc(n_index * sizeof *merge_buf);
1116
for (i = 0; i < n_index; i++) {
1117
merge_buf[i] = row_merge_buf_create(index[i]);
1122
/* Find the clustered index and create a persistent cursor
1125
clust_index = dict_table_get_first_index(old_table);
1127
btr_pcur_open_at_index_side(
1128
TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
1130
if (UNIV_UNLIKELY(old_table != new_table)) {
1131
ulint n_cols = dict_table_get_n_cols(old_table);
1133
/* A primary key will be created. Identify the
1134
columns that were flagged NOT NULL in the new table,
1135
so that we can quickly check that the records in the
1136
(old) clustered index do not violate the added NOT
1137
NULL constraints. */
1139
ut_a(n_cols == dict_table_get_n_cols(new_table));
1141
nonnull = mem_alloc(n_cols * sizeof *nonnull);
1143
for (i = 0; i < n_cols; i++) {
1144
if (dict_table_get_nth_col(old_table, i)->prtype
1150
if (dict_table_get_nth_col(new_table, i)->prtype
1153
nonnull[n_nonnull++] = i;
1163
row_heap = mem_heap_create(sizeof(mrec_buf_t));
1165
/* Scan the clustered index. */
1169
dtuple_t* row = NULL;
1171
ibool has_next = TRUE;
1173
btr_pcur_move_to_next_on_page(&pcur);
1175
/* When switching pages, commit the mini-transaction
1176
in order to release the latch on the old page. */
1178
if (btr_pcur_is_after_last_on_page(&pcur)) {
1179
btr_pcur_store_position(&pcur, &mtr);
1182
btr_pcur_restore_position(BTR_SEARCH_LEAF,
1184
has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
1187
if (UNIV_LIKELY(has_next)) {
1188
rec = btr_pcur_get_rec(&pcur);
1189
offsets = rec_get_offsets(rec, clust_index, NULL,
1190
ULINT_UNDEFINED, &row_heap);
1192
/* Skip delete marked records. */
1193
if (rec_get_deleted_flag(
1194
rec, dict_table_is_comp(old_table))) {
1198
srv_n_rows_inserted++;
1200
/* Build a row based on the clustered index. */
1202
row = row_build(ROW_COPY_POINTERS, clust_index,
1204
new_table, &ext, row_heap);
1206
if (UNIV_LIKELY_NULL(nonnull)) {
1207
for (i = 0; i < n_nonnull; i++) {
1209
= &row->fields[nonnull[i]];
1211
= dfield_get_type(field);
1213
ut_a(!(field_type->prtype
1216
if (dfield_is_null(field)) {
1217
err = DB_PRIMARY_KEY_IS_NULL;
1222
field_type->prtype |= DATA_NOT_NULL;
1227
/* Build all entries for all the indexes to be created
1228
in a single scan of the clustered index. */
1230
for (i = 0; i < n_index; i++) {
1231
row_merge_buf_t* buf = merge_buf[i];
1232
merge_file_t* file = &files[i];
1233
const dict_index_t* index = buf->index;
1236
(row && row_merge_buf_add(buf, row, ext))) {
1240
/* The buffer must be sufficiently large
1241
to hold at least one record. */
1242
ut_ad(buf->n_tuples || !has_next);
1244
/* We have enough data tuples to form a block.
1245
Sort them and write to disk. */
1247
if (buf->n_tuples) {
1248
if (dict_index_is_unique(index)) {
1249
row_merge_dup_t dup;
1250
dup.index = buf->index;
1254
row_merge_buf_sort(buf, &dup);
1257
err = DB_DUPLICATE_KEY;
1259
trx->error_key_num = i;
1263
row_merge_buf_sort(buf, NULL);
1267
row_merge_buf_write(buf, file, block);
1269
if (!row_merge_write(file->fd, file->offset++,
1271
err = DB_OUT_OF_FILE_SPACE;
1275
UNIV_MEM_INVALID(block[0], sizeof block[0]);
1276
merge_buf[i] = row_merge_buf_empty(buf);
1278
/* Try writing the record again, now that
1279
the buffer has been written out and emptied. */
1282
(row && !row_merge_buf_add(buf, row, ext))) {
1283
/* An empty buffer should have enough
1284
room for at least one record. */
1289
mem_heap_empty(row_heap);
1291
if (UNIV_UNLIKELY(!has_next)) {
1297
btr_pcur_close(&pcur);
1299
mem_heap_free(row_heap);
1301
if (UNIV_LIKELY_NULL(nonnull)) {
1305
for (i = 0; i < n_index; i++) {
1306
row_merge_buf_free(merge_buf[i]);
1309
mem_free(merge_buf);
1316
/** Write a record via buffer 2 and read the next record to buffer N.
1317
@param N number of the buffer (0 or 1)
1318
@param AT_END statement to execute at end of input */
1319
#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \
1321
b2 = row_merge_write_rec(&block[2], &buf[2], b2, \
1322
of->fd, &of->offset, \
1323
mrec##N, offsets##N); \
1324
if (UNIV_UNLIKELY(!b2)) { \
1327
b##N = row_merge_read_rec(&block[N], &buf[N], \
1329
file->fd, foffs##N, \
1330
&mrec##N, offsets##N); \
1331
if (UNIV_UNLIKELY(!b##N)) { \
1339
/*************************************************************//**
1340
Merge two blocks of linked lists on disk and write a bigger block.
1341
@return DB_SUCCESS or error code */
1346
const dict_index_t* index, /*!< in: index being created */
1347
merge_file_t* file, /*!< in/out: file containing
1349
row_merge_block_t* block, /*!< in/out: 3 buffers */
1350
ulint* foffs0, /*!< in/out: offset of first
1351
source list in the file */
1352
ulint* foffs1, /*!< in/out: offset of second
1353
source list in the file */
1354
merge_file_t* of, /*!< in/out: output file */
1355
TABLE* table) /*!< in/out: MySQL table, for
1356
reporting erroneous key value
1359
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
1361
mrec_buf_t buf[3]; /*!< buffer for handling split mrec in block[] */
1362
const byte* b0; /*!< pointer to block[0] */
1363
const byte* b1; /*!< pointer to block[1] */
1364
byte* b2; /*!< pointer to block[2] */
1365
const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */
1366
const mrec_t* mrec1; /*!< merge rec, points to block[1] or buf[1] */
1367
ulint* offsets0;/* offsets of mrec0 */
1368
ulint* offsets1;/* offsets of mrec1 */
1370
heap = row_merge_heap_create(index, &offsets0, &offsets1);
1372
/* Write a record and read the next record. Split the output
1373
file in two halves, which can be merged on the following pass. */
1375
if (!row_merge_read(file->fd, *foffs0, &block[0])
1376
|| !row_merge_read(file->fd, *foffs1, &block[1])) {
1378
mem_heap_free(heap);
1379
return(DB_CORRUPTION);
1386
b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
1387
foffs0, &mrec0, offsets0);
1388
b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
1389
foffs1, &mrec1, offsets1);
1390
if (UNIV_UNLIKELY(!b0 && mrec0)
1391
|| UNIV_UNLIKELY(!b1 && mrec1)) {
1396
while (mrec0 && mrec1) {
1397
switch (row_merge_cmp(mrec0, mrec1,
1398
offsets0, offsets1, index)) {
1401
(dict_index_is_unique(index))) {
1402
innobase_rec_to_mysql(table, mrec0,
1404
mem_heap_free(heap);
1405
return(DB_DUPLICATE_KEY);
1409
ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
1412
ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
1422
/* append all mrec0 to output */
1424
ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
1429
/* append all mrec1 to output */
1431
ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
1436
mem_heap_free(heap);
1437
b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
1438
return(b2 ? DB_SUCCESS : DB_CORRUPTION);
1441
/*************************************************************//**
1443
@return DB_SUCCESS or error code */
1448
const dict_index_t* index, /*!< in: index being created */
1449
merge_file_t* file, /*!< in/out: file containing
1451
ulint half, /*!< in: half the file */
1452
row_merge_block_t* block, /*!< in/out: 3 buffers */
1453
int* tmpfd, /*!< in/out: temporary file handle */
1454
TABLE* table) /*!< in/out: MySQL table, for
1455
reporting erroneous key value
1458
ulint foffs0; /*!< first input offset */
1459
ulint foffs1; /*!< second input offset */
1460
ulint error; /*!< error code */
1461
merge_file_t of; /*!< output file */
1463
UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
1469
/* Merge blocks to the output file. */
1473
for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) {
1474
error = row_merge_blocks(index, file, block,
1475
&foffs0, &foffs1, &of, table);
1477
if (error != DB_SUCCESS) {
1482
/* Copy the last block, if there is one. */
1483
while (foffs0 < half) {
1484
if (!row_merge_read(file->fd, foffs0++, block)
1485
|| !row_merge_write(of.fd, of.offset++, block)) {
1486
return(DB_CORRUPTION);
1489
while (foffs1 < file->offset) {
1490
if (!row_merge_read(file->fd, foffs1++, block)
1491
|| !row_merge_write(of.fd, of.offset++, block)) {
1492
return(DB_CORRUPTION);
1496
/* Swap file descriptors for the next pass. */
1500
UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
1505
/*************************************************************//**
1507
@return DB_SUCCESS or error code */
1512
const dict_index_t* index, /*!< in: index being created */
1513
merge_file_t* file, /*!< in/out: file containing
1515
row_merge_block_t* block, /*!< in/out: 3 buffers */
1516
int* tmpfd, /*!< in/out: temporary file handle */
1517
TABLE* table) /*!< in/out: MySQL table, for
1518
reporting erroneous key value
1521
ulint blksz; /*!< block size */
1523
for (blksz = 1; blksz < file->offset; blksz *= 2) {
1527
ut_ad(ut_is_2pow(blksz));
1528
half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
1529
error = row_merge(index, file, half, block, tmpfd, table);
1531
if (error != DB_SUCCESS) {
1539
/*************************************************************//**
1540
Copy externally stored columns to the data tuple. */
1543
row_merge_copy_blobs(
1544
/*=================*/
1545
const mrec_t* mrec, /*!< in: merge record */
1546
const ulint* offsets,/*!< in: offsets of mrec */
1547
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
1548
dtuple_t* tuple, /*!< in/out: data tuple */
1549
mem_heap_t* heap) /*!< in/out: memory heap */
1552
ulint n_fields = dtuple_get_n_fields(tuple);
1554
for (i = 0; i < n_fields; i++) {
1557
dfield_t* field = dtuple_get_nth_field(tuple, i);
1559
if (!dfield_is_ext(field)) {
1563
ut_ad(!dfield_is_null(field));
1565
/* The table is locked during index creation.
1566
Therefore, externally stored columns cannot possibly
1567
be freed between the time the BLOB pointers are read
1568
(row_merge_read_clustered_index()) and dereferenced
1570
data = btr_rec_copy_externally_stored_field(
1571
mrec, offsets, zip_size, i, &len, heap);
1573
dfield_set_data(field, data, len);
1577
/********************************************************************//**
1578
Read sorted file containing index data tuples and insert these data
1580
@return DB_SUCCESS or error number */
1583
row_merge_insert_index_tuples(
1584
/*==========================*/
1585
trx_t* trx, /*!< in: transaction */
1586
dict_index_t* index, /*!< in: index */
1587
dict_table_t* table, /*!< in: new table */
1588
ulint zip_size,/*!< in: compressed page size of
1589
the old table, or 0 if uncompressed */
1590
int fd, /*!< in: file descriptor */
1591
row_merge_block_t* block) /*!< in/out: file buffer */
1597
mem_heap_t* tuple_heap;
1598
mem_heap_t* graph_heap;
1599
ulint error = DB_SUCCESS;
1607
/* We use the insert query graph as the dummy graph
1608
needed in the row module call */
1610
trx->op_info = "inserting index entries";
1612
graph_heap = mem_heap_create(500);
1613
node = ins_node_create(INS_DIRECT, table, graph_heap);
1615
thr = pars_complete_graph_for_exec(node, trx, graph_heap);
1617
que_thr_move_to_run_state_for_mysql(thr, trx);
1619
tuple_heap = mem_heap_create(1000);
1622
ulint i = 1 + REC_OFFS_HEADER_SIZE
1623
+ dict_index_get_n_fields(index);
1624
offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
1626
offsets[1] = dict_index_get_n_fields(index);
1631
if (!row_merge_read(fd, foffs, block)) {
1632
error = DB_CORRUPTION;
1639
b = row_merge_read_rec(block, &buf, b, index,
1640
fd, &foffs, &mrec, offsets);
1641
if (UNIV_UNLIKELY(!b)) {
1642
/* End of list, or I/O error */
1644
error = DB_CORRUPTION;
1649
dtuple = row_rec_to_index_entry_low(
1650
mrec, index, offsets, &n_ext, tuple_heap);
1652
if (UNIV_UNLIKELY(n_ext)) {
1653
row_merge_copy_blobs(mrec, offsets, zip_size,
1654
dtuple, tuple_heap);
1658
node->table = table;
1659
node->trx_id = trx->id;
1661
ut_ad(dtuple_validate(dtuple));
1664
thr->run_node = thr;
1665
thr->prev_node = thr->common.parent;
1667
error = row_ins_index_entry(index, dtuple,
1670
if (UNIV_LIKELY(error == DB_SUCCESS)) {
1675
thr->lock_state = QUE_THR_LOCK_ROW;
1676
trx->error_state = error;
1677
que_thr_stop_for_mysql(thr);
1678
thr->lock_state = QUE_THR_LOCK_NOLOCK;
1679
} while (row_mysql_handle_errors(&error, trx,
1684
mem_heap_empty(tuple_heap);
1688
que_thr_stop_for_mysql_no_error(thr, trx);
1690
que_graph_free(thr->graph);
1694
mem_heap_free(tuple_heap);
1699
/*********************************************************************//**
1700
Sets an exclusive lock on a table, for the duration of creating indexes.
1701
@return error code or DB_SUCCESS */
1704
row_merge_lock_table(
1705
/*=================*/
1706
trx_t* trx, /*!< in/out: transaction */
1707
dict_table_t* table, /*!< in: table to lock */
1708
enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
1716
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
1717
ut_ad(mode == LOCK_X || mode == LOCK_S);
1719
heap = mem_heap_create(512);
1721
trx->op_info = "setting table lock for creating or dropping index";
1723
node = sel_node_create(heap);
1724
thr = pars_complete_graph_for_exec(node, trx, heap);
1725
thr->graph->state = QUE_FORK_ACTIVE;
1727
/* We use the select query graph as the dummy graph needed
1728
in the lock module call */
1730
thr = que_fork_get_first_thr(que_node_get_parent(thr));
1731
que_thr_move_to_run_state_for_mysql(thr, trx);
1734
thr->run_node = thr;
1735
thr->prev_node = thr->common.parent;
1737
err = lock_table(0, table, mode, thr);
1739
trx->error_state = err;
1741
if (UNIV_LIKELY(err == DB_SUCCESS)) {
1742
que_thr_stop_for_mysql_no_error(thr, trx);
1744
que_thr_stop_for_mysql(thr);
1746
if (err != DB_QUE_THR_SUSPENDED) {
1747
ibool was_lock_wait;
1749
was_lock_wait = row_mysql_handle_errors(
1750
&err, trx, thr, NULL);
1752
if (was_lock_wait) {
1759
parent = que_node_get_parent(thr);
1760
run_thr = que_fork_start_command(parent);
1762
ut_a(run_thr == thr);
1764
/* There was a lock wait but the thread was not
1765
in a ready to run or running state. */
1766
trx->error_state = DB_LOCK_WAIT;
1772
que_graph_free(thr->graph);
1778
/*********************************************************************//**
1779
Drop an index from the InnoDB system tables. The data dictionary must
1780
have been locked exclusively by the caller, because the transaction
1781
will not be committed. */
1784
row_merge_drop_index(
1785
/*=================*/
1786
dict_index_t* index, /*!< in: index to be removed */
1787
dict_table_t* table, /*!< in: table */
1788
trx_t* trx) /*!< in: transaction handle */
1791
pars_info_t* info = pars_info_create();
1793
/* We use the private SQL parser of Innobase to generate the
1794
query graphs needed in deleting the dictionary data from system
1795
tables in Innobase. Deleting a row from SYS_INDEXES table also
1796
frees the file segments of the B-tree associated with the index. */
1798
static const char str1[] =
1799
"PROCEDURE DROP_INDEX_PROC () IS\n"
1801
"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
1802
"DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
1803
" AND TABLE_ID = :tableid;\n"
1806
ut_ad(index && table && trx);
1808
pars_info_add_dulint_literal(info, "indexid", index->id);
1809
pars_info_add_dulint_literal(info, "tableid", table->id);
1811
trx_start_if_not_started(trx);
1812
trx->op_info = "dropping index";
1814
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
1816
err = que_eval_sql(info, str1, FALSE, trx);
1818
ut_a(err == DB_SUCCESS);
1820
/* Replace this index with another equivalent index for all
1821
foreign key constraints on this table where this index is used */
1823
dict_table_replace_index_in_foreign_list(table, index);
1824
dict_index_remove_from_cache(table, index);
1829
/*********************************************************************//**
1830
Drop those indexes which were created before an error occurred when
1831
building an index. The data dictionary must have been locked
1832
exclusively by the caller, because the transaction will not be
1836
row_merge_drop_indexes(
1837
/*===================*/
1838
trx_t* trx, /*!< in: transaction */
1839
dict_table_t* table, /*!< in: table containing the indexes */
1840
dict_index_t** index, /*!< in: indexes to drop */
1841
ulint num_created) /*!< in: number of elements in index[] */
1845
for (key_num = 0; key_num < num_created; key_num++) {
1846
row_merge_drop_index(index[key_num], table, trx);
1850
/*********************************************************************//**
1851
Drop all partially created indexes during crash recovery. */
1854
row_merge_drop_temp_indexes(void)
1855
/*=============================*/
1860
/* We use the private SQL parser of Innobase to generate the
1861
query graphs needed in deleting the dictionary data from system
1862
tables in Innobase. Deleting a row from SYS_INDEXES table also
1863
frees the file segments of the B-tree associated with the index. */
1864
static const char drop_temp_indexes[] =
1865
"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
1867
"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
1868
"WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
1871
"\tWHILE 1=1 LOOP\n"
1872
"\t\tFETCH c INTO indexid;\n"
1873
"\t\tIF (SQL % NOTFOUND) THEN\n"
1876
"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
1877
"\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
1883
trx = trx_allocate_for_background();
1884
trx->op_info = "dropping partially created indexes";
1885
row_mysql_lock_data_dictionary(trx);
1887
/* Incomplete transactions may be holding some locks on the
1888
data dictionary tables. However, they should never have been
1889
able to lock the records corresponding to the partially
1890
created indexes that we are attempting to delete, because the
1891
table was locked when the indexes were being created. We will
1892
drop the partially created indexes before the rollback of
1893
incomplete transactions is initiated. Thus, this should not
1894
interfere with the incomplete transactions. */
1895
trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
1896
pars_info_t *info = pars_info_create();
1897
err = que_eval_sql(info, drop_temp_indexes, FALSE, trx);
1898
ut_a(err == DB_SUCCESS);
1900
row_mysql_unlock_data_dictionary(trx);
1901
trx_free_for_background(trx);
1904
/*********************************************************************//**
1905
Create a merge file. */
1908
row_merge_file_create(
1909
/*==================*/
1910
merge_file_t* merge_file) /*!< out: merge file structure */
1912
merge_file->fd = innobase_mysql_tmpfile();
1913
merge_file->offset = 0;
1916
/*********************************************************************//**
1917
Destroy a merge file. */
1920
row_merge_file_destroy(
1921
/*===================*/
1922
merge_file_t* merge_file) /*!< out: merge file structure */
1924
if (merge_file->fd != -1) {
1925
close(merge_file->fd);
1926
merge_file->fd = -1;
1930
/*********************************************************************//**
1931
Determine the precise type of a column that is added to a tem
1932
if a column must be constrained NOT NULL.
1933
@return col->prtype, possibly ORed with DATA_NOT_NULL */
1936
row_merge_col_prtype(
1937
/*=================*/
1938
const dict_col_t* col, /*!< in: column */
1939
const char* col_name, /*!< in: name of the column */
1940
const merge_index_def_t*index_def) /*!< in: the index definition
1941
of the primary key */
1943
ulint prtype = col->prtype;
1946
ut_ad(index_def->ind_type & DICT_CLUSTERED);
1948
if (prtype & DATA_NOT_NULL) {
1953
/* All columns that are included
1954
in the PRIMARY KEY must be NOT NULL. */
1956
for (i = 0; i < index_def->n_fields; i++) {
1957
if (!strcmp(col_name, index_def->fields[i].field_name)) {
1958
return(prtype | DATA_NOT_NULL);
1965
/*********************************************************************//**
1966
Create a temporary table for creating a primary key, using the definition
1967
of an existing table.
1968
@return table, or NULL on error */
1971
row_merge_create_temporary_table(
1972
/*=============================*/
1973
const char* table_name, /*!< in: new table name */
1974
const merge_index_def_t*index_def, /*!< in: the index definition
1975
of the primary key */
1976
const dict_table_t* table, /*!< in: old table definition */
1977
trx_t* trx) /*!< in/out: transaction
1978
(sets error_state) */
1981
dict_table_t* new_table = NULL;
1982
ulint n_cols = dict_table_get_n_user_cols(table);
1984
mem_heap_t* heap = mem_heap_create(1000);
1989
ut_ad(mutex_own(&dict_sys->mutex));
1991
new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
1993
for (i = 0; i < n_cols; i++) {
1994
const dict_col_t* col;
1995
const char* col_name;
1997
col = dict_table_get_nth_col(table, i);
1998
col_name = dict_table_get_col_name(table, i);
2000
dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
2001
row_merge_col_prtype(col, col_name,
2006
error = row_create_table_for_mysql(new_table, trx);
2007
mem_heap_free(heap);
2009
if (error != DB_SUCCESS) {
2010
trx->error_state = error;
2017
/*********************************************************************//**
2018
Rename the temporary indexes in the dictionary to permanent ones. The
2019
data dictionary must have been locked exclusively by the caller,
2020
because the transaction will not be committed.
2021
@return DB_SUCCESS if all OK */
2024
row_merge_rename_indexes(
2025
/*=====================*/
2026
trx_t* trx, /*!< in/out: transaction */
2027
dict_table_t* table) /*!< in/out: table with new indexes */
2029
ulint err = DB_SUCCESS;
2030
pars_info_t* info = pars_info_create();
2032
/* We use the private SQL parser of Innobase to generate the
2033
query graphs needed in renaming indexes. */
2035
static const char rename_indexes[] =
2036
"PROCEDURE RENAME_INDEXES_PROC () IS\n"
2038
"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
2039
"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='"
2040
TEMP_INDEX_PREFIX_STR "';\n"
2045
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2047
trx->op_info = "renaming indexes";
2049
pars_info_add_dulint_literal(info, "tableid", table->id);
2051
err = que_eval_sql(info, rename_indexes, FALSE, trx);
2053
if (err == DB_SUCCESS) {
2054
dict_index_t* index = dict_table_get_first_index(table);
2056
if (*index->name == TEMP_INDEX_PREFIX) {
2059
index = dict_table_get_next_index(index);
2068
/*********************************************************************//**
2069
Rename the tables in the data dictionary. The data dictionary must
2070
have been locked exclusively by the caller, because the transaction
2071
will not be committed.
2072
@return error code or DB_SUCCESS */
2075
row_merge_rename_tables(
2076
/*====================*/
2077
dict_table_t* old_table, /*!< in/out: old table, renamed to
2079
dict_table_t* new_table, /*!< in/out: new table, renamed to
2081
const char* tmp_name, /*!< in: new name for old_table */
2082
trx_t* trx) /*!< in: transaction handle */
2084
ulint err = DB_ERROR;
2086
const char* old_name= old_table->name;
2088
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
2089
ut_ad(old_table != new_table);
2090
ut_ad(mutex_own(&dict_sys->mutex));
2092
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2094
trx->op_info = "renaming tables";
2096
/* We use the private SQL parser of Innobase to generate the query
2097
graphs needed in updating the dictionary data in system tables. */
2099
info = pars_info_create();
2101
pars_info_add_str_literal(info, "new_name", new_table->name);
2102
pars_info_add_str_literal(info, "old_name", old_name);
2103
pars_info_add_str_literal(info, "tmp_name", tmp_name);
2105
err = que_eval_sql(info,
2106
"PROCEDURE RENAME_TABLES () IS\n"
2108
"UPDATE SYS_TABLES SET NAME = :tmp_name\n"
2109
" WHERE NAME = :old_name;\n"
2110
"UPDATE SYS_TABLES SET NAME = :old_name\n"
2111
" WHERE NAME = :new_name;\n"
2112
"END;\n", FALSE, trx);
2114
if (err != DB_SUCCESS) {
2119
/* The following calls will also rename the .ibd data files if
2120
the tables are stored in a single-table tablespace */
2122
if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
2123
|| !dict_table_rename_in_cache(new_table, old_name, FALSE)) {
2129
err = dict_load_foreigns(old_name, TRUE);
2131
if (err != DB_SUCCESS) {
2133
trx->error_state = DB_SUCCESS;
2134
trx_general_rollback_for_mysql(trx, FALSE, NULL);
2135
trx->error_state = DB_SUCCESS;
2143
/*********************************************************************//**
2144
Create and execute a query graph for creating an index.
2145
@return DB_SUCCESS or error code */
2148
row_merge_create_index_graph(
2149
/*=========================*/
2150
trx_t* trx, /*!< in: trx */
2151
dict_table_t* table, /*!< in: table */
2152
dict_index_t* index) /*!< in: index */
2154
ind_node_t* node; /*!< Index creation node */
2155
mem_heap_t* heap; /*!< Memory heap */
2156
que_thr_t* thr; /*!< Query thread */
2163
heap = mem_heap_create(512);
2165
index->table = table;
2166
node = ind_create_graph_create(index, heap);
2167
thr = pars_complete_graph_for_exec(node, trx, heap);
2169
ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
2171
que_run_threads(thr);
2173
err = trx->error_state;
2175
que_graph_free((que_t*) que_node_get_parent(thr));
2180
/*********************************************************************//**
2181
Create the index and load in to the dictionary.
2182
@return index, or NULL on error */
2185
row_merge_create_index(
2186
/*===================*/
2187
trx_t* trx, /*!< in/out: trx (sets error_state) */
2188
dict_table_t* table, /*!< in: the index is on this table */
2189
const merge_index_def_t*index_def)
2190
/*!< in: the index definition */
2192
dict_index_t* index;
2194
ulint n_fields = index_def->n_fields;
2197
/* Create the index prototype, using the passed in def, this is not
2198
a persistent operation. We pass 0 as the space id, and determine at
2199
a lower level the space id where to store the table. */
2201
index = dict_mem_index_create(table->name, index_def->name,
2202
0, index_def->ind_type, n_fields);
2206
for (i = 0; i < n_fields; i++) {
2207
merge_index_field_t* ifield = &index_def->fields[i];
2209
dict_mem_index_add_field(index, ifield->field_name,
2210
ifield->prefix_len);
2213
/* Add the index to SYS_INDEXES, using the index prototype. */
2214
err = row_merge_create_index_graph(trx, table, index);
2216
if (err == DB_SUCCESS) {
2218
index = row_merge_dict_table_get_index(
2223
/* Note the id of the transaction that created this
2224
index, we use it to restrict readers from accessing
2225
this index, to ensure read consistency. */
2226
index->trx_id = (ib_uint64_t)
2227
ut_conv_dulint_to_longlong(trx->id);
2235
/*********************************************************************//**
2236
Check if a transaction can use an index. */
2239
row_merge_is_index_usable(
2240
/*======================*/
2241
const trx_t* trx, /*!< in: transaction */
2242
const dict_index_t* index) /*!< in: index to check */
2244
return(!trx->read_view || read_view_sees_trx_id(
2246
ut_dulint_create((ulint) (index->trx_id >> 32),
2247
(ulint) index->trx_id & 0xFFFFFFFF)));
2250
/*********************************************************************//**
2252
@return DB_SUCCESS or error code */
2255
row_merge_drop_table(
2256
/*=================*/
2257
trx_t* trx, /*!< in: transaction */
2258
dict_table_t* table) /*!< in: table to drop */
2260
/* There must be no open transactions on the table. */
2261
ut_a(table->n_mysql_handles_opened == 0);
2263
return(row_drop_table_for_mysql(table->name, trx, FALSE));
2266
/*********************************************************************//**
2267
Build indexes on a table by reading a clustered index,
2268
creating a temporary file containing index entries, merge sorting
2269
these index entries and inserting sorted index entries to indexes.
2270
@return DB_SUCCESS or error code */
2273
row_merge_build_indexes(
2274
/*====================*/
2275
trx_t* trx, /*!< in: transaction */
2276
dict_table_t* old_table, /*!< in: table where rows are
2278
dict_table_t* new_table, /*!< in: table where indexes are
2279
created; identical to old_table
2280
unless creating a PRIMARY KEY */
2281
dict_index_t** indexes, /*!< in: indexes to be created */
2282
ulint n_indexes, /*!< in: size of indexes[] */
2283
TABLE* table) /*!< in/out: MySQL table, for
2284
reporting erroneous key value
2287
merge_file_t* merge_files;
2288
row_merge_block_t* block;
2300
trx_start_if_not_started(trx);
2302
/* Allocate memory for merge file data structure and initialize
2305
merge_files = mem_alloc(n_indexes * sizeof *merge_files);
2306
block_size = 3 * sizeof *block;
2307
block = os_mem_alloc_large(&block_size);
2309
for (i = 0; i < n_indexes; i++) {
2311
row_merge_file_create(&merge_files[i]);
2314
tmpfd = innobase_mysql_tmpfile();
2316
/* Reset the MySQL row buffer that is used when reporting
2318
innobase_rec_reset(table);
2320
/* Read clustered index of the table and create files for
2321
secondary index entries for merge sort */
2323
error = row_merge_read_clustered_index(
2324
trx, table, old_table, new_table, indexes,
2325
merge_files, n_indexes, block);
2327
if (error != DB_SUCCESS) {
2332
/* Now we have files containing index entries ready for
2333
sorting and inserting. */
2335
for (i = 0; i < n_indexes; i++) {
2336
error = row_merge_sort(indexes[i], &merge_files[i],
2337
block, &tmpfd, table);
2339
if (error == DB_SUCCESS) {
2340
error = row_merge_insert_index_tuples(
2341
trx, indexes[i], new_table,
2342
dict_table_zip_size(old_table),
2343
merge_files[i].fd, block);
2346
/* Close the temporary file to free up space. */
2347
row_merge_file_destroy(&merge_files[i]);
2349
if (error != DB_SUCCESS) {
2350
trx->error_key_num = i;
2358
for (i = 0; i < n_indexes; i++) {
2359
row_merge_file_destroy(&merge_files[i]);
2362
mem_free(merge_files);
2363
os_mem_free_large(block, block_size);