2
* See the file LICENSE for redistribution information.
4
* Copyright (c) 1998-2001
5
* Sleepycat Software. All rights reserved.
11
static const char revid[] = "$Id: db_join.c,v 11.45 2001/07/02 01:05:37 bostic Exp $";
14
#ifndef NO_SYSTEM_INCLUDES
15
#include <sys/types.h>
27
static int __db_join_close __P((DBC *));
28
static int __db_join_cmp __P((const void *, const void *));
29
static int __db_join_del __P((DBC *, u_int32_t));
30
static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
31
static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
32
static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
35
* Check to see if the Nth secondary cursor of join cursor jc is pointing
36
* to a sorted duplicate set.
38
#define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
41
* This is the duplicate-assisted join functionality. Right now we're
42
* going to write it such that we return one item at a time, although
43
* I think we may need to optimize it to return them all at once.
44
* It should be easier to get it working this way, and I believe that
45
* changing it should be fairly straightforward.
47
* We optimize the join by sorting cursors from smallest to largest
48
* cardinality. In most cases, this is indeed optimal. However, if
49
* a cursor with large cardinality has very few data in common with the
50
* first cursor, it is possible that the join will be made faster by
51
* putting it earlier in the cursor list. Since we have no way to detect
52
* cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains
53
* the sort order specified by the caller, who may know more about the
54
* structure of the data.
56
* The first cursor moves sequentially through the duplicate set while
57
* the others search explicitly for the duplicate in question.
63
* This is the interface to the duplicate-assisted join functionality.
64
* In the same way that cursors mark a position in a database, a cursor
65
* can mark a position in a join. While most cursors are created by the
66
* cursor method of a DB, join cursors are created through an explicit
69
* The curslist is an array of existing, intialized cursors and primary
70
* is the DB of the primary file. The data item that joins all the
71
* cursors in the curslist is used as the key into the primary and that
72
* key and data are returned. When no more items are left in the join
73
* set, the c_next operation off the join cursor will return DB_NOTFOUND.
75
* PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
78
__db_join(primary, curslist, dbcp, flags)
80
DBC **curslist, **dbcp;
87
u_int32_t i, ncurs, nslots;
91
PANIC_CHECK(primary->dbenv);
93
if ((ret = __db_joinchk(primary, curslist, flags)) != 0)
98
dbenv = primary->dbenv;
100
if ((ret = __os_calloc(dbenv, 1, sizeof(DBC), &dbc)) != 0)
103
if ((ret = __os_calloc(dbenv,
104
1, sizeof(JOIN_CURSOR), &jc)) != 0)
107
if ((ret = __os_malloc(dbenv, 256, &jc->j_key.data)) != 0)
109
jc->j_key.ulen = 256;
110
F_SET(&jc->j_key, DB_DBT_USERMEM);
112
F_SET(&jc->j_rdata, DB_DBT_REALLOC);
114
for (jc->j_curslist = curslist;
115
*jc->j_curslist != NULL; jc->j_curslist++)
119
* The number of cursor slots we allocate is one greater than
120
* the number of cursors involved in the join, because the
121
* list is NULL-terminated.
123
ncurs = jc->j_curslist - curslist;
127
* !!! -- A note on the various lists hanging off jc.
129
* j_curslist is the initial NULL-terminated list of cursors passed
130
* into __db_join. The original cursors are not modified; pristine
131
* copies are required because, in databases with unsorted dups, we
132
* must reset all of the secondary cursors after the first each
133
* time the first one is incremented, or else we will lose data
134
* which happen to be sorted differently in two different cursors.
136
* j_workcurs is where we put those copies that we're planning to
137
* work with. They're lazily c_dup'ed from j_curslist as we need
138
* them, and closed when the join cursor is closed or when we need
139
* to reset them to their original values (in which case we just
142
* j_fdupcurs is an array of cursors which point to the first
143
* duplicate in the duplicate set that contains the data value
144
* we're currently interested in. We need this to make
145
* __db_join_get correctly return duplicate duplicates; i.e., if a
146
* given data value occurs twice in the set belonging to cursor #2,
147
* and thrice in the set belonging to cursor #3, and once in all
148
* the other cursors, successive calls to __db_join_get need to
149
* return that data item six times. To make this happen, each time
150
* cursor N is allowed to advance to a new datum, all cursors M
151
* such that M > N have to be reset to the first duplicate with
152
* that datum, so __db_join_get will return all the dup-dups again.
153
* We could just reset them to the original cursor from j_curslist,
154
* but that would be a bit slower in the unsorted case and a LOT
155
* slower in the sorted one.
157
* j_exhausted is a list of boolean values which represent
158
* whether or not their corresponding cursors are "exhausted",
159
* i.e. whether the datum under the corresponding cursor has
160
* been found not to exist in any unreturned combinations of
161
* later secondary cursors, in which case they are ready to be
165
/* We don't want to free regions whose callocs have failed. */
166
jc->j_curslist = NULL;
167
jc->j_workcurs = NULL;
168
jc->j_fdupcurs = NULL;
169
jc->j_exhausted = NULL;
171
if ((ret = __os_calloc(dbenv, nslots, sizeof(DBC *),
172
&jc->j_curslist)) != 0)
174
if ((ret = __os_calloc(dbenv, nslots, sizeof(DBC *),
175
&jc->j_workcurs)) != 0)
177
if ((ret = __os_calloc(dbenv, nslots, sizeof(DBC *),
178
&jc->j_fdupcurs)) != 0)
180
if ((ret = __os_calloc(dbenv, nslots, sizeof(u_int8_t),
181
&jc->j_exhausted)) != 0)
183
for (i = 0; curslist[i] != NULL; i++) {
184
jc->j_curslist[i] = curslist[i];
185
jc->j_workcurs[i] = NULL;
186
jc->j_fdupcurs[i] = NULL;
187
jc->j_exhausted[i] = 0;
192
* If DB_JOIN_NOSORT is not set, optimize secondary cursors by
193
* sorting in order of increasing cardinality.
195
if (!LF_ISSET(DB_JOIN_NOSORT))
196
qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
199
* We never need to reset the 0th cursor, so there's no
200
* solid reason to use workcurs[0] rather than curslist[0] in
201
* join_get. Nonetheless, it feels cleaner to do it for symmetry,
202
* and this is the most logical place to copy it.
205
* There's no need to close the new cursor if we goto err only
206
* because this is the last thing that can fail. Modifier of this
209
if ((ret = jc->j_curslist[0]->c_dup(jc->j_curslist[0], jc->j_workcurs,
213
dbc->c_close = __db_join_close;
214
dbc->c_del = __db_join_del;
215
dbc->c_get = __db_join_get;
216
dbc->c_put = __db_join_put;
217
dbc->internal = (DBC_INTERNAL *) jc;
219
jc->j_primary = primary;
223
MUTEX_THREAD_LOCK(dbenv, primary->mutexp);
224
TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
225
MUTEX_THREAD_UNLOCK(dbenv, primary->mutexp);
229
err: if (jc != NULL) {
230
if (jc->j_curslist != NULL)
232
jc->j_curslist, nslots * sizeof(DBC *));
233
if (jc->j_workcurs != NULL) {
234
if (jc->j_workcurs[0] != NULL)
236
jc->j_workcurs[0], sizeof(DBC));
238
jc->j_workcurs, nslots * sizeof(DBC *));
240
if (jc->j_fdupcurs != NULL)
242
jc->j_fdupcurs, nslots * sizeof(DBC *));
243
if (jc->j_exhausted != NULL)
245
jc->j_exhausted, nslots * sizeof(u_int8_t));
246
__os_free(dbenv, jc, sizeof(JOIN_CURSOR));
249
__os_free(dbenv, dbc, sizeof(DBC));
254
__db_join_put(dbc, key, data, flags)
260
PANIC_CHECK(dbc->dbp->dbenv);
262
COMPQUIET(key, NULL);
263
COMPQUIET(data, NULL);
269
__db_join_del(dbc, flags)
273
PANIC_CHECK(dbc->dbp->dbenv);
280
__db_join_get(dbc, key_arg, data_arg, flags)
282
DBT *key_arg, *data_arg;
285
DBT *key_n, key_n_mem;
289
int db_manage_data, ret;
290
u_int32_t i, j, operation, opmods;
293
jc = (JOIN_CURSOR *)dbc->internal;
295
PANIC_CHECK(dbp->dbenv);
297
operation = LF_ISSET(DB_OPFLAGS_MASK);
298
opmods = LF_ISSET(DB_RMW | DB_DIRTY_READ);
300
if ((ret = __db_joingetchk(dbp, key_arg, flags)) != 0)
304
* Since we are fetching the key as a datum in the secondary indices,
305
* we must be careful of caller-specified DB_DBT_* memory
306
* management flags. If necessary, use a stack-allocated DBT;
307
* we'll appropriately copy and/or allocate the data later.
309
if (F_ISSET(key_arg, DB_DBT_USERMEM) ||
310
F_ISSET(key_arg, DB_DBT_MALLOC)) {
311
/* We just use the default buffer; no need to go malloc. */
313
memset(key_n, 0, sizeof(DBT));
316
* Either DB_DBT_REALLOC or the default buffer will work
317
* fine if we have to reuse it, as we do.
323
* If our last attempt to do a get on the primary key failed,
324
* short-circuit the join and try again with the same key.
326
if (F_ISSET(jc, JOIN_RETRY))
328
F_CLR(jc, JOIN_RETRY);
330
retry: ret = jc->j_workcurs[0]->c_real_get(jc->j_workcurs[0],
332
opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
335
jc->j_key.ulen <<= 1;
336
if ((ret = __os_realloc(dbp->dbenv,
337
jc->j_key.ulen, &jc->j_key.data)) != 0)
343
* If ret == DB_NOTFOUND, we're out of elements of the first
344
* secondary cursor. This is how we finally finish the join
351
* If jc->j_exhausted[0] == 1, we've just advanced the first cursor,
352
* and we're going to want to advance all the cursors that point to
353
* the first member of a duplicate duplicate set (j_fdupcurs[1..N]).
354
* Close all the cursors in j_fdupcurs; we'll reopen them the
355
* first time through the upcoming loop.
357
for (i = 1; i < jc->j_ncurs; i++) {
358
if (jc->j_fdupcurs[i] != NULL &&
359
(ret = jc->j_fdupcurs[i]->c_close(jc->j_fdupcurs[i])) != 0)
361
jc->j_fdupcurs[i] = NULL;
365
* If jc->j_curslist[1] == NULL, we have only one cursor in the join.
366
* Thus, we can safely increment that one cursor on each call
367
* to __db_join_get, and we signal this by setting jc->j_exhausted[0]
370
* Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
371
* increment it until we know we're ready to.
373
if (jc->j_curslist[1] == NULL)
374
jc->j_exhausted[0] = 1;
376
jc->j_exhausted[0] = 0;
378
/* We have the first element; now look for it in the other cursors. */
379
for (i = 1; i < jc->j_ncurs; i++) {
380
DB_ASSERT(jc->j_curslist[i] != NULL);
381
if (jc->j_workcurs[i] == NULL)
382
/* If this is NULL, we need to dup curslist into it. */
383
if ((ret = jc->j_curslist[i]->c_dup(
384
jc->j_curslist[i], jc->j_workcurs + i,
388
retry2: cp = jc->j_workcurs[i];
390
if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
391
jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
393
* jc->j_workcurs[i] has no more of the datum we're
394
* interested in. Go back one cursor and get
395
* a new dup. We can't just move to a new
396
* element of the outer relation, because that way
397
* we might miss duplicate duplicates in cursor i-1.
399
* If this takes us back to the first cursor,
400
* -then- we can move to a new element of the outer
404
jc->j_exhausted[i] = 1;
407
for (j = 1; jc->j_workcurs[j] != NULL; j++) {
409
* We're moving to a new element of
410
* the first secondary cursor. If
411
* that cursor is sorted, then any
412
* other sorted cursors can be safely
413
* reset to the first duplicate
414
* duplicate in the current set if we
415
* have a pointer to it (we can't just
416
* leave them be, or we'll miss
417
* duplicate duplicates in the outer
420
* If the first cursor is unsorted, or
421
* if cursor j is unsorted, we can
422
* make no assumptions about what
423
* we're looking for next or where it
424
* will be, so we reset to the very
425
* beginning (setting workcurs NULL
426
* will achieve this next go-round).
428
* XXX: This is likely to break
429
* horribly if any two cursors are
430
* both sorted, but have different
431
* specified sort functions. For,
432
* now, we dismiss this as pathology
433
* and let strange things happen--we
434
* can't make rope childproof.
436
if ((ret = jc->j_workcurs[j]->c_close(
437
jc->j_workcurs[j])) != 0)
439
if (!SORTED_SET(jc, 0) ||
440
!SORTED_SET(jc, j) ||
441
jc->j_fdupcurs[j] == NULL)
446
jc->j_workcurs[j] = NULL;
448
/* Partial reset suffices. */
449
if ((jc->j_fdupcurs[j]->c_dup(
454
jc->j_exhausted[j] = 0;
461
* We're about to advance the cursor and need to
462
* reset all of the workcurs[j] where j>i, so that
463
* we don't miss any duplicate duplicates.
466
jc->j_workcurs[j] != NULL;
468
if ((ret = jc->j_workcurs[j]->c_close(
469
jc->j_workcurs[j])) != 0)
471
jc->j_exhausted[j] = 0;
472
if (jc->j_fdupcurs[j] != NULL &&
473
(ret = jc->j_fdupcurs[j]->c_dup(
474
jc->j_fdupcurs[j], &jc->j_workcurs[j],
478
jc->j_workcurs[j] = NULL;
485
jc->j_key.ulen <<= 1;
486
if ((ret = __os_realloc(dbp->dbenv, jc->j_key.ulen,
487
&jc->j_key.data)) != 0) {
488
mem_err: __db_err(dbp->dbenv,
489
"Allocation failed for join key, len = %lu",
490
(u_long)jc->j_key.ulen);
500
* If we made it this far, we've found a matching
501
* datum in cursor i. Mark the current cursor
502
* unexhausted, so we don't miss any duplicate
503
* duplicates the next go-round--unless this is the
504
* very last cursor, in which case there are none to
505
* miss, and we'll need that exhausted flag to finally
506
* get a DB_NOTFOUND and move on to the next datum in
507
* the outermost cursor.
509
if (i + 1 != jc->j_ncurs)
510
jc->j_exhausted[i] = 0;
512
jc->j_exhausted[i] = 1;
515
* If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are
516
* sorted, then we're here for the first time since advancing
517
* cursor 0, and we have a new datum of interest.
518
* jc->j_workcurs[i] points to the beginning of a set of
519
* duplicate duplicates; store this into jc->j_fdupcurs[i].
521
if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
522
cp->c_dup(cp, &jc->j_fdupcurs[i], DB_POSITIONI)) != 0)
532
* Get the key we tried and failed to return last time;
533
* it should be the current datum of all the secondary cursors.
535
if ((ret = jc->j_workcurs[0]->c_real_get(jc->j_workcurs[0],
536
&jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
538
F_CLR(jc, JOIN_RETRY);
542
* ret == 0; we have a key to return.
544
* If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to
545
* copy it back into the dbt we were given for the key;
548
* Otherwise, assert that we do not in fact need to copy anything
549
* and simply proceed.
551
if (F_ISSET(key_arg, DB_DBT_USERMEM) ||
552
F_ISSET(key_arg, DB_DBT_MALLOC)) {
554
* We need to copy the key back into our original
557
if ((ret = __db_retcopy(dbp,
558
key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
560
* The retcopy failed, most commonly because we
561
* have a user buffer for the key which is too small.
562
* Set things up to retry next time, and return.
564
F_SET(jc, JOIN_RETRY);
568
DB_ASSERT(key_n == key_arg);
572
* set, we return it; otherwise we do the lookup in the
573
* primary and then return.
575
* Note that we use key_arg here; it is safe (and appropriate)
578
if (operation == DB_JOIN_ITEM)
582
* If data_arg->flags == 0--that is, if DB is managing the
583
* data DBT's memory--it's not safe to just pass the DBT
584
* through to the primary get call, since we don't want that
585
* memory to belong to the primary DB handle (and if the primary
586
* is free-threaded, it can't anyway).
588
* Instead, use memory that is managed by the join cursor, in
591
if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM))
595
if ((ret = jc->j_primary->get(jc->j_primary,
596
jc->j_curslist[0]->txn, key_arg,
597
db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
598
if (ret == DB_NOTFOUND)
600
* If ret == DB_NOTFOUND, the primary and secondary
601
* are out of sync; every item in each secondary
602
* should correspond to something in the primary,
603
* or we shouldn't have done the join this way.
606
ret = __db_secondary_corrupt(jc->j_primary);
609
* The get on the primary failed for some other
610
* reason, most commonly because we're using a user
611
* buffer that's not big enough. Flag our failure
612
* so we can return the same key next time.
614
F_SET(jc, JOIN_RETRY);
616
if (db_manage_data && ret == 0) {
617
data_arg->data = jc->j_rdata.data;
618
data_arg->size = jc->j_rdata.size;
634
jc = (JOIN_CURSOR *)dbc->internal;
640
* Remove from active list of join cursors. Note that this
641
* must happen before any action that can fail and return, or else
642
* __db_close may loop indefinitely.
644
MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
645
TAILQ_REMOVE(&dbp->join_queue, dbc, links);
646
MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
651
* Close any open scratch cursors. In each case, there may
652
* not be as many outstanding as there are cursors in
653
* curslist, but we want to close whatever's there.
655
* If any close fails, there's no reason not to close everything else;
656
* we'll just return the error code of the last one to fail. There's
657
* not much the caller can do anyway, since these cursors only exist
658
* hanging off a db-internal data structure that they shouldn't be
661
for (i = 0; i < jc->j_ncurs; i++) {
662
if (jc->j_workcurs[i] != NULL && (t_ret =
663
jc->j_workcurs[i]->c_close(jc->j_workcurs[i])) != 0)
665
if (jc->j_fdupcurs[i] != NULL && (t_ret =
666
jc->j_fdupcurs[i]->c_close(jc->j_fdupcurs[i])) != 0)
670
__os_free(dbenv, jc->j_exhausted, 0);
671
__os_free(dbenv, jc->j_curslist, 0);
672
__os_free(dbenv, jc->j_workcurs, 0);
673
__os_free(dbenv, jc->j_fdupcurs, 0);
674
__os_free(dbenv, jc->j_key.data, jc->j_key.ulen);
675
if (jc->j_rdata.data != NULL)
676
__os_ufree(dbenv, jc->j_rdata.data, 0);
677
__os_free(dbenv, jc, sizeof(JOIN_CURSOR));
678
__os_free(dbenv, dbc, sizeof(DBC));
684
* __db_join_getnext --
685
* This function replaces the DBC_CONTINUE and DBC_KEYSET
686
* functionality inside the various cursor get routines.
688
* If exhausted == 0, we're not done with the current datum;
689
* return it if it matches "matching", otherwise search
690
* using DB_GET_BOTHC (which is faster than iteratively doing
691
* DB_NEXT_DUP) forward until we find one that does.
693
* If exhausted == 1, we are done with the current datum, so just
694
* leap forward to searching NEXT_DUPs.
696
* If no matching datum exists, returns DB_NOTFOUND, else 0.
699
__db_join_getnext(dbc, key, data, exhausted, opmods)
702
u_int32_t exhausted, opmods;
707
int (*func) __P((DB *, const DBT *, const DBT *));
710
func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
715
* We don't want to step on data->data; use a new
716
* DBT and malloc so we don't step on dbc's rdata memory.
718
memset(&ldata, 0, sizeof(DBT));
719
F_SET(&ldata, DB_DBT_MALLOC);
720
if ((ret = dbc->c_real_get(dbc,
721
key, &ldata, opmods | DB_CURRENT)) != 0)
723
cmp = func(dbp, data, &ldata);
726
* We have to return the real data value. Copy
727
* it into data, then free the buffer we malloc'ed
730
if ((ret = __db_retcopy(dbp, data, ldata.data,
731
ldata.size, &data->data, &data->size)) != 0)
733
__os_ufree(dbp->dbenv, ldata.data, 0);
738
* Didn't match--we want to fall through and search future
739
* dups. We just forget about ldata and free
740
* its buffer--data contains the value we're searching for.
742
__os_ufree(dbp->dbenv, ldata.data, 0);
745
ret = dbc->c_real_get(dbc, key, data, opmods | DB_GET_BOTHC);
757
* Comparison function for sorting DBCs in cardinality order.
765
db_recno_t counta, countb;
767
/* In case c_count fails, pretend cursors are equal. */
770
dbca = *((DBC * const *)a);
771
dbcb = *((DBC * const *)b);
773
if (dbca->c_count(dbca, &counta, 0) != 0 ||
774
dbcb->c_count(dbcb, &countb, 0) != 0)
777
return (counta - countb);