1
/*-------------------------------------------------------------------------
4
* Routines to handle hash join nodes
6
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
7
* Portions Copyright (c) 1994, Regents of the University of California
11
* $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.67 2004-12-31 21:59:45 pgsql Exp $
13
*-------------------------------------------------------------------------
18
#include "executor/executor.h"
19
#include "executor/nodeHash.h"
20
#include "executor/nodeHashjoin.h"
21
#include "optimizer/clauses.h"
22
#include "utils/memutils.h"
25
static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *node,
26
HashJoinState *hjstate);
27
static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
29
TupleTableSlot *tupleSlot);
30
static int ExecHashJoinNewBatch(HashJoinState *hjstate);
33
/* ----------------------------------------------------------------
36
* This function implements the Hybrid Hashjoin algorithm.
37
* recursive partitioning remains to be added.
38
* Note: the relation we build hash table on is the inner
39
* the other one is outer.
40
* ----------------------------------------------------------------
42
TupleTableSlot * /* return: a tuple or NULL */
43
ExecHashJoin(HashJoinState *node)
53
TupleTableSlot *inntuple;
54
ExprContext *econtext;
56
HashJoinTable hashtable;
58
TupleTableSlot *outerTupleSlot;
62
* get information from HashJoin node
64
hjclauses = node->hashclauses;
65
estate = node->js.ps.state;
66
joinqual = node->js.joinqual;
67
otherqual = node->js.ps.qual;
68
hashNode = (HashState *) innerPlanState(node);
69
outerNode = outerPlanState(node);
70
dir = estate->es_direction;
73
* get information from HashJoin state
75
hashtable = node->hj_HashTable;
76
outerkeys = node->hj_OuterHashKeys;
77
econtext = node->js.ps.ps_ExprContext;
80
* Check to see if we're still projecting out tuples from a previous
81
* join tuple (because there is a function-returning-set in the
82
* projection expressions). If so, try to project another one.
84
if (node->js.ps.ps_TupFromTlist)
86
TupleTableSlot *result;
88
result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
89
if (isDone == ExprMultipleResult)
91
/* Done with that source tuple... */
92
node->js.ps.ps_TupFromTlist = false;
96
* If we're doing an IN join, we want to return at most one row per
97
* outer tuple; so we can stop scanning the inner scan if we matched
98
* on the previous try.
100
if (node->js.jointype == JOIN_IN &&
101
node->hj_MatchedOuter)
102
node->hj_NeedNewOuter = true;
105
* Reset per-tuple memory context to free any expression evaluation
106
* storage allocated in the previous tuple cycle. Note this can't
107
* happen until we're done projecting out tuples from a join tuple.
109
ResetExprContext(econtext);
112
* if this is the first call, build the hash table for inner relation
114
if (!node->hj_hashdone)
117
* create the hash table
119
Assert(hashtable == NULL);
120
hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
121
node->hj_HashOperators);
122
node->hj_HashTable = hashtable;
125
* execute the Hash node, to build the hash table
127
hashNode->hashtable = hashtable;
128
(void) ExecProcNode((PlanState *) hashNode);
131
* If the inner relation is completely empty, and we're not doing
132
* an outer join, we can quit without scanning the outer relation.
134
if (!hashtable->hashNonEmpty && node->js.jointype != JOIN_LEFT)
136
ExecHashTableDestroy(hashtable);
137
node->hj_HashTable = NULL;
142
* Open temp files for outer batches, if needed. Note that file
143
* buffers are palloc'd in regular executor context.
145
for (i = 0; i < hashtable->nbatch; i++)
146
hashtable->outerBatchFile[i] = BufFileCreateTemp(false);
148
node->hj_hashdone = true;
152
* run the hash join process
157
* If we don't have an outer tuple, get the next one
159
if (node->hj_NeedNewOuter)
161
outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
163
if (TupIsNull(outerTupleSlot))
169
node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
170
econtext->ecxt_outertuple = outerTupleSlot;
171
node->hj_NeedNewOuter = false;
172
node->hj_MatchedOuter = false;
175
* now we have an outer tuple, find the corresponding bucket
176
* for this tuple from the hash table
178
node->hj_CurBucketNo = ExecHashGetBucket(hashtable, econtext,
180
node->hj_CurTuple = NULL;
183
* Now we've got an outer tuple and the corresponding hash
184
* bucket, but this tuple may not belong to the current batch.
185
* This need only be checked in the first pass.
187
if (hashtable->curbatch == 0)
189
int batchno = ExecHashGetBatch(node->hj_CurBucketNo,
195
* Need to postpone this outer tuple to a later batch.
196
* Save it in the corresponding outer-batch file.
198
hashtable->outerBatchSize[batchno]++;
199
ExecHashJoinSaveTuple(outerTupleSlot->val,
200
hashtable->outerBatchFile[batchno]);
201
node->hj_NeedNewOuter = true;
202
continue; /* loop around for a new outer tuple */
208
* OK, scan the selected hash bucket for matches
212
curtuple = ExecScanHashBucket(node,
215
if (curtuple == NULL)
216
break; /* out of matches */
219
* we've got a match, but still need to test non-hashed quals
221
inntuple = ExecStoreTuple(curtuple,
222
node->hj_HashTupleSlot,
224
false); /* don't pfree this tuple */
225
econtext->ecxt_innertuple = inntuple;
227
/* reset temp memory each time to avoid leaks from qual expr */
228
ResetExprContext(econtext);
231
* if we pass the qual, then save state for next call and have
232
* ExecProject form the projection, store it in the tuple
233
* table, and return the slot.
235
* Only the joinquals determine MatchedOuter status, but all
236
* quals must pass to actually return the tuple.
238
if (joinqual == NIL || ExecQual(joinqual, econtext, false))
240
node->hj_MatchedOuter = true;
242
if (otherqual == NIL || ExecQual(otherqual, econtext, false))
244
TupleTableSlot *result;
246
result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
248
if (isDone != ExprEndResult)
250
node->js.ps.ps_TupFromTlist =
251
(isDone == ExprMultipleResult);
257
* If we didn't return a tuple, may need to set
260
if (node->js.jointype == JOIN_IN)
262
node->hj_NeedNewOuter = true;
263
break; /* out of loop over hash bucket */
269
* Now the current outer tuple has run out of matches, so check
270
* whether to emit a dummy outer-join tuple. If not, loop around
271
* to get a new outer tuple.
273
node->hj_NeedNewOuter = true;
275
if (!node->hj_MatchedOuter &&
276
node->js.jointype == JOIN_LEFT)
279
* We are doing an outer join and there were no join matches
280
* for this outer tuple. Generate a fake join tuple with
281
* nulls for the inner tuple, and return it if it passes the
284
econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
286
if (ExecQual(otherqual, econtext, false))
289
* qualification was satisfied so we project and return
290
* the slot containing the result tuple using
293
TupleTableSlot *result;
295
result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
297
if (isDone != ExprEndResult)
299
node->js.ps.ps_TupFromTlist =
300
(isDone == ExprMultipleResult);
308
/* ----------------------------------------------------------------
311
* Init routine for HashJoin node.
312
* ----------------------------------------------------------------
315
ExecInitHashJoin(HashJoin *node, EState *estate)
317
HashJoinState *hjstate;
326
* create state structure
328
hjstate = makeNode(HashJoinState);
329
hjstate->js.ps.plan = (Plan *) node;
330
hjstate->js.ps.state = estate;
333
* Miscellaneous initialization
335
* create expression context for node
337
ExecAssignExprContext(estate, &hjstate->js.ps);
340
* initialize child expressions
342
hjstate->js.ps.targetlist = (List *)
343
ExecInitExpr((Expr *) node->join.plan.targetlist,
344
(PlanState *) hjstate);
345
hjstate->js.ps.qual = (List *)
346
ExecInitExpr((Expr *) node->join.plan.qual,
347
(PlanState *) hjstate);
348
hjstate->js.jointype = node->join.jointype;
349
hjstate->js.joinqual = (List *)
350
ExecInitExpr((Expr *) node->join.joinqual,
351
(PlanState *) hjstate);
352
hjstate->hashclauses = (List *)
353
ExecInitExpr((Expr *) node->hashclauses,
354
(PlanState *) hjstate);
357
* initialize child nodes
359
outerNode = outerPlan(node);
360
hashNode = (Hash *) innerPlan(node);
362
outerPlanState(hjstate) = ExecInitNode(outerNode, estate);
363
innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate);
365
#define HASHJOIN_NSLOTS 3
368
* tuple table initialization
370
ExecInitResultTupleSlot(estate, &hjstate->js.ps);
371
hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
373
switch (node->join.jointype)
379
hjstate->hj_NullInnerTupleSlot =
380
ExecInitNullTupleSlot(estate,
381
ExecGetResultType(innerPlanState(hjstate)));
384
elog(ERROR, "unrecognized join type: %d",
385
(int) node->join.jointype);
389
* now for some voodoo. our temporary tuple slot is actually the
390
* result tuple slot of the Hash node (which is our inner plan). we
391
* do this because Hash nodes don't return tuples via ExecProcNode()
392
* -- instead the hash join node uses ExecScanHashBucket() to get at
393
* the contents of the hash table. -cim 6/9/91
396
HashState *hashstate = (HashState *) innerPlanState(hjstate);
397
TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
399
hjstate->hj_HashTupleSlot = slot;
403
* initialize tuple type and projection info
405
ExecAssignResultTypeFromTL(&hjstate->js.ps);
406
ExecAssignProjectionInfo(&hjstate->js.ps);
408
ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
409
ExecGetResultType(outerPlanState(hjstate)),
413
* initialize hash-specific info
416
hjstate->hj_hashdone = false;
417
hjstate->hj_HashTable = NULL;
419
hjstate->hj_CurBucketNo = 0;
420
hjstate->hj_CurTuple = NULL;
423
* Deconstruct the hash clauses into outer and inner argument values,
424
* so that we can evaluate those subexpressions separately. Also make
425
* a list of the hash operator OIDs, in preparation for looking up the
426
* hash functions to use.
431
foreach(l, hjstate->hashclauses)
433
FuncExprState *fstate = (FuncExprState *) lfirst(l);
436
Assert(IsA(fstate, FuncExprState));
437
hclause = (OpExpr *) fstate->xprstate.expr;
438
Assert(IsA(hclause, OpExpr));
439
lclauses = lappend(lclauses, linitial(fstate->args));
440
rclauses = lappend(rclauses, lsecond(fstate->args));
441
hoperators = lappend_oid(hoperators, hclause->opno);
443
hjstate->hj_OuterHashKeys = lclauses;
444
hjstate->hj_InnerHashKeys = rclauses;
445
hjstate->hj_HashOperators = hoperators;
446
/* child Hash node needs to evaluate inner hash keys, too */
447
((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
449
hjstate->js.ps.ps_OuterTupleSlot = NULL;
450
hjstate->js.ps.ps_TupFromTlist = false;
451
hjstate->hj_NeedNewOuter = true;
452
hjstate->hj_MatchedOuter = false;
458
ExecCountSlotsHashJoin(HashJoin *node)
460
return ExecCountSlotsNode(outerPlan(node)) +
461
ExecCountSlotsNode(innerPlan(node)) +
465
/* ----------------------------------------------------------------
468
* clean up routine for HashJoin node
469
* ----------------------------------------------------------------
472
ExecEndHashJoin(HashJoinState *node)
477
if (node->hj_HashTable)
479
ExecHashTableDestroy(node->hj_HashTable);
480
node->hj_HashTable = NULL;
484
* Free the exprcontext
486
ExecFreeExprContext(&node->js.ps);
489
* clean out the tuple table
491
ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
492
ExecClearTuple(node->hj_OuterTupleSlot);
493
ExecClearTuple(node->hj_HashTupleSlot);
498
ExecEndNode(outerPlanState(node));
499
ExecEndNode(innerPlanState(node));
502
/* ----------------------------------------------------------------
503
* ExecHashJoinOuterGetTuple
505
* get the next outer tuple for hashjoin: either by
506
* executing a plan node as in the first pass, or from
507
* the tmp files for the hashjoin batches.
508
* ----------------------------------------------------------------
511
static TupleTableSlot *
512
ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate)
514
HashJoinTable hashtable = hjstate->hj_HashTable;
515
int curbatch = hashtable->curbatch;
516
TupleTableSlot *slot;
519
{ /* if it is the first pass */
520
slot = ExecProcNode(node);
521
if (!TupIsNull(slot))
525
* We have just reached the end of the first pass. Try to switch
528
curbatch = ExecHashJoinNewBatch(hjstate);
532
* Try to read from a temp file. Loop allows us to advance to new
535
while (curbatch <= hashtable->nbatch)
537
slot = ExecHashJoinGetSavedTuple(hjstate,
538
hashtable->outerBatchFile[curbatch - 1],
539
hjstate->hj_OuterTupleSlot);
540
if (!TupIsNull(slot))
542
curbatch = ExecHashJoinNewBatch(hjstate);
545
/* Out of batches... */
549
/* ----------------------------------------------------------------
550
* ExecHashJoinGetSavedTuple
552
* read the next tuple from a tmp file
553
* ----------------------------------------------------------------
556
static TupleTableSlot *
557
ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
559
TupleTableSlot *tupleSlot)
565
nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
567
return NULL; /* end of file */
568
if (nread != sizeof(HeapTupleData))
570
(errcode_for_file_access(),
571
errmsg("could not read from hash-join temporary file: %m")));
572
heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
573
memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
574
heapTuple->t_datamcxt = CurrentMemoryContext;
575
heapTuple->t_data = (HeapTupleHeader)
576
((char *) heapTuple + HEAPTUPLESIZE);
577
nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
578
if (nread != (size_t) htup.t_len)
580
(errcode_for_file_access(),
581
errmsg("could not read from hash-join temporary file: %m")));
582
return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
585
/* ----------------------------------------------------------------
586
* ExecHashJoinNewBatch
588
* switch to a new hashjoin batch
589
* ----------------------------------------------------------------
592
ExecHashJoinNewBatch(HashJoinState *hjstate)
594
HashJoinTable hashtable = hjstate->hj_HashTable;
595
int nbatch = hashtable->nbatch;
596
int newbatch = hashtable->curbatch + 1;
597
long *innerBatchSize = hashtable->innerBatchSize;
598
long *outerBatchSize = hashtable->outerBatchSize;
600
TupleTableSlot *slot;
601
ExprContext *econtext;
607
* We no longer need the previous outer batch file; close it right
608
* away to free disk space.
610
BufFileClose(hashtable->outerBatchFile[newbatch - 2]);
611
hashtable->outerBatchFile[newbatch - 2] = NULL;
615
* Normally we can skip over any batches that are empty on either side
616
* --- but for JOIN_LEFT, can only skip when left side is empty.
617
* Release associated temp files right away.
619
while (newbatch <= nbatch &&
620
(outerBatchSize[newbatch - 1] == 0L ||
621
(innerBatchSize[newbatch - 1] == 0L &&
622
hjstate->js.jointype != JOIN_LEFT)))
624
BufFileClose(hashtable->innerBatchFile[newbatch - 1]);
625
hashtable->innerBatchFile[newbatch - 1] = NULL;
626
BufFileClose(hashtable->outerBatchFile[newbatch - 1]);
627
hashtable->outerBatchFile[newbatch - 1] = NULL;
631
if (newbatch > nbatch)
632
return newbatch; /* no more batches */
635
* Rewind inner and outer batch files for this batch, so that we can
636
* start reading them.
638
if (BufFileSeek(hashtable->outerBatchFile[newbatch - 1], 0, 0L, SEEK_SET))
640
(errcode_for_file_access(),
641
errmsg("could not rewind hash-join temporary file: %m")));
643
innerFile = hashtable->innerBatchFile[newbatch - 1];
645
if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
647
(errcode_for_file_access(),
648
errmsg("could not rewind hash-join temporary file: %m")));
651
* Reload the hash table with the new inner batch
653
ExecHashTableReset(hashtable, innerBatchSize[newbatch - 1]);
655
econtext = hjstate->js.ps.ps_ExprContext;
656
innerhashkeys = hjstate->hj_InnerHashKeys;
658
while ((slot = ExecHashJoinGetSavedTuple(hjstate,
660
hjstate->hj_HashTupleSlot))
663
econtext->ecxt_innertuple = slot;
664
ExecHashTableInsert(hashtable, econtext, innerhashkeys);
668
* after we build the hash table, the inner batch file is no longer
671
BufFileClose(innerFile);
672
hashtable->innerBatchFile[newbatch - 1] = NULL;
674
hashtable->curbatch = newbatch;
678
/* ----------------------------------------------------------------
679
* ExecHashJoinSaveTuple
681
* save a tuple to a tmp file.
683
* The data recorded in the file for each tuple is an image of its
684
* HeapTupleData (with meaningless t_data pointer) followed by the
685
* HeapTupleHeader and tuple data.
686
* ----------------------------------------------------------------
690
ExecHashJoinSaveTuple(HeapTuple heapTuple,
695
written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
696
if (written != sizeof(HeapTupleData))
698
(errcode_for_file_access(),
699
errmsg("could not write to hash-join temporary file: %m")));
700
written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
701
if (written != (size_t) heapTuple->t_len)
703
(errcode_for_file_access(),
704
errmsg("could not write to hash-join temporary file: %m")));
708
ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
711
* If we haven't yet built the hash table then we can just return;
712
* nothing done yet, so nothing to undo.
714
if (!node->hj_hashdone)
716
Assert(node->hj_HashTable != NULL);
719
* In a multi-batch join, we currently have to do rescans the hard
720
* way, primarily because batch temp files may have already been
721
* released. But if it's a single-batch join, and there is no
722
* parameter change for the inner subnode, then we can just re-use the
723
* existing hash table without rebuilding it.
725
if (node->hj_HashTable->nbatch == 0 &&
726
((PlanState *) node)->righttree->chgParam == NULL)
728
/* okay to reuse the hash table; needn't rescan inner, either */
732
/* must destroy and rebuild hash table */
733
node->hj_hashdone = false;
734
ExecHashTableDestroy(node->hj_HashTable);
735
node->hj_HashTable = NULL;
738
* if chgParam of subnode is not null then plan will be re-scanned
739
* by first ExecProcNode.
741
if (((PlanState *) node)->righttree->chgParam == NULL)
742
ExecReScan(((PlanState *) node)->righttree, exprCtxt);
745
/* Always reset intra-tuple state */
746
node->hj_CurBucketNo = 0;
747
node->hj_CurTuple = NULL;
749
node->js.ps.ps_OuterTupleSlot = NULL;
750
node->js.ps.ps_TupFromTlist = false;
751
node->hj_NeedNewOuter = true;
752
node->hj_MatchedOuter = false;
755
* if chgParam of subnode is not null then plan will be re-scanned by
756
* first ExecProcNode.
758
if (((PlanState *) node)->lefttree->chgParam == NULL)
759
ExecReScan(((PlanState *) node)->lefttree, exprCtxt);