package org.apache.lucene.util.fst;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
/**
 * Builds a compact FST (maps an IntsRef term to an arbitrary
 * output) from pre-sorted terms with outputs (the FST
 * becomes an FSA if you use NoOutputs).  The FST is written
 * on-the-fly into a compact serialized format byte array, which can
 * be saved to / loaded from a Directory or used directly
 * for traversal.  The FST is always finite (no cycles).
 *
 * <p>NOTE: The algorithm is described at
 * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p>
 *
 * If your outputs are ByteSequenceOutput then the final FST
 * will be minimal, but if you use PositiveIntOutput then
 * it's only "near minimal".  For example, aa/0, aab/1, bbb/2
 * will produce 6 states when a 5 state fst is also
 * possible.
 *
 * The parameterized type T is the output type.  See the
 * subclasses of {@link Outputs}.
 *
 * @lucene.experimental
 */
public class Builder<T> {
52
private final NodeHash<T> dedupHash;
53
private final FST<T> fst;
54
private final T NO_OUTPUT;
56
// simplistic pruning: we prune node (and all following
57
// nodes) if less than this number of terms go through it:
58
private final int minSuffixCount1;
60
// better pruning: we prune node (and all following
61
// nodes) if the prior node has less than this number of
62
// terms go through it:
63
private final int minSuffixCount2;
65
private final boolean doShareNonSingletonNodes;
66
private final int shareMaxTailLength;
68
private final IntsRef lastInput = new IntsRef();
70
// NOTE: cutting this over to ArrayList instead loses ~6%
71
// in build performance on 9.8M Wikipedia terms; so we
72
// left this as an array:
74
private UnCompiledNode<T>[] frontier;
77
* Instantiates an FST/FSA builder without any pruning. A shortcut
78
* to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs)} with
79
* pruning options turned off.
81
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
82
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs);
86
* Instantiates an FST/FSA builder with all the possible tuning and construction
87
* tweaks. Read parameter documentation carefully.
90
* The input type (transition labels). Can be anything from {@link INPUT_TYPE}
91
* enumeration. Shorter types will consume less memory. Strings (character sequences) are
92
* represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints).
94
* @param minSuffixCount1
95
* If pruning the input graph during construction, this threshold is used for telling
96
* if a node is kept or pruned. If transition_count(node) >= minSuffixCount1, the node
99
* @param minSuffixCount2
100
* (Note: only Mike McCandless knows what this one is really doing...)
102
* @param doShareSuffix
103
* If <code>true</code>, the shared suffixes will be compacted into unique paths.
104
* This requires an additional hash map for lookups in memory. Setting this parameter to
105
* <code>false</code> creates a single path for all input sequences. This will result in a larger
106
* graph, but may require less memory and will speed up construction.
108
* @param doShareNonSingletonNodes
109
* Only used if doShareSuffix is true. Set this to
110
* true to ensure FST is fully minimal, at cost of more
111
* CPU and more RAM during building.
113
* @param shareMaxTailLength
114
* Only used if doShareSuffix is true. Set this to
115
* Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
116
* CPU and more RAM during building.
118
* @param outputs The output type for each input sequence. Applies only if building an FST. For
119
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
120
* singleton output object.
122
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
123
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs) {
124
this.minSuffixCount1 = minSuffixCount1;
125
this.minSuffixCount2 = minSuffixCount2;
126
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
127
this.shareMaxTailLength = shareMaxTailLength;
128
fst = new FST<T>(inputType, outputs);
130
dedupHash = new NodeHash<T>(fst);
134
NO_OUTPUT = outputs.getNoOutput();
136
@SuppressWarnings("unchecked") final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10];
138
for(int idx=0;idx<frontier.length;idx++) {
139
frontier[idx] = new UnCompiledNode<T>(this, idx);
143
public int getTotStateCount() {
144
return fst.nodeCount;
147
public long getTermCount() {
148
return frontier[0].inputCount;
151
public int getMappedStateCount() {
152
return dedupHash == null ? 0 : fst.nodeCount;
155
private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
157
if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
158
if (n.numArcs == 0) {
159
address = fst.addNode(n);
161
address = dedupHash.add(n);
164
address = fst.addNode(n);
166
assert address != -2;
170
final CompiledNode fn = new CompiledNode();
171
fn.address = address;
175
private void compilePrevTail(int prefixLenPlus1) throws IOException {
176
assert prefixLenPlus1 >= 1;
177
//System.out.println(" compileTail " + prefixLenPlus1);
178
for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) {
179
boolean doPrune = false;
180
boolean doCompile = false;
182
final UnCompiledNode<T> node = frontier[idx];
183
final UnCompiledNode<T> parent = frontier[idx-1];
185
if (node.inputCount < minSuffixCount1) {
188
} else if (idx > prefixLenPlus1) {
189
// prune if parent's inputCount is less than suffixMinCount2
190
if (parent.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && parent.inputCount == 1) {
191
// my parent, about to be compiled, doesn't make the cut, so
192
// I'm definitely pruned
194
// if pruneCount2 is 1, we keep only up
195
// until the 'distinguished edge', ie we keep only the
196
// 'divergent' part of the FST. if my parent, about to be
197
// compiled, has inputCount 1 then we are already past the
198
// distinguished edge. NOTE: this only works if
199
// the FST outputs are not "compressible" (simple
200
// ords ARE compressible).
203
// my parent, about to be compiled, does make the cut, so
204
// I'm definitely not pruned
209
// if pruning is disabled (count is 0) we can always
210
// compile current node
211
doCompile = minSuffixCount2 == 0;
214
//System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune);
216
if (node.inputCount < minSuffixCount2 || minSuffixCount2 == 1 && node.inputCount == 1) {
218
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
219
@SuppressWarnings("unchecked") final UnCompiledNode<T> target = (UnCompiledNode<T>) node.arcs[arcIdx].target;
226
// this node doesn't make it -- deref it
228
parent.deleteLast(lastInput.ints[lastInput.offset+idx-1], node);
231
if (minSuffixCount2 != 0) {
232
compileAllTargets(node, lastInput.length-idx);
234
final T nextFinalOutput = node.output;
236
// We "fake" the node as being final if it has no
237
// outgoing arcs; in theory we could leave it
238
// as non-final (the FST can represent this), but
239
// FSTEnum, Util, etc., have trouble w/ non-final
241
final boolean isFinal = node.isFinal || node.numArcs == 0;
244
// this node makes it and we now compile it. first,
245
// compile any targets that were previously
247
parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
248
compileNode(node, 1+lastInput.length-idx),
252
// replaceLast just to install
253
// nextFinalOutput/isFinal onto the arc
254
parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
258
// this node will stay in play for now, since we are
259
// undecided on whether to prune it. later, it
260
// will be either compiled or pruned, so we must
261
// allocate a new node:
262
frontier[idx] = new UnCompiledNode<T>(this, idx);
268
private final IntsRef scratchIntsRef = new IntsRef(10);
270
public void add(BytesRef input, T output) throws IOException {
271
assert fst.getInputType() == FST.INPUT_TYPE.BYTE1;
272
scratchIntsRef.grow(input.length);
273
for(int i=0;i<input.length;i++) {
274
scratchIntsRef.ints[i] = input.bytes[i+input.offset] & 0xFF;
276
scratchIntsRef.length = input.length;
277
add(scratchIntsRef, output);
280
/** Sugar: adds the UTF32 codepoints from char[] slice. FST
281
* must be FST.INPUT_TYPE.BYTE4! */
282
public void add(char[] s, int offset, int length, T output) throws IOException {
283
assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
284
int charIdx = offset;
286
final int charLimit = offset + length;
287
while(charIdx < charLimit) {
288
scratchIntsRef.grow(intIdx+1);
289
final int utf32 = Character.codePointAt(s, charIdx);
290
scratchIntsRef.ints[intIdx] = utf32;
291
charIdx += Character.charCount(utf32);
294
scratchIntsRef.length = intIdx;
295
add(scratchIntsRef, output);
298
/** Sugar: adds the UTF32 codepoints from CharSequence. FST
299
* must be FST.INPUT_TYPE.BYTE4! */
300
public void add(CharSequence s, T output) throws IOException {
301
assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
304
final int charLimit = s.length();
305
while(charIdx < charLimit) {
306
scratchIntsRef.grow(intIdx+1);
307
final int utf32 = Character.codePointAt(s, charIdx);
308
scratchIntsRef.ints[intIdx] = utf32;
309
charIdx += Character.charCount(utf32);
312
scratchIntsRef.length = intIdx;
313
add(scratchIntsRef, output);
316
/** It's OK to add the same input twice in a row with
317
* different outputs, as long as outputs impls the merge
319
public void add(IntsRef input, T output) throws IOException {
320
//System.out.println("\nFST ADD: input=" + input + " output=" + fst.outputs.outputToString(output));
321
assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
322
assert validOutput(output);
324
//System.out.println("\nadd: " + input);
325
if (input.length == 0) {
326
// empty input: only allowed as first input. we have
327
// to special case this because the packed FST
328
// format cannot represent the empty input since
329
// 'finalness' is stored on the incoming arc, not on
331
frontier[0].inputCount++;
332
frontier[0].isFinal = true;
333
fst.setEmptyOutput(output);
337
// compare shared prefix length
339
int pos2 = input.offset;
340
final int pos1Stop = Math.min(lastInput.length, input.length);
342
//System.out.println(" incr " + pos1);
343
frontier[pos1].inputCount++;
344
if (pos1 >= pos1Stop || lastInput.ints[pos1] != input.ints[pos2]) {
350
final int prefixLenPlus1 = pos1+1;
352
if (frontier.length < input.length+1) {
353
@SuppressWarnings("unchecked") final UnCompiledNode<T>[] next =
354
new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
355
System.arraycopy(frontier, 0, next, 0, frontier.length);
356
for(int idx=frontier.length;idx<next.length;idx++) {
357
next[idx] = new UnCompiledNode<T>(this, idx);
362
// minimize/compile states from previous input's
364
compilePrevTail(prefixLenPlus1);
366
// init tail states for current input
367
for(int idx=prefixLenPlus1;idx<=input.length;idx++) {
368
frontier[idx-1].addArc(input.ints[input.offset + idx - 1],
370
//System.out.println(" incr tail " + idx);
371
frontier[idx].inputCount++;
374
final UnCompiledNode<T> lastNode = frontier[input.length];
375
lastNode.isFinal = true;
376
lastNode.output = NO_OUTPUT;
378
// push conflicting outputs forward, only as far as
380
for(int idx=1;idx<prefixLenPlus1;idx++) {
381
final UnCompiledNode<T> node = frontier[idx];
382
final UnCompiledNode<T> parentNode = frontier[idx-1];
384
final T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]);
385
assert validOutput(lastOutput);
387
final T commonOutputPrefix;
390
if (lastOutput != NO_OUTPUT) {
391
commonOutputPrefix = fst.outputs.common(output, lastOutput);
392
assert validOutput(commonOutputPrefix);
393
wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix);
394
assert validOutput(wordSuffix);
395
parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix);
396
node.prependOutput(wordSuffix);
398
commonOutputPrefix = wordSuffix = NO_OUTPUT;
401
output = fst.outputs.subtract(output, commonOutputPrefix);
402
assert validOutput(output);
405
if (lastInput.length == input.length && prefixLenPlus1 == 1+input.length) {
406
// same input more than 1 time in a row, mapping to
408
lastNode.output = fst.outputs.merge(lastNode.output, output);
410
// this new arc is private to this new input; set its
411
// arc output to the leftover output:
412
frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output);
416
lastInput.copy(input);
418
//System.out.println(" count[0]=" + frontier[0].inputCount);
421
private boolean validOutput(T output) {
422
return output == NO_OUTPUT || !output.equals(NO_OUTPUT);
425
/** Returns final FST. NOTE: this will return null if
426
* nothing is accepted by the FST. */
427
public FST<T> finish() throws IOException {
429
// minimize nodes in the last word's suffix
431
//System.out.println("finish: inputCount=" + frontier[0].inputCount);
432
if (frontier[0].inputCount < minSuffixCount1 || frontier[0].inputCount < minSuffixCount2 || frontier[0].numArcs == 0) {
433
if (fst.emptyOutput == null) {
435
} else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) {
436
// empty string got pruned
439
fst.finish(compileNode(frontier[0], lastInput.length).address);
440
//System.out.println("compile addr = " + fst.getStartNode());
444
if (minSuffixCount2 != 0) {
445
compileAllTargets(frontier[0], lastInput.length);
447
//System.out.println("NOW: " + frontier[0].numArcs);
448
fst.finish(compileNode(frontier[0], lastInput.length).address);
452
if (dedupHash != null) {
453
System.out.println("NH: " + dedupHash.count());
460
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
461
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
462
final Arc<T> arc = node.arcs[arcIdx];
463
if (!arc.target.isCompiled()) {
465
@SuppressWarnings("unchecked") final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target;
466
if (n.numArcs == 0) {
467
//System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
468
arc.isFinal = n.isFinal = true;
470
arc.target = compileNode(n, tailLength-1);
475
static class Arc<T> {
476
public int label; // really an "unsigned" byte
478
public boolean isFinal;
480
public T nextFinalOutput;
483
// NOTE: not many instances of Node or CompiledNode are in
484
// memory while the FST is being built; it's only the
485
// current "frontier":
487
static interface Node {
488
boolean isCompiled();
491
static final class CompiledNode implements Node {
493
public boolean isCompiled() {
498
static final class UnCompiledNode<T> implements Node {
499
final Builder<T> owner;
506
/** This node's depth, starting from the automaton root. */
511
* The node's depth starting from the automaton root. Needed for
512
* LUCENE-2934 (node expansion based on conditions other than the
515
@SuppressWarnings("unchecked")
516
public UnCompiledNode(Builder<T> owner, int depth) {
518
arcs = (Arc<T>[]) new Arc[1];
519
arcs[0] = new Arc<T>();
520
output = owner.NO_OUTPUT;
524
public boolean isCompiled() {
528
public void clear() {
531
output = owner.NO_OUTPUT;
534
// We don't clear the depth here because it never changes
535
// for nodes on the frontier (even when reused).
538
public T getLastOutput(int labelToMatch) {
540
assert arcs[numArcs-1].label == labelToMatch;
541
return arcs[numArcs-1].output;
544
public void addArc(int label, Node target) {
546
assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs;
547
if (numArcs == arcs.length) {
548
@SuppressWarnings("unchecked") final Arc<T>[] newArcs =
549
new Arc[ArrayUtil.oversize(numArcs+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
550
System.arraycopy(arcs, 0, newArcs, 0, arcs.length);
551
for(int arcIdx=numArcs;arcIdx<newArcs.length;arcIdx++) {
552
newArcs[arcIdx] = new Arc<T>();
556
final Arc<T> arc = arcs[numArcs++];
559
arc.output = arc.nextFinalOutput = owner.NO_OUTPUT;
563
public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) {
565
final Arc<T> arc = arcs[numArcs-1];
566
assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch;
568
//assert target.address != -2;
569
arc.nextFinalOutput = nextFinalOutput;
570
arc.isFinal = isFinal;
573
public void deleteLast(int label, Node target) {
575
assert label == arcs[numArcs-1].label;
576
assert target == arcs[numArcs-1].target;
580
public void setLastOutput(int labelToMatch, T newOutput) {
581
assert owner.validOutput(newOutput);
583
final Arc<T> arc = arcs[numArcs-1];
584
assert arc.label == labelToMatch;
585
arc.output = newOutput;
588
// pushes an output prefix forward onto all arcs
589
public void prependOutput(T outputPrefix) {
590
assert owner.validOutput(outputPrefix);
592
for(int arcIdx=0;arcIdx<numArcs;arcIdx++) {
593
arcs[arcIdx].output = owner.fst.outputs.add(outputPrefix, arcs[arcIdx].output);
594
assert owner.validOutput(arcs[arcIdx].output);
598
output = owner.fst.outputs.add(outputPrefix, output);
599
assert owner.validOutput(output);