~ubuntu-branches/ubuntu/trusty/mozjs24/trusty-proposed

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*
 ******************************************************************************
 *   Copyright (C) 1996-2011, International Business Machines                 *
 *   Corporation and others.  All Rights Reserved.                            *
 ******************************************************************************
 */

/**
 * \file 
 * \brief C++ API: Boyer-Moore StringSearch technology preview
 * \internal ICU 4.0.1 technology preview
 */
 
#ifndef B_M_SEARCH_H
#define B_M_SEARCH_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION

#include "unicode/uobject.h"
#include "unicode/ucol.h"

#include "unicode/colldata.h"

U_NAMESPACE_BEGIN

class BadCharacterTable;
class GoodSuffixTable;
class Target;

#ifndef U_HIDE_INTERNAL_API
/**
 * BoyerMooreSearch
 *
 * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates
 * the pattern, the "bad character" and "good suffix" tables, the Collator-based data needed to compute them,
 * and a reference to the text being searched.
 *
 * To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>.
 * Then you construct a <code>BoyerMooreSearch</code> object from the <code>CollData</code> object, the pattern
 * string and the target string. Then you call the <code>search</code> method. Here's a code sample:
 *
 * <pre>
 * void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeString *target)
 * {
 *     UErrorCode status = U_ZERO_ERROR;
 *     CollData *collData = CollData::open(collator, status);
 *
 *     if (U_FAILURE(status)) {
 *         // could not create a CollData object
 *         return;
 *     }
 *
 *     BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status);
 *
 *     if (U_FAILURE(status)) {
 *         // could not create a BoyerMooreSearch object;
 *         // destroying it is the only safe operation at this point
 *         delete search;
 *         CollData::close(collData);
 *         return;
 *     }
 *
 *     int32_t offset = 0, start = -1, end = -1;
 *
 *     // Find all matches
 *     while (search->search(offset, start, end)) {
 *         // process the match between start and end
 *         ...
 *         // advance past the match
 *         offset = end; 
 *     }
 *
 *     // at this point, if offset == 0, there were no matches
 *     if (offset == 0) {
 *         // handle the case of no matches
 *     }
 *
 *     delete search;
 *     CollData::close(collData);
 *
 *     // CollData objects are cached, so the call to
 *     // CollData::close doesn't delete the object.
 *     // Call this if you don't need the object any more.
 *     CollData::flushCollDataCache();
 * }
 * </pre>
 *
 * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API.
 *
 * Known limitations:
 *   1) Backwards searching has not been implemented.
 *
 *   2) For Han and Hangul characters, this code ignores any Collation tailorings. In general,
 *      this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored
 *      to be equal to Han characters with the same pronunciation. Because this code ignores
 *      tailorings, searching for a Hangul character will not find a Han character and vice versa.
 *
 *   3) In some cases, searching for a pattern that needs to be normalized and ends
 *      in a discontiguous contraction may fail. The only known cases of this are with
 *      the Tibetan script. For example searching for the pattern
 *      "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've
 *      been unable to find a practical, real-world example of this failure.)
 *
 * @internal ICU 4.0.1 technology preview
 *
 * @see CollData
 */
class U_I18N_API BoyerMooreSearch : public UObject {
public:
    /**
     * Build a <code>BoyerMooreSearch</code> object for a single pattern.
     *
     * @param theData - the <code>CollData</code> object that supplies the Collator-sensitive data
     * @param patternString - the string to look for
     * @param targetString - the string to look in, or <code>NULL</code> if it will be
     *                       supplied later through <code>setTargetString</code>.
     * @param status - receives an error code if anything goes wrong.
     *
     * Note: when status holds an error code on return, the object
     * must not be used for anything other than running its
     * destructor.
     *
     * @internal ICU 4.0.1 technology preview
     */
    BoyerMooreSearch(CollData* theData,
                     const UnicodeString& patternString,
                     const UnicodeString* targetString,
                     UErrorCode& status);

    /**
     * The destructor.
     *
     * @internal ICU 4.0.1 technology preview
     */
    ~BoyerMooreSearch();

    /**
     * Report whether the pattern string produced no CEs at all.
     *
     * @return <code>TRUE</code> when the pattern string generated no CEs
     *
     * @internal ICU 4.0.1 technology preview
     */
    UBool empty();

    /**
     * Look for the pattern string inside the target string.
     *
     * @param offset - position in the target string where the search begins
     * @param start - receives the offset at which the match starts, or -1 when nothing matched
     * @param end - receives the offset at which the match ends, or -1 when nothing matched
     *
     * @return <code>TRUE</code> when a match was found, <code>FALSE</code> otherwise.
     *
     * @internal ICU 4.0.1 technology preview
     */
    UBool search(int32_t offset, int32_t& start, int32_t& end);

    /**
     * Replace the target string that is searched.
     *
     * @param targetString - the new target string
     * @param status - receives an error code if anything goes wrong.
     *
     * @internal ICU 4.0.1 technology preview
     */
    void setTargetString(const UnicodeString* targetString, UErrorCode& status);

    // **** the accessors below may no longer be needed? ****
    /**
     * Get the <code>CollData</code> object used for searching.
     *
     * @return the <code>CollData</code> object used for searching
     *
     * @internal ICU 4.0.1 technology preview
     */
    CollData* getData();

    /**
     * Get the CEs that the pattern string generated.
     *
     * @return a <code>CEList</code> object containing the pattern string's CEs.
     *
     * @internal ICU 4.0.1 technology preview
     */
    CEList* getPatternCEs();

    /**
     * Get the <code>BadCharacterTable</code> object computed for the pattern string.
     *
     * @return the <code>BadCharacterTable</code> object.
     *
     * @internal ICU 4.0.1 technology preview
     */
    BadCharacterTable* getBadCharacterTable();

    /**
     * Get the <code>GoodSuffixTable</code> object computed for the pattern string.
     *
     * @return the <code>GoodSuffixTable</code> object.
     *
     * @internal ICU 4.0.1 technology preview
     */
    GoodSuffixTable* getGoodSuffixTable();

    /**
     * UObject boilerplate: class ID of this instance.
     * @internal ICU 4.0.1 technology preview
     */
    virtual UClassID getDynamicClassID() const;

    /**
     * UObject boilerplate: class ID of this class.
     * @internal ICU 4.0.1 technology preview
     */
    static UClassID getStaticClassID();

private:
    // Internal state. The member definitions are not part of this header;
    // presumably these fields back the accessors declared above.
    // Declaration order is kept as-is so the object layout does not change.
    CollData* data;
    CEList* patCEs;
    BadCharacterTable* badCharacterTable;
    GoodSuffixTable* goodSuffixTable;
    UnicodeString pattern;
    Target* target;
};
#endif  /* U_HIDE_INTERNAL_API */

U_NAMESPACE_END

#endif // #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
#endif // #ifndef B_M_SEARCH_H