2
* Copyright 2010 Google Inc.
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at
8
* http://www.apache.org/licenses/LICENSE-2.0
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
17
// Copyright 2006 Google Inc. All Rights Reserved.
18
// Author: jrm@google.com (Jim Meehan)
21
#ifndef UTIL_UTF8_UNICODETEXT_H_
22
#define UTIL_UTF8_UNICODETEXT_H_
24
#include <stddef.h> // for NULL, ptrdiff_t
25
#include <iterator> // for bidirectional_iterator_tag, etc
26
#include <string> // for string
27
#include <utility> // for pair
29
#include "base/integral_types.h" // for char32
30
#include "base/port.h"
31
#include "util/utf8/public/config.h"
33
// ***************************** UnicodeText **************************
35
// A UnicodeText object is a container for a sequence of Unicode
36
// codepoint values. It has a default constructor, a copy constructor,
// and an assignment operator.
37
// Data can be appended to it from another UnicodeText, from
38
// iterators, or from a single codepoint.
40
// The internal representation of the text is UTF-8. Since UTF-8 is a
41
// variable-width format, UnicodeText does not provide random access
42
// to the text, and changes to the text are permitted only at the end.
44
// The UnicodeText class defines a const_iterator. The dereferencing
45
// operator (*) returns a codepoint (char32). The iterator is a
46
// bidirectional, read-only iterator. It becomes invalid if the text
// is changed.
49
// There are methods for appending and retrieving UTF-8 data directly.
50
// The 'utf8_data' method returns a const char* that contains the
51
// UTF-8-encoded version of the text; 'utf8_length' returns the number
52
// of bytes in the UTF-8 data. An iterator's 'get' method stores up to
53
// 4 bytes of UTF-8 data in a char array and returns the number of
54
// bytes that it stored.
56
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
57
// 0x10FFFF], but UnicodeText has the additional restriction that it
58
// can contain only those characters that are valid for interchange on
59
// the Web. This excludes all of the control codes except for carriage
60
// return, line feed, and horizontal tab. It also excludes
61
// non-characters, but codepoints that are in the Private Use regions
62
// are allowed, as are codepoints that are unassigned. (See the
63
// Unicode reference for details.) The function UniLib::IsInterchangeValid
64
// can be used as a test for this property.
66
// UnicodeTexts are safe. Every method that constructs or modifies a
67
// UnicodeText tests for interchange-validity, and will substitute a
68
// space for the invalid data. Such cases are reported via
// LOG(WARNING).
71
// MEMORY MANAGEMENT: copy, take ownership, or point to
73
// A UnicodeText is either an "owner", meaning that it owns the memory
74
// for the data buffer and will free it when the UnicodeText is
75
// destroyed, or it is an "alias", meaning that it does not.
77
// There are three methods for storing UTF-8 data in a UnicodeText:
79
// CopyUTF8(buffer, len) copies buffer.
81
// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
83
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
85
// All three methods perform a validity check on the buffer. There are
86
// private, "unsafe" versions of these functions that bypass the
87
// validity check. They are used internally and by friend-functions
88
// that are handling UTF-8 data that has already been validated.
90
// The purpose of an alias is to avoid making an unnecessary copy of a
91
// UTF-8 buffer while still providing access to the Unicode values
92
// within that text through iterators or the fast scanners that are
93
// based on UTF-8 state tables. The lifetime of an alias must not
94
// exceed the lifetime of the buffer from which it was constructed.
96
// The semantics of an alias might be described as "copy on write or
97
// repair." The source data is never modified. If push_back() or
98
// append() is called on an alias, a copy of the data will be created,
99
// and the UnicodeText will become an owner. If clear() is called on
100
// an alias, it becomes an (empty) owner.
102
// The copy constructor and the assignment operator produce an owner.
103
// That is, after direct initialization ("UnicodeText x(y);") or copy
104
// initialization ("UnicodeText x = y;") x will be an owner, even if y
105
// was an alias. The assignment operator ("x = y;") also produces an
106
// owner unless x and y are the same object and y is an alias.
108
// Aliases should be used with care. If the source from which an alias
109
// was created is freed, or if the contents are changed, while the
110
// alias is still in use, fatal errors could result. But it can be
111
// quite useful to have a UnicodeText "window" through which to see a
112
// UTF-8 buffer without having to pay the price of making a copy.
116
// The interfaces in util/utf8/public/textutils.h provide higher-level
117
// utilities for dealing with UnicodeTexts, including routines for
118
// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
119
// strings, creating strings from UnicodeTexts, normalizing text for
120
// efficient matching or display, and others.
124
class const_iterator;
126
typedef char32 value_type;
128
// Constructors. These always produce owners.
129
UnicodeText(); // Create an empty text.
130
UnicodeText(const UnicodeText& src); // copy constructor
131
// Construct a substring (copies the data).
132
UnicodeText(const const_iterator& first, const const_iterator& last);
134
// Assignment operator. This copies the data and produces an owner
135
// unless this == &src, e.g., "x = x;", which is a no-op.
136
UnicodeText& operator=(const UnicodeText& src);
138
// x.Copy(y) copies the data from y into x.
139
UnicodeText& Copy(const UnicodeText& src);
140
// Container-style spelling of Copy(): forwards src to Copy() and
// returns the reference that Copy() returns.
inline UnicodeText& assign(const UnicodeText& src) {
  return this->Copy(src);
}
142
// x.PointTo(y) changes x so that it points to y's data.
143
// It does not copy y or take ownership of y's data.
144
UnicodeText& PointTo(const UnicodeText& src);
145
UnicodeText& PointTo(const const_iterator& first,
146
const const_iterator& last);
150
void clear(); // Clear text.
151
// Test if text is empty: true exactly when the stored size is zero.
bool empty() const {
  return 0 == repr_.size_;
}
153
// Add a codepoint to the end of the text.
154
// If the codepoint is not interchange-valid, add a space instead
155
// and log a warning.
156
void push_back(char32 codepoint);
158
// Generic appending operation.
159
// iterator_traits<ForwardIterator>::value_type must be implicitly
160
// convertible to char32. Typical uses of this method might include:
161
// char32 chars[] = {0x1, 0x2, ...};
162
// vector<char32> more_chars = ...;
163
// utext.append(chars, chars+arraysize(chars));
164
// utext.append(more_chars.begin(), more_chars.end());
165
template<typename ForwardIterator>
166
UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
167
while (first != last) { push_back(*first++); }
171
// A specialization of the generic append() method.
172
UnicodeText& append(const const_iterator& first, const const_iterator& last);
174
// An optimization of append(source.begin(), source.end()).
175
UnicodeText& append(const UnicodeText& source);
177
int size() const; // the number of Unicode characters (codepoints)
179
friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
180
friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
182
class const_iterator {
183
typedef const_iterator CI;
185
typedef std::bidirectional_iterator_tag iterator_category;
186
typedef char32 value_type;
187
typedef ptrdiff_t difference_type;
188
typedef void pointer; // (Not needed.)
189
typedef const char32 reference; // (Needed for const_reverse_iterator)
191
// Iterators are default-constructible.
194
// It's safe to make multiple passes over a UnicodeText.
195
const_iterator(const const_iterator& other);
196
const_iterator& operator=(const const_iterator& other);
198
char32 operator*() const; // Dereference
200
const_iterator& operator++(); // Advance (++iter)
201
const_iterator operator++(int) { // (iter++)
202
const_iterator result(*this);
207
const_iterator& operator--(); // Retreat (--iter)
208
const_iterator operator--(int) { // (iter--)
209
const_iterator result(*this);
214
// We love relational operators.
215
// Two iterators compare equal when their position pointers (it_) match.
friend bool operator==(const CI& lhs, const CI& rhs) {
  const bool same_position = (lhs.it_ == rhs.it_);
  return same_position;
}
217
// Inequality is defined as the negation of operator==.
friend bool operator!=(const CI& lhs, const CI& rhs) {
  const bool equal = (lhs == rhs);
  return !equal;
}
219
friend bool operator<(const CI& lhs, const CI& rhs);
220
friend bool operator>(const CI& lhs, const CI& rhs) {
222
// lhs <= rhs holds exactly when rhs is not strictly less than lhs;
// expressed through operator< so only one ordering is implemented.
friend bool operator<=(const CI& lhs, const CI& rhs) {
  const bool rhs_before_lhs = (rhs < lhs);
  return !rhs_before_lhs;
}
224
// lhs >= rhs holds exactly when lhs is not strictly less than rhs;
// expressed through operator< so only one ordering is implemented.
friend bool operator>=(const CI& lhs, const CI& rhs) {
  const bool lhs_before_rhs = (lhs < rhs);
  return !lhs_before_rhs;
}
227
friend difference_type distance(const CI& first, const CI& last);
229
// UTF-8-specific methods
230
// Store the UTF-8 encoding of the current codepoint into buf,
231
// which must be at least 4 bytes long. Return the number of
// bytes stored.
233
int get_utf8(char* buf) const;
234
// Return the iterator's pointer into the UTF-8 data.
235
// Hands back the stored position pointer (it_) unchanged; nothing is
// decoded or copied.
const char* utf8_data() const {
  return it_;
}
237
string DebugString() const;
240
friend class UnicodeText;
241
friend class UnicodeTextUtils;
242
friend class UTF8StateTableProperty;
243
// Wraps a raw byte pointer as an iterator position. Marked explicit so
// that a const char* is never converted to an iterator implicitly.
explicit const_iterator(const char* it)
    : it_(it) {}
248
const_iterator begin() const;
249
const_iterator end() const;
251
class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
253
// Builds a reverse iterator by passing the given forward iterator to
// the std::reverse_iterator base-class constructor.
const_reverse_iterator(const_iterator it)
    : std::reverse_iterator<const_iterator>(it) {}
255
const char* utf8_data() const {
256
const_iterator tmp_it = base();
257
return (--tmp_it).utf8_data();
259
int get_utf8(char* buf) const {
260
const_iterator tmp_it = base();
261
return (--tmp_it).get_utf8(buf);
264
const_reverse_iterator rbegin() const {
265
return const_reverse_iterator(end());
267
const_reverse_iterator rend() const {
268
return const_reverse_iterator(begin());
271
// Substring searching. Returns the beginning of the first
272
// occurrence of "look", or end() if not found.
273
const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
274
// Equivalent to find(look, begin())
275
const_iterator find(const UnicodeText& look) const;
277
// Returns whether this contains the character U+FFFD. This can
278
// occur, for example, if the input to Encodings::Decode() had byte
279
// sequences that were invalid in the source encoding.
280
bool HasReplacementChar() const;
282
// UTF-8-specific methods
284
// Return the data, length, and capacity of UTF-8-encoded version of
285
// the text. Length and capacity are measured in bytes.
286
// Pointer to the start of the stored UTF-8 bytes (repr_.data_).
const char* utf8_data() const {
  return repr_.data_;
}
287
// Size of the stored UTF-8 data (repr_.size_), returned as an int.
int utf8_length() const {
  return repr_.size_;
}
288
// Capacity of the underlying buffer (repr_.capacity_), returned as an int.
int utf8_capacity() const {
  return repr_.capacity_;
}
290
// Return the UTF-8 data as a string.
291
static string UTF8Substring(const const_iterator& first,
292
const const_iterator& last);
294
// There are three methods for initializing a UnicodeText from UTF-8
295
// data. They vary in details of memory management. In all cases,
296
// the data is tested for interchange-validity. If it is not
297
// interchange-valid, a LOG(WARNING) is issued, and each
298
// structurally invalid byte and each interchange-invalid codepoint
299
// is replaced with a space.
301
// x.CopyUTF8(buf, len) copies buf into x.
302
UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
304
// x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
305
// buf. buf is not copied.
306
UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
310
// x.PointToUTF8(buf,len) changes x so that it points to buf
311
// ("becomes an alias"). It does not take ownership or copy buf.
312
// If the buffer is not valid, this has the same effect as
313
// CopyUTF8(utf8_buffer, byte_length).
314
UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
316
// Occasionally it is necessary to use functions that operate on the
317
// pointer returned by utf8_data(). MakeIterator(p) provides a way
318
// to get back to the UnicodeText level. It uses CHECK to ensure
319
// that p is a pointer within this object's UTF-8 data, and that it
320
// points to the beginning of a character.
321
const_iterator MakeIterator(const char* p) const;
323
string DebugString() const;
326
friend class const_iterator;
327
friend class UnicodeTextUtils;
329
class Repr { // A byte-string.
334
bool ours_; // Do we own data_?
336
// Default state: no buffer, zero size and capacity, and flagged as
// owning its (currently null) data.
Repr()
    : data_(NULL),
      size_(0),
      capacity_(0),
      ours_(true) {}
337
// Releases the buffer only when this Repr owns it; when ours_ is false
// data_ is left untouched.
~Repr() {
  if (ours_) {
    delete[] data_;
  }
}
340
void reserve(int capacity);
341
void resize(int size);
343
void append(const char* bytes, int byte_length);
344
void Copy(const char* data, int size);
345
void TakeOwnershipOf(char* data, int size, int capacity);
346
void PointTo(const char* data, int size);
348
string DebugString() const;
351
Repr& operator=(const Repr&);
352
Repr(const Repr& other);
357
// UTF-8-specific private methods.
358
// These routines do not perform a validity check when compiled
360
// It is an error to call these methods with UTF-8 data that
361
// is not interchange-valid.
363
UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
364
UnicodeText& UnsafeTakeOwnershipOfUTF8(
365
char* utf8_buffer, int byte_length, int byte_capacity);
366
UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
367
UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
368
const_iterator UnsafeFind(const UnicodeText& look,
369
const_iterator start_pos) const;
372
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
374
inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
375
return !(lhs == rhs);
378
// UnicodeTextRange is a pair of iterators, useful for specifying text
379
// segments. If the iterators are ==, the segment is empty.
380
typedef pair<UnicodeText::const_iterator,
381
UnicodeText::const_iterator> UnicodeTextRange;
383
inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
384
return r.first == r.second;
388
// *************************** Utilities *************************
390
// A factory function for creating a UnicodeText from a buffer of
391
// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
// is an "owner.")
394
// Each byte that is structurally invalid will be replaced with a
395
// space. Each codepoint that is interchange-invalid will also be
396
// replaced with a space, even if the codepoint was represented with a
397
// multibyte sequence in the UTF-8 data.
399
inline UnicodeText MakeUnicodeTextAcceptingOwnership(
400
char* utf8_buffer, int byte_length, int byte_capacity) {
401
return UnicodeText().TakeOwnershipOfUTF8(
402
utf8_buffer, byte_length, byte_capacity);
405
// A factory function for creating a UnicodeText from a buffer of
406
// UTF-8 data. The new UnicodeText does not take ownership of the
407
// buffer. (It is an "alias.")
409
inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
410
const char* utf8_buffer, int byte_length) {
411
return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
414
// Create a UnicodeText from a UTF-8 string or buffer.
416
// If do_copy is true, then a copy of the string is made. The copy is
417
// owned by the resulting UnicodeText object and will be freed when
418
// the object is destroyed. This UnicodeText object is referred to
// as an "owner."
421
// If do_copy is false, then no copy is made. The resulting
422
// UnicodeText object does NOT take ownership of the string; in this
423
// case, the lifetime of the UnicodeText object must not exceed the
424
// lifetime of the string. This UnicodeText object is referred to as
425
// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
427
// If the input string does not contain valid UTF-8, then a copy is
428
// made (as if do_copy were true) and coerced to valid UTF-8 by
429
// replacing each invalid byte with a space.
431
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
435
t.CopyUTF8(utf8_buf, len);
437
t.PointToUTF8(utf8_buf, len);
442
inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
443
return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
446
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
447
return UTF8ToUnicodeText(utf8_buf, len, true);
449
inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
450
return UTF8ToUnicodeText(utf8_string, true);
453
// Return a string containing the UTF-8 encoded version of all the
454
// Unicode characters in t.
455
inline string UnicodeTextToUTF8(const UnicodeText& t) {
456
return string(t.utf8_data(), t.utf8_length());
460
// For debugging. Return a string of integers, written in uppercase
461
// hex (%X), corresponding to the codepoints within the text. Each
462
// integer is followed by a space. E.g., "61 62 6A 3005 ".
463
string CodepointString(const UnicodeText& t);
465
#endif // UTIL_UTF8_UNICODETEXT_H_