2
* Copyright (C) 2012 Canonical, Ltd.
4
* This library is free software; you can redistribute it and/or modify
5
* it under the terms of the GNU Lesser General Public License
6
* version 3.0 as published by the Free Software Foundation.
8
* This library is distributed in the hope that it will be useful,
9
* but WITHOUT ANY WARRANTY; without even the implied warranty of
10
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
* GNU Lesser General Public License version 3.0 for more details.
13
* You should have received a copy of the GNU Lesser General Public
14
* License along with this library. If not, see
15
* <http://www.gnu.org/licenses/>.
17
* Authored by Mikkel Kamstrup Erlandsen <mikkel.kamstrup@canonical.com>
20
#include <unicode/utypes.h>
21
#include <unicode/localpointer.h>
22
#include <unicode/urep.h>
23
#include <unicode/parseerr.h>
24
#include <unicode/uenum.h>
25
#include <unicode/utrans.h>
26
#include <unicode/ustring.h>
34
* @title: Dee ICU Extensions
35
* @short_description: A suite of #DeeTermFilter<!-- -->s based on ICU
38
* This module allows developers to construct powerful
39
* #DeeTermFilter<!-- -->s with ease. The filters leverage the ICU
40
* framework to provide world class transliteration features.
42
* The filters can be employed manually by calling dee_icu_term_filter_apply()
43
* or installed in a #DeeAnalyzer by calling dee_analyzer_add_term_filter()
44
* passing the term filter instance as the user data and
45
* dee_icu_term_filter_destroy() as the #GDestroyNotify.
48
/* Instance data of a #DeeICUTermFilter: holds the ICU transliterator
 * handle that the functions in this file open, apply and close. */
struct _DeeICUTermFilter {
49
UTransliterator *transliterator;
53
/* Convert the UTF-8 text in @string to UTF-16 in a newly allocated UChar
 * buffer; u_strFromUTF8Lenient() stores the converted length in *@u_len. */
gchar2uchar (const gchar *string, int32_t *u_len)
57
UErrorCode u_error_code = U_ZERO_ERROR;
66
/* NOTE(review): len is already twice the byte length, and the buffer below
 * is sized for 2*len + 1 UChars, while only len is passed to the converter
 * as the capacity -- the allocation looks larger than necessary; confirm. */
len = strlen (string) * 2;
67
u_string = g_new(UChar, 2*len + 1);
68
/* Pre-terminate the last slot of the buffer. */
u_string[2*len] = '\0';
70
/* A source length of -1 tells ICU that string is NUL-terminated. */
u_strFromUTF8Lenient (u_string, len, u_len, string, -1, &u_error_code);
72
/* Conversion failures are reported through the GLib log. */
if (U_FAILURE(u_error_code))
74
g_critical ("Failed to convert string '%s' into UTF-16: %s",
75
string, u_errorName(u_error_code));
83
/* Build a human-readable message describing why creating a transliterator
 * failed, from the ICU error code and the parse error position. */
print_error (const gchar *system_id,
85
UParseError *u_parse_error,
86
UErrorCode u_error_code)
91
str = g_string_new ("");
93
g_string_append_printf (str, "[%s]: Error creating transliterator "
94
"for system id '%s' and rules '%s'.",
95
u_errorName (u_error_code), system_id, rules);
97
/* Line and offset are appended only when they are non-negative. */
if (u_parse_error->line >= 0)
98
g_string_append_printf(str, " On line %i.", u_parse_error->line);
100
if (u_parse_error->offset >= 0)
101
g_string_append_printf(str, " Offset %i.", u_parse_error->offset);
104
/* Passing FALSE releases the GString wrapper but keeps its character data. */
g_string_free (str, FALSE);
110
/* Map an ICU UErrorCode to a DeeICUError. A code that is neither strictly
 * between U_PARSE_ERROR_START and U_PARSE_ERROR_LIMIT nor equal to
 * U_ILLEGAL_ARGUMENT_ERROR yields DEE_ICU_ERROR_UNKNOWN. The switch that
 * follows maps U_INVALID_FUNCTION to DEE_ICU_ERROR_BAD_ID;
 * DEE_ICU_ERROR_BAD_RULE is its other result. */
get_error_code (UErrorCode u_error_code)
112
/* The ICU error codes are quite tangled up,
113
* so excuse the spaghetti logic please :-)
116
if ( ! (u_error_code > U_PARSE_ERROR_START &&
117
u_error_code < U_PARSE_ERROR_LIMIT) &&
118
u_error_code != U_ILLEGAL_ARGUMENT_ERROR)
120
return DEE_ICU_ERROR_UNKNOWN;
123
switch (u_error_code)
126
case U_INVALID_FUNCTION:
127
return DEE_ICU_ERROR_BAD_ID;
128
case U_ILLEGAL_ARGUMENT_ERROR:
130
return DEE_ICU_ERROR_BAD_RULE;
136
* dee_icu_term_filter_new:
137
* @system_id: A system id for the transliterator to use.
138
* See <ulink url="http://userguide.icu-project.org/transforms/general">userguide.icu-project.org/transforms/general</ulink>
139
* @rules: (allow-none): A set of transliteration rules to use.
140
* See <ulink url="http://userguide.icu-project.org/transforms/general/rules">userguide.icu-project.org/transforms/general/rules</ulink>
141
* @error: (allow-none) (error-domains Dee.ICUError): A place to return a #GError, or %NULL to ignore errors
143
* Create a new #DeeICUTermFilter for a given ICU transliterator system id
144
* and/or set of transliteration rules.
146
* Returns: (transfer full): A newly allocated #DeeICUTermFilter.
147
* Free with dee_icu_term_filter_destroy().
150
dee_icu_term_filter_new (const gchar *system_id,
154
DeeICUTermFilter *self;
155
UChar *u_rules, *u_id;
156
int32_t u_rules_len, u_id_len;
157
/* 0 is U_ZERO_ERROR, the "no error" value the other functions here spell out. */
UErrorCode u_error_code = 0;
158
UParseError u_parse_error = { 0 };
160
/* GError convention: the caller passes either no location or an unset one. */
g_return_val_if_fail (error == NULL || *error == NULL, NULL);
162
self = g_new0 (DeeICUTermFilter, 1);
163
/* Convert both the id and the rules to UChar strings via gchar2uchar()
 * before handing them to utrans_openU(). */
u_id = gchar2uchar (system_id, &u_id_len);
164
u_rules = gchar2uchar (rules, &u_rules_len);
166
self->transliterator = utrans_openU (u_id, u_id_len,
168
u_rules, u_rules_len,
169
&u_parse_error, &u_error_code);
171
/* On failure, translate the ICU status into a DeeICUError and a message
 * and report both through @error. */
if (U_FAILURE(u_error_code))
173
DeeICUError error_code;
176
error_code = get_error_code (u_error_code);
177
msg = print_error (system_id, rules, &u_parse_error, u_error_code);
179
g_set_error_literal (error, DEE_ICU_ERROR, error_code, msg);
192
* dee_icu_term_filter_new_ascii_folder:
194
* Construct a term filter that folds any UTF-8 string into ASCII.
196
* Returns: (transfer full): A newly allocated #DeeICUTermFilter. Free with
197
* dee_icu_term_filter_destroy().
200
dee_icu_term_filter_new_ascii_folder (void)
202
/* Delegates to dee_icu_term_filter_new() with the compound ICU id
 * "Latin; Latin-ASCII;", no custom rules and no error reporting. */
return dee_icu_term_filter_new ("Latin; Latin-ASCII;", NULL, NULL);
206
* dee_icu_term_filter_apply:
207
* @self: The filter to apply
208
* @text: The text to apply the filter on
210
* Apply a #DeeICUTermFilter on a piece of UTF-8 text.
212
* Returns: (transfer full): A newly allocated string. Free with g_free().
215
dee_icu_term_filter_apply (DeeICUTermFilter *self,
219
int32_t u_cap, u_len, u_limit;
220
UErrorCode u_error_code = U_ZERO_ERROR;
223
g_return_val_if_fail (self != NULL, NULL);
224
g_return_val_if_fail (text != NULL, NULL);
226
/* Reserve four UChars per input byte plus one for the terminator; the same
 * buffer is later handed to utrans_transUChars() with this capacity. */
u_cap = strlen (text) * 4 + 1;
227
u_text = g_new (UChar, u_cap);
228
u_text[u_cap - 1] = '\0';
230
/* UTF-8 to UTF-16: -1 means text is NUL-terminated, u_len gets the length. */
u_strFromUTF8Lenient (u_text, u_cap, &u_len, text, -1, &u_error_code);
232
if (U_FAILURE(u_error_code))
234
g_critical ("Failed to convert string '%s' into UTF-16: %s",
235
text, u_errorName(u_error_code));
240
/* Transliterate the UTF-16 buffer; u_len is passed by address and u_cap
 * as the buffer capacity. */
utrans_transUChars (self->transliterator,
241
u_text, &u_len, u_cap,
245
if (U_FAILURE(u_error_code))
247
g_critical ("Failed to transliterate '%s': %s",
248
text, u_errorName(u_error_code));
253
/* Convert the first u_len UTF-16 units back into a newly allocated
 * UTF-8 string. */
result = g_utf16_to_utf8(u_text, u_len, NULL, NULL, NULL);
260
* dee_icu_term_filter_destroy:
261
* @filter: The filter to free
263
* Free all resources allocated by a #DeeICUTermFilter.
266
dee_icu_term_filter_destroy (DeeICUTermFilter *filter)
268
g_return_if_fail (filter != NULL);
270
/* Close the ICU transliterator held by the filter. */
utrans_close (filter->transliterator);
276
/* Return the GQuark registered for the static string "dee-icu-error-quark".
 * NOTE(review): presumably this is what DEE_ICU_ERROR expands to -- confirm
 * against the header. */
dee_icu_error_quark (void)
278
return g_quark_from_static_string ("dee-icu-error-quark");