1
// -*- Mode: vala; indent-tabs-mode: nil; tab-width: 4 -*-
5
Copyright (c) 2014 Anthony Huben
7
Permission is hereby granted, free of charge, to any person obtaining a copy
8
of this software and associated documentation files (the "Software"), to deal
9
in the Software without restriction, including without limitation the rights
10
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
copies of the Software, and to permit persons to whom the Software is
12
furnished to do so, subject to the following conditions:
14
The above copyright notice and this permission notice shall be included in all
15
copies or substantial portions of the Software.
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
-> Taken from Scratch (https://launchpad.net/scratch/)
28
namespace Writer.Utils {
30
public enum EncodingType {
109
/**
 * Describes a single character encoding.
 */
public struct Encoding {
110
// Enum value identifying the encoding.
public EncodingType type;
111
// Nullable string; presumably the charset name (e.g. "ISO-8859-1") — TODO confirm.
public string? encoding;
115
/**
 * Table of character encodings.
 *
 * Each initializer lists an EncodingType value, the charset name string
 * (e.g. "ISO-8859-1") and a script/region label (e.g. "Western").
 */
public static const Encoding[] encodings = {
117
{ EncodingType.ISO_8859_1,
118
"ISO-8859-1", "Western" },
119
{ EncodingType.ISO_8859_2,
120
"ISO-8859-2", "Central European" },
121
{ EncodingType.ISO_8859_3,
122
"ISO-8859-3", "South European" },
123
{ EncodingType.ISO_8859_4,
124
"ISO-8859-4", "Baltic" },
125
{ EncodingType.ISO_8859_5,
126
"ISO-8859-5", "Cyrillic" },
127
{ EncodingType.ISO_8859_6,
128
"ISO-8859-6", "Arabic" },
129
{ EncodingType.ISO_8859_7,
130
"ISO-8859-7", "Greek" },
131
{ EncodingType.ISO_8859_8,
132
"ISO-8859-8", "Hebrew Visual" },
133
{ EncodingType.ISO_8859_9,
134
"ISO-8859-9", "Turkish" },
135
{ EncodingType.ISO_8859_10,
136
"ISO-8859-10", "Nordic" },
137
{ EncodingType.ISO_8859_13,
138
"ISO-8859-13", "Baltic" },
139
{ EncodingType.ISO_8859_14,
140
"ISO-8859-14", "Celtic" },
141
{ EncodingType.ISO_8859_15,
142
"ISO-8859-15", "Western" },
143
{ EncodingType.ISO_8859_16,
144
"ISO-8859-16", "Romanian" },
146
{ EncodingType.UTF_7,
147
"UTF-7", "Unicode" },
148
{ EncodingType.UTF_16,
149
"UTF-16", "Unicode" },
150
{ EncodingType.UTF_16_BE,
151
"UTF-16BE", "Unicode" },
152
{ EncodingType.UTF_16_LE,
153
"UTF-16LE", "Unicode" },
154
{ EncodingType.UTF_32,
155
"UTF-32", "Unicode" },
156
{ EncodingType.UCS_2,
157
"UCS-2", "Unicode" },
158
{ EncodingType.UCS_4,
159
"UCS-4", "Unicode" },
161
{ EncodingType.ARMSCII_8,
162
"ARMSCII-8", "Armenian" },
164
"BIG5", "Chinese Traditional" },
165
{ EncodingType.BIG5_HKSCS,
166
"BIG5-HKSCS", "Chinese Traditional" },
167
{ EncodingType.CP_866,
168
"CP866", "Cyrillic/Russian" },
170
{ EncodingType.EUC_JP,
171
"EUC-JP", "Japanese" },
172
{ EncodingType.EUC_JP_MS,
173
"EUC-JP-MS", "Japanese" },
174
{ EncodingType.CP932,
175
"CP932", "Japanese" },
177
{ EncodingType.EUC_KR,
178
"EUC-KR", "Korean" },
179
{ EncodingType.EUC_TW,
180
"EUC-TW", "Chinese Traditional" },
182
{ EncodingType.GB18030,
183
"GB18030", "Chinese Simplified" },
184
{ EncodingType.GB2312,
185
"GB2312", "Chinese Simplified" },
187
"GBK", "Chinese Simplified" },
188
{ EncodingType.GEOSTD8,
189
"GEORGIAN-ACADEMY", "Georgian" }, /* FIXME GEOSTD8 ? */
191
{ EncodingType.IBM_850,
192
"IBM850", "Western" },
193
{ EncodingType.IBM_852,
194
"IBM852", "Central European" },
195
{ EncodingType.IBM_855,
196
"IBM855", "Cyrillic" },
197
{ EncodingType.IBM_857,
198
"IBM857", "Turkish" },
199
{ EncodingType.IBM_862,
200
"IBM862", "Hebrew" },
201
{ EncodingType.IBM_864,
202
"IBM864", "Arabic" },
204
{ EncodingType.ISO_2022_JP,
205
"ISO-2022-JP", "Japanese" },
206
{ EncodingType.ISO_2022_KR,
207
"ISO-2022-KR", "Korean" },
208
{ EncodingType.ISO_IR_111,
209
"ISO-IR-111", "Cyrillic" },
210
{ EncodingType.JOHAB,
212
{ EncodingType.KOI8_R,
213
"KOI8R", "Cyrillic" },
214
{ EncodingType.KOI8__R,
215
"KOI8-R", "Cyrillic" },
216
{ EncodingType.KOI8_U,
217
"KOI8U", "Cyrillic/Ukrainian" },
219
{ EncodingType.SHIFT_JIS,
220
"SHIFT_JIS", "Japanese" },
222
"TCVN", "Vietnamese" },
223
{ EncodingType.TIS_620,
227
{ EncodingType.VISCII,
228
"VISCII", "Vietnamese" },
230
{ EncodingType.WINDOWS_1250,
231
"WINDOWS-1250", "Central European" },
232
{ EncodingType.WINDOWS_1251,
233
"WINDOWS-1251", "Cyrillic" },
234
{ EncodingType.WINDOWS_1252,
235
"WINDOWS-1252", "Western" },
236
{ EncodingType.WINDOWS_1253,
237
"WINDOWS-1253", "Greek" },
238
{ EncodingType.WINDOWS_1254,
239
"WINDOWS-1254", "Turkish" },
240
{ EncodingType.WINDOWS_1255,
241
"WINDOWS-1255", "Hebrew" },
242
{ EncodingType.WINDOWS_1256,
243
"WINDOWS-1256", "Arabic" },
244
{ EncodingType.WINDOWS_1257,
245
"WINDOWS-1257", "Baltic" },
246
{ EncodingType.WINDOWS_1258,
247
"WINDOWS-1258", "Vietnamese" }
250
/**
 * Attempts to convert //text// from //charset// to UTF-8.
 *
 * NOTE(review): presumably returns whether the conversion succeeded — confirm
 * against the full body.
 *
 * @param text the text to convert
 * @param charset name of the source character set passed to GLib.convert ()
 */
private static bool test (string text, string charset) {
255
// GLib.convert (str, len, to_codeset, from_codeset): -1 means the input is NUL-terminated.
convert = GLib.convert (text, -1, "UTF-8", charset);
264
/**
 * Detects the character set of the file at //path//.
 *
 * Runs the bundled chardetect.py script with the file path as its single
 * argument and uses the script's standard output as the charset name. When
 * the script cannot be started, prints nothing, or prints "error", the
 * fallback charset "ISO-8859-1" is returned instead.
 *
 * @param path path of the file to inspect
 * @return the detected charset name, or "ISO-8859-1" when detection fails
 */
public static string get_charset (string path) {
    const string FALLBACK_CHARSET = "ISO-8859-1";
    string script = Constants.SCRIPTDIR + "/chardetect.py";

    // Pass the arguments as a vector instead of a concatenated command line,
    // so quotes or spaces in the script directory or the file path cannot
    // break (or extend) the argument list.
    string[] argv = { "python", script, path.replace ("\\ ", " ") };
    string? output = null;

    try {
        GLib.Process.spawn_sync (null, argv, null, SpawnFlags.SEARCH_PATH, null, out output);
    } catch (SpawnError e) {
        warning ("Could not execute \"%s\": %s", script, e.message);
    }

    // The script's stdout normally ends with a newline; strip it so the value
    // is usable as a charset name and comparable with the "error" marker.
    string charset = (output != null) ? output.strip () : "";

    // Fall back only when detection actually failed. The previous check was
    // inverted and threw away every successfully detected charset.
    if (charset == "" || charset == "error") {
        warning ("Could not automatically detect encoding, assuming %s", FALLBACK_CHARSET);
        charset = FALLBACK_CHARSET; //TODO: prompt the user to meddle with encoding manually, until satisfied
    }

    debug ("Detected encoding of file \"%s\" to be \"%s\"", path, charset);

    return charset;
}
286
/**
 * Reads //file// and returns its text converted to UTF-8.
 *
 * The source charset comes from get_charset () applied to the file's path;
 * the file's input stream is then decoded through a CharsetConverter and
 * read line by line.
 *
 * @param file the file whose content is read and converted
 * @param content NOTE(review): purpose unclear from the read path — confirm against callers
 * @param mode defaults to "r"; per the inline note it selects read or write
 * @return the value of encoded_content, which is initialised to null
 */
public string? file_content_to_utf8 (File file, string content, string mode = "r" /* it means read or write */) {
288
string? encoding = null;
289
string? encoded_content = null;
291
encoding = get_charset (file.get_path ());
294
InputStream @is = file.read ();
295
// CharsetConverter (to_charset, from_charset): decode from the detected charset into UTF-8.
CharsetConverter iconverter = new CharsetConverter ("utf-8", encoding.down ());
296
ConverterInputStream @converted = new ConverterInputStream (@is, iconverter);
297
DataInputStream dis = new DataInputStream (@converted);
298
// NOTE(review): the loop condition below reassigns `line`; confirm this first
// line is stored before the loop so it is not dropped from the result.
string line = dis.read_line ();
300
while ((line = dis.read_line (null)) != null) {
303
encoded_content = str;
304
} catch (GLib.ConvertError ce) {
305
// NOTE(review): ce.message is used as the printf-style format string here; a '%'
// in the message would be misinterpreted — consider warning ("%s", ce.message).
warning (ce.message);
306
} catch (IOError e) {
312
return encoded_content;
b'\\ No newline at end of file'