73
73
static void init_encodings(void)
75
fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
76
fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
77
fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
78
fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
79
fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
80
fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
81
fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
82
fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
83
fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));
75
fill(0, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_14, "ISO-8859-14", _("Celtic"));
76
fill(1, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_7, "ISO-8859-7", _("Greek"));
77
fill(2, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1253, "WINDOWS-1253", _("Greek"));
78
fill(3, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_10, "ISO-8859-10", _("Nordic"));
79
fill(4, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_3, "ISO-8859-3", _("South European"));
80
fill(5, WESTEUROPEAN, GEANY_ENCODING_IBM_850, "IBM850", _("Western"));
81
fill(6, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_1, "ISO-8859-1", _("Western"));
82
fill(7, WESTEUROPEAN, GEANY_ENCODING_ISO_8859_15, "ISO-8859-15", _("Western"));
83
fill(8, WESTEUROPEAN, GEANY_ENCODING_WINDOWS_1252, "WINDOWS-1252", _("Western"));
85
fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
86
fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
87
fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
88
fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
89
fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
90
fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
91
fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
92
fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
85
fill(0, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_4, "ISO-8859-4", _("Baltic"));
86
fill(1, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_13, "ISO-8859-13", _("Baltic"));
87
fill(2, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1257, "WINDOWS-1257", _("Baltic"));
88
fill(3, EASTEUROPEAN, GEANY_ENCODING_IBM_852, "IBM852", _("Central European"));
89
fill(4, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_2, "ISO-8859-2", _("Central European"));
90
fill(5, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1250, "WINDOWS-1250", _("Central European"));
91
fill(6, EASTEUROPEAN, GEANY_ENCODING_IBM_855, "IBM855", _("Cyrillic"));
92
fill(7, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_5, "ISO-8859-5", _("Cyrillic"));
93
93
/* ISO-IR-111 not available on Windows */
94
fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
95
fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
96
fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
97
fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
98
fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
99
fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));
94
fill(8, EASTEUROPEAN, GEANY_ENCODING_ISO_IR_111, "ISO-IR-111", _("Cyrillic"));
95
fill(9, EASTEUROPEAN, GEANY_ENCODING_KOI8_R, "KOI8-R", _("Cyrillic"));
96
fill(10, EASTEUROPEAN, GEANY_ENCODING_WINDOWS_1251, "WINDOWS-1251", _("Cyrillic"));
97
fill(11, EASTEUROPEAN, GEANY_ENCODING_CP_866, "CP866", _("Cyrillic/Russian"));
98
fill(12, EASTEUROPEAN, GEANY_ENCODING_KOI8_U, "KOI8-U", _("Cyrillic/Ukrainian"));
99
fill(13, EASTEUROPEAN, GEANY_ENCODING_ISO_8859_16, "ISO-8859-16", _("Romanian"));
101
fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
102
fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
103
fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
104
fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
101
fill(0, MIDDLEEASTERN, GEANY_ENCODING_IBM_864, "IBM864", _("Arabic"));
102
fill(1, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_6, "ISO-8859-6", _("Arabic"));
103
fill(2, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1256, "WINDOWS-1256", _("Arabic"));
104
fill(3, MIDDLEEASTERN, GEANY_ENCODING_IBM_862, "IBM862", _("Hebrew"));
105
105
/* not available at all, ? */
106
fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
107
fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
108
fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));
110
fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
111
fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
112
fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
113
fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
114
fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
115
fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
116
fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
117
fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
118
fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));
120
fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
121
fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
122
fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
123
fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
124
fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
125
fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
126
fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
127
fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));
129
fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
130
fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
131
fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
106
fill(4, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8_I, "ISO-8859-8-I", _("Hebrew"));
107
fill(5, MIDDLEEASTERN, GEANY_ENCODING_WINDOWS_1255, "WINDOWS-1255", _("Hebrew"));
108
fill(6, MIDDLEEASTERN, GEANY_ENCODING_ISO_8859_8, "ISO-8859-8", _("Hebrew Visual"));
110
fill(0, ASIAN, GEANY_ENCODING_ARMSCII_8, "ARMSCII-8", _("Armenian"));
111
fill(1, ASIAN, GEANY_ENCODING_GEOSTD8, "GEORGIAN-ACADEMY", _("Georgian"));
112
fill(2, ASIAN, GEANY_ENCODING_TIS_620, "TIS-620", _("Thai"));
113
fill(3, ASIAN, GEANY_ENCODING_IBM_857, "IBM857", _("Turkish"));
114
fill(4, ASIAN, GEANY_ENCODING_WINDOWS_1254, "WINDOWS-1254", _("Turkish"));
115
fill(5, ASIAN, GEANY_ENCODING_ISO_8859_9, "ISO-8859-9", _("Turkish"));
116
fill(6, ASIAN, GEANY_ENCODING_TCVN, "TCVN", _("Vietnamese"));
117
fill(7, ASIAN, GEANY_ENCODING_VISCII, "VISCII", _("Vietnamese"));
118
fill(8, ASIAN, GEANY_ENCODING_WINDOWS_1258, "WINDOWS-1258", _("Vietnamese"));
120
fill(0, UNICODE, GEANY_ENCODING_UTF_7, "UTF-7", _("Unicode"));
121
fill(1, UNICODE, GEANY_ENCODING_UTF_8, "UTF-8", _("Unicode"));
122
fill(2, UNICODE, GEANY_ENCODING_UTF_16LE, "UTF-16LE", _("Unicode"));
123
fill(3, UNICODE, GEANY_ENCODING_UTF_16BE, "UTF-16BE", _("Unicode"));
124
fill(4, UNICODE, GEANY_ENCODING_UCS_2LE, "UCS-2LE", _("Unicode"));
125
fill(5, UNICODE, GEANY_ENCODING_UCS_2BE, "UCS-2BE", _("Unicode"));
126
fill(6, UNICODE, GEANY_ENCODING_UTF_32LE, "UTF-32LE", _("Unicode"));
127
fill(7, UNICODE, GEANY_ENCODING_UTF_32BE, "UTF-32BE", _("Unicode"));
129
fill(0, EASTASIAN, GEANY_ENCODING_GB18030, "GB18030", _("Chinese Simplified"));
130
fill(1, EASTASIAN, GEANY_ENCODING_GB2312, "GB2312", _("Chinese Simplified"));
131
fill(2, EASTASIAN, GEANY_ENCODING_GBK, "GBK", _("Chinese Simplified"));
132
132
/* maybe not available on Linux */
133
fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
134
fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
135
fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
136
fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
137
fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
138
fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
139
fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
140
fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
141
fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
142
fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
143
fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
144
fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));
146
fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
133
fill(3, EASTASIAN, GEANY_ENCODING_HZ, "HZ", _("Chinese Simplified"));
134
fill(4, EASTASIAN, GEANY_ENCODING_BIG5, "BIG5", _("Chinese Traditional"));
135
fill(5, EASTASIAN, GEANY_ENCODING_BIG5_HKSCS, "BIG5-HKSCS", _("Chinese Traditional"));
136
fill(6, EASTASIAN, GEANY_ENCODING_EUC_TW, "EUC-TW", _("Chinese Traditional"));
137
fill(7, EASTASIAN, GEANY_ENCODING_EUC_JP, "EUC-JP", _("Japanese"));
138
fill(8, EASTASIAN, GEANY_ENCODING_ISO_2022_JP, "ISO-2022-JP", _("Japanese"));
139
fill(9, EASTASIAN, GEANY_ENCODING_SHIFT_JIS, "SHIFT_JIS", _("Japanese"));
140
fill(10, EASTASIAN, GEANY_ENCODING_CP_932, "CP932", _("Japanese"));
141
fill(11, EASTASIAN, GEANY_ENCODING_EUC_KR, "EUC-KR", _("Korean"));
142
fill(12, EASTASIAN, GEANY_ENCODING_ISO_2022_KR, "ISO-2022-KR", _("Korean"));
143
fill(13, EASTASIAN, GEANY_ENCODING_JOHAB, "JOHAB", _("Korean"));
144
fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean"));
146
fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding"));
150
/* compares two encoding names in a permissive fashion.
151
* e.g. "utf8" matches "UTF-8", "iso8859_1" matches "ISO-8859-1", etc. */
152
static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
154
gboolean was_alpha = FALSE; /* whether last character of previous word was a letter */
155
gboolean need_sep = FALSE; /* whether we're expecting an implicit separator */
161
if (g_ascii_toupper(*a) == g_ascii_toupper(*b) &&
162
((is_alpha = g_ascii_isalpha(*a)) || g_ascii_isdigit(*a)))
164
/* either there was a real separator, or we need an implicit one (a change from alpha to
166
if (! need_sep || (was_alpha != is_alpha))
170
was_alpha = is_alpha;
180
if (! g_ascii_isalnum(*a))
185
if (! g_ascii_isalnum(*b))
764
gchar *data; /* null-terminated data */
765
gsize size; /* actual data size */
766
gsize len; /* string length of data */
773
/* convert data with the specified encoding */
775
handle_forced_encoding(BufferData *buffer, const gchar *forced_enc)
777
GeanyEncodingIndex enc_idx;
779
if (utils_str_equal(forced_enc, "UTF-8"))
781
if (! g_utf8_validate(buffer->data, buffer->len, NULL))
788
gchar *converted_text = encodings_convert_to_utf8_from_charset(
789
buffer->data, buffer->size, forced_enc, FALSE);
790
if (converted_text == NULL)
796
setptr(buffer->data, converted_text);
797
buffer->len = strlen(converted_text);
800
enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
801
buffer->bom = (enc_idx == GEANY_ENCODING_UTF_8);
802
buffer->enc = g_strdup(forced_enc);
807
/* detect encoding and convert to UTF-8 if necessary */
809
handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx)
811
g_return_val_if_fail(buffer->enc == NULL, FALSE);
812
g_return_val_if_fail(buffer->bom == FALSE, FALSE);
814
if (buffer->size == 0)
816
/* we have no data so assume UTF-8, buffer->len can be 0 even if we have an empty
817
* e.g. UTF32 file with a BOM (so size is 4, len is 0) */
818
buffer->enc = g_strdup("UTF-8");
822
/* first check for a BOM */
823
if (enc_idx != GEANY_ENCODING_NONE)
825
buffer->enc = g_strdup(encodings[enc_idx].charset);
828
if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */
830
gchar *converted_text = encodings_convert_to_utf8_from_charset(
831
buffer->data, buffer->size, buffer->enc, FALSE);
832
if (converted_text != NULL)
834
setptr(buffer->data, converted_text);
835
buffer->len = strlen(converted_text);
839
/* there was a problem converting data from BOM encoding type */
840
setptr(buffer->enc, NULL);
846
if (buffer->enc == NULL) /* either there was no BOM or the BOM encoding failed */
848
/* first try to read the encoding from the file content */
849
gchar *regex_charset = encodings_check_regexes(buffer->data, buffer->size);
851
/* try UTF-8 first */
852
if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 &&
853
(buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL))
855
buffer->enc = g_strdup("UTF-8");
859
/* detect the encoding */
860
gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data,
861
buffer->size, regex_charset, &buffer->enc);
863
if (converted_text == NULL)
865
g_free(regex_charset);
868
setptr(buffer->data, converted_text);
869
buffer->len = strlen(converted_text);
871
g_free(regex_charset);
879
handle_bom(BufferData *buffer)
883
encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len);
884
g_return_if_fail(bom_len != 0);
886
/* use buffer->len here because the contents are already converted into UTF-8 */
887
buffer->len -= bom_len;
888
/* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */
889
g_memmove(buffer->data, buffer->data + bom_len, buffer->len + 1);
890
buffer->data = g_realloc(buffer->data, buffer->len + 1);
894
/* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */
895
static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc)
897
GeanyEncodingIndex tmp_enc_idx;
899
/* temporarily retrieve the encoding idx based on the BOM to suppress the following warning
900
* if we have a BOM */
901
tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL);
903
/* check whether the size of the loaded data is equal to the size of the file in the
904
* filesystem; the file size may be 0 to allow opening files in /proc/ which typically have a
905
* file size of 0 bytes */
906
if (buffer->len != buffer->size && buffer->size != 0 && (
907
tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */
908
tmp_enc_idx == GEANY_ENCODING_UTF_7)) /* filter UTF-7/8 where no NULL bytes are allowed */
910
buffer->partial = TRUE;
913
/* Determine character encoding and convert to UTF-8 */
914
if (forced_enc != NULL)
916
/* the encoding should be ignored (requested by the user), so open the file "as it is" */
917
if (utils_str_equal(forced_enc, encodings[GEANY_ENCODING_NONE].charset))
920
buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset);
922
else if (! handle_forced_encoding(buffer, forced_enc))
927
else if (! handle_encoding(buffer, tmp_enc_idx))
939
* Tries to convert @a buffer into UTF-8 encoding. Unlike encodings_convert_to_utf8()
940
* and encodings_convert_to_utf8_from_charset() it handles the possible BOM in the data.
942
* @param buf a pointer to a modifiable null-terminated buffer to convert.
943
* It may or may not be modified, and should be freed whatever happens.
944
* @param size a pointer to the size of the buffer (expected to be e.g. the on-disk
945
* file size). It will be updated to the new size.
946
* @param forced_enc forced encoding to use, or @c NULL
947
* @param used_encoding return location for the actually used encoding, or @c NULL
948
* @param has_bom return location to store whether the data had a BOM, or @c NULL
949
* @param partial return location to store whether the conversion may be partial, or @c NULL
951
* @return @c TRUE if the conversion succeeded, @c FALSE otherwise.
953
gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc,
954
gchar **used_encoding, gboolean *has_bom, gboolean *partial)
960
/* use strlen to check for null chars */
961
buffer.len = strlen(buffer.data);
964
buffer.partial = FALSE;
966
if (! handle_buffer(&buffer, forced_enc))
971
*used_encoding = buffer.enc;
975
*has_bom = buffer.bom;
977
*partial = buffer.partial;