2
2
* Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3
* Copyright (c) 1996-2009, The nkf Project.
3
* Copyright (c) 1996-2010, The nkf Project.
5
5
* This software is provided 'as-is', without any express or implied
6
6
* warranty. In no event will the authors be held liable for any damages
21
21
* 3. This notice may not be removed or altered from any source distribution.
23
#define NKF_VERSION "2.0.9"
24
#define NKF_RELEASE_DATE "2009-01-20"
23
#define NKF_VERSION "2.1.1"
24
#define NKF_RELEASE_DATE "2010-04-28"
25
25
#define COPY_RIGHT \
26
26
"Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27
"Copyright (C) 1996-2009, The nkf Project."
27
"Copyright (C) 1996-2010, The nkf Project."
29
29
#include "config.h"
210
210
} encoding_name_to_id_table[] = {
211
211
{"US-ASCII", ASCII},
212
212
{"ASCII", ASCII},
213
215
{"ISO-2022-JP", ISO_2022_JP},
214
216
{"ISO2022JP-CP932", CP50220},
215
217
{"CP50220", CP50220},
221
223
{"ISO-2022-JP-2004", ISO_2022_JP_2004},
222
224
{"SHIFT_JIS", SHIFT_JIS},
223
225
{"SJIS", SHIFT_JIS},
226
{"MS_Kanji", SHIFT_JIS},
224
228
{"WINDOWS-31J", WINDOWS_31J},
225
229
{"CSWINDOWS31J", WINDOWS_31J},
226
230
{"CP932", WINDOWS_31J},
295
299
&& (c != '(') && (c != ')') && (c != '.') && (c != 0x22)))
297
301
/* True when c2 lies in the inclusive range
 * [CP932_TABLE_BEGIN, CP932_TABLE_END].
 * The argument is parenthesized so that expressions such as `a | b` keep
 * their intended grouping; note that c2 is still evaluated twice, so do
 * not pass an expression with side effects. */
#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
298
#define nkf_byte_jisx0201_katakana_p(c) (SP <= c && c < (0xE0&0x7F))
302
/* True when c lies in the inclusive range [SP, 0x5F].
 * NOTE(review): presumably this is the 7-bit form of a JIS X 0201 katakana
 * byte -- confirm against callers.
 * The argument is parenthesized so compound expressions keep their
 * grouping; c is evaluated twice, so avoid arguments with side effects. */
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
300
304
/* Sizing unit for the hold buffer: hold_buf is declared with HOLD_SIZE*2
 * elements and push_hold_buf() checks hold_count against HOLD_SIZE*2. */
#define HOLD_SIZE 1024
301
305
#if defined(INT_IS_SHORT)
393
399
static int broken_f = FALSE; /* convert ESC-less broken JIS */
394
400
static int iso8859_f = FALSE; /* ISO8859 through */
395
401
static int mimeout_f = FALSE; /* base64 mode */
396
static int x0201_f = X0201_DEFAULT; /* convert JIS X 0201 */
402
static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */
397
403
static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
399
405
#ifdef UNICODE_NORMALIZATION
468
474
{"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
469
475
#ifdef UTF8_INPUT_ENABLE
470
476
{"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
477
{"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
478
{"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
480
{NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0}
475
483
static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
827
835
/* Yields the `len` member of the buffer that `buf` points to. */
#define nkf_buf_length(buf) ((buf)->len)
828
836
/* Non-zero when the `len` member of the buffer that `buf` points to is 0. */
#define nkf_buf_empty_p(buf) ((buf)->len == 0)
831
839
nkf_buf_at(nkf_buf_t *buf, int index)
833
841
assert(index <= buf->len);
874
882
fprintf(HELP_OUTPUT,
875
883
"Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
876
" j,s,e,w Output code is ISO-2022-JP, Shift JIS, EUC-JP, UTF-8N\n"
877
884
#ifdef UTF8_OUTPUT_ENABLE
878
" After 'w' you can add more options. -w[ 8 [0], 16 [[BL] [0]] ]\n"
885
" j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
886
" UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
880
" J,S,E,W Input assumption is JIS 7 bit , Shift JIS, EUC-JP, UTF-8\n"
881
889
#ifdef UTF8_INPUT_ENABLE
882
" After 'W' you can add more options. -W[ 8, 16 [BL] ] \n"
890
" J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
891
" UTF option is -W[8,[16,32][B,L]]\n"
893
" J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
885
896
fprintf(HELP_OUTPUT,
886
" m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:non-strict,0:no decode]\n"
897
" m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
887
898
" M[BQ] MIME encode [B:base64 Q:quoted]\n"
888
899
" f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
891
902
" Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
892
903
" 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
893
904
" 4: JISX0208 Katakana to JISX0201 Katakana\n"
894
" X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"
905
" X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n"
896
907
fprintf(HELP_OUTPUT,
897
908
" O Output to File (DEFAULT 'nkf.out')\n"
898
909
" L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
900
911
fprintf(HELP_OUTPUT,
901
"Long name options\n"
902
" --ic=<input codeset> --oc=<output codeset>\n"
903
" Specify the input or output codeset\n"
904
" --hiragana --katakana --katakana-hiragana\n"
905
" To Hiragana/Katakana Conversion\n"
912
" --ic=<encoding> Specify the input encoding\n"
913
" --oc=<encoding> Specify the output encoding\n"
914
" --hiragana --katakana Hiragana/Katakana Conversion\n"
915
" --katakana-hiragana Converts each other\n"
907
917
fprintf(HELP_OUTPUT,
908
918
#ifdef INPUT_OPTION
909
" --cap-input, --url-input Convert hex after ':' or '%%'\n"
919
" --{cap, url}-input Convert hex after ':' or '%%'\n"
911
921
#ifdef NUMCHAR_OPTION
912
" --numchar-input Convert Unicode Character Reference\n"
922
" --numchar-input Convert Unicode Character Reference\n"
914
924
#ifdef UTF8_INPUT_ENABLE
915
925
" --fb-{skip, html, xml, perl, java, subchar}\n"
916
" Specify how nkf handles unassigned characters\n"
926
" Specify unassigned character's replacement\n"
919
929
fprintf(HELP_OUTPUT,
921
" --in-place[=SUF] Overwrite original listed files by filtered result\n"
922
" --overwrite[=SUF] in-place and preserve timestamp of original files\n"
931
" --in-place[=SUF] Overwrite original files\n"
932
" --overwrite[=SUF] Preserve timestamp of original files\n"
924
" -g --guess Guess the input code\n"
925
" -v --version print the version\n"
926
" --help/-V print this help / configuration\n"
934
" -g --guess Guess the input code\n"
935
" -v --version Print the version\n"
936
" --help/-V Print this help / configuration\n"
1407
if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1385
1408
x0212_f = TRUE;
1386
1409
#ifdef UTF8_OUTPUT_ENABLE
1387
1410
ms_ucs_map_f = UCS_MAP_MS;
1390
1413
case EUCJP_ASCII:
1414
if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1391
1415
x0212_f = TRUE;
1392
1416
#ifdef UTF8_OUTPUT_ENABLE
1393
1417
ms_ucs_map_f = UCS_MAP_ASCII;
1650
1675
*p3 = 0x80 | ( val & 0x3f);
1652
1677
} else if (nkf_char_unicode_value_p(val)) {
1653
*p1 = 0xe0 | (val >> 16);
1678
*p1 = 0xf0 | (val >> 18);
1654
1679
*p2 = 0x80 | ((val >> 12) & 0x3f);
1655
1680
*p3 = 0x80 | ((val >> 6) & 0x3f);
1656
1681
*p4 = 0x80 | ( val & 0x3f);
2180
#define NKF_ICONV_NEED_ONE_MORE_BYTE -1
2181
#define NKF_ICONV_NEED_TWO_MORE_BYTES -2
2205
/* Sentinel with the value (size_t)-1.
 * NOTE(review): presumably returned by the UTF-16/UTF-32 converters when
 * one more input byte is required -- confirm against nkf_iconv_utf_16().
 * The whole replacement list is parenthesized so the cast cannot be split
 * by a neighbouring operator (e.g. `sizeof MACRO`). */
#define NKF_ICONV_NEED_ONE_MORE_BYTE ((size_t)-1)
2206
/* Sentinel with the value (size_t)-2; the UTF-16 input loop compares the
 * result of nkf_iconv_utf_16() against it before reading two further bytes.
 * The whole replacement list is parenthesized so the cast cannot be split
 * by a neighbouring operator (e.g. `sizeof MACRO`). */
#define NKF_ICONV_NEED_TWO_MORE_BYTES ((size_t)-2)
2182
2207
/* Combine a UTF-16 lead/trail surrogate pair into a single code point.
 * This is the usual formula
 *     0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00)
 * with the constant terms folded together:
 *     (0xD800 << 10) + 0xDC00 - 0x10000 == 0x35FDC00. */
#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
2184
2209
nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2214
2239
static nkf_char
2215
2240
w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0)
2243
return 16; /* different from w_iconv32 */
2220
2246
static nkf_char
2221
2247
w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0)
2250
return 32; /* different from w_iconv16 */
3035
3062
#endif /*WIN32DLL*/
3037
static unsigned char hold_buf[HOLD_SIZE*2];
3064
static nkf_char hold_buf[HOLD_SIZE*2];
3038
3065
static int hold_count = 0;
3039
3066
static nkf_char
3040
3067
push_hold_buf(nkf_char c2)
3042
3069
if (hold_count >= HOLD_SIZE*2)
3044
hold_buf[hold_count++] = (unsigned char)c2;
3071
hold_buf[hold_count++] = c2;
3045
3072
return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3049
h_conv(FILE *f, int c1, int c2)
3076
h_conv(FILE *f, nkf_char c1, nkf_char c2)
3052
3079
int hold_index;
3055
3082
/** it must NOT be in the kanji shift sequence */
3056
3083
/** it must NOT be written in JIS7 */
3126
3157
} else if ((c3 = (*i_getc)(f)) == EOF) {
3131
if (hold_index < hold_count){
3132
c4 = hold_buf[hold_index++];
3133
} else if ((c4 = (*i_getc)(f)) == EOF) {
3138
(*iconv)(c1, c2, (c3<<8)|c4);
3162
if (hold_index < hold_count){
3163
c4 = hold_buf[hold_index++];
3164
} else if ((c4 = (*i_getc)(f)) == EOF) {
3169
(*iconv)(c1, c2, (c3<<8)|c4);
3143
3172
/* 3 bytes EUC or UTF-8 */
3337
3366
else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3370
put_newline(void (*func)(nkf_char))
3372
switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3387
oconv_newline(void (*func)(nkf_char, nkf_char))
3389
switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3341
3404
Return value of fold_conv()
3802
3865
(const unsigned char *)"\075?ISO-8859-1?Q?",
3803
3866
(const unsigned char *)"\075?ISO-8859-1?B?",
3804
3867
(const unsigned char *)"\075?ISO-2022-JP?B?",
3868
(const unsigned char *)"\075?ISO-2022-JP?B?",
3805
3869
(const unsigned char *)"\075?ISO-2022-JP?Q?",
3806
3870
#if defined(UTF8_INPUT_ENABLE)
3807
3871
(const unsigned char *)"\075?UTF-8?B?",
3824
3888
static const nkf_char mime_encode[] = {
3825
EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K,
3889
EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K,
3826
3890
#if defined(UTF8_INPUT_ENABLE)
3833
3897
static const nkf_char mime_encode_method[] = {
3834
'B', 'B','Q', 'B', 'B', 'Q',
3898
'B', 'B','Q', 'B', 'B', 'B', 'Q',
3835
3899
#if defined(UTF8_INPUT_ENABLE)
4426
4490
if (c1=='='&&c2<SP) { /* this is soft wrap */
4427
4491
while((c1 = (*i_mgetc)(f)) <=SP) {
4428
if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4492
if (c1 == EOF) return (EOF);
4430
4494
mime_decode_mode = 'Q'; /* still in MIME */
4431
4495
goto restart_mime_q;
4630
4694
if (c2 == EOF){
4631
4695
if (base64_count + mimeout_state.count/3*4> 73){
4632
4696
(*o_base64conv)(EOF,0);
4633
OCONV_NEWLINE((*o_base64conv));
4697
oconv_newline(o_base64conv);
4634
4698
(*o_base64conv)(0,SP);
4635
4699
base64_count = 1;
4638
if (base64_count + mimeout_state.count/3*4> 66) {
4702
if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) {
4639
4703
(*o_base64conv)(EOF,0);
4640
OCONV_NEWLINE((*o_base64conv));
4704
oconv_newline(o_base64conv);
4641
4705
(*o_base64conv)(0,SP);
4642
4706
base64_count = 1;
4643
4707
mimeout_mode = -1;
4935
5002
/* mimeout_mode == 'B', 1, 2 */
4936
if ( c<=DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5003
if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5004
output_mode == UTF_8)) {
4937
5005
if (lastchar == CR || lastchar == LF){
4938
5006
if (nkf_isblank(c)) {
4939
5007
for (i=0;i<mimeout_state.count;i++) {
4940
5008
mimeout_addchar(mimeout_state.buf[i]);
4942
5010
mimeout_state.count = 0;
4943
} else if (SP<c && c<DEL) {
4945
5013
for (i=0;i<mimeout_state.count;i++) {
4946
5014
(*o_mputc)(mimeout_state.buf[i]);
5236
5304
set_output_encoding(output_encoding);
5237
5305
oconv = nkf_enc_to_oconv(output_encoding);
5238
5306
o_putc = std_putc;
5307
if (nkf_enc_unicode_p(output_encoding))
5308
output_mode = UTF_8;
5310
if (x0201_f == NKF_UNSPECIFIED) {
5311
x0201_f = X0201_DEFAULT;
5240
5314
/* replace continuation module, from output side */
5344
5418
#define NEXT continue /* no output, get next */
5345
5419
/* Expands to TWO statements (`c2=0;` followed by `continue`).  Use it only
 * where a statement sequence is allowed (e.g. inside braces): as the
 * unbraced body of an `if`, only `c2=0` would be conditional.  It cannot be
 * wrapped in do { } while (0) because `continue` would then target that
 * inner loop instead of the enclosing one. */
#define SKIP c2=0;continue /* no output, get next */
5346
5420
/* Expands to TWO statements (`c2=c1;` followed by `continue`).  Use it only
 * where a statement sequence is allowed (e.g. inside braces): as the
 * unbraced body of an `if`, only `c2=c1` would be conditional.  It cannot be
 * wrapped in do { } while (0) because `continue` would then target that
 * inner loop instead of the enclosing one. */
#define MORE c2=c1;continue /* need one more byte */
5347
#define SEND ; /* output c1 and c2, get next */
5421
#define SEND (void)0 /* output c1 and c2, get next */
5348
5422
#define LAST break /* end of loop, go closing */
5349
5423
#define set_input_mode(mode) do { \
5350
5424
input_mode = mode; \
5384
5458
(c4 = (*i_getc)(f)) != EOF) {
5385
5459
nkf_iconv_utf_32(c1, c2, c3, c4);
5387
(*i_ungetc)(EOF, f);
5389
5463
else if (iconv == w_iconv16) {
5390
5464
while ((c1 = (*i_getc)(f)) != EOF &&
5391
5465
(c2 = (*i_getc)(f)) != EOF) {
5392
if (nkf_iconv_utf_16(c1, c2, 0, 0) == -2 &&
5466
if (nkf_iconv_utf_16(c1, c2, 0, 0) == NKF_ICONV_NEED_TWO_MORE_BYTES &&
5393
5467
(c3 = (*i_getc)(f)) != EOF &&
5394
5468
(c4 = (*i_getc)(f)) != EOF) {
5395
5469
nkf_iconv_utf_16(c1, c2, c3, c4);
5398
(*i_ungetc)(EOF, f);
5442
5516
if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5519
}else if (input_codename && input_codename[0] == 'I' &&
5520
0xA1 <= c1 && c1 <= 0xDF) {
5521
/* JIS X 0201 Katakana in 8bit JIS */
5522
c2 = JIS_X_0201_1976_K;
5445
5525
} else if (c1 > DEL) {
5446
5526
/* 8 bit code */
5447
5527
if (!estab_f && !iso8859_f) {
5822
5903
option_mode = 1;
5825
for (i=0;i<sizeof(long_option)/sizeof(long_option[0]);i++) {
5906
for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) {
5826
5907
p = (unsigned char *)long_option[i].name;
5827
5908
for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
5828
5909
if (*p == cp[j] || cp[j] == SP){
6155
6236
if (cp[0]=='L') {
6157
6238
output_endian = ENDIAN_LITTLE;
6239
output_bom_f = TRUE;
6158
6240
} else if (cp[0] == 'B') {
6161
output_encoding = nkf_enc_from_index(enc_idx);
6242
output_bom_f = TRUE;
6164
6244
if (cp[0] == '0'){
6245
output_bom_f = FALSE;
6166
6247
enc_idx = enc_idx == UTF_16
6167
6248
? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6168
6249
: (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6170
output_bom_f = TRUE;
6171
6251
enc_idx = enc_idx == UTF_16
6172
6252
? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6173
6253
: (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6227
6307
bit:3 Convert HTML Entity
6228
6308
bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6230
while ('0'<= *cp && *cp <='9') {
6310
while ('0'<= *cp && *cp <='4') {
6231
6311
alpha_f |= 1 << (*cp++ - '0');
6233
if (!alpha_f) alpha_f = 1;
6235
6315
case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6236
6316
x0201_f = FALSE; /* No X0201->X0208 conversion */