/* Copyright (C) 1995 Bjoern Beutel. */ /* Description. =============================================================*/ /* This module handles the conversion of KSC5601 hangul characters and * romanised Hangul to internal Malaga format (and back). * * In this conversion module, we use four representations of * Hangul characters and syllables: * 1. KSC5601 code, used by a couple of programs (MULE for example), * which codes single letters as well as whole syllables by codes * that use two characters in the range of 0xa1 to 0xfe. * 2. Trigem code, which decodes syllables and letters by a 16-bit word * divided as follows: * (MSB) | S | I I I I I | V V V V V | F F F F F | (LSB) * | 15 | 14 13 12 11 10 | 9 8 7 6 5 | 4 3 2 1 0 | * where: "S" is set if the code represents a syllable (no single letter). * "IIIII" is the code of the initial consonant(s). * (See "initial_consonants".) * "VVVVV" is the code of the vowel (see "vowels"). * "FFFFF" is the code of the final consonant(s). * (See "final_consonants".) * 3. Hancode, which is the internal, letter oriented code. The letters are * represented by codes in the range of 0x81-0x9a, and 0x80 marks the * beginning of a syllable. * 4. Roman code, which is similar to Hancode, but it uses the latin letters * and adopts the Yale standard for Hangul romanization. * Here, every syllable begins with a dot ".". */ /* Includes. ================================================================*/ #include #include #include #include #include #include "basic.h" #include "pools.h" #include "tries.h" #include "ksc_table.h" #include "hangul.h" /* Constants. ===============================================================*/ /* Hancode representation of Hangul characters as strings */ #define DOT "\x80" #define A "\x81" #define B "\x82" #define C "\x83" #define D "\x84" #define E "\x85" #define G "\x87" #define H "\x88" #define I "\x89" #define K "\x8b" #define L "\x8c" #define M "\x8d" #define N "\x8e" #define O "\x8f" #define P "\x90" #define S "\x93" #define T "\x94" #define U "\x95" #define W "\x97" #define X "\x98" #define Z "\x9a" #define NUL "" /* Hancode representation of Hangul characters as individual characters */ #define DOT_C '\x80' #define A_C '\x81' #define B_C '\x82' #define C_C '\x83' #define D_C '\x84' #define E_C '\x85' #define G_C '\x87' #define H_C '\x88' #define I_C '\x89' #define K_C '\x8b' #define L_C '\x8c' #define M_C '\x8d' #define N_C '\x8e' #define O_C '\x8f' #define P_C '\x90' #define S_C '\x93' #define T_C '\x94' #define U_C '\x95' #define W_C '\x97' #define X_C '\x98' #define Z_C '\x9a' #define VOWELS A E I O U W /* The initial consonants in a syllable. */ static string_t initial_consonants[32] = { NUL, NUL, K, K K, N, T, T T, L, M, P, P P, S, S S, X, C, C C, Z, G, D, B, H, NUL, NUL, NUL, NUL, NUL, NUL, NUL, NUL, NUL, NUL, NUL }; /* The vowels in a syllable. */ static string_t vowels[32] = { NUL, NUL, NUL, A, A I, I A, I A I, E, NUL, NUL, E I, I E, I E I, O, O A, O I E, NUL, NUL, O I, I O, W, W E, W E I, W I, NUL, NUL, I W, U, U I, I, NUL, NUL }; /* The final consonants in a syllable. */ static string_t final_consonants[32] = { NUL, NUL, K, K K, K S, N, N C, N H, T, L, L K, L M, L P, L S, L D, L B, L H, M, NUL, P, P S, S, S S, X, C, Z, G, D, B, H, NUL, NUL }; /* Macros. ==================================================================*/ #define SYLLABLE(trigem) (((trigem) & 0x8000) != 0) #define INITIAL_CONSONANT(trigem) (initial_consonants[ ((trigem) >> 10) & 31 ]) #define VOWEL(trigem) (vowels[ ((trigem) >> 5) & 31 ]) #define FINAL_CONSONANT(trigem) (final_consonants[ (trigem) & 31 ]) /* Global variables. ========================================================*/ bool_t convert_to_ksc; /* Indicates whether Hangul output is converted to KSC5601 * (else output is converted to romanised Hangul). */ /* Variables. ===============================================================*/ static string_t hancode[ KSC_TABLE_SIZE ]; /* The Hancode for the KSC table. */ static pool_t string_pool; /* String pool with all Hancode syllables. */ static int_t *hancode_trie; /* Trie used to segmentise Hancode syllables. */ static int_t hancode_trie_root; /* Root node index of HANCODE_TRIE. */ /* Functions. ===============================================================*/ static int compare_hancode_entries( const void *entry1, const void *entry2 ) /* Compare two trie entries. */ { return strcmp_no_case( ((trie_entry_t *) entry1)->key, ((trie_entry_t *) entry2)->key ); } /*---------------------------------------------------------------------------*/ void init_hangul( void ) /* Initialise the hangul module. */ { trie_entry_t hancodes[ KSC_TABLE_SIZE ]; /* KSC/Hancode pairs for trie. */ int_t i, trigem; pool_t hancode_trie_pool; text_t *text; if (! hangul) return; text = new_text(); string_pool = new_pool( sizeof( char_t ) ); /* Build Hancode strings from KSC_TABLE; copy entries to HANCODES. */ for (i = 0; i < KSC_TABLE_SIZE; i++) { /* Build Hancode string out of Trigem code. */ trigem = ksc_table[i]; clear_text( text ); if (SYLLABLE( trigem )) add_char_to_text( text, DOT_C ); add_to_text( text, INITIAL_CONSONANT( trigem ) ); add_to_text( text, VOWEL( trigem ) ); add_to_text( text, FINAL_CONSONANT( trigem ) ); hancode[i] = copy_string_to_pool( string_pool, text->buffer, NULL ); /* Copy to trie entry table. */ hancodes[i].key = hancode[i]; hancodes[i].content = table_index_to_ksc( i ); } free_text( &text ); /* Sort the Hancode strings and build the trie. */ qsort( hancodes, KSC_TABLE_SIZE, sizeof( trie_entry_t ), compare_hancode_entries); new_trie( KSC_TABLE_SIZE, hancodes, &hancode_trie_pool, &hancode_trie_root ); hancode_trie = pool_to_vector( hancode_trie_pool ); free_pool( &hancode_trie_pool ); convert_to_ksc = FALSE; } /*---------------------------------------------------------------------------*/ void terminate_hangul( void ) /* Terminate the hangul module. */ { if (! hangul) return; free_mem( &hancode_trie ); free_pool( &string_pool ); } /* Conversion of Hancode to romanised Hangul. ===============================*/ static string_t hancode_to_roman( string_t hancode_string ) /* Convert Hancode string HANCODE_STRING to romanised Hangul. */ { /* Modified Yale roman representation for each of the Hancode letters. */ static string_t romans[27] = { ".", "a", "ph", "c", "th", "e", "", "kh", "h", "i", "", "k", "l", "m", "n", "o", "p", "", "", "s", "t", "u", "", "wu", "ng", "", "ch" }; string_t hancode_string_p, roman_segment; text_t *roman_text; roman_text = new_text(); hancode_string_p = hancode_string; while (*hancode_string_p != EOS) { if (ORD( *hancode_string_p ) >= ORD( DOT_C ) && ORD( *hancode_string_p ) <= ORD( Z_C )) { /* Convert this character. */ add_char_to_text( roman_text, '{' ); while (ORD( *hancode_string_p ) >= ORD( DOT_C ) && ORD( *hancode_string_p ) <= ORD( Z_C )) { roman_segment = romans[ ORD( *hancode_string_p ) - ORD( DOT_C ) ]; if (*roman_segment == EOS) complain( "Internal error." ); /* Handle some special cases. */ switch (*hancode_string_p) { case X_C: if (hancode_string_p > hancode_string && hancode_string_p[-1] == DOT_C) { roman_segment = ""; } break; case I_C: if ((hancode_string_p > hancode_string && strchr( VOWELS, hancode_string_p[-1] ) != NULL) || (hancode_string_p[1] != EOS && strchr( VOWELS, hancode_string_p[1] ) != NULL)) { roman_segment = "y"; } break; case O_C: if (hancode_string_p[1] == A_C) roman_segment = "w"; break; case W_C: if (hancode_string_p > hancode_string && hancode_string_p[-1] == I_C) roman_segment = "u"; else if (hancode_string_p[1] != EOS && strchr( VOWELS, hancode_string_p[1] ) != NULL) { roman_segment = "w"; } break; default: break; } add_to_text( roman_text, roman_segment ); hancode_string_p++; } add_char_to_text( roman_text, '}' ); } else add_char_to_text( roman_text, *hancode_string_p++ ); } return text_to_string( &roman_text ); } /* Conversion of romanised Hangul to Hancode. ===============================*/ static string_t roman_to_hancode( string_t roman_string ) /* Convert romanised Hangul string ROMAN_STRING to Hancode. */ { /* All letter sequences that can be converted to hancode. */ static struct {string_t roman; string_t hancode;} romans[] = { /* Two-letter strings must come first. */ {"ch", Z}, {"kh", G}, {"th", D}, {"ph", B}, {"wu", W}, {"ng", X}, {"wa", O A}, {"yu", I W}, {"a", A}, {"c", C}, {"e", E}, {"h", H}, {"i", I}, {"k", K}, {"l", L}, {"m", M}, {"n", N}, {"o", O}, {"p", P}, {"r", L}, {"s", S}, {"t", T}, {"u", U}, {"w", W}, {"x", X}, {"y", I}, {".", DOT}, {NULL, NULL} }; string_t roman_string_p; int_t i; text_t *hancode_text; hancode_text = new_text(); roman_string_p = roman_string; while (*roman_string_p != EOS) { if (*roman_string_p == '{') { roman_string_p++; while (*roman_string_p != '}') { if (*roman_string_p == EOS) complain( "Missing \"}\" in romanised Hangul." ); /* Insert an "x" at beginning of syllable if vowel is following. */ if (roman_string_p[-1] == '.' && strrchr( "aeiouwy", TO_LOWER( roman_string_p[0] ) ) != NULL) { add_char_to_text( hancode_text, X_C ); } for (i = 0; romans[i].roman != NULL; i++) { if (strncmp_no_case( roman_string_p, romans[i].roman, strlen( romans[i].roman ) ) == 0) { add_to_text( hancode_text, romans[i].hancode ); roman_string_p += strlen( romans[i].roman ); break; } } if (romans[i].roman == NULL) { complain( "\"%c\" is not a romanised Hangul letter.", *roman_string_p ); } } /* Jump over closing "}" */ roman_string_p++; } else add_char_to_text( hancode_text, *roman_string_p++ ); } return text_to_string( &hancode_text ); } /* Conversion of Hancode to KSC5601. ========================================*/ static string_t hancode_to_ksc( string_t hancode_string ) /* Convert Hancode string HANCODE_STRING to KSC5601 code. */ { string_t hancode_string_p; text_t *ksc_text; int_t ksc_code, code; int_t trie_node; string_t string_p; ksc_text = new_text(); hancode_string_p = hancode_string; while (*hancode_string_p != EOS) { /* KSC code is actually u_short_t, but trie entries are of type int_t. */ if (ORD( *hancode_string_p ) >= ORD( DOT_C ) && ORD( *hancode_string_p ) <= ORD( Z_C )) { /* Search the trie until we have found the longest segment. */ trie_node = hancode_trie_root; string_p = hancode_string_p; ksc_code = 0; while (lookup_trie( hancode_trie, &trie_node, &string_p, &code )) { hancode_string_p = string_p; ksc_code = code; } if (ksc_code != 0) { add_char_to_text( ksc_text, ksc_code >> 8 ); add_char_to_text( ksc_text, ksc_code & 0xff ); } else if (*hancode_string_p == DOT_C) { add_to_text( ksc_text, "{.}" ); hancode_string_p++; } else complain( "Internal error." ); } else add_char_to_text( ksc_text, *hancode_string_p++ ); } return text_to_string( &ksc_text ); } /* Conversion of KSC5601 to Hancode. ========================================*/ static string_t ksc_to_hancode( string_t ksc_string ) /* Convert KSC5601 string KSC_STRING to Hancode format. * The returned string remains valid until this function is called again. */ { string_t ksc_string_p; text_t *hancode_text; int_t ksc_code; hancode_text = new_text(); ksc_string_p = ksc_string; while (*ksc_string_p != EOS) { if (ORD( *ksc_string_p ) < 0x80) /* Copy an ASCII character. */ add_char_to_text( hancode_text, *ksc_string_p++ ); else { /* Copy a KSC two-byte character. */ ksc_code = (ORD( ksc_string_p[0] ) << 8) | ORD( ksc_string_p[1] ); add_to_text( hancode_text, hancode[ ksc_to_table_index( ksc_code ) ] ); ksc_string_p += 2; } } return text_to_string( &hancode_text ); } /* Global conversion routines. ==============================================*/ void decode_hangul( string_t *string_p ) /* Decode *STRING_P to external format. * *STRING_P must be a string on the heap. * It will be replaced by the new string which is also on the heap. */ { string_t string; if (! hangul) return; if (convert_to_ksc) string = hancode_to_ksc( *string_p ); else string = hancode_to_roman( *string_p ); free_mem( string_p ); *string_p = string; } /*---------------------------------------------------------------------------*/ void encode_hangul( string_t *string_p ) /* Encode *STRING_P to internal format. * *STRING_P must be a string on the heap. * It will be replaced by the new string which is also on the heap. */ { string_t string; if (! hangul) return; string = ksc_to_hancode( *string_p ); free_mem( string_p ); *string_p = roman_to_hancode( string ); free_mem( &string ); } /* End of file. =============================================================*/