1
package Text::WrapI18N;
7
our @ISA = qw(Exporter);
8
our @EXPORT = qw(wrap);
9
our @EXPORT_OK = qw($columns $separator);
10
our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]);
12
our $VERSION = '0.06';
14
use vars qw($columns $break $tabstop $separator $huge $unexpand $charmap);
15
use Text::CharWidth qw(mbswidth mblen);
19
# $break, $separator, $huge, and $unexpand are not supported yet.
33
$text = $top1 . $text;
35
# $out already-formatted text for output including current line
36
# $len visible width of the current line without the current word
37
# $word the current word which might be sent to the next line
38
# $wlen visible width of the current word
39
# $c the current character
40
# $b whether to allow line-breaking after the current character
41
# $cont_lf true when LF (line feed) characters appear continuously
42
# $w visible width of the current character
52
if (length($text) == 0) {
55
($c, $text, $w, $b) = _extract($text);
57
$out .= $word . $separator;
58
if (length($text) == 0) {return $out;}
60
$text = $top2 . $text;
61
$word = '' ; $wlen = 0;
64
# all control characters other than LF are ignored
68
# when the current line have enough room
69
# for the curren character
71
if ($len + $wlen + $w <= $columns) {
72
if ($c eq ' ' || $b) {
75
$word = ''; $wlen = 0;
77
$word .= $c; $wlen += $w;
82
# when the current line overflows with the
86
# the line ends by space
87
$out .= $word . $separator;
89
$text = $top2 . $text;
90
$word = ''; $wlen = 0;
91
} elsif ($wlen + $w <= $columns) {
92
# the current word is sent to next line
95
$text = $top2 . $word . $c . $text;
96
$word = ''; $wlen = 0;
98
# the current word is too long to fit a line
99
$out .= $word . $separator;
101
$text = $top2 . $c . $text;
102
$word = ''; $wlen = 0;
108
# Extract one character from the beginning from the given string.
109
# Supports multibyte encodings such as UTF-8, EUC-JP, EUC-KR,
112
# return value: (character, rest string, width, line breakable)
113
# character: a character. This may consist from multiple bytes.
114
# rest string: given string without the extracted character.
115
# width: number of columns which the character occupies on screen.
116
# line breakable: true if the character allows line break after it.
120
my ($l, $c, $r, $w, $b, $u);
122
if (length($string) == 0) {
123
return ('', '', 0, 0);
126
if ($l == 0 || $l == -1) {
127
return ('?', substr($string,1), 1, 0);
129
$c = substr($string, 0, $l);
130
$r = substr($string, $l);
133
if (!defined($charmap)) {
134
$charmap = `/usr/bin/locale charmap`;
137
if ($charmap =~ /UTF.8/i) {
141
$u = (ord(substr($c,0,1))&0x0f) * 0x1000
142
+ (ord(substr($c,1,1))&0x3f) * 0x40
143
+ (ord(substr($c,2,1))&0x3f);
147
$u = (ord(substr($c,0,1))&7) * 0x40000
148
+ (ord(substr($c,1,1))&0x3f) * 0x1000
149
+ (ord(substr($c,2,1))&0x3f) * 0x40
150
+ (ord(substr($c,3,1))&0x3f);
155
} elsif ($charmap =~ /(^EUC)|(^GB)|(^BIG)/i) {
156
# East Asian legacy encodings
157
# (EUC-JP, EUC-KR, GB2312, Big5, Big5HKSCS, and so on)
159
if (ord(substr($c,0,1)) >= 0x80) {$b = 1;} else {$b = 0;}
163
return ($c, $r, $w, $b);
166
# Returns 1 for Chinese and Japanese characters. This means that
167
# these characters allow line wrapping after this character even
168
# without whitespaces because these languages don't use whitespaces
171
# Character must be given in UCS-4 codepoint value.
176
if ($u >= 0x3000 && $u <= 0x312f) {
177
if ($u == 0x300a || $u == 0x300c || $u == 0x300e ||
178
$u == 0x3010 || $u == 0x3014 || $u == 0x3016 ||
179
$u == 0x3018 || $u == 0x301a) {return 0;}
181
} # CJK punctuations, Hiragana, Katakana, Bopomofo
182
if ($u >= 0x31a0 && $u <= 0x31bf) {return 1;} # Bopomofo
183
if ($u >= 0x31f0 && $u <= 0x31ff) {return 1;} # Katakana extension
184
if ($u >= 0x3400 && $u <= 0x9fff) {return 1;} # Han Ideogram
185
if ($u >= 0xf900 && $u <= 0xfaff) {return 1;} # Han Ideogram
186
if ($u >= 0x20000 && $u <= 0x2ffff) {return 1;} # Han Ideogram
196
Text::WrapI18N - Line wrapping module with support for multibyte, fullwidth,
197
and combining characters and languages without whitespaces between words
201
use Text::WrapI18N qw(wrap $columns);
202
wrap(firstheader, nextheader, texts);
206
This module intends to be a better Text::Wrap module.
207
This module is needed to support multibyte character encodings such
208
as UTF-8, EUC-JP, EUC-KR, GB2312, and Big5. This module also supports
209
characters with irregular widths, such as combining characters (which
210
occupy zero columns on terminal, like diacritical marks in UTF-8) and
211
fullwidth characters (which occupy two columns on terminal, like most
212
of east Asian characters). Also, minimal handling of languages which
213
doesn't use whitespaces between words (like Chinese and Japanese) is
216
Like Text::Wrap, hyphenation and "kinsoku" processing are not supported,
219
I<wrap(firstheader, nextheader, texts)> is the main subroutine of
220
Text::WrapI18N module to execute the line wrapping. Input parameters
221
and output data emulate Text::Wrap. The texts have to be written in
226
locale(5), utf-8(7), charsets(7)
230
Tomohiro KUBOTA, E<lt>kubota@debian.orgE<gt>
232
=head1 COPYRIGHT AND LICENSE
234
Copyright 2003 by Tomohiro KUBOTA
236
This library is free software; you can redistribute it and/or modify
237
it under the same terms as Perl itself.