~ubuntu-branches/ubuntu/gutsy/libtext-wrapi18n-perl/gutsy : revision 1

1

package Text::WrapI18N;

2

3

require Exporter;

4

use strict;

5

use warnings;

6

7

our @ISA = qw(Exporter);

8

our @EXPORT = qw(wrap);

9

our @EXPORT_OK = qw($columns $separator);

10

our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]);

11

12

our $VERSION = '0.06';

13

14

use vars qw($columns $break $tabstop $separator $huge $unexpand $charmap);

15

use Text::CharWidth qw(mbswidth mblen);

16

17

BEGIN {

18

$columns = 76;

19

# $break, $separator, $huge, and $unexpand are not supported yet.

20

$break = '\s';

21

$tabstop = 8;

22

$separator = "\n";

23

$huge = 'wrap';

24

$unexpand = 1;

25

undef $charmap;

26

}

27

28

sub wrap {

29

my $top1=shift;

30

my $top2=shift;

31

my $text=shift;

32

33

$text = $top1 . $text;

34

35

# $out already-formatted text for output including current line

36

# $len visible width of the current line without the current word

37

# $word the current word which might be sent to the next line

38

# $wlen visible width of the current word

39

# $c the current character

40

# $b whether to allow line-breaking after the current character

41

# $cont_lf true when LF (line feed) characters appear continuously

42

# $w visible width of the current character

43

44

my $out = '';

45

my $len = 0;

46

my $word = '';

47

my $wlen = 0;

48

my $cont_lf = 0;

49

my ($c, $w, $b);

50

$text =~ s/\n+$/\n/;

51

while(1) {

52

if (length($text) == 0) {

53

return $out . $word;

54

}

55

($c, $text, $w, $b) = _extract($text);

56

if ($c eq "\n") {

57

$out .= $word . $separator;

58

if (length($text) == 0) {return $out;}

59

$len = 0;

60

$text = $top2 . $text;

61

$word = '' ; $wlen = 0;

62

next;

63

} elsif ($w == -1) {

64

# all control characters other than LF are ignored

65

next;

66

}

67

68

# when the current line have enough room

69

# for the curren character

70

71

if ($len + $wlen + $w <= $columns) {

72

if ($c eq ' ' || $b) {

73

$out .= $word . $c;

74

$len += $wlen + $w;

75

$word = ''; $wlen = 0;

76

} else {

77

$word .= $c; $wlen += $w;

78

}

79

next;

80

}

81

82

# when the current line overflows with the

83

# current character

84

85

if ($c eq ' ') {

86

# the line ends by space

87

$out .= $word . $separator;

88

$len = 0;

89

$text = $top2 . $text;

90

$word = ''; $wlen = 0;

91

} elsif ($wlen + $w <= $columns) {

92

# the current word is sent to next line

93

$out .= $separator;

94

$len = 0;

95

$text = $top2 . $word . $c . $text;

96

$word = ''; $wlen = 0;

97

} else {

98

# the current word is too long to fit a line

99

$out .= $word . $separator;

100

$len = 0;

101

$text = $top2 . $c . $text;

102

$word = ''; $wlen = 0;

103

}

104

}

105

}

106

107

108

# Extract one character from the beginning from the given string.

109

# Supports multibyte encodings such as UTF-8, EUC-JP, EUC-KR,

110

# GB2312, and Big5.

111

#

112

# return value: (character, rest string, width, line breakable)

113

# character: a character. This may consist from multiple bytes.

114

# rest string: given string without the extracted character.

115

# width: number of columns which the character occupies on screen.

116

# line breakable: true if the character allows line break after it.

117

118

sub _extract {

119

my $string=shift;

120

my ($l, $c, $r, $w, $b, $u);

121

122

if (length($string) == 0) {

123

return ('', '', 0, 0);

124

}

125

$l = mblen($string);

126

if ($l == 0 || $l == -1) {

127

return ('?', substr($string,1), 1, 0);

128

}

129

$c = substr($string, 0, $l);

130

$r = substr($string, $l);

131

$w = mbswidth($c);

132

133

if (!defined($charmap)) {

134

$charmap = `/usr/bin/locale charmap`;

135

}

136

137

if ($charmap =~ /UTF.8/i) {

138

# UTF-8

139

if ($l == 3) {

140

# U+0800 - U+FFFF

141

$u = (ord(substr($c,0,1))&0x0f) * 0x1000

142

+ (ord(substr($c,1,1))&0x3f) * 0x40

143

+ (ord(substr($c,2,1))&0x3f);

144

$b = _isCJ($u);

145

} elsif ($l == 4) {

146

# U+10000 - U+10FFFF

147

$u = (ord(substr($c,0,1))&7) * 0x40000

148

+ (ord(substr($c,1,1))&0x3f) * 0x1000

149

+ (ord(substr($c,2,1))&0x3f) * 0x40

150

+ (ord(substr($c,3,1))&0x3f);

151

$b = _isCJ($u);

152

} else {

153

$b = 0;

154

}

155

} elsif ($charmap =~ /(^EUC)|(^GB)|(^BIG)/i) {

156

# East Asian legacy encodings

157

# (EUC-JP, EUC-KR, GB2312, Big5, Big5HKSCS, and so on)

158

159

if (ord(substr($c,0,1)) >= 0x80) {$b = 1;} else {$b = 0;}

160

} else {

161

$b = 0;

162

}

163

return ($c, $r, $w, $b);

164

}

165

166

# Returns 1 for Chinese and Japanese characters. This means that

167

# these characters allow line wrapping after this character even

168

# without whitespaces because these languages don't use whitespaces

169

# between words.

170

#

171

# Character must be given in UCS-4 codepoint value.

172

173

sub _isCJ {

174

my $u=shift;

175

176

if ($u >= 0x3000 && $u <= 0x312f) {

177

if ($u == 0x300a || $u == 0x300c || $u == 0x300e ||

178

$u == 0x3010 || $u == 0x3014 || $u == 0x3016 ||

179

$u == 0x3018 || $u == 0x301a) {return 0;}

180

return 1;

181

} # CJK punctuations, Hiragana, Katakana, Bopomofo

182

if ($u >= 0x31a0 && $u <= 0x31bf) {return 1;} # Bopomofo

183

if ($u >= 0x31f0 && $u <= 0x31ff) {return 1;} # Katakana extension

184

if ($u >= 0x3400 && $u <= 0x9fff) {return 1;} # Han Ideogram

185

if ($u >= 0xf900 && $u <= 0xfaff) {return 1;} # Han Ideogram

186

if ($u >= 0x20000 && $u <= 0x2ffff) {return 1;} # Han Ideogram

187

188

return 0;

189

}

190

191

1;

192

__END__

193

194

=head1 NAME

195

196

Text::WrapI18N - Line wrapping module with support for multibyte, fullwidth,

197

and combining characters and languages without whitespaces between words

198

199

=head1 SYNOPSIS

200

201

use Text::WrapI18N qw(wrap $columns);

202

wrap(firstheader, nextheader, texts);

203

204

=head1 DESCRIPTION

205

206

This module intends to be a better Text::Wrap module.

207

This module is needed to support multibyte character encodings such

208

as UTF-8, EUC-JP, EUC-KR, GB2312, and Big5. This module also supports

209

characters with irregular widths, such as combining characters (which

210

occupy zero columns on terminal, like diacritical marks in UTF-8) and

211

fullwidth characters (which occupy two columns on terminal, like most

212

of east Asian characters). Also, minimal handling of languages which

213

doesn't use whitespaces between words (like Chinese and Japanese) is

214

supported.

215

216

Like Text::Wrap, hyphenation and "kinsoku" processing are not supported,

217

to keep simplicity.

218

219

I<wrap(firstheader, nextheader, texts)> is the main subroutine of

220

Text::WrapI18N module to execute the line wrapping. Input parameters

221

and output data emulate Text::Wrap. The texts have to be written in

222

locale encoding.

223

224

=head1 SEE ALSO

225

226

locale(5), utf-8(7), charsets(7)

227

228

=head1 AUTHOR

229

230

Tomohiro KUBOTA, E<lt>kubota@debian.orgE<gt>

231

232

=head1 COPYRIGHT AND LICENSE

233

234

235

236

This library is free software; you can redistribute it and/or modify

237

it under the same terms as Perl itself.

238

239

=cut