1
package Locale::Maketext::Extract;
2
$Locale::Maketext::Extract::VERSION = '0.08';
8
Locale::Maketext::Extract - Extract translatable strings from source
12
my $Ext = Locale::Maketext::Extract->new;
13
$Ext->read_po('messages.po');
14
$Ext->extract_file($_) for <*.pl>;
16
$Ext->write_po('messages.po');
20
This module can extract translatable strings from files, and write
21
them back to PO files. It can also parse existing PO files and merge
22
their contents with newly extracted strings.
24
A command-line utility, L<xgettext.pl>, is installed with this module
27
Following formats of input files are supported:
31
=item Perl source files
33
Valid localization function names are: C<translate>, C<maketext>,
34
C<gettext>, C<loc>, C<x>, C<_> and C<__>.
38
Strings inside C<E<lt>&|/lE<gt>I<...>E<lt>/&E<gt>> and
39
C<E<lt>&|/locE<gt>I<...>E<lt>/&E<gt>> are extracted.
41
=item Template Toolkit
43
Strings inside C<[%|l%]...[%END%]> or C<[%|loc%]...[%END%]>
48
Sentences between C<STARTxxx> and C<ENDxxx> are extracted individually.
50
=item Generic Template
52
Strings inside {{...}} are extracted.
66
bless({ header => '', entries => {}, lexicon => {}, @_ }, $class);
72
lexicon, set_lexicon, msgstr, set_msgstr
73
entries, set_entries, entry, add_entry, del_entry
78
sub header { $_[0]{header} || _default_header() };
79
sub set_header { $_[0]{header} = $_[1] };
81
sub lexicon { $_[0]{lexicon} }
82
sub set_lexicon { $_[0]{lexicon} = $_[1] || {}; delete $_[0]{lexicon}{''}; }
84
sub msgstr { $_[0]{lexicon}{$_[1]} }
85
sub set_msgstr { $_[0]{lexicon}{$_[1]} = $_[2] }
87
sub entries { $_[0]{entries} }
88
sub set_entries { $_[0]{entries} = $_[1] || {} }
90
sub entry { @{$_[0]->entries->{$_[1]} || [] } }
91
sub add_entry { push @{$_[0]->entries->{$_[1]}}, $_[2] }
92
sub del_entry { delete $_[0]->entries->{$_[1]} }
100
=head2 PO File manipulation
108
my ($self, $file, $verbatim) = @_;
112
open LEXICON, $file or die $!;
117
1 while chomp $header;
119
$self->set_header("$header\n");
121
require Locale::Maketext::Lexicon::Gettext;
122
my $lexicon = Locale::Maketext::Lexicon::Gettext->parse($_, <LEXICON>);
125
$verbatim ? { map _to_gettext($_), %$lexicon } : $lexicon
131
my ($self, $file, $add_format) = @_;
134
open LEXICON, ">$file" or die "Can't write to $file$!\n";
136
print LEXICON $self->header;
138
foreach my $msgid ($self->msgids) {
139
$self->normalize_space($msgid);
141
print LEXICON $self->msg_positions($msgid);
142
print LEXICON $self->msg_variables($msgid);
143
print LEXICON $self->msg_format($msgid) if $add_format;
144
print LEXICON $self->msg_out($msgid);
155
use constant NUL => 0;
156
use constant BEG => 1;
157
use constant PAR => 2;
158
use constant QUO1 => 3;
159
use constant QUO2 => 4;
160
use constant QUO3 => 5;
166
my $entries = $self->entries;
167
my $line = 1; pos($_) = 0;
170
if (/^STARTTEXT$/m and /^ENDTEXT$/m) {
171
require HTML::Parser;
172
require Lingua::EN::Sentence;
176
@MyParser::ISA = 'HTML::Parser';
178
my ($self, $str, $is_cdata) = @_;
179
my $sentences = Lingua::EN::Sentence::get_sentences($str) or return;
180
$str =~ s/\n/ /g; $str =~ s/^\s+//; $str =~ s/\s+$//;
181
$self->add_entry($str => [$file, $line]);
185
my $p = MyParser->new;
186
while (m/\G((.*?)^(?:START|END)[A-Z]+$)/smg) {
188
$line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
189
$p->parse($str); $p->eof;
195
$line = 1; pos($_) = 0;
196
while (m!\G(.*?<&\|/l(?:oc)?(.*?)&>(.*?)</&>)!sg) {
197
my ($vars, $str) = ($2, $3);
198
$line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
199
$self->add_entry($str, [ $file, $line, $vars ]);
203
$line = 1; pos($_) = 0;
204
while (m!\G(.*?\[%\s*\|l(?:oc)?(.*?)\s*%\](.*?)\[%\s*END\s*%\])!sg) {
205
my ($vars, $str) = ($2, $3);
206
$line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
209
$self->add_entry($str, [ $file, $line, $vars ]);
213
$line = 1; pos($_) = 0;
214
while (m/\G(.*?(?<!\{)\{\{(?!\{)(.*?)\}\})/sg) {
215
my ($vars, $str) = ('', $2);
216
$line += ( () = ($1 =~ /\n/g) ); # cryptocontext!
217
$self->add_entry($str, [ $file, $line, $vars ]);
220
my $quoted = '(\')([^\\\']*(?:\\.[^\\\']*)*)(\')|(\")([^\\\"]*(?:\\.[^\\\"]*)*)(\")';
222
# Comment-based mark: "..." # loc
223
$line = 1; pos($_) = 0;
224
while (m/\G(.*?($quoted)[\}\)\],]*\s*\#\s*loc\s*$)/smog) {
225
my $str = substr($2, 1, -1);
226
$line += ( () = ( $1 =~ /\n/g ) ); # cryptocontext!
227
$str =~ s/\\(["'])/$1/g;
228
$self->add_entry($str, [ $file, $line, '' ]);
231
# Comment-based pair mark: "..." => "..." # loc_pair
232
$line = 1; pos($_) = 0;
233
while (m/\G(.*?(\w+)\s*=>\s*($quoted)[\}\)\],]*\s*\#\s*loc_pair\s*$)/smg) {
235
my $val = substr($3, 1, -1);
236
$line += ( () = ( $1 =~ /\n/g ) ); # cryptocontext!
237
$key =~ s/\\(["'])/$1/g;
238
$val =~ s/\\(["'])/$1/g;
239
$self->add_entry($key, [ $file, $line, '' ]);
240
$self->add_entry($val, [ $file, $line, '' ]);
244
my ($state,$str,$vars,$quo)=(0);
246
my $orig = 1 + (() = ((my $__ = $_) =~ /\n/g));
249
$_ = substr($_, pos($_)) if (pos($_));
250
my $line = $orig - (() = ((my $__ = $_) =~ /\n/g));
252
# maketext or loc or _
253
$state == NUL && m/\b(translate|maketext|gettext|__?|loc|x)/gc
254
&& do { $state = BEG; redo };
255
$state == BEG && m/^([\s\t\n]*)/gc && redo;
258
$state == BEG && m/^([\S\(])\s*/gc
259
&& do { $state = ( ($1 eq '(') ? PAR : NUL); redo };
261
# begin or end of string
262
$state == PAR && m/^(\')/gc && do { $state = $quo = QUO1; redo };
263
$state == QUO1 && m/^([^\']+)/gc && do { $str .= $1; redo };
264
$state == QUO1 && m/^\'/gc && do { $state = PAR; redo };
266
$state == PAR && m/^\"/gc && do { $state = $quo = QUO2; redo };
267
$state == QUO2 && m/^([^\"]+)/gc && do { $str .= $1; redo };
268
$state == QUO2 && m/^\"/gc && do { $state = PAR; redo };
270
$state == PAR && m/^\`/gc && do { $state = $quo = QUO3; redo };
271
$state == QUO3 && m/^([^\`]*)/gc && do { $str .= $1; redo };
272
$state == QUO3 && m/^\`/gc && do { $state = PAR; redo };
275
$state == PAR && m/^\s*[\)]/gc && do {
277
$vars =~ s/[\n\r]//g if ($vars);
279
$str =~ s/\\([\\'])/$1/g; # normalize q strings
282
$str =~ s/(\\(?:[0x]..|c?.))/"qq($1)"/eeg; # normalize qq / qx strings
284
push @{$entries->{$str}}, [ $file, $line - (() = $str =~ /\n/g), $vars] if ($str);
285
undef $str; undef $vars;
290
$state == PAR && m/^([^\)]*)/gc && do { $vars .= "$1\n"; redo };
295
my ($self, $file) = @_;
298
open FH, $file or die $!;
299
$self->extract($file => scalar <FH>);
311
my ($self, $verbatim) = @_;
312
my $entries = $self->entries;
313
my $lexicon = $self->lexicon;
315
foreach my $str (sort keys %$entries) {
317
my $entry = $entries->{$str};
318
my $lexi = $lexicon->{$ostr};
320
$str = _to_gettext($str, $verbatim);
321
$lexi = _to_gettext($lexi, $verbatim);
323
$lexicon->{$str} ||= '';
324
next if $ostr eq $str;
326
$lexicon->{$str} ||= $lexi;
327
delete $entries->{$ostr}; delete $lexicon->{$ostr};
328
$entries->{$str} = $entry;
334
my %Escapes = map {("\\$_" => eval("qq(\\$_)"))} qw(t r f b a e);
335
sub normalize_space {
336
my ($self, $msgid) = @_;
337
my $nospace = $msgid;
340
return unless (!$self->has_msgid($msgid) and $self->has_msgid($nospace));
343
$msgid => $self->msgstr($nospace) .
344
(' ' x (length($msgid) - length($nospace)))
348
=head2 Lexicon accessors
352
msg_positions, msg_variables, msg_format, msg_out
356
sub msgids { sort keys %{$_[0]{lexicon}} }
357
sub has_msgid { length $_[0]->msgstr($_[1]) }
360
my ($self, $msgid) = @_;
361
my %files = (map { ( " $_->[0]:$_->[1]" => 1 ) } $self->entry($msgid));
362
return join('', '#:', sort(keys %files), "\n");
366
my ($self, $msgid) = @_;
370
foreach my $entry ( grep { $_->[2] } $self->entry($msgid) ) {
371
my ($file, $line, $var) = @$entry;
372
$var =~ s/^\s*,\s*//; $var =~ s/\s*$//;
373
$out .= "#. ($var)\n" unless !length($var) or $seen{$var}++;
380
my ($self, $msgid) = @_;
381
return "#, perl-maketext-format\n" if $msgid =~ /%(?:\d|\w+\([^\)]*\))/;
386
my ($self, $msgid) = @_;
388
return "msgid " . _format($msgid) .
389
"msgstr " . _format($self->msgstr($msgid));
392
=head2 Internal utilities
401
sub _default_header {
403
# SOME DESCRIPTIVE TITLE.
404
# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
405
# This file is distributed under the same license as the PACKAGE package.
406
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
411
"Project-Id-Version: PACKAGE VERSION\n"
412
"POT-Creation-Date: YEAR-MO-DA HO:MI+ZONE\n"
413
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
414
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
415
"Language-Team: LANGUAGE <LL@li.org>\n"
416
"MIME-Version: 1.0\n"
417
"Content-Type: text/plain; charset=CHARSET\n"
418
"Content-Transfer-Encoding: 8bit\n"
423
my ($text, $verbatim) = @_;
424
return '' unless defined $text;
426
$text =~ s/\\/\\\\/g;
429
while (my ($char, $esc) = each %Escapes) {
430
$text =~ s/$esc/$char/g;
432
return $text if $verbatim;
434
$text =~ s/((?<!~)(?:~~)*)\[_(\d+)\]/$1%$2/g;
435
$text =~ s/((?<!~)(?:~~)*)\[([A-Za-z#*]\w*),([^\]]+)\]/$1%$2("""$3""")/g;
436
$text = join('', map {
437
/^""".*"""$/ ? _escape(substr($_, 3, -3)) : $_
438
} split(/(""".*?""")/, $text));
440
$text =~ s/~([\~\[\]])/$1/g;
446
$text =~ s/\b_(\d+)/%$1/g;
454
return "\"$str\"\n" unless $str =~ /\n/;
455
my $multi_line = ($str =~ /\n(?!\z)/);
456
$str =~ s/\n/\\n"\n"/g;
457
if ($str =~ /\n"$/) {
463
return $multi_line ? qq(""\n"$str) : qq("$str);
468
=head1 ACKNOWLEDGMENTS
470
Thanks to Jesse Vincent for contributing to an early version of this
473
Also to Alain Barbet, who effectively re-wrote the source parser with a
478
L<xgettext.pl>, L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
482
Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>
486
Copyright 2003, 2004 by Autrijus Tang E<lt>autrijus@autrijus.orgE<gt>.
488
This program is free software; you can redistribute it and/or
489
modify it under the same terms as Perl itself.
491
See L<http://www.perl.com/perl/misc/Artistic.html>