2
# -*- Mode: perl; indent-tabs-mode: nil -*-
4
# The contents of this file are subject to the Mozilla Public
5
# License Version 1.1 (the "License"); you may not use this file
6
# except in compliance with the License. You may obtain a copy of
7
# the License at http://www.mozilla.org/MPL/
9
# Software distributed under the License is distributed on an "AS
10
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11
# implied. See the License for the specific language governing
12
# rights and limitations under the License.
14
# The Original Code is the Bugzilla Bug Tracking System.
16
# The Initial Developer of the Original Code is Everything Solved.
17
# Portions created by Everything Solved are Copyright (C) 2006
18
# Everything Solved. All Rights Reserved.
20
# Contributor(s): Max Kanat-Alexander <mkanat@bugzilla.org>
23
# Allow the script to be run from contrib or as contrib/recode.pl
27
use Bugzilla::Constants;
29
use Digest::MD5 qw(md5_base64);
30
use Encode qw(encode decode resolve_alias is_utf8);
39
use constant IGNORE_ENCODINGS => qw(utf8 ascii iso-8859-1);
41
use constant MAX_STRING_LEN => 25;
43
# For certain tables, we can't automatically determine their Primary Key.
44
# So, we specify it here as a string.
45
use constant SPECIAL_KEYS => {
46
bugs_activity => 'bug_id,bug_when,fieldid',
47
profile_setting => 'user_id,setting_name',
48
profiles_activity => 'userid,profiles_when,fieldid',
49
setting_value => 'name,value',
50
# longdescs didn't use to have a PK before 2.20.
51
longdescs => 'bug_id,bug_when',
52
# The 2.16 versions table lacked a PK
53
versions => 'product_id,value',
54
# These are all for earlier versions of Bugzilla. On a modern
55
# version of Bugzilla, this script will ignore these (thanks to
57
components => 'program,value',
58
products => 'product',
65
# "truncate" is a built-in file operation in Perl, so we can't use that name.
68
my $truncated = substr($str, 0, MAX_STRING_LEN);
69
if (length($truncated) ne length($str)) {
78
my $encoding = detect($data);
79
$encoding = resolve_alias($encoding) if $encoding;
81
# Encode::Detect is bad at detecting certain charsets, but Encode::Guess
82
# is better at them. Here are the details:
84
# shiftjis, big5-eten, euc-kr, and euc-jp: (Encode::Detect
85
# tends to accidentally mis-detect UTF-8 strings as being
87
my @utf8_accidental = qw(shiftjis big5-eten euc-kr euc-jp);
88
if ($encoding && grep($_ eq $encoding, @utf8_accidental)) {
90
my $decoder = guess_encoding($data, @utf8_accidental);
91
$encoding = $decoder->name if ref $decoder;
94
# Encode::Detect sometimes mis-detects various ISO encodings as iso-8859-8,
95
# but Encode::Guess can usually tell which one it is.
96
if ($encoding && $encoding eq 'iso-8859-8') {
97
my $decoded_as = guess_iso($data, 'iso-8859-8',
98
# These are ordered this way because it gives the most
100
qw(iso-8859-7 iso-8859-2));
101
$encoding = $decoded_as if $decoded_as;
107
# A helper for do_guess.
109
my ($data, $versus, @isos) = @_;
112
foreach my $iso (@isos) {
113
my $decoder = guess_encoding($data, ($iso, $versus));
115
$encoding = $decoder->name if ref $decoder;
124
Encode::_utf8_on($str);
125
return is_utf8($str, 1);
133
GetOptions(\%switch, 'dry-run', 'guess', 'charset=s', 'show-failures',
134
'overrides=s', 'help|h');
136
pod2usage({ -verbose => 1 }) if $switch{'help'};
138
# You have to specify at least one of these switches.
139
pod2usage({ -verbose => 0 }) if (!$switch{'charset'} && !$switch{'guess'});
141
if (exists $switch{'charset'}) {
142
$switch{'charset'} = resolve_alias($switch{'charset'})
143
|| die "'$switch{charset}' is not a valid charset.";
146
if ($switch{'guess'}) {
147
# Encode::Detect::Detector doesn't seem to return a true value.
148
# So we have to check if we can run detect.
149
if (!eval { require Encode::Detect::Detector }) {
150
my $root = ROOT_USER;
152
Using --guess requires that Encode::Detect be installed. To install
153
Encode::Detect, first download it from:
155
http://search.cpan.org/dist/Encode-Detect/
157
Then, unpack it into its own directory and run the following commands
158
in that directory, as $root:
168
import Encode::Detect::Detector qw(detect);
172
if (exists $switch{'overrides'}) {
173
my $file = new IO::File($switch{'overrides'}, 'r')
174
|| die "$switch{overrides}: $!";
175
my @lines = $file->getlines();
177
foreach my $line (@lines) {
179
my ($digest, $encoding) = split(' ', $line);
180
$overrides{$digest} = $encoding;
184
my $dbh = Bugzilla->dbh;
186
if ($dbh->isa('Bugzilla::DB::Mysql')) {
187
# Get the actual current encoding of the DB.
188
my $collation_data = $dbh->selectrow_arrayref(
189
"SHOW VARIABLES LIKE 'character_set_database'");
190
my $db_charset = $collation_data->[1];
191
# Set our connection encoding to *that* encoding, so that MySQL
192
# correctly accepts our changes.
193
$dbh->do("SET NAMES $db_charset");
194
# Make the database give us raw bytes.
195
$dbh->do('SET character_set_results = NULL')
200
foreach my $table ($dbh->bz_table_list_real) {
201
my @columns = $dbh->bz_table_columns($table);
203
my $pk = SPECIAL_KEYS->{$table};
205
# Ensure that we're on a version of Bugzilla where those keys
207
foreach my $column (split ',', $pk) {
208
$pk = undef if !$dbh->bz_column_info($table, $column);
212
# Figure out the primary key.
213
foreach my $column (@columns) {
214
my $def = $dbh->bz_column_info($table, $column);
215
$pk = $column if $def->{PRIMARYKEY};
217
# If there's no PK, it's defined by a UNIQUE index.
219
foreach my $column (@columns) {
220
my $index = $dbh->bz_index_info($table, "${table}_${column}_idx");
221
if ($index && ref($index) eq 'HASH') {
222
$pk = join(',', @{$index->{FIELDS}})
223
if $index->{TYPE} eq 'UNIQUE';
228
foreach my $column (@columns) {
229
my $def = $dbh->bz_column_info($table, $column);
230
# If this is a text column, it may need work.
231
if ($def->{TYPE} =~ /text|char/i) {
232
# If there's still no PK, we're upgrading from 2.14 or earlier.
233
# We can't reliably determine the PK (or at least, I don't want to
234
# maintain code to record what the PK was at all points in history).
235
# So instead we just use the field itself.
236
$pk = $column if !$pk;
238
print "Converting $table.$column...\n";
239
my $sth = $dbh->prepare("SELECT $column, $pk FROM $table
240
WHERE $column IS NOT NULL
243
my @pk_array = map {"$_ = ?"} split(',', $pk);
244
my $pk_where = join(' AND ', @pk_array);
245
my $update_sth = $dbh->prepare(
246
"UPDATE $table SET $column = ? WHERE $pk_where");
250
while (my @result = $sth->fetchrow_array) {
251
my $data = shift @result;
252
my $digest = md5_base64($data);
254
my @primary_keys = reverse split(',', $pk);
255
# We copy the array so that we can pop things from it without
256
# affecting the original.
257
my @pk_data = @result;
258
my $pk_line = join (', ',
259
map { "$_ = " . pop @pk_data } @primary_keys);
262
if ($switch{'guess'}) {
263
$encoding = do_guess($data);
265
# We only show failures if they don't appear to be
267
if ($switch{'show-failures'} && !$encoding
268
&& !is_valid_utf8($data))
270
my $truncated = trunc($data);
271
print "Row: [$pk_line]\n",
272
"Failed to guess: Key: $digest",
273
" DATA: $truncated\n";
276
# If we fail a guess, and the data is valid UTF-8,
277
# just assume we failed because it's UTF-8.
278
next if is_valid_utf8($data);
281
# If we couldn't detect the charset (or were instructed
282
# not to try), we fall back to --charset. If there's no
283
# fallback, we just do nothing.
284
if (!$encoding && $switch{'charset'}) {
285
$encoding = $switch{'charset'};
288
$encoding = $overrides{$digest} if $overrides{$digest};
290
# We only fix it if it's not already in one of the IGNORE_ENCODINGS
# (UTF-8, ASCII, or ISO-8859-1).
291
if ($encoding && !grep($_ eq $encoding, IGNORE_ENCODINGS)) {
292
my $decoded = encode('utf8', decode($encoding, $data));
293
if ($switch{'dry-run'} && $data ne $decoded) {
294
print "Row: [$pk_line]\n",
295
"From: [" . trunc($data) . "] Key: $digest\n",
296
"To: [" . trunc($decoded) . "]",
297
" Encoding : $encoding\n";
300
$update_sth->execute($decoded, @result);
303
} # while (my @result = $sth->fetchrow_array)
304
} # if ($def->{TYPE} =~ /text|char/i)
305
} # foreach my $column (@columns)
314
recode.pl - Converts a database from one encoding (or multiple encodings)
319
contrib/recode.pl [--guess [--show-failures]] [--charset=iso-8859-2]
320
[--overrides=file_name]
322
--dry-run Don't modify the database.
324
--charset Primary charset your data is currently in. This can be
325
omitted if you specify --guess.
327
--guess Try to guess the charset of the data.
329
--show-failures If we fail to guess, show where we failed.
331
--overrides Specify a file containing overrides. See --help
334
--help Display detailed help.
336
If you aren't sure what to do, try:
338
contrib/recode.pl --guess --charset=cp1252
346
Don't modify the database, just print out what the conversions will be.
348
recode.pl will print out a Key for each item. You can use this in the
349
overrides file, described below.
353
If your database is in multiple different encodings, specify this switch
354
and recode.pl will do its best to determine the original charset of the data.
355
The detection is usually very reliable.
357
If recode.pl cannot guess the charset, it will leave the data alone, unless
358
you've specified --charset.
360
=item --charset=charset-name
362
If you do not specify --guess, then your database is converted
363
from this character set into UTF-8.
365
If you have specified --guess, recode.pl will use this charset as
366
a fallback--when it cannot guess the charset of a particular piece
367
of data, it will assume that the data is in this charset and convert
368
it from this charset to UTF-8.
370
charset-name must be a charset that is known to Perl's Encode
371
module. To see a list of available charsets, do:
373
C<perl -MEncode -e 'print join("\n", Encode-E<gt>encodings(":all"))'>
375
=item --show-failures
377
If --guess fails to guess a charset, print out the data it failed on.
379
=item --overrides=file_name
381
This is a way of specifying certain encodings to override the encodings of
382
--guess. The file is a series of lines. Each line should start with the Key
383
from --dry-run, and then a space, and then the encoding you'd like to use.