1
# This program is copyright 2008-2011 Percona Inc.
2
# Feedback and improvements are welcome.
4
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
5
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
6
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
8
# This program is free software; you can redistribute it and/or modify it under
9
# the terms of the GNU General Public License as published by the Free Software
10
# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
11
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
14
# You should have received a copy of the GNU General Public License along with
15
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16
# Place, Suite 330, Boston, MA 02111-1307 USA.
17
# ###########################################################################
18
# QueryParser package $Revision: 7452 $
19
# ###########################################################################
21
# Package: QueryParser
22
# QueryParser extracts parts of SQL statements, like table lists and subqueries.
23
# This package differs from SQLParser because it only extracts from a query
24
# what is needed and only when that can be accomplished rather simply. By
25
# contrast, SQLParser parses the entire SQL statement no matter the complexity.
29
use warnings FATAL => 'all';
30
use English qw(-no_match_vars);
32
use constant MKDEBUG => $ENV{MKDEBUG} || 0;
33
our $tbl_ident = qr/(?:`[^`]+`|\w+)(?:\.(?:`[^`]+`|\w+))?/;
34
# This regex finds things that look like database.table identifiers, based on
35
# their proximity to keywords. (?<!KEY\s) is a workaround for ON DUPLICATE KEY
36
# UPDATE, which is usually followed by a column name.
38
\b(?:FROM|JOIN|(?<!KEY\s)UPDATE|INTO) # Words that precede table names
40
\(? # Optional paren around tables
41
# Capture the identifier and any number of comma-join identifiers that
42
# follow it, optionally with aliases with or without the AS keyword
44
(?: (?:\s+ (?:AS\s+)? \w+)?, \s*$tbl_ident )*
47
# This regex is meant to match "derived table" queries, of the form
48
# .. from ( select ...
49
# .. join ( select ...
50
# .. bar join foo, ( select ...
51
# Unfortunately it'll also match this:
52
# select a, b, (select ...
53
our $has_derived = qr{
58
# http://dev.mysql.com/doc/refman/5.1/en/sql-syntax-data-definition.html
59
# We treat TRUNCATE as a dds but really it's a data manipulation statement.
60
our $data_def_stmts = qr/(?:CREATE|ALTER|TRUNCATE|DROP|RENAME)/i;
62
# http://dev.mysql.com/doc/refman/5.1/en/sql-syntax-data-manipulation.html
63
# Data manipulation statements.
64
our $data_manip_stmts = qr/(?:INSERT|UPDATE|DELETE|REPLACE)/i;
71
# Returns a list of table names found in the query text.
73
my ( $self, $query ) = @_;
75
MKDEBUG && _d('Getting tables for', $query);
77
# Handle CREATE, ALTER, TRUNCATE and DROP TABLE.
78
my ( $ddl_stmt ) = $query =~ m/^\s*($data_def_stmts)\b/i;
80
MKDEBUG && _d('Special table type:', $ddl_stmt);
81
$query =~ s/IF\s+(?:NOT\s+)?EXISTS//i;
82
if ( $query =~ m/$ddl_stmt DATABASE\b/i ) {
83
# Handles CREATE DATABASE, not to be confused with CREATE TABLE.
84
MKDEBUG && _d('Query alters a database, not a table');
87
if ( $ddl_stmt =~ m/CREATE/i && $query =~ m/$ddl_stmt\b.+?\bSELECT\b/i ) {
88
# Handle CREATE TABLE ... SELECT. In this case, the real tables
89
# come from the SELECT, not the CREATE.
90
my ($select) = $query =~ m/\b(SELECT\b.+)/is;
91
MKDEBUG && _d('CREATE TABLE ... SELECT:', $select);
92
return $self->get_tables($select);
94
my ($tbl) = $query =~ m/TABLE\s+($tbl_ident)(\s+.*)?/i;
95
MKDEBUG && _d('Matches table:', $tbl);
99
# These keywords may appear between UPDATE or SELECT and the table refs.
100
# They need to be removed so that they are not mistaken for tables.
101
$query =~ s/ (?:LOW_PRIORITY|IGNORE|STRAIGHT_JOIN)//ig;
103
# Another special case: LOCK TABLES tbl [[AS] alias] READ|WRITE, etc.
104
# We strip the LOCK TABLES stuff and append "FROM" to fake a SELECT
105
# statement and allow $tbl_regex to match below.
106
if ( $query =~ /^\s*LOCK TABLES/i ) {
107
MKDEBUG && _d('Special table type: LOCK TABLES');
108
$query =~ s/^(\s*LOCK TABLES\s+)//;
109
$query =~ s/\s+(?:READ|WRITE|LOCAL)+\s*//g;
110
MKDEBUG && _d('Locked tables:', $query);
111
$query = "FROM $query";
114
$query =~ s/\\["']//g; # quoted strings
115
$query =~ s/".*?"/?/sg; # quoted strings
116
$query =~ s/'.*?'/?/sg; # quoted strings
119
foreach my $tbls ( $query =~ m/$tbl_regex/gio ) {
120
MKDEBUG && _d('Match tables:', $tbls);
122
# Some queries coming from certain ORM systems will have superfluous
123
# parens around table names, like SELECT * FROM (`mytable`); We match
124
# these so the table names can be extracted more simply with regexes. But
125
# in case of subqueries, this can cause us to match SELECT as a table
126
# name, for example, in SELECT * FROM (SELECT ....) AS X; It's possible
127
# that SELECT is really a table name, but so unlikely that we just skip
129
next if $tbls =~ m/\ASELECT\b/i;
131
foreach my $tbl ( split(',', $tbls) ) {
132
# Remove implicit or explicit (AS) alias.
133
$tbl =~ s/\s*($tbl_ident)(\s+.*)?/$1/gio;
135
# Sanity check for cases like when a column is named `from`
136
# and the regex matches junk. Instead of complex regex to
137
# match around these rarities, this simple check will save us.
138
if ( $tbl !~ m/[a-zA-Z]/ ) {
139
MKDEBUG && _d('Skipping suspicious table name:', $tbl);
149
# Returns true if it sees what looks like a "derived table", e.g. a subquery in
151
sub has_derived_table {
152
my ( $self, $query ) = @_;
153
# See the $tbl_regex regex above.
154
my $match = $query =~ m/$has_derived/;
155
MKDEBUG && _d($query, 'has ' . ($match ? 'a' : 'no') . ' derived table');
159
# Return a data structure of tables/databases and the name they're aliased to.
160
# Given the following query, SELECT * FROM db.tbl AS foo; the structure is:
161
# { TABLE => { foo => tbl }, DATABASE => { tbl => db } }
162
# If $list is true, then a flat list of tables found in the query is returned
163
# instead. This is used for things that want to know what tables the query
164
# touches, but don't care about aliases.
166
my ( $self, $query, $list ) = @_;
168
# This is the basic result every query must return.
173
return $result unless $query;
175
# These keywords may appear between UPDATE or SELECT and the table refs.
176
# They need to be removed so that they are not mistaken for tables.
177
$query =~ s/ (?:LOW_PRIORITY|IGNORE|STRAIGHT_JOIN)//ig;
179
# These keywords may appear before JOIN. They need to be removed so
180
# that they are not mistaken for implicit aliases of the preceding table.
181
$query =~ s/ (?:INNER|OUTER|CROSS|LEFT|RIGHT|NATURAL)//ig;
183
# Get the table references clause and the keyword that starts the clause.
184
# See the comments below for why we need the starting keyword.
186
my ($tbl_refs, $from) = $query =~ m{
188
(FROM|INTO|UPDATE)\b\s* # Keyword before table refs
191
(?:\s+|\z) # If the query does not end with the table
192
# refs then there must be at least 1 space
193
# between the last tbl ref and the next
195
(?:WHERE|ORDER|LIMIT|HAVING|SET|VALUES|\z) # Keyword after table refs
200
if ( $query =~ m/^(?:INSERT|REPLACE)/i ) {
201
# Remove optional columns def from INSERT/REPLACE.
202
$tbl_refs =~ s/\([^\)]+\)\s*//;
205
MKDEBUG && _d('tbl refs:', $tbl_refs);
207
# These keywords precede a table ref. They signal the start of a table
208
# ref, but to know where the table ref ends we need the after tbl ref
210
my $before_tbl = qr/(?:,|JOIN|\s|$from)+/i;
212
# These keywords signal the end of a table ref and either 1) the start
213
# of another table ref, or 2) the start of an ON|USING part of a JOIN
214
# clause (which we want to skip over), or 3) the end of the string (\z).
215
# We need these after tbl ref keywords so that they are not mistaken
216
# for implicit aliases of the preceding table.
217
my $after_tbl = qr/(?:,|JOIN|ON|USING|\z)/i;
219
# This is required for cases like:
220
# FROM t1 JOIN t2 ON t1.col1=t2.col2 JOIN t3 ON t2.col3 = t3.col4
221
# Because spaces may precede a tbl and a tbl may end with \z, then
222
# t3.col4 will match as a table. However, t2.col3=t3.col4 will not match.
223
$tbl_refs =~ s/ = /=/g;
228
( ($tbl_ident) (?:\s+ (?:AS\s+)? (\w+))? )
232
my ( $tbl_ref, $db_tbl, $alias ) = ($1, $2, $3);
233
MKDEBUG && _d('Match table:', $tbl_ref);
234
push @tbl_refs, $tbl_ref;
235
$alias = $self->trim_identifier($alias);
238
if ( $tbl_ref =~ m/^AS\s+\w+/i ) {
239
# According to the manual
240
# http://dev.mysql.com/doc/refman/5.0/en/unnamed-views.html:
241
# "The [AS] name clause is mandatory, because every table in a
242
# FROM clause must have a name."
243
# So if the tbl ref begins with 'AS', then we probably have a
245
MKDEBUG && _d('Subquery', $tbl_ref);
246
$result->{TABLE}->{$alias} = undef;
250
my ( $db, $tbl ) = $db_tbl =~ m/^(?:(.*?)\.)?(.*)/;
251
$db = $self->trim_identifier($db);
252
$tbl = $self->trim_identifier($tbl);
253
$result->{TABLE}->{$alias || $tbl} = $tbl;
254
$result->{DATABASE}->{$tbl} = $db if $db;
258
MKDEBUG && _d("No tables ref in", $query);
262
# Return raw text of the tbls without aliases, instead of identifier
263
# mappings. Include all identifier quotings and such.
271
# Splits a compound statement and returns an array with each sub-statement.
273
# INSERT INTO ... SELECT ...
274
# is split into two statements: "INSERT INTO ..." and "SELECT ...".
276
my ( $self, $query ) = @_;
277
return unless $query;
278
$query = $self->clean_query($query);
279
MKDEBUG && _d('Splitting', $query);
281
my $verbs = qr{SELECT|INSERT|UPDATE|DELETE|REPLACE|UNION|CREATE}i;
283
# This splits a statement on the above verbs which means that the verb
284
# gets chopped out. Capturing the verb (e.g. ($verb)) will retain it,
285
# but then it's disjointed from its statement. Example: for this query,
286
# INSERT INTO ... SELECT ...
287
# split returns ('INSERT', 'INTO ...', 'SELECT', '...'). Therefore,
288
# we must re-attach each verb to its statement; we do this later...
289
my @split_statements = grep { $_ } split(m/\b($verbs\b(?!(?:\s*\()))/io, $query);
292
if ( @split_statements == 1 ) {
293
# This happens if the query has no verbs, so it's probably a single
295
push @statements, $query;
298
# ...Re-attach verbs to their statements.
299
for ( my $i = 0; $i <= $#split_statements; $i += 2 ) {
300
push @statements, $split_statements[$i].$split_statements[$i+1];
302
# Variable-width negative look-behind assertions, (?<!), aren't
303
# fully supported so we split ON DUPLICATE KEY UPDATE. This
304
# puts it back together.
305
if ( $statements[-2] && $statements[-2] =~ m/on duplicate key\s+$/i ) {
306
$statements[-2] .= pop @statements;
311
# Wrap stmts in <> to make it more clear where each one begins/ends.
312
MKDEBUG && _d('statements:', map { $_ ? "<$_>" : 'none' } @statements);
317
my ( $self, $query ) = @_;
318
return unless $query;
319
$query =~ s!/\*.*?\*/! !g; # Remove /* comment blocks */
320
$query =~ s/^\s+//; # Remove leading spaces
321
$query =~ s/\s+$//; # Remove trailing spaces
322
$query =~ s/\s{2,}/ /g; # Remove extra spaces
327
my ( $self, $query ) = @_;
328
return unless $query;
329
$query = $self->clean_query($query);
333
my $sqno = 0; # subquery number
335
while ( $query =~ m/(\S+)(?:\s+|\Z)/g ) {
338
MKDEBUG && _d($word, $sqno);
339
if ( $word =~ m/^\(?SELECT\b/i ) {
340
my $start_pos = $pos - length($word) - 1;
343
MKDEBUG && _d('Subquery', $sqno, 'starts at', $start_pos);
344
$subqueries[$sqno] = {
345
start_pos => $start_pos,
349
lp => 1, # left parentheses
350
rp => 0, # right parentheses
355
MKDEBUG && _d('Main SELECT at pos 0');
359
next unless $sqno; # next unless we're in a subquery
360
MKDEBUG && _d('In subquery', $sqno);
361
my $sq = $subqueries[$sqno];
363
MKDEBUG && _d('This subquery is done; SQL is for',
364
($sqno - 1 ? "subquery $sqno" : "the main SELECT"));
367
push @{$sq->{words}}, $word;
368
my $lp = ($word =~ tr/\(//) || 0;
369
my $rp = ($word =~ tr/\)//) || 0;
370
MKDEBUG && _d('parentheses left', $lp, 'right', $rp);
371
if ( ($sq->{lp} + $lp) - ($sq->{rp} + $rp) == 0 ) {
372
my $end_pos = $pos - 1;
373
MKDEBUG && _d('Subquery', $sqno, 'ends at', $end_pos);
374
$sq->{end_pos} = $end_pos;
375
$sq->{len} = $end_pos - $sq->{start_pos};
380
for my $i ( 1..$#subqueries ) {
381
my $sq = $subqueries[$i];
383
$sq->{sql} = join(' ', @{$sq->{words}});
385
$sq->{start_pos} + 1, # +1 for (
386
$sq->{len} - 1, # -1 for )
390
return $query, map { $_->{sql} } grep { defined $_ } @subqueries;
394
my ( $self, $query, $qr ) = @_;
395
my ($type, undef) = $qr->distill_verbs($query);
397
if ( $type =~ m/^SELECT\b/ ) {
400
elsif ( $type =~ m/^$data_manip_stmts\b/
401
|| $type =~ m/^$data_def_stmts\b/ ) {
412
my ( $self, $query ) = @_;
414
return $cols unless $query;
417
if ( $query =~ m/^SELECT/i ) {
433
($cols_def) = $query =~ m/^SELECT\s+(.+?)\s+FROM/i;
435
elsif ( $query =~ m/^(?:INSERT|REPLACE)/i ) {
436
($cols_def) = $query =~ m/\(([^\)]+)\)\s*VALUE/i;
439
MKDEBUG && _d('Columns:', $cols_def);
441
@$cols = split(',', $cols_def);
454
my ( $self, $query ) = @_;
455
return unless $query;
458
# Flatten and clean query.
460
$query = $self->clean_query($query);
462
$parsed->{query} = $query,
463
$parsed->{tables} = $self->get_aliases($query, 1);
464
$parsed->{columns} = $self->get_columns($query);
466
my ($type) = $query =~ m/^(\w+)/;
467
$parsed->{type} = lc $type;
469
# my @words = $query =~ m/
470
# [A-Za-z_.]+\(.*?\)+ # Match FUNCTION(...)
471
# |\(.*?\)+ # Match grouped items
472
# |"(?:[^"]|\"|"")*"+ # Match double quotes
473
# |'[^'](?:|\'|'')*'+ # and single quotes
474
# |`(?:[^`]|``)*`+ # and backticks
479
$parsed->{sub_queries} = [];
484
# Returns an array of arrayrefs like [db,tbl] for each unique db.tbl
485
# in the query and its subqueries. db may be undef.
487
my ( $self, %args ) = @_;
488
my $query = $args{query};
489
my $default_db = $args{default_db};
490
my $q = $self->{Quoter} || $args{Quoter};
491
return unless $query;
492
MKDEBUG && _d('Extracting tables');
495
foreach my $db_tbl ( $self->get_tables($query) ) {
497
next if $seen{$db_tbl}++; # Unique-ify for issue 337.
498
my ( $db, $tbl ) = $q->split_unquote($db_tbl);
499
push @tables, [ $db || $default_db, $tbl ];
504
# This is a special trim function that removes whitespace and identifier-quotes
505
# (backticks, in the case of MySQL) from the string.
506
sub trim_identifier {
507
my ($self, $str) = @_;
508
return unless defined $str;
516
my ($package, undef, $line) = caller 0;
517
@_ = map { (my $temp = $_) =~ s/\n/\n# /g; $temp; }
518
map { defined $_ ? $_ : 'undef' }
520
print STDERR "# $package:$line $PID ", join(' ', @_), "\n";
525
# ###########################################################################
526
# End QueryParser package
527
# ###########################################################################