3
package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST"
5
@EXPORT = ('nsort', 'ncmp');
13
#-----------------------------------------------------------------------------
15
BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
19
# Make a constant such that if a whole-number string is that long
20
# or shorter, we KNOW it's treatable as an integer
22
my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1;
23
die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4;
24
eval 'sub MAX_INT_SIZE () {' . $x . '}';
26
print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG;
32
my @ORD = ('same', 'swap', 'asis');
34
#-----------------------------------------------------------------------------
35
# For lack of a preprocessor:
38
$guts = <<'EOGUTS'; # This is the guts of both ncmp and nsort:
41
# trap this expensive case first, and then fall thru to tiebreaker
44
# Convoluted hack to get numerics to sort first, at string start:
45
} elsif($x =~ m/^\d/s) {
47
$rv = 0; # fall thru to normal comparison for the two numbers
50
DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n";
52
} elsif($y =~ m/^\d/s) {
54
DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n";
62
DEBUG and print "<$x> and <$y> compared...\n";
65
while(length $x and length $y) {
67
DEBUG > 2 and print " <$x> and <$y>...\n";
69
# First, non-numeric comparison:
70
$x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0;
71
$y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0;
72
# Now make x2 the min length of the two:
73
$x2 = $y2 if $x2 > $y2;
75
DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n",
76
substr($x,0,$x2), substr($y,0,$x2);
78
my $i = substr($x,0,$x2);
79
my $j = substr($y,0,$x2);
81
print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv;
87
# The ''. things here force a copy that seems to work around a
88
# mysterious intermittent bug that 'use locale' provokes in
89
# many versions of Perl.
91
? $cmp->(substr($x,0,$x2) . '',
92
substr($y,0,$x2) . '',
95
scalar(( substr($x,0,$x2) . '' ) cmp
96
( substr($y,0,$x2) . '' )
99
# otherwise trim and keep going:
100
substr($x,0,$x2) = '';
101
substr($y,0,$x2) = '';
105
# (actually just using $x2 and $y2 as scratch)
107
if( $x =~ s/^(\d+)//s ) {
109
if( $y =~ s/^(\d+)//s ) {
110
# We have two numbers here.
111
DEBUG > 1 and print " <$x2> and <$1> numerically\n";
112
if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) {
113
# small numbers: we can compare happily
114
last if $rv = $x2 <=> $1;
116
# ARBITRARILY large integers!
118
# This saves on loss of precision that could happen
119
# with actual stringification.
120
# Also, I sense that very large numbers aren't too
121
# terribly common in sort data.
124
($y2 = $1) =~ s/^0+//s;
126
print " Treating $x2 and $y2 as bigint\n" if DEBUG;
128
no locale; # we want the dumb cmp back.
130
# works only for non-negative whole numbers:
131
length($x2) <=> length($y2)
132
# the longer the numeral, the larger the value
134
# between equals, compare lexically!! amazing but true.
138
# X is numeric but Y isn't
142
} elsif( $y =~ s/^\d+//s ) { # we don't need to capture the substring
146
# else one of them is 0-length.
155
$code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~";
160
##############################################################################
166
($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
168
return @_ unless @_ > 1 or wantarray; # be clever
170
my($x, $x2, $y, $y2, $rv); # scratch vars
172
# We use a Schwartzian xform to memoize the lc'ing and \W-removal
176
if($a->[0] eq $b->[0]) { 0 } # trap this expensive case
185
DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
186
$rv ||= (length($x) <=> length($y)) # shorter is always first
187
|| ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0]))
189
|| ($a->[0] cmp $b->[0])
192
DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
197
$x = $lc ? $lc->($_) : lc($_); # x as scratch
205
#-----------------------------------------------------------------------------
208
# The guts are basically the same as above...
212
($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
215
@_ = ($a, $b); # bit of a hack!
216
DEBUG > 1 and print "Hacking in <$a><$b>\n";
219
Carp::croak("Not enough options to ncmp!");
222
my($x, $x2, $y, $y2, $rv); # scratch vars
224
DEBUG > 1 and print "ncmp args <$a><$b>\n";
225
if($a eq $b) { # trap this expensive case
228
$x = ($lc ? $lc->($a) : lc($a));
230
$y = ($lc ? $lc->($b) : lc($b));
237
DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n";
238
$rv ||= (length($x) <=> length($y)) # shorter is always first
239
|| ($cmp and $cmp->($x,$y) || $cmp->($a,$b))
244
DEBUG > 1 and print " <$a> cmp <$b> is $rv\n";
254
#-----------------------------------------------------------------------------
257
############### END OF MAIN SOURCE ###########################################
262
Sort::Naturally -- sort lexically, but sort numeral parts numerically
267
foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a
269
print join(' ', @them), "\n";
273
9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a
275
(Or "foo12a" + "Foo12a" and "foolio" + "Foolio" and might be
276
switched, depending on your locale.)
280
This module exports two functions, C<nsort> and C<ncmp>; they are used
281
in implementing my idea of a "natural sorting" algorithm. Under natural
282
sorting, numeric substrings are compared numerically, and other
283
word-characters are compared lexically.
285
This is the way I define natural sorting:
291
Non-numeric word-character substrings are sorted lexically,
292
case-insensitively: "Foo" comes between "fish" and "fowl".
296
Numeric substrings are sorted numerically:
297
"100" comes after "20", not before.
301
\W substrings (neither words-characters nor digits) are I<ignored>.
305
Our use of \w, \d, \D, and \W is locale-sensitive: Sort::Naturally
306
uses a C<use locale> statement.
310
When comparing two strings, where a numeric substring in one
311
place is I<not> up against a numeric substring in another,
312
the non-numeric always comes first. This is fudged by
313
reading pretending that the lack of a number substring has
314
the value -1, like so:
317
foobar => "foo", -1, "bar"
319
foo13xyz => "foo", 13, "xyz"
321
That's so that "foo" will come before "foo13", which will come
326
The start of a string is exceptional: leading non-\W (non-word,
328
components are are ignored, and numbers come I<before> letters.
332
I define "numeric substring" just as sequences matching m/\d+/ --
333
scientific notation, commas, decimals, etc., are not seen. If
334
your data has thousands separators in numbers
335
("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"),
336
consider stripping them before feeding them to C<nsort> or
341
=head2 The nsort function
343
This function takes a list of strings, and returns a copy of the list,
346
This is what most people will want to use:
348
@stuff = nsort(...list...);
350
When nsort needs to compare non-numeric substrings, it
351
uses Perl's C<lc> function in scope of a <use locale>.
352
And when nsort needs to lowercase things, it uses Perl's
353
C<lc> function in scope of a <use locale>. If you want nsort
354
to use other functions instead, you can specify them in
355
an arrayref as the first argument to nsort:
358
\&string_comparator, # optional
359
\&lowercaser_function # optional
364
If you want to specify a string comparator but no lowercaser,
365
then the options list is C<[\&comparator, '']> or
366
C<[\&comparator]>. If you want to specify no string comparator
367
but a lowercaser, then the options list is
368
C<['', \&lowercaser]>.
370
Any comparator you specify is called as
371
C<$comparator-E<gt>($left, $right)>,
372
and, like a normal Perl C<cmp> replacement, must return
373
-1, 0, or 1 depending on whether the left argument is stringwise
374
less than, equal to, or greater than the right argument.
376
Any lowercaser function you specify is called as
377
C<$lowercased = $lowercaser-E<gt>($original)>. The routine
378
must not modify its C<$_[0]>.
380
=head2 The ncmp function
382
Often, when sorting non-string values like this:
384
@objects_sorted = sort { $a->tag cmp $b->tag } @objects;
386
...or even in a Schwartzian transform, like this:
390
sort { $a->[1] cmp $b->[1] }
391
map { [$_, make_a_sort_key_from($_) ]
395
...you wight want something that replaces not C<sort>, but C<cmp>.
396
That's what Sort::Naturally's C<ncmp> function is for. Call it with
397
the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>,
398
but otherwise it's a fine replacement:
400
@objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects;
404
sort { ncmp($a->[1], $b->[1]) }
405
map { [$_, make_a_sort_key_from($_) ]
409
Just as with C<nsort> can take different a string-comparator
410
and/or lowercaser, you can do the same with C<ncmp>, by passing
411
an arrayref as the first argument:
414
\&string_comparator, # optional
415
\&lowercaser_function # optional
420
You might get string comparators from L<Sort::ArbBiLex|Sort::ArbBiLex>.
428
This module is not a substitute for
429
L<Sort::Versions|Sort::Versions>! If
430
you just need proper version sorting, use I<that!>
434
If you need something that works I<sort of> like this module's
435
functions, but not quite the same, consider scouting thru this
436
module's source code, and adapting what you see. Besides
437
the functions that actually compile in this module, after the POD,
438
there's several alternate attempts of mine at natural sorting
439
routines, which are not compiled as part of the module, but which you
440
might find useful. They should all be I<working> implementations of
441
slightly different algorithms
442
(all of them based on Martin Pool's C<nsort>) which I eventually
443
discarded in favor of my algorithm. If you are having to
444
naturally-sort I<very large> data sets, and sorting is getting
445
ridiculously slow, you might consider trying one of those
446
discarded functions -- I have a feeling they might be faster on
447
large data sets. Benchmark them on your data and see. (Unless
448
you I<need> the speed, don't bother. Hint: substitute C<sort>
449
for C<nsort> in your code, and unless your program speeds up
450
drastically, it's not the sorting that's slowing things down.
451
But if it I<is> C<nsort> that's slowing things down, consider
454
if(@set >= SOME_VERY_BIG_NUMBER) {
455
no locale; # vroom vroom
456
@sorted = sort(@set); # feh, good enough
457
} elsif(@set >= SOME_BIG_NUMBER) {
459
@sorted = sort(@set); # feh, good enough
461
# but keep it pretty for normal cases
462
@sorted = nsort(@set);
467
If you do adapt the routines in this module, email me; I'd
468
just be interested in hearing about it.
472
Thanks to the EFNet #perl people for encouraging this module,
473
especially magister and a-mused.
477
=head1 COPYRIGHT AND DISCLAIMER
479
Copyright 2001, Sean M. Burke C<sburke@cpan.org>, all rights
480
reserved. This program is free software; you can redistribute it
481
and/or modify it under the same terms as Perl itself.
483
This program is distributed in the hope that it will be useful, but
484
without any warranty; without even the implied warranty of
485
merchantability or fitness for a particular purpose.
489
Sean M. Burke C<sburke@cpan.org>
493
############ END OF DOCS ############
495
############################################################################
496
############################################################################
498
############ BEGIN OLD STUFF ############
500
# We can't have "use integer;", or else (5 <=> 5.1) comes out "0" !
502
#-----------------------------------------------------------------------------
505
return @_ if @_ < 2; # Just to be CLEVER.
507
my($x, $i); # scratch vars
509
# And now, the GREAT BIG Schwartzian transform:
515
# Uses $i as the index variable, $x as the result.
518
DEBUG and print "\nComparing ", map("{$_}", @$a),
519
' : ', map("{$_}", @$b), , "...\n";
521
while($i < @$a and $i < @$b) {
522
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
523
$a->[$i] cmp $b->[$i], "\n";
524
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
527
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
528
$a->[$i] <=> $b->[$i], "\n";
529
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
533
DEBUG and print "{$a->[0]} : {$b->[0]} is ",
534
$x || (@$a <=> @$b) || 0
537
$x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
538
# unless we found a result for $x in the while loop,
539
# use length as a tiebreaker, otherwise use cmp
540
# on the original string as a fallback tiebreaker.
544
my @bit = ($x = defined($_) ? $_ : '');
546
if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
547
# It's entirely purely numeric, so treat it specially:
550
# Consume the string.
552
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
553
push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
556
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
558
# End result: [original bit , (text, number), (text, number), ...]
559
# Minimally: [0-length original bit,]
561
# ['10' => '' , 10, ]
562
# ['fo900' => 'fo' , 900, ]
563
# ['foo10' => 'foo', 10, ]
564
# ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
565
# ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
566
# ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
567
# Yes, always an ODD number of elements.
574
#-----------------------------------------------------------------------------
575
# Same as before, except without the pure-number trap.
578
return @_ if @_ < 2; # Just to be CLEVER.
580
my($x, $i); # scratch vars
582
# And now, the GREAT BIG Schwartzian transform:
588
# Uses $i as the index variable, $x as the result.
591
DEBUG and print "\nComparing ", map("{$_}", @$a),
592
' : ', map("{$_}", @$b), , "...\n";
594
while($i < @$a and $i < @$b) {
595
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
596
$a->[$i] cmp $b->[$i], "\n";
597
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
600
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
601
$a->[$i] <=> $b->[$i], "\n";
602
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
606
DEBUG and print "{$a->[0]} : {$b->[0]} is ",
607
$x || (@$a <=> @$b) || 0
610
$x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
611
# unless we found a result for $x in the while loop,
612
# use length as a tiebreaker, otherwise use cmp
613
# on the original string as a fallback tiebreaker.
617
my @bit = ($x = defined($_) ? $_ : '');
620
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
621
push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
623
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
625
# End result: [original bit , (text, number), (text, number), ...]
626
# Minimally: [0-length original bit,]
628
# ['10' => '' , 10, ]
629
# ['fo900' => 'fo' , 900, ]
630
# ['foo10' => 'foo', 10, ]
631
# ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
632
# ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
633
# ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
634
# Yes, always an ODD number of elements.
641
#-----------------------------------------------------------------------------
642
# Same as before, except for the sort-key-making
645
return @_ if @_ < 2; # Just to be CLEVER.
647
my($x, $i); # scratch vars
649
# And now, the GREAT BIG Schwartzian transform:
655
# Uses $i as the index variable, $x as the result.
658
DEBUG and print "\nComparing ", map("{$_}", @$a),
659
' : ', map("{$_}", @$b), , "...\n";
661
while($i < @$a and $i < @$b) {
662
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
663
$a->[$i] cmp $b->[$i], "\n";
664
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
667
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
668
$a->[$i] <=> $b->[$i], "\n";
669
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
673
DEBUG and print "{$a->[0]} : {$b->[0]} is ",
674
$x || (@$a <=> @$b) || 0
677
$x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
678
# unless we found a result for $x in the while loop,
679
# use length as a tiebreaker, otherwise use cmp
680
# on the original string as a fallback tiebreaker.
684
my @bit = ($x = defined($_) ? $_ : '');
686
if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
687
# It's entirely purely numeric, so treat it specially:
690
# Consume the string.
692
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
694
if($x =~ s/^(\d+)//s) {
695
if(substr($1,0,1) eq '0' and $1 != 0) {
696
push @bit, $1 / (10 ** length($1));
705
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
712
#-----------------------------------------------------------------------------
713
# Like nsort0, but WITHOUT pure number handling, and WITH special treatment
714
# of pulling off extensions and version numbers.
717
return @_ if @_ < 2; # Just to be CLEVER.
719
my($x, $i); # scratch vars
721
# And now, the GREAT BIG Schwartzian transform:
727
# Uses $i as the index variable, $x as the result.
730
DEBUG and print "\nComparing ", map("{$_}", @$a),
731
' : ', map("{$_}", @$b), , "...\n";
733
while($i < @$a and $i < @$b) {
734
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
735
$a->[$i] cmp $b->[$i], "\n";
736
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
739
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
740
$a->[$i] <=> $b->[$i], "\n";
741
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
745
DEBUG and print "{$a->[0]} : {$b->[0]} is ",
746
$x || (@$a <=> @$b) || 0
749
$x || (@$a <=> @$b ) || ($a->[1] cmp $b->[1])
750
|| ($a->[2] <=> $b->[2]) || ($a->[0] cmp $b->[0]);
751
# unless we found a result for $x in the while loop,
752
# use length as a tiebreaker, otherwise use the
753
# lc'd extension, otherwise the verison, otherwise use
754
# the original string as a fallback tiebreaker.
758
my @bit = ( ($x = defined($_) ? $_ : ''), '',0 );
761
# Consume the string.
763
# First, pull off any VAX-style version
764
$bit[2] = $1 if $x =~ s/;(\d+)$//;
766
# Then pull off any apparent extension
767
if( $x !~ m/^\.+$/s and # don't mangle ".", "..", or "..."
768
$x =~ s/(\.[^\.\;]*)$//sg
769
# We could try to avoid catching all-digit extensions,
770
# but I think that's getting /too/ clever.
773
if($x =~ m<[^\\\://]$>s) {
774
# We didn't take the whole basename.
776
DEBUG and print "Consuming extension \"$1\"\n";
778
# We DID take the whole basename. Fix it.
779
$x = $1; # Repair it.
783
push @bit, '', -1 if $x =~ m/^\./s;
784
# A hack to make .-initial filenames sort first, regardless of locale.
785
# And -1 is always a sort-firster, since in the code below, there's
786
# no allowance for filenames containing negative numbers: -1.dat
787
# will be read as string '-' followed by number 1.
790
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
792
if($x =~ s/^(\d+)//s) {
793
if(substr($1,0,1) eq '0' and $1 != 0) {
794
push @bit, $1 / (10 ** length($1));
804
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";