1
package Dpkg::Copyright::Scanner ;
11
use feature qw/postderef signatures/;
12
no warnings qw/experimental::postderef experimental::signatures/;
14
our @EXPORT = qw(scan_files print_copyright);
16
my $whitespace_list_delimiter = $ENV{'whitespace_list_delimiter'} || "\n ";
18
# license and copyright sanitisation pilfered from Jonas's
19
# licensecheck2dep5 Originally GPL-2+, permission to license this
20
# derivative work to LGPL-2.1+ was given by Jonas.
21
# see https://lists.alioth.debian.org/pipermail/pkg-perl-maintainers/2015-March/084900.html
23
# Copyright 2014 Dominique Dumont <dod@debian.org>
24
# Copyright © 2005-2012 Jonas Smedegaard <dr@jones.dk>
25
# Description: Reformat licencecheck output to copyright file format
27
# This program is free software; you can redistribute it and/or
28
# modify it under the terms of the GNU General Public License as
29
# published by the Free Software Foundation; either version 2, or (at
30
# your option) any later version.
32
# This program is distributed in the hope that it will be useful, but
33
# WITHOUT ANY WARRANTY; without even the implied warranty of
34
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
35
# General Public License for more details.
37
# You should have received a copy of the GNU General Public License
38
# along with this program. If not, see <http://www.gnu.org/licenses/>.
40
sub print_copyright ( %args ) {
41
my ($files, $copyrights_by_id) = scan_files(%args);
43
# split file path and fill recursive hash, leaf is id
45
foreach my $path (keys %$files) {
46
__create_tree_leaf_from_paths ($split_files,$path,$files->{$path});
49
# regroup %files hash: all leaves have same id -> wild card
50
__squash($split_files);
52
# pack files by copyright id
53
my @packed = __pack_files($split_files);
57
foreach my $p (@packed) {
58
my ($id, @paths) = $p->@*;
59
my ($c,$l) = $copyrights_by_id->[$id]->@*;
61
next if $c eq 'no-info-found';
63
"Files: ", join($whitespace_list_delimiter, @paths )."\n",
65
"License: $l\n", "\n";
69
$args{out}->spew_utf8( @out);
72
binmode(STDOUT, ":utf8");
77
# option to skip UNKNOWN ?
78
# load a file to override some entries ?
79
sub scan_files ( %args ) {
83
@lines = $args{in}->lines_utf8; # for tests
86
my $pipe = IO::Pipe->new();
87
$pipe->reader("licensecheck --copyright -m -r .");
88
binmode($pipe, ":utf8");
89
@lines = $pipe->getlines;
97
foreach my $line (sort @lines) {
100
my ($f,$l,$c) = split /\t/, $line;
103
$l =~ s/([*?\\])/\\$1/g;
104
$l =~ s/\s*\(unversioned\/unknown version\)//;
105
$l =~ s/\s*\(with incorrect FSF address\)//;
106
$l =~ s/(\w+)\s+\(v([^)]+) or v([^)]+)\)/uc($1)."-$2 or ".uc($1)."-$3"/e;
107
$l =~ s/\s+\(v([^)]+) or later\)/-$1+/;
108
$l =~ s/\s+\(v([^)]+)\)/-$1/;
109
$l =~ s/^\s*(GENERATED FILE)/UNKNOWN/;
110
$l =~ s/\s+(GENERATED FILE)//;
111
$l =~ s/\bzlib\/libpng\b/Zlib/;
112
$l =~ s/\bMIT\/X11 \(BSD like\)/Expat/;
113
$l =~ s/^\s*BSD \((\d) clause\)$/BSD-$1-clause/;
114
$l =~ s/^\s*public domain$/public-domain/i;
116
# this is very fragile. may need to change license-check to output license keyword
117
$l =~ s/ / and /g unless $l =~ /\bor\b/;
120
$c =~ s/^©\s*//;
121
$c =~ s/(?<=\b\d{4})\s*-\s*\d{4}(?=\s*-\s*(\d{4})\b)//g;
122
$c =~ s/(\d+)\s*-\s*(\d+)/$1-$2/g;
123
$c =~ s/\b(\d{4}),?\s+([\S^\d])/$1, $2/g;
125
$c =~ s/all\s+rights?\s+reserved[\s\.]*//gi;
126
$c = 'no-info-found' if $c =~ /^\*No/;
128
$c =~ s!^[\s,/*]|[\s,/*-]+$!!g;
130
$c = __pack_copyright($c);
132
$files->{$f} = $copyrights{$c}{$l} //= $id++;
135
my @copyrights_by_id ;
136
foreach my $c (sort keys %copyrights) {
137
foreach my $l (sort keys $copyrights{$c}->%* ) {
138
my $id = $copyrights{$c}{$l};
139
$copyrights_by_id[$id] = [ $c, $l ] ;
143
say "No copyright information found" unless keys %$files;
145
my $merged_c_info = __squash_copyrights_years (\@copyrights_by_id) ;
147
# replace the old ids with news ids
148
__swap_merged_ids($files, $merged_c_info);
150
# stop here for update ...
151
return ($files, \@copyrights_by_id) ;
154
sub __split_copyright ($c) {
155
my ($years,$owner) = $c =~ /([\s,\d-]+)(.*)/;
156
# say "undef year in $c" unless defined $years;
157
return unless defined $years;
158
my @data = split /(?<=\d)[,\s]+/, $years;
159
return unless defined $owner;
160
$owner =~ s/^[\s.,-]+|[\s,*-]+$//g;
161
return ($owner,@data);
164
sub __create_tree_leaf_from_paths ($h,$path,$value) {
165
# explode path in subpaths
166
my @subpaths = split '/', $path;
167
my $last = pop @subpaths;
168
map { $h = $h->{$_} ||= {} } @subpaths ;
169
$h->{$last} = $value;
172
sub __pack_copyright ($r) {
174
return $r if $r eq 'no-info-found';
176
$r =~ /^[\s\W]+|[\s\W]+$/g;
177
foreach my $c ( split( m!\s*/\s*!, $r)) {
178
my ($owner, @data) = __split_copyright($c);
179
return $r unless defined $owner;
180
$cop{$owner} ||= [] ;
181
push $cop{$owner}->@*, @data ;
184
foreach my $owner (sort keys %cop) {
185
my $span = Array::IntSpan->new();
186
my $data = $cop{$owner};
187
foreach my $year ($data->@*) {
188
return $r if $year =~ /[^\d-]/; # bail-out
189
# take care of ranges written like 2002-3
190
$year =~ s/^(\d\d\d)(\d)-(\d)$/$1$2-$1$3/;
191
# take care of ranges written like 2014-15
192
$year =~ s/^(\d\d)(\d\d)-(\d\d)$/$1$2-$1$3/;
194
$span->set_range_as_string($year, $owner);
196
return $r if $@; # invalid range
198
$span->consolidate();
199
push @res, $span->get_range_list. ', '. $owner;
201
return join("\n ",reverse sort @res);
204
#in each directory, pack files that have the same copyright/license information
205
# traverse recursively %h (whose structure matches the scanned directory)
206
# @path keeps track of the recursion depth to provide the file path
207
sub __pack_files ($h) {
210
__pack_dir($h,\@res) ;
212
# sort by first path listed in there
213
my $sort_path = sub {
217
return sort $sort_path @res ;
220
sub __pack_dir ($h, $pack, @path) {
222
foreach my $file (sort keys %$h) {
223
my $id = $h->{$file};
225
__pack_dir($id, $pack, @path, $file) ;
227
elsif (defined $pack_by_id{$id} ) {
228
push $pack_by_id{$id}->@*, join('/',@path,$file);
231
$pack_by_id{$id} = [ join('/',@path,$file) ] ;
235
push $pack->@*, map { [ $_, $pack_by_id{$_}->@* ]; } keys %pack_by_id ;
238
# find ids that can be merged together
239
# I.e. merge entries with same license and same set of owners. In this
240
# case the years are merged together.
241
sub __squash_copyrights_years ($copyrights_by_id) {
243
my %id_year_by_same_owner_license;
244
for (my $id = 0; $id < $#$copyrights_by_id; $id++ ) {
245
my ($c,$l) = $copyrights_by_id->[$id]->@* ;
246
#say "id $id: c $c l $l";
249
foreach my $line (split(/\n\s+/,$c)) {
250
my ($owner, @year) = __split_copyright($line);
251
next unless defined $owner;
252
push @owners, $owner;
253
push @years, join(',',@year);
255
my $k = join('|', $l, @owners);
256
$id_year_by_same_owner_license{$k} //= [];
257
push $id_year_by_same_owner_license{$k}->@*, [ $id, @years ];
261
# now detect where %id_year_by_same_owner_license references more
262
# than one id this means that several entries can be merged in a
263
# *new* id (new id to avoid cloberring data of other directories)
264
foreach my $owner_license (keys %id_year_by_same_owner_license) {
265
my @entries = $id_year_by_same_owner_license{$owner_license}->@* ;
266
next unless @entries > 1;
268
my ($l,@owners) = split /\|/, $owner_license;
270
# create new copyright info with coaslesced years
271
my @squashed_c = __coalesce_copyright_years(\@entries,\@owners) ;
272
next unless @squashed_c ; # give up this entry when problem
274
# store (c) info with coalesced years in new item of $copyrights_by_id
275
my $new_id = @$copyrights_by_id ;
276
$copyrights_by_id->[$new_id] = [ join("\n ",@squashed_c), $l ];
278
# fill the swap table entry-id -> coaslesces entry-id
279
foreach my $id ( map { $_->[0]} @entries) {
280
$merged_c_info[$id] = $new_id;
284
return \@merged_c_info;
287
sub __swap_merged_ids ($files, $merged_c_info) {
288
foreach my $name (sort keys %$files) {
289
my $item = $files->{$name};
291
__swap_merged_ids($item,$merged_c_info);
293
elsif (my $new_id = $merged_c_info->[$item]) {
294
$files->{$name} = "$new_id" ;
299
sub __coalesce_copyright_years($entries, $owners) {
300
my @ranges_of_years ;
301
# $entries and $owners always have the same size
303
foreach my $entry (@$entries) {
304
my ($id, @years) = $entry->@* ;
306
for (my $i = 0; $i < @years; $i++) {
307
return () if $years[$i] =~ /[^\d,\s-]/;
308
my $span = $ranges_of_years[$i] //= Array::IntSpan->new();
309
return () unless $span; # bail out in case of problems
310
$span->set_range_as_string($years[$i], 1);
315
for (my $i=0; $i < @$owners ; $i++) {
316
$ranges_of_years[$i]->consolidate();
317
$squashed_c[$i] = $ranges_of_years[$i]->get_range_list.', '.$owners->[$i];
323
# $h is a tree of hash matching the directory structure. Each leaf is a
328
# count the number of times each (c) info is used in this directory.
329
# (including the main (c) info of each subdirectory)
330
foreach my $name (sort keys %$h) {
331
my $item = $h->{$name};
333
# squash may return a plain id, or a hash with '*' => id ,
334
# or a non squashable hash
335
$h->{$name} = __squash($item);
337
my $id = (ref($item) and defined $item->{'*'}) ? $item->{'*'} : $item ;
339
# do not count non squashable hashes (i.e. there's no main (c) info)
346
# find the most used (c) info in this directory (or the existing '*' entry)
348
my $max_id = $h->{'*'};
349
if (not defined $max_id) {
350
foreach my $id (sort keys %count) {
351
if ($count{$id} > $max) {
358
# all files associated to the most used (c) info are deleted to
359
# be represented by '*' entry
360
foreach my $name (sort keys %$h) {
361
my $item = $h->{$name};
362
if (ref($item) and defined $item->{'*'} and $item->{'*'} == $max_id) {
363
# delete ./item/* which is covered by ./*
365
# delete ./item if no files with different (c) info are there
366
delete $h->{$name} unless keys $h->{$name}->%*;
368
if (not ref ($item)) {
369
# delete file that is represented by '*' entry
370
delete $h->{$name} if $item == $max_id;
373
# here's the '*' file representing the most used (c) info
374
$h->{'*'} //= $max_id if defined $max_id;
385
Dpkg::Copyright::Scanner - Scan files to provide copyright data
389
use Dpkg::Copyright::Scanner qw/print_copyright scan_files/;
391
# print copyright data on STDOUT
394
# return a data structure containing copyright information
395
my @copyright_data = scan_files();
400
This modules scans current package directory to extract copyright and
401
license information. Information are packed in a way to ease review and
402
maintenance. Files information is grouped with wildcards ('*') to reduce
407
=head2 print_copyright
409
Print copyright information on STDOUT like L<scan-copyrights>.
413
Return a data structure with copyright and license information.
415
The structure is a list of list:
419
[ path1 ,path2, ...],
431
'1994-2001, by Frank Pilhofer.',
436
'2002-2006, Charles Kerr <charles@rebelbase.com>',
444
'2002-2007, Charles Kerr <charles@rebelbase.com>',
451
The output of L<licensecheck> is expected to be utf-8. Which means
452
that the source files scanned by L<licensecheck> should also be
453
encoded in utf-8. In practice, this will impact only copyright owner
454
name which may be garbled if comments are not encoded in utf-8.
458
Extracting license and copyright data from unstructured comments is not reliable.
459
User must check manually the files when no copyright info is found or when the
464
L<license_check>, C<licensecheck2dep5> from C<cdbs> package
468
Dominique Dumont <dod@debian.org>