~ubuntu-branches/debian/sid/libconfig-model-dpkg-perl/sid

Viewing changes to lib/Dpkg/Copyright/Scanner.pm

Committer: Package Import Robot
Author(s): Dominique Dumont
Date: 2015-05-08 10:24:03 UTC
mfrom: (1.1.3 experimental)
Revision ID: package-import@ubuntu.com-20150508102403-5a61zoujj1kqd7vw

Tags: 2.063

http://bugs.debian.org/783932

http://bugs.debian.org/783928

http://bugs.debian.org/782995

* Copyright scanner:
  * to cope with owner containing 'f00' (Closes: #783932)
  * handle ranges like 2010-12 or 2002-3 (Closes: #783928)
* Dpkg::Patch: fix handling of Subject body expressed as
  free form text below header.
* Updated list of supported arch in C::M::Dpkg::Dependency
  (Closes: #782995)
* control: added build-dep on libyaml-tiny-perl

files added:
bin

bin/scan-copyrights

lib/Config/Model/Dpkg/Copyright.pm

lib/Dpkg

lib/Dpkg/Copyright

lib/Dpkg/Copyright/Scanner.pm

t/copyright-update.t

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/compat

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/control

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/copyright

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/fix.scanned.copyright

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/rules

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/source

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/source/format

t/model_tests.d/dpkg-examples/pan-copyright-from-scratch/debian/watch

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian/compat

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian/control

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian/copyright

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian/fix.scanned.copyright

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian/rules

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian/source

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update-more/debian/source/format

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian/compat

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian/control

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian/copyright

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian/fix.scanned.copyright

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian/rules

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian/source

t/model_tests.d/dpkg-examples/pan-copyright-upgrade-update/debian/source/format

t/model_tests.d/dpkg-patch-examples/moarvm

t/scanner

t/scanner/examples

t/scanner/examples/moarvm.in

t/scanner/examples/moarvm.out

t/scanner/examples/pan.in

t/scanner/examples/pan.out

t/scanner/examples/sdl2.in

t/scanner/examples/sdl2.out

t/scanner/examples/test-783932.in

t/scanner/examples/test-783932.out

t/scanner/pack_copyright.t

t/scanner/pack_files.t

t/scanner/scan-copyright.t

t/scanner/squash_copyright_years.t

t/scanner/squash_files.t

files modified:
Build.PL

debian/changelog

debian/control

debian/copyright

lib/Config/Model/Backend/Dpkg/Control.pm

lib/Config/Model/Backend/Dpkg/Copyright.pm

lib/Config/Model/Backend/Dpkg/Patch.pm

lib/Config/Model/Backend/DpkgSyntax.pm

lib/Config/Model/Dpkg/Dependency.pm

lib/Config/Model/models/Dpkg/Control/Source.pl

lib/Config/Model/models/Dpkg/Copyright.pl

lib/Config/Model/models/Dpkg/Copyright/FileLicense.pl

lib/Config/Model/models/Dpkg/Copyright/LicenseSpec.pl

t/dependency-check.t

t/dpkg.t

t/model_tests.d/dependency-cache.txt

t/model_tests.d/dpkg-copyright-test-conf.pl

t/model_tests.d/dpkg-patch-test-conf.pl

t/model_tests.d/dpkg-patches-test-conf.pl

t/model_tests.d/dpkg-test-conf.pl

Show diffs side-by-side

added added

removed removed

lib/Dpkg/Copyright/Scanner.pm

package Dpkg::Copyright::Scanner ;

use strict;

use warnings;

use 5.20.0;

use IO::Pipe;

use Exporter::Lite;

use Array::IntSpan;

use feature qw/postderef signatures/;

no warnings qw/experimental::postderef experimental::signatures/;

our @EXPORT = qw(scan_files print_copyright);

my $whitespace_list_delimiter = $ENV{'whitespace_list_delimiter'} || "\n ";

# license and copyright sanitisation pilfered from Jonas's

# licensecheck2dep5 Originally GPL-2+, permission to license this

# derivative work to LGPL-2.1+ was given by Jonas.

# see https://lists.alioth.debian.org/pipermail/pkg-perl-maintainers/2015-March/084900.html

# Description: Reformat licencecheck output to copyright file format

# This program is free software; you can redistribute it and/or

# modify it under the terms of the GNU General Public License as

# published by the Free Software Foundation; either version 2, or (at

# your option) any later version.

# This program is distributed in the hope that it will be useful, but

# WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

# General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program. If not, see <http://www.gnu.org/licenses/>.

sub print_copyright ( %args ) {

my ($files, $copyrights_by_id) = scan_files(%args);

# split file path and fill recursive hash, leaf is id

my $split_files = {};

foreach my $path (keys %$files) {

__create_tree_leaf_from_paths ($split_files,$path,$files->{$path});

}

# regroup %files hash: all leaves have same id -> wild card

__squash($split_files);

# pack files by copyright id

my @packed = __pack_files($split_files);

my @out ;

foreach my $p (@packed) {

my ($id, @paths) = $p->@*;

my ($c,$l) = $copyrights_by_id->[$id]->@*;

next if $c eq 'no-info-found';

push @out,

"Files: ", join($whitespace_list_delimiter, @paths )."\n",

"Copyright: $c\n",

"License: $l\n", "\n";

}

if ($args{out}) {

$args{out}->spew_utf8( @out);

}

else {

binmode(STDOUT, ":utf8");

print @out;

}

# option to skip UNKNOWN ?

# load a file to override some entries ?

sub scan_files ( %args ) {

my @lines ;

if ($args{in}) {

@lines = $args{in}->lines_utf8; # for tests

}

else {

my $pipe = IO::Pipe->new();

$pipe->reader("licensecheck --copyright -m -r .");

binmode($pipe, ":utf8");

@lines = $pipe->getlines;

$pipe->close;

}

my %copyrights ;

my $files = {};

my $id = 0;

foreach my $line (sort @lines) {

chomp $line;

# say "found: $line";

100

my ($f,$l,$c) = split /\t/, $line;

101

$f =~ s!\./!!;

102

103

$l =~ s/([*?\\])/\\$1/g;

104

$l =~ s/\s*$unversioned\/unknown version$//;

105

$l =~ s/\s*$with incorrect FSF address$//;

106

$l =~ s/(\w+)\s+$v([^)]+) or v([^)]+)$/uc($1)."-$2 or ".uc($1)."-$3"/e;

107

$l =~ s/\s+$v([^)]+) or later$/-$1+/;

108

$l =~ s/\s+$v([^)]+)$/-$1/;

109

$l =~ s/^\s*(GENERATED FILE)/UNKNOWN/;

110

$l =~ s/\s+(GENERATED FILE)//;

111

$l =~ s/\bzlib\/libpng\b/Zlib/;

112

$l =~ s/\bMIT\/X11 $BSD like$/Expat/;

113

$l =~ s/^\s*BSD $(\d) clause$$/BSD-$1-clause/;

114

$l =~ s/^\s*public domain$/public-domain/i;

115

116

# this is very fragile. may need to change license-check to output license keyword

117

$l =~ s/ / and /g unless $l =~ /\bor\b/;

118

119

$c =~ s/'//g;

120

$c =~ s/^©\s*//;

121

$c =~ s/(?<=\b\d{4})\s*-\s*\d{4}(?=\s*-\s*(\d{4})\b)//g;

122

$c =~ s/(\d+)\s*-\s*(\d+)/$1-$2/g;

123

$c =~ s/\b(\d{4}),?\s+([\S^\d])/$1, $2/g;

124

$c =~ s/\s+by\s+//g;

125

$c =~ s/all\s+rights?\s+reserved[\s\.]*//gi;

126

$c = 'no-info-found' if $c =~ /^\*No/;

127

$c =~ s/$r$//g;

128

$c =~ s!^[\s,/*]|[\s,/*-]+$!!g;

129

130

$c = __pack_copyright($c);

131

132

$files->{$f} = $copyrights{$c}{$l} //= $id++;

133

}

134

135

my @copyrights_by_id ;

136

foreach my $c (sort keys %copyrights) {

137

foreach my $l (sort keys $copyrights{$c}->%* ) {

138

my $id = $copyrights{$c}{$l};

139

$copyrights_by_id[$id] = [ $c, $l ] ;

140

}

141

}

142

143

say "No copyright information found" unless keys %$files;

144

145

my $merged_c_info = __squash_copyrights_years (\@copyrights_by_id) ;

146

147

# replace the old ids with news ids

148

__swap_merged_ids($files, $merged_c_info);

149

150

# stop here for update ...

151

return ($files, \@copyrights_by_id) ;

152

}

153

154

sub __split_copyright ($c) {

155

my ($years,$owner) = $c =~ /([\s,\d-]+)(.*)/;

156

# say "undef year in $c" unless defined $years;

157

return unless defined $years;

158

my @data = split /(?<=\d)[,\s]+/, $years;

159

return unless defined $owner;

160

$owner =~ s/^[\s.,-]+|[\s,*-]+$//g;

161

return ($owner,@data);

162

}

163

164

sub __create_tree_leaf_from_paths ($h,$path,$value) {

165

# explode path in subpaths

166

my @subpaths = split '/', $path;

167

my $last = pop @subpaths;

168

map { $h = $h->{$_} ||= {} } @subpaths ;

169

$h->{$last} = $value;

170

}

171

172

sub __pack_copyright ($r) {

173

174

return $r if $r eq 'no-info-found';

175

my %cop;

176

$r =~ /^[\s\W]+|[\s\W]+$/g;

177

foreach my $c ( split( m!\s*/\s*!, $r)) {

178

my ($owner, @data) = __split_copyright($c);

179

return $r unless defined $owner;

180

$cop{$owner} ||= [] ;

181

push $cop{$owner}->@*, @data ;

182

}

183

my @res ;

184

foreach my $owner (sort keys %cop) {

185

my $span = Array::IntSpan->new();

186

my $data = $cop{$owner};

187

foreach my $year ($data->@*) {

188

return $r if $year =~ /[^\d-]/; # bail-out

189

# take care of ranges written like 2002-3

190

$year =~ s/^(\d\d\d)(\d)-(\d)$/$1$2-$1$3/;

191

# take care of ranges written like 2014-15

192

$year =~ s/^(\d\d)(\d\d)-(\d\d)$/$1$2-$1$3/;

193

eval {

194

$span->set_range_as_string($year, $owner);

195

};

196

return $r if $@; # invalid range

197

}

198

$span->consolidate();

199

push @res, $span->get_range_list. ', '. $owner;

200

}

201

return join("\n ",reverse sort @res);

202

}

203

204

#in each directory, pack files that have the same copyright/license information

205

# traverse recursively %h (whose structure matches the scanned directory)

206

# @path keeps track of the recursion depth to provide the file path

207

sub __pack_files ($h) {

208

209

my @res ;

210

__pack_dir($h,\@res) ;

211

212

# sort by first path listed in there

213

my $sort_path = sub {

214

$a->[1] cmp $b->[1];

215

};

216

217

return sort $sort_path @res ;

218

}

219

220

sub __pack_dir ($h, $pack, @path) {

221

my %pack_by_id;

222

foreach my $file (sort keys %$h) {

223

my $id = $h->{$file};

224

if (ref($id)) {

225

__pack_dir($id, $pack, @path, $file) ;

226

}

227

elsif (defined $pack_by_id{$id} ) {

228

push $pack_by_id{$id}->@*, join('/',@path,$file);

229

}

230

else {

231

$pack_by_id{$id} = [ join('/',@path,$file) ] ;

232

}

233

}

234

235

push $pack->@*, map { [ $_, $pack_by_id{$_}->@* ]; } keys %pack_by_id ;

236

}

237

238

# find ids that can be merged together

239

# I.e. merge entries with same license and same set of owners. In this

240

# case the years are merged together.

241

sub __squash_copyrights_years ($copyrights_by_id) {

242

243

my %id_year_by_same_owner_license;

244

for (my $id = 0; $id < $#$copyrights_by_id; $id++ ) {

245

my ($c,$l) = $copyrights_by_id->[$id]->@* ;

246

#say "id $id: c $c l $l";

247

my @owners ;

248

my @years ;

249

foreach my $line (split(/\n\s+/,$c)) {

250

my ($owner, @year) = __split_copyright($line);

251

next unless defined $owner;

252

push @owners, $owner;

253

push @years, join(',',@year);

254

}

255

my $k = join('|', $l, @owners);

256

$id_year_by_same_owner_license{$k} //= [];

257

push $id_year_by_same_owner_license{$k}->@*, [ $id, @years ];

258

}

259

260

my @merged_c_info;

261

# now detect where %id_year_by_same_owner_license references more

262

# than one id this means that several entries can be merged in a

263

# *new* id (new id to avoid cloberring data of other directories)

264

foreach my $owner_license (keys %id_year_by_same_owner_license) {

265

my @entries = $id_year_by_same_owner_license{$owner_license}->@* ;

266

next unless @entries > 1;

267

268

my ($l,@owners) = split /\|/, $owner_license;

269

270

# create new copyright info with coaslesced years

271

my @squashed_c = __coalesce_copyright_years(\@entries,\@owners) ;

272

next unless @squashed_c ; # give up this entry when problem

273

274

# store (c) info with coalesced years in new item of $copyrights_by_id

275

my $new_id = @$copyrights_by_id ;

276

$copyrights_by_id->[$new_id] = [ join("\n ",@squashed_c), $l ];

277

278

# fill the swap table entry-id -> coaslesces entry-id

279

foreach my $id ( map { $_->[0]} @entries) {

280

$merged_c_info[$id] = $new_id;

281

}

282

}

283

284

return \@merged_c_info;

285

}

286

287

sub __swap_merged_ids ($files, $merged_c_info) {

288

foreach my $name (sort keys %$files) {

289

my $item = $files->{$name};

290

if (ref($item)) {

291

__swap_merged_ids($item,$merged_c_info);

292

}

293

elsif (my $new_id = $merged_c_info->[$item]) {

294

$files->{$name} = "$new_id" ;

295

}

296

}

297

}

298

299

sub __coalesce_copyright_years($entries, $owners) {

300

my @ranges_of_years ;

301

# $entries and $owners always have the same size

302

303

foreach my $entry (@$entries) {

304

my ($id, @years) = $entry->@* ;

305

306

for (my $i = 0; $i < @years; $i++) {

307

return () if $years[$i] =~ /[^\d,\s-]/;

308

my $span = $ranges_of_years[$i] //= Array::IntSpan->new();

309

return () unless $span; # bail out in case of problems

310

$span->set_range_as_string($years[$i], 1);

311

}

312

}

313

314

my @squashed_c;

315

for (my $i=0; $i < @$owners ; $i++) {

316

$ranges_of_years[$i]->consolidate();

317

$squashed_c[$i] = $ranges_of_years[$i]->get_range_list.', '.$owners->[$i];

318

}

319

320

return @squashed_c;

321

}

322

323

# $h is a tree of hash matching the directory structure. Each leaf is a

324

# copyright id.

325

sub __squash ($h) {

326

my %count ;

327

328

# count the number of times each (c) info is used in this directory.

329

# (including the main (c) info of each subdirectory)

330

foreach my $name (sort keys %$h) {

331

my $item = $h->{$name};

332

if (ref($item)) {

333

# squash may return a plain id, or a hash with '*' => id ,

334

# or a non squashable hash

335

$h->{$name} = __squash($item);

336

}

337

my $id = (ref($item) and defined $item->{'*'}) ? $item->{'*'} : $item ;

338

339

# do not count non squashable hashes (i.e. there's no main (c) info)

340

if (not ref ($id)) {

341

$count{$id}//=0;

342

$count{$id} ++;

343

}

344

}

345

346

# find the most used (c) info in this directory (or the existing '*' entry)

347

my $max = 0;

348

my $max_id = $h->{'*'};

349

if (not defined $max_id) {

350

foreach my $id (sort keys %count) {

351

if ($count{$id} > $max) {

352

$max = $count{$id};

353

$max_id = $id ;

354

}

355

}

356

}

357

358

# all files associated to the most used (c) info are deleted to

359

# be represented by '*' entry

360

foreach my $name (sort keys %$h) {

361

my $item = $h->{$name};

362

if (ref($item) and defined $item->{'*'} and $item->{'*'} == $max_id) {

363

# delete ./item/* which is covered by ./*

364

delete $item->{'*'};

365

# delete ./item if no files with different (c) info are there

366

delete $h->{$name} unless keys $h->{$name}->%*;

367

}

368

if (not ref ($item)) {

369

# delete file that is represented by '*' entry

370

delete $h->{$name} if $item == $max_id;

371

}

372

}

373

# here's the '*' file representing the most used (c) info

374

$h->{'*'} //= $max_id if defined $max_id;

375

376

return $h;

377

}

378

379

380

381

__END__

382

383

=head1 NAME

384

385

Dpkg::Copyright::Scanner - Scan files to provide copyright data

386

387

=head1 SYNOPSIS

388

389

use Dpkg::Copyright::Scanner qw/print_copyright scan_files/;

390

391

# print copyright data on STDOUT

392

print_copyright;

393

394

# return a data structure containing copyright information

395

my @copyright_data = scan_files();

396

397

398

=head1 DESCRIPTION

399

400

This modules scans current package directory to extract copyright and

401

license information. Information are packed in a way to ease review and

402

maintenance. Files information is grouped with wildcards ('*') to reduce

403

the list of files.

404

405

=head1 METHODS

406

407

=head2 print_copyright

408

409

Print copyright information on STDOUT like L<scan-copyrights>.

410

411

=head2 scan_files

412

413

Return a data structure with copyright and license information.

414

415

The structure is a list of list:

416

417

[

418

[

419

[ path1 ,path2, ...],

420

421

license_short_name

422

423

...

424

]

425

426

Example:

427

428

[

429

[

430

[ '*' ],

431

'1994-2001, by Frank Pilhofer.',

432

'GPL-2+'

433

434

[

435

[ 'pan/*' ],

436

'2002-2006, Charles Kerr <charles@rebelbase.com>',

437

'GPL-2'

438

439

[

440

[

441

'pan/data/parts.cc',

442

'pan/data/parts.h'

443

444

'2002-2007, Charles Kerr <charles@rebelbase.com>',

445

'GPL-2'

446

447

]

448

449

=head1 Encoding

450

451

The output of L<licensecheck> is expected to be utf-8. Which means

452

that the source files scanned by L<licensecheck> should also be

453

encoded in utf-8. In practice, this will impact only copyright owner

454

name which may be garbled if comments are not encoded in utf-8.

455

456

=head1 BUGS

457

458

Extracting license and copyright data from unstructured comments is not reliable.

459

User must check manually the files when no copyright info is found or when the

460

license is unknown.

461

462

=head1 SEE ALSO

463

464

L<license_check>, C<licensecheck2dep5> from C<cdbs> package

465

466

=head1 AUTHOR

467

468

Dominique Dumont <dod@debian.org>

469

470

=cut

471

Older »