1
###############################################################################
2
# Note: Drizzle requires all files to include both copyright attribution and
3
# the license that the file is shared under. This file is widely distributed
4
# without any license. The below Apache License 2.0 is inferred from tracing
5
# back the original publication of this file to the pyjamas software package.
6
# I have added reference to Apache License so as to conform to Drizzle
7
# contribution guidelines.
9
# Original publication of this script:
10
# http://lists.debian.org/debian-devel/2009/09/msg00766.html
12
# The script in pyjamas SCM doesn't contain a copyright license, however
13
# pyjamas itself is licensed under Apache License 2.0:
14
# http://pyjamas.svn.sourceforge.net/viewvc/pyjamas/trunk/COPYING?revision=2
18
###############################################################################
20
# Copyright (C) 2009, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
22
# Licensed under the Apache License, Version 2.0 (the "License");
23
# you may not use this file except in compliance with the License.
24
# You may obtain a copy of the License at
26
# http://www.apache.org/licenses/LICENSE-2.0
28
# Unless required by applicable law or agreed to in writing, software
29
# distributed under the License is distributed on an "AS IS" BASIS,
30
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
# See the License for the specific language governing permissions and
32
# limitations under the License.
""" This is a debian copyright file checker. Put a debian/copyright
file conforming to http://dep.debian.net/deps/dep5/ in place and
this program tells you which copyright holders you missed.

Known limitations:

* for each section, you must put the full set of copyright holders.
  whilst the file lists are "carried over" i.e. later sections
  override earlier ones (see "remove_files()"), the same trick is
  NOT applied to copyright holders.

* the qgram algorithm is applied to do some fuzzy string matching.
  it's pretty good, but don't rely on it to be perfect.

* copyright year matching is done with the pattern
  [1-3][0-9][0-9][0-9], i.e. any four-digit number from 1000 to 3999.
  if a name happens to have such a number in it, on a line that
  happens to have the word "copyright" in it, it gets assumed to be
  a copyright notice.

* random sentences tacked onto the end of copyrights in files
  are assumed to be part of the copyright holders' name.

* copyrights are assumed to be in the first 80 lines of the file.

* if the file doesn't _have_ a copyright notice, this program can't
  bloody well find it, can it??
"""
import glob
import os
import re
import sys
from string import strip
# qgram: a way to "roughly" match words.  you're supposed to set splitlen
# to half the length of the average word, but 3 is good enough.
def qgram_set(word, splitlen):
    """Return the set of q-grams (substrings of length splitlen) of word.

    The word is padded on both sides with (splitlen - 1) NUL characters
    before it is cut up, so q-grams that overlap the start and the end
    of the word are produced as well.

    word     -- the string to split.
    splitlen -- length of each q-gram.

    Returns a set of strings, each exactly splitlen characters long.
    """
    pad = '\0' * (splitlen - 1)
    word = pad + word + pad
    # NOTE(review): range(len(word) - splitlen) stops one position short
    # of the final q-gram (the last character followed only by padding).
    # It is kept as-is because every similarity score in this program is
    # computed on top of it.
    return set(word[idx:idx + splitlen]
               for idx in range(len(word) - splitlen))
def qgram(word1, word2, splitlen=3):
    """Return the q-gram similarity of two words as a float.

    Both words are split with qgram_set(); the score is the number of
    q-grams they share divided by the number of distinct q-grams in
    either, so 1.0 means identical q-gram sets and 0.0 means nothing
    in common.

    word1, word2 -- the strings to compare.
    splitlen     -- q-gram length handed to qgram_set() (default 3).
    """
    s1 = qgram_set(word1, splitlen)
    s2 = qgram_set(word2, splitlen)
    union_size = len(s1 | s2)
    if union_size == 0:
        # Both sets are empty (only possible for very short words with a
        # splitlen below 3): there is no evidence of a match, so report
        # none instead of dividing by zero.
        return 0.0
    return float(len(s1 & s2)) / float(union_size)
def truncate_qgram(word1, word2):
    """Return the best q-gram score of the shorter word against the longer.

    The longer word is cut down to every prefix length from
    len(shorter) up to its full length; each prefix is scored against
    the shorter word with qgram() and the highest score is returned.

    word1, word2 -- the strings to compare, in either order.
    """
    # NOTE(review): the swap, the initial score and the return statement
    # were missing from the damaged listing this was restored from; they
    # are the minimal statements that make the visible loop well-formed.
    # Confirm against the upstream copy.
    if len(word1) > len(word2):
        # make word1 the shorter one so the range below is never empty
        word1, word2 = word2, word1
    qg = 0
    for i in range(len(word1), len(word2) + 1):
        qg = max(qgram(word1, word2[:i]), qg)
    return qg
# check_match(word, word_list): score `word` against every entry of
# `word_list` with truncate_qgram() and sort the pairs into two collections.
# Each pair is stored as the tuple (word, word2); the function returns
# (matches, not_matches).
#
# NOTE(review): this listing is extraction-damaged. The bare numbers are
# leftover line numbers, indentation has been lost, and several statements
# are not visible: the ones that create `matches` and `not_matches`, and the
# test that compares `match` with a threshold to choose between the two
# add() calls. Restore them from the upstream file -- the threshold value
# cannot be recovered from what is shown here.
100
def check_match(word, word_list):
103
for word2 in word_list:
104
# fuzzy similarity score between `word` and this candidate
match = truncate_qgram(word, word2)
106
# recorded when the (missing) threshold test accepts the pair
matches.add((word, word2))
107
#print "In check_match matched: \"%s\" - \"%s\"" % (word,word2)
109
#print "In check_match NOT matched: \"%s\" - \"%s\"" % (word,word2)
110
# recorded when the (missing) threshold test rejects the pair
not_matches.add((word, word2))
111
return matches, not_matches
# sanitise(copyright): clean up the text of a copyright notice.
# Visible steps: drop a leading ':', keep only what follows the marker held
# in `co`, and cut the phrases "some rights reserved" and
# "all rights reserved" out of the text (all searches are done on a
# lower-cased copy, so they are case-insensitive).
#
# NOTE(review): this listing is extraction-damaged. The bare numbers are
# leftover line numbers, indentation has been lost, and several statements
# are not visible: the early return for an empty string, the assignment of
# `co`, the guards that presumably test each find() result for -1 before
# slicing, the return statement(s), and most of the code after the
# "something not quite right" comment that builds `res`. Restore them from
# the upstream file before relying on this function.
113
def sanitise(copyright):
114
# empty input: the body of this branch is not visible here
if len(copyright) == 0:
116
# strip a leading colon and the whitespace around what follows it
if copyright[0] == ':':
117
copyright = copyright[1:].strip()
119
# position of the marker `co` (defined on a line not visible here)
fco = copyright.lower().find(co)
121
# keep only the text after the marker
copyright = copyright[fco+len(co):]
122
srrs = "some rights reserved"
123
srr = copyright.lower().find(srrs)
125
# splice the phrase out, keeping the text on both sides of it
copyright = copyright[:srr] + copyright[srr+len(srrs):]
126
arrs = "all rights reserved"
127
arr = copyright.lower().find(arrs)
129
# splice the phrase out, keeping the text on both sides of it
copyright = copyright[:arr] + copyright[arr+len(arrs):]
131
# hmmm... something not quite right here...
139
# drop the falsy entries from `res` (built on lines not visible here)
res = filter(lambda x:x, res)
def find_file_copyright_notices(fname):
    """Return the set of copyright notices found near the top of a file.

    Only the first 80 lines are examined.  A line yields a notice when
    it contains the word "copyright" (in any case) and the text after
    that word, once passed through sanitise(), contains something that
    looks like a year.

    fname -- path of the file to scan.

    Returns a set of sanitised notice strings (empty if none was found).
    """
    pattern = re.compile('[1-3][0-9][0-9][0-9]')
    marker = "copyright"
    notices = set()
    # NOTE(review): the "marker not found" guard, the result container
    # and the return statement were missing from the damaged listing this
    # was restored from; confirm against the upstream copy.
    with open(fname) as f:  # closed on exit, even if sanitise() raises
        for lineno, l in enumerate(f):
            if lineno >= 80:  # hmmm, assume copyright to be in first 80 lines
                break
            idx = l.lower().find(marker)
            if idx < 0:
                continue
            notice = sanitise(l[idx + len(marker):].strip())
            # hmm, do a quick check to see if there's a year,
            # if not, skip it
            if not pattern.search(notice):
                continue
            notices.add(notice)
    return notices
def skip_file(fname):
    """Return True if the file or directory name should be left out of the scan.

    Skipped are version-control and editor swap entries (names starting
    with ".svn", ".git" or ".sw"), the entry named exactly "output",
    anything with "PureMVC_Python_1_0" in its name, and files ending in
    ".pyc", ".java", ".test", ".result" or ".master.opt".

    fname -- the name to test.
    """
    # NOTE(review): the branch bodies were missing from the damaged listing
    # this was restored from; "return True" per test and a final
    # "return False" is the reading implied by the function name.
    if fname.startswith((".svn", ".git", ".sw")):
        return True
    if fname == "output":  # no thanks
        return True
    if "PureMVC_Python_1_0" in fname:  # no thanks
        return True
    # ehmm.. no.
    return fname.endswith((".pyc", ".java", ".test", ".result", ".master.opt"))
# NOTE(review): fragment only. The `def` line of this function is not
# visible in this extraction-damaged listing; it is presumably the
# get_files(d) that the directory-expansion code calls. The bare numbers are
# leftover line numbers, indentation has been lost, and the statements
# between the visible lines (result container, branch bodies, return) are
# missing. Restore them from the upstream file.
185
# visit every entry directly inside directory `d`
for p in glob.glob(os.path.join(d, "*")):
188
# split the entry into its directory part and its base name
(pth, fname) = os.path.split(p)
191
# symbolic links get their own branch; its body is not visible here
if os.path.islink(p):
# NOTE(review): fragment only. The `def` line of this function is not
# visible in this extraction-damaged listing; it is presumably the
# get_dir(match) that the script body calls with each file pattern. The bare
# numbers are leftover line numbers, indentation has been lost, and the
# statements between the visible lines (result container, branch bodies,
# return) are missing. Restore them from the upstream file.
201
# expand the glob pattern `match` and visit every path it yields
for d in glob.glob(match):
204
# symbolic links get their own branch; its body is not visible here
if os.path.islink(d):
207
(pth, fname) = os.path.split(d)
208
# collect what get_files() finds for this path
expath = get_files(d)
class DebSect:
    """One "Files:" section of the debian/copyright file.

    Holds the files the section covers, the copyright holders the
    section declares, and the copyright notices actually found in those
    files, so that the two can be compared.

    NOTE(review): the class statement and a handful of short statements
    (marked below) were missing from the damaged listing this was
    restored from; confirm against the upstream copy.
    """

    def __init__(self, pattern, files):
        """Create a section.

        pattern -- the file pattern(s) of the section, kept for reporting.
        files   -- list of file paths the pattern(s) expanded to.
        """
        self.file_pattern = pattern
        # restored: every other method reads self.files
        self.files = files
        # holders declared in debian/copyright (filled in by the caller)
        self.copyrights = set()
        # notices found in the files themselves
        self.listed_copyrights = set()
        # notice -> set of file names it was found in
        self.files_by_author = {}

    def read_files_for_copyrights(self):
        """Scan every file of the section for copyright notices and report.

        Fills self.listed_copyrights and self.files_by_author, then
        prints the pattern, the declared holders and the notices found.
        """
        for fname in self.files:
            if fname.endswith("copyright_check.py"):  # skip this program!
                continue
            if fname == 'debian/copyright':  # skip this one duh
                continue
            cops = find_file_copyright_notices(fname)
            self.listed_copyrights.update(cops)
            for c in cops:
                self.files_by_author.setdefault(c, set()).add(fname)
        # single-argument print(...) produces the same output on Python 2
        # and Python 3
        print("Pattern %s" % (self.file_pattern,))
        # Copyrights found in the master copyright file
        for author in self.copyrights:
            print("Copyright: \"%s\"" % author)
        # Copyrights found in the source file
        for author in self.listed_copyrights:
            print("Listed Copyright: \"%s\"" % author)

    def remove_files(self, to_remove):
        """Drop every file named in to_remove from this section.

        The list held in self.files is modified in place.  Every
        occurrence of a listed path is removed, so a path matched by two
        patterns of the same section does not survive the removal.
        """
        doomed = set(to_remove)
        self.files[:] = [fname for fname in self.files if fname not in doomed]

    def check_copyright_matches(self):
        """Report the notices found in files that no declared holder matches.

        Fills self.matches and self.not_matches and prints one entry,
        followed by the files concerned, for every unmatched notice.
        """
        self.matches = set()  # restored: updated below
        self.not_matches = set()
        for author in self.listed_copyrights:
            # NOTE(review): this compares the notices found in the files
            # with each other, not with self.copyrights; a notice only
            # reaches the check against the declared holders through the
            # not_matches of that first comparison.  Confirm this is
            # intended.
            matches, not_matches = check_match(author, self.listed_copyrights)
            self.matches.update(matches)
            for (word1, word2) in not_matches:
                matches1, not_matches1 = check_match(word2, self.copyrights)
                if len(matches1) > 0:
                    continue  # restored: a declared holder matches word2
                self.not_matches.add(word2)
        # NOTE(review): a few statements just before this banner were
        # missing from the damaged listing; it may originally have been
        # printed only when something was unmatched.
        print(" ** ** ** ** **")
        for m in self.not_matches:
            print(" ** No matches found for: \"%s\"" % m)
            for fname in self.files_by_author[m]:
                print(" ** ** ** ** ** in source file: \"%s\"" % fname)
272
#############################################################################
274
#############################################################################
# Script body: parse the master copyright file (the path given as the first
# command-line argument, otherwise "debian/copyright"), build one DebSect per
# "Files:" line with the licenses and copyright holders that belong to it,
# remove from earlier sections the files that later sections also list, then
# let every section scan its files and report.
#
# NOTE(review): this listing is extraction-damaged. The bare numbers are
# leftover line numbers, indentation has been lost, and several statements
# are not visible: the creation of `copyright_sects` and `listed_files`, the
# statement that presumably splits the "Files:" line `l` into patterns, the
# `else:` of the argument test, the guards around the `current_debsect`
# assignments, whatever ends each per-line branch, and the inner loop that
# defines `j`. Restore them from the upstream file.
276
all_listed_files = []
278
# read debian/copyright file and collect all matched files,
279
# copyrights and licenses
280
current_debsect = None
281
current_copyrights = set()
282
current_licenses = set()
284
# if argument supplied then read that file instead of the default
285
if len(sys.argv) > 1:
286
dc = open(sys.argv[1])
287
print "Parsing %s" % sys.argv[1]
289
# default location (the `else:` this belongs to is not visible here)
dc = open("debian/copyright")
291
# Read the master copyright file and find all the License, Copyright and File sections
292
# Build up a list of licenses and copyrights to compare against later
294
# For a file or set of files that we find listed in the master copyright file,
295
# build up a list of files to compare its copyright strings against the master list
296
# of copyright strings
297
for l in dc.readlines():
298
if l.startswith("License:"):
299
# keep the text after the "License:" prefix, whitespace-stripped
current_licenses.add(strip(l[8:]))
301
if l.startswith("Copyright:"):
302
# keep the text after the "Copyright:" prefix, stripped and sanitised
current_copyrights.add(sanitise(strip(l[10:])))
304
if not l.startswith("Files:"):
307
# hand the licenses and holders collected so far to the current section,
# then start collecting afresh
current_debsect.licenses = current_licenses
308
current_debsect.copyrights = current_copyrights
309
current_copyrights = set()
310
current_licenses = set()
314
# list of files can include wildcards e.g. 'drizzled/*'
315
# presumably `l` is a list of fields by now (split not visible) -- TODO confirm
for pattern in l[1:]:
317
# drop a trailing comma from the pattern
if pattern[-1] == ',':
318
pattern = pattern[:-1]
319
files = get_dir(pattern)
320
listed_files += files
321
all_listed_files += files
322
current_debsect = DebSect(l[1:], listed_files)
323
copyright_sects.append(current_debsect)
326
# the last section still needs the licenses and holders collected for it
current_debsect.copyrights = current_copyrights
327
current_debsect.licenses = current_licenses
331
# remove already-matching: further down takes precedence
332
for i in range(1, len(copyright_sects)):
334
#print i, j, copyright_sects[i].file_pattern, copyright_sects[j].file_pattern
335
copyright_sects[j].remove_files(copyright_sects[i].files)
337
# note: `dc` is rebound here from the open file to each section in turn
for dc in copyright_sects:
338
dc.read_files_for_copyrights()
339
dc.check_copyright_matches()
342
#def check_in(l1, l2):
345
# if fname not in l2:
349
#not_in = check_in(all_files, listed_files)
350
#for fname in not_in:
353
#print check_in(listed_files, all_files)