detector.py
# -*- coding: utf-8 -*-
###############################################################################
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# A module to detect licenses in blocks of text, and split them into their
# component parts.
#
# The key API is "get_license_info", which takes a filename and returns
# information about its license(s). Pass 'details': True in the params dict
# given to the Detector constructor to also get the copyright lines and the
# license text itself.
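#
# A rough usage sketch (illustrative only; "my_license_data" and the file
# path below are placeholders, not part of this module):
#
#     detector = Detector(my_license_data, {'details': True})
#     for lic in detector.get_license_info("src/example.c"):
#         print lic['tag']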
###############################################################################
import re
import logging
import copy
import utils
import config
logging.basicConfig(filename="slic.log")
log = logging.getLogger("slic")
DEFAULT_MAX_LINES_IN_LICENSE = 50
# This number is fairly performance-sensitive
MAX_SCAN_BYTES = 32768
MAX_GAP_LINES = 200
class Detector(object):
def __init__(self, license_data, params={}):
"""Set up the class's internal data"""
self._group_names_to_tags = {}
self._flat_license_data = {}
self._license_data = copy.deepcopy(license_data)
self._preprocess(self._license_data, None)
self._details = params.get('details', False)
# Things to ignore on a line - not a copyright line, and not the
# license
self._cruft_re = re.compile("""Derived\ from
|Target\ configuration
|[Cc]ontributed\ by
|File:
|File\ speex
|Authors?:
|[Vv]ersion
|Written\ by
|Linux\ for
|You\ can\ look
|available\ under
""", re.VERBOSE)
def _preprocess(self, license_data, parent):
"""This function, called recursively, prepares the data structure the
detector will use. It does some sanity-checking, then caches compiled
versions of all the necessary regexps at the right point in the
structure.
"""
matches = []
for (tag, info) in license_data.iteritems():
if not tag:
raise Exception("Missing tag in license data")
if re.match("_", tag):
raise Exception("Hit tag %s starting with underscore" % tag)
# Bad things happen if we use the same name twice; detect this
# condition and bail so it can be fixed. This won't detect all
# instances because if they are at the same level, the later
# definition silently overrides the former before we even see the
# data. But it helps.
if tag in self._flat_license_data:
raise Exception("Duplicate tag %s in license data" % tag)
else:
self._flat_license_data[tag] = info
if parent is not None:
info['_parent'] = parent
# Python regexp group names have to be identifiers, but we want
# the freedom to use more characters than this in tags. So we
# make a compatible group name, and have a hash to map back.
groupname = re.sub("[^a-zA-Z0-9_]", "_", tag)
self._group_names_to_tags[groupname] = tag
matches.append("(?P<" + groupname + ">" + info['match'] + ")")
# Compile or find the appropriate bits of data for determining
# the extent of the license block
if 'start' in info:
info['_start_re'] = re.compile(info['start'])
else:
self._fill_in_from_parent(info, '_start_re')
if 'end' in info:
info['_end_re'] = re.compile(info['end'])
else:
self._fill_in_from_parent(info, '_end_re')
if 'maxlines' not in info:
info['maxlines'] = DEFAULT_MAX_LINES_IN_LICENSE
if 'cancel' in info:
info['cancel'] = set(info['cancel'])
# Python "only supports 100 named groups", although it seems to
# actually count all capturing groups, named or otherwise. So we must
# always use non-capturing groups - (?: ... ) - in the 'match' regexps
# otherwise we'll hit this error.
#
# Our structure certainly has more than this many entries at the top
# level, so we create a set of regexps and apply them sequentially.
#
# We store the text form of the regexp for debugging purposes.
grouplimit = 99
license_data['_match_res'] = []
license_data['_match_res_text'] = []
while len(matches) > grouplimit:
section = matches[:grouplimit]
matches = matches[grouplimit:]
text = "|".join(section)
license_data['_match_res_text'].append(text)
license_data['_match_res'].append(re.compile(text))
text = "|".join(matches)
license_data['_match_res_text'].append(text)
license_data['_match_res'].append(re.compile(text))
for (tag, info) in license_data.iteritems():
# Recurse if necessary
if 'subs' in info:
self._preprocess(info['subs'], info)
def _fill_in_from_parent(self, info, key):
"""If a member is not present, find the nearest present value from
the parents, or default to the 'match' member at the top level.
"""
retval = None
pointer = info
while pointer and retval is None:
if key in pointer:
retval = pointer[key]
else:
if '_parent' in pointer:
pointer = pointer['_parent']
else:
# Top level
retval = re.compile(info['match'])
if retval is None:
log.warning("_fill_in_from_parent found None; info: %r" % info)
info[key] = retval
def get_license_info(self, filename):
"""Find the license or licenses in a file. Returns a list of license
objects. The only guaranteed value in a license object is the 'tag',
which may be 'none'.
"""
fin = open(filename, 'r')
try:
content = fin.read(MAX_SCAN_BYTES)
finally:
fin.close()
try:
content = content.decode('utf-8')
except UnicodeDecodeError:
content = content.decode('iso-8859-1')
log.info("Processing: %s", filename)
licenses = []
# Get comment delimiter info for this file.
comment_delim_sets = config.get_delims(filename)
if not comment_delim_sets:
# We can't handle this type of file
log.warning("No comment delimiters for file %s" % filename)
return []
lines = content.splitlines()
for delims in comment_delim_sets:
log.debug("Trying delims: %r", delims)
start_line = 0
end_line = 0
most_recent_end_line = 0
# We break out if any of the following are true:
#
# * The delim is "" and we have executed the loop once
# * We have run out of comments in the file
# * We have found at least one license and the most recent one was
# more than MAX_GAP_LINES ago
while 1:
if most_recent_end_line and start_line - most_recent_end_line > MAX_GAP_LINES:
log.debug("Ending: > MAX_GAP_LINES without license")
break
if delims[0] == '':
comment = lines
else:
(start_line, end_line) = self._find_next_comment(end_line,
lines,
delims)
if start_line == -1:
# No more comments; try next delim
log.debug("Ending: no more comments")
break
comment = lines[start_line:end_line]
comment = self._strip_comment_chars(comment, delims)
log.debug("Matching against:\n%s\n" % comment)
# We have a comment - is it a license block?
tags = self._find_license(comment)
if tags is not None:
# It is.
most_recent_end_line = end_line
for tag in tags:
license = {
'tag': tag
}
if self._details:
# Store away the info about the license for this
# file
(copyrights, text) = \
self._find_details(comment, tag)
copyrights = self._clean_copyrights(copyrights)
# De-dupe identical copyright lines
copyrights_dict = {}
for c in copyrights:
copyrights_dict[c] = 1
license['copyrights'] = copyrights_dict
license['text'] = text
licenses.append(license)
if delims[0] == '':
# We did the whole file in one go; try next delim
log.debug("Ending: blank delimiter so just one pass")
break
if licenses:
# Once we've found at least one license, we assume all licenses
# use the same delimiters, so we don't try any further delim sets.
break
if not licenses:
# We also note if a comment is "suspicious" - in other words,
# if we don't detect a license but there is a suspicious
# comment, it suggests we should check the file by hand to
# see if our script needs improving.
#
# There are a lot of files which are Copyright AOSP and nothing
# else. The distinction made here is so we can eliminate false
# positives for suspicion.
tag = "none"
text = None
if re.search("[Cc]opyright", content):
tag = "suspiciousCopyright"
if re.search("Copyright[\d\s,\(chC\)-]+The Android Open Sourc",
content):
tag = "suspiciousAndroid"
elif re.search("Copyright[\d\s,\(chC\)-]+Microsoft Corp",
content):
tag = "suspiciousMicrosoft"
# Things more likely to have an actual license text
else:
match = re.search("[Ll]icen[cs]e|[Pp]ermi(t|ssion)|[Rr]edistribu",
content)
if match:
tag = "suspiciousLicensey"
# text = match.group(0)
license = { 'tag': tag }
if text is not None:
license['text'] = text
licenses.append(license)
log.debug("Filename: %s" % filename)
log.debug("get_license_info returned: %r" % licenses)
return licenses
def _find_next_comment(self, starting_from, lines, delims):
"""Returns the first line which is part of the next comment in the
block, and the first line which is not (which can therefore be fed
straight back in as the new starting_from value). Returns start_line of
-1 if no further comment found.
"""
end_line = starting_from
start_line = -1
start_re = re.compile("^\s*%s" % re.escape(delims[0]))
invert_end = False
if len(delims) == 3:
end_re = re.compile(re.escape(delims[2]))
elif len(delims) == 1:
# This regexp actually looks for lines which _are_ in the comment,
# because negative lookahead assertions are 2x slower. Hence the
# need for result inversion.
end_re = re.compile("^\s*(%s|$)" % re.escape(delims[0]))
invert_end = True
# Find start
for i in range(starting_from, len(lines)):
match = start_re.search(lines[i])
if match:
start_line = i
log.debug("Found start line: %i", i)
break
# No more comments
if start_line == -1:
log.debug("No start line found - EOF")
return -1, None
# Find end
found_end = False
# Begin on the same line to account for single-line /* */
for i in range(start_line, len(lines)):
match = end_re.search(lines[i])
end_line = i
if invert_end:
match = not match
if match:
log.debug("Found end line: %i", end_line)
found_end = True
break
if start_line == end_line and len(delims) == 3:
# Single-line comment of /* */ type. There could be a set of them;
# fast-forward and see.
while len(lines) > end_line + 1 and \
start_re.search(lines[end_line + 1]) and \
end_re.search(lines[end_line + 1]):
log.debug("Including next line from set of /* */ comments")
end_line = end_line + 1
log.debug("Adjusted end line: %i", end_line)
if len(delims) == 3 or \
(len(delims) == 1 and not found_end):
# In these two cases, we are actually on the last comment line
# so...
log.debug("Adding 1 to end_line")
end_line += 1
assert start_line != end_line
return start_line, end_line
def _clean_copyrights(self, copyrights):
"""Clean up individual copyright lines"""
for i in range(len(copyrights)):
copyrights[i] = utils.collapse(copyrights[i])
# Remove end cruft
copyrights[i] = re.sub("[\*#\s/]+$", "", copyrights[i])
return copyrights
def _strip_comment_chars(self, comment, delims):
"""Remove all the starting (and ending, if appropriate) comment chars
from a block comment, to leave just the text.
"""
prefix = delims[0]
if len(delims) == 3:
cont = delims[1]
suffix = delims[2]
elif len(delims) == 1:
cont = delims[0]
suffix = None
else:
raise Exception("Invalid delimiter length in delims: %s" % delims)
# Strip prefix
prefix_re = re.compile("^\s*%s\s?" % re.escape(prefix))
comment[0] = re.sub(prefix_re, "", comment[0])
# Strip suffix
if suffix:
suffix_re = re.compile("\s*%s" % re.escape(suffix))
comment[-1] = re.sub(suffix_re, "", comment[-1])
# If this is a multi-line comment but the suffix appears on the
# first line, it's of the form where every line is its own
# mini-comment. (This happens most often with /* */.) If so, change
# the "cont" to the prefix so that it gets stripped off all lines.
if re.search(suffix_re, comment[0]) and len(comment) > 1:
log.debug("Multi-line comment with prefix/suffix on each line")
cont = prefix
# Allow multiple occurrences of cont char or last cont char
cont_re = re.compile("^\s*%s+\s?" % re.escape(cont))
for i in range(1, len(comment)):
# Strip continuation char
comment[i] = re.sub(cont_re, "", comment[i])
# Strip trailing whitespace and cruft
# (Also */ terminators from comments where each line is a single
# "multi-line" comment)
comment[i] = re.sub("[\*\/#\s]*$", "", comment[i])
return comment
def _find_license(self, comment):
"""Find all matching licenses in a particular comment. Entry function
for the recursively-called function below. Returns a sorted list.
"""
retval = None
linear_comment = " ".join(comment)
linear_comment = utils.collapse(linear_comment)
# log.debug("Looking in text: '%s'\n\n" % linear_comment)
tags = self._find_license_against(self._license_data, linear_comment)
if len(tags):
# Remove all tags that other tags cancel
for tag in tags.copy():
data = self._flat_license_data[tag]
if 'cancel' in data:
tags.difference_update(data['cancel'])
# Also permit "Ignore" semantics
retval = [tag for tag in tags if not tag.startswith("Ignore_")]
retval.sort()
log.info("Found license(s): %s" % "/".join(retval))
else:
log.debug("No license found in comment")
return retval
def _find_license_against(self, license_data, comment):
"""Recursive function to precisely identify all matching licenses in
a particular comment. Recurses to get more specific. Returns a set.
"""
tags = set()
retval = None
# For each regexp (remember, they are split up due to limits in
# Python)...
for match_re in license_data['_match_res']:
# For each match found...
for match in match_re.finditer(comment):
# For each actual hit in the match object...
hits = match.groupdict()
for hit in match.groupdict():
if hits[hit] is not None:
# Make a note of it in a de-duping hash
log.debug("Hit: %s" % hit)
tags.add(self._group_names_to_tags[hit])
for tag in tags.copy():
log.debug("Found license %s" % tag)
if 'subs' in license_data[tag]:
log.debug("Checking for sub-types")
newtags = self._find_license_against(license_data[tag]['subs'],
comment)
if len(newtags):
log.debug("Replacing license %s with %r" % (tag, newtags))
tags.discard(tag)
tags.update(newtags)
else:
log.debug("Sticking with base flavour")
return tags
def _find_details(self, text, tag):
"""Given a comment (array of lines) and a license tag, find the
license text block corresponding to that license in the comment.
Also extract any copyright lines. The incoming comment text should
have already been stripped of comment markers.
Heuristics galore.
"""
license = []
copyrights = []
in_copyrights = False
info = self._flat_license_data[tag]
start_line = -1
end_line = -1
# Find copyrights and start
for i in range(len(text)):
line = text[i]
if start_line == -1 and info['_start_re'].search(line):
log.debug("First license line: %s" % line)
start_line = i
in_copyrights = False
# If we break here, we only find copyrights written above the
# license. If we don't, we end up combining copyrights when
# there are multiple licenses in a file :-| No good option.
break
log.debug("Line: %s" % line)
# This check is in two parts because the first check is a lot
# cheaper than the second
if re.search("[Cc]opyright", line):
# The second half of the conditional attempts to catch the
# (erroneous) form where the person puts their name before
# the date or copyright symbol
if re.search("[Cc]opyright ?[\d\(©]", line) or \
re.search("[Cc]opyright.{0,50}?\d{4}", line):
log.debug("Copyright line: %s" % line)
copyrights.append(line)
in_copyrights = True
continue
if in_copyrights:
if re.search("^\s*$", line):
log.debug("Blank line (while in copyrights)")
# Blank line
in_copyrights = False
elif re.search("^\s*(\d{4}|©|\([Cc]\))", line):
log.debug("Another (C) line starting with year or symbol")
copyrights.append("Copyright " + line)
elif self._cruft_re.search(line):
log.debug("Line with ignorable cruft")
in_copyrights = False
else:
# Continuation line of previous copyright line
log.debug("CopyConti line: %s" % line)
copyrights[-1] = copyrights[-1] + " " + line
if start_line == -1:
log.warning("Can't find start line for license '%s'!" % tag)
return [], []
# Find license end, starting from text end
end_line = -1
for i in range(len(text) - 1, -1, -1):
line = text[i]
if info['_end_re'].search(line):
log.debug("Last license line: %s" % line)
end_line = i
if (end_line - start_line < info['maxlines']):
# If the license seems too long, keep looking in case
# there's a nearer end line, otherwise break. This deals
# with files where there are multiple copies of the license
# text, e.g. concatenated files.
break
else:
if end_line == -1:
log.warning("Can't find end line for license '%s'!" % tag)
end_line = len(text)
log.debug("License extent: line %i to %i" % (start_line, end_line))
license = text[start_line:end_line + 1]
license = self._remove_initial_rubbish(license)
copyrights = self._remove_initial_rubbish(copyrights)
return copyrights, license
def _remove_initial_rubbish(self, comment):
"""While comment chars have been removed, some license blocks still
have repeated cruft at the start of the line (often a different
type of comment char. Or they have leading whitespace.
We can't just remove all leading whitespace line-by-line as that can
mess up formatting. However, we can remove any common prefix of
whitespace or random rubbish. For the moment, take whatever's on
the first line off every line.
"""
if not comment:
return comment
match = re.search("^([\s\*#\-/]+)", comment[0])
if match:
rubbish = match.group(0)
# Last char is made optional; it can be pre-text whitespace which
# doesn't appear on blank lines
rubbish_re = re.compile("^" + re.escape(rubbish) + "?")
for i in range(len(comment)):
comment[i] = re.sub(rubbish_re, "", comment[i])
return comment