]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - utils/analyzer/CmpRuns.py
Vendor import of clang trunk r338150:
[FreeBSD/FreeBSD.git] / utils / analyzer / CmpRuns.py
1 #!/usr/bin/env python
2
3 """
4 CmpRuns - A simple tool for comparing two static analyzer runs to determine
5 which reports have been added, removed, or changed.
6
7 This is designed to support automated testing using the static analyzer, from
8 two perspectives:
9   1. To monitor changes in the static analyzer's reports on real code bases,
10      for regression testing.
11
12   2. For use by end users who want to integrate regular static analyzer testing
13      into a buildbot like environment.
14
15 Usage:
16
17     # Load the results of both runs, to obtain lists of the corresponding
18     # AnalysisDiagnostic objects.
19     #
20     resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
21     resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)
22
23     # Generate a relation from diagnostics in run A to diagnostics in run B
24     # to obtain a list of pairs (a, b).
25     diff = compareResults(resultsA, resultsB, opts)
26
27 """
28
29 from collections import defaultdict
30
31 from math import log
32 from optparse import OptionParser
33 import json
34 import os
35 import plistlib
36 import re
37 import sys
38
# Matches the literal text "Statistics: " followed by a brace-delimited blob
# and captures that blob, braces included. re.DOTALL lets "." cross newlines,
# and ".+" is greedy, so the capture runs from the first "{" to the last "}".
STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)
40
class Colors:
    """
    Terminal escape sequences used to highlight text.
    """
    # Sequence appended after highlighted text to end the highlighting.
    CLEAR = '\x1b[0m'
    # Highlight sequences; each one is meant to be followed by CLEAR.
    GREEN = '\x1b[6;30;42m'
    RED = '\x1b[2;30;41m'
48
class SingleRunInfo:
    """
    Information about an analysis run.

    path - the analysis output directory
    root - the name of the root directory, which will be disregarded when
           determining the source file name
    verboseLog - stored unchanged on the instance
    """
    def __init__(self, path, root="", verboseLog=None):
        # Trailing '/' and '\' characters are removed from the root.
        trimmedRoot = root.rstrip("/\\")
        self.verboseLog = verboseLog
        self.root = trimmedRoot
        self.path = path
58
59
class AnalysisDiagnostic:
    """
    A single diagnostic taken from an analyzer report.

    data - the diagnostic's dictionary; 'location' and 'path' must be
           present, 'category' and 'description' are read on demand, and
           'issue_context' / 'issue_hash_content_of_line_in_context' are
           optional
    report - the owning report; its 'files' table and 'run' (with 'root'
             and 'path' attributes) are consulted
    htmlReport - name of the HTML report relative to the run's path, or None
    """
    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport
        # The path length is the number of entries in the 'path' list.
        self._reportSize = len(self._data['path'])

    def getFileName(self):
        """
        Return the diagnostic's source file name with the run's root
        directory removed from the front when the file lies under it.
        """
        root = self._report.run.root
        fileName = self._report.files[self._loc['file']]
        if not root or not fileName.startswith(root):
            return fileName
        remainder = fileName[len(root):]
        if root[-1] in "/\\":
            # The root already ends with a separator; nothing more to drop.
            return remainder
        # Only strip the root when it ends on a path component boundary, so
        # that a root of "/src" does not mangle a file such as "/srcfoo/a.c".
        if remainder and remainder[0] not in "/\\":
            return fileName
        # Drop the separator that follows the root.
        return remainder[1:]

    def getLine(self):
        """Return the 'line' entry of the diagnostic's location."""
        return self._loc['line']

    def getColumn(self):
        """Return the 'col' entry of the diagnostic's location."""
        return self._loc['col']

    def getPathLength(self):
        """Return the number of entries in the diagnostic's 'path' list."""
        return self._reportSize

    def getCategory(self):
        """Return the diagnostic's 'category' entry."""
        return self._data['category']

    def getDescription(self):
        """Return the diagnostic's 'description' entry."""
        return self._data['description']

    def getIssueIdentifier(self):
        """
        Return the string used to match this diagnostic across runs: the
        file name, then the issue context and the issue hash when the
        diagnostic carries them, joined with '+'.
        """
        identifier = self.getFileName() + "+"
        if 'issue_context' in self._data:
            identifier += self._data['issue_context'] + "+"
        if 'issue_hash_content_of_line_in_context' in self._data:
            identifier += str(
                self._data['issue_hash_content_of_line_in_context'])
        return identifier

    def getReport(self):
        """
        Return the HTML report location joined onto the run's path, or a
        single space when the diagnostic has no HTML report.
        """
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        """
        Return a one-line description of the form
        'file[#context]:line:col, category: description'.
        """
        if 'issue_context' in self._data:
            funcnamePostfix = "#" + self._data['issue_context']
        else:
            funcnamePostfix = ""
        return '%s%s:%d:%d, %s: %s' % (self.getFileName(),
                                       funcnamePostfix,
                                       self.getLine(),
                                       self.getColumn(), self.getCategory(),
                                       self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        """Return the underlying diagnostic dictionary."""
        return self._data
118
119
class AnalysisReport:
    """
    One report: the run it belongs to, its table of files, and the
    diagnostics attached to it.
    """
    def __init__(self, run, files):
        # The diagnostics list always starts out empty.
        self.diagnostics = []
        self.files = files
        self.run = run
125
126
class AnalysisRun:
    """
    The accumulated results of one analysis run.

    info - object providing the run's 'path' and 'root' attributes; it is
           also kept on the instance as 'info'
    """
    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None
        self.stats = []

    def getClangVersion(self):
        """Return the clang version read so far, or None if none was seen."""
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
        """
        Read the plist file at path p and add its contents to this run.

        - p is the path of the plist file
        - deleteEmpty specifies whether a plist with an empty file table
          should be removed from disk

        Statistics and the clang version are recorded even when the plist
        holds no reports.
        """
        # plistlib.readPlist no longer exists in newer Python releases, so
        # prefer plistlib.load and keep readPlist only as a fallback for
        # interpreters that lack it.
        if hasattr(plistlib, 'load'):
            with open(p, 'rb') as plistFile:
                data = plistlib.load(plistFile)
        else:
            data = plistlib.readPlist(p)

        if 'statistics' in data:
            # The statistics are stored as a JSON string inside the plist.
            self.stats.append(json.loads(data['statistics']))
            data.pop('statistics')

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        # Extract the HTML reports, if they exist. The first diagnostic
        # decides for the whole file; an empty diagnostics list simply has
        # no HTML reports.
        rawDiagnostics = data['diagnostics']
        if rawDiagnostics and 'HTMLDiagnostics_files' in rawDiagnostics[0]:
            htmlFiles = []
            for d in rawDiagnostics:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(rawDiagnostics)

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(data.pop('diagnostics'), htmlFiles)]

        # Every known key has been consumed; anything left is unexpected.
        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)
182
183
def loadResults(path, opts, root="", deleteEmpty=True):
    """
    Backwards compatibility API.

    Bundles path, root and opts.verboseLog into a SingleRunInfo and hands it
    to loadResultsFromSingleRun together with deleteEmpty.
    """
    runInfo = SingleRunInfo(path, root, opts.verboseLog)
    return loadResultsFromSingleRun(runInfo, deleteEmpty)
190
191
def loadResultsFromSingleRun(info, deleteEmpty=True):
    """
    Load the results of the analysis from a given output location.

    - info is the SingleRunInfo object
    - deleteEmpty specifies if the empty plist files should be deleted

    When info.path is a regular file it is read directly; otherwise the
    location is walked recursively and every file whose name ends in
    'plist' is read. Returns the resulting AnalysisRun.
    """
    run = AnalysisRun(info)
    location = info.path

    if os.path.isfile(location):
        run.readSingleFile(location, deleteEmpty)
        return run

    for dirpath, _, filenames in os.walk(location):
        for name in filenames:
            if name.endswith('plist'):
                run.readSingleFile(os.path.join(dirpath, name), deleteEmpty)

    return run
213
214
def cmpAnalysisDiagnostic(d):
    """
    Sort key for a diagnostic: the value of its getIssueIdentifier().
    """
    issueId = d.getIssueIdentifier()
    return issueId
217
218
def compareResults(A, B, opts):
    """
    compareResults - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    The result is the relation as a list of pairs (a, b) where each of a and
    b is either None or a matching element from the respective run. Matched
    pairs come first, followed by (a, None) for diagnostics found only in A
    and then (None, b) for diagnostics found only in B.

    When one of the histogram options is set on opts, the path length
    differences gathered from the matched pairs are plotted with matplotlib
    before returning.
    """
    sortedA = sorted(A.diagnostics, key=cmpAnalysisDiagnostic)
    sortedB = sorted(B.diagnostics, key=cmpAnalysisDiagnostic)

    matched = []
    onlyA = []
    onlyB = []
    # One entry per matched pair whose path lengths differ; the meaning of
    # the entry (ratio, log ratio or difference) depends on the option set.
    pathDeltas = []

    # Quickly eliminate equal elements by walking both sorted lists from
    # their largest identifier downwards.
    idxA = len(sortedA) - 1
    idxB = len(sortedB) - 1
    while idxA >= 0 and idxB >= 0:
        a = sortedA[idxA]
        b = sortedB[idxB]
        idA = a.getIssueIdentifier()
        idB = b.getIssueIdentifier()

        if idA == idB:
            lenA = a.getPathLength()
            lenB = b.getPathLength()
            if lenA != lenB:
                if opts.relative_path_histogram:
                    pathDeltas.append(float(lenA) / lenB)
                elif opts.relative_log_path_histogram:
                    pathDeltas.append(log(float(lenA) / lenB))
                elif opts.absolute_path_histogram:
                    pathDeltas.append(lenA - lenB)
            matched.append((a, b))
            idxA -= 1
            idxB -= 1
        elif idA > idB:
            # Nothing left in B can match a; b stays a candidate.
            onlyA.append(a)
            idxA -= 1
        else:
            # Nothing left in A can match b; a stays a candidate.
            onlyB.append(b)
            idxB -= 1

    # Whatever was not visited on either side has no counterpart.
    onlyA.extend(sortedA[:idxA + 1])
    onlyB.extend(sortedB[:idxB + 1])

    # FIXME: Add fuzzy matching. One simple and possible effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    result = matched
    result.extend((a, None) for a in onlyA)
    result.extend((None, b) for b in onlyB)

    wantsHistogram = (opts.relative_log_path_histogram or
                      opts.relative_path_histogram or
                      opts.absolute_path_histogram)
    if wantsHistogram:
        # Imported lazily so matplotlib is only required for histograms.
        from matplotlib import pyplot
        pyplot.hist(pathDeltas, bins=100)
        pyplot.show()

    return result
283
def deriveStats(results):
    """
    Summarize the per-file statistics collected in results.stats.

    results.stats is a list of dictionaries mapping a statistic name to a
    numeric value. Returns a dictionary mapping each statistic name (as a
    str) to a dictionary with the keys "max", "min", "mean", "median" and
    "total" computed over all values seen for that name. The median of an
    even number of values is the upper of the two middle values.
    """
    # Assume all keys are the same in each statistics bucket.
    combined_data = defaultdict(list)
    for stat in results.stats:
        # items() rather than iteritems() so this runs on Python 2 and 3.
        for key, value in stat.items():
            combined_data[key].append(value)
    combined_stats = {}
    for key, values in combined_data.items():
        total = sum(values)
        combined_stats[str(key)] = {
            "max": max(values),
            "min": min(values),
            # Force float division so integer statistics are not truncated.
            "mean": float(total) / len(values),
            # Floor division keeps the index an int under true division.
            "median": sorted(values)[len(values) // 2],
            "total": total
        }
    return combined_stats
300
301
def compareStats(resultsA, resultsB):
    """
    Print a side-by-side comparison of the statistics of two runs.

    For every statistic found in run A, prints its name followed by one
    "<summary> <A> -> <B>" line per summary value. A value missing from run
    B is shown as "N/A" instead of aborting the comparison.

    When standard output is a terminal and the platform is not Windows, a
    line is shown in green if (B - A) / B is below -0.2 and in red if it is
    above 0.2; no highlighting is applied when B is zero.
    """
    statsA = deriveStats(resultsA)
    statsB = deriveStats(resultsB)
    for key in sorted(statsA.keys()):
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3.
        print(key)
        bucketB = statsB.get(key, {})
        for kkey in statsA[key]:
            valA = float(statsA[key][kkey])
            if kkey not in bucketB:
                # Run B did not report this statistic.
                print("\t %s %.3f -> N/A" % (kkey, valA))
                continue
            valB = float(bucketB[kkey])
            report = "%.3f -> %.3f" % (valA, valB)
            # Only apply highlighting when writing to TTY and it's not Windows
            if sys.stdout.isatty() and os.name != 'nt':
                if valB != 0:
                    ratio = (valB - valA) / valB
                    if ratio < -0.2:
                        report = Colors.GREEN + report + Colors.CLEAR
                    elif ratio > 0.2:
                        report = Colors.RED + report + Colors.CLEAR
            print("\t %s %s" % (kkey, report))
321
def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True,
                             Stdout=sys.stdout):
    """
    Load two analysis runs, report which diagnostics were added or removed
    going from A to B, and print summary totals.

    - dirA, dirB are the locations of the two runs' results
    - opts supplies rootA, rootB, verboseLog, stats_only and the histogram
      options
    - deleteEmpty specifies if the empty plist files should be deleted
    - Stdout is the stream the ADDED/REMOVED/TOTAL lines are written to

    Statistics are compared first whenever run A has any. If opts.stats_only
    is set the function then returns None; otherwise it returns the tuple
    (number of differences, diagnostics in A, diagnostics in B).
    """
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)
    if resultsA.stats:
        compareStats(resultsA, resultsB)
    if opts.stats_only:
        return

    # Open the verbose log, if given. Only text is written to it, so it is
    # opened in text mode.
    if opts.verboseLog:
        auxLog = open(opts.verboseLog, "w")
    else:
        auxLog = None

    # The try/finally makes sure the log is closed even when comparing or
    # writing fails part-way through.
    try:
        diff = compareResults(resultsA, resultsB, opts)
        foundDiffs = 0
        totalAdded = 0
        totalRemoved = 0
        for a, b in diff:
            if a is None:
                Stdout.write("ADDED: %r\n" % b.getReadableName())
                foundDiffs += 1
                totalAdded += 1
                if auxLog:
                    auxLog.write("('ADDED', %r, %r)\n" % (b.getReadableName(),
                                                          b.getReport()))
            elif b is None:
                Stdout.write("REMOVED: %r\n" % a.getReadableName())
                foundDiffs += 1
                totalRemoved += 1
                if auxLog:
                    auxLog.write("('REMOVED', %r, %r)\n" % (a.getReadableName(),
                                                            a.getReport()))
            # Pairs present in both runs are not reported.

        TotalReports = len(resultsB.diagnostics)
        Stdout.write("TOTAL REPORTS: %r\n" % TotalReports)
        Stdout.write("TOTAL ADDED: %r\n" % totalAdded)
        Stdout.write("TOTAL REMOVED: %r\n" % totalRemoved)
        if auxLog:
            auxLog.write("('TOTAL NEW REPORTS', %r)\n" % TotalReports)
            auxLog.write("('TOTAL DIFFERENCES', %r)\n" % foundDiffs)
    finally:
        if auxLog:
            auxLog.close()

    return foundDiffs, len(resultsA.diagnostics), len(resultsB.diagnostics)
371
def generate_option_parser():
    """
    Build the command line parser for this tool.

    Returns an OptionParser that accepts --rootA, --rootB, --verbose-log,
    the three path-difference histogram switches and --stats-only; the two
    result directories are taken as positional arguments.
    """
    parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    parser.add_option("", "--rootA", dest="rootA",
                      help="Prefix to ignore on source files for directory A",
                      action="store", type=str, default="")
    parser.add_option("", "--rootB", dest="rootB",
                      help="Prefix to ignore on source files for directory B",
                      action="store", type=str, default="")
    # Multi-line help texts use adjacent string literals: a backslash
    # continuation inside a literal would embed the next line's indentation
    # in the help message.
    parser.add_option("", "--verbose-log", dest="verboseLog",
                      help="Write additional information to LOG "
                           "[default=None]",
                      action="store", type=str, default=None,
                      metavar="LOG")
    parser.add_option("--relative-path-differences-histogram",
                      action="store_true", dest="relative_path_histogram",
                      default=False,
                      help="Show histogram of relative paths differences. "
                           "Requires matplotlib")
    parser.add_option("--relative-log-path-differences-histogram",
                      action="store_true", dest="relative_log_path_histogram",
                      default=False,
                      help="Show histogram of log relative paths differences. "
                           "Requires matplotlib")
    parser.add_option("--absolute-path-differences-histogram",
                      action="store_true", dest="absolute_path_histogram",
                      default=False,
                      help="Show histogram of absolute paths differences. "
                           "Requires matplotlib")
    parser.add_option("--stats-only", action="store_true", dest="stats_only",
                      default=False, help="Only show statistics on reports")
    return parser
403
404
def main():
    """
    Command line entry point: parse the options, require exactly two
    positional arguments, and print the diff between the two runs.
    """
    optionParser = generate_option_parser()
    opts, positional = optionParser.parse_args()

    # Exactly two locations (run A and run B) must be given.
    if len(positional) != 2:
        optionParser.error("invalid number of arguments")

    firstDir, secondDir = positional
    dumpScanBuildResultsDiff(firstDir, secondDir, opts)
415
416
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()