utils/analyzer/CmpRuns.py

   1 #!/usr/bin/env python
   2
   3 """
   4 CmpRuns - A simple tool for comparing two static analyzer runs to determine
   5 which reports have been added, removed, or changed.
   6
   7 This is designed to support automated testing using the static analyzer, from
   8 two perspectives:
   9   1. To monitor changes in the static analyzer's reports on real code bases,
  10      for regression testing.
  11
  12   2. For use by end users who want to integrate regular static analyzer testing
  13      into a buildbot like environment.
  14
  15 Usage:
  16
  17     # Load the results of both runs, to obtain lists of the corresponding
  18     # AnalysisDiagnostic objects.
  19     #
  20     resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
  21     resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)
  22
  23     # Generate a relation from diagnostics in run A to diagnostics in run B
  24     # to obtain a list of pairs (a, b).
  25     diff = compareResults(resultsA, resultsB, opts)
  26
  27 """
  28
  29 from collections import defaultdict
  30
  31 from math import log
  32 from optparse import OptionParser
  33 import json
  34 import os
  35 import plistlib
  36 import re
  37 import sys
  38
# Matches the literal text "Statistics: " followed by a brace-delimited blob,
# which is captured as group 1. DOTALL lets the blob span several lines and
# the greedy ".+" extends the capture up to the last closing brace.
# NOTE(review): not referenced by any code visible in this file - confirm
# whether external users still rely on it.
STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)
  40
  41 class Colors:
  42     """
  43     Color for terminal highlight.
  44     """
  45     RED = '\x1b[2;30;41m'
  46     GREEN = '\x1b[6;30;42m'
  47     CLEAR = '\x1b[0m'
  48
  49 # Information about analysis run:
  50 # path - the analysis output directory
  51 # root - the name of the root directory, which will be disregarded when
  52 # determining the source file name
  53 class SingleRunInfo:
  54     def __init__(self, path, root="", verboseLog=None):
  55         self.path = path
  56         self.root = root.rstrip("/\\")
  57         self.verboseLog = verboseLog
  58
  59
  60 class AnalysisDiagnostic:
  61     def __init__(self, data, report, htmlReport):
  62         self._data = data
  63         self._loc = self._data['location']
  64         self._report = report
  65         self._htmlReport = htmlReport
  66         self._reportSize = len(self._data['path'])
  67
  68     def getFileName(self):
  69         root = self._report.run.root
  70         fileName = self._report.files[self._loc['file']]
  71         if fileName.startswith(root) and len(root) > 0:
  72             return fileName[len(root) + 1:]
  73         return fileName
  74
  75     def getLine(self):
  76         return self._loc['line']
  77
  78     def getColumn(self):
  79         return self._loc['col']
  80
  81     def getPathLength(self):
  82         return self._reportSize
  83
  84     def getCategory(self):
  85         return self._data['category']
  86
  87     def getDescription(self):
  88         return self._data['description']
  89
  90     def getIssueIdentifier(self):
  91         id = self.getFileName() + "+"
  92         if 'issue_context' in self._data:
  93             id += self._data['issue_context'] + "+"
  94         if 'issue_hash_content_of_line_in_context' in self._data:
  95             id += str(self._data['issue_hash_content_of_line_in_context'])
  96         return id
  97
  98     def getReport(self):
  99         if self._htmlReport is None:
 100             return " "
 101         return os.path.join(self._report.run.path, self._htmlReport)
 102
 103     def getReadableName(self):
 104         if 'issue_context' in self._data:
 105             funcnamePostfix = "#" + self._data['issue_context']
 106         else:
 107             funcnamePostfix = ""
 108         return '%s%s:%d:%d, %s: %s' % (self.getFileName(),
 109                                        funcnamePostfix,
 110                                        self.getLine(),
 111                                        self.getColumn(), self.getCategory(),
 112                                        self.getDescription())
 113
 114     # Note, the data format is not an API and may change from one analyzer
 115     # version to another.
 116     def getRawData(self):
 117         return self._data
 118
 119
 120 class AnalysisReport:
 121     def __init__(self, run, files):
 122         self.run = run
 123         self.files = files
 124         self.diagnostics = []
 125
 126
 127 class AnalysisRun:
 128     def __init__(self, info):
 129         self.path = info.path
 130         self.root = info.root
 131         self.info = info
 132         self.reports = []
 133         # Cumulative list of all diagnostics from all the reports.
 134         self.diagnostics = []
 135         self.clang_version = None
 136         self.stats = []
 137
 138     def getClangVersion(self):
 139         return self.clang_version
 140
 141     def readSingleFile(self, p, deleteEmpty):
 142         data = plistlib.readPlist(p)
 143         if 'statistics' in data:
 144             self.stats.append(json.loads(data['statistics']))
 145             data.pop('statistics')
 146
 147         # We want to retrieve the clang version even if there are no
 148         # reports. Assume that all reports were created using the same
 149         # clang version (this is always true and is more efficient).
 150         if 'clang_version' in data:
 151             if self.clang_version is None:
 152                 self.clang_version = data.pop('clang_version')
 153             else:
 154                 data.pop('clang_version')
 155
 156         # Ignore/delete empty reports.
 157         if not data['files']:
 158             if deleteEmpty:
 159                 os.remove(p)
 160             return
 161
 162         # Extract the HTML reports, if they exists.
 163         if 'HTMLDiagnostics_files' in data['diagnostics'][0]:
 164             htmlFiles = []
 165             for d in data['diagnostics']:
 166                 # FIXME: Why is this named files, when does it have multiple
 167                 # files?
 168                 assert len(d['HTMLDiagnostics_files']) == 1
 169                 htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
 170         else:
 171             htmlFiles = [None] * len(data['diagnostics'])
 172
 173         report = AnalysisReport(self, data.pop('files'))
 174         diagnostics = [AnalysisDiagnostic(d, report, h)
 175                        for d, h in zip(data.pop('diagnostics'), htmlFiles)]
 176
 177         assert not data
 178
 179         report.diagnostics.extend(diagnostics)
 180         self.reports.append(report)
 181         self.diagnostics.extend(diagnostics)
 182
 183
 184 def loadResults(path, opts, root="", deleteEmpty=True):
 185     """
 186     Backwards compatibility API.
 187     """
 188     return loadResultsFromSingleRun(SingleRunInfo(path, root, opts.verboseLog),
 189                                     deleteEmpty)
 190
 191
 192 def loadResultsFromSingleRun(info, deleteEmpty=True):
 193     """
 194     # Load results of the analyzes from a given output folder.
 195     # - info is the SingleRunInfo object
 196     # - deleteEmpty specifies if the empty plist files should be deleted
 197
 198     """
 199     path = info.path
 200     run = AnalysisRun(info)
 201
 202     if os.path.isfile(path):
 203         run.readSingleFile(path, deleteEmpty)
 204     else:
 205         for (dirpath, dirnames, filenames) in os.walk(path):
 206             for f in filenames:
 207                 if (not f.endswith('plist')):
 208                     continue
 209                 p = os.path.join(dirpath, f)
 210                 run.readSingleFile(p, deleteEmpty)
 211
 212     return run
 213
 214
 215 def cmpAnalysisDiagnostic(d):
 216     return d.getIssueIdentifier()
 217
 218
 219 def compareResults(A, B, opts):
 220     """
 221     compareResults - Generate a relation from diagnostics in run A to
 222     diagnostics in run B.
 223
 224     The result is the relation as a list of triples (a, b) where
 225     each element {a,b} is None or a matching element from the respective run
 226     """
 227
 228     res = []
 229
 230     # Map size_before -> size_after
 231     path_difference_data = []
 232
 233     # Quickly eliminate equal elements.
 234     neqA = []
 235     neqB = []
 236     eltsA = list(A.diagnostics)
 237     eltsB = list(B.diagnostics)
 238     eltsA.sort(key=cmpAnalysisDiagnostic)
 239     eltsB.sort(key=cmpAnalysisDiagnostic)
 240     while eltsA and eltsB:
 241         a = eltsA.pop()
 242         b = eltsB.pop()
 243         if (a.getIssueIdentifier() == b.getIssueIdentifier()):
 244             if a.getPathLength() != b.getPathLength():
 245                 if opts.relative_path_histogram:
 246                     path_difference_data.append(
 247                         float(a.getPathLength()) / b.getPathLength())
 248                 elif opts.relative_log_path_histogram:
 249                     path_difference_data.append(
 250                         log(float(a.getPathLength()) / b.getPathLength()))
 251                 elif opts.absolute_path_histogram:
 252                     path_difference_data.append(
 253                         a.getPathLength() - b.getPathLength())
 254
 255             res.append((a, b))
 256         elif a.getIssueIdentifier() > b.getIssueIdentifier():
 257             eltsB.append(b)
 258             neqA.append(a)
 259         else:
 260             eltsA.append(a)
 261             neqB.append(b)
 262     neqA.extend(eltsA)
 263     neqB.extend(eltsB)
 264
 265     # FIXME: Add fuzzy matching. One simple and possible effective idea would
 266     # be to bin the diagnostics, print them in a normalized form (based solely
 267     # on the structure of the diagnostic), compute the diff, then use that as
 268     # the basis for matching. This has the nice property that we don't depend
 269     # in any way on the diagnostic format.
 270
 271     for a in neqA:
 272         res.append((a, None))
 273     for b in neqB:
 274         res.append((None, b))
 275
 276     if opts.relative_log_path_histogram or opts.relative_path_histogram or \
 277             opts.absolute_path_histogram:
 278         from matplotlib import pyplot
 279         pyplot.hist(path_difference_data, bins=100)
 280         pyplot.show()
 281
 282     return res
 283
 284 def deriveStats(results):
 285     # Assume all keys are the same in each statistics bucket.
 286     combined_data = defaultdict(list)
 287     for stat in results.stats:
 288         for key, value in stat.iteritems():
 289             combined_data[key].append(value)
 290     combined_stats = {}
 291     for key, values in combined_data.iteritems():
 292         combined_stats[str(key)] = {
 293             "max": max(values),
 294             "min": min(values),
 295             "mean": sum(values) / len(values),
 296             "median": sorted(values)[len(values) / 2],
 297             "total": sum(values)
 298         }
 299     return combined_stats
 300
 301
 302 def compareStats(resultsA, resultsB):
 303     statsA = deriveStats(resultsA)
 304     statsB = deriveStats(resultsB)
 305     keys = sorted(statsA.keys())
 306     for key in keys:
 307         print key
 308         for kkey in statsA[key]:
 309             valA = float(statsA[key][kkey])
 310             valB = float(statsB[key][kkey])
 311             report = "%.3f -> %.3f" % (valA, valB)
 312             # Only apply highlighting when writing to TTY and it's not Windows
 313             if sys.stdout.isatty() and os.name != 'nt':
 314                 if valB != 0:
 315                   ratio = (valB - valA) / valB
 316                   if ratio < -0.2:
 317                       report = Colors.GREEN + report + Colors.CLEAR
 318                   elif ratio > 0.2:
 319                       report = Colors.RED + report + Colors.CLEAR
 320             print "\t %s %s" % (kkey, report)
 321
 322 def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True,
 323                              Stdout=sys.stdout):
 324     # Load the run results.
 325     resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
 326     resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)
 327     if resultsA.stats:
 328         compareStats(resultsA, resultsB)
 329     if opts.stats_only:
 330         return
 331
 332     # Open the verbose log, if given.
 333     if opts.verboseLog:
 334         auxLog = open(opts.verboseLog, "wb")
 335     else:
 336         auxLog = None
 337
 338     diff = compareResults(resultsA, resultsB, opts)
 339     foundDiffs = 0
 340     totalAdded = 0
 341     totalRemoved = 0
 342     for res in diff:
 343         a, b = res
 344         if a is None:
 345             Stdout.write("ADDED: %r\n" % b.getReadableName())
 346             foundDiffs += 1
 347             totalAdded += 1
 348             if auxLog:
 349                 auxLog.write("('ADDED', %r, %r)\n" % (b.getReadableName(),
 350                                                       b.getReport()))
 351         elif b is None:
 352             Stdout.write("REMOVED: %r\n" % a.getReadableName())
 353             foundDiffs += 1
 354             totalRemoved += 1
 355             if auxLog:
 356                 auxLog.write("('REMOVED', %r, %r)\n" % (a.getReadableName(),
 357                                                         a.getReport()))
 358         else:
 359             pass
 360
 361     TotalReports = len(resultsB.diagnostics)
 362     Stdout.write("TOTAL REPORTS: %r\n" % TotalReports)
 363     Stdout.write("TOTAL ADDED: %r\n" % totalAdded)
 364     Stdout.write("TOTAL REMOVED: %r\n" % totalRemoved)
 365     if auxLog:
 366         auxLog.write("('TOTAL NEW REPORTS', %r)\n" % TotalReports)
 367         auxLog.write("('TOTAL DIFFERENCES', %r)\n" % foundDiffs)
 368         auxLog.close()
 369
 370     return foundDiffs, len(resultsA.diagnostics), len(resultsB.diagnostics)
 371
 372 def generate_option_parser():
 373     parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
 374     parser.add_option("", "--rootA", dest="rootA",
 375                       help="Prefix to ignore on source files for directory A",
 376                       action="store", type=str, default="")
 377     parser.add_option("", "--rootB", dest="rootB",
 378                       help="Prefix to ignore on source files for directory B",
 379                       action="store", type=str, default="")
 380     parser.add_option("", "--verbose-log", dest="verboseLog",
 381                       help="Write additional information to LOG \
 382                            [default=None]",
 383                       action="store", type=str, default=None,
 384                       metavar="LOG")
 385     parser.add_option("--relative-path-differences-histogram",
 386                       action="store_true", dest="relative_path_histogram",
 387                       default=False,
 388                       help="Show histogram of relative paths differences. \
 389                             Requires matplotlib")
 390     parser.add_option("--relative-log-path-differences-histogram",
 391                       action="store_true", dest="relative_log_path_histogram",
 392                       default=False,
 393                       help="Show histogram of log relative paths differences. \
 394                             Requires matplotlib")
 395     parser.add_option("--absolute-path-differences-histogram",
 396                       action="store_true", dest="absolute_path_histogram",
 397                       default=False,
 398                       help="Show histogram of absolute paths differences. \
 399                             Requires matplotlib")
 400     parser.add_option("--stats-only", action="store_true", dest="stats_only",
 401                       default=False, help="Only show statistics on reports")
 402     return parser
 403
 404
 405 def main():
 406     parser = generate_option_parser()
 407     (opts, args) = parser.parse_args()
 408
 409     if len(args) != 2:
 410         parser.error("invalid number of arguments")
 411
 412     dirA, dirB = args
 413
 414     dumpScanBuildResultsDiff(dirA, dirB, opts)
 415
 416
# Run the comparison only when the file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == '__main__':
    main()