# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

16 """Import processor that dump stats about the input (and doesn't import)."""


import stat

from fastimport import (
    commands,
    processor,
    reftracker,
    )
from fastimport.helpers import (
    invert_dict,
    invert_dictset,
    )


class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """
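    #
    # A minimal usage sketch (assuming the input is a fast-import dump file;
    # the exact parser entry point may vary between fastimport versions):
    #
    #     from fastimport import parser
    #     p = parser.ImportParser(open('project.fi', 'rb'))
    #     InfoProcessor(verbose=1).process(p.iter_commands)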

    def __init__(self, params=None, verbose=0, outf=None):
        processor.ImportProcessor.__init__(self, params, verbose,
            outf=outf)

    def pre_process(self):
        # Init statistics
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking
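        # ('new' = defined but not yet referenced, 'used' = referenced exactly
        # once, 'unknown' = referenced before being defined, 'unmarked' =
        # defined without a mark. Marks referenced more than once are tracked
        # in blob_ref_counts instead.)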
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        # Dump statistics
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c, self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c, self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts['commit']:
            p_items = []
            for i in xrange(0, self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", flags.iteritems(), _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group("Head analysis", heads.iteritems(), None,
                                    _iterable_as_config_list)
            # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
            self._dump_stats_group("Merges", self.merges.iteritems(), None)
            # We only show the rename old paths and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.iteritems(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.iteritems(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts['blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group("Blob usage tracking",
                self.blobs.iteritems(), len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts['reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.iteritems())

    def _dump_stats_group(self, title, items, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so that other
        processors can load the information if they want to.

        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
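        # In verbose mode the output is ConfigObj-style, e.g.:
        #     [Command counts]
        #     commit = 120
        # Otherwise it is a tab-separated summary, e.g.:
        #     Command counts:
        #         120     commit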
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we deliberately do NOT touch blob_ref_counts here -
            # a multiply-referenced mark keeps its reference count.
            try:
                self.blobs['used'].remove(cmd.id)
            except KeyError:
                pass

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.mode & 0111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    if fc.dataref[0] == ':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        if parent_count in self.parent_counts:
            self.parent_counts[parent_count] += 1
        else:
            self.parent_counts[parent_count] = 1
            if parent_count > self.max_parent_count:
                self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            #self.merges.setdefault(cmd.ref, set()).update(cmd.merges)
            for merge in cmd.merges:
                if merge in self.merges:
                    self.merges[merge] += 1
                else:
                    self.merges[merge] = 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith('refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.reftracker.track_heads_for_ref(
                    cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
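        # Reference-count state machine: a mark moves from 'new' (defined,
        # never referenced) to 'used' (referenced once) and then into
        # blob_ref_counts (referenced two or more times). A reference to a
        # mark we have never seen defined is recorded as 'unknown'.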
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)


def _found(b):
    """Format a found boolean as a string."""
    return ['no', 'found'][b]


def _iterable_as_config_list(s):
    """Format an iterable as a sequence of comma-separated strings.

    To match what ConfigObj expects, a single item list has a trailing comma.
    """
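    # e.g. ['x'] -> "x,"  while  ['b', 'a'] -> "a, b"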
    items = sorted(s)
    if len(items) == 1:
        return "%s," % (items[0],)
    else:
        return ", ".join(items)