1 # Copyright (C) 2008 Canonical Ltd
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Import processor that dumps stats about the input (and doesn't import)."""
19 from fastimport import (
24 from fastimport.helpers import (
31 class InfoProcessor(processor.ImportProcessor):
32 """An import processor that dumps statistics about the input.
34 No changes to the current repository are made.
36 As well as providing useful information about an import
37 stream before importing it, this processor is useful for
38 benchmarking the speed at which data can be extracted from
def __init__(self, params=None, verbose=0, outf=None):
    """Create the processor; all setup is delegated to ImportProcessor.

    :param params: forwarded unchanged to the base class
    :param verbose: forwarded unchanged to the base class
    :param outf: forwarded unchanged to the base class
    """
    # NOTE(review): the tail of this call is missing from the reviewed
    # text. ``outf`` is the only argument not yet forwarded, so it is
    # assumed to be passed here - confirm against the original.
    base_init = processor.ImportProcessor.__init__
    base_init(self, params, verbose, outf=outf)
def pre_process(self):
    """Reset every statistic before any command is handled."""
    # Tallies per command and per file command, seeded with zero for
    # every known name.
    self.cmd_counts = dict.fromkeys(commands.COMMAND_NAMES, 0)
    self.file_cmd_counts = dict.fromkeys(commands.FILE_COMMAND_NAMES, 0)

    # Commit-level observations.
    self.parent_counts = {}
    self.max_parent_count = 0
    self.committers = set()
    self.separate_authors_found = False
    self.symlinks_found = False
    self.executables_found = False
    self.sha_blob_references = False
    self.lightweight_tags = 0

    # Blob usage tracking: one independent set of ids per usage class.
    self.blobs = dict(
        (usage, set()) for usage in ['new', 'used', 'unknown', 'unmarked'])
    self.blob_ref_counts = {}

    # Head tracking
    self.reftracker = reftracker.RefTracker()
    # Stuff to cache: a map from mark to # of times that mark is merged
    self.merges = {}
    # Stuff to cache: these are maps from mark to sets
    self.rename_old_paths = {}
    self.copy_source_paths = {}
def post_process(self):
    """Write every collected statistic through ``_dump_stats_group``.

    Command counts and file command counts are always written. The
    commit, blob and reset groups are written only when at least one
    command of that kind was counted.
    """
    # Dump statistics
    cmd_names = commands.COMMAND_NAMES
    fc_names = commands.FILE_COMMAND_NAMES
    self._dump_stats_group("Command counts",
        [(c, self.cmd_counts[c]) for c in cmd_names], str)
    self._dump_stats_group("File command counts",
        [(c, self.file_cmd_counts[c]) for c in fc_names], str)

    # Commit stats
    if self.cmd_counts['commit']:
        # One "parents-N" row per parent count that actually occurred,
        # in ascending order of N.
        p_items = [("parents-%d" % i, self.parent_counts[i])
                   for i in range(self.max_parent_count + 1)
                   if i in self.parent_counts]
        p_items.append(('total revisions merged', len(self.merges)))
        flags = {
            'separate authors found': self.separate_authors_found,
            'executables': self.executables_found,
            'symlinks': self.symlinks_found,
            'blobs referenced by SHA': self.sha_blob_references,
            }
        self._dump_stats_group("Parent counts", p_items, str)
        self._dump_stats_group("Commit analysis", flags.items(), _found)
        heads = invert_dictset(self.reftracker.heads)
        self._dump_stats_group("Head analysis", heads.items(), None,
            _iterable_as_config_list)
        # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
        self._dump_stats_group("Merges", self.merges.items(), None)
        # We only show the rename old path and copy source paths when -vv
        # (verbose=2) is specified. The output here for mysql's data can't
        # be parsed currently so this bit of code needs more work anyhow ..
        if self.verbose >= 2:
            self._dump_stats_group("Rename old paths",
                self.rename_old_paths.items(), len,
                _iterable_as_config_list)
            self._dump_stats_group("Copy source paths",
                self.copy_source_paths.items(), len,
                _iterable_as_config_list)

    # Blob stats
    if self.cmd_counts['blob']:
        # In verbose mode, don't list every blob used. The 'used' set is
        # only left out of the report; self.blobs itself is not modified,
        # so reporting has no effect on the collected state.
        blob_groups = [(usage, ids) for usage, ids in self.blobs.items()
                       if not (self.verbose and usage == 'used')]
        self._dump_stats_group("Blob usage tracking",
            blob_groups, len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            # NOTE(review): one line is missing from the reviewed text at
            # this point; ordering the rows by reference count is assumed
            # - confirm against the original.
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

    # Other stats
    if self.cmd_counts['reset']:
        reset_stats = {
            'lightweight tags': self.lightweight_tags,
            }
        self._dump_stats_group("Reset analysis", reset_stats.items())
138 def _dump_stats_group(self, title, items, normal_formatter=None,
139 verbose_formatter=None):
140 """Dump a statistics group.
142 In verbose mode, do so as a config file so
143 that other processors can load the information if they want to.
144 :param normal_formatter: the callable to apply to the value
145 before displaying it in normal mode
146 :param verbose_formatter: the callable to apply to the value
147 before displaying it in verbose mode
150 self.outf.write("[%s]\n" % (title,))
151 for name, value in items:
152 if verbose_formatter is not None:
153 value = verbose_formatter(value)
154 if type(name) == str:
155 name = name.replace(' ', '-')
156 self.outf.write("%s = %s\n" % (name, value))
157 self.outf.write("\n")
159 self.outf.write("%s:\n" % (title,))
160 for name, value in items:
161 if normal_formatter is not None:
162 value = normal_formatter(value)
163 self.outf.write("\t%s\t%s\n" % (value, name))
def progress_handler(self, cmd):
    """Count one more ProgressCommand."""
    tally = self.cmd_counts
    tally[cmd.name] = tally[cmd.name] + 1
def blob_handler(self, cmd):
    """Process a BlobCommand.

    Counts the command and files the blob id under 'unmarked' or 'new'.
    """
    self.cmd_counts[cmd.name] += 1
    # NOTE(review): the condition line is missing from the reviewed
    # text; "the blob has no mark" is assumed - confirm.
    if cmd.mark is None:
        self.blobs['unmarked'].add(cmd.id)
        return
    self.blobs['new'].add(cmd.id)
    # Marks can be re-used, so a blob that is being redefined must no
    # longer be listed as used. Only the 'used' set is touched here;
    # blob_ref_counts is deliberately left alone.
    try:
        self.blobs['used'].remove(cmd.id)
    except KeyError:
        pass
def checkpoint_handler(self, cmd):
    """Count one more CheckpointCommand."""
    seen = self.cmd_counts[cmd.name]
    self.cmd_counts[cmd.name] = seen + 1
def commit_handler(self, cmd):
    """Process a CommitCommand.

    Updates the command and file-command tallies, the commit feature
    flags, blob reference tracking, the rename/copy source paths, the
    head tracker, the parent-count histogram and the merge counts.
    """
    self.cmd_counts[cmd.name] += 1
    self.committers.add(cmd.committer)
    if cmd.author is not None:
        self.separate_authors_found = True

    # Any of the three execute permission bits.
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    for fc in cmd.iter_files():
        self.file_cmd_counts[fc.name] += 1
        if isinstance(fc, commands.FileModifyCommand):
            # NOTE(review): the condition line is missing from the
            # reviewed text; a test of the execute bits of fc.mode is
            # assumed, matching the mode-based symlink test - confirm.
            if fc.mode & exec_bits:
                self.executables_found = True
            if stat.S_ISLNK(fc.mode):
                self.symlinks_found = True
            if fc.dataref is not None:
                # A dataref starting with ':' is tracked as a blob
                # mark; any other dataref sets the SHA-reference flag.
                if fc.dataref[0] == ':':
                    self._track_blob(fc.dataref)
                else:
                    self.sha_blob_references = True
        elif isinstance(fc, commands.FileRenameCommand):
            self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
        elif isinstance(fc, commands.FileCopyCommand):
            self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

    # Track the heads
    parents = self.reftracker.track_heads(cmd)

    # Track the parent counts
    parent_count = len(parents)
    self.parent_counts[parent_count] = (
        self.parent_counts.get(parent_count, 0) + 1)
    if parent_count > self.max_parent_count:
        self.max_parent_count = parent_count

    # Remember the merges: count how often each one is merged. A commit
    # with no merges (None or empty) contributes nothing.
    if cmd.merges:
        for merge in cmd.merges:
            self.merges[merge] = self.merges.get(merge, 0) + 1
232 def reset_handler(self, cmd):
233 """Process a ResetCommand."""
234 self.cmd_counts[cmd.name] += 1
235 if cmd.ref.startswith('refs/tags/'):
236 self.lightweight_tags += 1
238 if cmd.from_ is not None:
239 self.reftracker.track_heads_for_ref(
def tag_handler(self, cmd):
    """Count one more TagCommand."""
    kind = cmd.name
    self.cmd_counts[kind] = self.cmd_counts[kind] + 1
def feature_handler(self, cmd):
    """Process a FeatureCommand, warning when the feature is not known."""
    self.cmd_counts[cmd.name] += 1
    feature = cmd.feature_name
    if feature in commands.FEATURE_NAMES:
        return
    # NOTE(review): the end of this call is missing from the reviewed
    # text; the single %s is assumed to be filled with the feature name
    # - confirm.
    self.warning("feature %s is not supported - parsing may fail"
        % (feature,))
254 def _track_blob(self, mark):
255 if mark in self.blob_ref_counts:
256 self.blob_ref_counts[mark] += 1
258 elif mark in self.blobs['used']:
259 self.blob_ref_counts[mark] = 2
260 self.blobs['used'].remove(mark)
261 elif mark in self.blobs['new']:
262 self.blobs['used'].add(mark)
263 self.blobs['new'].remove(mark)
265 self.blobs['unknown'].add(mark)
268 """Format a found boolean as a string."""
269 return ['no', 'found'][b]
271 def _iterable_as_config_list(s):
272 """Format an iterable as a sequence of comma-separated strings.
274 To match what ConfigObj expects, a single item list has a trailing comma.
278 return "%s," % (items[0],)
280 return ", ".join(items)