# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

16 """Import processor that dump stats about the input (and doesn't import)."""


import stat

from fastimport import (
    commands,
    processor,
    reftracker,
    )
from fastimport.helpers import (
    invert_dict,
    invert_dictset,
    )


class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """
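    #
    # A minimal usage sketch (assuming the input is a fast-import dump file;
    # the exact parser entry point may vary between fastimport versions):
    #
    #     from fastimport import parser
    #     p = parser.ImportParser(open('project.fi', 'rb'))
    #     InfoProcessor(verbose=1).process(p.iter_commands)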

    def __init__(self, params=None, verbose=0, outf=None):
        processor.ImportProcessor.__init__(self, params, verbose,
            outf=outf)

    def pre_process(self):
        # Init statistics
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking
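        # ('new' = defined but not yet referenced, 'used' = referenced exactly
        # once, 'unknown' = referenced before being defined, 'unmarked' =
        # defined without a mark. Marks referenced more than once are tracked
        # in blob_ref_counts instead.)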
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        # Dump statistics
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c, self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c, self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts['commit']:
            p_items = []
            for i in xrange(0, self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", flags.iteritems(), _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group("Head analysis", heads.iteritems(), None,
                                    _iterable_as_config_list)
            # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
            self._dump_stats_group("Merges", self.merges.iteritems(), None)
            # We only show the rename old paths and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.iteritems(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.iteritems(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts['blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group("Blob usage tracking",
                self.blobs.iteritems(), len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts['reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.iteritems())

    def _dump_stats_group(self, title, items, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so that other
        processors can load the information if they want to.

        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
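        # In verbose mode the output is ConfigObj-style, e.g.:
        #     [Command counts]
        #     commit = 120
        # Otherwise it is a tab-separated summary, e.g.:
        #     Command counts:
        #         120     commit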
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we deliberately do NOT touch blob_ref_counts here -
            # a multiply-referenced mark keeps its reference count.
            try:
                self.blobs['used'].remove(cmd.id)
            except KeyError:
                pass

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.mode & 0111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    if fc.dataref[0] == ':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        if parent_count in self.parent_counts:
            self.parent_counts[parent_count] += 1
        else:
            self.parent_counts[parent_count] = 1
            if parent_count > self.max_parent_count:
                self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            #self.merges.setdefault(cmd.ref, set()).update(cmd.merges)
            for merge in cmd.merges:
                if merge in self.merges:
                    self.merges[merge] += 1
                else:
                    self.merges[merge] = 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith('refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.reftracker.track_heads_for_ref(
                    cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
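        # Reference-count state machine: a mark moves from 'new' (defined,
        # never referenced) to 'used' (referenced once) and then into
        # blob_ref_counts (referenced two or more times). A reference to a
        # mark we have never seen defined is recorded as 'unknown'.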
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)


def _found(b):
    """Format a found boolean as a string."""
    return ['no', 'found'][b]


def _iterable_as_config_list(s):
    """Format an iterable as a sequence of comma-separated strings.

    To match what ConfigObj expects, a single item list has a trailing comma.
    """
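    # e.g. ['x'] -> "x,"  while  ['b', 'a'] -> "a, b"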
    items = sorted(s)
    if len(items) == 1:
        return "%s," % (items[0],)
    else:
        return ", ".join(items)