Mercurial > diff-colorize
comparison diff-colorize.py @ 19:b709258a2fc2
Added highlighting differences between consecutive old and new lines.
author | Peter Hosey <hg@boredzo.org> |
---|---|
date | Wed, 05 Jan 2011 07:53:57 -0800 |
parents | 83d58ccc70bf |
children | b4caea436f4d |
comparison
equal
deleted
inserted
replaced
18:83d58ccc70bf | 19:b709258a2fc2 |
---|---|
64 | 64 |
65 for objects in zip_pad(*sequences): | 65 for objects in zip_pad(*sequences): |
66 for obj in objects: | 66 for obj in objects: |
67 if obj is not None: | 67 if obj is not None: |
68 yield obj | 68 yield obj |
69 | |
70 class Substring(object): | |
71 def __init__(self, a, a_start, a_stop, b, b_start, b_stop): | |
72 self.a = a | |
73 self.a_start = a_start | |
74 self.a_stop = a_stop | |
75 self.b = b | |
76 self.b_start = b_start | |
77 self.b_stop = b_stop | |
78 | |
79 def before_a_substring(self): | |
80 return self.a[:self.a_start] | |
81 def before_b_substring(self): | |
82 return self.b[:self.b_start] | |
83 def substring(self): | |
84 return ''.join(self.a[self.a_start:self.a_stop]) | |
85 a_substring = substring | |
86 b_substring = substring | |
87 def after_a_substring(self): | |
88 return self.a[self.a_stop:] | |
89 def after_b_substring(self): | |
90 return self.b[self.b_stop:] | |
91 | |
92 def __hash__(self): | |
93 return hash(self.substring()) | |
94 def __cmp__(self, other): | |
95 return cmp(self.a_start, other.a_start) | |
96 def __eq__(self, other): | |
97 return self.substring() == other.substring() | |
98 def __str__(self): | |
99 return self.substring() | |
100 def __repr__(self): | |
101 return 'Substring(%r)' % (self.substring(),) | |
102 return 'Substring(%r from %r, %r, %r, %r, %r, %r)' % ( | |
103 self.substring(), | |
104 self.a, self.a_start, self.a_stop, | |
105 self.b, self.b_start, self.b_stop, | |
106 ) | |
107 | |
108 def longest_common_substring(a, b): | |
109 """Returns a set of common substrings between a and b, which can be any finite indexable sliceable sequences. Substrings are returned as Substring objects. | |
110 | |
111 Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python | |
112 """ | |
113 a_len = len(a) | |
114 b_len = len(b) | |
115 lengths = [[0] * (b_len + 1) for i in xrange(a_len + 1)] | |
116 substrings = set() | |
117 greatest_length = current_run_length = 0 | |
118 for a_idx in xrange(a_len): | |
119 for b_idx in xrange(b_len): | |
120 if a[a_idx] == b[b_idx]: | |
121 current_run_length = lengths[a_idx][b_idx] + 1 | |
122 lengths[a_idx+1][b_idx+1] = current_run_length | |
123 if current_run_length > greatest_length: | |
124 greatest_length = current_run_length | |
125 substrings.clear() | |
126 if current_run_length == greatest_length: | |
127 # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1]) | |
128 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) | |
129 else: | |
130 if current_run_length > 0: | |
131 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) | |
132 return substrings | |
133 | |
134 def common_and_distinct_substrings(a, b): | |
135 "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens." | |
136 def tokenize(a): | |
137 "Each token is an identifier, a number, or a single character." | |
138 import re | |
139 # Identifier, binary number, hex number, decimal number, operator, other punctuation. | |
140 token_exp = re.compile('[_a-zA-Z][_a-zA-Z0-9]+:?|0b[01]+|0[xX][0-9A-Fa-f]+|[0-9]+|[-+*|&^/%\[\]<=>,]|[()\\\\;`{}]') | |
141 start = 0 | |
142 for match in token_exp.finditer(a): | |
143 for ch in a[start:match.start()]: | |
144 yield ch | |
145 yield match.group(0) | |
146 start = match.end() | |
147 | |
148 remainder = a[start:] | |
149 if remainder: | |
150 yield remainder | |
151 | |
152 a = list(tokenize(a)) | |
153 b = list(tokenize(b)) | |
154 | |
155 class DualPayloadLinkedListNode(object): | |
156 "This linked list gives each node two next pointers." | |
157 def __init__(self, a, b, differ=None): | |
158 self.a = a | |
159 self.b = b | |
160 self.next = None | |
161 if differ is None: | |
162 differ = (self.a != self.b) | |
163 self.differ = differ | |
164 def __iter__(self): | |
165 def walk_linked_list(x): | |
166 while x is not None: | |
167 yield x | |
168 x = x.next | |
169 return walk_linked_list(self) | |
170 def __repr__(self): | |
171 return repr([('(%r, %r)' % (x.a, x.b)) if x.differ else repr(x.a) for x in self]) | |
172 | |
173 # Linked-list nodes for common substrings will have a single Substring object in both payloads. | |
174 # Nodes for difference runs will have a string in each payload. | |
175 empty_substring = Substring(a, 0, 0, b, 0, 0) | |
176 chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False) | |
177 # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node. | |
178 chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False) | |
179 # Not chunks_head.next, since it will be replaced. | |
180 chunks_tail = chunks_head | |
181 for sub in sorted(longest_common_substring(a, b)): | |
182 last_sub = chunks_tail.a | |
183 a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start]) | |
184 b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start]) | |
185 if a_dif_run or b_dif_run: | |
186 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True) | |
187 chunks_tail = chunks_tail.next | |
188 chunks_tail.next = DualPayloadLinkedListNode(sub, sub, False) | |
189 chunks_tail = chunks_tail.next | |
190 else: | |
191 # Get what comes after the last substring, if anything. | |
192 last_sub = chunks_tail.a | |
193 a_dif_run = ''.join(a[last_sub.a_stop:]) | |
194 b_dif_run = ''.join(b[last_sub.b_stop:]) | |
195 if a_dif_run or b_dif_run: | |
196 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True) | |
197 | |
198 return chunks_head.next | |
69 | 199 |
70 # Everything in the unified diff format is identified by a prefix. The prefixes are: | 200 # Everything in the unified diff format is identified by a prefix. The prefixes are: |
71 # 'Index: ': File marker (unified diff) | 201 # 'Index: ': File marker (unified diff) |
72 # 'diff --git': File marker (git-style diff) | 202 # 'diff --git': File marker (git-style diff) |
73 # 'old mode': File permissions mode before change | 203 # 'old mode': File permissions mode before change |
155 | 285 |
156 # Buffers to support interleaving old and new lines that were contiguous runs. | 286 # Buffers to support interleaving old and new lines that were contiguous runs. |
157 buffer_old = [] # '-' lines | 287 buffer_old = [] # '-' lines |
158 buffer_new = [] # '+' lines | 288 buffer_new = [] # '+' lines |
159 | 289 |
290 def flush_buffers(buffer_old, buffer_new): | |
291 "Flush the buffers, interleaving the lines and highlighting differences between them." | |
292 def print_single_line(buffered_line): | |
293 prefix = '-' if buffered_line.startswith('-') else '+' | |
294 buffered_line = buffered_line[len(prefix):] | |
295 | |
296 sys.stdout.write(prefixes[prefix]) | |
297 sys.stdout.write(buffered_line) | |
298 sys.stdout.write(RESET_FORMAT) | |
299 | |
300 last_line_if_old = None | |
301 for buffered_line in interleave(buffer_old, buffer_new): | |
302 if buffered_line.startswith('-'): | |
303 if last_line_if_old is not None: | |
304 print_single_line(last_line_if_old) | |
305 last_line_if_old = buffered_line | |
306 else: | |
307 if last_line_if_old is None: | |
308 # No old line immediately preceding this, so just print it. | |
309 print_single_line(buffered_line) | |
310 else: | |
311 old_line = last_line_if_old | |
312 new_line = buffered_line | |
313 | |
314 old_line_output = [prefixes['-']] | |
315 new_line_output = [prefixes['+']] | |
316 for node in common_and_distinct_substrings(old_line[1:], new_line[1:]): | |
317 if node.differ: | |
318 old_line_output.append(BEGIN_REVERSE_FORMAT) | |
319 old_line_output.append(str(node.a)) | |
320 old_line_output.append(END_REVERSE_FORMAT) | |
321 | |
322 new_line_output.append(BEGIN_REVERSE_FORMAT) | |
323 new_line_output.append(str(node.b)) | |
324 new_line_output.append(END_REVERSE_FORMAT) | |
325 else: | |
326 old_line_output.append(str(node.a)) | |
327 new_line_output.append(str(node.b)) | |
328 | |
329 last_line_if_old = None | |
330 sys.stdout.writelines(''.join(old_line_output)) | |
331 sys.stdout.writelines(''.join(new_line_output)) | |
332 else: | |
333 if last_line_if_old is not None: | |
334 print_single_line(last_line_if_old) | |
335 | |
336 del buffer_old[:] | |
337 del buffer_new[:] | |
338 | |
160 for line in fileinput.input(): | 339 for line in fileinput.input(): |
161 if line.startswith('-') and not line.startswith('---'): | 340 if line.startswith('-') and not line.startswith('---'): |
162 buffer_old.append(line) | 341 buffer_old.append(line) |
163 continue | 342 continue |
164 elif line.startswith('+') and not line.startswith('+++'): | 343 elif line.startswith('+') and not line.startswith('+++'): |
165 buffer_new.append(line) | 344 buffer_new.append(line) |
166 continue | 345 continue |
167 else: | 346 else: |
168 # Flush the buffers, interleaving the lines. | 347 flush_buffers(buffer_old, buffer_new) |
169 for buffered_line in interleave(buffer_old, buffer_new): | |
170 prefix = '-' if buffered_line.startswith('-') else '+' | |
171 buffered_line = buffered_line[len(prefix):] | |
172 | |
173 sys.stdout.write(prefixes[prefix]) | |
174 sys.stdout.write(buffered_line) | |
175 sys.stdout.write(RESET_FORMAT) | |
176 | |
177 del buffer_old[:] | |
178 del buffer_new[:] | |
179 | 348 |
180 for prefix_to_test in prefixes: | 349 for prefix_to_test in prefixes: |
181 if line.startswith(prefix_to_test): | 350 if line.startswith(prefix_to_test): |
182 sys.stdout.write(prefixes[prefix_to_test]) | 351 sys.stdout.write(prefixes[prefix_to_test]) |
183 line = line[len(prefix_to_test):] | 352 line = line[len(prefix_to_test):] |
184 | 353 |
185 sys.stdout.write(line) | 354 sys.stdout.write(line) |
186 | 355 |
187 sys.stdout.write(RESET_FORMAT) | 356 sys.stdout.write(RESET_FORMAT) |
357 else: | |
358 flush_buffers(buffer_old, buffer_new) |