comparison diff-colorize.py @ 19:b709258a2fc2

Added highlighting differences between consecutive old and new lines.
author Peter Hosey <hg@boredzo.org>
date Wed, 05 Jan 2011 07:53:57 -0800
parents 83d58ccc70bf
children b4caea436f4d
comparison
equal deleted inserted replaced
18:83d58ccc70bf 19:b709258a2fc2
64 64
65 for objects in zip_pad(*sequences): 65 for objects in zip_pad(*sequences):
66 for obj in objects: 66 for obj in objects:
67 if obj is not None: 67 if obj is not None:
68 yield obj 68 yield obj
69
70 class Substring(object):
71 def __init__(self, a, a_start, a_stop, b, b_start, b_stop):
72 self.a = a
73 self.a_start = a_start
74 self.a_stop = a_stop
75 self.b = b
76 self.b_start = b_start
77 self.b_stop = b_stop
78
79 def before_a_substring(self):
80 return self.a[:self.a_start]
81 def before_b_substring(self):
82 return self.b[:self.b_start]
83 def substring(self):
84 return ''.join(self.a[self.a_start:self.a_stop])
85 a_substring = substring
86 b_substring = substring
87 def after_a_substring(self):
88 return self.a[self.a_stop:]
89 def after_b_substring(self):
90 return self.b[self.b_stop:]
91
92 def __hash__(self):
93 return hash(self.substring())
94 def __cmp__(self, other):
95 return cmp(self.a_start, other.a_start)
96 def __eq__(self, other):
97 return self.substring() == other.substring()
98 def __str__(self):
99 return self.substring()
100 def __repr__(self):
101 return 'Substring(%r)' % (self.substring(),)
102 return 'Substring(%r from %r, %r, %r, %r, %r, %r)' % (
103 self.substring(),
104 self.a, self.a_start, self.a_stop,
105 self.b, self.b_start, self.b_stop,
106 )
107
108 def longest_common_substring(a, b):
109 """Returns a set of common substrings between a and b, which can be any finite indexable sliceable sequences. Substrings are returned as Substring objects.
110
111 Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python
112 """
113 a_len = len(a)
114 b_len = len(b)
115 lengths = [[0] * (b_len + 1) for i in xrange(a_len + 1)]
116 substrings = set()
117 greatest_length = current_run_length = 0
118 for a_idx in xrange(a_len):
119 for b_idx in xrange(b_len):
120 if a[a_idx] == b[b_idx]:
121 current_run_length = lengths[a_idx][b_idx] + 1
122 lengths[a_idx+1][b_idx+1] = current_run_length
123 if current_run_length > greatest_length:
124 greatest_length = current_run_length
125 substrings.clear()
126 if current_run_length == greatest_length:
127 # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1])
128 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
129 else:
130 if current_run_length > 0:
131 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
132 return substrings
133
134 def common_and_distinct_substrings(a, b):
135 "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens."
136 def tokenize(a):
137 "Each token is an identifier, a number, or a single character."
138 import re
139 # Identifier, binary number, hex number, decimal number, operator, other punctuation.
140 token_exp = re.compile('[_a-zA-Z][_a-zA-Z0-9]+:?|0b[01]+|0[xX][0-9A-Fa-f]+|[0-9]+|[-+*|&^/%\[\]<=>,]|[()\\\\;`{}]')
141 start = 0
142 for match in token_exp.finditer(a):
143 for ch in a[start:match.start()]:
144 yield ch
145 yield match.group(0)
146 start = match.end()
147
148 remainder = a[start:]
149 if remainder:
150 yield remainder
151
152 a = list(tokenize(a))
153 b = list(tokenize(b))
154
155 class DualPayloadLinkedListNode(object):
156 "This linked list gives each node two next pointers."
157 def __init__(self, a, b, differ=None):
158 self.a = a
159 self.b = b
160 self.next = None
161 if differ is None:
162 differ = (self.a != self.b)
163 self.differ = differ
164 def __iter__(self):
165 def walk_linked_list(x):
166 while x is not None:
167 yield x
168 x = x.next
169 return walk_linked_list(self)
170 def __repr__(self):
171 return repr([('(%r, %r)' % (x.a, x.b)) if x.differ else repr(x.a) for x in self])
172
173 # Linked-list nodes for common substrings will have a single Substring object in both payloads.
174 # Nodes for difference runs will have a string in each payload.
175 empty_substring = Substring(a, 0, 0, b, 0, 0)
176 chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
177 # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node.
178 chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
179 # Not chunks_head.next, since it will be replaced.
180 chunks_tail = chunks_head
181 for sub in sorted(longest_common_substring(a, b)):
182 last_sub = chunks_tail.a
183 a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start])
184 b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start])
185 if a_dif_run or b_dif_run:
186 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True)
187 chunks_tail = chunks_tail.next
188 chunks_tail.next = DualPayloadLinkedListNode(sub, sub, False)
189 chunks_tail = chunks_tail.next
190 else:
191 # Get what comes after the last substring, if anything.
192 last_sub = chunks_tail.a
193 a_dif_run = ''.join(a[last_sub.a_stop:])
194 b_dif_run = ''.join(b[last_sub.b_stop:])
195 if a_dif_run or b_dif_run:
196 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True)
197
198 return chunks_head.next
69 199
70 # Everything in the unified diff format is identified by a prefix. The prefixes are: 200 # Everything in the unified diff format is identified by a prefix. The prefixes are:
71 # 'Index: ': File marker (unified diff) 201 # 'Index: ': File marker (unified diff)
72 # 'diff --git': File marker (git-style diff) 202 # 'diff --git': File marker (git-style diff)
73 # 'old mode': File permissions mode before change 203 # 'old mode': File permissions mode before change
155 285
156 # Buffers to support interleaving old and new lines that were contiguous runs. 286 # Buffers to support interleaving old and new lines that were contiguous runs.
157 buffer_old = [] # '-' lines 287 buffer_old = [] # '-' lines
158 buffer_new = [] # '+' lines 288 buffer_new = [] # '+' lines
159 289
290 def flush_buffers(buffer_old, buffer_new):
291 "Flush the buffers, interleaving the lines and highlighting differences between them."
292 def print_single_line(buffered_line):
293 prefix = '-' if buffered_line.startswith('-') else '+'
294 buffered_line = buffered_line[len(prefix):]
295
296 sys.stdout.write(prefixes[prefix])
297 sys.stdout.write(buffered_line)
298 sys.stdout.write(RESET_FORMAT)
299
300 last_line_if_old = None
301 for buffered_line in interleave(buffer_old, buffer_new):
302 if buffered_line.startswith('-'):
303 if last_line_if_old is not None:
304 print_single_line(last_line_if_old)
305 last_line_if_old = buffered_line
306 else:
307 if last_line_if_old is None:
308 # No old line immediately preceding this, so just print it.
309 print_single_line(buffered_line)
310 else:
311 old_line = last_line_if_old
312 new_line = buffered_line
313
314 old_line_output = [prefixes['-']]
315 new_line_output = [prefixes['+']]
316 for node in common_and_distinct_substrings(old_line[1:], new_line[1:]):
317 if node.differ:
318 old_line_output.append(BEGIN_REVERSE_FORMAT)
319 old_line_output.append(str(node.a))
320 old_line_output.append(END_REVERSE_FORMAT)
321
322 new_line_output.append(BEGIN_REVERSE_FORMAT)
323 new_line_output.append(str(node.b))
324 new_line_output.append(END_REVERSE_FORMAT)
325 else:
326 old_line_output.append(str(node.a))
327 new_line_output.append(str(node.b))
328
329 last_line_if_old = None
330 sys.stdout.writelines(''.join(old_line_output))
331 sys.stdout.writelines(''.join(new_line_output))
332 else:
333 if last_line_if_old is not None:
334 print_single_line(last_line_if_old)
335
336 del buffer_old[:]
337 del buffer_new[:]
338
160 for line in fileinput.input(): 339 for line in fileinput.input():
161 if line.startswith('-') and not line.startswith('---'): 340 if line.startswith('-') and not line.startswith('---'):
162 buffer_old.append(line) 341 buffer_old.append(line)
163 continue 342 continue
164 elif line.startswith('+') and not line.startswith('+++'): 343 elif line.startswith('+') and not line.startswith('+++'):
165 buffer_new.append(line) 344 buffer_new.append(line)
166 continue 345 continue
167 else: 346 else:
168 # Flush the buffers, interleaving the lines. 347 flush_buffers(buffer_old, buffer_new)
169 for buffered_line in interleave(buffer_old, buffer_new):
170 prefix = '-' if buffered_line.startswith('-') else '+'
171 buffered_line = buffered_line[len(prefix):]
172
173 sys.stdout.write(prefixes[prefix])
174 sys.stdout.write(buffered_line)
175 sys.stdout.write(RESET_FORMAT)
176
177 del buffer_old[:]
178 del buffer_new[:]
179 348
180 for prefix_to_test in prefixes: 349 for prefix_to_test in prefixes:
181 if line.startswith(prefix_to_test): 350 if line.startswith(prefix_to_test):
182 sys.stdout.write(prefixes[prefix_to_test]) 351 sys.stdout.write(prefixes[prefix_to_test])
183 line = line[len(prefix_to_test):] 352 line = line[len(prefix_to_test):]
184 353
185 sys.stdout.write(line) 354 sys.stdout.write(line)
186 355
187 sys.stdout.write(RESET_FORMAT) 356 sys.stdout.write(RESET_FORMAT)
357 else:
358 flush_buffers(buffer_old, buffer_new)