Mercurial > diff-colorize
comparison diff-colorize.py @ 19:b709258a2fc2
Added highlighting differences between consecutive old and new lines.
| author | Peter Hosey <hg@boredzo.org> |
|---|---|
| date | Wed, 05 Jan 2011 07:53:57 -0800 |
| parents | 83d58ccc70bf |
| children | b4caea436f4d |
comparison legend: equal · deleted · inserted · replaced
| 18:83d58ccc70bf | 19:b709258a2fc2 |
|---|---|
| 64 | 64 |
| 65 for objects in zip_pad(*sequences): | 65 for objects in zip_pad(*sequences): |
| 66 for obj in objects: | 66 for obj in objects: |
| 67 if obj is not None: | 67 if obj is not None: |
| 68 yield obj | 68 yield obj |
| 69 | |
class Substring(object):
    """A run of tokens common to two token sequences, a and b.

    a and b are indexable, sliceable sequences (here, lists of string
    tokens); the half-open ranges [a_start, a_stop) and [b_start, b_stop)
    delimit the same run of tokens within each sequence.

    Equality and hashing are based on the run's text; ordering is based on
    where the run starts within a, so sorting a set of Substrings yields
    them in the order they appear in a.
    """
    def __init__(self, a, a_start, a_stop, b, b_start, b_stop):
        self.a = a
        self.a_start = a_start
        self.a_stop = a_stop
        self.b = b
        self.b_start = b_start
        self.b_stop = b_stop

    def before_a_substring(self):
        "Tokens of a that precede the common run."
        return self.a[:self.a_start]
    def before_b_substring(self):
        "Tokens of b that precede the common run."
        return self.b[:self.b_start]
    def substring(self):
        "The common run's tokens, joined into a single string."
        return ''.join(self.a[self.a_start:self.a_stop])
    # The two runs are identical by construction, so one accessor serves
    # both sides.
    a_substring = substring
    b_substring = substring
    def after_a_substring(self):
        "Tokens of a that follow the common run."
        return self.a[self.a_stop:]
    def after_b_substring(self):
        "Tokens of b that follow the common run."
        return self.b[self.b_stop:]

    def __hash__(self):
        return hash(self.substring())
    def __cmp__(self, other):
        # Python 2 ordering hook: order by starting position within a.
        return cmp(self.a_start, other.a_start)
    def __lt__(self, other):
        # sorted() on Python 3 ignores __cmp__; provide the same ordering
        # explicitly so sorting Substrings works on both Python versions.
        return self.a_start < other.a_start
    def __eq__(self, other):
        return self.substring() == other.substring()
    def __ne__(self, other):
        # Keep != consistent with the custom __eq__.
        return not self.__eq__(other)
    def __str__(self):
        return self.substring()
    def __repr__(self):
        # (An earlier, more verbose repr after this return was unreachable
        # dead code and has been removed.)
        return 'Substring(%r)' % (self.substring(),)
| 107 | |
def longest_common_substring(a, b):
    """Returns a set of common substrings between a and b, which can be any finite indexable sliceable sequences. Substrings are returned as Substring objects.

    Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python
    """
    a_len = len(a)
    b_len = len(b)
    # Dynamic-programming table: lengths[i+1][j+1] is the length of the
    # common run ending at a[i] and b[j] (0 when a[i] != b[j]). Row 0 and
    # column 0 stay 0 as the base case.
    lengths = [[0] * (b_len + 1) for i in xrange(a_len + 1)]
    substrings = set()
    greatest_length = current_run_length = 0
    for a_idx in xrange(a_len):
        for b_idx in xrange(b_len):
            if a[a_idx] == b[b_idx]:
                # Extend the run that ended at the previous diagonal cell.
                current_run_length = lengths[a_idx][b_idx] + 1
                lengths[a_idx+1][b_idx+1] = current_run_length
                if current_run_length > greatest_length:
                    greatest_length = current_run_length
                    # A strictly longer run supersedes everything collected
                    # so far.
                    substrings.clear()
                if current_run_length == greatest_length:
                    # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1])
                    substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
            else:
                if current_run_length > 0:
                    # NOTE(review): current_run_length here is left over from
                    # the most recent match anywhere in the scan (it is never
                    # reset to 0 on a mismatch, and the inner loop visits many
                    # (a_idx, b_idx) pairs between matches), so the Substring
                    # added below can span positions that do not actually
                    # match. Confirm this branch's intent before relying on
                    # the non-maximal substrings it produces.
                    substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
    return substrings
| 133 | |
def common_and_distinct_substrings(a, b):
    "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens."
    def tokenize(a):
        "Each token is an identifier, a number, or a single character."
        import re
        # Identifier, binary number, hex number, decimal number, operator, other punctuation.
        token_exp = re.compile('[_a-zA-Z][_a-zA-Z0-9]+:?|0b[01]+|0[xX][0-9A-Fa-f]+|[0-9]+|[-+*|&^/%\[\]<=>,]|[()\\\\;`{}]')
        # NOTE(review): the identifier alternative requires at least two
        # characters ('+' after the second class), so a single-letter
        # identifier falls through and is yielded character-by-character
        # below — confirm that is intended.
        start = 0
        for match in token_exp.finditer(a):
            # Characters between recognized tokens are emitted one at a time.
            for ch in a[start:match.start()]:
                yield ch
            yield match.group(0)
            start = match.end()

        # Whatever follows the last recognized token.
        remainder = a[start:]
        if remainder:
            yield remainder

    # From here on, a and b are lists of tokens, not the original strings.
    a = list(tokenize(a))
    b = list(tokenize(b))

    class DualPayloadLinkedListNode(object):
        "This linked list gives each node two next pointers."
        def __init__(self, a, b, differ=None):
            self.a = a
            self.b = b
            self.next = None
            if differ is None:
                # Default: the node is a "difference" node iff its payloads
                # are unequal.
                differ = (self.a != self.b)
            self.differ = differ
        def __iter__(self):
            # Walk this node and every node after it.
            def walk_linked_list(x):
                while x is not None:
                    yield x
                    x = x.next
            return walk_linked_list(self)
        def __repr__(self):
            return repr([('(%r, %r)' % (x.a, x.b)) if x.differ else repr(x.a) for x in self])

    # Linked-list nodes for common substrings will have a single Substring object in both payloads.
    # Nodes for difference runs will have a string in each payload.
    empty_substring = Substring(a, 0, 0, b, 0, 0)
    chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
    # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node.
    chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
    # Not chunks_head.next, since it will be replaced.
    chunks_tail = chunks_head
    # Common substrings are visited in order of their position within a.
    for sub in sorted(longest_common_substring(a, b)):
        last_sub = chunks_tail.a
        # Tokens between the end of the previous common run and the start of
        # this one form a difference run.
        a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start])
        b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start])
        if a_dif_run or b_dif_run:
            chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True)
            chunks_tail = chunks_tail.next
        chunks_tail.next = DualPayloadLinkedListNode(sub, sub, False)
        chunks_tail = chunks_tail.next
    else:
        # for/else: runs once the loop finishes (there is no break).
        # Get what comes after the last substring, if anything.
        last_sub = chunks_tail.a
        a_dif_run = ''.join(a[last_sub.a_stop:])
        b_dif_run = ''.join(b[last_sub.b_stop:])
        if a_dif_run or b_dif_run:
            chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True)

    # Skip the all-empty head sentinel.
    return chunks_head.next
| 69 | 199 |
| 70 # Everything in the unified diff format is identified by a prefix. The prefixes are: | 200 # Everything in the unified diff format is identified by a prefix. The prefixes are: |
| 71 # 'Index: ': File marker (unified diff) | 201 # 'Index: ': File marker (unified diff) |
| 72 # 'diff --git': File marker (git-style diff) | 202 # 'diff --git': File marker (git-style diff) |
| 73 # 'old mode': File permissions mode before change | 203 # 'old mode': File permissions mode before change |
| 155 | 285 |
# Buffers to support interleaving old and new lines that were contiguous runs.
# Both are flushed (and emptied in place) by flush_buffers below.
buffer_old = [] # '-' lines
buffer_new = [] # '+' lines
| 159 | 289 |
def flush_buffers(buffer_old, buffer_new):
    """Flush the buffers, interleaving the lines and highlighting differences between them.

    Consecutive '-'/'+' pairs are printed with their differing token runs in
    reverse video; unpaired lines are printed with plain '-'/'+' coloring.
    Both buffer lists are emptied in place so the caller can keep reusing
    them.
    """
    def print_single_line(buffered_line):
        # Re-emit one buffered line with the usual '-' or '+' coloring.
        prefix = '-' if buffered_line.startswith('-') else '+'
        buffered_line = buffered_line[len(prefix):]

        sys.stdout.write(prefixes[prefix])
        sys.stdout.write(buffered_line)
        sys.stdout.write(RESET_FORMAT)

    # Holds the most recent '-' line until we know whether a '+' line
    # immediately follows it (making an old/new pair).
    last_line_if_old = None
    for buffered_line in interleave(buffer_old, buffer_new):
        if buffered_line.startswith('-'):
            if last_line_if_old is not None:
                # Two '-' lines in a row: the earlier one has no partner.
                print_single_line(last_line_if_old)
            last_line_if_old = buffered_line
        else:
            if last_line_if_old is None:
                # No old line immediately preceding this, so just print it.
                print_single_line(buffered_line)
            else:
                # An old/new pair: emit both, reverse-videoing the runs of
                # tokens that differ between them. [1:] strips the '-'/'+'.
                old_line = last_line_if_old
                new_line = buffered_line

                old_line_output = [prefixes['-']]
                new_line_output = [prefixes['+']]
                for node in common_and_distinct_substrings(old_line[1:], new_line[1:]):
                    if node.differ:
                        old_line_output.append(BEGIN_REVERSE_FORMAT)
                        old_line_output.append(str(node.a))
                        old_line_output.append(END_REVERSE_FORMAT)

                        new_line_output.append(BEGIN_REVERSE_FORMAT)
                        new_line_output.append(str(node.b))
                        new_line_output.append(END_REVERSE_FORMAT)
                    else:
                        old_line_output.append(str(node.a))
                        new_line_output.append(str(node.b))

                last_line_if_old = None
                # Fix: the original called sys.stdout.writelines() on a single
                # joined string, which iterates it character-by-character;
                # write() emits the same bytes in one correct call.
                # NOTE(review): unlike print_single_line, no RESET_FORMAT is
                # written after this pair — confirm the prefix coloring does
                # not bleed into subsequent output.
                sys.stdout.write(''.join(old_line_output))
                sys.stdout.write(''.join(new_line_output))
    else:
        # for/else: after the loop, flush a trailing unpaired '-' line.
        if last_line_if_old is not None:
            print_single_line(last_line_if_old)

    # Empty the buffers in place (the caller holds the same list objects).
    del buffer_old[:]
    del buffer_new[:]
| 338 | |
# Main loop: read the diff from stdin/argv files, buffering contiguous runs
# of removed/added lines so flush_buffers can pair and highlight them.
for line in fileinput.input():
    # '-'/'+' lines (but not the '---'/'+++' file headers) are buffered,
    # not printed yet.
    if line.startswith('-') and not line.startswith('---'):
        buffer_old.append(line)
        continue
    elif line.startswith('+') and not line.startswith('+++'):
        buffer_new.append(line)
        continue
    else:
        # Any other line ends the current run of changed lines.
        flush_buffers(buffer_old, buffer_new)

        # Color the line according to whichever known prefix it starts
        # with, consuming the prefix from the line as we go.
        for prefix_to_test in prefixes:
            if line.startswith(prefix_to_test):
                sys.stdout.write(prefixes[prefix_to_test])
                line = line[len(prefix_to_test):]

        sys.stdout.write(line)

        sys.stdout.write(RESET_FORMAT)
else:
    # for/else: flush whatever is still buffered when input ends.
    flush_buffers(buffer_old, buffer_new)
