# HG changeset patch # User Peter Hosey # Date 1294290317 28800 # Node ID d2bb1603f081a9eaeb93e0f8f0ae5d4d8ffa3ec2 # Parent 929a488c49606adfa21e58997a6a46b64984aab2# Parent d65be40d24b7c70d2babc7756c0f73820479ed34 Merged in the new 1.0 version-message changeset. diff --git a/diff-colorize.py b/diff-colorize.py --- a/diff-colorize.py +++ b/diff-colorize.py @@ -43,6 +43,198 @@ You can customize the color numbers used * DIFF_HUNK_START_COLOR (lines starting with "@@") """.strip() +def interleave(*sequences): + "Generator that yields one object from each sequence in turn." + + def zip_pad(*iterables, **kw): + "Downloaded from http://code.activestate.com/recipes/497007/" + from itertools import izip, chain + if kw: + assert len(kw) == 1 + pad = kw["pad"] + else: + pad = None + done = [len(iterables)-1] + def pad_iter(): + if not done[0]: + return + done[0] -= 1 + while 1: + yield pad + iterables = [chain(seq, pad_iter()) for seq in iterables] + return izip(*iterables) + + for objects in zip_pad(*sequences): + for obj in objects: + if obj is not None: + yield obj + +class Substring(object): + def __init__(self, a, a_start, a_stop, b, b_start, b_stop): + self.a = a + self.a_start = a_start + self.a_stop = a_stop + self.b = b + self.b_start = b_start + self.b_stop = b_stop + + def before_a_substring(self): + return self.a[:self.a_start] + def before_b_substring(self): + return self.b[:self.b_start] + def substring(self): + return ''.join(self.a[self.a_start:self.a_stop]) + a_substring = substring + b_substring = substring + def after_a_substring(self): + return self.a[self.a_stop:] + def after_b_substring(self): + return self.b[self.b_stop:] + + def __hash__(self): + return hash(self.substring()) + def __cmp__(self, other): + return cmp(self.a_start, other.a_start) + def __eq__(self, other): + return self.substring() == other.substring() + def __str__(self): + return self.substring() + def __repr__(self): + return 'Substring(%r)' % (self.substring(),) + return 'Substring(%r from %r, %r, %r, %r, %r, %r)' % ( + self.substring(), + self.a, self.a_start, self.a_stop, + self.b, self.b_start, self.b_stop, + ) + +def longest_common_substring(a, b): + """Returns the longest common substring between a and b, which can be any finite indexable sliceable sequences, as a Substring object. Returns None if there is no substring between the sequences. + +Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python +""" + a_len = len(a) + b_len = len(b) + lengths = [[0] * (b_len + 1) for i in xrange(a_len + 1)] + substrings = set() + greatest_length = current_run_length = 0 + for a_idx in xrange(a_len): + for b_idx in xrange(b_len): + if a[a_idx] == b[b_idx]: + current_run_length = lengths[a_idx][b_idx] + 1 + lengths[a_idx+1][b_idx+1] = current_run_length + if current_run_length > greatest_length: + greatest_length = current_run_length + substrings.clear() + if current_run_length == greatest_length: + # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1]) + substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) + else: + if current_run_length > 0: + substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) + try: + return substrings.pop() + except KeyError: + return None + +def common_subsequence(a, b): + "Returns all common substrings between a and b, which can be any finite indexable sliceable sequences, as Substring objects. Determines this by recursively calling itself on slices of a and b before and after each longest common substring." + # Inspired by http://en.wikibooks.org/w/index.php?title=Algorithm_Implementation/Strings/Longest_common_subsequence&oldid=1912924#Python + def LCS_length_matrix(a, b): + matrix = [[0] * (len(b) + 1) for i in xrange(len(a) + 1)] + for i, a_ch in enumerate(a): + for j, b_ch in enumerate(b): + if a_ch == b_ch: + matrix[i + 1][j + 1] = matrix[i][j] + 1 + else: + matrix[i + 1][j + 1] = max(matrix[i + 1][j], matrix[i][j + 1]) + return matrix + + def recursive_build_subsequence(a, b, matrix=None, i=None, j=None): + if matrix is None: + matrix = LCS_length_matrix(a, b) + if i is None: + i = len(a) + if j is None: + j = len(b) + + if i == 0 or j == 0: + return [] + elif a[i - 1] == b[j - 1]: + return recursive_build_subsequence(a, b, matrix, i - 1, j - 1) + [Substring(a, i - 1, i, b, j - 1, j)] + else: + if matrix[i][j - 1] > matrix[i - 1][j]: + return recursive_build_subsequence(a, b, matrix, i, j - 1) + else: + return recursive_build_subsequence(a, b, matrix, i - 1, j) + + return recursive_build_subsequence(a, b) + +def common_and_distinct_substrings(a, b): + "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens." + def tokenize(a): + "Each token is an identifier, a number, or a single character." + import re + # Identifier, binary number, hex number, decimal number, operator, other punctuation. + token_exp = re.compile('[_a-zA-Z][_a-zA-Z0-9]+:?|0b[01]+|0[xX][0-9A-Fa-f]+|[0-9]+|[-+*|&^/%\[\]<=>,]|[()\\\\;`{}]') + start = 0 + for match in token_exp.finditer(a): + for ch in a[start:match.start()]: + yield ch + yield match.group(0) + start = match.end() + + remainder = a[start:] + if remainder: + yield remainder + + a = list(tokenize(a)) + b = list(tokenize(b)) + + class DualPayloadLinkedListNode(object): + "This linked list gives each node two next pointers." + def __init__(self, a, b, differ=None): + self.a = a + self.b = b + self.next = None + if differ is None: + differ = (self.a != self.b) + self.differ = differ + def __iter__(self): + def walk_linked_list(x): + while x is not None: + yield x + x = x.next + return walk_linked_list(self) + def __repr__(self): + return repr([('(%r, %r)' % (x.a, x.b)) if x.differ else repr(x.a) for x in self]) + + # Linked-list nodes for common substrings will have a single Substring object in both payloads. + # Nodes for difference runs will have a string in each payload. + empty_substring = Substring(a, 0, 0, b, 0, 0) + chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False) + # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node. + chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False) + # Not chunks_head.next, since it will be replaced. + chunks_tail = chunks_head + for sub in sorted(common_subsequence(a, b)): + last_sub = chunks_tail.a + a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start]) + b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start]) + if a_dif_run or b_dif_run: + chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True) + chunks_tail = chunks_tail.next + chunks_tail.next = DualPayloadLinkedListNode(sub, sub, False) + chunks_tail = chunks_tail.next + else: + # Get what comes after the last substring, if anything. + last_sub = chunks_tail.a + a_dif_run = ''.join(a[last_sub.a_stop:]) + b_dif_run = ''.join(b[last_sub.b_stop:]) + if a_dif_run or b_dif_run: + chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True) + + return chunks_head.next + # Everything in the unified diff format is identified by a prefix. The prefixes are: # 'Index: ': File marker (unified diff) # 'diff --git': File marker (git-style diff) @@ -129,7 +321,80 @@ if __name__ == "__main__": # Standard input is a TTY, meaning that the user ran 'diff-colorize' at the shell prompt, without redirecting anything into it. Print usage info and exit. sys.exit(USAGE) + # Buffers to support interleaving old and new lines that were contiguous runs. + buffer_old = [] # '-' lines + buffer_new = [] # '+' lines + + from string import whitespace + + def flush_buffers(buffer_old, buffer_new): + "Flush the buffers, interleaving the lines and highlighting differences between them." + def print_single_line(buffered_line): + prefix = '-' if buffered_line.startswith('-') else '+' + buffered_line = buffered_line[len(prefix):] + + sys.stdout.write(prefixes[prefix]) + sys.stdout.write(buffered_line) + sys.stdout.write(RESET_FORMAT) + + last_line_if_old = None + for buffered_line in interleave(buffer_old, buffer_new): + if buffered_line.startswith('-'): + if last_line_if_old is not None: + print_single_line(last_line_if_old) + last_line_if_old = buffered_line + else: + if last_line_if_old is None: + # No old line immediately preceding this, so just print it. + print_single_line(buffered_line) + else: + old_line = last_line_if_old + new_line = buffered_line + + old_line_output = [prefixes['-']] + new_line_output = [prefixes['+']] + + differenced_lines = common_and_distinct_substrings(old_line[1:], new_line[1:]) + lines_have_any_non_whitespace_part_in_common = False + for node in differenced_lines: + if not node.differ: + if str(node.a) not in whitespace: + lines_have_any_non_whitespace_part_in_common = True + break + + for node in differenced_lines: + if lines_have_any_non_whitespace_part_in_common and node.differ: + old_line_output.append(BEGIN_REVERSE_FORMAT) + old_line_output.append(str(node.a)) + old_line_output.append(END_REVERSE_FORMAT) + + new_line_output.append(BEGIN_REVERSE_FORMAT) + new_line_output.append(str(node.b)) + new_line_output.append(END_REVERSE_FORMAT) + else: + old_line_output.append(str(node.a)) + new_line_output.append(str(node.b)) + + last_line_if_old = None + sys.stdout.writelines(''.join(old_line_output)) + sys.stdout.writelines(''.join(new_line_output)) + else: + if last_line_if_old is not None: + print_single_line(last_line_if_old) + + del buffer_old[:] + del buffer_new[:] + for line in fileinput.input(): + if line.startswith('-') and not line.startswith('---'): + buffer_old.append(line) + continue + elif line.startswith('+') and not line.startswith('+++'): + buffer_new.append(line) + continue + else: + flush_buffers(buffer_old, buffer_new) + for prefix_to_test in prefixes: if line.startswith(prefix_to_test): sys.stdout.write(prefixes[prefix_to_test]) @@ -138,3 +403,5 @@ if __name__ == "__main__": sys.stdout.write(line) sys.stdout.write(RESET_FORMAT) + else: + flush_buffers(buffer_old, buffer_new)