Mercurial > diff-colorize

--- a/diff-colorize.py
+++ b/diff-colorize.py
@@ -43,6 +43,198 @@ You can customize the color numbers used
 * DIFF_HUNK_START_COLOR (lines starting with "@@")
 """.strip()

+def interleave(*sequences):
+	"Generator that yields one object from each sequence in turn."
+
+	def zip_pad(*iterables, **kw):
+		"Downloaded from http://code.activestate.com/recipes/497007/"
+		from itertools import izip, chain
+		if kw:
+			assert len(kw) == 1
+			pad = kw["pad"]
+		else:
+			pad = None
+		done = [len(iterables)-1]
+		def pad_iter():
+			if not done[0]:
+				return
+			done[0] -= 1
+			while 1:
+				yield pad
+		iterables = [chain(seq, pad_iter()) for seq in iterables]
+		return izip(*iterables)
+
+	for objects in zip_pad(*sequences):
+		for obj in objects:
+			if obj is not None:
+				yield obj
+
+class Substring(object):
+	def __init__(self, a, a_start, a_stop, b, b_start, b_stop):
+		self.a = a
+		self.a_start = a_start
+		self.a_stop = a_stop
+		self.b = b
+		self.b_start = b_start
+		self.b_stop = b_stop
+
+	def before_a_substring(self):
+		return self.a[:self.a_start]
+	def before_b_substring(self):
+		return self.b[:self.b_start]
+	def substring(self):
+		return ''.join(self.a[self.a_start:self.a_stop])
+	a_substring = substring
+	b_substring = substring
+	def after_a_substring(self):
+		return self.a[self.a_stop:]
+	def after_b_substring(self):
+		return self.b[self.b_stop:]
+
+	def __hash__(self):
+		return hash(self.substring())
+	def __cmp__(self, other):
+		return cmp(self.a_start, other.a_start)
+	def __eq__(self, other):
+		return self.substring() == other.substring()
+	def __str__(self):
+		return self.substring()
+	def __repr__(self):
+		return 'Substring(%r)' % (self.substring(),)
+		return 'Substring(%r from %r, %r, %r, %r, %r, %r)' % (
+			self.substring(),
+			self.a, self.a_start, self.a_stop,
+			self.b, self.b_start, self.b_stop,
+			)
+
+def longest_common_substring(a, b):
+	"""Returns the longest common substring between a and b, which can be any finite indexable sliceable sequences, as a Substring object. Returns None if there is no substring between the sequences.
+
+Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python
+"""
+	a_len = len(a)
+	b_len = len(b)
+	lengths = [[0] * (b_len + 1) for i in xrange(a_len + 1)]
+	substrings = set()
+	greatest_length = current_run_length = 0
+	for a_idx in xrange(a_len):
+		for b_idx in xrange(b_len):
+			if a[a_idx] == b[b_idx]:
+				current_run_length = lengths[a_idx][b_idx] + 1
+				lengths[a_idx+1][b_idx+1] = current_run_length
+				if current_run_length > greatest_length:
+					greatest_length = current_run_length
+					substrings.clear()
+				if current_run_length == greatest_length:
+					# substrings.add(a[a_idx - current_run_length + 1:a_idx + 1])
+					substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
+	else:
+		if current_run_length > 0:
+			substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
+	try:
+		return substrings.pop()
+	except KeyError:
+		return None
+
+def common_subsequence(a, b):
+	"Returns all common substrings between a and b, which can be any finite indexable sliceable sequences, as Substring objects. Determines this by recursively calling itself on slices of a and b before and after each longest common substring."
+	# Inspired by http://en.wikibooks.org/w/index.php?title=Algorithm_Implementation/Strings/Longest_common_subsequence&oldid=1912924#Python
+	def LCS_length_matrix(a, b):
+		matrix = [[0] * (len(b) + 1) for i in xrange(len(a) + 1)]
+		for i, a_ch in enumerate(a):
+			for j, b_ch in enumerate(b):
+				if a_ch == b_ch:
+					matrix[i + 1][j + 1] = matrix[i][j] + 1
+				else:
+					matrix[i + 1][j + 1] = max(matrix[i + 1][j], matrix[i][j + 1])
+		return matrix
+
+	def recursive_build_subsequence(a, b, matrix=None, i=None, j=None):
+		if matrix is None:
+			matrix = LCS_length_matrix(a, b)
+		if i is None:
+			i = len(a)
+		if j is None:
+			j = len(b)
+
+		if i == 0 or j == 0:
+			return []
+		elif a[i - 1] == b[j - 1]:
+			return recursive_build_subsequence(a, b, matrix, i - 1, j - 1) + [Substring(a, i - 1, i, b, j - 1, j)]
+		else:
+			if matrix[i][j - 1] > matrix[i - 1][j]:
+				return recursive_build_subsequence(a, b, matrix, i, j - 1)
+			else:
+				return recursive_build_subsequence(a, b, matrix, i - 1, j)
+
+	return recursive_build_subsequence(a, b)
+
+def common_and_distinct_substrings(a, b):
+	"Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens."
+	def tokenize(a):
+		"Each token is an identifier, a number, or a single character."
+		import re
+		# Identifier, binary number, hex number, decimal number, operator, other punctuation.
+		token_exp = re.compile('[_a-zA-Z][_a-zA-Z0-9]+:?|0b[01]+|0[xX][0-9A-Fa-f]+|[0-9]+|[-+*|&^/%\[\]<=>,]|[()\\\\;`{}]')
+		start = 0
+		for match in token_exp.finditer(a):
+			for ch in a[start:match.start()]:
+				yield ch
+			yield match.group(0)
+			start = match.end()
+
+		remainder = a[start:]
+		if remainder:
+			yield remainder
+
+	a = list(tokenize(a))
+	b = list(tokenize(b))
+
+	class DualPayloadLinkedListNode(object):
+		"This linked list gives each node two next pointers."
+		def __init__(self, a, b, differ=None):
+			self.a = a
+			self.b = b
+			self.next = None
+			if differ is None:
+				differ = (self.a != self.b)
+			self.differ = differ
+		def __iter__(self):
+			def walk_linked_list(x):
+				while x is not None:
+					yield x
+					x = x.next
+			return walk_linked_list(self)
+		def __repr__(self):
+			return repr([('(%r, %r)' % (x.a, x.b)) if x.differ else repr(x.a) for x in self])
+
+	# Linked-list nodes for common substrings will have a single Substring object in both payloads.
+	# Nodes for difference runs will have a string in each payload.
+	empty_substring = Substring(a, 0, 0, b, 0, 0)
+	chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
+	# This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node.
+	chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
+	# Not chunks_head.next, since it will be replaced.
+	chunks_tail = chunks_head
+	for sub in sorted(common_subsequence(a, b)):
+		last_sub = chunks_tail.a
+		a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start])
+		b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start])
+		if a_dif_run or b_dif_run:
+			chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True)
+			chunks_tail = chunks_tail.next
+		chunks_tail.next = DualPayloadLinkedListNode(sub, sub, False)
+		chunks_tail = chunks_tail.next
+	else:
+		# Get what comes after the last substring, if anything.
+		last_sub = chunks_tail.a
+		a_dif_run = ''.join(a[last_sub.a_stop:])
+		b_dif_run = ''.join(b[last_sub.b_stop:])
+		if a_dif_run or b_dif_run:
+			chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True)
+
+	return chunks_head.next
+
 # Everything in the unified diff format is identified by a prefix. The prefixes are:
 # 'Index: ':    File marker (unified diff)
 # 'diff --git': File marker (git-style diff)
@@ -129,7 +321,80 @@ if __name__ == "__main__":
 		# Standard input is a TTY, meaning that the user ran 'diff-colorize' at the shell prompt, without redirecting anything into it. Print usage info and exit.
 		sys.exit(USAGE)

+	# Buffers to support interleaving old and new lines that were contiguous runs.
+	buffer_old = [] # '-' lines
+	buffer_new = [] # '+' lines
+
+	from string import whitespace
+
+	def flush_buffers(buffer_old, buffer_new):
+		"Flush the buffers, interleaving the lines and highlighting differences between them."
+		def print_single_line(buffered_line):
+			prefix = '-' if buffered_line.startswith('-') else '+'
+			buffered_line = buffered_line[len(prefix):]
+
+			sys.stdout.write(prefixes[prefix])
+			sys.stdout.write(buffered_line)
+			sys.stdout.write(RESET_FORMAT)
+
+		last_line_if_old = None
+		for buffered_line in interleave(buffer_old, buffer_new):
+			if buffered_line.startswith('-'):
+				if last_line_if_old is not None:
+					print_single_line(last_line_if_old)
+				last_line_if_old = buffered_line
+			else:
+				if last_line_if_old is None:
+					# No old line immediately preceding this, so just print it.
+					print_single_line(buffered_line)
+				else:
+					old_line = last_line_if_old
+					new_line = buffered_line
+
+					old_line_output = [prefixes['-']]
+					new_line_output = [prefixes['+']]
+
+					differenced_lines = common_and_distinct_substrings(old_line[1:], new_line[1:])
+					lines_have_any_non_whitespace_part_in_common = False
+					for node in differenced_lines:
+						if not node.differ:
+							if str(node.a) not in whitespace:
+								lines_have_any_non_whitespace_part_in_common = True
+								break
+
+					for node in differenced_lines:
+						if lines_have_any_non_whitespace_part_in_common and node.differ:
+							old_line_output.append(BEGIN_REVERSE_FORMAT)
+							old_line_output.append(str(node.a))
+							old_line_output.append(END_REVERSE_FORMAT)
+
+							new_line_output.append(BEGIN_REVERSE_FORMAT)
+							new_line_output.append(str(node.b))
+							new_line_output.append(END_REVERSE_FORMAT)
+						else:
+							old_line_output.append(str(node.a))
+							new_line_output.append(str(node.b))
+
+					last_line_if_old = None
+					sys.stdout.writelines(''.join(old_line_output))
+					sys.stdout.writelines(''.join(new_line_output))
+		else:
+			if last_line_if_old is not None:
+				print_single_line(last_line_if_old)
+
+		del buffer_old[:]
+		del buffer_new[:]
+
 	for line in fileinput.input():
+		if line.startswith('-') and not line.startswith('---'):
+			buffer_old.append(line)
+			continue
+		elif line.startswith('+') and not line.startswith('+++'):
+			buffer_new.append(line)
+			continue
+		else:
+			flush_buffers(buffer_old, buffer_new)
+
 		for prefix_to_test in prefixes:
 			if line.startswith(prefix_to_test):
 				sys.stdout.write(prefixes[prefix_to_test])
@@ -138,3 +403,5 @@ if __name__ == "__main__":
 		sys.stdout.write(line)

 		sys.stdout.write(RESET_FORMAT)
+	else:
+		flush_buffers(buffer_old, buffer_new)