comparison diff-colorize.py @ 20:b4caea436f4d

Improve the highlighting of differences using a longest common subsequence algorithm.
author Peter Hosey <hg@boredzo.org>
date Wed, 05 Jan 2011 16:21:37 -0800
parents b709258a2fc2
children 929a488c4960
comparison
equal deleted inserted replaced
19:b709258a2fc2 20:b4caea436f4d
104 self.a, self.a_start, self.a_stop, 104 self.a, self.a_start, self.a_stop,
105 self.b, self.b_start, self.b_stop, 105 self.b, self.b_start, self.b_stop,
106 ) 106 )
107 107
108 def longest_common_substring(a, b): 108 def longest_common_substring(a, b):
109 """Returns a set of common substrings between a and b, which can be any finite indexable sliceable sequences. Substrings are returned as Substring objects. 109 """Returns the longest common substring between a and b, which can be any finite indexable sliceable sequences, as a Substring object. Returns None if there is no substring between the sequences.
110 110
111 Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python 111 Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python
112 """ 112 """
113 a_len = len(a) 113 a_len = len(a)
114 b_len = len(b) 114 b_len = len(b)
127 # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1]) 127 # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1])
128 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) 128 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
129 else: 129 else:
130 if current_run_length > 0: 130 if current_run_length > 0:
131 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) 131 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1))
132 return substrings 132 try:
133 return substrings.pop()
134 except KeyError:
135 return None
136
137 def common_subsequence(a, b):
138 "Returns all common substrings between a and b, which can be any finite indexable sliceable sequences, as Substring objects. Determines this by recursively calling itself on slices of a and b before and after each longest common substring."
139 # Inspired by http://en.wikibooks.org/w/index.php?title=Algorithm_Implementation/Strings/Longest_common_subsequence&oldid=1912924#Python
140 def LCS_length_matrix(a, b):
141 matrix = [[0] * (len(b) + 1) for i in xrange(len(a) + 1)]
142 for i, a_ch in enumerate(a):
143 for j, b_ch in enumerate(b):
144 if a_ch == b_ch:
145 matrix[i + 1][j + 1] = matrix[i][j] + 1
146 else:
147 matrix[i + 1][j + 1] = max(matrix[i + 1][j], matrix[i][j + 1])
148 return matrix
149
150 def recursive_build_subsequence(a, b, matrix=None, i=None, j=None):
151 if matrix is None:
152 matrix = LCS_length_matrix(a, b)
153 if i is None:
154 i = len(a)
155 if j is None:
156 j = len(b)
157
158 if i == 0 or j == 0:
159 return []
160 elif a[i - 1] == b[j - 1]:
161 return recursive_build_subsequence(a, b, matrix, i - 1, j - 1) + [Substring(a, i - 1, i, b, j - 1, j)]
162 else:
163 if matrix[i][j - 1] > matrix[i - 1][j]:
164 return recursive_build_subsequence(a, b, matrix, i, j - 1)
165 else:
166 return recursive_build_subsequence(a, b, matrix, i - 1, j)
167
168 return recursive_build_subsequence(a, b)
133 169
134 def common_and_distinct_substrings(a, b): 170 def common_and_distinct_substrings(a, b):
135 "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens." 171 "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens."
136 def tokenize(a): 172 def tokenize(a):
137 "Each token is an identifier, a number, or a single character." 173 "Each token is an identifier, a number, or a single character."
176 chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False) 212 chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
177 # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node. 213 # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node.
178 chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False) 214 chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False)
179 # Not chunks_head.next, since it will be replaced. 215 # Not chunks_head.next, since it will be replaced.
180 chunks_tail = chunks_head 216 chunks_tail = chunks_head
181 for sub in sorted(longest_common_substring(a, b)): 217 for sub in sorted(common_subsequence(a, b)):
182 last_sub = chunks_tail.a 218 last_sub = chunks_tail.a
183 a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start]) 219 a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start])
184 b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start]) 220 b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start])
185 if a_dif_run or b_dif_run: 221 if a_dif_run or b_dif_run:
186 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True) 222 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True)