Mercurial > diff-colorize
comparison diff-colorize.py @ 20:b4caea436f4d
Improve the highlighting of differences using a longest common subsequence algorithm.
author | Peter Hosey <hg@boredzo.org> |
---|---|
date | Wed, 05 Jan 2011 16:21:37 -0800 |
parents | b709258a2fc2 |
children | 929a488c4960 |
comparison
equal
deleted
inserted
replaced
19:b709258a2fc2 | 20:b4caea436f4d |
---|---|
104 self.a, self.a_start, self.a_stop, | 104 self.a, self.a_start, self.a_stop, |
105 self.b, self.b_start, self.b_stop, | 105 self.b, self.b_start, self.b_stop, |
106 ) | 106 ) |
107 | 107 |
108 def longest_common_substring(a, b): | 108 def longest_common_substring(a, b): |
109 """Returns a set of common substrings between a and b, which can be any finite indexable sliceable sequences. Substrings are returned as Substring objects. | 109 """Returns the longest common substring between a and b, which can be any finite indexable sliceable sequences, as a Substring object. Returns None if there is no substring between the sequences. |
110 | 110 |
111 Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python | 111 Clarified and slightly modified (to use a special Substring object) from http://en.wikibooks.org/w/index.php?title=Algorithm_implementation/Strings/Longest_common_substring&oldid=1419225#Python |
112 """ | 112 """ |
113 a_len = len(a) | 113 a_len = len(a) |
114 b_len = len(b) | 114 b_len = len(b) |
127 # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1]) | 127 # substrings.add(a[a_idx - current_run_length + 1:a_idx + 1]) |
128 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) | 128 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) |
129 else: | 129 else: |
130 if current_run_length > 0: | 130 if current_run_length > 0: |
131 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) | 131 substrings.add(Substring(a, a_idx - current_run_length + 1, a_idx + 1, b, b_idx - current_run_length + 1, b_idx + 1)) |
132 return substrings | 132 try: |
133 return substrings.pop() | |
134 except KeyError: | |
135 return None | |
136 | |
137 def common_subsequence(a, b): | |
138 "Returns all common substrings between a and b, which can be any finite indexable sliceable sequences, as Substring objects. Determines this by recursively calling itself on slices of a and b before and after each longest common substring." | |
139 # Inspired by http://en.wikibooks.org/w/index.php?title=Algorithm_Implementation/Strings/Longest_common_subsequence&oldid=1912924#Python | |
140 def LCS_length_matrix(a, b): | |
141 matrix = [[0] * (len(b) + 1) for i in xrange(len(a) + 1)] | |
142 for i, a_ch in enumerate(a): | |
143 for j, b_ch in enumerate(b): | |
144 if a_ch == b_ch: | |
145 matrix[i + 1][j + 1] = matrix[i][j] + 1 | |
146 else: | |
147 matrix[i + 1][j + 1] = max(matrix[i + 1][j], matrix[i][j + 1]) | |
148 return matrix | |
149 | |
150 def recursive_build_subsequence(a, b, matrix=None, i=None, j=None): | |
151 if matrix is None: | |
152 matrix = LCS_length_matrix(a, b) | |
153 if i is None: | |
154 i = len(a) | |
155 if j is None: | |
156 j = len(b) | |
157 | |
158 if i == 0 or j == 0: | |
159 return [] | |
160 elif a[i - 1] == b[j - 1]: | |
161 return recursive_build_subsequence(a, b, matrix, i - 1, j - 1) + [Substring(a, i - 1, i, b, j - 1, j)] | |
162 else: | |
163 if matrix[i][j - 1] > matrix[i - 1][j]: | |
164 return recursive_build_subsequence(a, b, matrix, i, j - 1) | |
165 else: | |
166 return recursive_build_subsequence(a, b, matrix, i - 1, j) | |
167 | |
168 return recursive_build_subsequence(a, b) | |
133 | 169 |
134 def common_and_distinct_substrings(a, b): | 170 def common_and_distinct_substrings(a, b): |
135 "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens." | 171 "Takes two strings, a and b, tokenizes them, and returns a linked list whose nodes contain runs of either common or unique tokens." |
136 def tokenize(a): | 172 def tokenize(a): |
137 "Each token is an identifier, a number, or a single character." | 173 "Each token is an identifier, a number, or a single character." |
176 chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False) | 212 chunks_head = DualPayloadLinkedListNode(empty_substring, empty_substring, False) |
177 # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node. | 213 # This node is used when the input strings have no common substrings. When they do have common substrings, this node will be replaced with a real node. |
178 chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False) | 214 chunks_head.next = DualPayloadLinkedListNode(empty_substring, empty_substring, False) |
179 # Not chunks_head.next, since it will be replaced. | 215 # Not chunks_head.next, since it will be replaced. |
180 chunks_tail = chunks_head | 216 chunks_tail = chunks_head |
181 for sub in sorted(longest_common_substring(a, b)): | 217 for sub in sorted(common_subsequence(a, b)): |
182 last_sub = chunks_tail.a | 218 last_sub = chunks_tail.a |
183 a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start]) | 219 a_dif_run = ''.join(a[last_sub.a_stop:sub.a_start]) |
184 b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start]) | 220 b_dif_run = ''.join(b[last_sub.b_stop:sub.b_start]) |
185 if a_dif_run or b_dif_run: | 221 if a_dif_run or b_dif_run: |
186 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True) | 222 chunks_tail.next = DualPayloadLinkedListNode(a_dif_run, b_dif_run, True) |