@@ -105,12 +105,8 @@ def preprocess(self):
105105
106106 # discard lines that do not match any line from the other file
107107 if n > 0 and m > 0 :
108- aset = set ()
109- bset = set ()
110- for newline in b :
111- bset .add (newline )
112- for newline in a :
113- aset .add (newline )
108+ aset = frozenset (a )
109+ bset = frozenset (b )
114110 a2 = []
115111 b2 = []
116112 j = 0
@@ -204,7 +200,8 @@ def build_matching_blocks(self, lastsnake, snakes):
204200
205201 def initialise (self ):
206202 """
207- Optimized implementaion of the O(NP) algorithm described by Sun Wu, Udi Manber, Gene Myers, Webb Miller
203+ Optimized implementation of the O(NP) algorithm described by Sun Wu,
204+ Udi Manber, Gene Myers, Webb Miller
208205 ("An O(NP) Sequence Comparison Algorithm", 1989)
209206 http://research.janelia.org/myers/Papers/np_diff.pdf
210207 """
@@ -214,10 +211,9 @@ def initialise(self):
214211 n = len (b )
215212 middle = m + 1
216213 lastsnake = None
217- delta = n - m
218- dmin = min (0 , delta )
219- dmax = max (0 , delta )
220-
214+ delta = n - m + middle
215+ dmin = min (middle , delta )
216+ dmax = max (middle , delta )
221217 snakes = []
222218 if n > 0 and m > 0 :
223219 size = n + m + 2
@@ -230,57 +226,53 @@ def initialise(self):
230226 # move along vertical edge
231227 yv = - 1
232228 node = None
233- for k in range (dmin - p , delta , 1 ):
234- km = k + middle
235- if yv < fp [ km + 1 ] [0 ]:
236- yv , node = fp [ km + 1 ]
229+ for km in range (dmin - p , delta , 1 ):
230+ t = fp [ km + 1 ]
231+ if yv < t [0 ]:
232+ yv , node = t
237233 else :
238234 yv += 1
239- x = yv - k
240- snake = 0
235+ snake = x = yv - km + middle
241236 while x < m and yv < n and a [x ] == b [yv ]:
242237 x += 1
243238 yv += 1
244- snake += 1
245- if snake :
239+ if x != snake :
240+ snake = x - snake
246241 snakes .append ((node , x - snake , yv - snake , snake ))
247242 node = len (snakes ) - 1
248243 fp [km ] = (yv , node )
249244 # move along horizontal edge
250245 yh = - 1
251246 node = None
252- for k in range (dmax + p , delta , - 1 ):
253- km = k + middle
254- if fp [ km - 1 ] [0 ] >= yh :
255- yh , node = fp [ km - 1 ]
247+ for km in range (dmax + p , delta , - 1 ):
248+ t = fp [ km - 1 ]
249+ if yh <= t [0 ]:
250+ yh , node = t
256251 yh += 1
257- x = yh - k
258- snake = 0
252+ snake = x = yh - km + middle
259253 while x < m and yh < n and a [x ] == b [yh ]:
260254 x += 1
261255 yh += 1
262- snake += 1
263- if snake :
256+ if x != snake :
257+ snake = x - snake
264258 snakes .append ((node , x - snake , yh - snake , snake ))
265259 node = len (snakes ) - 1
266260 fp [km ] = (yh , node )
267261 # point on the diagonal that leads to the sink
268- km = delta + middle
269262 if yv < yh :
270- y , node = fp [km + 1 ]
263+ y , node = fp [delta + 1 ]
271264 else :
272- y , node = fp [km - 1 ]
265+ y , node = fp [delta - 1 ]
273266 y += 1
274- x = y - delta
275- snake = 0
267+ snake = x = y - delta + middle
276268 while x < m and y < n and a [x ] == b [y ]:
277269 x += 1
278270 y += 1
279- snake += 1
280- if snake :
271+ if x != snake :
272+ snake = x - snake
281273 snakes .append ((node , x - snake , y - snake , snake ))
282274 node = len (snakes ) - 1
283- fp [km ] = (y , node )
275+ fp [delta ] = (y , node )
284276 if y >= n :
285277 lastsnake = node
286278 break
0 commit comments