2424import sys
2525from collections import namedtuple
2626from decimal import Decimal
27+ from mmap import ACCESS_COPY , mmap
2728
2829from gi .repository import Gdk
2930from gi .repository import Gio
@@ -90,6 +91,58 @@ def remove_blank_lines(text):
9091 return b'\n ' .join (filter (bool , text .splitlines ()))
9192
9293
94+ def _files_contents (files , stats ):
95+ mmaps = []
96+ is_bin = False
97+ contents = [b'' for file_obj in files ]
98+
99+ for index , file_and_stat in enumerate (zip (files , stats )):
100+ file_obj , stat_ = file_and_stat
101+ # use mmap for files with size > CHUNK_SIZE
102+ data = b''
103+ if stat_ .size > CHUNK_SIZE :
104+ data = mmap (file_obj .fileno (), 0 , access = ACCESS_COPY )
105+ mmaps .append (data )
106+ else :
107+ data = file_obj .read ()
108+ contents [index ] = data
109+
110+ # Rough test to see whether files are binary.
111+ chunk_size = min ([stat_ .size , CHUNK_SIZE ])
112+ if b"\0 " in data [:chunk_size ]:
113+ is_bin = True
114+
115+ return contents , mmaps , is_bin
116+
117+
118+ def _contents_same (contents , file_size ):
119+ other_files_index = list (range (1 , len (contents )))
120+ chunk_range = zip (
121+ range (0 , file_size , CHUNK_SIZE ),
122+ range (CHUNK_SIZE , file_size + CHUNK_SIZE , CHUNK_SIZE )
123+ )
124+
125+ for start , end in chunk_range :
126+ chunk = contents [0 ][start :end ]
127+ for index in other_files_index :
128+ if not chunk == contents [index ][start :end ]:
129+ return Different
130+
131+
132+ def _normalize (contents , ignore_blank_lines , regexes = ()):
133+ contents = (bytes (c ) for c in contents )
134+ # For probable text files, discard newline differences to match
135+ if ignore_blank_lines :
136+ contents = (remove_blank_lines (c ) for c in contents )
137+ else :
138+ contents = (b"\n " .join (c .splitlines ()) for c in contents )
139+
140+ for regex in regexes :
141+ contents = (regex .sub (b'' , c ) for c in contents )
142+
143+ return contents
144+
145+
93146def _files_same (files , regexes , comparison_args ):
94147 """Determine whether a list of files are the same.
95148
@@ -105,7 +158,6 @@ def _files_same(files, regexes, comparison_args):
105158 return Same
106159
107160 files = tuple (files )
108- regexes = tuple (regexes )
109161 stats = tuple ([StatItem ._make (os .stat (f )) for f in files ])
110162
111163 shallow_comparison = comparison_args ['shallow-comparison' ]
@@ -115,6 +167,8 @@ def _files_same(files, regexes, comparison_args):
115167
116168 need_contents = ignore_blank_lines or apply_text_filters
117169
170+ regexes = tuple (regexes ) if apply_text_filters else ()
171+
118172 # If all entries are directories, they are considered to be the same
119173 if all ([stat .S_ISDIR (s .mode ) for s in stats ]):
120174 return Same
@@ -130,8 +184,9 @@ def _files_same(files, regexes, comparison_args):
130184 )
131185 return DodgySame if all_same_timestamp else Different
132186
187+ same_size = all_same ([s .size for s in stats ])
133188 # If there are no text filters, unequal sizes imply a difference
134- if not need_contents and not all_same ([ s . size for s in stats ]) :
189+ if not need_contents and not same_size :
135190 return Different
136191
137192 # Check the cache before doing the expensive comparison
@@ -141,38 +196,31 @@ def _files_same(files, regexes, comparison_args):
141196 return cache .result
142197
143198 # Open files and compare bit-by-bit
144- contents = [[] for f in files ]
145199 result = None
146200
147201 try :
148- handles = [open (f , "rb" ) for f in files ]
202+ mmaps = []
203+ handles = [open (file_path , "rb" ) for file_path in files ]
149204 try :
150- data = [h .read (CHUNK_SIZE ) for h in handles ]
151-
152- # Rough test to see whether files are binary. If files are guessed
153- # to be binary, we don't examine contents for speed and space.
154- if any (b"\0 " in d for d in data ):
155- need_contents = False
156-
157- while True :
158- if all_same (data ):
159- if not data [0 ]:
160- break
161- else :
162- result = Different
163- if not need_contents :
164- break
205+ contents , mmaps , is_bin = _files_contents (handles , stats )
165206
166- if need_contents :
167- for i in range (len (data )):
168- contents [i ].append (data [i ])
207+ # compare files chunk-by-chunk
208+ if same_size :
209+ result = _contents_same (contents , stats [0 ].size )
210+ else :
211+ result = Different
169212
170- data = [h .read (CHUNK_SIZE ) for h in handles ]
213+ # normalize and compare files again
214+ if result == Different and need_contents and not is_bin :
215+ contents = _normalize (contents , ignore_blank_lines , regexes )
216+ result = SameFiltered if all_same (contents ) else Different
171217
172218 # Files are too large; we can't apply filters
173219 except (MemoryError , OverflowError ):
174220 result = DodgySame if all_same (stats ) else DodgyDifferent
175221 finally :
222+ for m in mmaps :
223+ m .close ()
176224 for h in handles :
177225 h .close ()
178226 except IOError :
@@ -182,20 +230,6 @@ def _files_same(files, regexes, comparison_args):
182230 if result is None :
183231 result = Same
184232
185- if result == Different and need_contents :
186- contents = (b"" .join (c ) for c in contents )
187- # For probable text files, discard newline differences to match
188- if ignore_blank_lines :
189- contents = (remove_blank_lines (c ) for c in contents )
190- else :
191- contents = (b"\n " .join (c .splitlines ()) for c in contents )
192-
193- if apply_text_filters :
194- for regex in regexes :
195- contents = (regex .sub (b'' , c ) for c in contents )
196-
197- result = SameFiltered if all_same (contents ) else Different
198-
199233 _cache [cache_key ] = CacheResult (stats , result )
200234 return result
201235
0 commit comments