Skip to content

Commit 9c44350

Browse files
committed
issue: #dirdiff._files_same.mmap - Use mmap for files with size greater than CHUNK_SIZE; commit: using mmap
split logic into separate functions
1 parent 74ee2a9 commit 9c44350

2 files changed

Lines changed: 72 additions & 38 deletions

File tree

meld/dirdiff.py

Lines changed: 71 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import sys
2525
from collections import namedtuple
2626
from decimal import Decimal
27+
from mmap import ACCESS_COPY, mmap
2728

2829
from gi.repository import Gdk
2930
from gi.repository import Gio
@@ -90,6 +91,58 @@ def remove_blank_lines(text):
9091
return b'\n'.join(filter(bool, text.splitlines()))
9192

9293

94+
def _files_contents(files, stats):
    """Load the contents of every open file and guess whether any is binary.

    Files whose stat-reported size is greater than ``CHUNK_SIZE`` are
    memory-mapped with ``ACCESS_COPY`` instead of being read into memory;
    all other files are read in full with ``read()``.

    :param files: re-iterable collection of open binary file objects
    :param stats: per-file stat records in the same order as ``files``;
        only their ``size`` attribute is used here
    :returns: a ``(contents, mmaps, is_bin)`` tuple. ``contents`` holds one
        ``bytes`` or ``mmap`` object per file, ``mmaps`` lists the maps that
        the caller must close, and ``is_bin`` is True when a NUL byte was
        found in the first ``CHUNK_SIZE`` bytes of any file.
    :raises: whatever ``mmap()`` or ``read()`` raise; maps created before
        the failure are closed before the exception propagates
    """
    mmaps = []
    is_bin = False
    contents = [b'' for _ in files]

    try:
        for index, (file_obj, stat_) in enumerate(zip(files, stats)):
            # use mmap for files with size > CHUNK_SIZE
            if stat_.size > CHUNK_SIZE:
                data = mmap(file_obj.fileno(), 0, access=ACCESS_COPY)
                mmaps.append(data)
            else:
                data = file_obj.read()
            contents[index] = data

            # Rough test to see whether files are binary.
            if b"\0" in data[:min(stat_.size, CHUNK_SIZE)]:
                is_bin = True
    except BaseException:
        # The caller never receives ``mmaps`` when we fail part-way, so it
        # cannot close them; release them here and re-raise unchanged.
        for mapped in mmaps:
            mapped.close()
        raise

    return contents, mmaps, is_bin
116+
117+
118+
def _contents_same(contents, file_size):
    """Compare the first entry of ``contents`` against all others by chunk.

    The byte range ``[0, file_size)`` is walked in steps of ``CHUNK_SIZE``
    and the matching slice of every entry is compared with the slice taken
    from ``contents[0]``.

    :param contents: indexable collection of sliceable byte buffers
    :param file_size: number of bytes to cover with the comparison
    :returns: ``Different`` as soon as one chunk does not match, otherwise
        ``None`` (no difference found in the compared range)
    """
    others = contents[1:]

    for start in range(0, file_size, CHUNK_SIZE):
        end = start + CHUNK_SIZE
        reference = contents[0][start:end]
        for other in others:
            if reference != other[start:end]:
                return Different

    return None
130+
131+
132+
def _normalize(contents, ignore_blank_lines, regexes=()):
    """Return lazily normalised copies of ``contents`` for comparison.

    Each item is converted to ``bytes``, has its newline differences
    discarded (and its blank lines removed when ``ignore_blank_lines`` is
    set), and then has every pattern in ``regexes`` stripped out, in order.

    :param contents: iterable of bytes-like objects (``bytes``, ``mmap``...)
    :param ignore_blank_lines: when true, normalise with
        ``remove_blank_lines``; otherwise only unify the line endings
    :param regexes: compiled bytes patterns whose matches are removed
    :returns: a generator yielding one normalised ``bytes`` per input item
    """
    # Materialise once so a one-shot iterable can be reused for every item.
    regexes = tuple(regexes)

    def normalize_one(raw):
        data = bytes(raw)
        # For probable text files, discard newline differences to match
        if ignore_blank_lines:
            data = remove_blank_lines(data)
        else:
            data = b"\n".join(data.splitlines())

        # Apply the filters eagerly for this item. Building one generator
        # expression per regex inside a loop would make every stage read the
        # loop variable late and so apply only the last regex.
        for regex in regexes:
            data = regex.sub(b'', data)
        return data

    return (normalize_one(c) for c in contents)
144+
145+
93146
def _files_same(files, regexes, comparison_args):
94147
"""Determine whether a list of files are the same.
95148
@@ -105,7 +158,6 @@ def _files_same(files, regexes, comparison_args):
105158
return Same
106159

107160
files = tuple(files)
108-
regexes = tuple(regexes)
109161
stats = tuple([StatItem._make(os.stat(f)) for f in files])
110162

111163
shallow_comparison = comparison_args['shallow-comparison']
@@ -115,6 +167,8 @@ def _files_same(files, regexes, comparison_args):
115167

116168
need_contents = ignore_blank_lines or apply_text_filters
117169

170+
regexes = tuple(regexes) if apply_text_filters else ()
171+
118172
# If all entries are directories, they are considered to be the same
119173
if all([stat.S_ISDIR(s.mode) for s in stats]):
120174
return Same
@@ -130,8 +184,9 @@ def _files_same(files, regexes, comparison_args):
130184
)
131185
return DodgySame if all_same_timestamp else Different
132186

187+
same_size = all_same([s.size for s in stats])
133188
# If there are no text filters, unequal sizes imply a difference
134-
if not need_contents and not all_same([s.size for s in stats]):
189+
if not need_contents and not same_size:
135190
return Different
136191

137192
# Check the cache before doing the expensive comparison
@@ -141,38 +196,31 @@ def _files_same(files, regexes, comparison_args):
141196
return cache.result
142197

143198
# Open files and compare bit-by-bit
144-
contents = [[] for f in files]
145199
result = None
146200

147201
try:
148-
handles = [open(f, "rb") for f in files]
202+
mmaps = []
203+
handles = [open(file_path, "rb") for file_path in files]
149204
try:
150-
data = [h.read(CHUNK_SIZE) for h in handles]
151-
152-
# Rough test to see whether files are binary. If files are guessed
153-
# to be binary, we don't examine contents for speed and space.
154-
if any(b"\0" in d for d in data):
155-
need_contents = False
156-
157-
while True:
158-
if all_same(data):
159-
if not data[0]:
160-
break
161-
else:
162-
result = Different
163-
if not need_contents:
164-
break
205+
contents, mmaps, is_bin = _files_contents(handles, stats)
165206

166-
if need_contents:
167-
for i in range(len(data)):
168-
contents[i].append(data[i])
207+
# compare files chunk-by-chunk
208+
if same_size:
209+
result = _contents_same(contents, stats[0].size)
210+
else:
211+
result = Different
169212

170-
data = [h.read(CHUNK_SIZE) for h in handles]
213+
# normalize and compare files again
214+
if result == Different and need_contents and not is_bin:
215+
contents = _normalize(contents, ignore_blank_lines, regexes)
216+
result = SameFiltered if all_same(contents) else Different
171217

172218
# Files are too large; we can't apply filters
173219
except (MemoryError, OverflowError):
174220
result = DodgySame if all_same(stats) else DodgyDifferent
175221
finally:
222+
for m in mmaps:
223+
m.close()
176224
for h in handles:
177225
h.close()
178226
except IOError:
@@ -182,20 +230,6 @@ def _files_same(files, regexes, comparison_args):
182230
if result is None:
183231
result = Same
184232

185-
if result == Different and need_contents:
186-
contents = (b"".join(c) for c in contents)
187-
# For probable text files, discard newline differences to match
188-
if ignore_blank_lines:
189-
contents = (remove_blank_lines(c) for c in contents)
190-
else:
191-
contents = (b"\n".join(c.splitlines()) for c in contents)
192-
193-
if apply_text_filters:
194-
for regex in regexes:
195-
contents = (regex.sub(b'', c) for c in contents)
196-
197-
result = SameFiltered if all_same(contents) else Different
198-
199233
_cache[cache_key] = CacheResult(stats, result)
200234
return result
201235

test/dirdiff/fixture.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from os import mkdir, path
22

3-
CHUNK_SIZE = 4096
3+
CHUNK_SIZE = 4096 * 10
44

55
diff_definition = {
66
'a': {

0 commit comments

Comments
 (0)