meldbuffer: Move BufferLines to just use GTK+ line logic (#240)

kaiw · kaiw · commit 1be76337a17b · 2019-01-11T07:54:15.000+10:00
Over the years we've had many, many issues with GTK+ and Python
disagreeing about what constitutes a line break. The actual bug that has
prompted this rewrite is just another one in the long series of wild
issues, but it's *so annoying* that rather than work through the insane
logic I wrote before, I've just changed the whole approach and made our
treat-my-textbuffer-as-a-list-of-lines shim use GTK+ APIs to do its line
calculation logic.

I was worried that this would be slow - it is after all a _lot_ more
function calls and we're crossing through GObject introspection a bunch,
but it... seems okay?

One of the minor weird side-effects here is that this accessor
absolutely enforces that regex filters only apply to a single line. I
considered options for maintaining our very-broken status quo, but they
were very, very difficult by comparison, and any multi-line filters
are completely unsupported.
diff --git a/meld/meldbuffer.py b/meld/meldbuffer.py
@@ -224,10 +224,6 @@ class BufferLines:
     This class allows a Gtk.TextBuffer to be treated as a list of lines of
     possibly-filtered text. If no filter is given, the raw output from the
     Gtk.TextBuffer is used.
-
-    The logic here (and in places in FileDiff) requires that Python's
-    unicode splitlines() implementation and Gtk.TextBuffer agree on where
-    linebreaks occur. Happily, this is usually the case.
     """
 
     def __init__(self, buf, textfilter=None):
@@ -240,48 +236,18 @@ def __init__(self, buf, textfilter=None):
     def __getitem__(self, key):
         if isinstance(key, slice):
             lo, hi, _ = key.indices(self.buf.get_line_count())
-
-            # FIXME: If we ask for arbitrary slices past the end of the buffer,
-            # this will return the last line.
-            start = self.buf.get_iter_at_line_or_eof(lo)
+            line_start = self.buf.get_iter_at_line_or_eof(lo)
             end = self.buf.get_iter_at_line_or_eof(hi)
-            txt = self.buf.get_text(start, end, False)
-
-            filter_txt = self.textfilter(txt, self.buf, start, end)
-            lines = filter_txt.splitlines()
-            ends = filter_txt.splitlines(True)
-
-            # The last line in a Gtk.TextBuffer is guaranteed never to end in a
-            # newline. As splitlines() discards an empty line at the end, we
-            # need to artificially add a line if the requested slice is past
-            # the end of the buffer, and the last line in the slice ended in a
-            # newline.
-            if hi >= self.buf.get_line_count() and \
-               lo < self.buf.get_line_count() and \
-               (len(lines) == 0 or len(lines[-1]) != len(ends[-1])):
-                lines.append("")
-                ends.append("")
-
-            hi = self.buf.get_line_count() if hi == sys.maxsize else hi
-            if hi - lo != len(lines):
-                # These codepoints are considered line breaks by Python, but
-                # not by GtkTextStore.
-                additional_breaks = set(('\x0c', '\x85', '\u2028'))
-                i = 0
-                while i < len(ends):
-                    line, end = lines[i], ends[i]
-                    # It's possible that the last line in a file would end in a
-                    # line break character, which requires no joining.
-                    if end and end[-1] in additional_breaks and \
-                       (not line or line[-1] not in additional_breaks):
-                        assert len(ends) >= i + 1
-                        lines[i:i + 2] = [line + end[-1] + lines[i + 1]]
-                        ends[i:i + 2] = [end + ends[i + 1]]
-                    else:
-                        # We only increment if we don't correct a line, to
-                        # handle the case of a single line having multiple
-                        # additional_breaks characters that need correcting.
-                        i += 1
+
+            lines = []
+            while line_start.compare(end) < 0:
+                line_end = line_start.copy()
+                if not line_end.ends_line():
+                    line_end.forward_to_line_end()
+                txt = self.buf.get_text(line_start, line_end, False)
+                filter_txt = self.textfilter(txt, self.buf, line_start, end)
+                lines.append(filter_txt)
+                line_start.forward_visible_line()
 
             return lines
 
diff --git a/test/test_buffer_lines.py b/test/test_buffer_lines.py
@@ -0,0 +1,49 @@
+
+from unittest import mock
+
+import pytest
+
+from meld.meldbuffer import BufferLines, MeldBuffer
+
+
+text = ("""0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+""")
+
+
+@pytest.mark.parametrize("line_start, line_end, expected_text", [
+    (0, 1, ["0"],),
+    (0, 2, ["0", "1"],),
+    # zero-sized slice
+    (9, 9, [],),
+    (9, 10, ["9"],),
+    (9, 11, ["9", "10"],),
+    # Past the end of the buffer
+    (9, 12, ["9", "10"],),
+    # Waaaay past the end of the buffer
+    (9, 9999, ["9", "10"],),
+    # And sidling towards past-the-end start indices
+    (10, 12, ["10"],),
+    (11, 12, [],),
+])
+def test_filter_text(line_start, line_end, expected_text):
+
+    import meld.meldbuffer
+
+    meld.meldbuffer.bind_settings = mock.MagicMock()
+    meld.meldbuffer.meldsettings = mock.MagicMock(style_scheme=None)
+
+    buf = MeldBuffer()
+    buf.set_text(text)
+
+    buffer_lines = BufferLines(buf)
+    assert buffer_lines[line_start:line_end] == expected_text