浏览代码

fix `ui.codemirror` on JS UTF-16-based indexing (memoize JS length per char + bisect) (#4678)

This PR is in response to #4676 being merged, implementing the approach
at
https://github.com/zauberzeug/nicegui/pull/4578#issuecomment-2816803235
to fix #4575

Recap of the technique: 

> Maintain a list `self._cumulative_js_length`, same length as the
Python string, which stores the number of UTF-16 code points per
character in the Python string.
> 
> Do a binary search to find the starting index to slice the Python
string into. Do another binary search to find the ending index to slice
the Python string into as well.
> 
> Don't forget to recompute `self._cumulative_js_length` on change of
`self.value` if it has been mutated by others (obvious by the fact that
the new value is different to `self._cumulative_corresponds_to_string`

Changes in the code:
* `import bisect` (duh)
* Do not attach the `on_change` via `super().__init__(value=value,
on_value_change=on_change)`. We want our internal handler
`self._update_cumulative` to always go first. Attach the `on_change` via
`super().on_value_change(on_change)`
* Move `_apply_change_set` inside the class (since it is no longer in a
vacuum but relies on several instance variables)
* Helper function `get_cumulative_js_length` is introduced, returning
`[1, 2, 4, 5, 6]` if passed "aa🔄bb"
* Inner function `find_python_index_given_js_index`: consider "aa🔄bb",
say we want to find where to slice, if JS-side calls for `doc[:4]` in
its minds. `bisect.bisect_right([1,2,4,5,6],4)` yields 3, meaning we
should in-return do `doc[:3]` to mirror JS-side intention.
* Helper function `get_total_length_from_cumulative` is introduced.
Would return `0` if input `[]`, `5` if input `[1, 2, 3, 4, 5]`

---

Remaining tasks:

- [x] avoid counting 1s twice?
- [x] update docstrings
- [x] ~~use separate event for emitting change sets, avoiding the need
for selectively blocking `_update_codepoints`~~ it was easier to stick
with the "update:model-value" event but to use
`_send_update_on_value_change` for an early exit
- [x] check variable naming

---------

Co-authored-by: Falko Schindler <falko@zauberzeug.com>
Evan Chan 1 周之前
父节点
当前提交
087a1ed50e
共有 2 个文件被更改,包括 70 次插入24 次删除
  1. 40 14
      nicegui/elements/codemirror.py
  2. 30 10
      tests/test_codemirror.py

+ 40 - 14
nicegui/elements/codemirror.py

@@ -1,4 +1,4 @@
-from itertools import accumulate, zip_longest
+from itertools import accumulate, chain, repeat
 from pathlib import Path
 from pathlib import Path
 from typing import List, Literal, Optional, get_args
 from typing import List, Literal, Optional, get_args
 
 
@@ -281,7 +281,11 @@ class CodeMirror(ValueElement, DisableableElement, component='codemirror.js', de
         :param line_wrapping: whether to wrap lines (default: `False`)
         :param line_wrapping: whether to wrap lines (default: `False`)
         :param highlight_whitespace: whether to highlight whitespace (default: `False`)
         :param highlight_whitespace: whether to highlight whitespace (default: `False`)
         """
         """
-        super().__init__(value=value, on_value_change=on_change)
+        super().__init__(value=value, on_value_change=self._update_codepoints)
+        self._codepoints = b''
+        self._update_codepoints()
+        if on_change is not None:
+            super().on_value_change(on_change)
         self.add_resource(Path(__file__).parent / 'lib' / 'codemirror')
         self.add_resource(Path(__file__).parent / 'lib' / 'codemirror')
 
 
         self._props['language'] = language
         self._props['language'] = language
@@ -333,17 +337,39 @@ class CodeMirror(ValueElement, DisableableElement, component='codemirror.js', de
 
 
     def _event_args_to_value(self, e: GenericEventArguments) -> str:
     def _event_args_to_value(self, e: GenericEventArguments) -> str:
         """The event contains a change set which is applied to the current value."""
         """The event contains a change set which is applied to the current value."""
-        return _apply_change_set(self.value, e.args['sections'], e.args['inserted'])
+        return self._apply_change_set(e.args['sections'], e.args['inserted'])
 
 
+    @staticmethod
+    def _encode_codepoints(doc: str) -> bytes:
+        return b''.join(b'\0\1' if ord(c) > 0xFFFF else b'\1' for c in doc)
 
 
-def _apply_change_set(doc, sections: List[int], inserted: List[List[str]]) -> str:
-    # based on https://github.com/codemirror/state/blob/main/src/change.ts
-    old_lengths = sections[::2]
-    new_lengths = sections[1::2]
-    end_positions = accumulate(old_lengths)
-    joined_inserts = ('\n'.join(ins) for ins in inserted)
-    assert sum(old_lengths) == len(doc), 'Cannot apply change set to document due to length mismatch'
-    return ''.join(
-        doc[pos-old_len:pos] if new_len == -1 else ins  # type: ignore
-        for pos, old_len, new_len, ins in zip_longest(end_positions, old_lengths, new_lengths, joined_inserts, fillvalue='')
-    )
+    def _update_codepoints(self) -> None:
+        """Update `self._codepoints` as a concatenation of "1" for code points <=0xFFFF and "01" for code points >0xFFFF.
+
+        This captures how many Unicode code points are encoded by each UTF-16 code unit.
+        This is used to convert JavaScript string indices to Python by summing `self._codepoints` up to the JavaScript index.
+        """
+        if not self._send_update_on_value_change:
+            return  # the update is triggered by the user and codepoints are updated incrementally
+        self._codepoints = self._encode_codepoints(self.value or '')
+
+    def _apply_change_set(self, sections: List[int], inserted: List[List[str]]) -> str:
+        document = self.value or ''
+        old_lengths = sections[::2]
+        new_lengths = sections[1::2]
+        end_positions = accumulate(old_lengths)
+        document_parts: List[str] = []
+        codepoint_parts: List[bytes] = []
+        for end, old_len, new_len, insert in zip(end_positions, old_lengths, new_lengths, chain(inserted, repeat([]))):
+            if new_len == -1:
+                start = end - old_len
+                py_start = self._codepoints[:start].count(1)
+                py_end = py_start + self._codepoints[start:end].count(1)
+                document_parts.append(document[py_start:py_end])
+                codepoint_parts.append(self._codepoints[start:end])
+            else:
+                joined_insert = '\n'.join(insert)
+                document_parts.append(joined_insert)
+                codepoint_parts.append(self._encode_codepoints(joined_insert))
+        self._codepoints = b''.join(codepoint_parts)
+        return ''.join(document_parts)

+ 30 - 10
tests/test_codemirror.py

@@ -1,9 +1,12 @@
 from typing import Dict, List
 from typing import Dict, List
 
 
+import pytest
+
 from nicegui import ui
 from nicegui import ui
-from nicegui.elements.codemirror import _apply_change_set
 from nicegui.testing import Screen
 from nicegui.testing import Screen
 
 
+# pylint: disable=protected-access
+
 
 
 def test_codemirror(screen: Screen):
 def test_codemirror(screen: Screen):
     ui.codemirror('Line 1\nLine 2\nLine 3')
     ui.codemirror('Line 1\nLine 2\nLine 3')
@@ -34,12 +37,29 @@ def test_supported_values(screen: Screen):
     assert values['themes'] == values['supported_themes']
     assert values['themes'] == values['supported_themes']
 
 
 
 
-def test_change_set():
-    assert _apply_change_set('', [0, 1], [['A']]) == 'A'
-    assert _apply_change_set('', [0, 2], [['AB']]) == 'AB'
-    assert _apply_change_set('X', [1, 2], [['AB']]) == 'AB'
-    assert _apply_change_set('X', [1, -1], []) == 'X'
-    assert _apply_change_set('X', [1, -1, 0, 1], [[], ['Y']]) == 'XY'
-    assert _apply_change_set('Hello', [5, -1, 0, 8], [[], [', world!']]) == 'Hello, world!'
-    assert _apply_change_set('Hello, world!', [5, -1, 7, 0, 1, -1], []) == 'Hello!'
-    assert _apply_change_set('Hello, hello!', [2, -1, 3, 1, 4, -1, 3, 1, 1, -1], [[], ['y'], [], ['y']]) == 'Hey, hey!'
+@pytest.mark.parametrize('doc, sections, inserted, expected', [
+    ('', [0, 1], [['A']], 'A'),
+    ('', [0, 2], [['AB']], 'AB'),
+    ('X', [1, 2], [['AB']], 'AB'),
+    ('X', [1, -1], [], 'X'),
+    ('X', [1, -1, 0, 1], [[], ['Y']], 'XY'),
+    ('Hello', [5, -1, 0, 8], [[], [', world!']], 'Hello, world!'),
+    ('Hello, world!', [5, -1, 7, 0, 1, -1], [], 'Hello!'),
+    ('Hello, hello!', [2, -1, 3, 1, 4, -1, 3, 1, 1, -1], [[], ['y'], [], ['y']], 'Hey, hey!'),
+    ('Hello, world!', [5, -1, 1, 3, 7, -1], [[], [' 🙂']], 'Hello 🙂 world!'),
+    ('Hey! 🙂', [7, -1, 0, 4], [[], [' Ho!']], 'Hey! 🙂 Ho!'),
+    ('Ha 🙂\nha 🙂', [3, -1, 2, 0, 4, -1, 2, 0], [[], [''], [], ['']], 'Ha \nha '),
+])
+def test_change_set(screen: Screen, doc: str, sections: List[int], inserted: List[List[str]], expected: str):
+    editor = ui.codemirror(doc)
+
+    screen.open('/')
+    assert editor._apply_change_set(sections, inserted) == expected
+
+
+def test_encode_codepoints():
+    assert ui.codemirror._encode_codepoints('') == b''
+    assert ui.codemirror._encode_codepoints('Hello') == bytes([1, 1, 1, 1, 1])
+    assert ui.codemirror._encode_codepoints('🙂') == bytes([0, 1])
+    assert ui.codemirror._encode_codepoints('Hello 🙂') == bytes([1, 1, 1, 1, 1, 1, 0, 1])
+    assert ui.codemirror._encode_codepoints('😎😎😎') == bytes([0, 1, 0, 1, 0, 1])