diff --git a/datasette/utils/__init__.py b/datasette/utils/__init__.py
index f7b341ca..19c81d3b 100644
--- a/datasette/utils/__init__.py
+++ b/datasette/utils/__init__.py
@@ -1141,13 +1141,42 @@ def add_cors_headers(headers):
     headers["Access-Control-Expose-Headers"] = "Link"
 
 
+# Bytes that dash_encode() passes through unchanged. This is the same set that
+# Python's percent-encoding always leaves alone, minus '.', '-' and '~'.
+_DASH_ENCODING_SAFE = frozenset(
+    b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    b"abcdefghijklmnopqrstuvwxyz"
+    b"0123456789_"
+)
+
+
+class DashEncoder(dict):
+    """Maps a byte value to its dash-encoded string, caching each result."""
+
+    def __missing__(self, b):
+        # Cache miss: a safe byte maps to itself, any other byte becomes "-"
+        # followed by two uppercase hex digits. Store the result and return it.
+        res = chr(b) if b in _DASH_ENCODING_SAFE else "-{:02X}".format(b)
+        self[b] = res
+        return res
+
+
+_dash_encoder = DashEncoder().__getitem__
+
+
 @documented
 def dash_encode(s: str) -> str:
-    "Returns dash-encoded string - for example ``/foo/bar`` -> ``-/foo-/bar``"
-    return s.replace("-", "--").replace(".", "-.").replace("/", "-/")
+    "Returns dash-encoded string - for example ``/foo/bar`` -> ``-2Ffoo-2Fbar``"
+    # Work on UTF-8 bytes so a non-ASCII character becomes one -XX per byte
+    return "".join(_dash_encoder(char) for char in s.encode("utf-8"))
 
 
 @documented
 def dash_decode(s: str) -> str:
-    "Decodes a dash-encoded string, so ``-/foo-/bar`` -> ``/foo/bar``"
-    return s.replace("-/", "/").replace("-.", ".").replace("--", "-")
+    "Decodes a dash-encoded string, so ``-2Ffoo-2Fbar`` -> ``/foo/bar``"
+    # A literal "%" is never a dash escape, so it has to survive untouched.
+    # Decode the text between "%" characters separately so unquote() cannot
+    # decode a %2F style sequence that was already present in the input.
+    return "%".join(
+        urllib.parse.unquote(part.replace("-", "%")) for part in s.split("%")
+    )
diff --git a/docs/internals.rst b/docs/internals.rst
index 3bbf0a69..d035e1f1 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -883,13 +883,16 @@ Dash encoding
 
 Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URLs that reference them.
 
-Dash encoding applies the following rules, in order:
+Dash encoding uses the same algorithm as `URL percent-encoding <https://developer.mozilla.org/en-US/docs/Glossary/percent-encoding>`__, but with the ``-`` hyphen character used in place of ``%``.
 
-- All single ``-`` characters are replaced by ``--``
-- ``.`` characters are replaced by ``-.``
-- ``/`` characters are replaced by ``./``
+Any character other than the ASCII letters ``A``-``Z`` and ``a``-``z``, the digits ``0``-``9`` and the underscore ``_`` is replaced by a hyphen followed by the two-digit uppercase hexadecimal value of each of its UTF-8 bytes. For example:
 
-These rules are applied in reverse order to decode a dash encoded string.
+- ``/`` becomes ``-2F``
+- ``.`` becomes ``-2E``
+- ``%`` becomes ``-25``
+- ``-`` becomes ``-2D``
+- Space character becomes ``-20``
+- ``polls/2022.primary`` becomes ``polls-2F2022-2Eprimary``
 
 .. _internals_utils_dash_encode:
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e3386324..3d5dee38 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -652,9 +652,11 @@ async def test_derive_named_parameters(sql, expected):
     "original,expected",
     (
         ("abc", "abc"),
-        ("/foo/bar", "-/foo-/bar"),
-        ("/-/bar", "-/---/bar"),
-        ("-/db-/table---.csv-.csv", "---/db---/table-------.csv---.csv"),
+        ("/foo/bar", "-2Ffoo-2Fbar"),
+        ("/-/bar", "-2F-2D-2Fbar"),
+        ("-/db-/table.csv", "-2D-2Fdb-2D-2Ftable-2Ecsv"),
+        (r"%~-/", "-25-7E-2D-2F"),
+        ("-25-7E-2D-2F", "-2D25-2D7E-2D2D-2D2F"),
     ),
 )
 def test_dash_encoding(original, expected):