From d1cb73180b4b5a07538380db76298618a5fc46b6 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 5 Mar 2022 11:31:49 -0800 Subject: [PATCH 1/9] Dash encoding functions, tests and docs, refs #1439 --- datasette/utils/__init__.py | 12 ++++++++++++ docs/internals.rst | 23 +++++++++++++++++++++++ tests/test_utils.py | 16 ++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/datasette/utils/__init__.py b/datasette/utils/__init__.py index e17b4d7f..f7b341ca 100644 --- a/datasette/utils/__init__.py +++ b/datasette/utils/__init__.py @@ -1139,3 +1139,15 @@ def add_cors_headers(headers): headers["Access-Control-Allow-Origin"] = "*" headers["Access-Control-Allow-Headers"] = "Authorization" headers["Access-Control-Expose-Headers"] = "Link" + + +@documented +def dash_encode(s: str) -> str: + "Returns dash-encoded string - for example ``/foo/bar`` -> ``-/foo-/bar``" + return s.replace("-", "--").replace(".", "-.").replace("/", "-/") + + +@documented +def dash_decode(s: str) -> str: + "Decodes a dash-encoded string, so ``-/foo-/bar`` -> ``/foo/bar``" + return s.replace("-/", "/").replace("-.", ".").replace("--", "-") diff --git a/docs/internals.rst b/docs/internals.rst index 12ef5c54..52688e5e 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -876,6 +876,29 @@ Utility function for calling ``await`` on a return value if it is awaitable, oth .. autofunction:: datasette.utils.await_me_maybe +.. _internals_dash_encoding: + +Dash encoding +------------- + +Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URL that references them. + +Dash encoding applies the following rules, in order: + +- All single ``-`` characters are replaced by ``--`` +- ``.`` characters are replaced by ``-.`` +- ``/`` characters are replaced by ``./`` + +These rules are applied in reverse order to decode a dash encoded string. + +.. _internals_utils_dash_encode: + +.. autofunction:: datasette.utils.dash_encode + +.. _internals_utils_dash_decode: + +.. autofunction:: datasette.utils.dash_decode + .. _internals_tracer: datasette.tracer diff --git a/tests/test_utils.py b/tests/test_utils.py index e7d67045..e3386324 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -646,3 +646,19 @@ async def test_derive_named_parameters(sql, expected): db = ds.get_database("_memory") params = await utils.derive_named_parameters(db, sql) assert params == expected + + +@pytest.mark.parametrize( + "original,expected", + ( + ("abc", "abc"), + ("/foo/bar", "-/foo-/bar"), + ("/-/bar", "-/---/bar"), + ("-/db-/table---.csv-.csv", "---/db---/table-------.csv---.csv"), + ), +) +def test_dash_encoding(original, expected): + actual = utils.dash_encode(original) + assert actual == expected + # And test round-trip + assert original == utils.dash_decode(actual) From 727ebc8a837dcf8b79eee73a43766ba7267b25f6 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 5 Mar 2022 11:45:04 -0800 Subject: [PATCH 2/9] Fix for test failure caused by SQLite 3.37.0+, closes #1647 --- datasette/templates/_table.html | 2 +- tests/test_internals_database.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/datasette/templates/_table.html b/datasette/templates/_table.html index d91a1a57..5332f831 100644 --- a/datasette/templates/_table.html +++ b/datasette/templates/_table.html @@ -4,7 +4,7 @@ {% for column in display_columns %} - + {% if not column.sortable %} {{ column.name }} {% else %} diff --git a/tests/test_internals_database.py b/tests/test_internals_database.py index bcecb486..31538a24 100644 --- a/tests/test_internals_database.py +++ b/tests/test_internals_database.py @@ -279,7 +279,15 @@ async def test_table_columns(db, table, expected): @pytest.mark.asyncio async def test_table_column_details(db, table, expected): columns = await db.table_column_details(table) - assert columns == expected + # Convert "type" to lowercase before comparison + # https://github.com/simonw/datasette/issues/1647 + compare_columns = [ + Column( + c.cid, c.name, c.type.lower(), c.notnull, c.default_value, c.is_pk, c.hidden + ) + for c in columns + ] + assert compare_columns == expected @pytest.mark.asyncio From d0246ce5c8c5571c82efd439616f91f972cdccf9 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 5 Mar 2022 11:49:35 -0800 Subject: [PATCH 3/9] Docs typo --- docs/internals.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/internals.rst b/docs/internals.rst index 52688e5e..1d40274b 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -881,7 +881,7 @@ Utility function for calling ``await`` on a return value if it is awaitable, oth Dash encoding ------------- -Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URL that references them. +Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URL that reference them. Dash encoding applies the following rules, in order: From f568c76cfbea477ddcfdc362d0e904b82f4662d9 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 5 Mar 2022 11:49:55 -0800 Subject: [PATCH 4/9] Another docs typo --- docs/internals.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/internals.rst b/docs/internals.rst index 1d40274b..3bbf0a69 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -881,7 +881,7 @@ Utility function for calling ``await`` on a return value if it is awaitable, oth Dash encoding ------------- -Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URL that reference them. +Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URLs that reference them. Dash encoding applies the following rules, in order: From d2e3fe3facf0ed0abf8b00cd54463af90dd6904d Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 6 Mar 2022 10:53:25 -0800 Subject: [PATCH 5/9] dash encoding is now like percent encoding but with dashes Refs https://github.com/simonw/datasette/issues/1439#issuecomment-1059851259 --- datasette/utils/__init__.py | 29 +++++++++++++++++++++++++---- docs/internals.rst | 13 ++++++++----- tests/test_utils.py | 8 +++++--- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/datasette/utils/__init__.py b/datasette/utils/__init__.py index f7b341ca..19c81d3b 100644 --- a/datasette/utils/__init__.py +++ b/datasette/utils/__init__.py @@ -1141,13 +1141,34 @@ def add_cors_headers(headers): headers["Access-Control-Expose-Headers"] = "Link" +_DASH_ENCODING_SAFE = frozenset( + b"ABCDEFGHIJKLMNOPQRSTUVWXYZ" + b"abcdefghijklmnopqrstuvwxyz" + b"0123456789_" + # This is the same as Python percent-encoding but I removed + # '.' and '-' and '~' +) + + +class DashEncoder(dict): + # Keeps a cache internally, via __missing__ + def __missing__(self, b): + # Handle a cache miss, store encoded string in cache and return. + res = chr(b) if b in _DASH_ENCODING_SAFE else "-{:02X}".format(b) + self[b] = res + return res + + +_dash_encoder = DashEncoder().__getitem__ + + @documented def dash_encode(s: str) -> str: - "Returns dash-encoded string - for example ``/foo/bar`` -> ``-/foo-/bar``" - return s.replace("-", "--").replace(".", "-.").replace("/", "-/") + "Returns dash-encoded string - for example ``/foo/bar`` -> ``-2Ffoo-2Fbar``" + return "".join(_dash_encoder(char) for char in s.encode("utf-8")) @documented def dash_decode(s: str) -> str: - "Decodes a dash-encoded string, so ``-/foo-/bar`` -> ``/foo/bar``" - return s.replace("-/", "/").replace("-.", ".").replace("--", "-") + "Decodes a dash-encoded string, so ``-2Ffoo-2Fbar`` -> ``/foo/bar``" + return urllib.parse.unquote(s.replace("-", "%")) diff --git a/docs/internals.rst b/docs/internals.rst index 3bbf0a69..d035e1f1 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -883,13 +883,16 @@ Dash encoding Datasette uses a custom encoding scheme in some places, called **dash encoding**. This is primarily used for table names and row primary keys, to avoid any confusion between ``/`` characters in those values and the Datasette URLs that reference them. -Dash encoding applies the following rules, in order: +Dash encoding uses the same algorithm as `URL percent-encoding `__, but with the ``-`` hyphen character used in place of ``%``. -- All single ``-`` characters are replaced by ``--`` -- ``.`` characters are replaced by ``-.`` -- ``/`` characters are replaced by ``./`` +Any character other than ``ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz 0123456789_`` will be replaced by the numeric equivalent preceded by a hyphen. For example: -These rules are applied in reverse order to decode a dash encoded string. +- ``/`` becomes ``-2F`` +- ``.`` becomes ``-2E`` +- ``%`` becomes ``-25`` +- ``-`` becomes ``-2D`` +- Space character becomes ``-20`` +- ``polls/2022.primary`` becomes ``polls-2F2022-2Eprimary`` .. _internals_utils_dash_encode: diff --git a/tests/test_utils.py b/tests/test_utils.py index e3386324..3d5dee38 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -652,9 +652,11 @@ async def test_derive_named_parameters(sql, expected): "original,expected", ( ("abc", "abc"), - ("/foo/bar", "-/foo-/bar"), - ("/-/bar", "-/---/bar"), - ("-/db-/table---.csv-.csv", "---/db---/table-------.csv---.csv"), + ("/foo/bar", "-2Ffoo-2Fbar"), + ("/-/bar", "-2F-2D-2Fbar"), + ("-/db-/table.csv", "-2D-2Fdb-2D-2Ftable-2Ecsv"), + (r"%~-/", "-25-7E-2D-2F"), + ("-25-7E-2D-2F", "-2D25-2D7E-2D2D-2D2F"), ), ) def test_dash_encoding(original, expected): From 4976494160db9ef4b0baf9f2aa9798660ea64406 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 6 Mar 2022 14:07:03 -0800 Subject: [PATCH 6/9] Use dash-encoding for row PKs and ?_next=, refs #1439 --- datasette/utils/__init__.py | 8 ++++---- datasette/views/table.py | 3 ++- tests/fixtures.py | 1 + tests/test_api.py | 2 +- tests/test_html.py | 36 +++++++++++++++++++++++++++--------- tests/test_table_api.py | 7 +++++-- tests/test_table_html.py | 12 +++++++++--- tests/test_utils.py | 2 +- 8 files changed, 50 insertions(+), 21 deletions(-) diff --git a/datasette/utils/__init__.py b/datasette/utils/__init__.py index 19c81d3b..aef85fbb 100644 --- a/datasette/utils/__init__.py +++ b/datasette/utils/__init__.py @@ -111,12 +111,12 @@ async def await_me_maybe(value: typing.Any) -> typing.Any: def urlsafe_components(token): - """Splits token on commas and URL decodes each component""" - return [urllib.parse.unquote_plus(b) for b in token.split(",")] + """Splits token on commas and dash-decodes each component""" + return [dash_decode(b) for b in token.split(",")] def path_from_row_pks(row, pks, use_rowid, quote=True): - """Generate an optionally URL-quoted unique identifier + """Generate an optionally dash-quoted unique identifier for a row from its primary keys.""" if use_rowid: bits = [row["rowid"]] @@ -125,7 +125,7 @@ def path_from_row_pks(row, pks, use_rowid, quote=True): row[pk]["value"] if isinstance(row[pk], dict) else row[pk] for pk in pks ] if quote: - bits = [urllib.parse.quote_plus(str(bit)) for bit in bits] + bits = [dash_encode(str(bit)) for bit in bits] else: bits = [str(bit) for bit in bits] diff --git a/datasette/views/table.py b/datasette/views/table.py index be9e9c3b..8ccb7371 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -12,6 +12,7 @@ from datasette.utils import ( MultiParams, append_querystring, compound_keys_after_sql, + dash_encode, escape_sqlite, filters_should_redirect, is_url, @@ -765,7 +766,7 @@ class TableView(RowTableShared): if prefix is None: prefix = "$null" else: - prefix = urllib.parse.quote_plus(str(prefix)) + prefix = dash_encode(str(prefix)) next_value = f"{prefix},{next_value}" added_args = {"_next": next_value} if sort: diff --git a/tests/fixtures.py b/tests/fixtures.py index 26f0cf7b..11f09c41 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -406,6 +406,7 @@ CREATE TABLE compound_primary_key ( ); INSERT INTO compound_primary_key VALUES ('a', 'b', 'c'); +INSERT INTO compound_primary_key VALUES ('a/b', '.c-d', 'c'); CREATE TABLE compound_three_primary_keys ( pk1 varchar(30), diff --git a/tests/test_api.py b/tests/test_api.py index 57471af2..caedd1ac 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -143,7 +143,7 @@ def test_database_page(app_client): "name": "compound_primary_key", "columns": ["pk1", "pk2", "content"], "primary_keys": ["pk1", "pk2"], - "count": 1, + "count": 2, "hidden": False, "fts_table": None, "foreign_keys": {"incoming": [], "outgoing": []}, diff --git a/tests/test_html.py b/tests/test_html.py index d5f4250d..a7ad4248 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -345,20 +345,38 @@ def test_row_links_from_other_tables(app_client, path, expected_text, expected_l assert link == expected_link -def test_row_html_compound_primary_key(app_client): - response = app_client.get("/fixtures/compound_primary_key/a,b") +@pytest.mark.parametrize( + "path,expected", + ( + ( + "/fixtures/compound_primary_key/a,b", + [ + [ + 'a', + 'b', + 'c', + ] + ], + ), + ( + "/fixtures/compound_primary_key/a-2Fb,-2Ec-2Dd", + [ + [ + 'a/b', + '.c-d', + 'c', + ] + ], + ), + ), +) +def test_row_html_compound_primary_key(app_client, path, expected): + response = app_client.get(path) assert response.status == 200 table = Soup(response.body, "html.parser").find("table") assert ["pk1", "pk2", "content"] == [ th.string.strip() for th in table.select("thead th") ] - expected = [ - [ - 'a', - 'b', - 'c', - ] - ] assert expected == [ [str(td) for td in tr.select("td")] for tr in table.select("tbody tr") ] diff --git a/tests/test_table_api.py b/tests/test_table_api.py index 6a6daed5..cc38d392 100644 --- a/tests/test_table_api.py +++ b/tests/test_table_api.py @@ -136,7 +136,10 @@ def test_table_shape_object(app_client): def test_table_shape_object_compound_primary_key(app_client): response = app_client.get("/fixtures/compound_primary_key.json?_shape=object") - assert {"a,b": {"pk1": "a", "pk2": "b", "content": "c"}} == response.json + assert response.json == { + "a,b": {"pk1": "a", "pk2": "b", "content": "c"}, + "a-2Fb,-2Ec-2Dd": {"pk1": "a/b", "pk2": ".c-d", "content": "c"}, + } def test_table_with_slashes_in_name(app_client): @@ -308,7 +311,7 @@ def test_sortable(app_client, query_string, sort_key, human_description_en): path = response.json["next_url"] if path: path = path.replace("http://localhost", "") - assert 5 == page + assert page == 5 expected = list(generate_sortable_rows(201)) expected.sort(key=sort_key) assert [r["content"] for r in expected] == [r["content"] for r in fetched] diff --git a/tests/test_table_html.py b/tests/test_table_html.py index 021268c3..77d97d80 100644 --- a/tests/test_table_html.py +++ b/tests/test_table_html.py @@ -563,11 +563,17 @@ def test_table_html_compound_primary_key(app_client): 'a', 'b', 'c', - ] + ], + [ + 'a/b,.c-d', + 'a/b', + '.c-d', + 'c', + ], ] - assert expected == [ + assert [ [str(td) for td in tr.select("td")] for tr in table.select("tbody tr") - ] + ] == expected def test_table_html_foreign_key_links(app_client): diff --git a/tests/test_utils.py b/tests/test_utils.py index 3d5dee38..1c3ab495 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -93,7 +93,7 @@ def test_path_with_replaced_args(path, args, expected): "row,pks,expected_path", [ ({"A": "foo", "B": "bar"}, ["A", "B"], "foo,bar"), - ({"A": "f,o", "B": "bar"}, ["A", "B"], "f%2Co,bar"), + ({"A": "f,o", "B": "bar"}, ["A", "B"], "f-2Co,bar"), ({"A": 123}, ["A"], "123"), ( utils.CustomRow( From b760264ae6a1836be1dfea04d12364cd4f89db0e Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 6 Mar 2022 15:38:44 -0800 Subject: [PATCH 7/9] Use dash encoding for table names, refs #1439 --- datasette/url_builder.py | 6 +++--- datasette/views/base.py | 14 +++++++------- datasette/views/table.py | 6 +++--- tests/test_html.py | 8 ++++---- tests/test_internals_urls.py | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/datasette/url_builder.py b/datasette/url_builder.py index 2bcda869..3f54730b 100644 --- a/datasette/url_builder.py +++ b/datasette/url_builder.py @@ -1,4 +1,4 @@ -from .utils import path_with_format, HASH_LENGTH, PrefixedUrlString +from .utils import dash_encode, path_with_format, HASH_LENGTH, PrefixedUrlString import urllib @@ -38,13 +38,13 @@ class Urls: return path def table(self, database, table, format=None): - path = f"{self.database(database)}/{urllib.parse.quote_plus(table)}" + path = f"{self.database(database)}/{dash_encode(table)}" if format is not None: path = path_with_format(path=path, format=format) return PrefixedUrlString(path) def query(self, database, query, format=None): - path = f"{self.database(database)}/{urllib.parse.quote_plus(query)}" + path = f"{self.database(database)}/{dash_encode(query)}" if format is not None: path = path_with_format(path=path, format=format) return PrefixedUrlString(path) diff --git a/datasette/views/base.py b/datasette/views/base.py index c74d6141..6d9a5743 100644 --- a/datasette/views/base.py +++ b/datasette/views/base.py @@ -17,6 +17,8 @@ from datasette.utils import ( InvalidSql, LimitedWriter, call_with_supported_arguments, + dash_decode, + dash_encode, path_from_row_pks, path_with_added_args, path_with_removed_args, @@ -233,9 +235,7 @@ class DataView(BaseView): return await db.table_exists(t) table, _format = await resolve_table_and_format( - table_and_format=urllib.parse.unquote_plus( - kwargs["table_and_format"] - ), + table_and_format=dash_decode(kwargs["table_and_format"]), table_exists=async_table_exists, allowed_formats=self.ds.renderers.keys(), ) @@ -243,11 +243,11 @@ class DataView(BaseView): if _format: kwargs["as_format"] = f".{_format}" elif kwargs.get("table"): - kwargs["table"] = urllib.parse.unquote_plus(kwargs["table"]) + kwargs["table"] = dash_decode(kwargs["table"]) should_redirect = self.ds.urls.path(f"{name}-{expected}") if kwargs.get("table"): - should_redirect += "/" + urllib.parse.quote_plus(kwargs["table"]) + should_redirect += "/" + dash_encode(kwargs["table"]) if kwargs.get("pk_path"): should_redirect += "/" + kwargs["pk_path"] if kwargs.get("as_format"): @@ -467,7 +467,7 @@ class DataView(BaseView): return await db.table_exists(t) table, _ext_format = await resolve_table_and_format( - table_and_format=urllib.parse.unquote_plus(args["table_and_format"]), + table_and_format=dash_decode(args["table_and_format"]), table_exists=async_table_exists, allowed_formats=self.ds.renderers.keys(), ) @@ -475,7 +475,7 @@ class DataView(BaseView): args["table"] = table del args["table_and_format"] elif "table" in args: - args["table"] = urllib.parse.unquote_plus(args["table"]) + args["table"] = dash_decode(args["table"]) return _format, args async def view_get(self, request, database, hash, correct_hash_provided, **kwargs): diff --git a/datasette/views/table.py b/datasette/views/table.py index 8ccb7371..1d81755e 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -143,7 +143,7 @@ class RowTableShared(DataView): '{flat_pks}'.format( base_url=base_url, database=database, - table=urllib.parse.quote_plus(table), + table=dash_encode(table), flat_pks=str(markupsafe.escape(pk_path)), flat_pks_quoted=path_from_row_pks(row, pks, not pks), ) @@ -200,8 +200,8 @@ class RowTableShared(DataView): link_template.format( database=database, base_url=base_url, - table=urllib.parse.quote_plus(other_table), - link_id=urllib.parse.quote_plus(str(value)), + table=dash_encode(other_table), + link_id=dash_encode(str(value)), id=str(markupsafe.escape(value)), label=str(markupsafe.escape(label)) or "-", ) diff --git a/tests/test_html.py b/tests/test_html.py index a7ad4248..f6456aa5 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -140,7 +140,7 @@ def test_database_page(app_client): assert queries_ul is not None assert [ ( - "/fixtures/%F0%9D%90%9C%F0%9D%90%A2%F0%9D%90%AD%F0%9D%90%A2%F0%9D%90%9E%F0%9D%90%AC", + "/fixtures/-F0-9D-90-9C-F0-9D-90-A2-F0-9D-90-AD-F0-9D-90-A2-F0-9D-90-9E-F0-9D-90-AC", "𝐜𝐢𝐭𝐢𝐞𝐬", ), ("/fixtures/from_async_hook", "from_async_hook"), @@ -193,11 +193,11 @@ def test_row_redirects_with_url_hash(app_client_with_hash): def test_row_strange_table_name_with_url_hash(app_client_with_hash): - response = app_client_with_hash.get("/fixtures/table%2Fwith%2Fslashes.csv/3") + response = app_client_with_hash.get("/fixtures/table-2Fwith-2Fslashes-2Ecsv/3") assert response.status == 302 - assert response.headers["Location"].endswith("/table%2Fwith%2Fslashes.csv/3") + assert response.headers["Location"].endswith("/table-2Fwith-2Fslashes-2Ecsv/3") response = app_client_with_hash.get( - "/fixtures/table%2Fwith%2Fslashes.csv/3", follow_redirects=True + "/fixtures/table-2Fwith-2Fslashes-2Ecsv/3", follow_redirects=True ) assert response.status == 200 diff --git a/tests/test_internals_urls.py b/tests/test_internals_urls.py index e486e4c9..16515ad6 100644 --- a/tests/test_internals_urls.py +++ b/tests/test_internals_urls.py @@ -121,7 +121,7 @@ def test_database(ds, base_url, format, expected): ("/", "name", None, "/_memory/name"), ("/prefix/", "name", None, "/prefix/_memory/name"), ("/", "name", "json", "/_memory/name.json"), - ("/", "name.json", "json", "/_memory/name.json?_format=json"), + ("/", "name.json", "json", "/_memory/name-2Ejson.json"), ], ) def test_table_and_query(ds, base_url, name, format, expected): From 32548b88fdb413429d960966decba8f6b3889190 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 7 Mar 2022 07:31:37 -0800 Subject: [PATCH 8/9] Use dash encoding for database names, too, refs #1439 --- datasette/url_builder.py | 4 ++-- datasette/views/base.py | 10 +++++----- tests/test_api.py | 17 +++++++++++++++-- tests/test_cli.py | 5 +++-- tests/test_html.py | 6 +++--- 5 files changed, 28 insertions(+), 14 deletions(-) diff --git a/datasette/url_builder.py b/datasette/url_builder.py index 3f54730b..eebfe31e 100644 --- a/datasette/url_builder.py +++ b/datasette/url_builder.py @@ -31,10 +31,10 @@ class Urls: db = self.ds.databases[database] if self.ds.setting("hash_urls") and db.hash: path = self.path( - f"{urllib.parse.quote(database)}-{db.hash[:HASH_LENGTH]}", format=format + f"{dash_encode(database)}-{db.hash[:HASH_LENGTH]}", format=format ) else: - path = self.path(urllib.parse.quote(database), format=format) + path = self.path(dash_encode(database), format=format) return path def table(self, database, table, format=None): diff --git a/datasette/views/base.py b/datasette/views/base.py index 6d9a5743..7cd385b7 100644 --- a/datasette/views/base.py +++ b/datasette/views/base.py @@ -205,17 +205,17 @@ class DataView(BaseView): async def resolve_db_name(self, request, db_name, **kwargs): hash = None name = None - db_name = urllib.parse.unquote_plus(db_name) - if db_name not in self.ds.databases and "-" in db_name: + decoded_name = dash_decode(db_name) + if decoded_name not in self.ds.databases and "-" in db_name: # No matching DB found, maybe it's a name-hash? name_bit, hash_bit = db_name.rsplit("-", 1) - if name_bit not in self.ds.databases: + if dash_decode(name_bit) not in self.ds.databases: raise NotFound(f"Database not found: {name}") else: - name = name_bit + name = dash_decode(name_bit) hash = hash_bit else: - name = db_name + name = decoded_name try: db = self.ds.databases[name] diff --git a/tests/test_api.py b/tests/test_api.py index caedd1ac..dd916cf0 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -942,7 +942,7 @@ def test_cors(app_client_with_cors, path, status_code): ) def test_database_with_space_in_name(app_client_two_attached_databases, path): response = app_client_two_attached_databases.get( - "/extra database" + path, follow_redirects=True + "/extra-20database" + path, follow_redirects=True ) assert response.status == 200 @@ -953,7 +953,7 @@ def test_common_prefix_database_names(app_client_conflicting_database_names): d["name"] for d in app_client_conflicting_database_names.get("/-/databases.json").json ] - for db_name, path in (("foo", "/foo.json"), ("foo-bar", "/foo-bar.json")): + for db_name, path in (("foo", "/foo.json"), ("foo-bar", "/foo-2Dbar.json")): data = app_client_conflicting_database_names.get(path).json assert db_name == data["database"] @@ -992,3 +992,16 @@ async def test_hidden_sqlite_stat1_table(): data = (await ds.client.get("/db.json?_show_hidden=1")).json() tables = [(t["name"], t["hidden"]) for t in data["tables"]] assert tables == [("normal", False), ("sqlite_stat1", True)] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("db_name", ("foo", r"fo%o", "f~/c.d")) +async def test_dash_encoded_database_names(db_name): + ds = Datasette() + ds.add_memory_database(db_name) + response = await ds.client.get("/.json") + assert db_name in response.json().keys() + path = response.json()[db_name]["path"] + # And the JSON for that database + response2 = await ds.client.get(path + ".json") + assert response2.status_code == 200 diff --git a/tests/test_cli.py b/tests/test_cli.py index 3fbfdee2..e30c2ad3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -9,6 +9,7 @@ from datasette.app import SETTINGS from datasette.plugins import DEFAULT_PLUGINS from datasette.cli import cli, serve from datasette.version import __version__ +from datasette.utils import dash_encode from datasette.utils.sqlite import sqlite3 from click.testing import CliRunner import io @@ -294,12 +295,12 @@ def test_weird_database_names(ensure_eventloop, tmpdir, filename): assert result1.exit_code == 0, result1.output filename_no_stem = filename.rsplit(".", 1)[0] expected_link = '{}'.format( - urllib.parse.quote(filename_no_stem), filename_no_stem + dash_encode(filename_no_stem), filename_no_stem ) assert expected_link in result1.output # Now try hitting that database page result2 = runner.invoke( - cli, [db_path, "--get", "/{}".format(urllib.parse.quote(filename_no_stem))] + cli, [db_path, "--get", "/{}".format(dash_encode(filename_no_stem))] ) assert result2.exit_code == 0, result2.output diff --git a/tests/test_html.py b/tests/test_html.py index f6456aa5..b4a12b8a 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -29,7 +29,7 @@ def test_homepage(app_client_two_attached_databases): ) # Should be two attached databases assert [ - {"href": r"/extra%20database", "text": "extra database"}, + {"href": r"/extra-20database", "text": "extra database"}, {"href": "/fixtures", "text": "fixtures"}, ] == [{"href": a["href"], "text": a.text.strip()} for a in soup.select("h2 a")] # Database should show count text and attached tables @@ -44,8 +44,8 @@ def test_homepage(app_client_two_attached_databases): {"href": a["href"], "text": a.text.strip()} for a in links_p.findAll("a") ] assert [ - {"href": r"/extra%20database/searchable", "text": "searchable"}, - {"href": r"/extra%20database/searchable_view", "text": "searchable_view"}, + {"href": r"/extra-20database/searchable", "text": "searchable"}, + {"href": r"/extra-20database/searchable_view", "text": "searchable_view"}, ] == table_links From eeb58a09eee49c12eb6f524d10ab9ef7c3904cf4 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 7 Mar 2022 08:01:03 -0800 Subject: [PATCH 9/9] Redirect old % URLs to new - encoded URLs, closes #1650 Refs #1439 --- datasette/app.py | 7 +++++++ tests/test_html.py | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/datasette/app.py b/datasette/app.py index 8c5480cf..2907d90e 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -1211,6 +1211,13 @@ class DatasetteRouter: return await self.handle_404(request, send) async def handle_404(self, request, send, exception=None): + # If path contains % encoding, redirect to dash encoding + if "%" in request.path: + # Try the same path but with "%" replaced by "-" + # and "-" replaced with "-2D" + new_path = request.path.replace("-", "-2D").replace("%", "-") + await asgi_send_redirect(send, new_path) + return # If URL has a trailing slash, redirect to URL without it path = request.scope.get( "raw_path", request.scope["path"].encode("utf8") diff --git a/tests/test_html.py b/tests/test_html.py index b4a12b8a..3e24009e 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -952,3 +952,9 @@ def test_no_alternate_url_json(app_client, path): assert ( '