From 5a0a82faf9cf9dd109d76181ed00eea19472087c Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 17 Jun 2018 16:08:02 -0700 Subject: [PATCH 1/6] Streaming works locally, needs cleanup + _dl= option --- datasette/views/base.py | 28 ++++++++++++++++++++++++++++ datasette/views/table.py | 10 +++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/datasette/views/base.py b/datasette/views/base.py index 53ae08bd..7beff72e 100644 --- a/datasette/views/base.py +++ b/datasette/views/base.py @@ -149,7 +149,35 @@ class BaseView(RenderMixin): return await self.view_get(request, name, hash, **kwargs) + async def as_csv_stream(self, request, name, hash, **kwargs): + assert not request.args.get("_next") # TODO: real error + kwargs['_size'] = 'max' + + async def stream_fn(r): + first = True + next = None + writer = csv.writer(r) + while first or next: + if next: + kwargs['_next'] = next + data, extra_template_data, templates = await self.data( + request, name, hash, **kwargs + ) + if first: + writer.writerow(data["columns"]) + first = False + next = data["next"] + for row in data["rows"]: + writer.writerow(row) + + return response.stream( + stream_fn, + content_type="text/plain; charset=utf-8" + ) + async def as_csv(self, request, name, hash, **kwargs): + if request.args.get("_stream"): + return await self.as_csv_stream(request, name, hash, **kwargs) try: response_or_template_contexts = await self.data( request, name, hash, **kwargs diff --git a/datasette/views/table.py b/datasette/views/table.py index c57fd954..cb2c9ae5 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -220,7 +220,7 @@ class RowTableShared(BaseView): class TableView(RowTableShared): - async def data(self, request, name, hash, table, default_labels=False): + async def data(self, request, name, hash, table, default_labels=False, _next=None, _size=None): canned_query = self.ds.get_canned_query(name, table) if canned_query is not None: return await self.custom_sql( @@ -375,7 +375,7 @@ class TableView(RowTableShared): count_sql = "select count(*) {}".format(from_sql) - _next = special_args.get("_next") + _next = _next or special_args.get("_next") offset = "" if _next: if is_view: @@ -462,7 +462,7 @@ class TableView(RowTableShared): extra_args = {} # Handle ?_size=500 - page_size = request.raw_args.get("_size") + page_size = _size or request.raw_args.get("_size") if page_size: if page_size == "max": page_size = self.max_returned_rows @@ -512,6 +512,8 @@ class TableView(RowTableShared): facet_results = {} facets_timed_out = [] for column in facets: + if _next: + continue facet_sql = """ select {col} as value, count(*) as count {from_sql} {and_or_where} {col} is not null @@ -665,6 +667,8 @@ class TableView(RowTableShared): for facet_column in columns: if facet_column in facets: continue + if _next: + continue if not self.ds.config["suggest_facets"]: continue suggested_facet_sql = ''' From 619a9ddb338e2b11419477a6f16c6ef5bd57d32b Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 17 Jun 2018 19:31:09 -0700 Subject: [PATCH 2/6] table.csv?_stream=1 to download all rows - refs #266 This option causes Datasette to serve ALL rows in the table, by internally following the _next= pagination links and serving everything out as a stream. Also added new config option, allow_csv_stream, which can be used to disable this feature. 
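As a rough idea of how the new option can be consumed from Python (an illustrative sketch only, not part of the patch; the host, database and table names are placeholders):

    import csv
    import io
    import urllib.request

    # Stream every row of a table as CSV without holding it all in memory.
    # The URL assumes a locally running Datasette instance.
    url = "http://localhost:8001/fixtures/compound_three_primary_keys.csv?_stream=1"
    with urllib.request.urlopen(url) as response:
        # Wrap the byte stream so csv.reader can consume rows as they arrive.
        lines = io.TextIOWrapper(response, encoding="utf-8", newline="")
        for row in csv.reader(lines):
            print(row)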
--- datasette/app.py | 3 ++ datasette/views/base.py | 84 ++++++++++++++++++++--------------------- docs/config.rst | 12 ++++++ tests/test_api.py | 1 + tests/test_csv.py | 13 +++++++ 5 files changed, 69 insertions(+), 44 deletions(-) diff --git a/datasette/app.py b/datasette/app.py index 70f2a93f..92327353 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -94,6 +94,9 @@ CONFIG_OPTIONS = ( ConfigOption("cache_size_kb", 0, """ SQLite cache size in KB (0 == use SQLite default) """.strip()), + ConfigOption("allow_csv_stream", True, """ + Allow .csv?_stream=1 to download all rows (ignoring max_returned_rows) + """.strip()), ) DEFAULT_CONFIG = { option.name: option.default diff --git a/datasette/views/base.py b/datasette/views/base.py index 7beff72e..a572c330 100644 --- a/datasette/views/base.py +++ b/datasette/views/base.py @@ -149,42 +149,24 @@ class BaseView(RenderMixin): return await self.view_get(request, name, hash, **kwargs) - async def as_csv_stream(self, request, name, hash, **kwargs): - assert not request.args.get("_next") # TODO: real error - kwargs['_size'] = 'max' - - async def stream_fn(r): - first = True - next = None - writer = csv.writer(r) - while first or next: - if next: - kwargs['_next'] = next - data, extra_template_data, templates = await self.data( - request, name, hash, **kwargs - ) - if first: - writer.writerow(data["columns"]) - first = False - next = data["next"] - for row in data["rows"]: - writer.writerow(row) - - return response.stream( - stream_fn, - content_type="text/plain; charset=utf-8" - ) - async def as_csv(self, request, name, hash, **kwargs): - if request.args.get("_stream"): - return await self.as_csv_stream(request, name, hash, **kwargs) + stream = request.args.get("_stream") + if stream: + # Some quick sanity checks + if not self.ds.config["allow_csv_stream"]: + raise DatasetteError("CSV streaming is disabled", status=400) + if request.args.get("_next"): + raise DatasetteError( + "_next not allowed for CSV streaming", status=400 + ) + kwargs["_size"] = "max" + # Fetch the first page try: response_or_template_contexts = await self.data( request, name, hash, **kwargs ) if isinstance(response_or_template_contexts, response.HTTPResponse): return response_or_template_contexts - else: data, extra_template_data, templates = response_or_template_contexts except (sqlite3.OperationalError, InvalidSql) as e: @@ -195,6 +177,7 @@ class BaseView(RenderMixin): except DatasetteError: raise + # Convert rows and columns to CSV headings = data["columns"] # if there are expanded_columns we need to add additional headings @@ -207,22 +190,35 @@ class BaseView(RenderMixin): headings.append("{}_label".format(column)) async def stream_fn(r): + nonlocal data writer = csv.writer(r) - writer.writerow(headings) - for row in data["rows"]: - if not expanded_columns: - # Simple path - writer.writerow(row) - else: - # Look for {"value": "label": } dicts and expand - new_row = [] - for cell in row: - if isinstance(cell, dict): - new_row.append(cell["value"]) - new_row.append(cell["label"]) - else: - new_row.append(cell) - writer.writerow(new_row) + first = True + next = None + while first or (next and stream): + if next: + kwargs["_next"] = next + if not first: + data, extra_template_data, templates = await self.data( + request, name, hash, **kwargs + ) + if first: + writer.writerow(headings) + first = False + next = data.get("next") + for row in data["rows"]: + if not expanded_columns: + # Simple path + writer.writerow(row) + else: + # Look for {"value": "label": } dicts and 
expand + new_row = [] + for cell in row: + if isinstance(cell, dict): + new_row.append(cell["value"]) + new_row.append(cell["label"]) + else: + new_row.append(cell) + writer.writerow(new_row) content_type = "text/plain; charset=utf-8" headers = {} diff --git a/docs/config.rst b/docs/config.rst index 8f0cd246..36a046fc 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -125,3 +125,15 @@ Sets the amount of memory SQLite uses for its `per-connection cache Date: Sun, 17 Jun 2018 20:01:30 -0700 Subject: [PATCH 3/6] New config option max_csv_mb limiting size of CSV export - refs #266 --- datasette/app.py | 3 ++ datasette/utils.py | 19 ++++++++++++ datasette/views/base.py | 61 +++++++++++++++++++++---------------- datasette/views/database.py | 4 +-- docs/config.rst | 9 ++++++ tests/fixtures.py | 7 +++++ tests/test_api.py | 1 + tests/test_csv.py | 14 ++++++++- 8 files changed, 89 insertions(+), 29 deletions(-) diff --git a/datasette/app.py b/datasette/app.py index 92327353..fb389d73 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -97,6 +97,9 @@ CONFIG_OPTIONS = ( ConfigOption("allow_csv_stream", True, """ Allow .csv?_stream=1 to download all rows (ignoring max_returned_rows) """.strip()), + ConfigOption("max_csv_mb", 100, """ + Maximum size allowed for CSV export in MB. Set 0 to disable this limit. + """.strip()), ) DEFAULT_CONFIG = { option.name: option.default diff --git a/datasette/utils.py b/datasette/utils.py index a179eddf..005db87f 100644 --- a/datasette/utils.py +++ b/datasette/utils.py @@ -832,3 +832,22 @@ def value_as_boolean(value): class ValueAsBooleanError(ValueError): pass + + +class WriteLimitExceeded(Exception): + pass + + +class LimitedWriter: + def __init__(self, writer, limit_mb): + self.writer = writer + self.limit_bytes = limit_mb * 1024 * 1024 + self.bytes_count = 0 + + def write(self, bytes): + self.bytes_count += len(bytes) + if self.limit_bytes and (self.bytes_count > self.limit_bytes): + raise WriteLimitExceeded("CSV contains more than {} bytes".format( + self.limit_bytes + )) + self.writer.write(bytes) diff --git a/datasette/views/base.py b/datasette/views/base.py index a572c330..0ca52e61 100644 --- a/datasette/views/base.py +++ b/datasette/views/base.py @@ -16,6 +16,7 @@ from datasette.utils import ( CustomJSONEncoder, InterruptedError, InvalidSql, + LimitedWriter, path_from_row_pks, path_with_added_args, path_with_format, @@ -191,34 +192,39 @@ class BaseView(RenderMixin): async def stream_fn(r): nonlocal data - writer = csv.writer(r) + writer = csv.writer(LimitedWriter(r, self.ds.config["max_csv_mb"])) first = True next = None while first or (next and stream): - if next: - kwargs["_next"] = next - if not first: - data, extra_template_data, templates = await self.data( - request, name, hash, **kwargs - ) - if first: - writer.writerow(headings) - first = False - next = data.get("next") - for row in data["rows"]: - if not expanded_columns: - # Simple path - writer.writerow(row) - else: - # Look for {"value": "label": } dicts and expand - new_row = [] - for cell in row: - if isinstance(cell, dict): - new_row.append(cell["value"]) - new_row.append(cell["label"]) - else: - new_row.append(cell) - writer.writerow(new_row) + try: + if next: + kwargs["_next"] = next + if not first: + data, extra_template_data, templates = await self.data( + request, name, hash, **kwargs + ) + if first: + writer.writerow(headings) + first = False + next = data.get("next") + for row in data["rows"]: + if not expanded_columns: + # Simple path + writer.writerow(row) + else: + # Look for 
{"value": "label": } dicts and expand + new_row = [] + for cell in row: + if isinstance(cell, dict): + new_row.append(cell["value"]) + new_row.append(cell["label"]) + else: + new_row.append(cell) + writer.writerow(new_row) + except Exception as e: + print('caught this', e) + r.write(str(e)) + return content_type = "text/plain; charset=utf-8" headers = {} @@ -417,7 +423,8 @@ class BaseView(RenderMixin): return r async def custom_sql( - self, request, name, hash, sql, editable=True, canned_query=None + self, request, name, hash, sql, editable=True, canned_query=None, + _size=None ): params = request.raw_args if "sql" in params: @@ -439,6 +446,8 @@ class BaseView(RenderMixin): extra_args = {} if params.get("_timelimit"): extra_args["custom_time_limit"] = int(params["_timelimit"]) + if _size: + extra_args["page_size"] = _size results = await self.ds.execute( name, sql, params, truncate=True, **extra_args ) diff --git a/datasette/views/database.py b/datasette/views/database.py index 2f3f41d3..a7df485b 100644 --- a/datasette/views/database.py +++ b/datasette/views/database.py @@ -9,13 +9,13 @@ from .base import BaseView, DatasetteError class DatabaseView(BaseView): - async def data(self, request, name, hash, default_labels=False): + async def data(self, request, name, hash, default_labels=False, _size=None): if request.args.get("sql"): if not self.ds.config["allow_sql"]: raise DatasetteError("sql= is not allowed", status=400) sql = request.raw_args.pop("sql") validate_sql_select(sql) - return await self.custom_sql(request, name, hash, sql) + return await self.custom_sql(request, name, hash, sql, _size=_size) info = self.ds.inspect()[name] metadata = self.ds.metadata.get("databases", {}).get(name, {}) diff --git a/docs/config.rst b/docs/config.rst index 36a046fc..e0013bf0 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -137,3 +137,12 @@ can turn it off like this:: :: datasette mydatabase.db --config allow_csv_stream:off + + +max_csv_mb +---------- + +The maximum size of CSV that can be exported, in megabytes. Defaults to 100MB. 
+You can disable the limit entirely by setting this to 0:: + + datasette mydatabase.db --config max_csv_mb:0
diff --git a/tests/fixtures.py b/tests/fixtures.py index 92fd5073..ea0b4e35 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -71,6 +71,13 @@ def app_client_larger_cache_size(): }) +@pytest.fixture(scope='session') +def app_client_csv_max_mb_one(): + yield from app_client(config={ + 'max_csv_mb': 1, + }) + + def generate_compound_rows(num): for a, b, c in itertools.islice( itertools.product(string.ascii_lowercase, repeat=3), num
diff --git a/tests/test_api.py b/tests/test_api.py index 1e889963..2187deb5 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -902,6 +902,7 @@ def test_config_json(app_client): "num_sql_threads": 3, "cache_size_kb": 0, "allow_csv_stream": True, + "max_csv_mb": 100, } == response.json
diff --git a/tests/test_csv.py b/tests/test_csv.py index 65bc9c1d..fcbacc76 100644 --- a/tests/test_csv.py +++ b/tests/test_csv.py @@ -1,4 +1,4 @@ -from .fixtures import app_client # noqa +from .fixtures import app_client, app_client_csv_max_mb_one # noqa EXPECTED_TABLE_CSV = '''id,content 1,hello @@ -61,6 +61,18 @@ def test_table_csv_download(app_client): assert expected_disposition == response.headers['Content-Disposition'] +def test_max_csv_mb(app_client_csv_max_mb_one): + response = app_client_csv_max_mb_one.get( + "/fixtures.csv?sql=select+randomblob(10000)+" + "from+compound_three_primary_keys&_stream=1&_size=max" + ) + # It's a 200 because we started streaming before we knew the error + assert response.status == 200 + # Last line should be an error message + last_line = [line for line in response.body.split(b"\r\n") if line][-1] + assert last_line.startswith(b"CSV contains more than") + + def test_table_csv_stream(app_client): # Without _stream should return header + 100 rows: response = app_client.get(
From 9f9c737fc2618780dde46c071ad8b985ad6c567d Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 17 Jun 2018 20:05:38 -0700 Subject: [PATCH 4/6] Fixed sphinx warning --- docs/metadata.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/metadata.rst b/docs/metadata.rst index 029e2960..7ae561af 100644 --- a/docs/metadata.rst +++ b/docs/metadata.rst @@ -122,6 +122,7 @@ This will restrict sorting of ``example_table`` to just the ``height`` and You can also disable sorting entirely by setting ``"sortable_columns": []`` .. _label_columns: + Specifying the label column for a table ---------------------------------------
From b15f412e04ce9ff21983986e661fbe4396f97b43 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 17 Jun 2018 20:14:32 -0700 Subject: [PATCH 5/6] Only deploy latest on push to master (not pull request) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9e92eee3..d32df307 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ script: jobs: include: - stage: deploy latest.datasette.io + if: branch = master AND type = push script: - pip install .
- npm install -g now @@ -23,7 +24,6 @@ jobs: - now alias --token=$NOW_TOKEN - echo "{\"name\":\"datasette-latest-$ALIAS\",\"alias\":\"$ALIAS.datasette.io\"}" > now.json - now alias --token=$NOW_TOKEN - on: master - stage: release tagged version if: tag IS present python: 3.6 From 6204ebb7d507b08b81f13cc5b9c176f62b499634 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sun, 17 Jun 2018 23:03:22 -0700 Subject: [PATCH 6/6] Improved UI for CSV/JSON export, closes #266 --- datasette/static/app.css | 14 ++++++++++++++ datasette/templates/query.html | 2 +- datasette/templates/table.html | 23 ++++++++++++++++++++++- datasette/utils.py | 7 +++++++ datasette/views/base.py | 18 +++++++++--------- datasette/views/table.py | 2 ++ tests/test_html.py | 26 +++++++++++++++++++++++--- 7 files changed, 78 insertions(+), 14 deletions(-) diff --git a/datasette/static/app.css b/datasette/static/app.css index 9e95505f..2fc16940 100644 --- a/datasette/static/app.css +++ b/datasette/static/app.css @@ -118,6 +118,13 @@ form label { display: inline-block; width: 15%; } +.advanced-export form label { + width: auto; +} +.advanced-export input[type=submit] { + font-size: 0.6em; + margin-left: 1em; +} label.sort_by_desc { width: auto; padding-right: 1em; @@ -272,3 +279,10 @@ a.not-underlined { .facet-info a.cross:active { text-decoration: none; } +.advanced-export { + margin-top: 1em; + padding: 0.01em 2em 0.01em 1em; + width: auto; + display: inline-block; + box-shadow: 1px 2px 8px 2px rgba(0,0,0,0.08); +} diff --git a/datasette/templates/query.html b/datasette/templates/query.html index e04df160..8e2f9036 100644 --- a/datasette/templates/query.html +++ b/datasette/templates/query.html @@ -40,7 +40,7 @@ {% if rows %} - + diff --git a/datasette/templates/table.html b/datasette/templates/table.html index eda37bc7..bb2522d6 100644 --- a/datasette/templates/table.html +++ b/datasette/templates/table.html @@ -92,7 +92,7 @@

View and edit SQL

{% endif %} - + {% if suggested_facets %}

@@ -137,6 +137,27 @@

Next page

{% endif %} +{% if display_rows %} +
+

Advanced export

+

JSON shape: default, array{% if primary_keys %}, object{% endif %}

+
+

+ CSV options: + + {% if expandable_columns %}{% endif %} + {% if next_url %}{% endif %} + + {% for key, value in url_csv_args.items() %} + {% if key != "_labels" %} + + {% endif %} + {% endfor %} +

+ +
+{% endif %} + {% if table_definition %}
{{ table_definition }}
{% endif %} diff --git a/datasette/utils.py b/datasette/utils.py index 005db87f..6253fb7a 100644 --- a/datasette/utils.py +++ b/datasette/utils.py @@ -170,6 +170,13 @@ def validate_sql_select(sql): raise InvalidSql(msg) +def append_querystring(url, querystring): + op = "&" if ("?" in url) else "?" + return "{}{}{}".format( + url, op, querystring + ) + + def path_with_added_args(request, args, path=None): path = path or request.path if isinstance(args, dict): diff --git a/datasette/views/base.py b/datasette/views/base.py index 0ca52e61..c3da3ab7 100644 --- a/datasette/views/base.py +++ b/datasette/views/base.py @@ -382,6 +382,12 @@ class BaseView(RenderMixin): url_labels_extra = {} if data.get("expandable_columns"): url_labels_extra = {"_labels": "on"} + url_csv_args = { + "_size": "max", + **url_labels_extra + } + url_csv = path_with_format(request, "csv", url_csv_args) + url_csv_path = url_csv.split('?')[0] context = { **data, **extras, @@ -389,15 +395,9 @@ class BaseView(RenderMixin): "url_json": path_with_format(request, "json", { **url_labels_extra, }), - "url_csv": path_with_format(request, "csv", { - "_size": "max", - **url_labels_extra - }), - "url_csv_dl": path_with_format(request, "csv", { - "_dl": "1", - "_size": "max", - **url_labels_extra - }), + "url_csv": url_csv, + "url_csv_path": url_csv_path, + "url_csv_args": url_csv_args, "extra_css_urls": self.ds.extra_css_urls(), "extra_js_urls": self.ds.extra_js_urls(), "datasette_version": __version__, diff --git a/datasette/views/table.py b/datasette/views/table.py index cb2c9ae5..89dec455 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -10,6 +10,7 @@ from datasette.utils import ( CustomRow, Filters, InterruptedError, + append_querystring, compound_keys_after_sql, escape_sqlite, filters_should_redirect, @@ -748,6 +749,7 @@ class TableView(RowTableShared): "is_sortable": any(c["sortable"] for c in display_columns), "path_with_replaced_args": path_with_replaced_args, "path_with_removed_args": path_with_removed_args, + "append_querystring": append_querystring, "request": request, "sort": sort, "sort_desc": sort_desc, diff --git a/tests/test_html.py b/tests/test_html.py index 98e54998..11c5fd43 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -274,9 +274,10 @@ def test_table_html_simple_primary_key(app_client): ] == [[str(td) for td in tr.select('td')] for tr in table.select('tbody tr')] -def test_table_csv_json_export_links(app_client): +def test_table_csv_json_export_interface(app_client): response = app_client.get('/fixtures/simple_primary_key') assert response.status == 200 + # The links at the top of the page links = Soup(response.body, "html.parser").find("p", { "class": "export-links" }).findAll("a") @@ -284,9 +285,28 @@ def test_table_csv_json_export_links(app_client): expected = [ "simple_primary_key.json", "simple_primary_key.csv?_size=max", - "simple_primary_key.csv?_dl=1&_size=max" + "#export" ] assert expected == actual + # And the advaced export box at the bottom: + div = Soup(response.body, "html.parser").find("div", { + "class": "advanced-export" + }) + json_links = [a["href"].split("/")[-1] for a in div.find("p").findAll("a")] + assert [ + "simple_primary_key.json", + "simple_primary_key.json?_shape=array", + "simple_primary_key.json?_shape=object" + ] == json_links + # And the CSV form + form = div.find("form") + assert form["action"].endswith("/simple_primary_key.csv") + inputs = [str(input) for input in form.findAll("input")] + assert [ + '', + '', + '' + ] == inputs def 
test_csv_json_export_links_include_labels_if_foreign_keys(app_client): @@ -299,7 +319,7 @@ def test_csv_json_export_links_include_labels_if_foreign_keys(app_client): expected = [ "facetable.json?_labels=on", "facetable.csv?_labels=on&_size=max", - "facetable.csv?_dl=1&_labels=on&_size=max" + "#export" ] assert expected == actual
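A closing note on the max_csv_mb guard introduced in PATCH 3/6: the sketch below (illustrative only, assuming the patches above are applied; not part of the series) shows how LimitedWriter trips once the configured limit is exceeded, which is why test_max_csv_mb expects the error text as the last line of the streamed body.

    import io

    from datasette.utils import LimitedWriter, WriteLimitExceeded

    buffer = io.StringIO()
    # 1 MB cap, mirroring the app_client_csv_max_mb_one test fixture
    limited = LimitedWriter(buffer, 1)

    try:
        while True:
            limited.write("x" * 1024)  # keep writing until the cap trips
    except WriteLimitExceeded as exc:
        # This message is what ends up as the final line of an over-sized
        # streamed CSV response.
        print(exc)  # CSV contains more than 1048576 bytes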