table.csv?_stream=1 to download all rows - refs #266

This option causes Datasette to serve ALL rows in the table by internally
following the _next= pagination links and serving everything out as a stream.

Also added a new config option, allow_csv_stream, which can be used to disable
this feature.
Simon Willison 2018-06-17 19:31:09 -07:00
commit 619a9ddb33
5 changed files with 69 additions and 44 deletions
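
For context, a minimal sketch of consuming the new endpoint from Python,
assuming a local Datasette instance serving the fixtures database on port 8001
(the URL, port and requests library are illustrative, not part of this commit):

    import csv

    import requests

    # Hypothetical local Datasette instance; adjust the URL to taste.
    url = "http://localhost:8001/fixtures/compound_three_primary_keys.csv"

    # _stream=1 asks Datasette to follow its own _next= pagination
    # internally and emit every row as one continuous CSV response.
    with requests.get(url, params={"_stream": 1}, stream=True) as response:
        response.raise_for_status()
        lines = (line.decode("utf-8") for line in response.iter_lines())
        for row in csv.reader(lines):
            print(row)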


@@ -94,6 +94,9 @@ CONFIG_OPTIONS = (
     ConfigOption("cache_size_kb", 0, """
         SQLite cache size in KB (0 == use SQLite default)
     """.strip()),
+    ConfigOption("allow_csv_stream", True, """
+        Allow .csv?_stream=1 to download all rows (ignoring max_returned_rows)
+    """.strip()),
 )
 DEFAULT_CONFIG = {
     option.name: option.default

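The pattern here: each ConfigOption is a (name, default, help) record, and
DEFAULT_CONFIG maps option names to their defaults. A self-contained sketch of
that pattern (the namedtuple stands in for the real ConfigOption class, whose
exact shape is an assumption):

    from collections import namedtuple

    # Stand-in for the ConfigOption record used above (assumed shape).
    ConfigOption = namedtuple("ConfigOption", ("name", "default", "help"))

    CONFIG_OPTIONS = (
        ConfigOption("cache_size_kb", 0, "SQLite cache size in KB"),
        ConfigOption("allow_csv_stream", True, "Allow .csv?_stream=1 downloads"),
    )

    # Mirrors the DEFAULT_CONFIG construction shown in the diff.
    DEFAULT_CONFIG = {option.name: option.default for option in CONFIG_OPTIONS}

    assert DEFAULT_CONFIG["allow_csv_stream"] is True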
View file

@@ -149,42 +149,24 @@ class BaseView(RenderMixin):
         return await self.view_get(request, name, hash, **kwargs)
 
-    async def as_csv_stream(self, request, name, hash, **kwargs):
-        assert not request.args.get("_next")  # TODO: real error
-        kwargs['_size'] = 'max'
-
-        async def stream_fn(r):
-            first = True
-            next = None
-            writer = csv.writer(r)
-            while first or next:
-                if next:
-                    kwargs['_next'] = next
-                data, extra_template_data, templates = await self.data(
-                    request, name, hash, **kwargs
-                )
-                if first:
-                    writer.writerow(data["columns"])
-                    first = False
-                next = data["next"]
-                for row in data["rows"]:
-                    writer.writerow(row)
-
-        return response.stream(
-            stream_fn,
-            content_type="text/plain; charset=utf-8"
-        )
-
     async def as_csv(self, request, name, hash, **kwargs):
-        if request.args.get("_stream"):
-            return await self.as_csv_stream(request, name, hash, **kwargs)
+        stream = request.args.get("_stream")
+        if stream:
+            # Some quick sanity checks
+            if not self.ds.config["allow_csv_stream"]:
+                raise DatasetteError("CSV streaming is disabled", status=400)
+            if request.args.get("_next"):
+                raise DatasetteError(
+                    "_next not allowed for CSV streaming", status=400
+                )
+            kwargs["_size"] = "max"
+        # Fetch the first page
         try:
             response_or_template_contexts = await self.data(
                 request, name, hash, **kwargs
             )
             if isinstance(response_or_template_contexts, response.HTTPResponse):
                 return response_or_template_contexts
             else:
                 data, extra_template_data, templates = response_or_template_contexts
         except (sqlite3.OperationalError, InvalidSql) as e:
@@ -195,6 +177,7 @@ class BaseView(RenderMixin):
         except DatasetteError:
             raise
+
         # Convert rows and columns to CSV
         headings = data["columns"]
         # if there are expanded_columns we need to add additional headings
@@ -207,22 +190,35 @@ class BaseView(RenderMixin):
                 headings.append("{}_label".format(column))
 
         async def stream_fn(r):
             nonlocal data
             writer = csv.writer(r)
-            writer.writerow(headings)
-            for row in data["rows"]:
-                if not expanded_columns:
-                    # Simple path
-                    writer.writerow(row)
-                else:
-                    # Look for {"value": "label": } dicts and expand
-                    new_row = []
-                    for cell in row:
-                        if isinstance(cell, dict):
-                            new_row.append(cell["value"])
-                            new_row.append(cell["label"])
-                        else:
-                            new_row.append(cell)
-                    writer.writerow(new_row)
+            first = True
+            next = None
+            while first or (next and stream):
+                if next:
+                    kwargs["_next"] = next
+                if not first:
+                    data, extra_template_data, templates = await self.data(
+                        request, name, hash, **kwargs
+                    )
+                if first:
+                    writer.writerow(headings)
+                    first = False
+                next = data.get("next")
+                for row in data["rows"]:
+                    if not expanded_columns:
+                        # Simple path
+                        writer.writerow(row)
+                    else:
+                        # Look for {"value": "label": } dicts and expand
+                        new_row = []
+                        for cell in row:
+                            if isinstance(cell, dict):
+                                new_row.append(cell["value"])
+                                new_row.append(cell["label"])
+                            else:
+                                new_row.append(cell)
+                        writer.writerow(new_row)
 
         content_type = "text/plain; charset=utf-8"
         headers = {}
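
Stripped of the view machinery, stream_fn above is a follow-the-next-token
loop. A standalone sketch of just that loop, where fetch_page is a hypothetical
stand-in for self.data(), not a Datasette API:

    import csv
    import io

    def stream_all_rows(fetch_page, headings, out):
        # fetch_page(next_token) returns a dict shaped like Datasette's
        # page data: {"rows": [...], "next": token_or_None}.
        writer = csv.writer(out)
        writer.writerow(headings)
        first, next_token = True, None
        while first or next_token:
            page = fetch_page(next_token)
            first = False
            next_token = page.get("next")
            for row in page["rows"]:
                writer.writerow(row)

    # Fake two-page data source to exercise the loop.
    pages = {
        None: {"rows": [[1, "a"], [2, "b"]], "next": "2"},
        "2": {"rows": [[3, "c"]], "next": None},
    }
    buf = io.StringIO()
    stream_all_rows(pages.get, ["id", "letter"], buf)
    assert buf.getvalue().splitlines() == ["id,letter", "1,a", "2,b", "3,c"]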


@@ -125,3 +125,15 @@ Sets the amount of memory SQLite uses for its `per-connection cache <https://www.
 ::
 
     datasette mydatabase.db --config cache_size_kb:5000
+
+allow_csv_stream
+----------------
+
+Enables the feature where an entire table (potentially hundreds of thousands of
+rows) can be exported as a single CSV file. This is turned on by default - you
+can turn it off like this::
+
+    datasette mydatabase.db --config allow_csv_stream:off
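
When the option is off, a _stream=1 request is refused with the "CSV streaming
is disabled" error raised in base.py above, as an HTTP 400. A sketch of checking
that behaviour (hypothetical local instance; requests library assumed):

    import requests

    # Instance assumed started with --config allow_csv_stream:off
    response = requests.get(
        "http://localhost:8001/fixtures/compound_three_primary_keys.csv",
        params={"_stream": 1},
    )
    assert response.status_code == 400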


@@ -901,6 +901,7 @@ def test_config_json(app_client):
         "default_cache_ttl": 365 * 24 * 60 * 60,
         "num_sql_threads": 3,
         "cache_size_kb": 0,
+        "allow_csv_stream": True,
     } == response.json


@@ -59,3 +59,16 @@ def test_table_csv_download(app_client):
     assert 'text/csv; charset=utf-8' == response.headers['Content-Type']
     expected_disposition = 'attachment; filename="simple_primary_key.csv"'
     assert expected_disposition == response.headers['Content-Disposition']
+
+
+def test_table_csv_stream(app_client):
+    # Without _stream should return header + 100 rows:
+    response = app_client.get(
+        "/fixtures/compound_three_primary_keys.csv?_size=max"
+    )
+    assert 101 == len([b for b in response.body.split(b"\r\n") if b])
+    # With _stream=1 should return header + 1001 rows
+    response = app_client.get(
+        "/fixtures/compound_three_primary_keys.csv?_stream=1"
+    )
+    assert 1002 == len([b for b in response.body.split(b"\r\n") if b])
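
The "if b" filter in these assertions drops the empty chunk left by the
trailing CRLF that csv.writer emits after every row; a quick illustration:

    body = b"a,b\r\n1,2\r\n"
    assert body.split(b"\r\n") == [b"a,b", b"1,2", b""]
    assert len([chunk for chunk in body.split(b"\r\n") if chunk]) == 2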