--limit= mechanism plus new limits for facets

Replaced the --max_returned_rows and --sql_time_limit_ms options on
"datasette serve" with a new --limit option, which supports a larger
set of limits.

Example usage:

	datasette serve --limit max_returned_rows:1000 \
		--limit sql_time_limit_ms:2500 \
		--limit default_facet_size:50 \
		--limit facet_time_limit_ms:1000 \
		--limit facet_suggest_time_limit_ms:500

New docs: https://datasette.readthedocs.io/en/latest/limits.html

Closes #270
Closes #264
commit cef9a9a870
Simon Willison 2018-05-17 22:08:26 -07:00
10 changed files with 118 additions and 72 deletions

View file

@ -114,9 +114,6 @@ http://localhost:8001/History/downloads.json?_shape=objects will return that dat
--cors Enable CORS by serving Access-Control-Allow-
Origin: *
--page_size INTEGER Page size - default is 100
--max_returned_rows INTEGER Max allowed rows to return at once - default is
1000. Set to 0 to disable check entirely.
--sql_time_limit_ms INTEGER Max time allowed for SQL queries in ms
--load-extension PATH Path to a SQLite extension to load
--inspect-file TEXT Path to JSON file created using "datasette
inspect"
@ -126,6 +123,8 @@ http://localhost:8001/History/downloads.json?_shape=objects will return that dat
--plugins-dir DIRECTORY Path to directory containing custom plugins
--static STATIC MOUNT mountpoint:path-to-directory for serving static
files
--limit LIMIT Set a limit using limitname:integer
datasette.readthedocs.io/en/latest/limits.html
--help Show this message and exit.
## metadata.json
@ -214,13 +213,13 @@ If you have docker installed you can use `datasette package` to create a new Doc
Both publish and package accept an `extra_options` option, which will affect how the resulting application is executed. For example, say you want to increase the SQL time limit for a particular container:
datasette package parlgov.db --extra-options="--sql_time_limit_ms=2500 --page_size=10"
datasette package parlgov.db --extra-options="--limit sql_time_limit_ms:2500 --page_size=10"
The resulting container will run the application with those options.
Here's example output for the package command:
$ datasette package parlgov.db --extra-options="--sql_time_limit_ms=2500 --page_size=10"
$ datasette package parlgov.db --extra-options="--limit sql_time_limit_ms:2500 --page_size=10"
Sending build context to Docker daemon 4.459MB
Step 1/7 : FROM python:3
---> 79e1dc9af1c1
@ -239,7 +238,7 @@ Here's example output for the package command:
Step 6/7 : EXPOSE 8001
---> Using cache
---> 8e83844b0fed
Step 7/7 : CMD datasette serve parlgov.db --port 8001 --inspect-file inspect-data.json --sql_time_limit_ms=2500 --page_size=10
Step 7/7 : CMD datasette serve parlgov.db --port 8001 --inspect-file inspect-data.json --limit sql_time_limit_ms:2500 --page_size=10
---> Using cache
---> 1bd380ea8af3
Successfully built 1bd380ea8af3

View file

@ -45,6 +45,15 @@ pm.add_hookspecs(hookspecs)
pm.load_setuptools_entrypoints("datasette")
DEFAULT_LIMITS = {
    "max_returned_rows": 1000,
    "sql_time_limit_ms": 1000,
    "default_facet_size": 30,
    "facet_time_limit_ms": 200,
    "facet_suggest_time_limit_ms": 50,
}
class JsonDataView(RenderMixin):
def __init__(self, datasette, filename, data_callback):
@ -79,8 +88,6 @@ class Datasette:
num_threads=3,
cache_headers=True,
page_size=100,
max_returned_rows=1000,
sql_time_limit_ms=1000,
cors=False,
inspect_data=None,
metadata=None,
@ -88,14 +95,13 @@ class Datasette:
template_dir=None,
plugins_dir=None,
static_mounts=None,
limits=None,
):
self.files = files
self.num_threads = num_threads
self.executor = futures.ThreadPoolExecutor(max_workers=num_threads)
self.cache_headers = cache_headers
self.page_size = page_size
self.max_returned_rows = max_returned_rows
self.sql_time_limit_ms = sql_time_limit_ms
self.cors = cors
self._inspect = inspect_data
self.metadata = metadata or {}
@ -104,6 +110,9 @@ class Datasette:
self.template_dir = template_dir
self.plugins_dir = plugins_dir
self.static_mounts = static_mounts or []
self.limits = dict(DEFAULT_LIMITS, **(limits or {}))
self.max_returned_rows = self.limits["max_returned_rows"]
self.sql_time_limit_ms = self.limits["sql_time_limit_ms"]
# Execute plugins in constructor, to ensure they are available
# when the rest of `datasette inspect` executes
if self.plugins_dir:
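
To make the effect of the constructor change concrete: callers can now pass a
partial limits dictionary and anything omitted falls back to DEFAULT_LIMITS.
A minimal sketch of constructing Datasette directly with custom limits (the
database filename here is hypothetical):

    from datasette.app import Datasette

    ds = Datasette(
        ["mydatabase.db"],  # hypothetical SQLite database file
        limits={"sql_time_limit_ms": 2500, "default_facet_size": 50},
    )
    # Limits not passed in keep their defaults, e.g.
    # ds.limits["max_returned_rows"] is still 1000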

View file

@ -5,7 +5,7 @@ import os
import shutil
from subprocess import call, check_output
import sys
from .app import Datasette
from .app import Datasette, DEFAULT_LIMITS
from .utils import temporary_docker_directory, temporary_heroku_directory
@ -15,7 +15,8 @@ class StaticMount(click.ParamType):
def convert(self, value, param, ctx):
if ":" not in value:
self.fail(
'"%s" should be of format mountpoint:directory' % value, param, ctx
'"{}" should be of format mountpoint:directory'.format(value),
param, ctx
)
path, dirpath = value.split(":")
if not os.path.exists(dirpath) or not os.path.isdir(dirpath):
@ -23,6 +24,26 @@ class StaticMount(click.ParamType):
return path, dirpath
class Limit(click.ParamType):
    name = "limit"

    def convert(self, value, param, ctx):
        ok = True
        if ":" not in value:
            ok = False
        else:
            name, intvalue = value.split(":")
            ok = intvalue.isdigit()
        if not ok:
            self.fail(
                '"{}" should be of format name:integer'.format(value),
                param, ctx
            )
        if name not in DEFAULT_LIMITS:
            self.fail("{} is not a valid limit".format(name), param, ctx)
        return name, int(intvalue)
@click.group(cls=DefaultGroup, default="serve", default_if_no_args=True)
@click.version_option()
def cli():
@ -364,14 +385,6 @@ def package(
"--cors", is_flag=True, help="Enable CORS by serving Access-Control-Allow-Origin: *"
)
@click.option("--page_size", default=100, help="Page size - default is 100")
@click.option(
"--max_returned_rows",
default=1000,
help="Max allowed rows to return at once - default is 1000. Set to 0 to disable check entirely.",
)
@click.option(
"--sql_time_limit_ms", default=1000, help="Max time allowed for SQL queries in ms"
)
@click.option(
"sqlite_extensions",
"--load-extension",
@ -405,6 +418,12 @@ def package(
help="mountpoint:path-to-directory for serving static files",
multiple=True,
)
@click.option(
"--limit",
type=Limit(),
help="Set a limit using limitname:integer datasette.readthedocs.io/en/latest/limits.html",
multiple=True,
)
def serve(
files,
host,
@ -413,14 +432,13 @@ def serve(
reload,
cors,
page_size,
max_returned_rows,
sql_time_limit_ms,
sqlite_extensions,
inspect_file,
metadata,
template_dir,
plugins_dir,
static,
limit,
):
"""Serve up specified SQLite database files with a web UI"""
if reload:
@ -444,14 +462,13 @@ def serve(
cache_headers=not debug and not reload,
cors=cors,
page_size=page_size,
max_returned_rows=max_returned_rows,
sql_time_limit_ms=sql_time_limit_ms,
inspect_data=inspect_data,
metadata=metadata_data,
sqlite_extensions=sqlite_extensions,
template_dir=template_dir,
plugins_dir=plugins_dir,
static_mounts=static,
limits=dict(limit),
)
# Force initial hashing/table counting
ds.inspect()
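
A rough sketch (not part of this diff) of how each --limit value travels from
the command line into the application, assuming the CLI module is importable
as datasette.cli: the Limit type parses name:integer pairs, serve() collects
the resulting tuples, and the Datasette constructor merges them over the
defaults.

    from datasette.app import DEFAULT_LIMITS
    from datasette.cli import Limit

    # Click calls Limit().convert() once per --limit value
    pair = Limit().convert("sql_time_limit_ms:2500", None, None)
    # pair == ("sql_time_limit_ms", 2500)

    # serve() passes limits=dict(limit); the constructor merges over defaults
    effective = dict(DEFAULT_LIMITS, **dict([pair]))
    # effective["sql_time_limit_ms"] == 2500, everything else is a default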

View file

@ -536,7 +536,7 @@ class TableView(RowTableShared):
)
# facets support
FACET_SIZE = 20
facet_size = self.ds.limits["default_facet_size"]
metadata_facets = table_metadata.get("facets", [])
facets = metadata_facets[:]
try:
@ -553,20 +553,21 @@ class TableView(RowTableShared):
col=escape_sqlite(column),
from_sql=from_sql,
and_or_where='and' if where_clauses else 'where',
limit=FACET_SIZE+1,
limit=facet_size+1,
)
try:
facet_rows = await self.execute(
name, facet_sql, params,
truncate=False, custom_time_limit=200
truncate=False,
custom_time_limit=self.ds.limits["facet_time_limit_ms"],
)
facet_results_values = []
facet_results[column] = {
"name": column,
"results": facet_results_values,
"truncated": len(facet_rows) > FACET_SIZE,
"truncated": len(facet_rows) > facet_size,
}
facet_rows = facet_rows[:FACET_SIZE]
facet_rows = facet_rows[:facet_size]
# Attempt to expand foreign keys into labels
values = [row["value"] for row in facet_rows]
expanded = (await self.expand_foreign_keys(
@ -644,7 +645,6 @@ class TableView(RowTableShared):
pass
# Detect suggested facets
FACET_LIMIT = 30
suggested_facets = []
for facet_column in columns:
if facet_column in facets:
@ -657,19 +657,20 @@ class TableView(RowTableShared):
column=escape_sqlite(facet_column),
from_sql=from_sql,
and_or_where='and' if where_clauses else 'where',
limit=FACET_LIMIT+1
limit=facet_size+1
)
distinct_values = None
try:
distinct_values = await self.execute(
name, suggested_facet_sql, params,
truncate=False, custom_time_limit=50
truncate=False,
custom_time_limit=self.ds.limits["facet_suggest_time_limit_ms"],
)
num_distinct_values = len(distinct_values)
if (
num_distinct_values and
num_distinct_values > 1 and
num_distinct_values <= FACET_LIMIT and
num_distinct_values <= facet_size and
num_distinct_values < filtered_table_rows_count
):
suggested_facets.append({
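
Both facet queries above follow the same pattern: request one more row than
will be displayed, so a single query both fills the facet and reveals whether
it was truncated. A standalone illustration of the idea with made-up values
(not Datasette code):

    facet_size = 30                            # self.ds.limits["default_facet_size"]
    all_values = list(range(100))              # stand-in for a column's distinct values
    facet_rows = all_values[:facet_size + 1]   # the SQL uses LIMIT facet_size + 1
    truncated = len(facet_rows) > facet_size   # an extra row means more values exist
    facet_rows = facet_rows[:facet_size]       # only facet_size values are shown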

View file

@ -101,9 +101,6 @@ datasette serve options
--cors Enable CORS by serving Access-Control-Allow-
Origin: *
--page_size INTEGER Page size - default is 100
--max_returned_rows INTEGER Max allowed rows to return at once - default is
1000. Set to 0 to disable check entirely.
--sql_time_limit_ms INTEGER Max time allowed for SQL queries in ms
--load-extension PATH Path to a SQLite extension to load
--inspect-file TEXT Path to JSON file created using "datasette
inspect"
@ -113,4 +110,6 @@ datasette serve options
--plugins-dir DIRECTORY Path to directory containing custom plugins
--static STATIC MOUNT mountpoint:path-to-directory for serving static
files
--limit LIMIT Set a limit using limitname:integer
datasette.readthedocs.io/en/latest/limits.html
--help Show this message and exit.

View file

@ -22,6 +22,7 @@ Contents
facets
full_text_search
metadata
limits
custom_templates
plugins
changelog

View file

@ -132,7 +132,7 @@ Special table arguments
The Datasette table view takes a number of special querystring arguments:
``?_size=1000`` or ``?_size=max``
Sets a custom page size. This cannot exceed the ``max_returned_rows`` option
Sets a custom page size. This cannot exceed the ``max_returned_rows`` limit
passed to ``datasette serve``. Use ``max`` to get ``max_returned_rows``.
``?_sort=COLUMN``

docs/limits.rst Normal file
View file

@ -0,0 +1,51 @@
Limits
======
To prevent rogue, long-running queries from making a Datasette instance inaccessible to other users, Datasette imposes some limits on the SQL that you can execute.
sql_time_limit_ms
-----------------
By default, queries have a time limit of one second. If a query takes longer than this to run, Datasette will terminate the query and return an error.
If this time limit is too short for you, you can customize it using the ``sql_time_limit_ms`` limit - for example, to increase it to 3.5 seconds::
datasette mydatabase.db --limit sql_time_limit_ms:3500
You can optionally set a lower time limit for an individual query using the ``_timelimit`` query string argument::
/my-database/my-table?qSpecies=44&_timelimit=100
This would set the time limit to 100ms for that specific query. This feature is useful if you are working with databases of unknown size and complexity - a query that might make perfect sense for a smaller table could take too long to execute on a table with millions of rows. By setting custom time limits you can execute queries "optimistically" - e.g. give me an exact count of rows matching this query but only if it takes less than 100ms to calculate.
max_returned_rows
-----------------
Datasette returns a maximum of 1,000 rows of data at a time. If you execute a query that returns more than 1,000 rows, Datasette will return the first 1,000 and include a warning that the result set has been truncated. You can use OFFSET/LIMIT or other methods in your SQL to implement pagination if you need to return more than 1,000 rows.
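For example, a hypothetical query that pages through a large table 1,000 rows at a time might look like this::

    select * from mytable order by rowid limit 1000 offset 2000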
You can increase or decrease this limit like so::
datasette mydatabase.db --limit max_returned_rows:2000
default_facet_size
------------------
The default number of unique rows returned by :ref:`facets` is 30. You can customize it like this::
datasette mydatabase.db --limit default_facet_size:50
facet_time_limit_ms
-------------------
This is the time limit Datasette allows for calculating a facet, which defaults to 200ms::
datasette mydatabase.db --limit facet_time_limit_ms:1000
facet_suggest_time_limit_ms
---------------------------
When Datasette calculates suggested facets it needs to run a SQL query for every column in your table. The default for this time limit is 50ms to account for the fact that it needs to run once for every column. If the time limit is exceeded the column will not be suggested as a facet.
You can increase this time limit like so::
datasette mydatabase.db --limit facet_suggest_time_limit_ms:500

View file

@ -46,39 +46,6 @@ statements can be used to change database settings at runtime. If you need to
include the string "pragma" in a query you can do so safely using a named
parameter.
Query limits
------------
To prevent rogue, long-running queries from making a Datasette instance
inaccessible to other users, Datasette imposes some limits on the SQL that you
can execute.
By default, queries have a time limit of one second. If a query takes longer
than this to run Datasette will terminate the query and return an error.
If this time limit is too short for you, you can customize it using the
``sql_time_limit_ms`` option - for example, to increase it to 3.5 seconds::
datasette mydatabase.db --sql_time_limit_ms=3500
You can optionally set a lower time limit for an individual query using the
``_timelimit`` query string argument::
/my-database/my-table?qSpecies=44&_timelimit=100
This would set the time limit to 100ms for that specific query. This feature
is useful if you are working with databases of unknown size and complexity -
a query that might make perfect sense for a smaller table could take too long
to execute on a table with millions of rows. By setting custom time limits you
can execute queries "optimistically" - e.g. give me an exact count of rows
matching this query but only if it takes less than 100ms to calculate.
Datasette returns a maximum of 1,000 rows of data at a time. If you execute a
query that returns more than 1,000 rows, Datasette will return the first 1,000
and include a warning that the result set has been truncated. You can use
OFFSET/LIMIT or other methods in your SQL to implement pagination if you need to
return more than 1,000 rows.
Views
-----

View file

@ -21,10 +21,12 @@ def app_client(sql_time_limit_ms=None, max_returned_rows=None):
ds = Datasette(
[filepath],
page_size=50,
max_returned_rows=max_returned_rows or 100,
sql_time_limit_ms=sql_time_limit_ms or 200,
metadata=METADATA,
plugins_dir=plugins_dir,
limits={
'max_returned_rows': max_returned_rows or 100,
'sql_time_limit_ms': sql_time_limit_ms or 200,
}
)
ds.sqlite_functions.append(
('sleep', 1, lambda n: time.sleep(float(n))),