--limit= mechanism plus new limits for facets

Replaced the --max_returned_rows and --sql_time_limit_ms options on
"datasette serve" with a new --limit option, which supports a larger
set of limits.

Example usage:

	datasette serve --limit max_returned_rows:1000 \
		--limit sql_time_limit_ms:2500 \
		--limit default_facet_size:50 \
		--limit facet_time_limit_ms:1000 \
		--limit facet_suggest_time_limit_ms:500

New docs: https://datasette.readthedocs.io/en/latest/limits.html

Closes #270
Closes #264
commit cef9a9a870
Simon Willison 2018-05-17 22:08:26 -07:00
10 changed files with 118 additions and 72 deletions

View file

@ -114,9 +114,6 @@ http://localhost:8001/History/downloads.json?_shape=objects will return that dat
--cors Enable CORS by serving Access-Control-Allow-
Origin: *
--page_size INTEGER Page size - default is 100
--max_returned_rows INTEGER Max allowed rows to return at once - default is
1000. Set to 0 to disable check entirely.
--sql_time_limit_ms INTEGER Max time allowed for SQL queries in ms
--load-extension PATH Path to a SQLite extension to load
--inspect-file TEXT Path to JSON file created using "datasette
inspect"
@ -126,6 +123,8 @@ http://localhost:8001/History/downloads.json?_shape=objects will return that dat
--plugins-dir DIRECTORY Path to directory containing custom plugins
--static STATIC MOUNT mountpoint:path-to-directory for serving static
files
--limit LIMIT Set a limit using limitname:integer
datasette.readthedocs.io/en/latest/limits.html
--help Show this message and exit.
## metadata.json
@ -214,13 +213,13 @@ If you have docker installed you can use `datasette package` to create a new Doc
Both publish and package accept an `extra_options` option, which will affect how the resulting application is executed. For example, say you want to increase the SQL time limit for a particular container:
datasette package parlgov.db --extra-options="--sql_time_limit_ms=2500 --page_size=10"
datasette package parlgov.db --extra-options="--limit sql_time_limit_ms:2500 --page_size=10"
The resulting container will run the application with those options.
Here's example output for the package command:
$ datasette package parlgov.db --extra-options="--sql_time_limit_ms=2500 --page_size=10"
$ datasette package parlgov.db --extra-options="--limit sql_time_limit_ms:2500 --page_size=10"
Sending build context to Docker daemon 4.459MB
Step 1/7 : FROM python:3
---> 79e1dc9af1c1
@ -239,7 +238,7 @@ Here's example output for the package command:
Step 6/7 : EXPOSE 8001
---> Using cache
---> 8e83844b0fed
Step 7/7 : CMD datasette serve parlgov.db --port 8001 --inspect-file inspect-data.json --sql_time_limit_ms=2500 --page_size=10
Step 7/7 : CMD datasette serve parlgov.db --port 8001 --inspect-file inspect-data.json --limit sql_time_limit_ms:2500 --page_size=10
---> Using cache
---> 1bd380ea8af3
Successfully built 1bd380ea8af3

View file

@ -45,6 +45,15 @@ pm.add_hookspecs(hookspecs)
pm.load_setuptools_entrypoints("datasette")
DEFAULT_LIMITS = {
    "max_returned_rows": 1000,
    "sql_time_limit_ms": 1000,
    "default_facet_size": 30,
    "facet_time_limit_ms": 200,
    "facet_suggest_time_limit_ms": 50,
}
class JsonDataView(RenderMixin):
def __init__(self, datasette, filename, data_callback):
@ -79,8 +88,6 @@ class Datasette:
num_threads=3,
cache_headers=True,
page_size=100,
max_returned_rows=1000,
sql_time_limit_ms=1000,
cors=False,
inspect_data=None,
metadata=None,
@ -88,14 +95,13 @@ class Datasette:
template_dir=None,
plugins_dir=None,
static_mounts=None,
limits=None,
):
self.files = files
self.num_threads = num_threads
self.executor = futures.ThreadPoolExecutor(max_workers=num_threads)
self.cache_headers = cache_headers
self.page_size = page_size
self.max_returned_rows = max_returned_rows
self.sql_time_limit_ms = sql_time_limit_ms
self.cors = cors
self._inspect = inspect_data
self.metadata = metadata or {}
@ -104,6 +110,9 @@ class Datasette:
self.template_dir = template_dir
self.plugins_dir = plugins_dir
self.static_mounts = static_mounts or []
self.limits = dict(DEFAULT_LIMITS, **(limits or {}))
self.max_returned_rows = self.limits["max_returned_rows"]
self.sql_time_limit_ms = self.limits["sql_time_limit_ms"]
# Execute plugins in constructor, to ensure they are available
# when the rest of `datasette inspect` executes
if self.plugins_dir:
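
To make the effect of the constructor change concrete: callers can now pass a
partial limits dictionary and anything omitted falls back to DEFAULT_LIMITS.
A minimal sketch of constructing Datasette directly with custom limits (the
database filename here is hypothetical):

    from datasette.app import Datasette

    ds = Datasette(
        ["mydatabase.db"],  # hypothetical SQLite database file
        limits={"sql_time_limit_ms": 2500, "default_facet_size": 50},
    )
    # Limits not passed in keep their defaults, e.g.
    # ds.limits["max_returned_rows"] is still 1000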

View file

@ -5,7 +5,7 @@ import os
import shutil
from subprocess import call, check_output
import sys
from .app import Datasette
from .app import Datasette, DEFAULT_LIMITS
from .utils import temporary_docker_directory, temporary_heroku_directory
@ -15,7 +15,8 @@ class StaticMount(click.ParamType):
def convert(self, value, param, ctx):
if ":" not in value:
self.fail(
'"%s" should be of format mountpoint:directory' % value, param, ctx
'"{}" should be of format mountpoint:directory'.format(value),
param, ctx
)
path, dirpath = value.split(":")
if not os.path.exists(dirpath) or not os.path.isdir(dirpath):
@ -23,6 +24,26 @@ class StaticMount(click.ParamType):
return path, dirpath
class Limit(click.ParamType):
    name = "limit"

    def convert(self, value, param, ctx):
        ok = True
        if ":" not in value:
            ok = False
        else:
            name, intvalue = value.split(":")
            ok = intvalue.isdigit()
        if not ok:
            self.fail(
                '"{}" should be of format name:integer'.format(value),
                param, ctx
            )
        if name not in DEFAULT_LIMITS:
            self.fail("{} is not a valid limit".format(name), param, ctx)
        return name, int(intvalue)
@click.group(cls=DefaultGroup, default="serve", default_if_no_args=True)
@click.version_option()
def cli():
@ -364,14 +385,6 @@ def package(
"--cors", is_flag=True, help="Enable CORS by serving Access-Control-Allow-Origin: *"
)
@click.option("--page_size", default=100, help="Page size - default is 100")
@click.option(
"--max_returned_rows",
default=1000,
help="Max allowed rows to return at once - default is 1000. Set to 0 to disable check entirely.",
)
@click.option(
"--sql_time_limit_ms", default=1000, help="Max time allowed for SQL queries in ms"
)
@click.option(
"sqlite_extensions",
"--load-extension",
@ -405,6 +418,12 @@ def package(
help="mountpoint:path-to-directory for serving static files",
multiple=True,
)
@click.option(
"--limit",
type=Limit(),
help="Set a limit using limitname:integer datasette.readthedocs.io/en/latest/limits.html",
multiple=True,
)
def serve(
files,
host,
@ -413,14 +432,13 @@ def serve(
reload,
cors,
page_size,
max_returned_rows,
sql_time_limit_ms,
sqlite_extensions,
inspect_file,
metadata,
template_dir,
plugins_dir,
static,
limit,
):
"""Serve up specified SQLite database files with a web UI"""
if reload:
@ -444,14 +462,13 @@ def serve(
cache_headers=not debug and not reload,
cors=cors,
page_size=page_size,
max_returned_rows=max_returned_rows,
sql_time_limit_ms=sql_time_limit_ms,
inspect_data=inspect_data,
metadata=metadata_data,
sqlite_extensions=sqlite_extensions,
template_dir=template_dir,
plugins_dir=plugins_dir,
static_mounts=static,
limits=dict(limit),
)
# Force initial hashing/table counting
ds.inspect()
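
A rough sketch (not part of this diff) of how each --limit value travels from
the command line into the application, assuming the CLI module is importable
as datasette.cli: the Limit type parses name:integer pairs, serve() collects
the resulting tuples, and the Datasette constructor merges them over the
defaults.

    from datasette.app import DEFAULT_LIMITS
    from datasette.cli import Limit

    # Click calls Limit().convert() once per --limit value
    pair = Limit().convert("sql_time_limit_ms:2500", None, None)
    # pair == ("sql_time_limit_ms", 2500)

    # serve() passes limits=dict(limit); the constructor merges over defaults
    effective = dict(DEFAULT_LIMITS, **dict([pair]))
    # effective["sql_time_limit_ms"] == 2500, everything else is a default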

View file

@ -536,7 +536,7 @@ class TableView(RowTableShared):
)
# facets support
FACET_SIZE = 20
facet_size = self.ds.limits["default_facet_size"]
metadata_facets = table_metadata.get("facets", [])
facets = metadata_facets[:]
try:
@ -553,20 +553,21 @@ class TableView(RowTableShared):
col=escape_sqlite(column),
from_sql=from_sql,
and_or_where='and' if where_clauses else 'where',
limit=FACET_SIZE+1,
limit=facet_size+1,
)
try:
facet_rows = await self.execute(
name, facet_sql, params,
truncate=False, custom_time_limit=200
truncate=False,
custom_time_limit=self.ds.limits["facet_time_limit_ms"],
)
facet_results_values = []
facet_results[column] = {
"name": column,
"results": facet_results_values,
"truncated": len(facet_rows) > FACET_SIZE,
"truncated": len(facet_rows) > facet_size,
}
facet_rows = facet_rows[:FACET_SIZE]
facet_rows = facet_rows[:facet_size]
# Attempt to expand foreign keys into labels
values = [row["value"] for row in facet_rows]
expanded = (await self.expand_foreign_keys(
@ -644,7 +645,6 @@ class TableView(RowTableShared):
pass
# Detect suggested facets
FACET_LIMIT = 30
suggested_facets = []
for facet_column in columns:
if facet_column in facets:
@ -657,19 +657,20 @@ class TableView(RowTableShared):
column=escape_sqlite(facet_column),
from_sql=from_sql,
and_or_where='and' if where_clauses else 'where',
limit=FACET_LIMIT+1
limit=facet_size+1
)
distinct_values = None
try:
distinct_values = await self.execute(
name, suggested_facet_sql, params,
truncate=False, custom_time_limit=50
truncate=False,
custom_time_limit=self.ds.limits["facet_suggest_time_limit_ms"],
)
num_distinct_values = len(distinct_values)
if (
num_distinct_values and
num_distinct_values > 1 and
num_distinct_values <= FACET_LIMIT and
num_distinct_values <= facet_size and
num_distinct_values < filtered_table_rows_count
):
suggested_facets.append({
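
Both facet queries above follow the same pattern: request one more row than
will be displayed, so a single query both fills the facet and reveals whether
it was truncated. A standalone illustration of the idea with made-up values
(not Datasette code):

    facet_size = 30                            # self.ds.limits["default_facet_size"]
    all_values = list(range(100))              # stand-in for a column's distinct values
    facet_rows = all_values[:facet_size + 1]   # the SQL uses LIMIT facet_size + 1
    truncated = len(facet_rows) > facet_size   # an extra row means more values exist
    facet_rows = facet_rows[:facet_size]       # only facet_size values are shown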

View file

@ -101,9 +101,6 @@ datasette serve options
--cors Enable CORS by serving Access-Control-Allow-
Origin: *
--page_size INTEGER Page size - default is 100
--max_returned_rows INTEGER Max allowed rows to return at once - default is
1000. Set to 0 to disable check entirely.
--sql_time_limit_ms INTEGER Max time allowed for SQL queries in ms
--load-extension PATH Path to a SQLite extension to load
--inspect-file TEXT Path to JSON file created using "datasette
inspect"
@ -113,4 +110,6 @@ datasette serve options
--plugins-dir DIRECTORY Path to directory containing custom plugins
--static STATIC MOUNT mountpoint:path-to-directory for serving static
files
--limit LIMIT Set a limit using limitname:integer
datasette.readthedocs.io/en/latest/limits.html
--help Show this message and exit.

View file

@ -22,6 +22,7 @@ Contents
facets
full_text_search
metadata
limits
custom_templates
plugins
changelog

View file

@ -132,7 +132,7 @@ Special table arguments
The Datasette table view takes a number of special querystring arguments:
``?_size=1000`` or ``?_size=max``
Sets a custom page size. This cannot exceed the ``max_returned_rows`` option
Sets a custom page size. This cannot exceed the ``max_returned_rows`` limit
passed to ``datasette serve``. Use ``max`` to get ``max_returned_rows``.
``?_sort=COLUMN``

docs/limits.rst Normal file
View file

@ -0,0 +1,51 @@
Limits
======
To prevent rogue, long-running queries from making a Datasette instance inaccessible to other users, Datasette imposes some limits on the SQL that you can execute.
sql_time_limit_ms
-----------------
By default, queries have a time limit of one second. If a query takes longer than this to run, Datasette will terminate the query and return an error.
If this time limit is too short for you, you can customize it using the ``sql_time_limit_ms`` limit - for example, to increase it to 3.5 seconds::
datasette mydatabase.db --limit sql_time_limit_ms:3500
You can optionally set a lower time limit for an individual query using the ``_timelimit`` query string argument::
/my-database/my-table?qSpecies=44&_timelimit=100
This would set the time limit to 100ms for that specific query. This feature is useful if you are working with databases of unknown size and complexity - a query that might make perfect sense for a smaller table could take too long to execute on a table with millions of rows. By setting custom time limits you can execute queries "optimistically" - e.g. give me an exact count of rows matching this query but only if it takes less than 100ms to calculate.
max_returned_rows
-----------------
Datasette returns a maximum of 1,000 rows of data at a time. If you execute a query that returns more than 1,000 rows, Datasette will return the first 1,000 and include a warning that the result set has been truncated. You can use OFFSET/LIMIT or other methods in your SQL to implement pagination if you need to return more than 1,000 rows.
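For example, a hypothetical query that pages through a large table 1,000 rows at a time might look like this::

    select * from mytable order by rowid limit 1000 offset 2000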
You can increase or decrease this limit like so::
datasette mydatabase.db --limit max_returned_rows:2000
default_facet_size
------------------
The default number of unique rows returned by :ref:`facets` is 30. You can customize it like this::
datasette mydatabase.db --limit default_facet_size:50
facet_time_limit_ms
-------------------
This is the time limit Datasette allows for calculating a facet, which defaults to 200ms::
datasette mydatabase.db --limit facet_time_limit_ms:1000
facet_suggest_time_limit_ms
---------------------------
When Datasette calculates suggested facets it needs to run a SQL query for every column in your table. The default for this time limit is 50ms to account for the fact that it needs to run once for every column. If the time limit is exceeded the column will not be suggested as a facet.
You can increase this time limit like so::
datasette mydatabase.db --limit facet_suggest_time_limit_ms:500

View file

@ -46,39 +46,6 @@ statements can be used to change database settings at runtime. If you need to
include the string "pragma" in a query you can do so safely using a named
parameter.
Query limits
------------
To prevent rogue, long-running queries from making a Datasette instance
inaccessible to other users, Datasette imposes some limits on the SQL that you
can execute.
By default, queries have a time limit of one second. If a query takes longer
than this to run Datasette will terminate the query and return an error.
If this time limit is too short for you, you can customize it using the
``sql_time_limit_ms`` option - for example, to increase it to 3.5 seconds::
datasette mydatabase.db --sql_time_limit_ms=3500
You can optionally set a lower time limit for an individual query using the
``_timelimit`` query string argument::
/my-database/my-table?qSpecies=44&_timelimit=100
This would set the time limit to 100ms for that specific query. This feature
is useful if you are working with databases of unknown size and complexity -
a query that might make perfect sense for a smaller table could take too long
to execute on a table with millions of rows. By setting custom time limits you
can execute queries "optimistically" - e.g. give me an exact count of rows
matching this query but only if it takes less than 100ms to calculate.
Datasette returns a maximum of 1,000 rows of data at a time. If you execute a
query that returns more than 1,000 rows, Datasette will return the first 1,000
and include a warning that the result set has been truncated. You can use
OFFSET/LIMIT or other methods in your SQL to implement pagination if you need to
return more than 1,000 rows.
Views
-----

View file

@ -21,10 +21,12 @@ def app_client(sql_time_limit_ms=None, max_returned_rows=None):
ds = Datasette(
[filepath],
page_size=50,
max_returned_rows=max_returned_rows or 100,
sql_time_limit_ms=sql_time_limit_ms or 200,
metadata=METADATA,
plugins_dir=plugins_dir,
limits={
'max_returned_rows': max_returned_rows or 100,
'sql_time_limit_ms': sql_time_limit_ms or 200,
}
)
ds.sqlite_functions.append(
('sleep', 1, lambda n: time.sleep(float(n))),