diff --git a/README.md b/README.md
index 1226dc08..2e41ea6d 100644
--- a/README.md
+++ b/README.md
@@ -114,9 +114,6 @@ http://localhost:8001/History/downloads.json?_shape=objects will return that dat
     --cors                       Enable CORS by serving Access-Control-Allow-
                                  Origin: *
     --page_size INTEGER          Page size - default is 100
-    --max_returned_rows INTEGER  Max allowed rows to return at once - default is
-                                 1000. Set to 0 to disable check entirely.
-    --sql_time_limit_ms INTEGER  Max time allowed for SQL queries in ms
     --load-extension PATH        Path to a SQLite extension to load
     --inspect-file TEXT          Path to JSON file created using "datasette
                                  inspect"
@@ -126,6 +123,8 @@ http://localhost:8001/History/downloads.json?_shape=objects will return that dat
     --plugins-dir DIRECTORY      Path to directory containing custom plugins
     --static STATIC MOUNT        mountpoint:path-to-directory for serving
                                  static files
+    --limit LIMIT                Set a limit using limitname:integer
+                                 datasette.readthedocs.io/en/latest/limits.html
     --help                       Show this message and exit.

 ## metadata.json
@@ -214,13 +213,13 @@ If you have docker installed you can use `datasette package` to create a new Doc
 Both publish and package accept an `extra_options` argument option, which will affect how the
 resulting application is executed. For example, say you want to increase the SQL time limit for a particular container:

-    datasette package parlgov.db --extra-options="--sql_time_limit_ms=2500 --page_size=10"
+    datasette package parlgov.db --extra-options="--limit sql_time_limit_ms:2500 --page_size=10"

 The resulting container will run the application with those options.

 Here's example output for the package command:

-    $ datasette package parlgov.db --extra-options="--sql_time_limit_ms=2500 --page_size=10"
+    $ datasette package parlgov.db --extra-options="--limit sql_time_limit_ms:2500 --page_size=10"
     Sending build context to Docker daemon 4.459MB
     Step 1/7 : FROM python:3
      ---> 79e1dc9af1c1
@@ -239,7 +238,7 @@ Here's example output for the package command:
     Step 6/7 : EXPOSE 8001
      ---> Using cache
      ---> 8e83844b0fed
-    Step 7/7 : CMD datasette serve parlgov.db --port 8001 --inspect-file inspect-data.json --sql_time_limit_ms=2500 --page_size=10
+    Step 7/7 : CMD datasette serve parlgov.db --port 8001 --inspect-file inspect-data.json --limit sql_time_limit_ms:2500 --page_size=10
      ---> Using cache
      ---> 1bd380ea8af3
     Successfully built 1bd380ea8af3
diff --git a/datasette/app.py b/datasette/app.py
index a37a4a45..1ba2964b 100644
--- a/datasette/app.py
+++ b/datasette/app.py
@@ -45,6 +45,15 @@ pm.add_hookspecs(hookspecs)
 pm.load_setuptools_entrypoints("datasette")


+DEFAULT_LIMITS = {
+    "max_returned_rows": 1000,
+    "sql_time_limit_ms": 1000,
+    "default_facet_size": 30,
+    "facet_time_limit_ms": 200,
+    "facet_suggest_time_limit_ms": 50,
+}
+
+
 class JsonDataView(RenderMixin):

     def __init__(self, datasette, filename, data_callback):
@@ -79,8 +88,6 @@ class Datasette:
         num_threads=3,
         cache_headers=True,
         page_size=100,
-        max_returned_rows=1000,
-        sql_time_limit_ms=1000,
         cors=False,
         inspect_data=None,
         metadata=None,
@@ -88,14 +95,13 @@ class Datasette:
         template_dir=None,
         plugins_dir=None,
         static_mounts=None,
+        limits=None,
     ):
         self.files = files
         self.num_threads = num_threads
         self.executor = futures.ThreadPoolExecutor(max_workers=num_threads)
         self.cache_headers = cache_headers
         self.page_size = page_size
-        self.max_returned_rows = max_returned_rows
-        self.sql_time_limit_ms = sql_time_limit_ms
         self.cors = cors
         self._inspect = inspect_data
         self.metadata = metadata or {}
@@ -104,6 +110,9 @@ class Datasette:
         self.template_dir = template_dir
         self.plugins_dir = plugins_dir
         self.static_mounts = static_mounts or []
+        self.limits = dict(DEFAULT_LIMITS, **(limits or {}))
+        self.max_returned_rows = self.limits["max_returned_rows"]
+        self.sql_time_limit_ms = self.limits["sql_time_limit_ms"]
         # Execute plugins in constructor, to ensure they are available
         # when the rest of `datasette inspect` executes
         if self.plugins_dir:
diff --git a/datasette/cli.py b/datasette/cli.py
index 2f9e1d43..f4818b75 100644
--- a/datasette/cli.py
+++ b/datasette/cli.py
@@ -5,7 +5,7 @@ import os
 import shutil
 from subprocess import call, check_output
 import sys
-from .app import Datasette
+from .app import Datasette, DEFAULT_LIMITS
 from .utils import temporary_docker_directory, temporary_heroku_directory


@@ -15,7 +15,8 @@ class StaticMount(click.ParamType):
     def convert(self, value, param, ctx):
         if ":" not in value:
             self.fail(
-                '"%s" should be of format mountpoint:directory' % value, param, ctx
+                '"{}" should be of format mountpoint:directory'.format(value),
+                param, ctx
             )
         path, dirpath = value.split(":")
         if not os.path.exists(dirpath) or not os.path.isdir(dirpath):
@@ -23,6 +24,26 @@ class StaticMount(click.ParamType):
         return path, dirpath


+class Limit(click.ParamType):
+    name = "limit"
+
+    def convert(self, value, param, ctx):
+        ok = True
+        if ":" not in value:
+            ok = False
+        else:
+            name, intvalue = value.split(":")
+            ok = intvalue.isdigit()
+        if not ok:
+            self.fail(
+                '"{}" should be of format name:integer'.format(value),
+                param, ctx
+            )
+        if name not in DEFAULT_LIMITS:
+            self.fail("{} is not a valid limit".format(name), param, ctx)
+        return name, int(intvalue)
+
+
 @click.group(cls=DefaultGroup, default="serve", default_if_no_args=True)
 @click.version_option()
 def cli():
@@ -364,14 +385,6 @@ def package(
     "--cors", is_flag=True, help="Enable CORS by serving Access-Control-Allow-Origin: *"
 )
 @click.option("--page_size", default=100, help="Page size - default is 100")
-@click.option(
-    "--max_returned_rows",
-    default=1000,
-    help="Max allowed rows to return at once - default is 1000. Set to 0 to disable check entirely.",
-)
-@click.option(
-    "--sql_time_limit_ms", default=1000, help="Max time allowed for SQL queries in ms"
-)
 @click.option(
     "sqlite_extensions",
     "--load-extension",
@@ -405,6 +418,12 @@ def package(
     help="mountpoint:path-to-directory for serving static files",
     multiple=True,
 )
+@click.option(
+    "--limit",
+    type=Limit(),
+    help="Set a limit using limitname:integer datasette.readthedocs.io/en/latest/limits.html",
+    multiple=True,
+)
 def serve(
     files,
     host,
@@ -413,14 +432,13 @@ def serve(
     reload,
     cors,
     page_size,
-    max_returned_rows,
-    sql_time_limit_ms,
     sqlite_extensions,
     inspect_file,
     metadata,
     template_dir,
     plugins_dir,
     static,
+    limit,
 ):
     """Serve up specified SQLite database files with a web UI"""
     if reload:
@@ -444,14 +462,13 @@ def serve(
         cache_headers=not debug and not reload,
         cors=cors,
         page_size=page_size,
-        max_returned_rows=max_returned_rows,
-        sql_time_limit_ms=sql_time_limit_ms,
         inspect_data=inspect_data,
         metadata=metadata_data,
         sqlite_extensions=sqlite_extensions,
         template_dir=template_dir,
         plugins_dir=plugins_dir,
         static_mounts=static,
+        limits=dict(limit),
     )
     # Force initial hashing/table counting
     ds.inspect()
diff --git a/datasette/views/table.py b/datasette/views/table.py
index ab66b880..29f0d8d3 100644
--- a/datasette/views/table.py
+++ b/datasette/views/table.py
@@ -536,7 +536,7 @@ class TableView(RowTableShared):
             )

         # facets support
-        FACET_SIZE = 20
+        facet_size = self.ds.limits["default_facet_size"]
         metadata_facets = table_metadata.get("facets", [])
         facets = metadata_facets[:]
         try:
@@ -553,20 +553,21 @@ class TableView(RowTableShared):
                 col=escape_sqlite(column),
                 from_sql=from_sql,
                 and_or_where='and' if where_clauses else 'where',
-                limit=FACET_SIZE+1,
+                limit=facet_size+1,
             )
             try:
                 facet_rows = await self.execute(
                     name, facet_sql, params,
-                    truncate=False, custom_time_limit=200
+                    truncate=False,
+                    custom_time_limit=self.ds.limits["facet_time_limit_ms"],
                 )
                 facet_results_values = []
                 facet_results[column] = {
                     "name": column,
                     "results": facet_results_values,
-                    "truncated": len(facet_rows) > FACET_SIZE,
+                    "truncated": len(facet_rows) > facet_size,
                 }
-                facet_rows = facet_rows[:FACET_SIZE]
+                facet_rows = facet_rows[:facet_size]
                 # Attempt to expand foreign keys into labels
                 values = [row["value"] for row in facet_rows]
                 expanded = (await self.expand_foreign_keys(
@@ -644,7 +645,6 @@ class TableView(RowTableShared):
                     pass

         # Detect suggested facets
-        FACET_LIMIT = 30
         suggested_facets = []
         for facet_column in columns:
             if facet_column in facets:
@@ -657,19 +657,20 @@ class TableView(RowTableShared):
                 column=escape_sqlite(facet_column),
                 from_sql=from_sql,
                 and_or_where='and' if where_clauses else 'where',
-                limit=FACET_LIMIT+1
+                limit=facet_size+1
             )
             distinct_values = None
             try:
                 distinct_values = await self.execute(
                     name, suggested_facet_sql, params,
-                    truncate=False, custom_time_limit=50
+                    truncate=False,
+                    custom_time_limit=self.ds.limits["facet_suggest_time_limit_ms"],
                 )
                 num_distinct_values = len(distinct_values)
                 if (
                     num_distinct_values and
                     num_distinct_values > 1 and
-                    num_distinct_values <= FACET_LIMIT and
+                    num_distinct_values <= facet_size and
                     num_distinct_values < filtered_table_rows_count
                 ):
                     suggested_facets.append({
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 0a5178d7..75ba3b31 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -101,9 +101,6 @@ datasette serve options
     --cors                       Enable CORS by serving Access-Control-Allow-
                                  Origin: *
     --page_size INTEGER          Page size - default is 100
-    --max_returned_rows INTEGER  Max allowed rows to return at once - default is
-                                 1000. Set to 0 to disable check entirely.
-    --sql_time_limit_ms INTEGER  Max time allowed for SQL queries in ms
     --load-extension PATH        Path to a SQLite extension to load
     --inspect-file TEXT          Path to JSON file created using "datasette
                                  inspect"
@@ -113,4 +110,6 @@ datasette serve options
     --plugins-dir DIRECTORY      Path to directory containing custom plugins
     --static STATIC MOUNT        mountpoint:path-to-directory for serving
                                  static files
+    --limit LIMIT                Set a limit using limitname:integer
+                                 datasette.readthedocs.io/en/latest/limits.html
     --help                       Show this message and exit.
diff --git a/docs/index.rst b/docs/index.rst
index a3c93a1e..23cb6225 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -22,6 +22,7 @@ Contents
    facets
    full_text_search
    metadata
+   limits
    custom_templates
    plugins
    changelog
diff --git a/docs/json_api.rst b/docs/json_api.rst
index 20e9f530..47c6f81b 100644
--- a/docs/json_api.rst
+++ b/docs/json_api.rst
@@ -132,7 +132,7 @@ Special table arguments
 The Datasette table view takes a number of special querystring arguments:

 ``?_size=1000`` or ``?_size=max``
-    Sets a custom page size. This cannot exceed the ``max_returned_rows`` option
+    Sets a custom page size. This cannot exceed the ``max_returned_rows`` limit
     passed to ``datasette serve``. Use ``max`` to get ``max_returned_rows``.

 ``?_sort=COLUMN``
diff --git a/docs/limits.rst b/docs/limits.rst
new file mode 100644
index 00000000..ccc0555d
--- /dev/null
+++ b/docs/limits.rst
@@ -0,0 +1,51 @@
+Limits
+======
+
+To prevent rogue, long-running queries from making a Datasette instance inaccessible to other users, Datasette imposes some limits on the SQL that you can execute.
+
+sql_time_limit_ms
+-----------------
+
+By default, queries have a time limit of one second. If a query takes longer than this to run Datasette will terminate the query and return an error.
+
+If this time limit is too short for you, you can customize it using the ``sql_time_limit_ms`` limit - for example, to increase it to 3.5 seconds::
+
+    datasette mydatabase.db --limit sql_time_limit_ms:3500
+
+You can optionally set a lower time limit for an individual query using the ``_timelimit`` query string argument::
+
+    /my-database/my-table?qSpecies=44&_timelimit=100
+
+This would set the time limit to 100ms for that specific query. This feature is useful if you are working with databases of unknown size and complexity - a query that might make perfect sense for a smaller table could take too long to execute on a table with millions of rows. By setting custom time limits you can execute queries "optimistically" - e.g. give me an exact count of rows matching this query but only if it takes less than 100ms to calculate.
+
+max_returned_rows
+-----------------
+
+Datasette returns a maximum of 1,000 rows of data at a time. If you execute a query that returns more than 1,000 rows, Datasette will return the first 1,000 and include a warning that the result set has been truncated. You can use OFFSET/LIMIT or other methods in your SQL to implement pagination if you need to return more than 1,000 rows.
+
+You can increase or decrease this limit like so::
+
+    datasette mydatabase.db --limit max_returned_rows:2000
+
+default_facet_size
+------------------
+
+The default number of unique rows returned by :ref:`facets` is 30. You can customize it like this::
+
+    datasette mydatabase.db --limit default_facet_size:50
+
+facet_time_limit_ms
+-------------------
+
+This is the time limit Datasette allows for calculating a facet, which defaults to 200ms::
+
+    datasette mydatabase.db --limit facet_time_limit_ms:1000
+
+facet_suggest_time_limit_ms
+---------------------------
+
+When Datasette calculates suggested facets it needs to run a SQL query for every column in your table. The default for this time limit is 50ms to account for the fact that it needs to run once for every column. If the time limit is exceeded the column will not be suggested as a facet.
+
+You can increase this time limit like so::
+
+    datasette mydatabase.db --limit facet_suggest_time_limit_ms:500
diff --git a/docs/sql_queries.rst b/docs/sql_queries.rst
index e02fad3d..0355d9d5 100644
--- a/docs/sql_queries.rst
+++ b/docs/sql_queries.rst
@@ -46,39 +46,6 @@ statements can be used to change database settings at runtime. If you need to
 include the string "pragma" in a query you can do so safely using a named
 parameter.

-Query limits
-------------
-
-To prevent rogue, long-running queries from making a Datasette instance
-inaccessible to other users, Datasette imposes some limits on the SQL that you
-can execute.
-
-By default, queries have a time limit of one second. If a query takes longer
-than this to run Datasette will terminate the query and return an error.
-
-If this time limit is too short for you, you can customize it using the
-``sql_time_limit_ms`` option - for example, to increase it to 3.5 seconds::
-
-    datasette mydatabase.db --sql_time_limit_ms=3500
-
-You can optionally set a lower time limit for an individual query using the
-``_timelimit`` query string argument::
-
-    /my-database/my-table?qSpecies=44&_timelimit=100
-
-This would set the time limit to 100ms for that specific query. This feature
-is useful if you are working with databases of unknown size and complexity -
-a query that might make perfect sense for a smaller table could take too long
-to execute on a table with millions of rows. By setting custom time limits you
-can execute queries "optimistically" - e.g. give me an exact count of rows
-matching this query but only if it takes less than 100ms to calculate.
-
-Datasette returns a maximum of 1,000 rows of data at a time. If you execute a
-query that returns more than 1,000 rows, Datasette will return the first 1,000
-and include a warning that the result set has been truncated. You can use
-OFFSET/LIMIT or other methods in your SQL to implement pagination if you need to
-return more than 1,000 rows.
-
 Views
 -----

diff --git a/tests/fixtures.py b/tests/fixtures.py
index 5def4292..436fa447 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -21,10 +21,12 @@ def app_client(sql_time_limit_ms=None, max_returned_rows=None):
         ds = Datasette(
             [filepath],
             page_size=50,
-            max_returned_rows=max_returned_rows or 100,
-            sql_time_limit_ms=sql_time_limit_ms or 200,
             metadata=METADATA,
             plugins_dir=plugins_dir,
+            limits={
+                'max_returned_rows': max_returned_rows or 100,
+                'sql_time_limit_ms': sql_time_limit_ms or 200,
+            }
         )
         ds.sqlite_functions.append(
             ('sleep', 1, lambda n: time.sleep(float(n))),
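
A minimal usage sketch for reviewers trying the change locally. It assumes only the `--limit` CLI option and the `limits=` constructor argument added in this diff; the `mydatabase.db` filename and the chosen values are placeholders, not part of the patch:

    # Command line: --limit may be repeated, each value in limitname:integer form
    #
    #     datasette mydatabase.db --limit sql_time_limit_ms:3500 --limit default_facet_size:50
    #
    # Programmatic equivalent via the new limits= keyword argument:
    from datasette.app import Datasette, DEFAULT_LIMITS

    ds = Datasette(
        ["mydatabase.db"],
        limits={
            "sql_time_limit_ms": 3500,   # allow SQL queries up to 3.5 seconds
            "default_facet_size": 50,    # return up to 50 values per facet
        },
    )

    # Anything not overridden falls back to DEFAULT_LIMITS, because the
    # constructor builds self.limits with dict(DEFAULT_LIMITS, **(limits or {})).
    assert ds.sql_time_limit_ms == 3500
    assert ds.limits["max_returned_rows"] == DEFAULT_LIMITS["max_returned_rows"]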