From 538d91c44ab1942456a94f57d7000dc432e6ec09 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 13 Apr 2019 13:03:59 -0700 Subject: [PATCH 01/17] WIP refactoring facets to plugin, refs #427 --- datasette/facets.py | 171 +++++++++++++++++++++++++++++++++++++++ datasette/hookspecs.py | 5 ++ datasette/plugins.py | 1 + datasette/views/table.py | 119 +++++---------------------- 4 files changed, 199 insertions(+), 97 deletions(-) create mode 100644 datasette/facets.py diff --git a/datasette/facets.py b/datasette/facets.py new file mode 100644 index 00000000..a1467d4d --- /dev/null +++ b/datasette/facets.py @@ -0,0 +1,171 @@ +from sanic.request import RequestParameters +import urllib +from datasette import hookimpl +from datasette.utils import ( + escape_sqlite, + path_with_added_args, + path_with_removed_args, + detect_json1 +) + + +@hookimpl +def register_facet_classes(): + return [ColumnFacet] + # classes = [ColumnFacet, ManyToManyFacet] + # if detect_json1(): + # classes.append(ArrayFacet) + # return classes + + +class Facet: + type = None + + def __init__(self, ds, request, database, table, configs): + self.ds = ds + self.request = request + self.database = database + self.table = table # can be None + self.configs = configs + + async def suggest(self, sql, params): + raise NotImplementedError + + async def facet_results(self, sql, params): + # returns ([results], [timed_out]) + raise NotImplementedError + + +class ColumnFacet(Facet): + # This is the default so type="" + type = "" + + async def suggest(self, sql, params, filtered_table_rows_count): + # Detect column names + columns = ( + await self.ds.execute( + self.database, "select * from ({}) limit 0".format(sql), + params + ) + ).columns + facet_size = self.ds.config("default_facet_size") + suggested_facets = [] + for column in columns: + suggested_facet_sql = ''' + select distinct {column} from ( + {sql} + ) where {column} is not null + limit {limit} + '''.format( + column=escape_sqlite(column), + sql=sql, + limit=facet_size+1 + ) + distinct_values = None + try: + distinct_values = await self.ds.execute( + self.database, suggested_facet_sql, params, + truncate=False, + custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), + ) + num_distinct_values = len(distinct_values) + if ( + num_distinct_values and + num_distinct_values > 1 and + num_distinct_values <= facet_size and + num_distinct_values < filtered_table_rows_count + ): + suggested_facets.append({ + 'name': column, + 'toggle_url': self.ds.absolute_url( + self.request, path_with_added_args( + self.request, {"_facet": column} + ) + ), + }) + except InterruptedError: + pass + return suggested_facets + + async def facet_results(self, sql, params): + # self.configs should be a plain list of columns + facet_results = {} + facets_timed_out = [] + + # TODO: refactor this + args = RequestParameters( + urllib.parse.parse_qs(self.request.query_string, keep_blank_values=True) + ) + other_args = {} + for key, value in args.items(): + if key.startswith("_") and "__" not in key: + pass + else: + other_args[key] = value[0] + + facet_size = self.ds.config("default_facet_size") + for column in self.configs: + facet_sql = """ + select {col} as value, count(*) as count from ( + {sql} + ) + where {col} is not null + group by {col} order by count desc limit {limit} + """.format( + col=escape_sqlite(column), + sql=sql, + limit=facet_size+1, + ) + try: + facet_rows_results = await self.ds.execute( + self.database, facet_sql, params, + truncate=False, + 
custom_time_limit=self.ds.config("facet_time_limit_ms"), + ) + facet_results_values = [] + facet_results[column] = { + "name": column, + "results": facet_results_values, + "truncated": len(facet_rows_results) > facet_size, + } + facet_rows = facet_rows_results.rows[:facet_size] + if self.table: + # Attempt to expand foreign keys into labels + values = [row["value"] for row in facet_rows] + expanded = (await self.ds.expand_foreign_keys( + self.database, self.table, column, values + )) + else: + expanded = {} + for row in facet_rows: + selected = str(other_args.get(column)) == str(row["value"]) + if selected: + toggle_path = path_with_removed_args( + self.request, {column: str(row["value"])} + ) + else: + toggle_path = path_with_added_args( + self.request, {column: row["value"]} + ) + facet_results_values.append({ + "value": row["value"], + "label": expanded.get( + (column, row["value"]), + row["value"] + ), + "count": row["count"], + "toggle_url": self.ds.absolute_url(self.request, toggle_path), + "selected": selected, + }) + except InterruptedError: + facets_timed_out.append(column) + + return facet_results, facets_timed_out + + +class ManyToManyFacet(Facet): + type = "m2m" + + +class ArrayFacet(Facet): + type = "array" diff --git a/datasette/hookspecs.py b/datasette/hookspecs.py index 6db95344..d244ba70 100644 --- a/datasette/hookspecs.py +++ b/datasette/hookspecs.py @@ -38,3 +38,8 @@ def publish_subcommand(publish): @hookspec(firstresult=True) def render_cell(value, column, table, database, datasette): "Customize rendering of HTML table cell values" + + +@hookspec +def register_facet_classes(): + "Register Facet subclasses" diff --git a/datasette/plugins.py b/datasette/plugins.py index 2d2c62e4..245df6b3 100644 --- a/datasette/plugins.py +++ b/datasette/plugins.py @@ -5,6 +5,7 @@ from . 
import hookspecs DEFAULT_PLUGINS = ( "datasette.publish.heroku", "datasette.publish.now", + "datasette.facets", ) pm = pluggy.PluginManager("datasette") diff --git a/datasette/views/table.py b/datasette/views/table.py index 5923ac92..3a26d247 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -1,4 +1,5 @@ import urllib +import itertools import jinja2 from sanic.exceptions import NotFound @@ -478,7 +479,7 @@ class TableView(RowTableShared): ) # facets support - facet_size = self.ds.config("default_facet_size") + # pylint: disable=no-member metadata_facets = table_metadata.get("facets", []) facets = metadata_facets[:] if request.args.get("_facet") and not self.ds.config("allow_facet"): @@ -487,61 +488,21 @@ class TableView(RowTableShared): facets.extend(request.args["_facet"]) except KeyError: pass + facet_classes = list( + itertools.chain.from_iterable(pm.hook.register_facet_classes()) + ) facet_results = {} facets_timed_out = [] - for column in facets: - if _next: - continue - facet_sql = """ - select {col} as value, count(*) as count - {from_sql} {and_or_where} {col} is not null - group by {col} order by count desc limit {limit} - """.format( - col=escape_sqlite(column), - from_sql=from_sql, - and_or_where='and' if from_sql_where_clauses else 'where', - limit=facet_size+1, + facet_instances = [] + for klass in facet_classes: + facet_instances.append(klass(self.ds, request, database, table, configs=facets)) + + for facet in facet_instances: + instance_facet_results, instance_facets_timed_out = await facet.facet_results( + sql, params, ) - try: - facet_rows_results = await self.ds.execute( - database, facet_sql, params, - truncate=False, - custom_time_limit=self.ds.config("facet_time_limit_ms"), - ) - facet_results_values = [] - facet_results[column] = { - "name": column, - "results": facet_results_values, - "truncated": len(facet_rows_results) > facet_size, - } - facet_rows = facet_rows_results.rows[:facet_size] - # Attempt to expand foreign keys into labels - values = [row["value"] for row in facet_rows] - expanded = (await self.ds.expand_foreign_keys( - database, table, column, values - )) - for row in facet_rows: - selected = str(other_args.get(column)) == str(row["value"]) - if selected: - toggle_path = path_with_removed_args( - request, {column: str(row["value"])} - ) - else: - toggle_path = path_with_added_args( - request, {column: row["value"]} - ) - facet_results_values.append({ - "value": row["value"], - "label": expanded.get( - (column, row["value"]), - row["value"] - ), - "count": row["count"], - "toggle_url": self.ds.absolute_url(request, toggle_path), - "selected": selected, - }) - except InterruptedError: - facets_timed_out.append(column) + facet_results.update(instance_facet_results) + facets_timed_out.extend(instance_facets_timed_out) columns = [r[0] for r in results.description] rows = list(results.rows) @@ -637,50 +598,14 @@ class TableView(RowTableShared): except InterruptedError: pass - # Detect suggested facets - suggested_facets = [] - if self.ds.config("suggest_facets") and self.ds.config("allow_facet"): - for facet_column in columns: - if facet_column in facets: - continue - if _next: - continue - if not self.ds.config("suggest_facets"): - continue - suggested_facet_sql = ''' - select distinct {column} {from_sql} - {and_or_where} {column} is not null - limit {limit} - '''.format( - column=escape_sqlite(facet_column), - from_sql=from_sql, - and_or_where='and' if from_sql_where_clauses else 'where', - limit=facet_size+1 - ) - distinct_values = 
None - try: - distinct_values = await self.ds.execute( - database, suggested_facet_sql, from_sql_params, - truncate=False, - custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), - ) - num_distinct_values = len(distinct_values) - if ( - num_distinct_values and - num_distinct_values > 1 and - num_distinct_values <= facet_size and - num_distinct_values < filtered_table_rows_count - ): - suggested_facets.append({ - 'name': facet_column, - 'toggle_url': self.ds.absolute_url( - request, path_with_added_args( - request, {"_facet": facet_column} - ) - ), - }) - except InterruptedError: - pass + # Detect suggested facets + suggested_facets = [] + + if self.ds.config("suggest_facets") and self.ds.config("allow_facet") and not _next: + for facet in facet_instances: + # TODO: ensure facet is not suggested if it is already active + # used to use 'if facet_column in facets' for this + suggested_facets.extend(await facet.suggest(sql, params, filtered_table_rows_count)) # human_description_en combines filters AND search, if provided human_description_en = filters.human_description_en(extra=search_descriptions) From 62810f8f7af56f44505d80062ea39245c43aa920 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 15 Apr 2019 07:01:39 -0700 Subject: [PATCH 02/17] Continue sketching out new facet design, refs #427 Includes various new partially implemented facet classes, to help exercise the API design. --- datasette/app.py | 10 +- datasette/facets.py | 519 ++++++++++++++++++++++++++++++++- datasette/templates/table.html | 2 +- datasette/views/base.py | 4 +- datasette/views/table.py | 19 +- 5 files changed, 524 insertions(+), 30 deletions(-) diff --git a/datasette/app.py b/datasette/app.py index 460464ab..a4b8f7f0 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -579,6 +579,7 @@ class Datasette: truncate=False, custom_time_limit=None, page_size=None, + log_sql_errors=True, ): """Executes sql against db_name in a thread""" page_size = page_size or self.page_size @@ -605,11 +606,12 @@ class Datasette: except sqlite3.OperationalError as e: if e.args == ('interrupted',): raise InterruptedError(e) - print( - "ERROR: conn={}, sql = {}, params = {}: {}".format( - conn, repr(sql), params, e + if log_sql_errors: + print( + "ERROR: conn={}, sql = {}, params = {}: {}".format( + conn, repr(sql), params, e + ) ) - ) raise if truncate: diff --git a/datasette/facets.py b/datasette/facets.py index a1467d4d..f1e4bbe7 100644 --- a/datasette/facets.py +++ b/datasette/facets.py @@ -1,17 +1,48 @@ from sanic.request import RequestParameters import urllib +import re from datasette import hookimpl from datasette.utils import ( escape_sqlite, + get_all_foreign_keys, path_with_added_args, path_with_removed_args, - detect_json1 + detect_json1, + InvalidSql, + sqlite3, ) +def load_facet_configs(request, table_metadata): + # Given a request and this tables metadata, return + # a dict of selected facets and their configs + # return {type, [config1, config2]...} + facet_configs = {} + #metadata_facets = table_metadata.get("facets", []) + #facets = metadata_facets[:] + args = RequestParameters( + urllib.parse.parse_qs(request.query_string, keep_blank_values=True) + ) + for key, values in args.items(): + if key.startswith("_facet"): + # Figure out the facet type + if key == "_facet": + type = "column" + elif key.startswith("_facet_"): + type = key[len("_facet_"):] + for value in values: + # The value is the config - either JSON or not + if value.startswith("{"): + config = json.loads(value) + else: + config = {"single": 
value} + facet_configs.setdefault(type, []).append(config) + return facet_configs + + @hookimpl def register_facet_classes(): - return [ColumnFacet] + return [ColumnFacet, ArrayFacet, ManyToManyFacet, DateFacet, EmojiFacet, PhrasesFacet] # classes = [ColumnFacet, ManyToManyFacet] # if detect_json1(): # classes.append(ArrayFacet) @@ -28,26 +59,30 @@ class Facet: self.table = table # can be None self.configs = configs - async def suggest(self, sql, params): - raise NotImplementedError + async def suggest(self, sql, params, filtered_table_rows_count): + return [] async def facet_results(self, sql, params): # returns ([results], [timed_out]) + # TODO: Include "hideable" with each one somehow, which indicates if it was + # defined in metadata (in which case you cannot turn it off) raise NotImplementedError + async def get_columns(self, sql, params=None): + return ( + await self.ds.execute( + self.database, "select * from ({}) limit 0".format(sql), + params or [] + ) + ).columns + class ColumnFacet(Facet): - # This is the default so type="" - type = "" + type = "column" async def suggest(self, sql, params, filtered_table_rows_count): - # Detect column names - columns = ( - await self.ds.execute( - self.database, "select * from ({}) limit 0".format(sql), - params - ) - ).columns + # Detect column names using the "limit 0" trick + columns = await self.get_columns(sql, params) facet_size = self.ds.config("default_facet_size") suggested_facets = [] for column in columns: @@ -104,7 +139,9 @@ class ColumnFacet(Facet): other_args[key] = value[0] facet_size = self.ds.config("default_facet_size") - for column in self.configs: + for config in (self.configs or []): + column = config.get("column") or config["single"] + # TODO: does this query break if inner sql produces value or count columns? facet_sql = """ select {col} as value, count(*) as count from ( {sql} @@ -166,6 +203,460 @@ class ColumnFacet(Facet): class ManyToManyFacet(Facet): type = "m2m" + async def suggest(self, sql, params, filtered_table_rows_count): + # This is calculated based on foreign key relationships to this table + # Are there any many-to-many tables pointing here? + suggested_facets = [] + all_foreign_keys = await self.ds.execute_against_connection_in_thread( + self.database, get_all_foreign_keys + ) + incoming = all_foreign_keys[self.table]["incoming"] + # Do any of these incoming tables have exactly two outgoing keys? + for fk in incoming: + other_table = fk["other_table"] + other_table_outgoing_foreign_keys = all_foreign_keys[other_table]["outgoing"] + if len(other_table_outgoing_foreign_keys) == 2: + suggested_facets.append({ + "name": other_table, + "type": "m2m", + "toggle_url": self.ds.absolute_url( + self.request, path_with_added_args( + self.request, {"_facet_m2m": other_table} + ) + ), + }) + return suggested_facets + + async def facet_results(self, *args, **kwargs): + + return [], [] + class ArrayFacet(Facet): type = "array" + + async def suggest(self, sql, params, filtered_table_rows_count): + columns = await self.get_columns(sql, params) + suggested_facets = [] + for column in columns: + # Is every value in this column either null or a JSON array? 
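+            # json_type() returns 'array' for JSON arrays and NULL for SQL
+            # nulls, and raises an error for values that are not valid JSON -
+            # that error surfaces as sqlite3.OperationalError, caught below -
+            # so a column only qualifies if its distinct types come back as
+            # ("array",) or ("array", None)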
+ suggested_facet_sql = """ + select distinct json_type({column}) + from ({sql}) + """.format( + column=escape_sqlite(column), + sql=sql, + ) + try: + results = await self.ds.execute( + self.database, suggested_facet_sql, params, + truncate=False, + custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), + log_sql_errors=False, + ) + types = tuple(r[0] for r in results.rows) + if types in ( + ("array",), + ("array", None) + ): + suggested_facets.append({ + "name": column, + "type": "array", + "toggle_url": self.ds.absolute_url( + self.request, path_with_added_args( + self.request, {"_facet_array": column} + ) + ), + }) + except (InterruptedError, sqlite3.OperationalError): + continue + return suggested_facets + + async def facet_results(self, sql, params): + # self.configs should be a plain list of columns + facet_results = {} + facets_timed_out = [] + + # TODO: refactor this + args = RequestParameters( + urllib.parse.parse_qs(self.request.query_string, keep_blank_values=True) + ) + other_args = {} + for key, value in args.items(): + if key.startswith("_") and "__" not in key: + pass + else: + other_args[key] = value[0] + + facet_size = self.ds.config("default_facet_size") + for config in (self.configs or []): + column = config.get("column") or config["single"] + facet_sql = """ + select j.value as value, count(*) as count from ( + {sql} + ) join json_each({col}) j + group by j.value order by count desc limit {limit} + """.format( + col=escape_sqlite(column), + sql=sql, + limit=facet_size+1, + ) + try: + facet_rows_results = await self.ds.execute( + self.database, facet_sql, params, + truncate=False, + custom_time_limit=self.ds.config("facet_time_limit_ms"), + ) + facet_results_values = [] + facet_results[column] = { + "name": column, + "results": facet_results_values, + "truncated": len(facet_rows_results) > facet_size, + } + facet_rows = facet_rows_results.rows[:facet_size] + for row in facet_rows: + selected = str(other_args.get(column)) == str(row["value"]) + if selected: + toggle_path = path_with_removed_args( + self.request, {"{}__arraycontains".format(column): str(row["value"])} + ) + else: + toggle_path = path_with_added_args( + self.request, {"{}__arraycontains".format(column): row["value"]} + ) + facet_results_values.append({ + "value": row["value"], + "label": row["value"], + "count": row["count"], + "toggle_url": self.ds.absolute_url(self.request, toggle_path), + "selected": selected, + }) + except InterruptedError: + facets_timed_out.append(column) + + return facet_results, facets_timed_out + + + +class DateFacet(Facet): + type = "date" + + async def suggest(self, sql, params, filtered_table_rows_count): + columns = await self.get_columns(sql, params) + suggested_facets = [] + for column in columns: + # Does this column contain any dates in the first 100 rows? 
+ suggested_facet_sql = """ + select date({column}) from ( + {sql} + ) limit 100; + """.format( + column=escape_sqlite(column), + sql=sql, + ) + try: + results = await self.ds.execute( + self.database, suggested_facet_sql, params, + truncate=False, + custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), + log_sql_errors=False, + ) + values = tuple(r[0] for r in results.rows) + if (any(values)): + suggested_facets.append({ + "name": column, + "type": "date", + "toggle_url": self.ds.absolute_url( + self.request, path_with_added_args( + self.request, {"_facet_date": column} + ) + ), + }) + except (InterruptedError, sqlite3.OperationalError): + continue + return suggested_facets + + async def facet_results(self, sql, params): + # self.configs should be a plain list of columns + facet_results = {} + facets_timed_out = [] + + # TODO: refactor this + args = RequestParameters( + urllib.parse.parse_qs(self.request.query_string, keep_blank_values=True) + ) + other_args = {} + for key, value in args.items(): + if key.startswith("_") and "__" not in key: + pass + else: + other_args[key] = value[0] + + facet_size = self.ds.config("default_facet_size") + for config in (self.configs or []): + column = config.get("column") or config["single"] + # TODO: does this query break if inner sql produces value or count columns? + facet_sql = """ + select date({col}) as value, count(*) as count from ( + {sql} + ) + where date({col}) is not null + group by date({col}) order by date({col}) desc limit {limit} + """.format( + col=escape_sqlite(column), + sql=sql, + limit=facet_size+1, + ) + try: + facet_rows_results = await self.ds.execute( + self.database, facet_sql, params, + truncate=False, + custom_time_limit=self.ds.config("facet_time_limit_ms"), + ) + facet_results_values = [] + facet_results[column] = { + "name": column, + "results": facet_results_values, + "truncated": len(facet_rows_results) > facet_size, + } + facet_rows = facet_rows_results.rows[:facet_size] + for row in facet_rows: + selected = str(other_args.get("{}__date".format(column))) == str(row["value"]) + if selected: + toggle_path = path_with_removed_args( + self.request, {"{}__date".format(column): str(row["value"])} + ) + else: + toggle_path = path_with_added_args( + self.request, {"{}__date".format(column): row["value"]} + ) + facet_results_values.append({ + "value": row["value"], + "label": row["value"], + "count": row["count"], + "toggle_url": self.ds.absolute_url(self.request, toggle_path), + "selected": selected, + }) + except InterruptedError: + facets_timed_out.append(column) + + return facet_results, facets_timed_out + + + +class PhrasesFacet(Facet): + type = "phrases" + + async def facet_results(self, sql, params): + # Hmm... 
for this one we actually need the column name(s) AND the word list
+        # Current design supports one of the following:
+        #   ?_facet_phrases=column:word1,word2,word3
+        # which means we could support multiple columns like so:
+        #   ?_facet_phrases=column1:column2:word1,word2,word3
+        # As JSON:
+        #   ?_facet_phrases={"columns":["column1","column2"],"phrases":["word1","word2"]}
+        # Urgh, the filter option when one is selected is going to be pretty nasty
+        facet_results = {}
+        facets_timed_out = []
+
+        facet_size = self.ds.config("default_facet_size")
+        for config in (self.configs or []):
+            if isinstance(config, dict) and "single" in config:
+                config = config["single"]
+            if isinstance(config, str):
+                columns = config.rsplit(":", 1)[0].split(":")
+                phrases = config.rsplit(":", 1)[1].split(",")
+            else:
+                columns = config["columns"]
+                phrases = config["phrases"]
+            # FOR THE MOMENT only support one column
+            column = columns[0]
+            facet_sql = """
+                select count(*) as count, j.value as value
+                from (
+                    select extract_phrases_json({col}, '{json_phrases}') as a from (
+                        {sql}
+                    )
+                )
+                join json_each(a) j
+                group by j.value order by count desc limit {limit}
+            """.format(
+                col=escape_sqlite(column),
+                sql=sql,
+                # TODO: this will break if any phrases contain '
+                json_phrases=json.dumps(phrases),
+                limit=facet_size+1,
+            )
+            try:
+                facet_rows_results = await self.ds.execute(
+                    self.database, facet_sql, params,
+                    truncate=False,
+                    custom_time_limit=self.ds.config("facet_time_limit_ms"),
+                )
+                facet_results_values = []
+                facet_results[column] = {
+                    "name": column,
+                    "results": facet_results_values,
+                    "truncated": len(facet_rows_results) > facet_size,
+                }
+                facet_rows = facet_rows_results.rows[:facet_size]
+                for row in facet_rows:
+                    facet_results_values.append({
+                        "value": row["value"],
+                        "label": row["value"],
+                        "count": row["count"],
+                        # TODO: toggle_url for selected
+                        "toggle_url": "",  # self.ds.absolute_url(self.request, toggle_path),
+                        # TODO: identify selected
+                        "selected": False,
+                    })
+            except InterruptedError:
+                facets_timed_out.append(column)
+
+        return facet_results, facets_timed_out
+
+
+class EmojiFacet(Facet):
+    type = "emoji"
+
+    async def suggest(self, sql, params, filtered_table_rows_count):
+        columns = await self.get_columns(sql, params)
+        suggested_facets = []
+        for column in columns:
+            # Does any value in this column contain at least one emoji?
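+            # (extract_emoji() is a custom SQL function registered against
+            # each connection in prepare_connection(), below)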
+ suggested_facet_sql = """ + select extract_emoji({column}) as emojis + from ({sql}) where emojis != "" limit 1 + """.format( + column=escape_sqlite(column), + sql=sql, + ) + try: + results = await self.ds.execute( + self.database, suggested_facet_sql, params, + truncate=False, + custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), + log_sql_errors=True, + ) + if results.rows: + suggested_facets.append({ + "name": column, + "type": "emoji", + "toggle_url": self.ds.absolute_url( + self.request, path_with_added_args( + self.request, {"_facet_emoji": column} + ) + ), + }) + except (InterruptedError, sqlite3.OperationalError) as e: + print(" oh no ", e) + continue + return suggested_facets + + async def facet_results(self, *args, **kwargs): + return [], [] + + +@hookimpl +def prepare_connection(conn): + conn.create_function("extract_emoji", 1, extract_emoji) + conn.create_function("extract_emoji_json", 1, extract_emoji_json) + conn.create_function("extract_phrases_json", 2, extract_phrases_json) + conn.create_function("extract_name_json", 1, extract_name_json) + conn.create_function("decode_punycode", 1, decode_punycode) + + +import json + +def extract_emoji(s): + if not isinstance(s, str): + return "" + try: + return "".join(emoji_re.findall(s)) + except Exception as e: + print(e) + raise + + +def extract_emoji_json(s): + try: + if not isinstance(s, str): + return "[]" + return json.dumps(list(set([ + c.encode("punycode").decode("latin1") for c in emoji_re.findall(s) + ]))) + except Exception as e: + print(e) + raise + + +def extract_name_json(s): + try: + if not isinstance(s, str): + return "[]" + return json.dumps(list(set([m.group(0) for m in name_re.finditer(s)]))) + except Exception as e: + print(e) + raise + + +def extract_phrases_json(s, phrases): + # phrases is a '["json", "list", "of", "phrases"]' + if not isinstance(s, str): + return "[]" + phrases_list = json.loads(phrases) + # I tried caching the regex but the performance boost was negligible + r = re.compile(r"\b{}\b".format("|".join(phrases_list)), re.I) + return json.dumps(list(set(w.lower() for w in r.findall(s)))) + + +name_re = re.compile("([A-Z][a-z]+)+( [A-Z][a-z]+)") + + + +def decode_punycode(s): + return s.encode("latin1").decode("punycode") + + +emoji_re = re.compile( + "[\xa9\xae\u203c\u2049\u2122\u2139\u2194-\u2199\u21a9-\u21aa\u231a-\u231b" + "\u2328\u23cf\u23e9-\u23f3\u23f8-\u23fa\u24c2\u25aa-\u25ab\u25b6\u25c0" + "\u25fb-\u25fe\u2600-\u2604\u260e\u2611\u2614-\u2615\u2618\u261d\u2620" + "\u2622-\u2623\u2626\u262a\u262e-\u262f\u2638-\u263a\u2640\u2642\u2648-" + "\u2653\u2660\u2663\u2665-\u2666\u2668\u267b\u267f\u2692-\u2697\u2699" + "\u269b-\u269c\u26a0-\u26a1\u26aa-\u26ab\u26b0-\u26b1\u26bd-\u26be\u26c4-" + "\u26c5\u26c8\u26ce\u26cf\u26d1\u26d3-\u26d4\u26e9-\u26ea\u26f0-\u26f5" + "\u26f7-\u26fa\u26fd\u2702\u2705\u2708-\u2709\u270a-\u270b\u270c-\u270d" + "\u270f\u2712\u2714\u2716\u271d\u2721\u2728\u2733-\u2734\u2744\u2747\u274c" + "\u274e\u2753-\u2755\u2757\u2763-\u2764\u2795-\u2797\u27a1\u27b0\u27bf" + "\u2934-\u2935\u2b05-\u2b07\u2b1b-\u2b1c\u2b50\u2b55\u3030\u303d\u3297" + "\u3299\U0001f004\U0001f0cf\U0001f170-\U0001f171\U0001f17e\U0001f17f" + "\U0001f18e\U0001f191-\U0001f19a\U0001f1e6-\U0001f1ff\U0001f201-\U0001f202" + "\U0001f21a\U0001f22f\U0001f232-\U0001f23a\U0001f250-\U0001f251\U0001f300-" + "\U0001f320\U0001f321\U0001f324-\U0001f32c\U0001f32d-\U0001f32f\U0001f330-" + "\U0001f335\U0001f336\U0001f337-\U0001f37c\U0001f37d\U0001f37e-\U0001f37f" + 
"\U0001f380-\U0001f393\U0001f396-\U0001f397\U0001f399-\U0001f39b\U0001f39e-" + "\U0001f39f\U0001f3a0-\U0001f3c4\U0001f3c5\U0001f3c6-\U0001f3ca\U0001f3cb-" + "\U0001f3ce\U0001f3cf-\U0001f3d3\U0001f3d4-\U0001f3df\U0001f3e0-\U0001f3f0" + "\U0001f3f3-\U0001f3f5\U0001f3f7\U0001f3f8-\U0001f3ff\U0001f400-\U0001f43e" + "\U0001f43f\U0001f440\U0001f441\U0001f442-\U0001f4f7\U0001f4f8\U0001f4f9-" + "\U0001f4fc\U0001f4fd\U0001f4ff\U0001f500-\U0001f53d\U0001f549-\U0001f54a" + "\U0001f54b-\U0001f54e\U0001f550-\U0001f567\U0001f56f-\U0001f570\U0001f573-" + "\U0001f579\U0001f57a\U0001f587\U0001f58a-\U0001f58d\U0001f590\U0001f595-" + "\U0001f596\U0001f5a4\U0001f5a5\U0001f5a8\U0001f5b1-\U0001f5b2\U0001f5bc" + "\U0001f5c2-\U0001f5c4\U0001f5d1-\U0001f5d3\U0001f5dc-\U0001f5de\U0001f5e1" + "\U0001f5e3\U0001f5e8\U0001f5ef\U0001f5f3\U0001f5fa\U0001f5fb-\U0001f5ff" + "\U0001f600\U0001f601-\U0001f610\U0001f611\U0001f612-\U0001f614\U0001f615" + "\U0001f616\U0001f617\U0001f618\U0001f619\U0001f61a\U0001f61b\U0001f61c-" + "\U0001f61e\U0001f61f\U0001f620-\U0001f625\U0001f626-\U0001f627\U0001f628-" + "\U0001f62b\U0001f62c\U0001f62d\U0001f62e-\U0001f62f\U0001f630-\U0001f633" + "\U0001f634\U0001f635-\U0001f640\U0001f641-\U0001f642\U0001f643-\U0001f644" + "\U0001f645-\U0001f64f\U0001f680-\U0001f6c5\U0001f6cb-\U0001f6cf\U0001f6d0" + "\U0001f6d1-\U0001f6d2\U0001f6e0-\U0001f6e5\U0001f6e9\U0001f6eb-\U0001f6ec" + "\U0001f6f0\U0001f6f3\U0001f6f4-\U0001f6f6\U0001f6f7-\U0001f6f8\U0001f910-" + "\U0001f918\U0001f919-\U0001f91e\U0001f91f\U0001f920-\U0001f927\U0001f928-" + "\U0001f92f\U0001f930\U0001f931-\U0001f932\U0001f933-\U0001f93a\U0001f93c-" + "\U0001f93e\U0001f940-\U0001f945\U0001f947-\U0001f94b\U0001f94c\U0001f950-" + "\U0001f95e\U0001f95f-\U0001f96b\U0001f980-\U0001f984\U0001f985-\U0001f991" + "\U0001f992-\U0001f997\U0001f9c0\U0001f9d0-\U0001f9e6]" +) diff --git a/datasette/templates/table.html b/datasette/templates/table.html index 1c65aa10..730a78ff 100644 --- a/datasette/templates/table.html +++ b/datasette/templates/table.html @@ -110,7 +110,7 @@ {% if suggested_facets %}

- Suggested facets: {% for facet in suggested_facets %}{{ facet.name }}{% if not loop.last %}, {% endif %}{% endfor %} + Suggested facets: {% for facet in suggested_facets %}{{ facet.name }}{% if facet.type %} ({{ facet.type }}){% endif %}{% if not loop.last %}, {% endif %}{% endfor %}

{% endif %} diff --git a/datasette/views/base.py b/datasette/views/base.py index 764ad7dd..db3956a7 100644 --- a/datasette/views/base.py +++ b/datasette/views/base.py @@ -363,7 +363,9 @@ class BaseView(RenderMixin): SQL query took too long. The time limit is controlled by the sql_time_limit_ms configuration option. - """, title="SQL Interrupted", status=400, messagge_is_html=True) + + {} + """.format(e), title="SQL Interrupted", status=400, messagge_is_html=True) except (sqlite3.OperationalError, InvalidSql) as e: raise DatasetteError(str(e), title="Invalid SQL", status=400) diff --git a/datasette/views/table.py b/datasette/views/table.py index 3a26d247..09078880 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -5,6 +5,7 @@ import jinja2 from sanic.exceptions import NotFound from sanic.request import RequestParameters +from datasette.facets import load_facet_configs from datasette.plugins import pm from datasette.utils import ( CustomRow, @@ -479,15 +480,12 @@ class TableView(RowTableShared): ) # facets support - # pylint: disable=no-member - metadata_facets = table_metadata.get("facets", []) - facets = metadata_facets[:] - if request.args.get("_facet") and not self.ds.config("allow_facet"): + if not self.ds.config("allow_facet") and any(arg.startswith("_facet") for arg in request.args): raise DatasetteError("_facet= is not allowed", status=400) - try: - facets.extend(request.args["_facet"]) - except KeyError: - pass + facet_configs = load_facet_configs(request, table_metadata) + print("facet_configs", facet_configs) + + # pylint: disable=no-member facet_classes = list( itertools.chain.from_iterable(pm.hook.register_facet_classes()) ) @@ -495,7 +493,7 @@ class TableView(RowTableShared): facets_timed_out = [] facet_instances = [] for klass in facet_classes: - facet_instances.append(klass(self.ds, request, database, table, configs=facets)) + facet_instances.append(klass(self.ds, request, database, table, configs=facet_configs.get(klass.type))) for facet in facet_instances: instance_facet_results, instance_facets_timed_out = await facet.facet_results( @@ -504,6 +502,7 @@ class TableView(RowTableShared): facet_results.update(instance_facet_results) facets_timed_out.extend(instance_facets_timed_out) + # Figure out columns and rows for the query columns = [r[0] for r in results.description] rows = list(results.rows) @@ -653,7 +652,7 @@ class TableView(RowTableShared): ), "extra_wheres_for_ui": extra_wheres_for_ui, "form_hidden_args": form_hidden_args, - "facet_hideable": lambda facet: facet not in metadata_facets, + "facet_hideable": lambda facet: facet not in [], # TODO: used to be metadata_facets fix this "is_sortable": any(c["sortable"] for c in display_columns), "path_with_replaced_args": path_with_replaced_args, "path_with_removed_args": path_with_removed_args, From 458f85871220688185176bb40a168bbd4009fd3d Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 13 Apr 2019 15:49:07 -0700 Subject: [PATCH 03/17] Slightly more interesting example link --- docs/json_api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/json_api.rst b/docs/json_api.rst index 2606f3a0..cd034568 100644 --- a/docs/json_api.rst +++ b/docs/json_api.rst @@ -218,7 +218,7 @@ The Datasette table view takes a number of special querystring arguments: Some examples: - * `facetable?_where=state="MI"&_where=city_id=3 `__ + * `facetable?_where=neighborhood like "%c%"&_where=city_id=3 `__ * `facetable?_where=city_id in (select id from facet_cities where name != "Detroit") `__ 
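  Multiple ``_where=`` parameters are combined with AND, so the first example
  above only returns rows that match both clauses.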
``?_group_count=COLUMN`` From 65e913fbbc5ff6bfce5040874278304ce1639f53 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 15 Apr 2019 14:51:20 -0700 Subject: [PATCH 04/17] Extract and refactor filters into filters.py This will help in implementing __in as a filter, refs #433 --- datasette/filters.py | 156 +++++++++++++++++++++++++++++++++++++++ datasette/utils.py | 137 ---------------------------------- datasette/views/table.py | 2 +- tests/test_filters.py | 64 ++++++++++++++++ tests/test_utils.py | 63 +--------------- 5 files changed, 222 insertions(+), 200 deletions(-) create mode 100644 datasette/filters.py create mode 100644 tests/test_filters.py diff --git a/datasette/filters.py b/datasette/filters.py new file mode 100644 index 00000000..5fd722f3 --- /dev/null +++ b/datasette/filters.py @@ -0,0 +1,156 @@ +import numbers +from .utils import detect_json1 + + +class Filter: + key = None + display = None + no_argument = False + + def where_clause(self, table, column, value, param_counter): + raise NotImplementedError + + def human_clause(self, column, value): + raise NotImplementedError + + +class TemplatedFilter(Filter): + def __init__(self, key, display, sql_template, human_template, format='{}', numeric=False, no_argument=False): + self.key = key + self.display = display + self.sql_template = sql_template + self.human_template = human_template + self.format = format + self.numeric = numeric + self.no_argument = no_argument + + def where_clause(self, table, column, value, param_counter): + converted = self.format.format(value) + if self.numeric and converted.isdigit(): + converted = int(converted) + if self.no_argument: + kwargs = { + 'c': column, + } + converted = None + else: + kwargs = { + 'c': column, + 'p': 'p{}'.format(param_counter), + 't': table, + } + return self.sql_template.format(**kwargs), converted + + def human_clause(self, column, value): + if callable(self.human_template): + template = self.human_template(column, value) + else: + template = self.human_template + if self.no_argument: + return template.format(c=column) + else: + return template.format(c=column, v=value) + + +class Filters: + _filters = [ + # key, display, sql_template, human_template, format=, numeric=, no_argument= + TemplatedFilter('exact', '=', '"{c}" = :{p}', lambda c, v: '{c} = {v}' if v.isdigit() else '{c} = "{v}"'), + TemplatedFilter('not', '!=', '"{c}" != :{p}', lambda c, v: '{c} != {v}' if v.isdigit() else '{c} != "{v}"'), + TemplatedFilter('contains', 'contains', '"{c}" like :{p}', '{c} contains "{v}"', format='%{}%'), + TemplatedFilter('endswith', 'ends with', '"{c}" like :{p}', '{c} ends with "{v}"', format='%{}'), + TemplatedFilter('startswith', 'starts with', '"{c}" like :{p}', '{c} starts with "{v}"', format='{}%'), + TemplatedFilter('gt', '>', '"{c}" > :{p}', '{c} > {v}', numeric=True), + TemplatedFilter('gte', '\u2265', '"{c}" >= :{p}', '{c} \u2265 {v}', numeric=True), + TemplatedFilter('lt', '<', '"{c}" < :{p}', '{c} < {v}', numeric=True), + TemplatedFilter('lte', '\u2264', '"{c}" <= :{p}', '{c} \u2264 {v}', numeric=True), + TemplatedFilter('glob', 'glob', '"{c}" glob :{p}', '{c} glob "{v}"'), + TemplatedFilter('like', 'like', '"{c}" like :{p}', '{c} like "{v}"'), + ] + ([TemplatedFilter('arraycontains', 'array contains', """rowid in ( + select {t}.rowid from {t}, json_each({t}.{c}) j + where j.value = :{p} + )""", '{c} contains "{v}"') + ] if detect_json1() else []) + [ + TemplatedFilter('isnull', 'is null', '"{c}" is null', '{c} is null', no_argument=True), + 
TemplatedFilter('notnull', 'is not null', '"{c}" is not null', '{c} is not null', no_argument=True), + TemplatedFilter('isblank', 'is blank', '("{c}" is null or "{c}" = "")', '{c} is blank', no_argument=True), + TemplatedFilter('notblank', 'is not blank', '("{c}" is not null and "{c}" != "")', '{c} is not blank', no_argument=True), + ] + _filters_by_key = { + f.key: f for f in _filters + } + + def __init__(self, pairs, units={}, ureg=None): + self.pairs = pairs + self.units = units + self.ureg = ureg + + def lookups(self): + "Yields (lookup, display, no_argument) pairs" + for filter in self._filters: + yield filter.key, filter.display, filter.no_argument + + def human_description_en(self, extra=None): + bits = [] + if extra: + bits.extend(extra) + for column, lookup, value in self.selections(): + filter = self._filters_by_key.get(lookup, None) + if filter: + bits.append(filter.human_clause(column, value)) + # Comma separated, with an ' and ' at the end + and_bits = [] + commas, tail = bits[:-1], bits[-1:] + if commas: + and_bits.append(', '.join(commas)) + if tail: + and_bits.append(tail[0]) + s = ' and '.join(and_bits) + if not s: + return '' + return 'where {}'.format(s) + + def selections(self): + "Yields (column, lookup, value) tuples" + for key, value in self.pairs: + if '__' in key: + column, lookup = key.rsplit('__', 1) + else: + column = key + lookup = 'exact' + yield column, lookup, value + + def has_selections(self): + return bool(self.pairs) + + def convert_unit(self, column, value): + "If the user has provided a unit in the query, convert it into the column unit, if present." + if column not in self.units: + return value + + # Try to interpret the value as a unit + value = self.ureg(value) + if isinstance(value, numbers.Number): + # It's just a bare number, assume it's the column unit + return value + + column_unit = self.ureg(self.units[column]) + return value.to(column_unit).magnitude + + def build_where_clauses(self, table): + sql_bits = [] + params = {} + i = 0 + for column, lookup, value in self.selections(): + filter = self._filters_by_key.get(lookup, None) + if filter: + sql_bit, param = filter.where_clause(table, column, self.convert_unit(column, value), i) + sql_bits.append(sql_bit) + if param is not None: + if not isinstance(param, list): + param = [param] + for individual_param in param: + param_id = 'p{}'.format(i) + params[param_id] = individual_param + i += 1 + return sql_bits, params diff --git a/datasette/utils.py b/datasette/utils.py index bb5c17d6..0c161ac6 100644 --- a/datasette/utils.py +++ b/datasette/utils.py @@ -584,143 +584,6 @@ def table_columns(conn, table): ] -class Filter: - def __init__(self, key, display, sql_template, human_template, format='{}', numeric=False, no_argument=False): - self.key = key - self.display = display - self.sql_template = sql_template - self.human_template = human_template - self.format = format - self.numeric = numeric - self.no_argument = no_argument - - def where_clause(self, table, column, value, param_counter): - converted = self.format.format(value) - if self.numeric and converted.isdigit(): - converted = int(converted) - if self.no_argument: - kwargs = { - 'c': column, - } - converted = None - else: - kwargs = { - 'c': column, - 'p': 'p{}'.format(param_counter), - 't': table, - } - return self.sql_template.format(**kwargs), converted - - def human_clause(self, column, value): - if callable(self.human_template): - template = self.human_template(column, value) - else: - template = self.human_template - if 
self.no_argument: - return template.format(c=column) - else: - return template.format(c=column, v=value) - - -class Filters: - _filters = [ - # key, display, sql_template, human_template, format=, numeric=, no_argument= - Filter('exact', '=', '"{c}" = :{p}', lambda c, v: '{c} = {v}' if v.isdigit() else '{c} = "{v}"'), - Filter('not', '!=', '"{c}" != :{p}', lambda c, v: '{c} != {v}' if v.isdigit() else '{c} != "{v}"'), - Filter('contains', 'contains', '"{c}" like :{p}', '{c} contains "{v}"', format='%{}%'), - Filter('endswith', 'ends with', '"{c}" like :{p}', '{c} ends with "{v}"', format='%{}'), - Filter('startswith', 'starts with', '"{c}" like :{p}', '{c} starts with "{v}"', format='{}%'), - Filter('gt', '>', '"{c}" > :{p}', '{c} > {v}', numeric=True), - Filter('gte', '\u2265', '"{c}" >= :{p}', '{c} \u2265 {v}', numeric=True), - Filter('lt', '<', '"{c}" < :{p}', '{c} < {v}', numeric=True), - Filter('lte', '\u2264', '"{c}" <= :{p}', '{c} \u2264 {v}', numeric=True), - Filter('glob', 'glob', '"{c}" glob :{p}', '{c} glob "{v}"'), - Filter('like', 'like', '"{c}" like :{p}', '{c} like "{v}"'), - ] + ([Filter('arraycontains', 'array contains', """rowid in ( - select {t}.rowid from {t}, json_each({t}.{c}) j - where j.value = :{p} - )""", '{c} contains "{v}"') - ] if detect_json1() else []) + [ - Filter('isnull', 'is null', '"{c}" is null', '{c} is null', no_argument=True), - Filter('notnull', 'is not null', '"{c}" is not null', '{c} is not null', no_argument=True), - Filter('isblank', 'is blank', '("{c}" is null or "{c}" = "")', '{c} is blank', no_argument=True), - Filter('notblank', 'is not blank', '("{c}" is not null and "{c}" != "")', '{c} is not blank', no_argument=True), - ] - _filters_by_key = { - f.key: f for f in _filters - } - - def __init__(self, pairs, units={}, ureg=None): - self.pairs = pairs - self.units = units - self.ureg = ureg - - def lookups(self): - "Yields (lookup, display, no_argument) pairs" - for filter in self._filters: - yield filter.key, filter.display, filter.no_argument - - def human_description_en(self, extra=None): - bits = [] - if extra: - bits.extend(extra) - for column, lookup, value in self.selections(): - filter = self._filters_by_key.get(lookup, None) - if filter: - bits.append(filter.human_clause(column, value)) - # Comma separated, with an ' and ' at the end - and_bits = [] - commas, tail = bits[:-1], bits[-1:] - if commas: - and_bits.append(', '.join(commas)) - if tail: - and_bits.append(tail[0]) - s = ' and '.join(and_bits) - if not s: - return '' - return 'where {}'.format(s) - - def selections(self): - "Yields (column, lookup, value) tuples" - for key, value in self.pairs: - if '__' in key: - column, lookup = key.rsplit('__', 1) - else: - column = key - lookup = 'exact' - yield column, lookup, value - - def has_selections(self): - return bool(self.pairs) - - def convert_unit(self, column, value): - "If the user has provided a unit in the query, convert it into the column unit, if present." 
- if column not in self.units: - return value - - # Try to interpret the value as a unit - value = self.ureg(value) - if isinstance(value, numbers.Number): - # It's just a bare number, assume it's the column unit - return value - - column_unit = self.ureg(self.units[column]) - return value.to(column_unit).magnitude - - def build_where_clauses(self, table): - sql_bits = [] - params = {} - for i, (column, lookup, value) in enumerate(self.selections()): - filter = self._filters_by_key.get(lookup, None) - if filter: - sql_bit, param = filter.where_clause(table, column, self.convert_unit(column, value), i) - sql_bits.append(sql_bit) - if param is not None: - param_id = 'p{}'.format(i) - params[param_id] = param - return sql_bits, params - - filter_column_re = re.compile(r'^_filter_column_\d+$') diff --git a/datasette/views/table.py b/datasette/views/table.py index 09078880..4e05a56e 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -9,7 +9,6 @@ from datasette.facets import load_facet_configs from datasette.plugins import pm from datasette.utils import ( CustomRow, - Filters, InterruptedError, append_querystring, compound_keys_after_sql, @@ -29,6 +28,7 @@ from datasette.utils import ( urlsafe_components, value_as_boolean, ) +from datasette.filters import Filters from .base import BaseView, DatasetteError, ureg LINK_WITH_LABEL = '{label} {id}' diff --git a/tests/test_filters.py b/tests/test_filters.py new file mode 100644 index 00000000..b0cb3f34 --- /dev/null +++ b/tests/test_filters.py @@ -0,0 +1,64 @@ +from datasette.filters import Filters +import pytest + + +@pytest.mark.parametrize('args,expected_where,expected_params', [ + ( + { + 'name_english__contains': 'foo', + }, + ['"name_english" like :p0'], + ['%foo%'] + ), + ( + { + 'foo': 'bar', + 'bar__contains': 'baz', + }, + ['"bar" like :p0', '"foo" = :p1'], + ['%baz%', 'bar'] + ), + ( + { + 'foo__startswith': 'bar', + 'bar__endswith': 'baz', + }, + ['"bar" like :p0', '"foo" like :p1'], + ['%baz', 'bar%'] + ), + ( + { + 'foo__lt': '1', + 'bar__gt': '2', + 'baz__gte': '3', + 'bax__lte': '4', + }, + ['"bar" > :p0', '"bax" <= :p1', '"baz" >= :p2', '"foo" < :p3'], + [2, 4, 3, 1] + ), + ( + { + 'foo__like': '2%2', + 'zax__glob': '3*', + }, + ['"foo" like :p0', '"zax" glob :p1'], + ['2%2', '3*'] + ), + ( + { + 'foo__isnull': '1', + 'baz__isnull': '1', + 'bar__gt': '10' + }, + ['"bar" > :p0', '"baz" is null', '"foo" is null'], + [10] + ), +]) +def test_build_where(args, expected_where, expected_params): + f = Filters(sorted(args.items())) + sql_bits, actual_params = f.build_where_clauses("table") + assert expected_where == sql_bits + assert { + 'p{}'.format(i): param + for i, param in enumerate(expected_params) + } == actual_params diff --git a/tests/test_utils.py b/tests/test_utils.py index 07074e72..1ca202f4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,6 +3,7 @@ Tests for various datasette helper functions. 
""" from datasette import utils +from datasette.filters import Filters import json import os import pytest @@ -133,68 +134,6 @@ def test_custom_json_encoder(obj, expected): assert expected == actual -@pytest.mark.parametrize('args,expected_where,expected_params', [ - ( - { - 'name_english__contains': 'foo', - }, - ['"name_english" like :p0'], - ['%foo%'] - ), - ( - { - 'foo': 'bar', - 'bar__contains': 'baz', - }, - ['"bar" like :p0', '"foo" = :p1'], - ['%baz%', 'bar'] - ), - ( - { - 'foo__startswith': 'bar', - 'bar__endswith': 'baz', - }, - ['"bar" like :p0', '"foo" like :p1'], - ['%baz', 'bar%'] - ), - ( - { - 'foo__lt': '1', - 'bar__gt': '2', - 'baz__gte': '3', - 'bax__lte': '4', - }, - ['"bar" > :p0', '"bax" <= :p1', '"baz" >= :p2', '"foo" < :p3'], - [2, 4, 3, 1] - ), - ( - { - 'foo__like': '2%2', - 'zax__glob': '3*', - }, - ['"foo" like :p0', '"zax" glob :p1'], - ['2%2', '3*'] - ), - ( - { - 'foo__isnull': '1', - 'baz__isnull': '1', - 'bar__gt': '10' - }, - ['"bar" > :p0', '"baz" is null', '"foo" is null'], - [10] - ), -]) -def test_build_where(args, expected_where, expected_params): - f = utils.Filters(sorted(args.items())) - sql_bits, actual_params = f.build_where_clauses("table") - assert expected_where == sql_bits - assert { - 'p{}'.format(i): param - for i, param in enumerate(expected_params) - } == actual_params - - @pytest.mark.parametrize('bad_sql', [ 'update blah;', 'PRAGMA case_sensitive_like = true' From c4645c0f2b5446c26bffbc48604dd7d3b24f9262 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 15 Apr 2019 15:41:11 -0700 Subject: [PATCH 05/17] Documentation for filters, plus new documentation unit test https://simonwillison.net/2018/Jul/28/documentation-unit-tests/ --- docs/json_api.rst | 64 +++++++++++++++++++++++++++++++++++++++++++--- tests/test_docs.py | 18 +++++++++++++ 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/docs/json_api.rst b/docs/json_api.rst index cd034568..1ea35672 100644 --- a/docs/json_api.rst +++ b/docs/json_api.rst @@ -176,10 +176,68 @@ querystring arguments: .. _table_arguments: -Special table arguments ------------------------ +Table arguments +--------------- -The Datasette table view takes a number of special querystring arguments: +The Datasette table view takes a number of special querystring arguments. + +Column filter arguments +~~~~~~~~~~~~~~~~~~~~~~~ + +You can filter the data returned by the table based on column values using a querystring argument. + +``?column__exact=value`` or ``?_column=value`` + Returns rows where the specified column exactly matches the value. + +``?column__not=value`` + Returns rows where the column does not match the value. + +``?column__contains=value`` + Rows where the string column contains the specified value (``column like "%value%"`` in SQL). + +``?column__endswith=value`` + Rows where the string column ends with the specified value (``column like "%value"`` in SQL). + +``?column__startswith=value`` + Rows where the string column starts with the specified value (``column like "value%"`` in SQL). + +``?column__gt=value`` + Rows which are greater than the specified value. + +``?column__gte=value`` + Rows which are greater than or equal to the specified value. + +``?column__lt=value`` + Rows which are less than the specified value. + +``?column__lte=value`` + Rows which are less than or equal to the specified value. + +``?column__like=value`` + Match rows with a LIKE clause, case insensitive and with ``%`` as the wildcard character. 
+ +``?column__glob=value`` + Similar to LIKE but uses Unix wildcard syntax and is case sensitive. + +``?column__arraycontains=value`` + Works against columns that contain JSON arrays - matches if any of the values in that array match. + + This is only available if the ``json1`` SQLite extension is enabled. + +``?column__isnull=1`` + Matches rows where the column is null. + +``?column__notnull=1`` + Matches rows where the column is not null. + +``?column__isblank=1`` + Matches rows where the column is blank, meaning null or the empty string. + +``?column__notblank=1`` + Matches rows where the column is not blank. + +Special table arguments +~~~~~~~~~~~~~~~~~~~~~~~ ``?_labels=on/off`` Expand foreign key references for every possible column. See below. diff --git a/tests/test_docs.py b/tests/test_docs.py index 6f84832d..caf1cff3 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -4,6 +4,7 @@ Tests to ensure certain things are documented. from click.testing import CliRunner from datasette import app from datasette.cli import cli +from datasette.filters import Filters from pathlib import Path import pytest import re @@ -71,3 +72,20 @@ def documented_views(): @pytest.mark.parametrize("view_class", [v for v in dir(app) if v.endswith("View")]) def test_view_classes_are_documented(documented_views, view_class): assert view_class in documented_views + + +@pytest.fixture(scope="session") +def documented_table_filters(): + json_api_rst = (docs_path / "json_api.rst").read_text() + section = json_api_rst.split(".. _table_arguments:")[-1] + # Lines starting with ``?column__exact= are docs for filters + return set( + line.split("__")[1].split("=")[0] + for line in section.split("\n") + if line.startswith("``?column__") + ) + + +@pytest.mark.parametrize("filter", [f.key for f in Filters._filters]) +def test_table_filters_are_documented(documented_table_filters, filter): + assert filter in documented_table_filters From 78e9972b46d8fd0c4c7708eab10177e62d47b8d5 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 15 Apr 2019 15:42:28 -0700 Subject: [PATCH 06/17] New colname__in=x,y,z filter, closes #433 --- datasette/filters.py | 33 +++++++++++++++++++++++++++++++-- docs/json_api.rst | 9 +++++++++ tests/test_filters.py | 22 ++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/datasette/filters.py b/datasette/filters.py index 5fd722f3..abaafc5b 100644 --- a/datasette/filters.py +++ b/datasette/filters.py @@ -1,5 +1,10 @@ +import json import numbers -from .utils import detect_json1 + +from .utils import ( + detect_json1, + escape_sqlite, +) class Filter: @@ -52,6 +57,29 @@ class TemplatedFilter(Filter): return template.format(c=column, v=value) +class InFilter(Filter): + key = 'in' + display = 'in' + + def __init__(self): + pass + + def split_value(self, value): + if value.startswith("["): + return json.loads(value) + else: + return [v.strip() for v in value.split(",")] + + def where_clause(self, table, column, value, param_counter): + values = self.split_value(value) + params = [":p{}".format(param_counter + i) for i in range(len(values))] + sql = "{} in ({})".format(escape_sqlite(column), ", ".join(params)) + return sql, values + + def human_clause(self, column, value): + return "{} in {}".format(column, json.dumps(self.split_value(value))) + + class Filters: _filters = [ # key, display, sql_template, human_template, format=, numeric=, no_argument= @@ -64,8 +92,9 @@ class Filters: TemplatedFilter('gte', '\u2265', '"{c}" >= :{p}', '{c} \u2265 {v}', numeric=True), 
TemplatedFilter('lt', '<', '"{c}" < :{p}', '{c} < {v}', numeric=True), TemplatedFilter('lte', '\u2264', '"{c}" <= :{p}', '{c} \u2264 {v}', numeric=True), - TemplatedFilter('glob', 'glob', '"{c}" glob :{p}', '{c} glob "{v}"'), TemplatedFilter('like', 'like', '"{c}" like :{p}', '{c} like "{v}"'), + TemplatedFilter('glob', 'glob', '"{c}" glob :{p}', '{c} glob "{v}"'), + InFilter(), ] + ([TemplatedFilter('arraycontains', 'array contains', """rowid in ( select {t}.rowid from {t}, json_each({t}.{c}) j where j.value = :{p} diff --git a/docs/json_api.rst b/docs/json_api.rst index 1ea35672..67700224 100644 --- a/docs/json_api.rst +++ b/docs/json_api.rst @@ -219,6 +219,15 @@ You can filter the data returned by the table based on column values using a que ``?column__glob=value`` Similar to LIKE but uses Unix wildcard syntax and is case sensitive. +``?column__in=value1,value2,value3`` + Rows where column matches any of the provided values. + + You can use a comma separated string, or you can use a JSON array. + + The JSON array option is useful if one of your matching values itself contains a comma: + + ``?column__in=["value","value,with,commas"]`` + ``?column__arraycontains=value`` Works against columns that contain JSON arrays - matches if any of the values in that array match. diff --git a/tests/test_filters.py b/tests/test_filters.py index b0cb3f34..a5d6e3d0 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -53,6 +53,28 @@ import pytest ['"bar" > :p0', '"baz" is null', '"foo" is null'], [10] ), + ( + { + 'foo__in': '1,2,3', + }, + ['foo in (:p0, :p1, :p2)'], + ["1", "2", "3"] + ), + # JSON array variants of __in (useful for unexpected characters) + ( + { + 'foo__in': '[1,2,3]', + }, + ['foo in (:p0, :p1, :p2)'], + [1, 2, 3] + ), + ( + { + 'foo__in': '["dog,cat", "cat[dog]"]', + }, + ['foo in (:p0, :p1)'], + ["dog,cat", "cat[dog]"] + ), ]) def test_build_where(args, expected_where, expected_params): f = Filters(sorted(args.items())) From 661488e964b6abca405d8031e76699739d34e879 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 15 Apr 2019 15:54:54 -0700 Subject: [PATCH 07/17] New ?column__date=yyyy-mm-dd filter --- datasette/filters.py | 1 + docs/json_api.rst | 3 +++ tests/test_filters.py | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/datasette/filters.py b/datasette/filters.py index abaafc5b..483f031f 100644 --- a/datasette/filters.py +++ b/datasette/filters.py @@ -100,6 +100,7 @@ class Filters: where j.value = :{p} )""", '{c} contains "{v}"') ] if detect_json1() else []) + [ + TemplatedFilter('date', 'date', 'date({c}) = :{p}', '"{c}" is on date {v}'), TemplatedFilter('isnull', 'is null', '"{c}" is null', '{c} is null', no_argument=True), TemplatedFilter('notnull', 'is not null', '"{c}" is not null', '{c} is not null', no_argument=True), TemplatedFilter('isblank', 'is blank', '("{c}" is null or "{c}" = "")', '{c} is blank', no_argument=True), diff --git a/docs/json_api.rst b/docs/json_api.rst index 67700224..ef1b4548 100644 --- a/docs/json_api.rst +++ b/docs/json_api.rst @@ -233,6 +233,9 @@ You can filter the data returned by the table based on column values using a que This is only available if the ``json1`` SQLite extension is enabled. +``?column__date=value`` + Column is a datestamp occurring on the specified YYYY-MM-DD date, e.g. ``2018-01-02``. + ``?column__isnull=1`` Matches rows where the column is null. 
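A rough sketch of how the new ``in`` and ``date`` lookups translate to SQL via
the ``Filters`` class (the table and column names here are invented for
illustration):

    from datasette.filters import Filters

    filters = Filters(sorted([
        ("category__in", "active,pending"),
        ("created__date", "2019-04-15"),
    ]))
    where, params = filters.build_where_clauses("mytable")
    # where == ['category in (:p0, :p1)', 'date(created) = :p2']
    # params == {'p0': 'active', 'p1': 'pending', 'p2': '2019-04-15'}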
diff --git a/tests/test_filters.py b/tests/test_filters.py index a5d6e3d0..7b19c4e9 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -60,6 +60,14 @@ import pytest ['foo in (:p0, :p1, :p2)'], ["1", "2", "3"] ), + # date + ( + { + "foo__date": "1988-01-01", + }, + ["date(foo) = :p0"], + ["1988-01-01"] + ), # JSON array variants of __in (useful for unexpected characters) ( { From b495839e60fa32f2943e1d7624fcccb76b454d87 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Mon, 15 Apr 2019 16:44:17 -0700 Subject: [PATCH 08/17] Support multiple filters of the same type Closes #288 --- datasette/views/table.py | 7 ++-- tests/test_api.py | 10 +++++ tests/test_filters.py | 87 ++++++++++++++++++++++------------------ 3 files changed, 62 insertions(+), 42 deletions(-) diff --git a/datasette/views/table.py b/datasette/views/table.py index 4e05a56e..1a7231fb 100644 --- a/datasette/views/table.py +++ b/datasette/views/table.py @@ -221,13 +221,14 @@ class TableView(RowTableShared): # it can still be queried using ?_col__exact=blah special_args = {} special_args_lists = {} - other_args = {} + other_args = [] for key, value in args.items(): if key.startswith("_") and "__" not in key: special_args[key] = value[0] special_args_lists[key] = value else: - other_args[key] = value[0] + for v in value: + other_args.append((key, v)) # Handle ?_filter_column and redirect, if present redirect_params = filters_should_redirect(special_args) @@ -255,7 +256,7 @@ class TableView(RowTableShared): table_metadata = self.ds.table_metadata(database, table) units = table_metadata.get("units", {}) - filters = Filters(sorted(other_args.items()), units, ureg) + filters = Filters(sorted(other_args), units, ureg) where_clauses, params = filters.build_where_clauses(table) extra_wheres_for_ui = [] diff --git a/tests/test_api.py b/tests/test_api.py index d6f612c8..53bf1d6e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -903,6 +903,16 @@ def test_table_filter_queries(app_client, path, expected_rows): assert expected_rows == response.json['rows'] +def test_table_filter_queries_multiple_of_same_type(app_client): + response = app_client.get( + "/fixtures/simple_primary_key.json?content__not=world&content__not=hello" + ) + assert [ + ['3', ''], + ['4', 'RENDER_CELL_DEMO'] + ] == response.json['rows'] + + @pytest.mark.skipif( not detect_json1(), reason="Requires the SQLite json1 module" diff --git a/tests/test_filters.py b/tests/test_filters.py index 7b19c4e9..a905dd2e 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -4,88 +4,97 @@ import pytest @pytest.mark.parametrize('args,expected_where,expected_params', [ ( - { - 'name_english__contains': 'foo', - }, + ( + ('name_english__contains', 'foo'), + ), ['"name_english" like :p0'], ['%foo%'] ), ( - { - 'foo': 'bar', - 'bar__contains': 'baz', - }, + ( + ('foo', 'bar'), + ('bar__contains', 'baz'), + ), ['"bar" like :p0', '"foo" = :p1'], ['%baz%', 'bar'] ), ( - { - 'foo__startswith': 'bar', - 'bar__endswith': 'baz', - }, + ( + ('foo__startswith', 'bar'), + ('bar__endswith', 'baz'), + ), ['"bar" like :p0', '"foo" like :p1'], ['%baz', 'bar%'] ), ( - { - 'foo__lt': '1', - 'bar__gt': '2', - 'baz__gte': '3', - 'bax__lte': '4', - }, + ( + ('foo__lt', '1'), + ('bar__gt', '2'), + ('baz__gte', '3'), + ('bax__lte', '4'), + ), ['"bar" > :p0', '"bax" <= :p1', '"baz" >= :p2', '"foo" < :p3'], [2, 4, 3, 1] ), ( - { - 'foo__like': '2%2', - 'zax__glob': '3*', - }, + ( + ('foo__like', '2%2'), + ('zax__glob', '3*'), + ), ['"foo" like :p0', '"zax" glob :p1'], ['2%2', 
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 7b19c4e9..a905dd2e 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -4,88 +4,97 @@ import pytest
 
 @pytest.mark.parametrize('args,expected_where,expected_params', [
     (
-        {
-            'name_english__contains': 'foo',
-        },
+        (
+            ('name_english__contains', 'foo'),
+        ),
         ['"name_english" like :p0'],
         ['%foo%']
     ),
     (
-        {
-            'foo': 'bar',
-            'bar__contains': 'baz',
-        },
+        (
+            ('foo', 'bar'),
+            ('bar__contains', 'baz'),
+        ),
         ['"bar" like :p0', '"foo" = :p1'],
         ['%baz%', 'bar']
     ),
     (
-        {
-            'foo__startswith': 'bar',
-            'bar__endswith': 'baz',
-        },
+        (
+            ('foo__startswith', 'bar'),
+            ('bar__endswith', 'baz'),
+        ),
         ['"bar" like :p0', '"foo" like :p1'],
         ['%baz', 'bar%']
     ),
     (
-        {
-            'foo__lt': '1',
-            'bar__gt': '2',
-            'baz__gte': '3',
-            'bax__lte': '4',
-        },
+        (
+            ('foo__lt', '1'),
+            ('bar__gt', '2'),
+            ('baz__gte', '3'),
+            ('bax__lte', '4'),
+        ),
         ['"bar" > :p0', '"bax" <= :p1', '"baz" >= :p2', '"foo" < :p3'],
         [2, 4, 3, 1]
     ),
     (
-        {
-            'foo__like': '2%2',
-            'zax__glob': '3*',
-        },
+        (
+            ('foo__like', '2%2'),
+            ('zax__glob', '3*'),
+        ),
         ['"foo" like :p0', '"zax" glob :p1'],
         ['2%2', '3*']
     ),
+    # Multiple like arguments:
+    (
+        (
+            ('foo__like', '2%2'),
+            ('foo__like', '3%3'),
+        ),
+        ['"foo" like :p0', '"foo" like :p1'],
+        ['2%2', '3%3']
+    ),
     (
-        {
-            'foo__isnull': '1',
-            'baz__isnull': '1',
-            'bar__gt': '10'
-        },
+        (
+            ('foo__isnull', '1'),
+            ('baz__isnull', '1'),
+            ('bar__gt', '10'),
+        ),
         ['"bar" > :p0', '"baz" is null', '"foo" is null'],
         [10]
     ),
     (
-        {
-            'foo__in': '1,2,3',
-        },
+        (
+            ('foo__in', '1,2,3'),
+        ),
         ['foo in (:p0, :p1, :p2)'],
         ["1", "2", "3"]
     ),
     # date
     (
-        {
-            "foo__date": "1988-01-01",
-        },
+        (
+            ("foo__date", "1988-01-01"),
+        ),
         ["date(foo) = :p0"],
         ["1988-01-01"]
     ),
     # JSON array variants of __in (useful for unexpected characters)
     (
-        {
-            'foo__in': '[1,2,3]',
-        },
+        (
+            ('foo__in', '[1,2,3]'),
+        ),
         ['foo in (:p0, :p1, :p2)'],
         [1, 2, 3]
     ),
     (
-        {
-            'foo__in': '["dog,cat", "cat[dog]"]',
-        },
+        (
+            ('foo__in', '["dog,cat", "cat[dog]"]'),
+        ),
         ['foo in (:p0, :p1)'],
         ["dog,cat", "cat[dog]"]
     ),
 ])
 def test_build_where(args, expected_where, expected_params):
-    f = Filters(sorted(args.items()))
+    f = Filters(sorted(args))
     sql_bits, actual_params = f.build_where_clauses("table")
     assert expected_where == sql_bits
     assert {

From 63e52c0936dc4c99dec3e2e41edc85a48c0dd425 Mon Sep 17 00:00:00 2001
From: Simon Willison
Date: Sat, 13 Apr 2019 13:03:59 -0700
Subject: [PATCH 09/17] WIP refactoring facets to plugin, refs #427

---
 datasette/facets.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/datasette/facets.py b/datasette/facets.py
index f1e4bbe7..cb2e9f9a 100644
--- a/datasette/facets.py
+++ b/datasette/facets.py
@@ -59,6 +59,7 @@ class Facet:
         self.table = table  # can be None
         self.configs = configs
 
+<<<<<<< HEAD
     async def suggest(self, sql, params, filtered_table_rows_count):
         return []
 
@@ -83,6 +84,28 @@ class ColumnFacet(Facet):
     async def suggest(self, sql, params, filtered_table_rows_count):
         # Detect column names using the "limit 0" trick
         columns = await self.get_columns(sql, params)
+=======
+    async def suggest(self, sql, params):
+        raise NotImplementedError
+
+    async def facet_results(self, sql, params):
+        # returns ([results], [timed_out])
+        raise NotImplementedError
+
+
+class ColumnFacet(Facet):
+    # This is the default so type=""
+    type = ""
+
+    async def suggest(self, sql, params, filtered_table_rows_count):
+        # Detect column names
+        columns = (
+            await self.ds.execute(
+                self.database, "select * from ({}) limit 0".format(sql),
+                params
+            )
+        ).columns
+>>>>>>> WIP refactoring facets to plugin, refs #427
         facet_size = self.ds.config("default_facet_size")
         suggested_facets = []
         for column in columns:
@@ -139,9 +162,13 @@ class ColumnFacet(Facet):
                 other_args[key] = value[0]
 
         facet_size = self.ds.config("default_facet_size")
+<<<<<<< HEAD
         for config in (self.configs or []):
            column = config.get("column") or config["single"]
            # TODO: does this query break if inner sql produces value or count columns?
+=======
+        for column in self.configs:
+>>>>>>> WIP refactoring facets to plugin, refs #427
             facet_sql = """
                 select {col} as value, count(*) as count from (
                     {sql}
@@ -203,6 +230,7 @@ class ColumnFacet(Facet):
 
 class ManyToManyFacet(Facet):
     type = "m2m"
+<<<<<<< HEAD
 
     async def suggest(self, sql, params, filtered_table_rows_count):
         # This is calculated based on foreign key relationships to this table
         # Are there any many-to-many tables pointing here?
@@ -660,3 +688,8 @@ emoji_re = re.compile(
     "\U0001f95e\U0001f95f-\U0001f96b\U0001f980-\U0001f984\U0001f985-\U0001f991"
     "\U0001f992-\U0001f997\U0001f9c0\U0001f9d0-\U0001f9e6]"
 )
+=======
+
+class ArrayFacet(Facet):
+    type = "array"
+>>>>>>> WIP refactoring facets to plugin, refs #427
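The "limit 0" trick mentioned in the ColumnFacet.suggest() comment above is worth spelling out: wrapping an arbitrary query in select * from (...) limit 0 executes cheaply while still populating the cursor's column metadata. A standalone sketch of the idea, using sqlite3 directly rather than Datasette's execute() wrapper, with a made-up facetable table:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("create table facetable (city text, state text)")
    # Zero rows come back, but cursor.description still names the columns
    cursor = conn.execute(
        "select * from (select city, state from facetable) limit 0"
    )
    assert [d[0] for d in cursor.description] == ["city", "state"]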
From f6cdca3f6eb32236b112b1a7aa65041d6b8312ba Mon Sep 17 00:00:00 2001
From: Simon Willison
Date: Tue, 16 Apr 2019 21:08:01 -0700
Subject: [PATCH 10/17] Tests should now pass for facets branch

---
 datasette/app.py         |  2 +-
 datasette/facets.py      | 11 +++++++----
 datasette/views/table.py |  9 ++++++---
 docs/plugins.rst         |  9 +++++++++
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/datasette/app.py b/datasette/app.py
index a4b8f7f0..d092e1ad 100644
--- a/datasette/app.py
+++ b/datasette/app.py
@@ -605,7 +605,7 @@ class Datasette:
                 truncated = False
             except sqlite3.OperationalError as e:
                 if e.args == ('interrupted',):
-                    raise InterruptedError(e)
+                    raise InterruptedError(e, sql, params)
                 if log_sql_errors:
                     print(
                         "ERROR: conn={}, sql = {}, params = {}: {}".format(
diff --git a/datasette/facets.py b/datasette/facets.py
index cb2e9f9a..a8ff0c40 100644
--- a/datasette/facets.py
+++ b/datasette/facets.py
@@ -8,6 +8,7 @@ from datasette.utils import (
     path_with_added_args,
     path_with_removed_args,
     detect_json1,
+    InterruptedError,
     InvalidSql,
     sqlite3,
 )
@@ -142,7 +143,7 @@ class ColumnFacet(Facet):
                     ),
                 })
             except InterruptedError:
-                pass
+                continue
         return suggested_facets
 
     async def facet_results(self, sql, params):
@@ -238,6 +239,9 @@ class ManyToManyFacet(Facet):
         all_foreign_keys = await self.ds.execute_against_connection_in_thread(
             self.database, get_all_foreign_keys
         )
+        if not all_foreign_keys.get(self.table):
+            # It's probably a view
+            return []
         incoming = all_foreign_keys[self.table]["incoming"]
         # Do any of these incoming tables have exactly two outgoing keys?
         for fk in incoming:
@@ -377,7 +381,7 @@ class DateFacet(Facet):
             suggested_facet_sql = """
                 select date({column}) from (
                     {sql}
-                ) limit 100;
+                ) where {column} glob "????-??-??" limit 100;
             """.format(
                 column=escape_sqlite(column),
                 sql=sql,
@@ -429,7 +433,7 @@
                     {sql}
                 )
                 where date({col}) is not null
-                group by date({col}) order by date({col}) desc limit {limit}
+                group by date({col}) order by count desc limit {limit}
             """.format(
                 col=escape_sqlite(column),
                 sql=sql,
@@ -577,7 +581,6 @@ class EmojiFacet(Facet):
                     ),
                 })
             except (InterruptedError, sqlite3.OperationalError) as e:
-                print(" oh no ", e)
                 continue
         return suggested_facets
 
diff --git a/datasette/views/table.py b/datasette/views/table.py
index 1a7231fb..30fa476c 100644
--- a/datasette/views/table.py
+++ b/datasette/views/table.py
@@ -464,11 +464,14 @@ class TableView(RowTableShared):
         else:
             page_size = self.ds.page_size
 
-        sql = "select {select} from {table_name} {where}{order_by}limit {limit}{offset}".format(
+        sql_no_limit = "select {select} from {table_name} {where}{order_by}".format(
             select=select,
             table_name=escape_sqlite(table),
             where=where_clause,
             order_by=order_by,
+        )
+        sql = "{sql_no_limit} limit {limit}{offset}".format(
+            sql_no_limit=sql_no_limit.rstrip(),
             limit=page_size + 1,
             offset=offset,
         )
@@ -498,7 +501,7 @@
 
         for facet in facet_instances:
             instance_facet_results, instance_facets_timed_out = await facet.facet_results(
-                sql, params,
+                sql_no_limit, params,
             )
             facet_results.update(instance_facet_results)
             facets_timed_out.extend(instance_facets_timed_out)
@@ -605,7 +608,7 @@ class TableView(RowTableShared):
         for facet in facet_instances:
             # TODO: ensure facet is not suggested if it is already active
             # used to use 'if facet_column in facets' for this
-            suggested_facets.extend(await facet.suggest(sql, params, filtered_table_rows_count))
+            suggested_facets.extend(await facet.suggest(sql_no_limit, params, filtered_table_rows_count))
 
         # human_description_en combines filters AND search, if provided
         human_description_en = filters.human_description_en(extra=search_descriptions)
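The sql_no_limit refactor above matters because facet counts must aggregate over every filtered row, not just the page of rows being displayed; the paginated sql (with limit and offset appended) is now used only to fetch the visible page. A small sketch of the distinction, again with a made-up facetable table rather than Datasette's real query plumbing:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("create table facetable (city text)")
    conn.executemany(
        "insert into facetable values (?)",
        [("SF",), ("SF",), ("SF",), ("LA",), ("LA",)],
    )

    sql_no_limit = "select * from facetable"
    page_sql = sql_no_limit + " limit 2 offset 0"  # fetches one page of rows
    facet_sql = (
        "select city as value, count(*) as count from ({}) "
        "group by city order by count desc".format(sql_no_limit)
    )
    # Faceting the un-limited query counts all five rows, not just the
    # two rows on the current page:
    assert conn.execute(facet_sql).fetchall() == [("SF", 3), ("LA", 2)]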
diff --git a/docs/plugins.rst b/docs/plugins.rst
index 984e5c95..103e8a2b 100644
--- a/docs/plugins.rst
+++ b/docs/plugins.rst
@@ -551,3 +551,12 @@ The ``template``, ``database`` and ``table`` options can be used to return diffe
 
 The ``datasette`` instance is provided primarily so that you can consult any plugin configuration options that may have been set, using the ``datasette.plugin_config(plugin_name)`` method documented above.
 
 The string that you return from this function will be treated as "safe" for inclusion in a