From 6ff261c1de81b5561f12120ccdbc8420a66421a5 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 13 Feb 2020 18:23:34 -0800 Subject: [PATCH 1/5] --dirs scan mechanism, work in progress - refs #417 --- datasette/app.py | 24 ++++++++++++++++++++++++ datasette/cli.py | 9 +++++++++ datasette/utils/__init__.py | 21 +++++++++++++++++++++ docs/datasette-serve-help.txt | 1 + 4 files changed, 55 insertions(+) diff --git a/datasette/app.py b/datasette/app.py index 011002ee..6b0e6ada 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -31,6 +31,8 @@ from .utils import ( escape_css_string, escape_sqlite, format_bytes, + is_valid_sqlite, + get_plugins, module_from_path, sqlite3, to_css_class, @@ -149,6 +151,7 @@ class Datasette: def __init__( self, files, + dirs=None, immutables=None, cache_headers=True, cors=False, @@ -163,6 +166,7 @@ class Datasette: version_note=None, ): immutables = immutables or [] + self.dirs = dirs or [] self.files = tuple(files) + tuple(immutables) self.immutables = set(immutables) if not self.files: @@ -182,6 +186,7 @@ class Datasette: if db.name in self.databases: raise Exception("Multiple files with same stem: {}".format(db.name)) self.add_database(db.name, db) + self.scan_dirs() self.cache_headers = cache_headers self.cors = cors self._metadata = metadata or {} @@ -217,6 +222,25 @@ class Datasette: def remove_database(self, name): self.databases.pop(name) + def scan_dirs(self): + # Recurse through self.dirs looking for new SQLite DBs + i = 0 + for dir in self.dirs: + print(dir) + for filepath in Path(dir).glob("**/*.db"): + print(filepath) + if is_valid_sqlite(filepath): + self.add_database( + str(filepath) + .replace("../", "") + .replace("/", "_") + .replace(".db", ""), + Database(self, filepath, is_mutable=True), + ) + i += 1 + if i >= 20: + break + def config(self, key): return self._config.get(key, None) diff --git a/datasette/cli.py b/datasette/cli.py index 94da6ee4..29e16c9d 100644 --- a/datasette/cli.py +++ b/datasette/cli.py @@ -232,6 +232,13 @@ def package( @cli.command() @click.argument("files", type=click.Path(exists=True), nargs=-1) +@click.option( + "-d", + "--dir", + type=click.Path(exists=True), + help="Directories to scan for SQLite files to serve", + multiple=True, +) @click.option( "-i", "--immutable", @@ -310,6 +317,7 @@ def package( @click.option("--help-config", is_flag=True, help="Show available config options") def serve( files, + dir, immutable, host, port, @@ -361,6 +369,7 @@ def serve( ) ds = Datasette( files, + dir, immutables=immutable, cache_headers=not debug and not reload, cors=cors, diff --git a/datasette/utils/__init__.py b/datasette/utils/__init__.py index be99f890..c15b7eae 100644 --- a/datasette/utils/__init__.py +++ b/datasette/utils/__init__.py @@ -588,6 +588,27 @@ def to_css_class(s): return "-".join(bits) +SQLITE_MAGIC = b"SQLite format 3\x00" + + +def is_valid_sqlite(path): + if not path.is_file(): + return False + try: + with open(path, "rb") as fp: + has_magic = fp.read(len(SQLITE_MAGIC)) == SQLITE_MAGIC + except PermissionError: + return False + if not has_magic: + return False + # Check we can run `select * from sqlite_master` + try: + sqlite3.connect(str(path)).execute("select * from sqlite_master") + except Exception: + return False + return True + + def link_or_copy(src, dst): # Intended for use in populating a temp directory. We link if possible, # but fall back to copying if the temp directory is on a different device diff --git a/docs/datasette-serve-help.txt b/docs/datasette-serve-help.txt index c0b33c54..d900b0dc 100644 --- a/docs/datasette-serve-help.txt +++ b/docs/datasette-serve-help.txt @@ -5,6 +5,7 @@ Usage: datasette serve [OPTIONS] [FILES]... Serve up specified SQLite database files with a web UI Options: + -d, --dir PATH Directories to scan for SQLite files to serve -i, --immutable PATH Database files to open in immutable mode -h, --host TEXT Host for server. Defaults to 127.0.0.1 which means only connections from the local machine will be allowed. Use From 55e633e09f5b113589c61dd04c928d40509d5596 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 13 Feb 2020 21:58:48 -0800 Subject: [PATCH 2/5] Run scan_dirs() in a thread --- datasette/app.py | 7 ++----- datasette/views/index.py | 5 ++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datasette/app.py b/datasette/app.py index 6b0e6ada..c0071d2a 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -186,7 +186,8 @@ class Datasette: if db.name in self.databases: raise Exception("Multiple files with same stem: {}".format(db.name)) self.add_database(db.name, db) - self.scan_dirs() + self.scan_dirs_executor = futures.ThreadPoolExecutor(max_workers=1) + self.scan_dirs_executor.submit(self.scan_dirs) self.cache_headers = cache_headers self.cors = cors self._metadata = metadata or {} @@ -224,7 +225,6 @@ class Datasette: def scan_dirs(self): # Recurse through self.dirs looking for new SQLite DBs - i = 0 for dir in self.dirs: print(dir) for filepath in Path(dir).glob("**/*.db"): @@ -237,9 +237,6 @@ class Datasette: .replace(".db", ""), Database(self, filepath, is_mutable=True), ) - i += 1 - if i >= 20: - break def config(self, key): return self._config.get(key, None) diff --git a/datasette/views/index.py b/datasette/views/index.py index fe88a38c..91988d1c 100644 --- a/datasette/views/index.py +++ b/datasette/views/index.py @@ -23,7 +23,10 @@ class IndexView(BaseView): async def get(self, request, as_format): databases = [] - for name, db in self.ds.databases.items(): + # Using list() here because scan_dirs() running in a thread might + # modify self.ds.databases while we are iterating it, which could + # cause 'RuntimeError: OrderedDict mutated during iteration' + for name, db in list(self.ds.databases.items()): table_names = await db.table_names() hidden_table_names = set(await db.hidden_table_names()) views = await db.view_names() From f2fd7d20bf3d1ca7ea486e3e40f27576d44f39d0 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Sat, 15 Feb 2020 09:28:48 -0800 Subject: [PATCH 3/5] Run scan_dirs in a thread every 10 seconds --- datasette/app.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/datasette/app.py b/datasette/app.py index c0071d2a..1b8be011 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -7,6 +7,7 @@ import os import re import sys import threading +import time import traceback import urllib.parse from concurrent import futures @@ -186,8 +187,11 @@ class Datasette: if db.name in self.databases: raise Exception("Multiple files with same stem: {}".format(db.name)) self.add_database(db.name, db) - self.scan_dirs_executor = futures.ThreadPoolExecutor(max_workers=1) - self.scan_dirs_executor.submit(self.scan_dirs) + if dirs: + self.scan_dirs_thread = threading.Thread( + target=self.scan_dirs, name="scan-dirs", daemon=True + ) + self.scan_dirs_thread.start() self.cache_headers = cache_headers self.cors = cors self._metadata = metadata or {} @@ -225,18 +229,24 @@ class Datasette: def scan_dirs(self): # Recurse through self.dirs looking for new SQLite DBs - for dir in self.dirs: - print(dir) - for filepath in Path(dir).glob("**/*.db"): - print(filepath) - if is_valid_sqlite(filepath): - self.add_database( - str(filepath) - .replace("../", "") - .replace("/", "_") - .replace(".db", ""), - Database(self, filepath, is_mutable=True), - ) + while True: + current_filepaths = { + d.path for d in list(self.databases.values()) if d.path is not None + } + for dir in self.dirs: + for filepath in Path(dir).glob("**/*.db"): + if str(filepath) in current_filepaths: + continue + print(filepath) + if is_valid_sqlite(filepath): + self.add_database( + str(filepath) + .replace("../", "") + .replace("/", "_") + .replace(".db", ""), + Database(self, str(filepath), is_mutable=True), + ) + time.sleep(10) def config(self, key): return self._config.get(key, None) From ec0d68da7099de6a829505541a51ae75b77afe8f Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 26 Mar 2020 08:34:49 -0700 Subject: [PATCH 4/5] I moved this import --- datasette/app.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datasette/app.py b/datasette/app.py index 1b8be011..98a31364 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -33,7 +33,6 @@ from .utils import ( escape_sqlite, format_bytes, is_valid_sqlite, - get_plugins, module_from_path, sqlite3, to_css_class, From ee718b98b793df2a15b125cbf20816c9864bf7e9 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 26 Mar 2020 18:03:41 -0700 Subject: [PATCH 5/5] Verify SQLite DBs with check_connection --- datasette/app.py | 2 +- datasette/utils/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/datasette/app.py b/datasette/app.py index 98a31364..34300272 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -227,7 +227,7 @@ class Datasette: self.databases.pop(name) def scan_dirs(self): - # Recurse through self.dirs looking for new SQLite DBs + # Recurse through self.dirs looking for new SQLite DBs. Runs in a thread. while True: current_filepaths = { d.path for d in list(self.databases.values()) if d.path is not None diff --git a/datasette/utils/__init__.py b/datasette/utils/__init__.py index c15b7eae..f0a2cb89 100644 --- a/datasette/utils/__init__.py +++ b/datasette/utils/__init__.py @@ -603,7 +603,8 @@ def is_valid_sqlite(path): return False # Check we can run `select * from sqlite_master` try: - sqlite3.connect(str(path)).execute("select * from sqlite_master") + conn = sqlite3.connect(str(path)) + check_connection(conn) except Exception: return False return True