diff --git a/docs/cli.rst b/docs/cli.rst
index a6081609..6e2e1a87 100644
--- a/docs/cli.rst
+++ b/docs/cli.rst
@@ -176,6 +176,19 @@ You can use the ``--json-cols`` option to automatically detect these JSON columns
         }
     ]
 
+.. _cli_use_json_converters:
+
+Automatic JSON deserialization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can use the ``--use-json-converters`` flag to automatically deserialize columns that are declared as ``JSON`` (or inferred as such during insertion).
+
+.. code-block:: bash
+
+    sqlite-utils query dogs.db "select * from dogs" --use-json-converters
+
+If you use this flag with ``insert``, ``upsert`` or ``bulk``, it will also cause nested Python dictionaries or lists to be stored in columns with a declared type of ``JSON`` rather than ``TEXT``.
+
 .. _cli_query_csv:
 
 Returning CSV or TSV
@@ -1935,7 +1948,7 @@ Most of the time creating tables by inserting example data is the quickest approach
 
 This will create a table called ``mytable`` with two columns - an integer ``id`` column and a text ``name`` column. It will set the ``id`` column to be the primary key.
 
-You can pass as many column-name column-type pairs as you like. Valid types are ``integer``, ``text``, ``float`` and ``blob``.
+You can pass as many column-name column-type pairs as you like. Valid types are ``integer``, ``text``, ``float``, ``blob`` and ``json``.
 
 Pass ``--pk`` more than once for a compound primary key that covers multiple columns.
 
diff --git a/docs/python-api.rst b/docs/python-api.rst
index 267591ac..d4c8b1f4 100644
--- a/docs/python-api.rst
+++ b/docs/python-api.rst
@@ -117,6 +117,14 @@ By default, any :ref:`sqlite-utils plugins <plugins>` that implement the :ref:`p
 
     db = Database(memory=True, execute_plugins=False)
 
+You can pass ``use_json_converters=True`` to enable automatic JSON conversion for columns declared as ``JSON``. This will register a custom converter with SQLite that uses ``json.loads()`` to deserialize values:
+
+.. code-block:: python
+
+    db = Database("my_database.db", use_json_converters=True)
+
+When this is enabled, Python ``dict``, ``list`` and ``tuple`` values will be stored in columns with a declared type of ``JSON``, and those columns will be automatically deserialized back into Python objects when you retrieve them from the database.
+
 You can pass ``strict=True`` to enable `SQLite STRICT mode <https://www.sqlite.org/stricttables.html>`__ for all tables created using this database object:
 
 .. code-block:: python
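
Editor's note: a minimal sketch of the round trip the new python-api.rst section
describes, using only documented sqlite-utils APIs; the behaviour is assumed from
this patch rather than from a released version.

    from sqlite_utils import Database

    # Open an in-memory database with the new flag enabled
    db = Database(memory=True, use_json_converters=True)
    db["dogs"].insert({"id": 1, "toys": ["ball", "rope"]}, pk="id")

    row = db["dogs"].get(1)
    print(type(row["toys"]))  # <class 'list'>, not a JSON-encoded string
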
diff --git a/sqlite_utils/cli.py b/sqlite_utils/cli.py
index 9b9ee20e..22e2c69a 100644
--- a/sqlite_utils/cli.py
+++ b/sqlite_utils/cli.py
@@ -66,7 +66,7 @@ def _close_databases(ctx):
         pass
 
 
-VALID_COLUMN_TYPES = ("INTEGER", "TEXT", "FLOAT", "REAL", "BLOB")
+VALID_COLUMN_TYPES = ("INTEGER", "TEXT", "FLOAT", "REAL", "BLOB", "JSON")
 
 UNICODE_ERROR = """
 {}
@@ -962,6 +962,12 @@ def inner(fn):
             default=False,
             help="Apply STRICT mode to created table",
         ),
+        click.option(
+            "--use-json-converters",
+            is_flag=True,
+            default=False,
+            help="Automatically use JSON columns for nested structures and register JSON converter",
+        ),
     )
 ):
     fn = decorator(fn)
@@ -1006,8 +1012,9 @@ def insert_upsert_implementation(
     bulk_sql=None,
     functions=None,
     strict=False,
+    use_json_converters=False,
 ):
-    db = sqlite_utils.Database(path)
+    db = sqlite_utils.Database(path, use_json_converters=use_json_converters)
     _register_db_for_cleanup(db)
     _load_extensions(db, load_extension)
     _maybe_register_functions(db, functions)
@@ -1151,7 +1158,11 @@ def insert_upsert_implementation(
     try:
         db.table(table).insert_all(
-            docs, pk=pk, batch_size=batch_size, alter=alter, **extra_kwargs
+            docs,
+            pk=pk,
+            batch_size=batch_size,
+            alter=alter,
+            **extra_kwargs,
         )
     except Exception as e:
         if (
@@ -1248,6 +1259,7 @@ def insert(
     not_null,
     default,
     strict,
+    use_json_converters,
 ):
     """
     Insert records from FILE into a table, creating the table if it
@@ -1328,6 +1340,7 @@ def insert(
             not_null=not_null,
             default=default,
             strict=strict,
+            use_json_converters=use_json_converters,
         )
     except UnicodeDecodeError as ex:
         raise click.ClickException(UNICODE_ERROR.format(ex))
@@ -1365,6 +1378,7 @@ def upsert(
     load_extension,
     silent,
     strict,
+    use_json_converters,
 ):
     """
     Upsert records based on their primary key. Works like 'insert' but if
@@ -1411,6 +1425,7 @@ def upsert(
             load_extension=load_extension,
             silent=silent,
             strict=strict,
+            use_json_converters=use_json_converters,
         )
     except UnicodeDecodeError as ex:
         raise click.ClickException(UNICODE_ERROR.format(ex))
@@ -1430,6 +1445,12 @@ def upsert(
     help="Python code or file path defining custom SQL functions",
     multiple=True,
 )
+@click.option(
+    "--use-json-converters",
+    is_flag=True,
+    default=False,
+    help="Automatically use JSON columns for nested structures and register JSON converter",
+)
 @import_options
 @load_extension_option
 def bulk(
@@ -1453,6 +1474,7 @@ def bulk(
     no_headers,
     encoding,
     load_extension,
+    use_json_converters,
 ):
     """
     Execute parameterized SQL against the provided list of documents.
@@ -1499,6 +1521,7 @@ def bulk(
             silent=False,
             bulk_sql=sql,
             functions=functions,
+            use_json_converters=use_json_converters,
         )
     except (OperationalError, sqlite3.IntegrityError) as e:
         raise click.ClickException(str(e))
@@ -1613,7 +1636,7 @@ def create_table(
         height float \\
         photo blob --pk id
 
-    Valid column types are text, integer, float and blob.
+    Valid column types are text, integer, float, blob and json.
     """
     db = sqlite_utils.Database(path)
    _register_db_for_cleanup(db)
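
Editor's note: the insert/upsert/bulk wiring above can be exercised end to end
with Click's test runner, mirroring the tests added later in this patch; the
"dogs.db" filename is illustrative only.

    from click.testing import CliRunner
    from sqlite_utils import cli

    runner = CliRunner()
    # Nested "profile" values should yield a column declared JSON instead of TEXT
    runner.invoke(
        cli.cli,
        ["insert", "dogs.db", "dogs", "-", "--use-json-converters"],
        input='[{"id": 1, "profile": {"good_dog": true}}]',
    )
    # Reading back with the flag emits real nested objects in the JSON output
    result = runner.invoke(
        cli.cli, ["query", "dogs.db", "select * from dogs", "--use-json-converters"]
    )
    print(result.output)
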
""" db = sqlite_utils.Database(path) _register_db_for_cleanup(db) @@ -1830,7 +1853,14 @@ def drop_view(path, view, ignore, load_extension): multiple=True, ) @load_extension_option +@click.option( + "--use-json-converters", + is_flag=True, + default=False, + help="Automatically use JSON columns for nested structures and register JSON converter", +) def query( + path, sql, attach, @@ -1847,6 +1877,7 @@ def query( param, load_extension, functions, + use_json_converters, ): """Execute SQL query and return the results as JSON @@ -1857,7 +1888,7 @@ def query( "select * from chickens where age > :age" \\ -p age 1 """ - db = sqlite_utils.Database(path) + db = sqlite_utils.Database(path, use_json_converters=use_json_converters) _register_db_for_cleanup(db) for alias, attach_path in attach: db.attach(alias, attach_path) @@ -1939,7 +1970,14 @@ def query( is_flag=True, help="Analyze resulting tables and output results", ) +@click.option( + "--use-json-converters", + is_flag=True, + default=False, + help="Automatically use JSON columns for nested structures and register JSON converter", +) @load_extension_option + def memory( paths, sql, @@ -1964,6 +2002,7 @@ def memory( save, analyze, load_extension, + use_json_converters, return_db=False, ): """Execute SQL query against an in-memory database, optionally populated by imported data @@ -1992,7 +2031,7 @@ def memory( \b sqlite-utils memory animals.csv --schema """ - db = sqlite_utils.Database(memory=True) + db = sqlite_utils.Database(memory=True, use_json_converters=use_json_converters) if not return_db: _register_db_for_cleanup(db) diff --git a/sqlite_utils/db.py b/sqlite_utils/db.py index aacdc893..c9cb0b6e 100644 --- a/sqlite_utils/db.py +++ b/sqlite_utils/db.py @@ -222,6 +222,7 @@ class Default: "real": "REAL", "blob": "BLOB", "bytes": "BLOB", + "JSON": "JSON", } # If numpy is available, add more types if np: @@ -326,6 +327,7 @@ class Database: :param use_old_upsert: set to ``True`` to force the older upsert implementation. 
diff --git a/sqlite_utils/db.py b/sqlite_utils/db.py
index aacdc893..c9cb0b6e 100644
--- a/sqlite_utils/db.py
+++ b/sqlite_utils/db.py
@@ -222,6 +222,7 @@ class Default:
     "real": "REAL",
     "blob": "BLOB",
     "bytes": "BLOB",
+    "JSON": "JSON",
 }
 # If numpy is available, add more types
 if np:
@@ -326,6 +327,7 @@ class Database:
     :param use_old_upsert: set to ``True`` to force the older upsert implementation.
       See :ref:`python_api_old_upsert`
     :param strict: Apply STRICT mode to all created tables (unless overridden)
+    :param use_json_converters: Automatically use JSON columns for nested structures and register JSON converter
     """
 
     _counts_table_name = "_counts"
@@ -344,24 +346,33 @@ def __init__(
         execute_plugins: bool = True,
         use_old_upsert: bool = False,
         strict: bool = False,
+        use_json_converters: bool = False,
     ):
         self.memory_name = None
         self.memory = False
         self.use_old_upsert = use_old_upsert
+        self.use_json_converters = use_json_converters
         assert (filename_or_conn is not None and (not memory and not memory_name)) or (
             filename_or_conn is None and (memory or memory_name)
         ), "Either specify a filename_or_conn or pass memory=True"
+
+        # Deserialize JSON-declared columns on read via PARSE_DECLTYPES
+        detect_types = 0
+        if use_json_converters:
+            sqlite3.register_converter("JSON", json.loads)
+            detect_types = sqlite3.PARSE_DECLTYPES
+
         if memory_name:
             uri = "file:{}?mode=memory&cache=shared".format(memory_name)
             self.conn = sqlite3.connect(
                 uri,
                 uri=True,
                 check_same_thread=False,
+                detect_types=detect_types,
             )
             self.memory = True
             self.memory_name = memory_name
         elif memory or filename_or_conn == ":memory:":
-            self.conn = sqlite3.connect(":memory:")
+            self.conn = sqlite3.connect(":memory:", detect_types=detect_types)
             self.memory = True
         elif isinstance(filename_or_conn, (str, pathlib.Path)):
             if recreate and os.path.exists(filename_or_conn):
@@ -370,9 +381,9 @@ def __init__(
             except OSError:
                 # Avoid mypy and __repr__ errors, see:
                 # https://github.com/simonw/sqlite-utils/issues/503
-                self.conn = sqlite3.connect(":memory:")
+                self.conn = sqlite3.connect(":memory:", detect_types=detect_types)
                 raise
-            self.conn = sqlite3.connect(str(filename_or_conn))
+            self.conn = sqlite3.connect(str(filename_or_conn), detect_types=detect_types)
         else:
             assert not recreate, "recreate cannot be used with connections, only paths"
             self.conn = filename_or_conn
@@ -1015,6 +1026,8 @@ def sort_key(p):
                 )
             )
             column_type_str = COLUMN_TYPE_MAPPING[column_type]
+            if self.use_json_converters and column_type in (dict, list, tuple):
+                column_type_str = "JSON"
             # Special case for strict tables to map FLOAT to REAL
             # Refs https://github.com/simonw/sqlite-utils/issues/644
             if strict and column_type_str == "FLOAT":
@@ -3557,9 +3570,13 @@ def insert_all(
                 if list_mode:
                     # Convert list records to dicts for type detection
                     chunk_as_dicts = [dict(zip(column_names, row)) for row in chunk]
-                    column_types = suggest_column_types(chunk_as_dicts)
+                    column_types = suggest_column_types(
+                        chunk_as_dicts, json_converters=self.db.use_json_converters
+                    )
                 else:
-                    column_types = suggest_column_types(chunk)  # type: ignore[arg-type]
+                    column_types = suggest_column_types(
+                        chunk, json_converters=self.db.use_json_converters
+                    )  # type: ignore[arg-type]
                 if extracts:
                     for col in extracts:
                         if col in column_types:
@@ -3746,7 +3763,9 @@ def upsert_all(
         )
 
     def add_missing_columns(self, records: Iterable[Dict[str, Any]]) -> "Table":
-        needed_columns = suggest_column_types(records)
+        needed_columns = suggest_column_types(
+            records, json_converters=self.db.use_json_converters
+        )
         current_columns = {c.lower() for c in self.columns_dict}
         for col_name, col_type in needed_columns.items():
             if col_name.lower() not in current_columns:
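
Editor's note: the __init__ changes above lean entirely on two stdlib hooks,
which can be demonstrated without sqlite-utils at all.

    import json
    import sqlite3

    # Register a converter for the declared type name "JSON", then connect with
    # PARSE_DECLTYPES so sqlite3 applies it when reading rows back
    sqlite3.register_converter("JSON", json.loads)
    conn = sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES)
    conn.execute("CREATE TABLE t (data JSON)")
    conn.execute("INSERT INTO t VALUES (?)", (json.dumps({"a": 1}),))
    print(conn.execute("SELECT data FROM t").fetchone()[0])  # {'a': 1} -- a dict
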
diff --git a/sqlite_utils/utils.py b/sqlite_utils/utils.py
index 05e9a511..7e7944ba 100644
--- a/sqlite_utils/utils.py
+++ b/sqlite_utils/utils.py
@@ -132,16 +132,18 @@ def find_spatialite() -> Optional[str]:
 
 def suggest_column_types(
     records: Iterable[Dict[str, Any]],
+    json_converters: bool = False,
 ) -> Dict[str, type]:
     all_column_types: Dict[str, Set[type]] = {}
     for record in records:
         for key, value in record.items():
             all_column_types.setdefault(key, set()).add(type(value))
-    return types_for_column_types(all_column_types)
+    return types_for_column_types(all_column_types, json_converters=json_converters)
 
 
 def types_for_column_types(
     all_column_types: Dict[str, Set[type]],
+    json_converters: bool = False,
 ) -> Dict[str, type]:
     column_types: Dict[str, type] = {}
     for key, types in all_column_types.items():
@@ -153,10 +155,19 @@ def types_for_column_types(
             t = str
         elif len(types) == 1:
             t = list(types)[0]
-            # But if it's a subclass of list / tuple / dict, use str
-            # instead as we will be storing it as JSON in the table
-            for superclass in (list, tuple, dict):
-                if issubclass(t, superclass):
+            if json_converters:
+                # Normalize subclasses of list / dict to the base class
+                # so they can be handled by the mapping in db.py
+                for superclass in (list, dict):
+                    if issubclass(t, superclass):
+                        t = superclass
+                        break
+            elif issubclass(t, (list, dict)):
+                t = str
+            if issubclass(t, tuple):
+                if json_converters:
+                    t = tuple
+                else:
                     t = str
         elif {int, bool}.issuperset(types):
             t = int
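
Editor's note: the conditional in types_for_column_types is easiest to see from
the public helper, as the new test_suggest_column_types_conditional_behavior
test also does.

    from sqlite_utils.utils import suggest_column_types

    records = [{"tags": ["a", "b"]}, {"tags": ["c"]}]
    # Default behaviour is unchanged: containers map to str, stored as TEXT
    print(suggest_column_types(records))  # {'tags': <class 'str'>}
    # With the flag, the base container type survives so db.py can declare JSON
    print(suggest_column_types(records, json_converters=True))  # {'tags': <class 'list'>}
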
"integer", "data", "json", "--pk", "id"]) + assert result.exit_code == 0 + + db = Database(db_path) + assert next(c for c in db["t"].columns if c.name == "data").type == "JSON" + +def test_memory_json_converters(tmpdir): + csv_path = str(tmpdir / "test.csv") + with open(csv_path, "w") as f: + f.write("id,data\n1,'{\"a\": 1}'") + + runner = CliRunner() + + # Query memory with flag - check output deserialization + result = runner.invoke(cli.cli, ["memory", csv_path, "select * from test", "--use-json-converters"]) + assert result.exit_code == 0 + + # Let's try with JSON input + json_path = str(tmpdir / "test.json") + with open(json_path, "w") as f: + json.dump([{"id": 1, "data": {"a": 1}}], f) + + result = runner.invoke(cli.cli, ["memory", json_path, "select * from test", "--use-json-converters"]) + assert result.exit_code == 0 + assert '"data": {"a": 1}' in result.output diff --git a/tests/test_issue_612.py b/tests/test_issue_612.py new file mode 100644 index 00000000..3c18a3b9 --- /dev/null +++ b/tests/test_issue_612.py @@ -0,0 +1,184 @@ +import pytest +import json +from sqlite_utils import Database + +def test_insert_dict_default_behavior(): + """ + By default (without use_json_converters=True), inserting a dict + should create a TEXT column and return a string. + """ + db = Database(memory=True) + data = {"id": 1, "nested": {"foo": "bar"}} + t = db["t"] + t.insert(data, pk="id") + + # Check column type + # Access column by name to be robust against ordering changes + col = next(c for c in t.columns if c.name == "nested") + assert col.type == "TEXT" + + # Check returned value + row = t.get(1) + assert isinstance(row["nested"], str) + assert row["nested"] == '{"foo": "bar"}' + +def test_explicit_json_column_creation(): + """ + Test repeatedly creating a table with a specific JSON column type. + """ + db = Database(memory=True) + # This should work if JSON is in COLUMN_TYPE_MAPPING + db["t"].create({"id": int, "data": "JSON"}, pk="id") + col = next(c for c in db["t"].columns if c.name == "data") + assert col.type == "JSON" + +def test_use_json_converters_argument_exists(): + """ + Test that Database accepts use_json_converters argument. + """ + # strict=False is default, just ensuring we can pass the arg + try: + db = Database(memory=True, use_json_converters=True) + except TypeError: + pytest.fail("Database does not accept use_json_converters argument") + +def test_insert_dict_with_json_converters_enabled(): + """ + With use_json_converters=True: + 1. Inserting a dict should create a JSON column. + 2. Retrieving it should return a dict (auto-deserialization). + """ + db = Database(memory=True, use_json_converters=True) + data = {"id": 1, "attrs": {"color": "red", "size": 10}} + + db["items"].insert(data, pk="id") + + # Verify column type is inferred as JSON (not JSONB) + col = next(c for c in db["items"].columns if c.name == "attrs") + assert col.type == "JSON" + + # Verify auto-deserialization + row = db["items"].get(1) + assert isinstance(row["attrs"], dict) + assert row["attrs"] == data["attrs"] + +def test_list_deserialization(): + """ + Test that lists are also handled correctly when use_json_converters=True. 
+ """ + db = Database(memory=True, use_json_converters=True) + + data = {"id": 1, "tags": ["a", "b", "c"]} + db["items"].insert(data, pk="id") + + # Verify column type is inferred as JSON + col = next(c for c in db["items"].columns if c.name == "tags") + assert col.type == "JSON" + + # Verify deserialization + row = db["items"].get(1) + assert isinstance(row["tags"], list) + assert row["tags"] == ["a", "b", "c"] + +def test_explicit_json_column_deserialization(): + """ + Test that explicit JSON columns are deserialized when flag is enabled. + """ + db = Database(memory=True, use_json_converters=True) + + # Create table explicitly + db["t"].create({"id": int, "data": "JSON"}, pk="id") + + data = {"foo": "bar"} + db["t"].insert({"id": 1, "data": data}) + + # Explicit creation doesn't rely on suggest_column_types for type, but insert might + row = db["t"].get(1) + assert isinstance(row["data"], dict) + assert row["data"] == data + +def test_suggest_column_types_conditional_behavior(): + """ + Test that suggest_column_types behaves differently when json_converters=True. + """ + from sqlite_utils.utils import suggest_column_types + records = [{"a": {"foo": "bar"}}, {"a": {"baz": 1}}] + + # Default: returns str + assert suggest_column_types(records) == {"a": str} + assert suggest_column_types(records, json_converters=False) == {"a": str} + + # With flag: returns dict + assert suggest_column_types(records, json_converters=True) == {"a": dict} + + list_records = [{"b": [1, 2]}, {"b": [3]}] + assert suggest_column_types(list_records, json_converters=True) == {"b": list} + +def test_json_null_values(): + """ + Test that null values are handled correctly in JSON columns. + """ + db = Database(memory=True, use_json_converters=True) + db["t"].insert({"id": 1, "data": None}, pk="id") + + # Check column type (should be inferred as TEXT by default if only None is seen, + # but here we just want to see if it breaks) + row = db["t"].get(1) + assert row["data"] is None + +def test_deeply_nested_structures(): + """ + Test deeply nested structures. + """ + db = Database(memory=True, use_json_converters=True) + data = {"a": {"b": {"c": [1, 2, {"d": "e"}]}}} + db["t"].insert({"id": 1, "data": data}, pk="id") + + row = db["t"].get(1) + assert row["data"] == data + +def test_malformed_json_raises_error(): + """ + Test that malformed JSON in a JSON-declared column raises an error on retrieval. + This is standard SQLite PARSE_DECLTYPES behavior with a registered converter. + """ + import sqlite3 + db = Database(memory=True, use_json_converters=True) + # Manually insert malformed JSON into a JSON column + db.execute("CREATE TABLE t (id INTEGER PRIMARY KEY, data JSON)") + db.execute("INSERT INTO t (id, data) VALUES (1, '{malformed')") + + with pytest.raises(Exception): + db["t"].get(1) + +def test_json_converters_only_affects_json_columns(): + """ + Verfiy that use_json_converters=True does NOT affect columns declared as TEXT. + """ + db = Database(memory=True, use_json_converters=True) + db.execute("CREATE TABLE t (id INTEGER PRIMARY KEY, data TEXT)") + db.execute("INSERT INTO t (id, data) VALUES (1, '{\"a\": 1}')") + + row = db["t"].get(1) + assert isinstance(row["data"], str) + assert row["data"] == '{"a": 1}' + +def test_tuple_deserialization(): + """ + Test that tuples are also handled correctly when use_json_converters=True. 
+ """ + db = Database(memory=True, use_json_converters=True) + + data = {"id": 1, "nested_tuple": (1, 2, 3)} + db["items"].insert(data, pk="id") + + # Verify column type is inferred as JSON + col = next(c for c in db["items"].columns if c.name == "nested_tuple") + assert col.type == "JSON" + + # Verify deserialization + row = db["items"].get(1) + # Tuples become lists after JSON roundtrip + assert isinstance(row["nested_tuple"], list) + assert row["nested_tuple"] == [1, 2, 3] +