I'm attempting to optimize extraction of values obtained from a REST API which returns json values in an array.
Here's a minimal, complete, and verifiable example that reflects exactly what I'm doing.
USE tempdb;

DROP TABLE IF EXISTS dbo.json_test;

-- Test-harness table: one row per source table, carrying a random GUID,
-- a random hex string, and a JSON-array payload to be shredded by OPENJSON.
CREATE TABLE dbo.json_test
(
    json_test_id int NOT NULL
        IDENTITY(1,1)
        -- named PK added: the identity column was previously unconstrained,
        -- which allowed duplicate-free lookups only by accident
        CONSTRAINT json_test_pk PRIMARY KEY
    , some_uniqueidentifier uniqueidentifier NULL
    , some_varchar varchar(100) NULL
    , the_json nvarchar(max) NULL   -- raw JSON array produced by FOR JSON AUTO
);
-- Seed one row per user table in the current database; the_json holds a
-- FOR JSON AUTO array built from the matching sys.tables row.
INSERT INTO dbo.json_test (some_uniqueidentifier, some_varchar, the_json)
SELECT
-- random GUID per row
some_uniqueidentifier = NEWID()
-- 64 random bytes rendered as a '0x...' hex string (style 1)
, some_varchar = CONVERT(varchar(100), CRYPT_GEN_RANDOM(64), 1)
, the_json = (
SELECT st.*
FROM sys.tables st
-- NOTE(review): only st.* is selected, so this CROSS JOIN repeats the same
-- object once per row of st2 — the JSON array ends up holding N identical
-- elements. Presumably deliberate payload inflation for the performance
-- comparison; confirm before "simplifying" it away.
CROSS JOIN sys.tables st2
WHERE st.object_id = t.object_id FOR JSON AUTO
)
FROM sys.tables t;
-- Two-pass shred of the_json: the first CTE breaks the top-level array into
-- elements, the second breaks each element object into key/value pairs, and
-- the final SELECT pivots those pairs back into named columns via MAX(CASE).
WITH outer_kv AS
(
    -- one row per element of the top-level JSON array
    SELECT jt.some_uniqueidentifier
         , jt.some_varchar
         , arr.[key]
         , arr.[value]
    FROM dbo.json_test jt
    CROSS APPLY OPENJSON(jt.the_json, N'lax $') arr
),
kv_pairs AS
(
    -- one row per property of each array element
    SELECT o.some_uniqueidentifier
         , o.some_varchar
         , o.[key]
         , o.[value]
         , inner_key = obj.[key]
         , inner_value = obj.[value]
    FROM outer_kv o
    CROSS APPLY OPENJSON(o.[value], N'lax $') obj
)
SELECT kv.some_uniqueidentifier
     , kv.some_varchar
     , kv.[key]
     -- manual pivot: one MAX(CASE ...) per expected JSON property
     -- (CASE with no ELSE yields NULL, same as the explicit ELSE NULL form)
     , [name] = MAX(CASE WHEN kv.inner_key = 'name' THEN kv.inner_value END)
     , [object_id] = MAX(CASE WHEN kv.inner_key = 'object_id' THEN kv.inner_value END)
     , [principal_id] = MAX(CASE WHEN kv.inner_key = 'principal_id' THEN kv.inner_value END)
     , [schema_id] = MAX(CASE WHEN kv.inner_key = 'schema_id' THEN kv.inner_value END)
     , [parent_object_id] = MAX(CASE WHEN kv.inner_key = 'parent_object_id' THEN kv.inner_value END)
     , [type] = MAX(CASE WHEN kv.inner_key = 'type' THEN kv.inner_value END)
     , [type_desc] = MAX(CASE WHEN kv.inner_key = 'type_desc' THEN kv.inner_value END)
     , [create_date] = MAX(CASE WHEN kv.inner_key = 'create_date' THEN kv.inner_value END)
     , [modify_date] = MAX(CASE WHEN kv.inner_key = 'modify_date' THEN kv.inner_value END)
     , [is_ms_shipped] = MAX(CASE WHEN kv.inner_key = 'is_ms_shipped' THEN kv.inner_value END)
     , [is_published] = MAX(CASE WHEN kv.inner_key = 'is_published' THEN kv.inner_value END)
     , [is_schema_published] = MAX(CASE WHEN kv.inner_key = 'is_schema_published' THEN kv.inner_value END)
     , [lob_data_space_id] = MAX(CASE WHEN kv.inner_key = 'lob_data_space_id' THEN kv.inner_value END)
     , [filestream_data_space_id] = MAX(CASE WHEN kv.inner_key = 'filestream_data_space_id' THEN kv.inner_value END)
     , [max_column_id_used] = MAX(CASE WHEN kv.inner_key = 'max_column_id_used' THEN kv.inner_value END)
     , [lock_on_bulk_load] = MAX(CASE WHEN kv.inner_key = 'lock_on_bulk_load' THEN kv.inner_value END)
     , [uses_ansi_nulls] = MAX(CASE WHEN kv.inner_key = 'uses_ansi_nulls' THEN kv.inner_value END)
     , [is_replicated] = MAX(CASE WHEN kv.inner_key = 'is_replicated' THEN kv.inner_value END)
     , [has_replication_filter] = MAX(CASE WHEN kv.inner_key = 'has_replication_filter' THEN kv.inner_value END)
     , [is_merge_published] = MAX(CASE WHEN kv.inner_key = 'is_merge_published' THEN kv.inner_value END)
     , [is_sync_tran_subscribed] = MAX(CASE WHEN kv.inner_key = 'is_sync_tran_subscribed' THEN kv.inner_value END)
     , [has_unchecked_assembly_data] = MAX(CASE WHEN kv.inner_key = 'has_unchecked_assembly_data' THEN kv.inner_value END)
     , [text_in_row_limit] = MAX(CASE WHEN kv.inner_key = 'text_in_row_limit' THEN kv.inner_value END)
     , [large_value_types_out_of_row] = MAX(CASE WHEN kv.inner_key = 'large_value_types_out_of_row' THEN kv.inner_value END)
     , [is_tracked_by_cdc] = MAX(CASE WHEN kv.inner_key = 'is_tracked_by_cdc' THEN kv.inner_value END)
     , [lock_escalation] = MAX(CASE WHEN kv.inner_key = 'lock_escalation' THEN kv.inner_value END)
     , [lock_escalation_desc] = MAX(CASE WHEN kv.inner_key = 'lock_escalation_desc' THEN kv.inner_value END)
     , [is_filetable] = MAX(CASE WHEN kv.inner_key = 'is_filetable' THEN kv.inner_value END)
     , [is_memory_optimized] = MAX(CASE WHEN kv.inner_key = 'is_memory_optimized' THEN kv.inner_value END)
     , [durability] = MAX(CASE WHEN kv.inner_key = 'durability' THEN kv.inner_value END)
     , [durability_desc] = MAX(CASE WHEN kv.inner_key = 'durability_desc' THEN kv.inner_value END)
     , [temporal_type] = MAX(CASE WHEN kv.inner_key = 'temporal_type' THEN kv.inner_value END)
     , [temporal_type_desc] = MAX(CASE WHEN kv.inner_key = 'temporal_type_desc' THEN kv.inner_value END)
     , [history_table_id] = MAX(CASE WHEN kv.inner_key = 'history_table_id' THEN kv.inner_value END)
     , [is_remote_data_archive_enabled] = MAX(CASE WHEN kv.inner_key = 'is_remote_data_archive_enabled' THEN kv.inner_value END)
     , [is_external] = MAX(CASE WHEN kv.inner_key = 'is_external' THEN kv.inner_value END)
     , [history_retention_period] = MAX(CASE WHEN kv.inner_key = 'history_retention_period' THEN kv.inner_value END)
     , [history_retention_period_unit] = MAX(CASE WHEN kv.inner_key = 'history_retention_period_unit' THEN kv.inner_value END)
     , [history_retention_period_unit_desc] = MAX(CASE WHEN kv.inner_key = 'history_retention_period_unit_desc' THEN kv.inner_value END)
     , [is_node] = MAX(CASE WHEN kv.inner_key = 'is_node' THEN kv.inner_value END)
     , [is_edge] = MAX(CASE WHEN kv.inner_key = 'is_edge' THEN kv.inner_value END)
FROM kv_pairs kv
GROUP BY kv.some_uniqueidentifier
       , kv.some_varchar
       , kv.[key]
ORDER BY kv.some_uniqueidentifier
       , kv.some_varchar
       , kv.[key];
The query plan uses a couple of nested loops joins regardless of how many rows are contained in the input table. Presumably that's an artifact of using the CROSS APPLY
operator. For your convenience, I've already set up a DB Fiddle.
Is there a more efficient way of getting the data out of json format into a "real" set of columns?
I created the code above using my on-prem SQL Server 2019 instance, however the target will be Azure SQL Database, so all the latest-and-greatest options are available.
2 Answers 2
You should directly extract the values from the json array using the WITH
clause, like this:
-- Single-pass shred: OPENJSON with an explicit WITH schema projects each
-- array element straight into typed columns, so no second CROSS APPLY and
-- no GROUP BY pivot are needed. Column types mirror sys.tables.
SELECT jt.*
     , parsed.*
FROM json_test jt
CROSS APPLY OPENJSON(jt.the_json, 'lax $')
    WITH (
        [name] sysname '$.name'
        , [object_id] int '$.object_id'
        , [principal_id] int '$.principal_id'
        , [schema_id] smallint '$.schema_id'
        , [parent_object_id] int '$.parent_object_id'
        , [type] char(2) '$.type'
        , [type_desc] nvarchar(60) '$.type_desc'
        , [create_date] datetime '$.create_date'
        , [modify_date] datetime '$.modify_date'
        , [is_ms_shipped] bit '$.is_ms_shipped'
        , [is_published] bit '$.is_published'
        , [is_schema_published] bit '$.is_schema_published'
        , [lob_data_space_id] int '$.lob_data_space_id'
        , [filestream_data_space_id] int '$.filestream_data_space_id'
        , [max_column_id_used] int '$.max_column_id_used'
        , [lock_on_bulk_load] bit '$.lock_on_bulk_load'
        , [uses_ansi_nulls] bit '$.uses_ansi_nulls'
        , [is_replicated] bit '$.is_replicated'
        , [has_replication_filter] bit '$.has_replication_filter'
        , [is_merge_published] bit '$.is_merge_published'
        , [is_sync_tran_subscribed] bit '$.is_sync_tran_subscribed'
        , [has_unchecked_assembly_data] bit '$.has_unchecked_assembly_data'
        , [text_in_row_limit] int '$.text_in_row_limit'
        , [large_value_types_out_of_row] bit '$.large_value_types_out_of_row'
        , [is_tracked_by_cdc] bit '$.is_tracked_by_cdc'
        , [lock_escalation] tinyint '$.lock_escalation'
        , [lock_escalation_desc] nvarchar(60) '$.lock_escalation_desc'
        , [is_filetable] bit '$.is_filetable'
        , [is_memory_optimized] bit '$.is_memory_optimized'
        , [durability] tinyint '$.durability'
        , [durability_desc] nvarchar(60) '$.durability_desc'
        , [temporal_type] tinyint '$.temporal_type'
        , [temporal_type_desc] nvarchar(60) '$.temporal_type_desc'
        , [history_table_id] int '$.history_table_id'
        , [is_remote_data_archive_enabled] bit '$.is_remote_data_archive_enabled'
        , [is_external] bit '$.is_external'
        , [history_retention_period] int '$.history_retention_period'
        , [history_retention_period_unit] int '$.history_retention_period_unit'
        , [history_retention_period_unit_desc] nvarchar(10) '$.history_retention_period_unit_desc'
        , [is_node] bit '$.is_node'
        , [is_edge] bit '$.is_edge'
    ) AS parsed
This Fiddle compares the output from both methods proving the results are the same.
The only column missing from the output is the key
value generated by the OPENJSON
table-valued-function which is only returned when there is no WITH
clause specified. The plan for my variant only has a single nested loop, and appears to be far more efficient.
I ran across this recently and it is a way of speeding up access to json with index like speeds:
https://bertwagner.com/posts/one-sql-cheat-code-for-amazingly-fast-json-queries/
Not sure if this is allowed but I'll copy and paste some of the blog here:
Recently I've been working with JSON in SQL Server 2016 a lot.
One of the hesitations many people have with using JSON in SQL Server is that they think that querying it must be really slow — SQL is supposed to excel at relational data, not string parsing, right?
It turns out that performance is pretty good with the standalone SQL Server JSON functions. Even better is that it's possible to make queries against JSON data run at ludicrous speeds by using indexes on JSON parsed computed columns . In this post I want to take a look at how SQL is able to parse* with such great performance.
*"Parse" here is actually a lie —it's doing something else behind the scenes. You'll see what I mean, keep reading!
Explore related questions
See similar questions with these tags.