I'm attempting to optimize extraction of values obtained from a REST API which returns json values in an array.
Here's a minimal, complete, and verifiable example that reflects exactly what I'm doing.
USE tempdb;

DROP TABLE IF EXISTS dbo.json_test;

-- Test-harness table: one row per source table, carrying a random GUID,
-- a random hex string, and a JSON-array payload to be shredded by OPENJSON.
CREATE TABLE dbo.json_test
(
    json_test_id int NOT NULL
        IDENTITY(1,1)
        -- named PK added: the identity column was previously unconstrained,
        -- which allowed duplicate-free lookups only by accident
        CONSTRAINT json_test_pk PRIMARY KEY
    , some_uniqueidentifier uniqueidentifier NULL
    , some_varchar varchar(100) NULL
    , the_json nvarchar(max) NULL   -- raw JSON array produced by FOR JSON AUTO
);
-- Seed one row per user table in the current database; the_json holds a
-- FOR JSON AUTO array built from the matching sys.tables row.
INSERT INTO dbo.json_test (some_uniqueidentifier, some_varchar, the_json)
SELECT
-- random GUID per row
some_uniqueidentifier = NEWID()
-- 64 random bytes rendered as a '0x...' hex string (style 1)
, some_varchar = CONVERT(varchar(100), CRYPT_GEN_RANDOM(64), 1)
, the_json = (
SELECT st.*
FROM sys.tables st
-- NOTE(review): only st.* is selected, so this CROSS JOIN repeats the same
-- object once per row of st2 — the JSON array ends up holding N identical
-- elements. Presumably deliberate payload inflation for the performance
-- comparison; confirm before "simplifying" it away.
CROSS JOIN sys.tables st2
WHERE st.object_id = t.object_id FOR JSON AUTO
)
FROM sys.tables t;
-- Two-pass shred of the_json: the first CTE breaks the top-level array into
-- elements, the second breaks each element object into key/value pairs, and
-- the final SELECT pivots those pairs back into named columns via MAX(CASE).
WITH outer_kv AS
(
    -- one row per element of the top-level JSON array
    SELECT jt.some_uniqueidentifier
         , jt.some_varchar
         , arr.[key]
         , arr.[value]
    FROM dbo.json_test jt
    CROSS APPLY OPENJSON(jt.the_json, N'lax $') arr
),
kv_pairs AS
(
    -- one row per property of each array element
    SELECT o.some_uniqueidentifier
         , o.some_varchar
         , o.[key]
         , o.[value]
         , inner_key = obj.[key]
         , inner_value = obj.[value]
    FROM outer_kv o
    CROSS APPLY OPENJSON(o.[value], N'lax $') obj
)
SELECT kv.some_uniqueidentifier
     , kv.some_varchar
     , kv.[key]
     -- manual pivot: one MAX(CASE ...) per expected JSON property
     -- (CASE with no ELSE yields NULL, same as the explicit ELSE NULL form)
     , [name] = MAX(CASE WHEN kv.inner_key = 'name' THEN kv.inner_value END)
     , [object_id] = MAX(CASE WHEN kv.inner_key = 'object_id' THEN kv.inner_value END)
     , [principal_id] = MAX(CASE WHEN kv.inner_key = 'principal_id' THEN kv.inner_value END)
     , [schema_id] = MAX(CASE WHEN kv.inner_key = 'schema_id' THEN kv.inner_value END)
     , [parent_object_id] = MAX(CASE WHEN kv.inner_key = 'parent_object_id' THEN kv.inner_value END)
     , [type] = MAX(CASE WHEN kv.inner_key = 'type' THEN kv.inner_value END)
     , [type_desc] = MAX(CASE WHEN kv.inner_key = 'type_desc' THEN kv.inner_value END)
     , [create_date] = MAX(CASE WHEN kv.inner_key = 'create_date' THEN kv.inner_value END)
     , [modify_date] = MAX(CASE WHEN kv.inner_key = 'modify_date' THEN kv.inner_value END)
     , [is_ms_shipped] = MAX(CASE WHEN kv.inner_key = 'is_ms_shipped' THEN kv.inner_value END)
     , [is_published] = MAX(CASE WHEN kv.inner_key = 'is_published' THEN kv.inner_value END)
     , [is_schema_published] = MAX(CASE WHEN kv.inner_key = 'is_schema_published' THEN kv.inner_value END)
     , [lob_data_space_id] = MAX(CASE WHEN kv.inner_key = 'lob_data_space_id' THEN kv.inner_value END)
     , [filestream_data_space_id] = MAX(CASE WHEN kv.inner_key = 'filestream_data_space_id' THEN kv.inner_value END)
     , [max_column_id_used] = MAX(CASE WHEN kv.inner_key = 'max_column_id_used' THEN kv.inner_value END)
     , [lock_on_bulk_load] = MAX(CASE WHEN kv.inner_key = 'lock_on_bulk_load' THEN kv.inner_value END)
     , [uses_ansi_nulls] = MAX(CASE WHEN kv.inner_key = 'uses_ansi_nulls' THEN kv.inner_value END)
     , [is_replicated] = MAX(CASE WHEN kv.inner_key = 'is_replicated' THEN kv.inner_value END)
     , [has_replication_filter] = MAX(CASE WHEN kv.inner_key = 'has_replication_filter' THEN kv.inner_value END)
     , [is_merge_published] = MAX(CASE WHEN kv.inner_key = 'is_merge_published' THEN kv.inner_value END)
     , [is_sync_tran_subscribed] = MAX(CASE WHEN kv.inner_key = 'is_sync_tran_subscribed' THEN kv.inner_value END)
     , [has_unchecked_assembly_data] = MAX(CASE WHEN kv.inner_key = 'has_unchecked_assembly_data' THEN kv.inner_value END)
     , [text_in_row_limit] = MAX(CASE WHEN kv.inner_key = 'text_in_row_limit' THEN kv.inner_value END)
     , [large_value_types_out_of_row] = MAX(CASE WHEN kv.inner_key = 'large_value_types_out_of_row' THEN kv.inner_value END)
     , [is_tracked_by_cdc] = MAX(CASE WHEN kv.inner_key = 'is_tracked_by_cdc' THEN kv.inner_value END)
     , [lock_escalation] = MAX(CASE WHEN kv.inner_key = 'lock_escalation' THEN kv.inner_value END)
     , [lock_escalation_desc] = MAX(CASE WHEN kv.inner_key = 'lock_escalation_desc' THEN kv.inner_value END)
     , [is_filetable] = MAX(CASE WHEN kv.inner_key = 'is_filetable' THEN kv.inner_value END)
     , [is_memory_optimized] = MAX(CASE WHEN kv.inner_key = 'is_memory_optimized' THEN kv.inner_value END)
     , [durability] = MAX(CASE WHEN kv.inner_key = 'durability' THEN kv.inner_value END)
     , [durability_desc] = MAX(CASE WHEN kv.inner_key = 'durability_desc' THEN kv.inner_value END)
     , [temporal_type] = MAX(CASE WHEN kv.inner_key = 'temporal_type' THEN kv.inner_value END)
     , [temporal_type_desc] = MAX(CASE WHEN kv.inner_key = 'temporal_type_desc' THEN kv.inner_value END)
     , [history_table_id] = MAX(CASE WHEN kv.inner_key = 'history_table_id' THEN kv.inner_value END)
     , [is_remote_data_archive_enabled] = MAX(CASE WHEN kv.inner_key = 'is_remote_data_archive_enabled' THEN kv.inner_value END)
     , [is_external] = MAX(CASE WHEN kv.inner_key = 'is_external' THEN kv.inner_value END)
     , [history_retention_period] = MAX(CASE WHEN kv.inner_key = 'history_retention_period' THEN kv.inner_value END)
     , [history_retention_period_unit] = MAX(CASE WHEN kv.inner_key = 'history_retention_period_unit' THEN kv.inner_value END)
     , [history_retention_period_unit_desc] = MAX(CASE WHEN kv.inner_key = 'history_retention_period_unit_desc' THEN kv.inner_value END)
     , [is_node] = MAX(CASE WHEN kv.inner_key = 'is_node' THEN kv.inner_value END)
     , [is_edge] = MAX(CASE WHEN kv.inner_key = 'is_edge' THEN kv.inner_value END)
FROM kv_pairs kv
GROUP BY kv.some_uniqueidentifier
       , kv.some_varchar
       , kv.[key]
ORDER BY kv.some_uniqueidentifier
       , kv.some_varchar
       , kv.[key];
The query plan uses a couple of nested loops joins regardless of how many rows are contained in the input table. Presumably that's an artifact of using the CROSS APPLY
operator. For your convenience, I've already set up a DB Fiddle.
Is there a more efficient way of getting the data out of json format into a "real" set of columns?
I created the code above using my on-prem SQL Server 2019 instance, however the target will be Azure SQL Database, so all the latest-and-greatest options are available.
2 Answers 2
You should directly extract the values from the json array using the WITH
clause, like this:
-- Single-pass shred: OPENJSON with an explicit WITH schema projects each
-- array element straight into typed columns, so no second CROSS APPLY and
-- no GROUP BY pivot are needed. Column types mirror sys.tables.
SELECT jt.*
     , parsed.*
FROM json_test jt
CROSS APPLY OPENJSON(jt.the_json, 'lax $')
    WITH (
        [name] sysname '$.name'
        , [object_id] int '$.object_id'
        , [principal_id] int '$.principal_id'
        , [schema_id] smallint '$.schema_id'
        , [parent_object_id] int '$.parent_object_id'
        , [type] char(2) '$.type'
        , [type_desc] nvarchar(60) '$.type_desc'
        , [create_date] datetime '$.create_date'
        , [modify_date] datetime '$.modify_date'
        , [is_ms_shipped] bit '$.is_ms_shipped'
        , [is_published] bit '$.is_published'
        , [is_schema_published] bit '$.is_schema_published'
        , [lob_data_space_id] int '$.lob_data_space_id'
        , [filestream_data_space_id] int '$.filestream_data_space_id'
        , [max_column_id_used] int '$.max_column_id_used'
        , [lock_on_bulk_load] bit '$.lock_on_bulk_load'
        , [uses_ansi_nulls] bit '$.uses_ansi_nulls'
        , [is_replicated] bit '$.is_replicated'
        , [has_replication_filter] bit '$.has_replication_filter'
        , [is_merge_published] bit '$.is_merge_published'
        , [is_sync_tran_subscribed] bit '$.is_sync_tran_subscribed'
        , [has_unchecked_assembly_data] bit '$.has_unchecked_assembly_data'
        , [text_in_row_limit] int '$.text_in_row_limit'
        , [large_value_types_out_of_row] bit '$.large_value_types_out_of_row'
        , [is_tracked_by_cdc] bit '$.is_tracked_by_cdc'
        , [lock_escalation] tinyint '$.lock_escalation'
        , [lock_escalation_desc] nvarchar(60) '$.lock_escalation_desc'
        , [is_filetable] bit '$.is_filetable'
        , [is_memory_optimized] bit '$.is_memory_optimized'
        , [durability] tinyint '$.durability'
        , [durability_desc] nvarchar(60) '$.durability_desc'
        , [temporal_type] tinyint '$.temporal_type'
        , [temporal_type_desc] nvarchar(60) '$.temporal_type_desc'
        , [history_table_id] int '$.history_table_id'
        , [is_remote_data_archive_enabled] bit '$.is_remote_data_archive_enabled'
        , [is_external] bit '$.is_external'
        , [history_retention_period] int '$.history_retention_period'
        , [history_retention_period_unit] int '$.history_retention_period_unit'
        , [history_retention_period_unit_desc] nvarchar(10) '$.history_retention_period_unit_desc'
        , [is_node] bit '$.is_node'
        , [is_edge] bit '$.is_edge'
    ) AS parsed
This Fiddle compares the output from both methods proving the results are the same.
The only column missing from the output is the key
value generated by the OPENJSON
table-valued-function which is only returned when there is no WITH
clause specified. The plan for my variant only has a single nested loop, and appears to be far more efficient.
I ran across this recently and it is a way of speeding up access to json with index like speeds:
https://bertwagner.com/posts/one-sql-cheat-code-for-amazingly-fast-json-queries/
Not sure if this is allowed but I'll copy and paste some of the blog here:
Recently I've been working with JSON in SQL Server 2016 a lot.
One of the hesitations many people have with using JSON in SQL Server is that they think that querying it must be really slow — SQL is supposed to excel at relational data, not string parsing, right?
It turns out that performance is pretty good with the standalone SQL Server JSON functions. Even better is that it's possible to make queries against JSON data run at ludicrous speeds by using indexes on JSON parsed computed columns . In this post I want to take a look at how SQL is able to parse* with such great performance.
*"Parse" here is actually a lie —it's doing something else behind the scenes. You'll see what I mean, keep reading!
Explore related questions
See similar questions with these tags.