Optimize extraction of json data via OPENJSON

All we need is an easy explanation of the problem, so here it is.

I’m attempting to optimize extraction of values obtained from a REST API which returns json values in an array.

Here’s an minimal, complete, and verifiable example that reflects exactly what I’m doing.

USE tempdb;

DROP TABLE IF EXISTS dbo.json_test;

CREATE TABLE dbo.json_test
(
    json_test_id                int                 NOT NULL
        IDENTITY(1,1)
    , some_uniqueidentifier     uniqueidentifier    NULL
    , some_varchar              varchar(100)        NULL
    , the_json                  nvarchar(max)       NULL
);

INSERT INTO dbo.json_test (some_uniqueidentifier, some_varchar, the_json)
SELECT 
        some_uniqueidentifier       = NEWID()
      , some_varchar                = CONVERT(varchar(100), CRYPT_GEN_RANDOM(64), 1)
      , the_json = (
            SELECT st.* 
            FROM sys.tables st
                CROSS JOIN sys.tables st2
            WHERE st.object_id = t.object_id FOR JSON AUTO
            )
FROM sys.tables t;

;WITH src AS 
(
    SELECT jt.some_uniqueidentifier
        , jt.some_varchar
        , top_array.[key]
        , top_array.[value]
    FROM dbo.json_test jt
        CROSS APPLY OPENJSON(jt.the_json, N'lax $') top_array
),
src2 AS
(
    SELECT src.some_uniqueidentifier
        , src.some_varchar
        , src.[key]
        , src.[value]
        , inner_key = inner_array.[key]
        , inner_value = inner_array.[value]
    FROM src
        CROSS APPLY OPENJSON(src.[value], N'lax $') inner_array
)
SELECT src2.some_uniqueidentifier
    , src2.some_varchar
    , src2.[key]
    , [name]                                = MAX(CASE WHEN src2.[inner_key] = 'name'                               THEN src2.[inner_value] ELSE NULL END)
    , [object_id]                           = MAX(CASE WHEN src2.[inner_key] = 'object_id'                          THEN src2.[inner_value] ELSE NULL END)
    , [principal_id]                        = MAX(CASE WHEN src2.[inner_key] = 'principal_id'                       THEN src2.[inner_value] ELSE NULL END)
    , [schema_id]                           = MAX(CASE WHEN src2.[inner_key] = 'schema_id'                          THEN src2.[inner_value] ELSE NULL END)
    , [parent_object_id]                    = MAX(CASE WHEN src2.[inner_key] = 'parent_object_id'                   THEN src2.[inner_value] ELSE NULL END)
    , [type]                                = MAX(CASE WHEN src2.[inner_key] = 'type'                               THEN src2.[inner_value] ELSE NULL END)
    , [type_desc]                           = MAX(CASE WHEN src2.[inner_key] = 'type_desc'                          THEN src2.[inner_value] ELSE NULL END)
    , [create_date]                         = MAX(CASE WHEN src2.[inner_key] = 'create_date'                        THEN src2.[inner_value] ELSE NULL END)
    , [modify_date]                         = MAX(CASE WHEN src2.[inner_key] = 'modify_date'                        THEN src2.[inner_value] ELSE NULL END)
    , [is_ms_shipped]                       = MAX(CASE WHEN src2.[inner_key] = 'is_ms_shipped'                      THEN src2.[inner_value] ELSE NULL END)
    , [is_published]                        = MAX(CASE WHEN src2.[inner_key] = 'is_published'                       THEN src2.[inner_value] ELSE NULL END)
    , [is_schema_published]                 = MAX(CASE WHEN src2.[inner_key] = 'is_schema_published'                THEN src2.[inner_value] ELSE NULL END)
    , [lob_data_space_id]                   = MAX(CASE WHEN src2.[inner_key] = 'lob_data_space_id'                  THEN src2.[inner_value] ELSE NULL END)
    , [filestream_data_space_id]            = MAX(CASE WHEN src2.[inner_key] = 'filestream_data_space_id'           THEN src2.[inner_value] ELSE NULL END)
    , [max_column_id_used]                  = MAX(CASE WHEN src2.[inner_key] = 'max_column_id_used'                 THEN src2.[inner_value] ELSE NULL END)
    , [lock_on_bulk_load]                   = MAX(CASE WHEN src2.[inner_key] = 'lock_on_bulk_load'                  THEN src2.[inner_value] ELSE NULL END)
    , [uses_ansi_nulls]                     = MAX(CASE WHEN src2.[inner_key] = 'uses_ansi_nulls'                    THEN src2.[inner_value] ELSE NULL END)
    , [is_replicated]                       = MAX(CASE WHEN src2.[inner_key] = 'is_replicated'                      THEN src2.[inner_value] ELSE NULL END)
    , [has_replication_filter]              = MAX(CASE WHEN src2.[inner_key] = 'has_replication_filter'             THEN src2.[inner_value] ELSE NULL END)
    , [is_merge_published]                  = MAX(CASE WHEN src2.[inner_key] = 'is_merge_published'                 THEN src2.[inner_value] ELSE NULL END)
    , [is_sync_tran_subscribed]             = MAX(CASE WHEN src2.[inner_key] = 'is_sync_tran_subscribed'            THEN src2.[inner_value] ELSE NULL END)
    , [has_unchecked_assembly_data]         = MAX(CASE WHEN src2.[inner_key] = 'has_unchecked_assembly_data'        THEN src2.[inner_value] ELSE NULL END)
    , [text_in_row_limit]                   = MAX(CASE WHEN src2.[inner_key] = 'text_in_row_limit'                  THEN src2.[inner_value] ELSE NULL END)
    , [large_value_types_out_of_row]        = MAX(CASE WHEN src2.[inner_key] = 'large_value_types_out_of_row'       THEN src2.[inner_value] ELSE NULL END)
    , [is_tracked_by_cdc]                   = MAX(CASE WHEN src2.[inner_key] = 'is_tracked_by_cdc'                  THEN src2.[inner_value] ELSE NULL END)
    , [lock_escalation]                     = MAX(CASE WHEN src2.[inner_key] = 'lock_escalation'                    THEN src2.[inner_value] ELSE NULL END)
    , [lock_escalation_desc]                = MAX(CASE WHEN src2.[inner_key] = 'lock_escalation_desc'               THEN src2.[inner_value] ELSE NULL END)
    , [is_filetable]                        = MAX(CASE WHEN src2.[inner_key] = 'is_filetable'                       THEN src2.[inner_value] ELSE NULL END)
    , [is_memory_optimized]                 = MAX(CASE WHEN src2.[inner_key] = 'is_memory_optimized'                THEN src2.[inner_value] ELSE NULL END)
    , [durability]                          = MAX(CASE WHEN src2.[inner_key] = 'durability'                         THEN src2.[inner_value] ELSE NULL END)
    , [durability_desc]                     = MAX(CASE WHEN src2.[inner_key] = 'durability_desc'                    THEN src2.[inner_value] ELSE NULL END)
    , [temporal_type]                       = MAX(CASE WHEN src2.[inner_key] = 'temporal_type'                      THEN src2.[inner_value] ELSE NULL END)
    , [temporal_type_desc]                  = MAX(CASE WHEN src2.[inner_key] = 'temporal_type_desc'                 THEN src2.[inner_value] ELSE NULL END)
    , [history_table_id]                    = MAX(CASE WHEN src2.[inner_key] = 'history_table_id'                   THEN src2.[inner_value] ELSE NULL END)
    , [is_remote_data_archive_enabled]      = MAX(CASE WHEN src2.[inner_key] = 'is_remote_data_archive_enabled'     THEN src2.[inner_value] ELSE NULL END)
    , [is_external]                         = MAX(CASE WHEN src2.[inner_key] = 'is_external'                        THEN src2.[inner_value] ELSE NULL END)
    , [history_retention_period]            = MAX(CASE WHEN src2.[inner_key] = 'history_retention_period'           THEN src2.[inner_value] ELSE NULL END)
    , [history_retention_period_unit]       = MAX(CASE WHEN src2.[inner_key] = 'history_retention_period_unit'      THEN src2.[inner_value] ELSE NULL END)
    , [history_retention_period_unit_desc]  = MAX(CASE WHEN src2.[inner_key] = 'history_retention_period_unit_desc' THEN src2.[inner_value] ELSE NULL END)
    , [is_node]                             = MAX(CASE WHEN src2.[inner_key] = 'is_node'                            THEN src2.[inner_value] ELSE NULL END)
    , [is_edge]                             = MAX(CASE WHEN src2.[inner_key] = 'is_edge'                            THEN src2.[inner_value] ELSE NULL END)
FROM src2
GROUP BY src2.some_uniqueidentifier
    , src2.some_varchar
    , src2.[key]
ORDER BY src2.some_uniqueidentifier
    , src2.some_varchar
    , src2.[key];

The query plan uses a couple of nested loops joins regardless of how many rows are contained in the input table. Presumably that’s an artifact of using the CROSS APPLY operator. For your fun, I’ve already set up a DB Fiddle.

Is there a more efficient way of getting the data out of json format into a "real" set of columns?

I created the code above using my on-prem SQL Server 2019 instance, however the target will be Azure SQL Database, so all the latest-and-greatest options are available.

How to solve :

I know you bored from this bug, So we are here to help you! Take a deep breath and look at the explanation of your problem. We have many solutions to this problem, But we recommend you to use the first method because it is tested & true method that will 100% work for you.

Method 1

You should directly extract the values from the json array using the WITH clause, like this:

SELECT src.*
    , tt.*
FROM json_test src
CROSS APPLY OPENJSON(src.the_json , 'lax $')
    WITH (
          [name]                               sysname       '$.name'
        , [object_id]                          int           '$.object_id'
        , [principal_id]                       int           '$.principal_id'
        , [schema_id]                          smallint      '$.schema_id'
        , [parent_object_id]                   int           '$.parent_object_id'
        , [type]                               char(2)       '$.type'
        , [type_desc]                          nvarchar(60)  '$.type_desc'
        , [create_date]                        datetime      '$.create_date'
        , [modify_date]                        datetime      '$.modify_date'
        , [is_ms_shipped]                      bit           '$.is_ms_shipped'
        , [is_published]                       bit           '$.is_published'
        , [is_schema_published]                bit           '$.is_schema_published'
        , [lob_data_space_id]                  int           '$.lob_data_space_id'
        , [filestream_data_space_id]           int           '$.filestream_data_space_id'
        , [max_column_id_used]                 int           '$.max_column_id_used'
        , [lock_on_bulk_load]                  bit           '$.lock_on_bulk_load'
        , [uses_ansi_nulls]                    bit           '$.uses_ansi_nulls'
        , [is_replicated]                      bit           '$.is_replicated'
        , [has_replication_filter]             bit           '$.has_replication_filter'
        , [is_merge_published]                 bit           '$.is_merge_published'
        , [is_sync_tran_subscribed]            bit           '$.is_sync_tran_subscribed'
        , [has_unchecked_assembly_data]        bit           '$.has_unchecked_assembly_data'
        , [text_in_row_limit]                  int           '$.text_in_row_limit'
        , [large_value_types_out_of_row]       bit           '$.large_value_types_out_of_row'
        , [is_tracked_by_cdc]                  bit           '$.is_tracked_by_cdc'
        , [lock_escalation]                    tinyint       '$.lock_escalation'
        , [lock_escalation_desc]               nvarchar(60)  '$.lock_escalation_desc'
        , [is_filetable]                       bit           '$.is_filetable'
        , [is_memory_optimized]                bit           '$.is_memory_optimized'
        , [durability]                         tinyint       '$.durability'
        , [durability_desc]                    nvarchar(60)  '$.durability_desc'
        , [temporal_type]                      tinyint       '$.temporal_type'
        , [temporal_type_desc]                 nvarchar(60)  '$.temporal_type_desc'
        , [history_table_id]                   int           '$.history_table_id'
        , [is_remote_data_archive_enabled]     bit           '$.is_remote_data_archive_enabled'
        , [is_external]                        bit           '$.is_external'
        , [history_retention_period]           int           '$.history_retention_period'
        , [history_retention_period_unit]      int           '$.history_retention_period_unit'
        , [history_retention_period_unit_desc] nvarchar(10)  '$.history_retention_period_unit_desc'
        , [is_node]                            bit           '$.is_node'
        , [is_edge]                            bit           '$.is_edge'
    ) AS tt

This Fiddle compares the output from both methods proving the results are the same.

The only column missing from the output is the key value generated by the OPENJSON table-valued-function which is only returned when there is no WITH clause specified. The plan for my variant only has a single nested loop, and appears to be far more efficient.

Optimize extraction of json data via OPENJSON

Microsoft Docs example for OPENJSON CROSS APPLY

Note: Use and implement method 1 because this method fully tested our system.
Thank you 🙂

All methods was sourced from stackoverflow.com or stackexchange.com, is licensed under cc by-sa 2.5, cc by-sa 3.0 and cc by-sa 4.0

Leave a Reply