From 5cff2697363496dd8fdb6c088803428df10966ab Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Tue, 17 Feb 2026 17:57:35 +0100 Subject: [PATCH 1/3] MergeTreeData: force Wide part format when the table has deprecated Object columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixes #1412 Here's the analysis of #1412 by Claude: Root cause: The old Object('json') type converts JSON data to nested Tuple structures before storage. When parts have different JSON schemas, reading subcolumns from compact parts without per-substream marks fails — the compact reader's deserialization can't properly handle the complex nested type serialization, leading to tuple elements being deserialized with mismatched sizes. Crash: Logical error: 'Unexpected size of tuple element 1: 0. Expected size: 1' in SerializationTuple::deserializeBinaryBulkWithMultipleStreams Recommended Fix: Force wide parts for tables with deprecated Object columns In MergeTreeData::choosePartFormat, detect tables with deprecated Object columns and always choose Wide format. Wide parts work correctly with all ClickHouse versions and handle complex nested types properly. 
This is clean and targeted because: - Only affects tables using the deprecated Object('json') type (narrow scope) - The enum ordering (Wide=0 < Compact=1) means std::min in merge logic will pick Wide when choosePartFormat returns it, so even existing compact parts get rewritten to Wide during natural merges - No changes needed to the complex compact reader deserialization code --- src/Storages/MergeTree/MergeTreeData.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 3e9926239672..58b5efae215d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4461,6 +4461,14 @@ MergeTreeDataPartFormat MergeTreeData::choosePartFormat(size_t bytes_uncompresse using PartType = MergeTreeDataPartType; using PartStorageType = MergeTreeDataPartStorageType; + /// Deprecated Object type doesn't work correctly with compact parts + /// when write_marks_for_substreams_in_compact_parts is disabled, + /// because the compact reader cannot properly handle missing substreams + /// in complex nested Tuple structures that Object converts to. + /// Force wide parts for such tables unconditionally (the setting itself is not checked here) to avoid deserialization crashes. 
+ if (hasDynamicSubcolumnsDeprecated(getInMemoryMetadataPtr()->getColumns())) + return {PartType::Wide, PartStorageType::Full}; + String out_reason; const auto settings = getSettings(); if (!canUsePolymorphicParts(*settings, out_reason)) From afbc9381897ea863774f8606d91a440afe831143 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Tue, 17 Feb 2026 18:13:09 +0100 Subject: [PATCH 2/3] added a test --- ...03779_object_in_compact_mergetree_parts.reference | 2 ++ .../03779_object_in_compact_mergetree_parts.sql | 12 ++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference create mode 100644 tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql diff --git a/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference new file mode 100644 index 000000000000..1acdfbd34c12 --- /dev/null +++ b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference @@ -0,0 +1,2 @@ +1 [['aaa','ccc']] [['bbb','']] [[0,0]] [''] +2 [['','']] [['ddd','']] [[10,20]] ['foo'] diff --git a/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql new file mode 100644 index 000000000000..e21ae90b2dfa --- /dev/null +++ b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql @@ -0,0 +1,12 @@ +SET allow_experimental_object_type = 1; +DROP TABLE IF EXISTS t_json_complex; +CREATE TABLE t_json_complex (id UInt32, arr Array(Object('json'))) +ENGINE = MergeTree ORDER BY id; + +-- Insert data with nested arrays inside JSON objects +INSERT INTO t_json_complex FORMAT JSONEachRow {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]} + +INSERT INTO t_json_complex FORMAT JSONEachRow {"id": 2, "arr": [{"k1": [{"k3": "ddd", "k4": 10}, {"k4": 20}], "k5": {"k6": "foo"}}]} + +-- This query 
used to crash the server +SELECT id, arr.k1.k2, arr.k1.k3, arr.k1.k4, arr.k5.k6 FROM t_json_complex ORDER BY id; From 0cb21ed20007de810650814df7e78aa7d94a0e7f Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Tue, 17 Feb 2026 19:45:48 +0100 Subject: [PATCH 3/3] test with setting explicitly disabled and enabled --- ...bject_in_compact_mergetree_parts.reference | 2 ++ ...3779_object_in_compact_mergetree_parts.sql | 22 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference index 1acdfbd34c12..53742b87fafc 100644 --- a/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference +++ b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.reference @@ -1,2 +1,4 @@ 1 [['aaa','ccc']] [['bbb','']] [[0,0]] [''] 2 [['','']] [['ddd','']] [[10,20]] ['foo'] +1 [['aaa','ccc']] [['bbb','']] [[0,0]] [''] +2 [['','']] [['ddd','']] [[10,20]] ['foo'] diff --git a/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql index e21ae90b2dfa..06ab06abca96 100644 --- a/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql +++ b/tests/queries/0_stateless/03779_object_in_compact_mergetree_parts.sql @@ -1,7 +1,10 @@ SET allow_experimental_object_type = 1; + +-- Test with the setting being disabled DROP TABLE IF EXISTS t_json_complex; CREATE TABLE t_json_complex (id UInt32, arr Array(Object('json'))) -ENGINE = MergeTree ORDER BY id; +ENGINE = MergeTree ORDER BY id +SETTINGS write_marks_for_substreams_in_compact_parts=0; -- Insert data with nested arrays inside JSON objects INSERT INTO t_json_complex FORMAT JSONEachRow {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]} @@ -10,3 +13,20 @@ INSERT INTO t_json_complex FORMAT JSONEachRow {"id": 2, "arr": [{"k1": [{"k3": 
" -- This query used to crash the server SELECT id, arr.k1.k2, arr.k1.k3, arr.k1.k4, arr.k5.k6 FROM t_json_complex ORDER BY id; +DROP TABLE t_json_complex; + +-- Now test with the setting explicitly enabled +DROP TABLE IF EXISTS t_json_complex_compact_parts; +CREATE TABLE t_json_complex_compact_parts (id UInt32, arr Array(Object('json'))) +ENGINE = MergeTree ORDER BY id +SETTINGS write_marks_for_substreams_in_compact_parts=1; + +-- Insert data with nested arrays inside JSON objects +INSERT INTO t_json_complex_compact_parts FORMAT JSONEachRow {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]} + +INSERT INTO t_json_complex_compact_parts FORMAT JSONEachRow {"id": 2, "arr": [{"k1": [{"k3": "ddd", "k4": 10}, {"k4": 20}], "k5": {"k6": "foo"}}]} + +-- This query used to crash the server +SELECT id, arr.k1.k2, arr.k1.k3, arr.k1.k4, arr.k5.k6 FROM t_json_complex_compact_parts ORDER BY id; +DROP TABLE t_json_complex_compact_parts; +