From 80da189c839d948eedda50965d235f7353325550 Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Mon, 6 Apr 2026 16:36:16 -0300 Subject: [PATCH] add quickwit-datafusion crate --- quickwit/Cargo.lock | 1389 ++++++++++++++++- quickwit/Cargo.toml | 3 + quickwit/quickwit-datafusion/Cargo.toml | 52 + quickwit/quickwit-datafusion/src/catalog.rs | 137 ++ .../quickwit-datafusion/src/data_source.rs | 350 +++++ quickwit/quickwit-datafusion/src/lib.rs | 59 + quickwit/quickwit-datafusion/src/resolver.rs | 67 + quickwit/quickwit-datafusion/src/service.rs | 161 ++ quickwit/quickwit-datafusion/src/session.rs | 294 ++++ .../src/sources/metrics/factory.rs | 89 ++ .../src/sources/metrics/index_resolver.rs | 193 +++ .../src/sources/metrics/metastore_provider.rs | 153 ++ .../src/sources/metrics/mod.rs | 235 +++ .../src/sources/metrics/predicate.rs | 516 ++++++ .../src/sources/metrics/table_provider.rs | 209 +++ .../src/sources/metrics/test_utils.rs | 387 +++++ .../quickwit-datafusion/src/sources/mod.rs | 17 + .../quickwit-datafusion/src/storage_bridge.rs | 209 +++ quickwit/quickwit-datafusion/src/substrait.rs | 278 ++++ .../quickwit-datafusion/src/task_estimator.rs | 64 + .../quickwit-datafusion/src/test_utils.rs | 18 + quickwit/quickwit-datafusion/src/worker.rs | 119 ++ .../quickwit-integration-tests/Cargo.toml | 8 + .../src/test_utils/cluster_sandbox.rs | 5 + .../src/tests/metrics_datafusion_tests.rs | 968 ++++++++++++ .../src/tests/metrics_distributed_tests.rs | 321 ++++ .../src/tests/mod.rs | 2 + .../src/tests/rollup_substrait.json | 20 + quickwit/quickwit-proto/build.rs | 12 + .../protos/quickwit/datafusion.proto | 69 + .../codegen/quickwit/quickwit.datafusion.rs | 464 ++++++ quickwit/quickwit-proto/src/datafusion/mod.rs | 18 + quickwit/quickwit-proto/src/lib.rs | 1 + quickwit/quickwit-serve/Cargo.toml | 5 + .../src/datafusion_api/grpc_handler.rs | 174 +++ .../quickwit-serve/src/datafusion_api/mod.rs | 16 + quickwit/quickwit-serve/src/grpc.rs | 55 +- 
quickwit/quickwit-serve/src/lib.rs | 69 +- 38 files changed, 7152 insertions(+), 54 deletions(-) create mode 100644 quickwit/quickwit-datafusion/Cargo.toml create mode 100644 quickwit/quickwit-datafusion/src/catalog.rs create mode 100644 quickwit/quickwit-datafusion/src/data_source.rs create mode 100644 quickwit/quickwit-datafusion/src/lib.rs create mode 100644 quickwit/quickwit-datafusion/src/resolver.rs create mode 100644 quickwit/quickwit-datafusion/src/service.rs create mode 100644 quickwit/quickwit-datafusion/src/session.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/metrics/factory.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/metrics/index_resolver.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/metrics/metastore_provider.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/metrics/mod.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/metrics/predicate.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/metrics/table_provider.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/metrics/test_utils.rs create mode 100644 quickwit/quickwit-datafusion/src/sources/mod.rs create mode 100644 quickwit/quickwit-datafusion/src/storage_bridge.rs create mode 100644 quickwit/quickwit-datafusion/src/substrait.rs create mode 100644 quickwit/quickwit-datafusion/src/task_estimator.rs create mode 100644 quickwit/quickwit-datafusion/src/test_utils.rs create mode 100644 quickwit/quickwit-datafusion/src/worker.rs create mode 100644 quickwit/quickwit-integration-tests/src/tests/metrics_datafusion_tests.rs create mode 100644 quickwit/quickwit-integration-tests/src/tests/metrics_distributed_tests.rs create mode 100644 quickwit/quickwit-integration-tests/src/tests/rollup_substrait.json create mode 100644 quickwit/quickwit-proto/protos/quickwit/datafusion.proto create mode 100644 quickwit/quickwit-proto/src/codegen/quickwit/quickwit.datafusion.rs create mode 100644 
quickwit/quickwit-proto/src/datafusion/mod.rs create mode 100644 quickwit/quickwit-serve/src/datafusion_api/grpc_handler.rs create mode 100644 quickwit/quickwit-serve/src/datafusion_api/mod.rs diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index ae4e5661a50..0a70ffa10b4 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -109,6 +109,21 @@ dependencies = [ "equator", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "alloca" version = "0.4.0" @@ -214,6 +229,15 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + [[package]] name = "arc-swap" version = "1.9.0" @@ -223,6 +247,12 @@ dependencies = [ "rustversion", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" @@ -239,8 +269,10 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-cast", + "arrow-csv", "arrow-data", "arrow-ipc", + "arrow-json", "arrow-ord", "arrow-row", "arrow-schema", @@ -273,6 +305,7 @@ dependencies = [ "arrow-data", "arrow-schema", "chrono", + "chrono-tz", "half", "hashbrown 0.16.1", "num-complex", @@ -307,12 +340,28 @@ 
dependencies = [ "atoi", "base64 0.22.1", "chrono", + "comfy-table", "half", "lexical-core", "num-traits", "ryu", ] +[[package]] +name = "arrow-csv" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + [[package]] name = "arrow-data" version = "57.3.0" @@ -326,6 +375,26 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-flight" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58c5b083668e6230eae3eab2fc4b5fb989974c845d0aa538dde61a4327c78675" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ipc", + "arrow-schema", + "base64 0.22.1", + "bytes", + "futures", + "prost 0.14.3", + "prost-types 0.14.3", + "tonic 0.14.5", + "tonic-prost", +] + [[package]] name = "arrow-ipc" version = "57.3.0" @@ -338,6 +407,32 @@ dependencies = [ "arrow-schema", "arrow-select", "flatbuffers", + "lz4_flex 0.12.1", + "zstd", +] + +[[package]] +name = "arrow-json" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.13.0", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", ] [[package]] @@ -508,6 +603,17 @@ dependencies = [ "rustix 1.1.4", ] +[[package]] +name = "async-recursion" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "async-signal" version = "0.2.13" @@ -1415,6 
+1521,28 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bindgen" version = "0.72.1" @@ -1490,6 +1618,29 @@ dependencies = [ "crunchy", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures 0.3.0", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -1561,6 +1712,27 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + 
[[package]] name = "bs58" version = "0.5.1" @@ -1631,6 +1803,15 @@ dependencies = [ "bytes", ] +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + [[package]] name = "camino" version = "1.2.2" @@ -1980,6 +2161,16 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +dependencies = [ + "unicode-segmentation", + "unicode-width 0.2.2", +] + [[package]] name = "community-id" version = "0.2.4" @@ -1997,8 +2188,10 @@ version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7" dependencies = [ + "bzip2", "compression-core", "flate2", + "liblzma", "memchr", "zstd", "zstd-safe", @@ -2115,6 +2308,12 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413d67b29ef1021b4d60f4aa1e925ca031751e213832b4b1d588fae623c05c60" +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "convert_case" version = "0.7.1" @@ -2511,80 +2710,838 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be1e0bca6c3637f992fc1cc7cbc52a78c1ef6db076dbf1059c4323d6a2048376" [[package]] -name = "datasketches" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745" - -[[package]] -name = "dbl" -version = "0.3.2" +name = "datafusion" +version = "52.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd2735a791158376708f9347fe8faba9667589d82427ef3aed6794a8981de3d9" +checksum = "43c18ba387f9c05ac1f3be32a73f8f3cc6c1cfc43e5d4b7a8e5b0d3a5eb48dc7" dependencies = [ - "generic-array", + "arrow", + "arrow-schema", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools 0.14.0", + "liblzma", + "log", + "object_store", + "parking_lot 0.12.5", + "parquet", + "rand 0.9.2", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "zstd", ] [[package]] -name = "deadpool" -version = "0.12.3" +name = "datafusion-catalog" +version = "52.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0be2b1d1d6ec8d846f05e137292d0b89133caf95ef33695424c09568bdd39b1b" +checksum = "3c75a4ce672b27fb8423810efb92a3600027717a1664d06a2c307eeeabcec694" dependencies = [ - "deadpool-runtime", - "lazy_static", - "num_cpus", + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + 
"object_store", + "parking_lot 0.12.5", "tokio", ] [[package]] -name = "deadpool-runtime" -version = "0.1.4" +name = "datafusion-catalog-listing" +version = "52.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" +checksum = "2c8b9a3795ffb46bf4957a34c67d89a67558b311ae455c8d4295ff2115eeea50" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "itertools 0.14.0", + "log", + "object_store", +] [[package]] -name = "debugid" -version = "0.8.0" +name = "datafusion-common" +version = "52.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +checksum = "205dc1e20441973f470e6b7ef87626a3b9187970e5106058fef1b713047f770c" dependencies = [ - "uuid", + "ahash", + "arrow", + "arrow-ipc", + "chrono", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "libc", + "log", + "object_store", + "parquet", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", ] [[package]] -name = "der" -version = "0.6.1" +name = "datafusion-common-runtime" +version = "52.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +checksum = "8cf5880c02ff6f5f11fb5bc19211789fb32fd3c53d79b7d6cb2b12e401312ba0" dependencies = [ - "const-oid", - "zeroize", + "futures", + "log", + "tokio", ] [[package]] -name = "der" -version = "0.7.10" +name = "datafusion-datasource" +version = "52.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +checksum = 
"bc614d6e709450e29b7b032a42c1bdb705f166a6b2edef7bed7c7897eb905499" dependencies = [ - "const-oid", - "pem-rfc7468", - "zeroize", + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "liblzma", + "log", + "object_store", + "rand 0.9.2", + "tokio", + "tokio-util", + "url", + "zstd", ] [[package]] -name = "deranged" -version = "0.5.8" +name = "datafusion-datasource-arrow" +version = "52.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +checksum = "6e497d5fc48dac7ce86f6b4fb09a3a494385774af301ff20ec91aebfae9b05b4" dependencies = [ - "powerfmt", - "serde_core", + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "object_store", + "tokio", ] [[package]] -name = "derivative" +name = "datafusion-datasource-csv" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dfc250cad940d0327ca2e9109dc98830892d17a3d6b2ca11d68570e872cf379" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "52.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91e9677ed62833b0e8129dec0d1a8f3c9bb7590bd6dd714a43e4c3b663e4aa0" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23798383465e0c569bd442d1453b50691261f8ad6511d840c48457b3bf51ae21" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot 0.12.5", + "parquet", + "tokio", +] + +[[package]] +name = "datafusion-distributed" +version = "0.1.0" +source = "git+https://github.com/datafusion-contrib/datafusion-distributed#0f2c8be3e148b0bd5c7f17b23f2df8bb1201d5fb" +dependencies = [ + "arrow-flight", + "arrow-ipc", + "arrow-select", + "async-trait", + "bincode", + "bytes", + "chrono", + "crossbeam-queue", + "dashmap 6.1.0", + "datafusion", + "datafusion-proto", + "delegate", + "futures", + "http 1.4.0", + "itertools 0.14.0", + "moka", + "object_store", + "pin-project", + "prost 0.14.3", + "rand 0.9.2", + "sketches-ddsketch 0.3.1", + "tokio", + "tokio-stream", + "tokio-util", + "tonic 0.14.5", + "tonic-prost", + "tower 0.5.3", + "url", + "uuid", +] + +[[package]] +name = "datafusion-doc" +version = "52.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e13e5fe3447baa0584b61ee8644086e007e1ef6e58f4be48bc8a72417854729" + +[[package]] +name = "datafusion-execution" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48a6cc03e34899a54546b229235f7b192634c8e832f78a267f0989b18216c56d" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot 0.12.5", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee3315d87eca7a7df58e52a1fb43b4c4171b545fd30ffc3102945c162a9f6ddb" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 2.13.0", + "itertools 0.14.0", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98c6d83feae0753799f933a2c47dfd15980c6947960cb95ed60f5c1f885548b3" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.13.0", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b82962015cc3db4d7662459c9f7fcda0591b5edacb8af1cf3bc3031f274800" +dependencies = [ + "arrow", + "arrow-buffer", + "base64 0.22.1", + "blake2", + "blake3", + "chrono", + "chrono-tz", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "num-traits", + "rand 
0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e42c227d9e55a6c8041785d4a8a117e4de531033d480aae10984247ac62e27e" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cead3cfed825b0b688700f4338d281cd7857e4907775a5b9554c083edd5f3f95" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ea99612970aebab8cf864d02eb3d296bbab7f4881e1023d282b57fe431b201" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d83dbf3ab8b9af6f209b068825a7adbd3b88bf276f2a1ec14ba09567b97f5674" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot 0.12.5", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = 
"52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "732edabe07496e2fc5a1e57a284d7a36edcea445a2821119770a0dea624b472c" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c6e30e09700799bd52adce8c377ab03dda96e73a623e4803a31ad94fe7ce14" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402f2a8ed70fb99a18f71580a1fe338604222a3d32ddeac6e72c5b34feea2d4d" +dependencies = [ + "datafusion-doc", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "datafusion-optimizer" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99f32edb8ba12f08138f86c09b80fae3d4a320551262fa06b91d8a8cb3065a5b" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "indexmap 2.13.0", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "987c5e29e96186589301b42e25aa7d11bbe319a73eb02ef8d755edc55b5b89fc" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "parking_lot 0.12.5", + "paste", + "petgraph 0.8.3", + "recursive", + "tokio", 
+] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1de89d0afa08b6686697bd8a6bac4ba2cd44c7003356e1bce6114d5a93f94b5c" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "602d1970c0fe87f1c3a36665d131fbfe1c4379d35f8fc5ec43a362229ad2954d" +dependencies = [ + "ahash", + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "parking_lot 0.12.5", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b24d704b6385ebe27c756a12e5ba15684576d3b47aeca79cc9fb09480236dc32" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "itertools 0.14.0", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c21d94141ea5043e98793f170798e9c1887095813b8291c5260599341e383a38" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + 
"itertools 0.14.0", + "log", + "parking_lot 0.12.5", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5e139c4259ccfd12e9f786172ebdf26245c041f7a40ddd0e7651d29da0fd249" +dependencies = [ + "arrow", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-table", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto-common", + "object_store", + "prost 0.14.3", +] + +[[package]] +name = "datafusion-proto-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ea6437aecb636b0ea67c6a09feb68d20aaab163402acfa73173a61d78e15110" +dependencies = [ + "arrow", + "datafusion-common", + "prost 0.14.3", +] + +[[package]] +name = "datafusion-pruning" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a68cce43d18c0dfac95cacd74e70565f7e2fb12b9ed41e2d312f0fa837626b1" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", +] + +[[package]] +name = "datafusion-session" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4e1c40a0b1896aed4a4504145c2eb7fa9b9da13c2d04b40a4767a09f076199" +dependencies = [ + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot 0.12.5", +] + +[[package]] +name = "datafusion-sql" +version = "52.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f1891e5b106d1d73c7fe403bd8a265d19c3977edc17f60808daf26c2fe65ffb" +dependencies = [ + "arrow", + "bigdecimal", + "chrono", + "datafusion-common", + "datafusion-expr", + "indexmap 2.13.0", + "log", + "recursive", + "regex", + "sqlparser", +] + +[[package]] +name = "datafusion-substrait" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2379388ecab67079eeb1185c953fb9c5ed4b283fa3cb81417538378a30545957" +dependencies = [ + "async-recursion", + "async-trait", + "chrono", + "datafusion", + "half", + "itertools 0.14.0", + "object_store", + "pbjson-types", + "prost 0.14.3", + "substrait", + "tokio", + "url", +] + +[[package]] +name = "datasketches" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745" + +[[package]] +name = "dbl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd2735a791158376708f9347fe8faba9667589d82427ef3aed6794a8981de3d9" +dependencies = [ + "generic-array", +] + +[[package]] +name = "deadpool" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0be2b1d1d6ec8d846f05e137292d0b89133caf95ef33695424c09568bdd39b1b" +dependencies = [ + "deadpool-runtime", + "lazy_static", + "num_cpus", + "tokio", +] + +[[package]] +name = "deadpool-runtime" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" + +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "uuid", +] + +[[package]] +name = "delegate" +version = "0.13.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "780eb241654bf097afb00fc5f054a09b687dad862e485fdcf8399bb056565370" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "derivative" version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" @@ -4724,6 +5681,12 @@ dependencies = [ "lexical-util", ] +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.183" @@ -4740,6 +5703,26 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "liblzma" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" +dependencies = [ + "cc", + "libc", + 
"pkg-config", +] + [[package]] name = "libm" version = "0.2.16" @@ -4885,6 +5868,9 @@ name = "lz4_flex" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +dependencies = [ + "twox-hash", +] [[package]] name = "matchers" @@ -5433,6 +6419,30 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http 1.4.0", + "humantime", + "itertools 0.14.0", + "parking_lot 0.12.5", + "percent-encoding", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "octseq" version = "0.5.2" @@ -5919,20 +6929,27 @@ dependencies = [ "arrow-schema", "arrow-select", "base64 0.22.1", + "brotli", "bytes", "chrono", + "flate2", + "futures", "half", "hashbrown 0.16.1", + "lz4_flex 0.12.1", "num-bigint", "num-integer", "num-traits", + "object_store", "parquet-variant", "parquet-variant-compute", "parquet-variant-json", "paste", "seq-macro", + "simdutf8", "snap", "thrift", + "tokio", "twox-hash", "zstd", ] @@ -5993,6 +7010,43 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pbjson" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "898bac3fa00d0ba57a4e8289837e965baa2dee8c3749f3b11d45a64b4223d9c3" +dependencies = [ + "base64 0.22.1", + "serde", +] + +[[package]] +name = "pbjson-build" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" +dependencies = [ + "heck 0.5.0", + 
"itertools 0.14.0", + "prost 0.14.3", + "prost-types 0.14.3", +] + +[[package]] +name = "pbjson-types" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526" +dependencies = [ + "bytes", + "chrono", + "pbjson", + "pbjson-build", + "prost 0.14.3", + "prost-build 0.14.3", + "serde", +] + [[package]] name = "pbkdf2" version = "0.12.2" @@ -6122,6 +7176,7 @@ dependencies = [ "fixedbitset", "hashbrown 0.15.5", "indexmap 2.13.0", + "serde", ] [[package]] @@ -6801,6 +7856,16 @@ version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + [[package]] name = "publicsuffix" version = "2.3.0" @@ -7208,6 +8273,42 @@ dependencies = [ "ulid", ] +[[package]] +name = "quickwit-datafusion" +version = "0.8.0" +dependencies = [ + "anyhow", + "arrow", + "async-trait", + "bytes", + "chrono", + "datafusion", + "datafusion-datasource", + "datafusion-datasource-parquet", + "datafusion-distributed", + "datafusion-physical-plan", + "datafusion-sql", + "datafusion-substrait", + "futures", + "object_store", + "prost 0.14.3", + "quickwit-common", + "quickwit-datafusion", + "quickwit-metastore", + "quickwit-parquet-engine", + "quickwit-proto", + "quickwit-search", + "quickwit-storage", + "serde", + "serde_json", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic 0.14.5", + "tracing", + "url", +] + [[package]] name = "quickwit-datetime" version = "0.8.0" @@ -7400,21 +8501,29 @@ name = "quickwit-integration-tests" version = "0.8.0" dependencies = [ "anyhow", + "arrow", "aws-sdk-sqs", + "bytesize", + "datafusion", + "datafusion-substrait", 
"futures-util", "hyper 1.8.1", "hyper-util", "itertools 0.14.0", + "prost 0.14.3", "quickwit-actors", "quickwit-cli", "quickwit-common", "quickwit-config", + "quickwit-datafusion", "quickwit-indexing", "quickwit-ingest", "quickwit-metastore", "quickwit-opentelemetry", + "quickwit-parquet-engine", "quickwit-proto", "quickwit-rest-client", + "quickwit-search", "quickwit-serve", "quickwit-storage", "rand 0.9.2", @@ -7782,11 +8891,13 @@ name = "quickwit-serve" version = "0.8.0" dependencies = [ "anyhow", + "arrow", "assert-json-diff", "async-trait", "base64 0.22.1", "bytes", "bytesize", + "datafusion-distributed", "elasticsearch-dsl", "flate2", "futures", @@ -7797,6 +8908,7 @@ dependencies = [ "http-body 1.0.1", "http-serde", "humantime", + "hyper 1.8.1", "hyper-util", "itertools 0.14.0", "mime_guess", @@ -7811,6 +8923,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-control-plane", + "quickwit-datafusion", "quickwit-doc-mapper", "quickwit-index-management", "quickwit-indexing", @@ -7820,6 +8933,7 @@ dependencies = [ "quickwit-lambda-client", "quickwit-metastore", "quickwit-opentelemetry", + "quickwit-parquet-engine", "quickwit-proto", "quickwit-query", "quickwit-search", @@ -8202,6 +9316,26 @@ dependencies = [ "zstd-sys", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.117", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -8311,6 +9445,16 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "regress" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" +dependencies = [ + "hashbrown 0.16.1", + "memchr", +] + [[package]] name = "reqsign" version = "0.16.5" @@ -8793,6 +9937,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "schemars" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + [[package]] name = "schemars" version = "0.9.0" @@ -8817,6 +9973,18 @@ dependencies = [ "serde_json", ] +[[package]] +name = "schemars_derive" +version = "0.8.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 2.0.117", +] + [[package]] name = "scoped-tls" version = "1.0.1" @@ -9009,6 +10177,17 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "serde_json" version = "1.0.149" @@ -9084,6 +10263,18 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_tokenstream" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c49585c52c01f13c5c2ebb333f14f6885d76daa768d8a037d28017ec538c69" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "syn 2.0.117", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -9346,6 +10537,15 @@ dependencies = [ 
"serde", ] +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.12" @@ -9477,6 +10677,28 @@ dependencies = [ "der 0.7.10", ] +[[package]] +name = "sqlparser" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "sqlx" version = "0.8.6" @@ -9677,6 +10899,19 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -9727,6 +10962,31 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "substrait" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62fc4b483a129b9772ccb9c3f7945a472112fdd9140da87f8a4e7f1d44e045d0" +dependencies = [ + "heck 0.5.0", + "pbjson", + "pbjson-build", + "pbjson-types", + "prettyplease", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 
0.14.3", + "regress", + "schemars 0.8.22", + "semver", + "serde", + "serde_json", + "serde_yaml", + "syn 2.0.117", + "typify", + "walkdir", +] + [[package]] name = "subtle" version = "2.6.1" @@ -9888,7 +11148,7 @@ dependencies = [ "rustc-hash", "serde", "serde_json", - "sketches-ddsketch", + "sketches-ddsketch 0.3.0", "smallvec", "tantivy-bitpacker", "tantivy-columnar", @@ -10751,6 +12011,53 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "typify" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5bcc6f62eb1fa8aa4098f39b29f93dcb914e17158b76c50360911257aa629" +dependencies = [ + "typify-impl", + "typify-macro", +] + +[[package]] +name = "typify-impl" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" +dependencies = [ + "heck 0.5.0", + "log", + "proc-macro2", + "quote", + "regress", + "schemars 0.8.22", + "semver", + "serde", + "serde_json", + "syn 2.0.117", + "thiserror 2.0.18", + "unicode-ident", +] + +[[package]] +name = "typify-macro" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "911c32f3c8514b048c1b228361bebb5e6d73aeec01696e8cc0e82e2ffef8ab7a" +dependencies = [ + "proc-macro2", + "quote", + "schemars 0.8.22", + "semver", + "serde", + "serde_json", + "serde_tokenstream", + "syn 2.0.117", + "typify-impl", +] + [[package]] name = "tz-rs" version = "0.6.14" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 9242390d898..e04334f9e03 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -10,6 +10,7 @@ members = [ "quickwit-common", "quickwit-config", "quickwit-control-plane", + "quickwit-datafusion", "quickwit-datetime", "quickwit-directories", "quickwit-doc-mapper", @@ -50,6 +51,7 @@ default-members = [ "quickwit-common", "quickwit-config", "quickwit-control-plane", + "quickwit-datafusion", "quickwit-datetime", 
"quickwit-directories", "quickwit-doc-mapper", @@ -349,6 +351,7 @@ quickwit-codegen-example = { path = "quickwit-codegen/example" } quickwit-common = { path = "quickwit-common" } quickwit-config = { path = "quickwit-config" } quickwit-control-plane = { path = "quickwit-control-plane" } +quickwit-datafusion = { path = "quickwit-datafusion" } quickwit-datetime = { path = "quickwit-datetime" } quickwit-directories = { path = "quickwit-directories" } quickwit-doc-mapper = { path = "quickwit-doc-mapper" } diff --git a/quickwit/quickwit-datafusion/Cargo.toml b/quickwit/quickwit-datafusion/Cargo.toml new file mode 100644 index 00000000000..d5ae5402cd6 --- /dev/null +++ b/quickwit/quickwit-datafusion/Cargo.toml @@ -0,0 +1,52 @@ +[package] +name = "quickwit-datafusion" +description = "DataFusion-based query execution for Quickwit parquet metrics" + +version.workspace = true +edition.workspace = true +homepage.workspace = true +documentation.workspace = true +repository.workspace = true +authors.workspace = true +license.workspace = true + +[dependencies] +anyhow = { workspace = true } +async-trait = { workspace = true } +bytes = { workspace = true } +chrono = { workspace = true } +futures = { workspace = true } +prost = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +tokio-stream = { workspace = true } +tonic = { workspace = true } +tracing = { workspace = true } +url = "2" + +quickwit-common = { workspace = true } +quickwit-metastore = { workspace = true } +quickwit-parquet-engine = { workspace = true } +quickwit-proto = { workspace = true } +quickwit-search = { workspace = true } +quickwit-storage = { workspace = true } + +arrow = { workspace = true } +datafusion = "52" +datafusion-substrait = "52" +datafusion-datasource = "52" +datafusion-sql = "52" +datafusion-physical-plan = "52" +datafusion-datasource-parquet = "52" +datafusion-distributed = { git = 
"https://github.com/datafusion-contrib/datafusion-distributed" } +object_store = "0.12" + +[dev-dependencies] +quickwit-common = { workspace = true, features = ["testsuite"] } +quickwit-datafusion = { path = ".", features = ["testsuite"] } +tokio = { workspace = true, features = ["test-util", "macros"] } + +[features] +testsuite = [] diff --git a/quickwit/quickwit-datafusion/src/catalog.rs b/quickwit/quickwit-datafusion/src/catalog.rs new file mode 100644 index 00000000000..cf138f21e97 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/catalog.rs @@ -0,0 +1,137 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Generic DataFusion catalog / schema provider. +//! +//! `QuickwitSchemaProvider` routes table resolution to whichever registered +//! `QuickwitDataSource` claims to own the index. It knows nothing about +//! metrics, logs, or traces — those concerns live in each data source. + +use std::any::Any; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::{MemorySchemaProvider, SchemaProvider}; +use datafusion::datasource::TableProvider; +use datafusion::error::Result as DFResult; + +use crate::data_source::QuickwitDataSource; + +/// DataFusion `SchemaProvider` that delegates table resolution to the +/// registered `QuickwitDataSource` implementations. +/// +/// Resolution order for `table(name)`: +/// 1. 
Explicitly registered tables (from `CREATE EXTERNAL TABLE` DDL) — backed +/// by DataFusion's own [`MemorySchemaProvider`] which uses a lock-free +/// `DashMap` internally, the idiomatic choice for this role. +/// 2. Each source's `create_default_table_provider`, first non-None wins. +/// +/// `register_table` / `deregister_table` delegate directly to the inner +/// `MemorySchemaProvider`, so `CREATE OR REPLACE EXTERNAL TABLE` works +/// correctly without any custom locking. +pub struct QuickwitSchemaProvider { + sources: Vec>, + /// DDL-registered tables (CREATE OR REPLACE EXTERNAL TABLE). + /// Uses DataFusion's MemorySchemaProvider which is backed by DashMap — + /// lock-free, concurrent-read-safe, and the standard DataFusion idiom. + ddl_tables: MemorySchemaProvider, +} + +impl QuickwitSchemaProvider { + pub fn new(sources: Vec>) -> Self { + Self { + sources, + ddl_tables: MemorySchemaProvider::new(), + } + } +} + +impl std::fmt::Debug for QuickwitSchemaProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QuickwitSchemaProvider") + .field("num_sources", &self.sources.len()) + .field("num_ddl_tables", &self.ddl_tables.table_names().len()) + .finish() + } +} + +#[async_trait] +impl SchemaProvider for QuickwitSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + /// Lists all index names across all sources. + /// + /// `table_names()` is a sync DataFusion API, but enumerating sources is + /// async. This uses `block_in_place`, which requires a multi-threaded + /// Tokio runtime. Only called for `SHOW TABLES` / `information_schema`; + /// not on the query hot path. 
+ fn table_names(&self) -> Vec { + let sources = &self.sources; + tokio::task::block_in_place(|| { + tokio::runtime::Handle::current().block_on(async { + let mut names = Vec::new(); + for source in sources { + if let Ok(mut source_names) = source.list_index_names().await { + names.append(&mut source_names); + } + } + // Deduplicate in case multiple sources claim the same name. + names.dedup(); + names + }) + }) + } + + async fn table(&self, name: &str) -> DFResult>> { + // Resolution order: + // 1. DDL-registered tables (CREATE OR REPLACE EXTERNAL TABLE) + // 2. Each source's create_default_table_provider — first non-None wins. + // We do not pre-validate via table_names(); sources return None for + // unknown names and DataFusion emits "table not found". Avoids N+1. + if let Some(provider) = self.ddl_tables.table(name).await? { + return Ok(Some(provider)); + } + + for source in &self.sources { + if let Some(provider) = source.create_default_table_provider(name).await? { + return Ok(Some(provider)); + } + } + + Ok(None) + } + + /// Returns `true` if the table is present in the DDL cache. + /// + /// DataFusion's contract: `false` does not prevent `table()` from + /// returning `Some`; it is a hint only. Checking only DDL tables keeps + /// this method allocation-free and off the async hot path. + fn table_exist(&self, name: &str) -> bool { + self.ddl_tables.table_exist(name) + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> DFResult>> { + self.ddl_tables.register_table(name, table) + } + + fn deregister_table(&self, name: &str) -> DFResult>> { + self.ddl_tables.deregister_table(name) + } +} diff --git a/quickwit/quickwit-datafusion/src/data_source.rs b/quickwit/quickwit-datafusion/src/data_source.rs new file mode 100644 index 00000000000..051563794c5 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/data_source.rs @@ -0,0 +1,350 @@ +// Copyright 2021-Present Datadog, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! `QuickwitDataSource` — the extension point for plugging data sources +//! (metrics, logs, traces, …) into the DataFusion session layer. +//! +//! ## Design: contribution-return pattern +//! +//! Each data source **returns its additive contributions** via [`contributions()`]. +//! The [`DataFusionSessionBuilder`][crate::session::DataFusionSessionBuilder] accumulates +//! contributions from all registered sources before building any session. This mirrors +//! the pattern in `dd-datafusion/runtime/src/connector.rs` where `Connector::init()` +//! returns a `DDDataFusionQueryPlanner` that the runtime merges across all connectors. +//! +//! Advantages over a builder-mutation chain (`configure_session(builder) -> builder`): +//! - **No silent overwrite**: two sources registering different codecs both win. +//! - **Inspectable**: the `DataSourceContributions` struct is a plain value — easy +//! to test and introspect without constructing a full `SessionStateBuilder`. +//! - **Conflict detection**: the session builder can validate (e.g., no two sources +//! register the same UDF name) before building the session. +//! +//! ## Lifecycle +//! +//! For each session (coordinator or worker): +//! +//! 1. **`contributions()`** — called once. Returns optimizer rules, codecs, and UDFs +//! to register. Applied before `SessionStateBuilder::build()`. +//! +//! 2. `SessionStateBuilder::build()` — called by the framework. 
+//! +//! 3. **`register_for_worker(&SessionState)`** — called after `build()` for +//! runtime state that requires the session to already exist (rare; prefer +//! `contributions()` for most things). +//! +//! ## Protocol compatibility note +//! +//! The worker communication protocol changed in datafusion-distributed PR #375 +//! (commit 556a5de) from Arrow Flight to a custom `WorkerService` gRPC protocol. +//! Any data source that needs distributed execution must be built against the same +//! protocol version as the coordinator. The logs data source (PR #6160) was written +//! against the pre-#375 Arrow Flight API and will require a protocol update before +//! it can share a `datafusion-distributed` pin with the metrics source. + +use std::fmt::Debug; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::TableProviderFactory; +use datafusion::datasource::TableProvider; +use datafusion::error::Result as DFResult; +use datafusion::execution::SessionStateBuilder; +use datafusion::logical_expr::ScalarUDF; +use datafusion::physical_optimizer::PhysicalOptimizerRule; + +/// Additive contributions from a [`QuickwitDataSource`] to the DataFusion session. +/// +/// Returned by [`QuickwitDataSource::contributions()`] and aggregated across all +/// registered sources before any session is built. +/// +/// Analogous to `DDDataFusionQueryPlanner` in `dd-datafusion`, which accumulates +/// extension planners, rules, and UDFs from every registered `Connector`. +/// +/// ## Codec registration +/// +/// Physical extension codecs (e.g. `TantivyCodec` for the logs data source) are +/// applied via [`DataSourceContributions::apply_to_builder`] using the +/// `with_distributed_user_codec` builder extension from `datafusion_distributed`. 
+/// If your source needs a codec, call it inside the `codec_applier` callback: +/// +/// ```ignore +/// fn contributions(&self) -> DataSourceContributions { +/// DataSourceContributions::default() +/// .with_codec_applier(|builder| { +/// builder.with_distributed_user_codec(TantivyCodec) +/// }) +/// } +/// ``` +pub struct DataSourceContributions { + /// Physical optimizer rules contributed by this source. + /// + /// Logs adds tantivy-specific pushdown rules here. + /// Metrics adds nothing — DataFusion's built-in parquet pushdown is sufficient. + physical_optimizer_rules: Vec>, + + /// Scalar UDFs contributed by this source. + /// + /// Logs adds `full_text_udf()` here. + /// Metrics adds nothing. + udfs: Vec>, + + /// Callbacks that apply codec / builder extensions that cannot be expressed + /// as plain values (e.g. `with_distributed_user_codec(TantivyCodec)`). + /// + /// Applied to the `SessionStateBuilder` after rules and UDFs are merged. + /// Using callbacks avoids a direct dependency on `datafusion-proto` types. + /// + /// These are `FnOnce` because `SessionStateBuilder` is consumed and returned; + /// each applier can only run once. + codec_appliers: Vec SessionStateBuilder + Send + Sync>>, +} + +impl Default for DataSourceContributions { + fn default() -> Self { + Self { + physical_optimizer_rules: Vec::new(), + udfs: Vec::new(), + codec_appliers: Vec::new(), + } + } +} + +impl DataSourceContributions { + /// Add a physical optimizer rule. + pub fn with_physical_optimizer_rule( + mut self, + rule: Arc, + ) -> Self { + self.physical_optimizer_rules.push(rule); + self + } + + /// Add a scalar UDF. + pub fn with_udf(mut self, udf: Arc) -> Self { + self.udfs.push(udf); + self + } + + /// Add multiple scalar UDFs at once. + pub fn with_udf_batch(mut self, udfs: impl IntoIterator>) -> Self { + self.udfs.extend(udfs); + self + } + + /// Add a codec / builder-extension callback. + /// + /// Logs uses this to call `.with_distributed_user_codec(TantivyCodec)`. 
+ pub fn with_codec_applier( + mut self, + f: impl FnOnce(SessionStateBuilder) -> SessionStateBuilder + Send + Sync + 'static, + ) -> Self { + self.codec_appliers.push(Box::new(f)); + self + } + + pub(crate) fn udf_names(&self) -> Vec { + self.udfs.iter().map(|udf| udf.name().to_string()).collect() + } + + /// Apply all contributions to a `SessionStateBuilder`. + /// + /// Called by `DataFusionSessionBuilder` and `QuickwitWorkerSessionBuilder` + /// after merging contributions from all sources. + /// + /// Injects in order: + /// 1. Physical optimizer rules + /// 2. Scalar UDFs (into the builder's scalar function map) + /// 3. Codec appliers (consumed in order) + pub fn apply_to_builder(self, mut builder: SessionStateBuilder) -> SessionStateBuilder { + for rule in self.physical_optimizer_rules { + builder = builder.with_physical_optimizer_rule(rule); + } + + if !self.udfs.is_empty() { + builder + .scalar_functions() + .get_or_insert_default() + .extend(self.udfs); + } + + for applier in self.codec_appliers { + builder = applier(builder); + } + + builder + } + + /// Merge another set of contributions into this one (additive, no dedup). + /// + /// Used by `DataFusionSessionBuilder` to accumulate across all sources. + pub fn merge(&mut self, other: DataSourceContributions) { + self.physical_optimizer_rules + .extend(other.physical_optimizer_rules); + self.udfs.extend(other.udfs); + self.codec_appliers.extend(other.codec_appliers); + } +} + +/// Extension point for plugging a data source into `DataFusionSessionBuilder`. +/// +/// Implement this trait for each data type (metrics, logs, traces, …) that +/// should be queryable via DataFusion SQL. +#[async_trait] +pub trait QuickwitDataSource: Send + Sync + Debug { + // ── Startup hook ───────────────────────────────────────────────── + + /// Called once when the source is registered via + /// `DataFusionSessionBuilder::with_source()`. 
+ /// + /// Receives the shared `RuntimeEnv` that all sessions built by this builder + /// will use. Sources that know their object-store URLs at construction time + /// should register them here — analogous to `BlobStoreConnector::init` in + /// `dd-datafusion`, which calls `env.register_object_store(url, store)` once + /// at service startup so that every query can reach the store without any + /// per-session registration. + /// + /// Sources whose URLs are only discoverable at query time (e.g. metrics, + /// where indexes are listed from the metastore) should leave this as a no-op + /// and perform lazy registration in `MetricsTableProvider::scan()`, which + /// writes into the same shared `RuntimeEnv`. + /// + /// Default: no-op. + fn init(&self, _env: &datafusion::execution::runtime_env::RuntimeEnv) {} + + // ── Additive session contributions ────────────────────────────── + + /// Return this source's additive contributions to every session. + /// + /// Called once per `build_session()` / worker `build_session_state()` call. + /// Contributions from all registered sources are merged and applied to the + /// `SessionStateBuilder` before `build()` is called. + /// + /// Default: no contributions (metrics, for example, needs none). + fn contributions(&self) -> DataSourceContributions { + DataSourceContributions::default() + } + + // ── DDL support (optional) ─────────────────────────────────────── + + /// Return the DDL file-type token and its `TableProviderFactory` together, + /// or `None` if this source does not support DDL. + /// + /// When `Some((token, factory))` is returned: + /// - `token` is the string used in `STORED AS ` DDL (e.g. `"metrics"`). + /// - `factory` handles `CREATE [OR REPLACE] EXTERNAL TABLE … STORED AS `. + /// + /// The session registers the factory under both the literal token and its + /// uppercase equivalent because DataFusion uppercases the `STORED AS` token. 
+ /// + /// Returning both pieces from a single method prevents the mismatch bug where + /// `file_type()` and `create_table_provider_factory()` could disagree or + /// create two different factory instances. + /// + /// Return `None` (the default) if this source resolves tables purely through + /// the schema provider — for example, the logs data source looks up the index + /// schema from the metastore at query time and needs no DDL. + fn ddl_registration(&self) -> Option<(String, Arc)> { + None + } + + // ── Substrait consumer hook ────────────────────────────────────── + + /// Try to handle a Substrait `ReadRel` for this source. + /// + /// Called by `QuickwitSubstraitConsumer::consume_read` for each registered + /// source before falling back to the standard catalog-lookup path. + /// + /// ## OSS path — standard Substrait (`NamedTable`) + /// + /// Producers that target Quickwit send a standard `ReadRel` with + /// `read_type = NamedTable { names: [""] }`. The `base_schema` + /// field of the `ReadRel` carries the Arrow schema the producer wants back + /// (already converted from Substrait types to Arrow by the caller). + /// + /// `MetricsDataSource` implements this path: it resolves the index from the + /// metastore and returns a `MetricsTableProvider` using the declared schema. + /// + /// ## Extension path — custom protos (downstream callers) + /// + /// Producers that carry DD-internal proto payloads (e.g. + /// `ExtensionTable`) implement a custom `QuickwitDataSource` in + /// A downstream caller that decodes its own proto and returns the appropriate provider. + /// No custom protos are needed in OSS. + /// + /// ## Return value + /// + /// - `Ok(Some((table_name, provider)))` — this source claims the rel. + /// `table_name` is the effective table identifier used for the scan. + /// The caller converts any `ExtensionTable` rel to a `NamedTable` rel + /// with this name so that `from_read_rel` can apply filters/projections. 
+ /// - `Ok(None)` — this source does not claim the rel; try the next source. + /// + /// Default: `Ok(None)` — does not participate in Substrait consumption. + async fn try_consume_read_rel( + &self, + _rel: &datafusion_substrait::substrait::proto::ReadRel, + _schema_hint: Option, + ) -> DFResult)>> { + Ok(None) + } + + // ── Default table resolution (schema-provider path) ───────────── + + /// Create a default `TableProvider` for `index_name` without DDL. + /// + /// Called by `QuickwitSchemaProvider::table(name)` when no DDL-registered + /// table matches. Returns `Ok(None)` if this source does not own the + /// index — the schema provider will try the next registered source. + async fn create_default_table_provider( + &self, + index_name: &str, + ) -> DFResult>>; + + // ── Worker runtime setup (post-build, optional) ────────────────── + + /// Register runtime state the worker needs after the session is built. + /// + /// Called after `SessionStateBuilder::build()`. Use this for resources + /// that can only be registered on an existing `SessionState` (e.g., + /// object stores in the `RuntimeEnv` that depend on lazily-discovered + /// index URIs). + /// + /// For resources that are known at construction time, prefer registering + /// them in `contributions()` — or directly on the `RuntimeEnv` passed to + /// the session builder (analogous to `BlobStoreConnector::init(env)`). + /// + /// Default: no-op. + async fn register_for_worker(&self, _state: &datafusion::execution::SessionState) -> DFResult<()> { + Ok(()) + } + + // ── Index enumeration ──────────────────────────────────────────── + + /// Return all index names exposed by this source. + /// + /// Used by `QuickwitSchemaProvider::table_names()` for `SHOW TABLES` / + /// `information_schema`. Sources that cannot enumerate cheaply may + /// return an empty `Vec` (the logs data source does this — it would need + /// to list potentially thousands of indexes). 
+ /// + /// # Threading note + /// + /// This method may be called from within a `tokio::task::block_in_place` + /// context on the DataFusion query thread. Implementations that call + /// blocking I/O must ensure they are not already inside a `block_in_place` + /// context (tokio panics on nested `block_in_place`). If in doubt, use + /// `tokio::task::spawn_blocking` or check + /// `tokio::runtime::Handle::try_current()` before blocking. + async fn list_index_names(&self) -> DFResult>; +} diff --git a/quickwit/quickwit-datafusion/src/lib.rs b/quickwit/quickwit-datafusion/src/lib.rs new file mode 100644 index 00000000000..37a4380573a --- /dev/null +++ b/quickwit/quickwit-datafusion/src/lib.rs @@ -0,0 +1,59 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! DataFusion-based query execution for Quickwit. +//! +//! ## Architecture +//! +//! The crate is split into two layers: +//! +//! **Generic execution layer** (no data-source-specific code): +//! - [`data_source`] — `QuickwitDataSource` trait (the extension point) +//! - [`session`] — `DataFusionSessionBuilder`: builds sessions from a list of sources +//! - [`catalog`] — `QuickwitSchemaProvider`: routes `table(name)` to the right source +//! - [`worker`] — `QuickwitWorkerSessionBuilder` + `build_quickwit_worker()` +//! - [`resolver`] — `QuickwitWorkerResolver`: default `SearcherPool`-backed worker URL resolver +//! 
- [`task_estimator`] — `QuickwitTaskEstimator`: split-count based task sizing +//! - [`storage_bridge`] — `QuickwitObjectStore`: `quickwit_storage::Storage` → `object_store::ObjectStore` adapter +//! - [`substrait`] — `QuickwitSubstraitConsumer`: routes Substrait `ReadRel` to data sources +//! +//! **Data source implementations** (`sources/`): +//! - [`sources::metrics`] — `MetricsDataSource` for OSS parquet metrics +//! +//! ## Worker URL resolution +//! +//! The default worker resolver (`QuickwitWorkerResolver`) maps `SearcherPool` +//! socket addresses to `http[s]://` URLs. Downstream callers or other deployments with +//! different service discovery (e.g., Consul, DD-internal DNS) can supply their +//! own resolver via `DataFusionSessionBuilder::with_worker_resolver()`. + +pub(crate) mod catalog; +pub mod data_source; +pub(crate) mod resolver; +pub mod service; +pub mod session; +pub mod sources; +pub(crate) mod storage_bridge; +pub(crate) mod substrait; +pub(crate) mod task_estimator; +pub(crate) mod worker; + +// Re-export the top-level types for use in quickwit-serve and downstream callers. +pub use resolver::QuickwitWorkerResolver; +pub use service::DataFusionService; +pub use session::DataFusionSessionBuilder; +pub use worker::build_quickwit_worker; + +#[cfg(any(test, feature = "testsuite"))] +pub mod test_utils; diff --git a/quickwit/quickwit-datafusion/src/resolver.rs b/quickwit/quickwit-datafusion/src/resolver.rs new file mode 100644 index 00000000000..dd96dc849f3 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/resolver.rs @@ -0,0 +1,67 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Generic worker resolver — maps `SearcherPool` → Flight URLs. +//! +//! No data-source-specific code here. + +use std::net::SocketAddr; + +use datafusion::error::DataFusionError; +use datafusion_distributed::WorkerResolver; +use quickwit_search::SearcherPool; +use url::Url; + +/// Resolves worker Flight URLs from the cluster's searcher pool. +/// +/// Every searcher node runs both the Quickwit gRPC `SearchService` and the +/// Arrow Flight service on the same port. +#[derive(Clone)] +pub struct QuickwitWorkerResolver { + searcher_pool: SearcherPool, + use_tls: bool, +} + +impl QuickwitWorkerResolver { + pub fn new(searcher_pool: SearcherPool) -> Self { + Self { + searcher_pool, + use_tls: false, + } + } + + pub fn with_tls(mut self, use_tls: bool) -> Self { + self.use_tls = use_tls; + self + } +} + +impl WorkerResolver for QuickwitWorkerResolver { + fn get_urls(&self) -> Result, DataFusionError> { + let addrs: Vec = self.searcher_pool.keys(); + if addrs.is_empty() { + return Err(DataFusionError::Execution( + "no searcher nodes available in the cluster".to_string(), + )); + } + let scheme = if self.use_tls { "https" } else { "http" }; + addrs + .into_iter() + .map(|addr| { + Url::parse(&format!("{scheme}://{addr}")) + .map_err(|e| DataFusionError::Internal(format!("bad worker url: {e}"))) + }) + .collect() + } +} diff --git a/quickwit/quickwit-datafusion/src/service.rs b/quickwit/quickwit-datafusion/src/service.rs new file mode 100644 index 00000000000..727aec6d0ee --- /dev/null +++ b/quickwit/quickwit-datafusion/src/service.rs @@ -0,0 +1,161 
@@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Pure-Rust DataFusion query execution service. +//! +//! [`DataFusionService`] is the core query execution entry point: it holds an +//! `Arc` and exposes `execute_substrait` and +//! `execute_sql` methods that return streaming `RecordBatch` iterators. +//! +//! ## No tonic / gRPC coupling +//! +//! This struct has zero gRPC dependencies. The OSS gRPC handler in +//! `quickwit-serve/src/datafusion_api/grpc_handler.rs` wraps it and encodes +//! each batch as Arrow IPC. A downstream caller can do the same from its own +//! handler, calling `execute_substrait(&[u8])` and streaming the resulting +//! batches in its own proto response format. +//! +//! ## Usage +//! +//! ```ignore +//! use std::sync::Arc; +//! use quickwit_datafusion::{DataFusionService, DataFusionSessionBuilder}; +//! +//! let builder = Arc::new(DataFusionSessionBuilder::new().with_source(my_source)); +//! let service = DataFusionService::new(Arc::clone(&builder)); +//! +//! let mut stream = service.execute_substrait(&plan_bytes).await?; +//! while let Some(batch) = stream.next().await { +//! // handle batch +//! } +//! ``` + +use std::sync::Arc; + +use datafusion::error::Result as DFResult; +use datafusion::execution::SendableRecordBatchStream; + +use crate::session::DataFusionSessionBuilder; + +/// Pure-Rust query execution service backed by a `DataFusionSessionBuilder`. 
+/// +/// Owns an `Arc` and dispatches queries to it. +/// No tonic or gRPC types appear in this struct's public API. +#[derive(Clone)] +pub struct DataFusionService { + builder: Arc, +} + +impl std::fmt::Debug for DataFusionService { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DataFusionService") + .field("builder", &self.builder) + .finish() + } +} + +impl DataFusionService { + /// Create a new service wrapping the given session builder. + pub fn new(builder: Arc) -> Self { + Self { builder } + } + + /// Execute a Substrait plan encoded as protobuf bytes. + /// + /// Builds a fresh session via the underlying `DataFusionSessionBuilder`, + /// decodes the plan, and returns a streaming `RecordBatch` iterator. + /// The caller decides whether to collect, send via gRPC, or pipe to Arrow + /// Flight — no materialization happens inside this method. + pub async fn execute_substrait( + &self, + plan_bytes: &[u8], + ) -> DFResult { + use datafusion_substrait::substrait::proto::Plan; + use prost::Message; + + let plan = Plan::decode(plan_bytes) + .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?; + + self.execute_substrait_plan(&plan).await + } + + /// Execute a Substrait plan from its proto3 JSON representation. + /// + /// Accepts the JSON format produced by DataFusion's `to_substrait_plan` + /// + `serde_json::to_string`, or the `rollup_substrait.json` format used + /// in integration tests and dev tooling. + /// + /// This is the dev/tooling path — grpcurl and Python scripts can pass the + /// plan as a JSON string without pre-encoding to binary protobuf. 
+ pub async fn execute_substrait_json( + &self, + plan_json: &str, + ) -> DFResult { + use datafusion_substrait::substrait::proto::Plan; + + let plan: Plan = serde_json::from_str(plan_json).map_err(|e| { + datafusion::error::DataFusionError::Plan(format!( + "invalid Substrait plan JSON: {e}" + )) + })?; + + self.execute_substrait_plan(&plan).await + } + + async fn execute_substrait_plan( + &self, + plan: &datafusion_substrait::substrait::proto::Plan, + ) -> DFResult { + let ctx = self.builder.build_session()?; + crate::substrait::execute_substrait_plan_streaming(plan, &ctx, self.builder.sources()).await + } + + /// Execute one or more semicolon-separated SQL statements. + /// + /// DDL statements (e.g. `CREATE EXTERNAL TABLE`) are executed for side + /// effects. The last statement produces the result stream. + /// + /// Returns an error if `sql` is empty after splitting, or if any statement + /// fails to parse or execute. + pub async fn execute_sql(&self, sql: &str) -> DFResult { + let ctx = self.builder.build_session()?; + + // Split on `;` and discard empty fragments (trailing `;` etc.). + let statements: Vec<&str> = sql + .split(';') + .map(str::trim) + .filter(|s| !s.is_empty()) + .collect(); + + if statements.is_empty() { + return Err(datafusion::error::DataFusionError::Plan( + "no SQL statements provided".to_string(), + )); + } + + // Execute all but the last statement as DDL / side-effect statements. + let (last, prefixes) = statements + .split_last() + .expect("non-empty after the check above"); + + for stmt in prefixes { + ctx.sql(stmt).await?.collect().await?; + } + + // Execute the final statement and return the stream. 
+ let df = ctx.sql(last).await?; + let stream = df.execute_stream().await?; + Ok(stream) + } +} diff --git a/quickwit/quickwit-datafusion/src/session.rs b/quickwit/quickwit-datafusion/src/session.rs new file mode 100644 index 00000000000..cedbc17a0aa --- /dev/null +++ b/quickwit/quickwit-datafusion/src/session.rs @@ -0,0 +1,294 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Generic DataFusion session builder. +//! +//! ## Runtime environment lifecycle +//! +//! `DataFusionSessionBuilder` creates a single `Arc` at construction +//! time and shares it across every session it builds. This mirrors the pattern in +//! `dd-datafusion`'s `DDDataFusionRuntime`, where a shared `RuntimeEnv` lets +//! object stores registered at service-startup time be visible to all queries +//! without any per-query re-registration. +//! +//! ## Memory limits +//! +//! By default the shared `RuntimeEnv` uses DataFusion's `UnboundedMemoryPool`, +//! which imposes no cap on query memory. For production deployments use +//! `with_memory_limit(bytes)` to install a `GreedyMemoryPool`. +//! +//! ## Worker URL resolution +//! +//! The default path uses `with_searcher_pool(pool)` which wraps the pool in a +//! `QuickwitWorkerResolver`. For deployments that don't use `SearcherPool` for +//! service discovery (e.g., a downstream caller using custom service discovery, Consul, or a Chitchat +//! 
variant), use `with_worker_resolver(resolver)` to supply any type that +//! implements `datafusion_distributed::WorkerResolver`. +//! +//! ## Result materialization +//! +//! `execute_substrait` collects all result batches into memory before returning. +//! For large rollup queries this is unsuitable for production use. A streaming +//! variant is deferred; a downstream caller can wrap this via its own gRPC handler. +//! Use `with_memory_limit()` to bound memory usage until streaming is in place. + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::catalog::{CatalogProvider, MemoryCatalogProvider}; +use datafusion::error::Result as DFResult; +use datafusion::execution::memory_pool::GreedyMemoryPool; +use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder}; +use datafusion::execution::SessionStateBuilder; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_distributed::{ + DistributedExt, DistributedPhysicalOptimizerRule, WorkerResolver, +}; +use quickwit_search::SearcherPool; + +use crate::catalog::QuickwitSchemaProvider; +use crate::data_source::QuickwitDataSource; +use crate::resolver::QuickwitWorkerResolver; +use crate::task_estimator::QuickwitTaskEstimator; + +/// Builds `SessionContext`s for DataFusion queries over Quickwit data. +/// +/// Holds a single `Arc` shared across all sessions it creates. +pub struct DataFusionSessionBuilder { + sources: Vec>, + /// Pluggable worker URL resolver. `None` = single-node execution. + /// Set via `with_searcher_pool` (default impl) or `with_worker_resolver` + /// (custom impl for other service discovery). + worker_resolver: Option>, + /// Shared runtime environment — one instance for the lifetime of this builder. 
+ runtime: Arc, +} + +impl std::fmt::Debug for DataFusionSessionBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DataFusionSessionBuilder") + .field("num_sources", &self.sources.len()) + .field("distributed", &self.worker_resolver.is_some()) + .finish() + } +} + +impl Default for DataFusionSessionBuilder { + fn default() -> Self { + Self::new() + } +} + +impl DataFusionSessionBuilder { + pub fn new() -> Self { + Self { + sources: Vec::new(), + worker_resolver: None, + runtime: Arc::new(RuntimeEnv::default()), + } + } + + /// Set a hard memory limit (bytes) for all queries built by this session builder. + /// + /// Installs a `GreedyMemoryPool` on the shared `RuntimeEnv`. DataFusion will + /// return an error from any query that attempts to allocate beyond this limit, + /// preventing unbounded memory growth on large rollup queries. + /// + /// Must be called before `with_source()` — sources call `init(&self.runtime)` + /// on registration and expect the pool to be in place. + pub fn with_memory_limit(mut self, bytes: usize) -> DFResult { + let runtime = RuntimeEnvBuilder::new() + .with_memory_pool(Arc::new(GreedyMemoryPool::new(bytes))) + .build_arc()?; + self.runtime = runtime; + Ok(self) + } + + /// Register a data source and call its `init` hook immediately. + /// + /// `init` receives the shared `RuntimeEnv` so sources that know their + /// object-store URLs at construction time can register them once here. + pub fn with_source(mut self, source: Arc) -> Self { + source.init(&self.runtime); + self.sources.push(source); + self + } + + /// Enable distributed execution using the default `SearcherPool`-backed + /// resolver. + /// + /// Worker URLs are derived from the pool's socket-address keys using plain + /// `http://` (or `https://` if you have separately configured TLS on the + /// `QuickwitWorkerResolver`). For non-`SearcherPool` deployments, use + /// `with_worker_resolver` instead. 
+ pub fn with_searcher_pool(self, pool: SearcherPool) -> Self { + self.with_worker_resolver(QuickwitWorkerResolver::new(pool)) + } + + /// Enable distributed execution with a custom worker URL resolver. + /// + /// Use this when `SearcherPool` is not the right abstraction — for example: + /// - A downstream caller using custom service discovery or topology. + /// - Tests use a fixed list of mock worker addresses. + /// - TLS deployments need `QuickwitWorkerResolver::new(pool).with_tls(true)`. + /// + /// Any type implementing `datafusion_distributed::WorkerResolver` is accepted. + pub fn with_worker_resolver( + mut self, + resolver: impl WorkerResolver + Send + Sync + 'static, + ) -> Self { + self.worker_resolver = Some(Arc::new(resolver)); + self + } + + /// Returns the shared `RuntimeEnv`. + /// + /// Pass this to `build_quickwit_worker` so workers share the same + /// object-store registry as the coordinator. + pub fn runtime(&self) -> &Arc { + &self.runtime + } + + /// Returns a slice of all registered data sources. + pub fn sources(&self) -> &[Arc] { + &self.sources + } + + /// Validate that no two sources register conflicting UDF or UDAF names. + /// + /// This is a development-time sanity check — call it once at service startup + /// after all sources are registered, not on every query. It is not called + /// automatically by `build_session()`. + /// + /// ```ignore + /// let builder = DataFusionSessionBuilder::new() + /// .with_source(source_a) + /// .with_source(source_b); + /// builder.check_invariants()?; // fail fast at startup + /// // ... 
serve queries + /// ``` + pub fn check_invariants(&self) -> DFResult<()> { + let mut seen_udfs: HashSet = HashSet::new(); + for source in &self.sources { + let contribs = source.contributions(); + for name in contribs.udf_names() { + if !seen_udfs.insert(name.clone()) { + return Err(datafusion::error::DataFusionError::Configuration(format!( + "two data sources both register a scalar UDF named '{name}'" + ))); + } + } + } + Ok(()) + } + + /// Execute a Substrait plan (protobuf bytes) and return the results. + /// + /// Builds a fresh session, converts the plan via `QuickwitSubstraitConsumer`, + /// and collects all results into memory. See the module-level doc on + /// materialization limits. + pub async fn execute_substrait( + &self, + plan_bytes: &[u8], + ) -> DFResult> { + use datafusion_substrait::substrait::proto::Plan; + use prost::Message; + + let plan = Plan::decode(plan_bytes) + .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?; + + let ctx = self.build_session()?; + crate::substrait::execute_substrait_plan(&plan, &ctx, &self.sources).await + } + + /// Build a `SessionContext` backed by the shared `RuntimeEnv`. + /// + /// Does NOT call `check_invariants()` — callers should invoke that once at + /// startup, not on every query. + pub fn build_session(&self) -> DFResult { + let mut config = SessionConfig::new().with_target_partitions(1); + config.options_mut().catalog.default_catalog = "quickwit".to_string(); + config.options_mut().catalog.default_schema = "public".to_string(); + config.options_mut().catalog.information_schema = true; + // We register our own catalog; skip the default "datafusion" one. + config.options_mut().catalog.create_default_catalog_and_schema = false; + + let mut builder = SessionStateBuilder::new() + .with_config(config) + .with_default_features() + // All sessions share the same RuntimeEnv so object stores registered + // at startup (via init) or lazily (via scan) are globally visible. 
+ .with_runtime_env(Arc::clone(&self.runtime)); + + if let Some(resolver) = &self.worker_resolver { + // Clone the Arc so ownership passes into the distributed extension. + // `Arc` implements `WorkerResolver` via deref, + // so the forwarding wrapper is not needed. + builder = builder + .with_distributed_worker_resolver(ArcWorkerResolver(Arc::clone(resolver))) + .with_distributed_task_estimator(QuickwitTaskEstimator) + .with_physical_optimizer_rule(Arc::new(DistributedPhysicalOptimizerRule)); + } + + // Accumulate contributions from all sources and apply them at once. + let mut combined = crate::data_source::DataSourceContributions::default(); + for source in &self.sources { + combined.merge(source.contributions()); + } + builder = combined.apply_to_builder(builder); + + let mut state = builder.build(); + + for source in &self.sources { + let Some((ft, factory)) = source.ddl_registration() else { + continue; + }; + state + .table_factories_mut() + .insert(ft.clone(), Arc::clone(&factory)); + state + .table_factories_mut() + .insert(ft.to_uppercase(), Arc::clone(&factory)); + } + + let ctx = SessionContext::new_with_state(state); + + let schema_provider = Arc::new(QuickwitSchemaProvider::new(self.sources.clone())); + let catalog = Arc::new(MemoryCatalogProvider::new()); + catalog + .register_schema("public", schema_provider) + .map_err(|e| { + datafusion::error::DataFusionError::Internal(format!( + "failed to register 'public' schema: {e}" + )) + })?; + ctx.register_catalog("quickwit", catalog); + + Ok(ctx) + } +} + +/// Newtype wrapper so `Arc` can be passed to +/// `with_distributed_worker_resolver`, which requires an owned `impl WorkerResolver`. +/// +/// `Arc` cannot be passed directly because +/// the trait bound requires `Sized`. This wrapper is `'static` and satisfies +/// the `WorkerResolver + Send + Sync + 'static` bound. 
+struct ArcWorkerResolver(Arc); + +impl WorkerResolver for ArcWorkerResolver { + fn get_urls(&self) -> Result, datafusion::error::DataFusionError> { + self.0.get_urls() + } +} diff --git a/quickwit/quickwit-datafusion/src/sources/metrics/factory.rs b/quickwit/quickwit-datafusion/src/sources/metrics/factory.rs new file mode 100644 index 00000000000..5fbbc5deee0 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/sources/metrics/factory.rs @@ -0,0 +1,89 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! `TableProviderFactory` for metrics indexes. +//! +//! Allows callers to declare the expected schema inline in SQL: +//! +//! ```sql +//! CREATE EXTERNAL TABLE "my-metrics" ( +//! metric_name VARCHAR NOT NULL, +//! timestamp_secs BIGINT NOT NULL, +//! value DOUBLE NOT NULL, +//! service VARCHAR, +//! env VARCHAR +//! ) STORED AS metrics LOCATION 'my-metrics'; +//! ``` + +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use async_trait::async_trait; +use datafusion::catalog::Session; +use datafusion::catalog::TableProviderFactory; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::logical_expr::CreateExternalTable; + +use super::index_resolver::MetricsIndexResolver; +use super::table_provider::MetricsTableProvider; + +/// The file type string used in `STORED AS metrics`. 
+pub const METRICS_FILE_TYPE: &str = "metrics"; + +/// Creates `MetricsTableProvider` instances from `CREATE EXTERNAL TABLE` DDL. +#[derive(Debug)] +pub struct MetricsTableProviderFactory { + index_resolver: Arc, +} + +impl MetricsTableProviderFactory { + pub fn new(index_resolver: Arc) -> Self { + Self { index_resolver } + } +} + +#[async_trait] +impl TableProviderFactory for MetricsTableProviderFactory { + async fn create( + &self, + _state: &dyn Session, + cmd: &CreateExternalTable, + ) -> DFResult> { + let index_name = if cmd.location.is_empty() { + cmd.name.table().to_string() + } else { + cmd.location.clone() + }; + + let (split_provider, object_store, object_store_url) = + self.index_resolver.resolve(&index_name).await?; + + let arrow_schema: SchemaRef = Arc::new(cmd.schema.as_arrow().clone()); + + if arrow_schema.fields().is_empty() { + return Err(DataFusionError::Plan(format!( + "CREATE EXTERNAL TABLE '{index_name}' must declare at least one column" + ))); + } + + let provider = MetricsTableProvider::new( + arrow_schema, + split_provider, + object_store, + object_store_url, + ); + + Ok(Arc::new(provider)) + } +} diff --git a/quickwit/quickwit-datafusion/src/sources/metrics/index_resolver.rs b/quickwit/quickwit-datafusion/src/sources/metrics/index_resolver.rs new file mode 100644 index 00000000000..436dee29878 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/sources/metrics/index_resolver.rs @@ -0,0 +1,193 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//! Index resolution for the metrics data source. +//! +//! `MetastoreIndexResolver::resolve()` performs two RPCs per call: +//! 1. `index_metadata` — cheap primary-key lookup, always fresh. +//! 2. `storage_resolver.resolve(uri)` — constructs a `Storage` handle. +//! +//! Caching of the `Storage` handle (to amortise repeated resolve calls for the +//! same index) is intentionally deferred to a follow-up. The quickwit search +//! path also resolves storage on every leaf request without caching and +//! relies on the split-byte cache (`SplitCache`) instead. + +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::error::Result as DFResult; +use datafusion::execution::object_store::ObjectStoreUrl; +use object_store::ObjectStore; +use quickwit_metastore::{IndexMetadataResponseExt, ListIndexesMetadataResponseExt}; +use quickwit_proto::metastore::{ + IndexMetadataRequest, ListIndexesMetadataRequest, MetastoreService, MetastoreServiceClient, +}; +use quickwit_storage::StorageResolver; +use tracing::debug; + +use super::metastore_provider::MetastoreSplitProvider; +use super::table_provider::MetricsSplitProvider; +use crate::storage_bridge::QuickwitObjectStore; + +/// Resolves per-index resources needed to scan a metrics index. +#[async_trait] +pub trait MetricsIndexResolver: Send + Sync + std::fmt::Debug { + async fn resolve( + &self, + index_name: &str, + ) -> DFResult<(Arc, Arc, ObjectStoreUrl)>; + + async fn list_index_names(&self) -> DFResult>; +} + +// ── Test helper ────────────────────────────────────────────────────── + +/// Single-store resolver — returns the same resources for every index name. 
+#[cfg(any(test, feature = "testsuite"))] +#[derive(Debug)] +pub struct SimpleIndexResolver { + split_provider: Arc, + object_store: Arc, + object_store_url: ObjectStoreUrl, + index_names: Vec, +} + +#[cfg(any(test, feature = "testsuite"))] +impl SimpleIndexResolver { + pub fn new( + split_provider: Arc, + object_store: Arc, + object_store_url: ObjectStoreUrl, + ) -> Self { + Self { + split_provider, + object_store, + object_store_url, + index_names: vec!["metrics".to_string()], + } + } + + pub fn with_index_names(mut self, names: Vec) -> Self { + self.index_names = names; + self + } +} + +#[cfg(any(test, feature = "testsuite"))] +#[async_trait] +impl MetricsIndexResolver for SimpleIndexResolver { + async fn resolve( + &self, + _index_name: &str, + ) -> DFResult<(Arc, Arc, ObjectStoreUrl)> { + Ok(( + Arc::clone(&self.split_provider), + Arc::clone(&self.object_store), + self.object_store_url.clone(), + )) + } + + async fn list_index_names(&self) -> DFResult> { + Ok(self.index_names.clone()) + } +} + +// ── Production implementation ───────────────────────────────────────── + +/// Production `MetricsIndexResolver` backed by the Quickwit metastore. +/// +/// Each `resolve()` call: +/// 1. Fetches `IndexMetadata` (cheap primary-key RPC) for a fresh `index_uid`. +/// 2. Calls `storage_resolver.resolve(uri)` to obtain a `Storage` handle. 
+#[derive(Clone)] +pub struct MetastoreIndexResolver { + metastore: MetastoreServiceClient, + storage_resolver: StorageResolver, +} + +impl MetastoreIndexResolver { + pub fn new(metastore: MetastoreServiceClient, storage_resolver: StorageResolver) -> Self { + Self { metastore, storage_resolver } + } +} + +impl std::fmt::Debug for MetastoreIndexResolver { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MetastoreIndexResolver").finish() + } +} + +#[async_trait] +impl MetricsIndexResolver for MetastoreIndexResolver { + async fn resolve( + &self, + index_name: &str, + ) -> DFResult<(Arc, Arc, ObjectStoreUrl)> { + debug!(index_name, "resolving metrics index"); + + let response = self + .metastore + .clone() + .index_metadata(IndexMetadataRequest::for_index_id(index_name.to_string())) + .await + .map_err(|err| datafusion::error::DataFusionError::External(Box::new(err)))?; + + let index_metadata = response + .deserialize_index_metadata() + .map_err(|err| datafusion::error::DataFusionError::External(Box::new(err)))?; + + let index_uid = index_metadata.index_uid.clone(); + let index_uri = &index_metadata.index_config.index_uri; + + debug!(%index_uid, %index_uri, "resolved index metadata"); + + let storage = self + .storage_resolver + .resolve(index_uri) + .await + .map_err(|err| datafusion::error::DataFusionError::External(Box::new(err)))?; + + let object_store_url = + ObjectStoreUrl::parse(format!("quickwit://{index_name}/")).map_err(|err| { + datafusion::error::DataFusionError::Internal(format!( + "failed to build object store url: {err}" + )) + })?; + + let object_store: Arc = Arc::new(QuickwitObjectStore::new(storage)); + let split_provider: Arc = + Arc::new(MetastoreSplitProvider::new(self.metastore.clone(), index_uid)); + + Ok((split_provider, object_store, object_store_url)) + } + + async fn list_index_names(&self) -> DFResult> { + let response = self + .metastore + .clone() + 
.list_indexes_metadata(ListIndexesMetadataRequest::all()) + .await + .map_err(|err| datafusion::error::DataFusionError::External(Box::new(err)))?; + + let indexes = response + .deserialize_indexes_metadata() + .await + .map_err(|err| datafusion::error::DataFusionError::External(Box::new(err)))?; + + Ok(indexes + .into_iter() + .map(|idx| idx.index_config.index_id) + .collect()) + } +} diff --git a/quickwit/quickwit-datafusion/src/sources/metrics/metastore_provider.rs b/quickwit/quickwit-datafusion/src/sources/metrics/metastore_provider.rs new file mode 100644 index 00000000000..7d34112f583 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/sources/metrics/metastore_provider.rs @@ -0,0 +1,153 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Real `MetricsSplitProvider` backed by the Quickwit metastore. + + +use async_trait::async_trait; +use datafusion::error::Result as DFResult; +use quickwit_metastore::{ + ListMetricsSplitsQuery, ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt, +}; +use quickwit_parquet_engine::split::MetricsSplitMetadata; +use quickwit_proto::metastore::{ + ListMetricsSplitsRequest, MetastoreService, MetastoreServiceClient, +}; +use quickwit_proto::types::IndexUid; +use tracing::{debug, instrument}; + +use super::predicate::MetricsSplitQuery; +use super::table_provider::MetricsSplitProvider; + +/// `MetricsSplitProvider` backed by the Quickwit metastore RPC. 
#[derive(Debug, Clone)]
pub struct MetastoreSplitProvider {
    metastore: MetastoreServiceClient,
    index_uid: IndexUid,
}

impl MetastoreSplitProvider {
    /// Creates a provider scoped to a single index (`index_uid`).
    pub fn new(metastore: MetastoreServiceClient, index_uid: IndexUid) -> Self {
        Self {
            metastore,
            index_uid,
        }
    }
}

#[async_trait]
impl MetricsSplitProvider for MetastoreSplitProvider {
    /// Queries the metastore for splits matching `query` and returns their
    /// metadata. Errors are wrapped as `DataFusionError::External`.
    #[instrument(
        skip(self, query),
        fields(
            index_uid = %self.index_uid,
            metric_names = ?query.metric_names,
            time_range_start = ?query.time_range_start,
            time_range_end = ?query.time_range_end,
            num_splits,
        )
    )]
    async fn list_splits(
        &self,
        query: &MetricsSplitQuery,
    ) -> DFResult<Vec<MetricsSplitMetadata>> {
        let metastore_query = to_metastore_query(&self.index_uid, query);

        let request =
            ListMetricsSplitsRequest::try_from_query(self.index_uid.clone(), &metastore_query)
                .map_err(|err| {
                    datafusion::error::DataFusionError::External(Box::new(err))
                })?;

        let response = self
            .metastore
            .clone()
            .list_metrics_splits(request)
            .await
            .map_err(|err| datafusion::error::DataFusionError::External(Box::new(err)))?;

        let records = response
            .deserialize_splits()
            .map_err(|err| datafusion::error::DataFusionError::External(Box::new(err)))?;

        // The metastore guarantees only Published splits are returned because
        // `to_metastore_query` sets `split_states = vec![Published]`. No
        // client-side re-filter is needed here.
        // NOTE(review): that invariant is established inside
        // `ListMetricsSplitsQuery::for_index` — confirm it still holds if that
        // constructor changes.
        let splits: Vec<MetricsSplitMetadata> = records
            .into_iter()
            .map(|record| record.metadata)
            .collect();

        // Record the count on the tracing span opened by #[instrument].
        tracing::Span::current().record("num_splits", splits.len());
        debug!(num_splits = splits.len(), "metastore returned splits");

        Ok(splits)
    }
}

/// Convert a DataFusion `MetricsSplitQuery` to a metastore `ListMetricsSplitsQuery`.
///
/// Note: The OSS parquet column names are bare (service, env, etc.) but the
/// metastore `ListMetricsSplitsQuery` still uses the `tag_service`, `tag_env`
/// field names — this is just the metastore's internal naming convention.
///
/// # Tag field pushdown limitation
///
/// `ListMetricsSplitsQuery` accepts at most one value per tag field
/// (`Option<String>`). When a DataFusion `IN (...)` predicate produces
/// multiple candidate values for a tag column, the metastore cannot express
/// the full filter, so **no metastore-level pruning is applied for that
/// dimension** — the value is left as `None`. The parquet-level filter
/// (applied after the split is opened) will still enforce the predicate
/// correctly. Only single-value equalities (`WHERE service = 'web'`) or
/// single-element IN lists are pushed down to the metastore.
fn to_metastore_query(index_uid: &IndexUid, query: &MetricsSplitQuery) -> ListMetricsSplitsQuery {
    let mut metastore_query = ListMetricsSplitsQuery::for_index(index_uid.clone());

    if let Some(ref names) = query.metric_names {
        metastore_query.metric_names = names.clone();
    }

    // MetricsSplitQuery carries u64 timestamps; the metastore query uses i64.
    if let Some(start) = query.time_range_start {
        metastore_query.time_range_start = Some(start as i64);
    }

    if let Some(end) = query.time_range_end {
        metastore_query.time_range_end = Some(end as i64);
    }

    // Push down a tag filter to the metastore only when there is exactly one
    // candidate value. Multi-value IN lists cannot be expressed as a single
    // `Option<String>` on `ListMetricsSplitsQuery`; passing only the first
    // value would silently skip splits that match the other values, producing
    // incorrect (incomplete) results. For multi-value lists we pass `None`
    // (no metastore pruning) and rely on the parquet-level filter instead.
    metastore_query.tag_service = single_value(query.tag_service.as_deref());
    metastore_query.tag_env = single_value(query.tag_env.as_deref());
    metastore_query.tag_datacenter = single_value(query.tag_datacenter.as_deref());
    metastore_query.tag_region = single_value(query.tag_region.as_deref());
    metastore_query.tag_host = single_value(query.tag_host.as_deref());

    metastore_query
}

/// Returns the single element of `values` as `Some(value)`, or `None` if
/// `values` is absent, empty, or contains more than one element.
fn single_value(values: Option<&[String]>) -> Option<String> {
    match values {
        Some([single]) => Some(single.clone()),
        _ => None,
    }
}
diff --git a/quickwit/quickwit-datafusion/src/sources/metrics/mod.rs b/quickwit/quickwit-datafusion/src/sources/metrics/mod.rs
new file mode 100644
index 00000000000..b5167a05274
--- /dev/null
+++ b/quickwit/quickwit-datafusion/src/sources/metrics/mod.rs
@@ -0,0 +1,235 @@
// Copyright 2021-Present Datadog, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Metrics data source for DataFusion.
//!
//! `MetricsDataSource` implements `QuickwitDataSource` and encapsulates all
//! metrics-specific logic: split providers, index resolution, filter pushdown,
//! and object-store pre-registration for Flight workers.
//!
//! All metrics-specific code lives in this module; none leaks into the generic
//! session / catalog / worker layer.
pub(crate) mod factory;
pub(crate) mod index_resolver;
pub(crate) mod metastore_provider;
pub(crate) mod predicate;
pub(crate) mod table_provider;

#[cfg(any(test, feature = "testsuite"))]
pub mod test_utils;

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef};
use async_trait::async_trait;
use datafusion::catalog::TableProviderFactory;
use datafusion::datasource::TableProvider;
use datafusion::error::Result as DFResult;
use datafusion::execution::SessionState;
use quickwit_proto::metastore::{MetastoreError, MetastoreServiceClient};
use quickwit_storage::StorageResolver;
use tracing::debug;

use crate::data_source::{DataSourceContributions, QuickwitDataSource};
use self::factory::{MetricsTableProviderFactory, METRICS_FILE_TYPE};
use self::index_resolver::{MetastoreIndexResolver, MetricsIndexResolver};
use self::table_provider::MetricsTableProvider;

/// `QuickwitDataSource` implementation for OSS parquet metrics.
///
/// Backed by the Quickwit metastore for split discovery and `StorageResolver`
/// for object-store access. Registers object stores on Flight workers via
/// `register_for_worker()`.
#[derive(Debug)]
pub struct MetricsDataSource {
    // Trait object so tests can inject a fake resolver via `with_resolver`.
    index_resolver: Arc<dyn MetricsIndexResolver>,
}

impl MetricsDataSource {
    /// Create a production `MetricsDataSource` backed by the metastore.
    pub fn new(
        metastore: MetastoreServiceClient,
        storage_resolver: StorageResolver,
    ) -> Self {
        let resolver = MetastoreIndexResolver::new(metastore, storage_resolver);
        Self {
            index_resolver: Arc::new(resolver),
        }
    }

    /// Create with a custom resolver (for tests).
    pub fn with_resolver(index_resolver: Arc<dyn MetricsIndexResolver>) -> Self {
        Self { index_resolver }
    }
}

/// Minimal 4-column schema — always present in every OSS metrics parquet file.
fn minimal_base_schema() -> SchemaRef {
    // metric_name is dictionary-encoded (Int32 keys over Utf8 values) to match
    // the parquet files' encoding.
    let dict = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
    Arc::new(ArrowSchema::new(vec![
        Field::new("metric_name", dict, false),
        Field::new("metric_type", DataType::UInt8, false),
        Field::new("timestamp_secs", DataType::UInt64, false),
        Field::new("value", DataType::Float64, false),
    ]))
}

#[async_trait]
impl QuickwitDataSource for MetricsDataSource {
    fn contributions(&self) -> DataSourceContributions {
        DataSourceContributions::default()
    }

    /// Handle `ReadRel` nodes in incoming Substrait plans.
    ///
    /// ## OSS path — `NamedTable`
    ///
    /// When the read type is `NamedTable { names: [index_name] }` and the index
    /// exists in the metastore, returns a `MetricsTableProvider` using the
    /// schema from `schema_hint` (derived from `ReadRel.base_schema` by the
    /// caller). Returning `None` for an unknown index lets the standard catalog
    /// path take over.
    ///
    /// ## Extension path — custom protos (downstream callers)
    ///
    /// A downstream caller registers its own `QuickwitDataSource` that handles
    /// `ExtensionTable`. This default implementation only handles
    /// `NamedTable` — `ExtensionTable` always returns `Ok(None)` here.
    async fn try_consume_read_rel(
        &self,
        rel: &datafusion_substrait::substrait::proto::ReadRel,
        schema_hint: Option<SchemaRef>,
    ) -> DFResult<Option<(String, Arc<dyn TableProvider>)>> {
        use datafusion_substrait::substrait::proto::read_rel::ReadType;

        // Only handle NamedTable reads. ExtensionTable (downstream callers) returns None.
        let Some(ReadType::NamedTable(nt)) = &rel.read_type else {
            return Ok(None);
        };
        // `NamedTable::names` is a path like ["catalog", "schema", "table"];
        // the last element is the effective table name. An empty list is a
        // malformed plan — skip rather than silently resolving to index "".
        let Some(index_name) = nt.names.last() else {
            return Ok(None);
        };
        let index_name = index_name.as_str();

        // Use the producer-declared schema if available; fall back to minimal base schema.
        let schema = schema_hint.unwrap_or_else(minimal_base_schema);

        match self.index_resolver.resolve(index_name).await {
            Ok((split_provider, object_store, object_store_url)) => {
                let provider = MetricsTableProvider::new(
                    schema,
                    split_provider,
                    object_store,
                    object_store_url,
                );
                Ok(Some((index_name.to_string(), Arc::new(provider))))
            }
            Err(err) => {
                // Not-found means this source doesn't own the index; let others try.
                let is_not_found = match &err {
                    datafusion::error::DataFusionError::External(boxed) => boxed
                        .downcast_ref::<MetastoreError>()
                        .map(|me| matches!(me, MetastoreError::NotFound(_)))
                        .unwrap_or(false),
                    _ => false,
                };
                if is_not_found { Ok(None) } else { Err(err) }
            }
        }
    }

    /// Registers the `CREATE EXTERNAL TABLE ... STORED AS <METRICS_FILE_TYPE>`
    /// factory for this source.
    fn ddl_registration(&self) -> Option<(String, Arc<dyn TableProviderFactory>)> {
        let factory: Arc<dyn TableProviderFactory> = Arc::new(MetricsTableProviderFactory::new(
            Arc::clone(&self.index_resolver),
        ));
        Some((METRICS_FILE_TYPE.to_string(), factory))
    }

    /// Builds a provider for `index_name` with the minimal base schema, or
    /// `Ok(None)` when the index does not exist.
    async fn create_default_table_provider(
        &self,
        index_name: &str,
    ) -> DFResult<Option<Arc<dyn TableProvider>>> {
        match self.index_resolver.resolve(index_name).await {
            Ok((split_provider, object_store, object_store_url)) => {
                let provider = MetricsTableProvider::new(
                    minimal_base_schema(),
                    split_provider,
                    object_store,
                    object_store_url,
                );
                Ok(Some(Arc::new(provider)))
            }
            Err(err) => {
                // Only swallow "index not found" — propagate everything else so the
                // caller gets an actionable error (e.g. metastore unavailable).
                let is_not_found = match &err {
                    datafusion::error::DataFusionError::External(boxed) => boxed
                        .downcast_ref::<MetastoreError>()
                        .map(|me| matches!(me, MetastoreError::NotFound(_)))
                        .unwrap_or(false),
                    _ => false,
                };
                if is_not_found {
                    Ok(None)
                } else {
                    Err(err)
                }
            }
        }
    }

    /// Pre-registers every index's object store on a Flight worker session so
    /// `ParquetSource` can open files without a per-scan resolution round-trip.
    /// Individual index failures are logged and skipped (non-fatal).
    async fn register_for_worker(&self, state: &SessionState) -> DFResult<()> {
        let index_names = self.index_resolver.list_index_names().await?;

        // Resolve all indexes concurrently — issuing N sequential `index_metadata`
        // RPCs would cost O(N × rtt) wall-clock time; concurrent resolution keeps
        // startup latency near O(rtt) regardless of index count.
        // NOTE(review): this comment previously claimed an object-store cache in
        // MetastoreIndexResolver skips storage-resolver RPCs on re-registration;
        // the resolver as defined in index_resolver.rs performs a fresh resolve
        // each call — confirm whether caching exists elsewhere or is planned.
        let resolver = &self.index_resolver;
        let results = futures::future::join_all(
            index_names
                .iter()
                .map(|name| resolver.resolve(name.as_str())),
        )
        .await;

        for (index_name, result) in index_names.iter().zip(results) {
            match result {
                Ok((_, object_store, object_store_url)) => {
                    state
                        .runtime_env()
                        .register_object_store(object_store_url.as_ref(), object_store);
                    debug!(index_name, "registered object store for metrics worker");
                }
                Err(err) => {
                    debug!(
                        index_name,
                        error = %err,
                        "skipping metrics index in worker registration (non-fatal)"
                    );
                }
            }
        }
        Ok(())
    }

    async fn list_index_names(&self) -> DFResult<Vec<String>> {
        self.index_resolver.list_index_names().await
    }
}
diff --git a/quickwit/quickwit-datafusion/src/sources/metrics/predicate.rs b/quickwit/quickwit-datafusion/src/sources/metrics/predicate.rs
new file mode 100644
index 00000000000..8c6f0e8646e
--- /dev/null
+++ b/quickwit/quickwit-datafusion/src/sources/metrics/predicate.rs
@@ -0,0 +1,516 @@
// Copyright 2021-Present Datadog, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Predicate extraction for Postgres split pruning.
//!
//! Extracts metric_name, time_range, and tag filters from DataFusion
//! filter expressions to build a query for the `metrics_splits` table.
//!
//! OSS column names: `service`, `env`, `datacenter`, `region`, `host`
//! (no `tag_` prefix — the parquet files use bare column names).

use datafusion::logical_expr::{BinaryExpr, Expr, Operator};
use datafusion::scalar::ScalarValue;

/// Extracted filters for querying the metrics_splits table.
///
/// Time bounds are seconds (`timestamp_secs` column): `time_range_start` is
/// inclusive, `time_range_end` is exclusive. Multi-value fields come from
/// `IN (...)` predicates; a split matches if it contains ANY listed value.
#[derive(Debug, Default, Clone)]
pub struct MetricsSplitQuery {
    pub metric_names: Option<Vec<String>>,
    pub time_range_start: Option<u64>,
    pub time_range_end: Option<u64>,
    pub tag_service: Option<Vec<String>>,
    pub tag_env: Option<Vec<String>>,
    pub tag_datacenter: Option<Vec<String>>,
    pub tag_region: Option<Vec<String>>,
    pub tag_host: Option<Vec<String>>,
}

/// Analyzes pushed-down filter expressions and extracts split-level filters.
///
/// Returns a `MetricsSplitQuery` for Postgres pruning plus any remaining
/// filter expressions that must be applied at the parquet reader level.
+pub fn extract_split_filters(filters: &[Expr]) -> (MetricsSplitQuery, Vec) { + let mut query = MetricsSplitQuery::default(); + let mut remaining = Vec::new(); + + for filter in filters { + if !try_extract_filter(filter, &mut query) { + remaining.push(filter.clone()); + } + } + + (query, remaining) +} + +fn try_extract_filter(expr: &Expr, query: &mut MetricsSplitQuery) -> bool { + match expr { + Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op { + Operator::Eq => try_extract_eq(left, right, query), + Operator::GtEq => try_extract_ts_gte(left, right, query), + Operator::Gt => try_extract_ts_gt(left, right, query), + Operator::Lt => try_extract_ts_lt(left, right, query), + Operator::LtEq => try_extract_ts_lte(left, right, query), + Operator::And => { + let l = try_extract_filter(left, query); + let r = try_extract_filter(right, query); + l && r + } + _ => false, + }, + Expr::InList(in_list) if !in_list.negated => { + try_extract_in_list(&in_list.expr, &in_list.list, query) + } + _ => false, + } +} + +fn try_extract_eq(left: &Expr, right: &Expr, query: &mut MetricsSplitQuery) -> bool { + let (col, val) = match (column_name(left), scalar_utf8(right)) { + (Some(c), Some(v)) => (c, v), + _ => match (scalar_utf8(left), column_name(right)) { + (Some(v), Some(c)) => (c, v), + _ => return false, + }, + }; + set_tag_values(&col, vec![val], query) +} + +fn try_extract_in_list(expr: &Expr, list: &[Expr], query: &mut MetricsSplitQuery) -> bool { + let col = match column_name(expr) { + Some(n) => n, + None => return false, + }; + let values: Vec = list.iter().filter_map(scalar_utf8).collect(); + if values.is_empty() || values.len() != list.len() { + return false; + } + set_tag_values(&col, values, query) +} + +fn try_extract_ts_gte(left: &Expr, right: &Expr, q: &mut MetricsSplitQuery) -> bool { + if let (Some(c), Some(v)) = (column_name(left), scalar_u64(right)) { + if c == "timestamp_secs" { + q.time_range_start = Some(v); + return true; + } + } + false +} + +fn 
try_extract_ts_gt(left: &Expr, right: &Expr, q: &mut MetricsSplitQuery) -> bool { + if let (Some(c), Some(v)) = (column_name(left), scalar_u64(right)) { + if c == "timestamp_secs" { + q.time_range_start = Some(v + 1); + return true; + } + } + false +} + +fn try_extract_ts_lt(left: &Expr, right: &Expr, q: &mut MetricsSplitQuery) -> bool { + if let (Some(c), Some(v)) = (column_name(left), scalar_u64(right)) { + if c == "timestamp_secs" { + q.time_range_end = Some(v); + return true; + } + } + false +} + +fn try_extract_ts_lte(left: &Expr, right: &Expr, q: &mut MetricsSplitQuery) -> bool { + if let (Some(c), Some(v)) = (column_name(left), scalar_u64(right)) { + if c == "timestamp_secs" { + q.time_range_end = Some(v + 1); + return true; + } + } + false +} + +/// Map OSS column names (no `tag_` prefix) to MetricsSplitQuery tag fields. +fn set_tag_values(col: &str, values: Vec, q: &mut MetricsSplitQuery) -> bool { + match col { + "metric_name" => { + q.metric_names = Some(values); + true + } + // OSS column names: bare names without `tag_` prefix + "service" => { + q.tag_service = Some(values); + true + } + "env" => { + q.tag_env = Some(values); + true + } + "datacenter" => { + q.tag_datacenter = Some(values); + true + } + "region" => { + q.tag_region = Some(values); + true + } + "host" => { + q.tag_host = Some(values); + true + } + _ => false, + } +} + +fn column_name(expr: &Expr) -> Option { + match expr { + Expr::Column(col) => Some(col.name().to_string()), + // DataFusion inserts CASTs when comparing UInt64 columns with Int64 literals. + // Unwrap the cast to find the underlying column name. + Expr::Cast(datafusion::logical_expr::Cast { expr, .. }) + | Expr::TryCast(datafusion::logical_expr::TryCast { expr, .. 
}) => column_name(expr), + _ => None, + } +} + +fn scalar_utf8(expr: &Expr) -> Option { + match expr { + Expr::Literal(ScalarValue::Utf8(Some(s)), _) => Some(s.clone()), + Expr::Literal(ScalarValue::LargeUtf8(Some(s)), _) => Some(s.clone()), + // DF auto-casts string literals to Dict(Int32, Utf8) to match dict-encoded columns + Expr::Literal(ScalarValue::Dictionary(_, inner), _) => scalar_utf8_from_scalar(inner), + _ => None, + } +} + +fn scalar_utf8_from_scalar(value: &ScalarValue) -> Option { + match value { + ScalarValue::Utf8(Some(s)) => Some(s.clone()), + ScalarValue::LargeUtf8(Some(s)) => Some(s.clone()), + _ => None, + } +} + +fn scalar_u64(expr: &Expr) -> Option { + match expr { + Expr::Literal(ScalarValue::UInt64(Some(v)), _) => Some(*v), + Expr::Literal(ScalarValue::Int64(Some(v)), _) if *v >= 0 => Some(*v as u64), + Expr::Literal(ScalarValue::UInt32(Some(v)), _) => Some(*v as u64), + Expr::Literal(ScalarValue::Int32(Some(v)), _) if *v >= 0 => Some(*v as u64), + // Unwrap casts inserted by DataFusion type coercion. + Expr::Cast(datafusion::logical_expr::Cast { expr, .. }) + | Expr::TryCast(datafusion::logical_expr::TryCast { expr, .. 
}) => scalar_u64(expr), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use datafusion::prelude::*; + + use super::*; + + #[test] + fn test_extract_metric_name_eq() { + let filters = vec![col("metric_name").eq(lit("cpu.usage"))]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.metric_names, Some(vec!["cpu.usage".to_string()])); + assert!(remaining.is_empty()); + } + + #[test] + fn test_extract_timestamp_range() { + let filters = vec![ + col("timestamp_secs").gt_eq(lit(1000u64)), + col("timestamp_secs").lt(lit(2000u64)), + ]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.time_range_start, Some(1000)); + assert_eq!(query.time_range_end, Some(2000)); + assert!(remaining.is_empty()); + } + + #[test] + fn test_extract_tag_filters() { + // OSS uses bare column names (no tag_ prefix) + let filters = vec![ + col("metric_name").eq(lit("cpu.usage")), + col("service").eq(lit("web")), + col("env").eq(lit("prod")), + ]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.metric_names, Some(vec!["cpu.usage".to_string()])); + assert_eq!(query.tag_service, Some(vec!["web".to_string()])); + assert_eq!(query.tag_env, Some(vec!["prod".to_string()])); + assert!(remaining.is_empty()); + } + + #[test] + fn test_unknown_column_left_as_remaining() { + let filters = vec![ + col("metric_name").eq(lit("cpu.usage")), + col("value").gt(lit(42.0)), + ]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.metric_names, Some(vec!["cpu.usage".to_string()])); + assert_eq!(remaining.len(), 1); + } + + #[test] + fn test_in_list_extraction() { + let filters = vec![col("metric_name").in_list( + vec![lit("cpu.usage"), lit("memory.used")], + false, + )]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!( + query.metric_names, + Some(vec!["cpu.usage".to_string(), "memory.used".to_string()]) + ); + assert!(remaining.is_empty()); + } + + // ── CAST unwrapping 
(DataFusion type coercion) ───────────── + + #[test] + fn test_timestamp_gte_with_cast_column() { + // DataFusion rewrites `timestamp_secs >= 1000` (UInt64 col vs Int64 lit) as + // CAST(timestamp_secs AS Int64) >= 1000 + let filters = vec![Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Cast(datafusion::logical_expr::Cast { + expr: Box::new(col("timestamp_secs")), + data_type: arrow::datatypes::DataType::Int64, + })), + op: Operator::GtEq, + right: Box::new(lit(1000i64)), + })]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.time_range_start, Some(1000)); + assert!(remaining.is_empty()); + } + + #[test] + fn test_timestamp_lt_with_cast_column() { + let filters = vec![Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Cast(datafusion::logical_expr::Cast { + expr: Box::new(col("timestamp_secs")), + data_type: arrow::datatypes::DataType::Int64, + })), + op: Operator::Lt, + right: Box::new(lit(2000i64)), + })]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.time_range_end, Some(2000)); + assert!(remaining.is_empty()); + } + + #[test] + fn test_timestamp_gt_with_cast_literal() { + let filters = vec![Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("timestamp_secs")), + op: Operator::Gt, + right: Box::new(Expr::Cast(datafusion::logical_expr::Cast { + expr: Box::new(lit(500i64)), + data_type: arrow::datatypes::DataType::UInt64, + })), + })]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.time_range_start, Some(501)); + assert!(remaining.is_empty()); + } + + #[test] + fn test_metric_name_eq_with_dict_cast() { + let dict_lit = Expr::Literal( + ScalarValue::Dictionary( + Box::new(arrow::datatypes::DataType::Int32), + Box::new(ScalarValue::Utf8(Some("cpu.usage".to_string()))), + ), + None, + ); + let filters = vec![Expr::BinaryExpr(BinaryExpr { + left: Box::new(col("metric_name")), + op: Operator::Eq, + right: Box::new(dict_lit), + })]; + let (query, remaining) = 
extract_split_filters(&filters); + assert_eq!(query.metric_names, Some(vec!["cpu.usage".to_string()])); + assert!(remaining.is_empty()); + } + + #[test] + fn test_all_tag_filters_pushdown() { + // OSS uses bare column names + let filters = vec![ + col("service").eq(lit("web")), + col("env").eq(lit("prod")), + col("datacenter").eq(lit("dc1")), + col("region").eq(lit("us-east-1")), + col("host").eq(lit("host-01")), + ]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.tag_service, Some(vec!["web".to_string()])); + assert_eq!(query.tag_env, Some(vec!["prod".to_string()])); + assert_eq!(query.tag_datacenter, Some(vec!["dc1".to_string()])); + assert_eq!(query.tag_region, Some(vec!["us-east-1".to_string()])); + assert_eq!(query.tag_host, Some(vec!["host-01".to_string()])); + assert!(remaining.is_empty()); + } + + #[test] + fn test_combined_metric_time_tags_pushdown() { + let filters = vec![ + col("metric_name").eq(lit("cpu.usage")), + col("timestamp_secs").gt_eq(lit(1000u64)), + col("timestamp_secs").lt(lit(2000u64)), + col("env").eq(lit("prod")), + col("value").gt(lit(0.5)), // not pushable + ]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.metric_names, Some(vec!["cpu.usage".to_string()])); + assert_eq!(query.time_range_start, Some(1000)); + assert_eq!(query.time_range_end, Some(2000)); + assert_eq!(query.tag_env, Some(vec!["prod".to_string()])); + assert_eq!(remaining.len(), 1, "value > 0.5 should remain"); + } + + #[test] + fn test_timestamp_lte_pushdown() { + let filters = vec![col("timestamp_secs").lt_eq(lit(5000u64))]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!(query.time_range_end, Some(5001)); + assert!(remaining.is_empty()); + } + + #[test] + fn test_tag_in_list_pushdown() { + let filters = vec![col("service").in_list(vec![lit("web"), lit("api")], false)]; + let (query, remaining) = extract_split_filters(&filters); + assert_eq!( + query.tag_service, + 
Some(vec!["web".to_string(), "api".to_string()]) + ); + assert!(remaining.is_empty()); + } + + #[test] + fn test_no_filters_returns_empty_query() { + let (query, remaining) = extract_split_filters(&[]); + assert!(query.metric_names.is_none()); + assert!(query.time_range_start.is_none()); + assert!(query.time_range_end.is_none()); + assert!(query.tag_service.is_none()); + assert!(remaining.is_empty()); + } + + // ── Extraction → pruning pipeline (Fix #22) ─────────────────────── + + /// Verifies that `extract_split_filters` prunes at the SPLIT level, not just + /// at the row level. This test would fail if metric_name equality extraction + /// were removed — `count_matching` would return 2 instead of 1. + #[test] + fn test_metric_name_pruning_prunes_splits_not_just_rows() { + use quickwit_parquet_engine::split::{MetricsSplitMetadata, SplitId, TimeRange}; + + use crate::sources::metrics::test_utils::TestSplitProvider; + + let cpu_split = MetricsSplitMetadata::builder() + .split_id(SplitId::new("cpu")) + .index_uid("idx:0000") + .time_range(TimeRange::new(100, 300)) + .num_rows(2) + .size_bytes(1024) + .add_metric_name("cpu.usage") + .build(); + let mem_split = MetricsSplitMetadata::builder() + .split_id(SplitId::new("mem")) + .index_uid("idx:0000") + .time_range(TimeRange::new(100, 300)) + .num_rows(2) + .size_bytes(1024) + .add_metric_name("memory.used") + .build(); + + let provider = TestSplitProvider::new(vec![cpu_split, mem_split]); + + let filters = vec![col("metric_name").eq(lit("cpu.usage"))]; + let (query, remaining) = extract_split_filters(&filters); + assert!(remaining.is_empty(), "metric_name = 'cpu.usage' must be fully extracted"); + + let matching = provider.count_matching(&query); + assert_eq!( + matching, 1, + "predicate extractor must prune to 1 split for metric_name = 'cpu.usage', got \ + {matching}" + ); + } + + // ── TestSplitProvider multi-value IN list (Fix #23) ─────────────── + + /// Verifies that `TestSplitProvider` correctly handles multiple 
tag values in a + /// query — returning splits matching ANY of the values, not just the first. + /// + /// The `MetastoreSplitProvider` is limited by the metastore API (first() value + /// only), but `TestSplitProvider` uses `any()` and must correctly include all + /// matching splits. This test would fail if `any()` were changed to `first()`. + #[test] + fn test_split_provider_multi_value_in_list_returns_all_matching_splits() { + use quickwit_parquet_engine::split::{MetricsSplitMetadata, SplitId, TimeRange}; + + use crate::sources::metrics::test_utils::TestSplitProvider; + + let web_split = MetricsSplitMetadata::builder() + .split_id(SplitId::new("web")) + .index_uid("idx:0000") + .time_range(TimeRange::new(100, 300)) + .num_rows(2) + .size_bytes(1024) + .add_metric_name("cpu.usage") + .add_low_cardinality_tag("service", "web") + .build(); + let api_split = MetricsSplitMetadata::builder() + .split_id(SplitId::new("api")) + .index_uid("idx:0000") + .time_range(TimeRange::new(100, 300)) + .num_rows(2) + .size_bytes(1024) + .add_metric_name("cpu.usage") + .add_low_cardinality_tag("service", "api") + .build(); + let db_split = MetricsSplitMetadata::builder() + .split_id(SplitId::new("db")) + .index_uid("idx:0000") + .time_range(TimeRange::new(100, 300)) + .num_rows(2) + .size_bytes(1024) + .add_metric_name("cpu.usage") + .add_low_cardinality_tag("service", "db") + .build(); + + let provider = TestSplitProvider::new(vec![web_split, api_split, db_split]); + + // A filter for service IN ('web', 'api') must match web and api but NOT db. 
+ let filters = vec![col("service").in_list(vec![lit("web"), lit("api")], false)]; + let (query, remaining) = extract_split_filters(&filters); + assert!(remaining.is_empty(), "service IN list must be fully extracted"); + assert_eq!( + query.tag_service, + Some(vec!["web".to_string(), "api".to_string()]) + ); + + let matching = provider.count_matching(&query); + assert_eq!( + matching, 2, + "TestSplitProvider must return both web and api splits for IN ('web','api'), got \ + {matching}" + ); + } +} diff --git a/quickwit/quickwit-datafusion/src/sources/metrics/table_provider.rs b/quickwit/quickwit-datafusion/src/sources/metrics/table_provider.rs new file mode 100644 index 00000000000..1a979018531 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/sources/metrics/table_provider.rs @@ -0,0 +1,209 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! `MetricsTableProvider` — DataFusion TableProvider for a metrics index. +//! +//! Queries the metastore for published splits, prunes via Postgres filters, +//! and returns a standard `ParquetSource`-backed `DataSourceExec`. 
+ +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use async_trait::async_trait; +use datafusion::catalog::Session; +use datafusion::datasource::TableProvider; +use datafusion::datasource::source::DataSourceExec; +use datafusion::error::Result as DFResult; +use datafusion::execution::object_store::ObjectStoreUrl; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableType}; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_datasource::PartitionedFile; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource_parquet::source::ParquetSource; +use object_store::ObjectStore; +use quickwit_parquet_engine::split::MetricsSplitMetadata; +use tracing::debug; + +use super::predicate; + +/// Provides split metadata for a metrics index. +#[async_trait] +pub trait MetricsSplitProvider: Send + Sync + fmt::Debug { + async fn list_splits( + &self, + query: &predicate::MetricsSplitQuery, + ) -> DFResult>; +} + +/// TableProvider for a single metrics index. +/// +/// On `scan()`, queries the metastore for published splits matching the +/// pushed-down predicates, then returns a standard `ParquetSource`-backed +/// `DataSourceExec` with one file group per split. +#[derive(Debug)] +pub struct MetricsTableProvider { + schema: SchemaRef, + split_provider: Arc, + object_store: Arc, + /// URL scheme for the object store (e.g. "file:///tmp/data" or "memory://"). 
+ object_store_url: ObjectStoreUrl, +} + +impl MetricsTableProvider { + pub fn new( + schema: SchemaRef, + split_provider: Arc, + object_store: Arc, + object_store_url: ObjectStoreUrl, + ) -> Self { + Self { + schema, + split_provider, + object_store, + object_store_url, + } + } +} + +#[async_trait] +impl TableProvider for MetricsTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> DFResult> { + Ok(filters.iter().map(|expr| classify_filter(expr)).collect()) + } + + async fn scan( + &self, + state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + ) -> DFResult> { + // Extract split-level filters for metastore pruning + let (split_query, _remaining) = predicate::extract_split_filters(filters); + + debug!( + metric_names = ?split_query.metric_names, + time_start = ?split_query.time_range_start, + time_end = ?split_query.time_range_end, + "querying metastore for matching splits" + ); + + let splits = self.split_provider.list_splits(&split_query).await?; + + debug!(num_splits = splits.len(), "found matching splits"); + + // Register our object store with the runtime so ParquetSource can use it + // Register on every scan to handle sessions where register_for_worker + // was not called (single-node non-distributed mode). The call is idempotent + // but acquires a write-lock on RuntimeEnv's object-store map; for the + // distributed path register_for_worker pre-registers stores so this is a + // no-op. A future improvement: skip if already registered. 
+        state
+            .runtime_env()
+            .register_object_store(self.object_store_url.as_ref(), Arc::clone(&self.object_store));
+
+        // Build file groups — one PartitionedFile per split
+        let file_groups: Vec<PartitionedFile> = splits
+            .iter()
+            .map(|split| PartitionedFile::new(split.parquet_filename(), split.size_bytes))
+            .collect();
+
+        // Configure ParquetSource with bloom filters + pushdown enabled
+        let table_schema: datafusion_datasource::TableSchema = self.schema.clone().into();
+        let parquet_source = ParquetSource::new(table_schema)
+            .with_bloom_filter_on_read(true)
+            .with_pushdown_filters(true)
+            .with_reorder_filters(true)
+            .with_enable_page_index(true);
+
+        // Build the FileScanConfig
+        let mut builder = FileScanConfigBuilder::new(
+            self.object_store_url.clone(),
+            Arc::new(parquet_source),
+        );
+
+        // Add each split as its own file group (one file per partition)
+        for file in file_groups {
+            builder = builder.with_file(file);
+        }
+
+        if let Some(proj) = projection {
+            builder = builder.with_projection_indices(Some(proj.clone()))?;
+        }
+
+        if let Some(lim) = limit {
+            builder = builder.with_limit(Some(lim));
+        }
+
+        let file_scan_config = builder.build();
+        Ok(DataSourceExec::from_data_source(file_scan_config))
+    }
+}
+
+fn classify_filter(expr: &Expr) -> TableProviderFilterPushDown {
+    match expr {
+        Expr::BinaryExpr(binary) => {
+            if let Some(col_name) = column_name_from_expr(&binary.left)
+                .or_else(|| column_name_from_expr(&binary.right))
+            {
+                // OSS uses bare column names (no tag_ prefix)
+                match col_name.as_str() {
+                    "metric_name" | "timestamp_secs" | "service" | "env"
+                    | "datacenter" | "region" | "host" => {
+                        TableProviderFilterPushDown::Inexact
+                    }
+                    _ => TableProviderFilterPushDown::Unsupported,
+                }
+            } else {
+                TableProviderFilterPushDown::Unsupported
+            }
+        }
+        Expr::InList(in_list) => {
+            if let Some(col_name) = column_name_from_expr(&in_list.expr) {
+                match col_name.as_str() {
+                    "metric_name" | "service" | "env" | "datacenter"
+                    | "region" | "host" => TableProviderFilterPushDown::Inexact,
+                    _ => TableProviderFilterPushDown::Unsupported,
+                }
+            } else {
+                TableProviderFilterPushDown::Unsupported
+            }
+        }
+        _ => TableProviderFilterPushDown::Unsupported,
+    }
+}
+
+fn column_name_from_expr(expr: &Expr) -> Option<String> {
+    match expr {
+        Expr::Column(col) => Some(col.name().to_string()),
+        _ => None,
+    }
+}
diff --git a/quickwit/quickwit-datafusion/src/sources/metrics/test_utils.rs b/quickwit/quickwit-datafusion/src/sources/metrics/test_utils.rs
new file mode 100644
index 00000000000..d6fc0959c45
--- /dev/null
+++ b/quickwit/quickwit-datafusion/src/sources/metrics/test_utils.rs
@@ -0,0 +1,387 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Composable test utilities for quickwit-datafusion.
+//!
+//! Builds batches with the OSS dynamic schema (no fixed 14-column schema):
+//! `metric_name`, `metric_type`, `timestamp_secs`, `value`, `service` (optional).
+//!
+//! Column names use the OSS convention — bare names without `tag_` prefix.
+ +use std::collections::HashSet; +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayRef, DictionaryArray, Float64Array, Int32Array, RecordBatch, StringArray, + UInt64Array, UInt8Array, +}; +use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema, SchemaRef}; +use async_trait::async_trait; +use datafusion::error::Result as DFResult; +use datafusion::execution::object_store::ObjectStoreUrl; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::SessionContext; +use object_store::memory::InMemory; +use object_store::path::Path as ObjectPath; +use object_store::{ObjectStore, PutPayload}; +use quickwit_parquet_engine::schema::ParquetSchema; +use quickwit_parquet_engine::split::{MetricsSplitMetadata, SplitId, TimeRange}; +use quickwit_parquet_engine::storage::{ParquetWriter, ParquetWriterConfig}; + +use super::index_resolver::SimpleIndexResolver; +use super::predicate::MetricsSplitQuery; +use super::table_provider::{MetricsSplitProvider, MetricsTableProvider}; + +// ── Schema helpers ────────────────────────────────────────────────── + +fn dict_type() -> DataType { + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) +} + +/// Build the OSS dynamic schema for a batch with a `service` column. +/// +/// Schema: metric_name (dict), metric_type (u8), timestamp_secs (u64), +/// value (f64), service (dict, nullable). +pub fn oss_schema_with_service() -> SchemaRef { + Arc::new(ArrowSchema::new(vec![ + Field::new("metric_name", dict_type(), false), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + Field::new("service", dict_type(), true), + ])) +} + +/// Build the OSS minimal base schema (4 required fields only). 
+pub fn oss_base_schema() -> SchemaRef {
+    Arc::new(ArrowSchema::new(vec![
+        Field::new("metric_name", dict_type(), false),
+        Field::new("metric_type", DataType::UInt8, false),
+        Field::new("timestamp_secs", DataType::UInt64, false),
+        Field::new("value", DataType::Float64, false),
+    ]))
+}
+
+// ── Batch builders ──────────────────────────────────────────────────
+
+/// Build a RecordBatch with the OSS dynamic schema (4 required + service).
+///
+/// Column names use bare names (no `tag_` prefix): `service`, not `tag_service`.
+pub fn make_batch(
+    metric_name: &str,
+    timestamps: &[u64],
+    values: &[f64],
+    service: Option<&str>,
+) -> RecordBatch {
+    let n = timestamps.len();
+    assert_eq!(n, values.len());
+
+    let cols: Vec<ArrayRef> = vec![
+        make_dict(n, metric_name),
+        Arc::new(UInt8Array::from(vec![0u8; n])),
+        Arc::new(UInt64Array::from(timestamps.to_vec())),
+        Arc::new(Float64Array::from(values.to_vec())),
+        make_nullable_dict(n, service),
+    ];
+
+    RecordBatch::try_new(oss_schema_with_service(), cols).unwrap()
+}
+
+/// Build a RecordBatch with multiple OSS-style tag columns.
+///
+/// Mirrors the production `build_record_batch` behavior: a tag column is only
+/// included in the schema when its value is `Some(_)`. Passing `None` for a
+/// tag omits the column entirely — `None` does NOT produce an all-null column.
+///
+/// This matches what `metrics_ingest_api::build_record_batch` produces, ensuring
+/// tests exercise the same dynamic schema that real ingestion emits.
+pub fn make_batch_with_tags(
+    metric_name: &str,
+    timestamps: &[u64],
+    values: &[f64],
+    service: Option<&str>,
+    env: Option<&str>,
+    datacenter: Option<&str>,
+    region: Option<&str>,
+    host: Option<&str>,
+) -> RecordBatch {
+    let n = timestamps.len();
+    assert_eq!(n, values.len());
+
+    let mut fields = vec![
+        Field::new("metric_name", dict_type(), false),
+        Field::new("metric_type", DataType::UInt8, false),
+        Field::new("timestamp_secs", DataType::UInt64, false),
+        Field::new("value", DataType::Float64, false),
+    ];
+    let mut cols: Vec<ArrayRef> = vec![
+        make_dict(n, metric_name),
+        Arc::new(UInt8Array::from(vec![0u8; n])),
+        Arc::new(UInt64Array::from(timestamps.to_vec())),
+        Arc::new(Float64Array::from(values.to_vec())),
+    ];
+
+    // Only emit a column when the value is Some — matching production behavior.
+    for (name, val) in [
+        ("service", service),
+        ("env", env),
+        ("datacenter", datacenter),
+        ("region", region),
+        ("host", host),
+    ] {
+        if let Some(v) = val {
+            fields.push(Field::new(name, dict_type(), true));
+            cols.push(make_nullable_dict(n, Some(v)));
+        }
+    }
+
+    let schema = Arc::new(ArrowSchema::new(fields));
+    RecordBatch::try_new(schema, cols).unwrap()
+}
+
+fn make_dict(n: usize, value: &str) -> ArrayRef {
+    let keys = Int32Array::from(vec![0i32; n]);
+    let vals = StringArray::from(vec![value]);
+    Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(vals)).unwrap())
+}
+
+fn make_nullable_dict(n: usize, value: Option<&str>) -> ArrayRef {
+    match value {
+        Some(v) => {
+            let keys = Int32Array::from(vec![Some(0i32); n]);
+            let vals = StringArray::from(vec![v]);
+            Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(vals)).unwrap())
+        }
+        None => {
+            let keys = Int32Array::from(vec![None::<i32>; n]);
+            let vals = StringArray::from(vec![None::<&str>]);
+            Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(vals)).unwrap())
+        }
+    }
+}
+
+// ── Split provider ──────────────────────────────────────────────────
+
+/// In-memory split provider that applies real pruning logic.
+///
+/// Uses OSS tag key names (bare, no `tag_` prefix) for `get_tag_values`.
+#[derive(Debug, Clone)]
+pub struct TestSplitProvider {
+    pub splits: Vec<MetricsSplitMetadata>,
+}
+
+impl TestSplitProvider {
+    pub fn new(splits: Vec<MetricsSplitMetadata>) -> Self {
+        Self { splits }
+    }
+
+    pub fn count_matching(&self, query: &MetricsSplitQuery) -> usize {
+        futures::executor::block_on(self.list_splits(query))
+            .unwrap()
+            .len()
+    }
+}
+
+#[async_trait]
+impl MetricsSplitProvider for TestSplitProvider {
+    async fn list_splits(&self, query: &MetricsSplitQuery) -> DFResult<Vec<MetricsSplitMetadata>> {
+        let mut result = self.splits.clone();
+
+        if let Some(ref names) = query.metric_names {
+            result.retain(|s| names.iter().any(|n| s.metric_names.contains(n)));
+        }
+        if let Some(start) = query.time_range_start {
+            result.retain(|s| s.time_range.end_secs > start);
+        }
+        if let Some(end) = query.time_range_end {
+            result.retain(|s| s.time_range.start_secs < end);
+        }
+        macro_rules! filter_tag {
+            ($field:ident, $key:expr) => {
+                if let Some(ref vals) = query.$field {
+                    result.retain(|s| {
+                        s.get_tag_values($key)
+                            .map(|v| vals.iter().any(|x| v.contains(x)))
+                            .unwrap_or(true)
+                    });
+                }
+            };
+        }
+        // OSS tag key names (no tag_ prefix)
+        filter_tag!(tag_service, "service");
+        filter_tag!(tag_env, "env");
+        filter_tag!(tag_datacenter, "datacenter");
+        filter_tag!(tag_region, "region");
+        filter_tag!(tag_host, "host");
+
+        Ok(result)
+    }
+}
+
+// ── Testbed ─────────────────────────────────────────────────────────
+
+/// Composable testbed for metrics DataFusion tests.
+///
+/// Writes real parquet files via `ParquetWriter` to an in-memory object store.
+pub struct MetricsTestbed {
+    pub object_store: Arc<InMemory>,
+    pub splits: Vec<MetricsSplitMetadata>,
+    split_counter: usize,
+}
+
+impl MetricsTestbed {
+    pub fn new() -> Self {
+        Self {
+            object_store: Arc::new(InMemory::new()),
+            splits: Vec::new(),
+            split_counter: 0,
+        }
+    }
+
+    pub async fn add_split(&mut self, batch: &RecordBatch) -> MetricsSplitMetadata {
+        self.split_counter += 1;
+        let split_id = format!("split_{}", self.split_counter);
+        let metadata = write_split(&self.object_store, batch, &split_id).await;
+        self.splits.push(metadata.clone());
+        metadata
+    }
+
+    pub async fn add(
+        &mut self,
+        metric_name: &str,
+        timestamps: &[u64],
+        values: &[f64],
+        service: Option<&str>,
+    ) -> MetricsSplitMetadata {
+        let batch = make_batch(metric_name, timestamps, values, service);
+        self.add_split(&batch).await
+    }
+
+    pub fn split_provider(&self) -> Arc<dyn MetricsSplitProvider> {
+        Arc::new(TestSplitProvider::new(self.splits.clone()))
+    }
+
+    pub fn table_provider(&self) -> MetricsTableProvider {
+        MetricsTableProvider::new(
+            oss_schema_with_service(),
+            self.split_provider(),
+            self.object_store.clone(),
+            ObjectStoreUrl::parse("memory://").unwrap(),
+        )
+    }
+
+    /// Build a `SessionContext` with the metrics catalog registered.
+    pub fn session(&self) -> SessionContext {
+        let resolver = Arc::new(SimpleIndexResolver::new(
+            self.split_provider(),
+            self.object_store.clone(),
+            ObjectStoreUrl::parse("memory://").unwrap(),
+        ));
+        let source = crate::sources::metrics::MetricsDataSource::with_resolver(resolver);
+        let builder = crate::session::DataFusionSessionBuilder::new().with_source(Arc::new(source) as Arc<dyn crate::data_source::QuickwitDataSource>);
+        builder.build_session().unwrap()
+    }
+}
+
+// ── Plan helpers ────────────────────────────────────────────────────
+
+pub async fn physical_plan_str(ctx: &SessionContext, sql: &str) -> String {
+    let df = ctx.sql(sql).await.unwrap();
+    let plan = df.create_physical_plan().await.unwrap();
+    format!(
+        "{}",
+        datafusion::physical_plan::displayable(plan.as_ref()).indent(true)
+    )
+}
+
+pub async fn physical_plan(ctx: &SessionContext, sql: &str) -> Arc<dyn ExecutionPlan> {
+    let df = ctx.sql(sql).await.unwrap();
+    df.create_physical_plan().await.unwrap()
+}
+
+pub async fn execute(ctx: &SessionContext, sql: &str) -> Vec<RecordBatch> {
+    ctx.sql(sql).await.unwrap().collect().await.unwrap()
+}
+
+pub fn total_rows(batches: &[RecordBatch]) -> usize {
+    batches.iter().map(|b| b.num_rows()).sum()
+}
+
+// ── Internal ────────────────────────────────────────────────────────
+
+async fn write_split(
+    store: &InMemory,
+    batch: &RecordBatch,
+    split_id: &str,
+) -> MetricsSplitMetadata {
+    // Use schema from the batch itself (dynamic schema)
+    let schema = ParquetSchema::from_arrow_schema(batch.schema());
+    let config = ParquetWriterConfig::default();
+    let writer = ParquetWriter::new(schema, config);
+
+    let parquet_bytes = writer.write_to_bytes(batch).unwrap();
+    let size_bytes = parquet_bytes.len() as u64;
+
+    store
+        .put(
+            &ObjectPath::from(format!("{split_id}.parquet").as_str()),
+            PutPayload::from(bytes::Bytes::from(parquet_bytes)),
+        )
+        .await
+        .unwrap();
+
+    // Extract timestamps by column name (no ParquetField enum in OSS)
+    let schema = batch.schema();
+    let ts_idx = schema.index_of("timestamp_secs").unwrap();
+    let timestamps: Vec<u64> = batch
+        .column(ts_idx)
+        .as_any()
+        .downcast_ref::<UInt64Array>()
+        .unwrap()
+        .iter()
+        .flatten()
+        .collect();
+    let min_ts = *timestamps.iter().min().unwrap_or(&0);
+    let max_ts = *timestamps.iter().max().unwrap_or(&0);
+
+    // Extract metric names by column name
+    let mn_idx = schema.index_of("metric_name").unwrap();
+    let metric_col = batch.column(mn_idx);
+    let dict = metric_col
+        .as_any()
+        .downcast_ref::<DictionaryArray<Int32Type>>()
+        .unwrap();
+    let values = dict
+        .values()
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .unwrap();
+    let mut metric_names = HashSet::new();
+    for i in 0..values.len() {
+        if !values.is_null(i) {
+            metric_names.insert(values.value(i).to_string());
+        }
+    }
+
+    let mut builder = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(split_id))
+        .index_uid("test-index:00000000000000000000000000")
+        .time_range(TimeRange::new(min_ts, max_ts + 1))
+        .num_rows(batch.num_rows() as u64)
+        .size_bytes(size_bytes);
+    for name in &metric_names {
+        builder = builder.add_metric_name(name.clone());
+    }
+    builder.build()
}
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use futures::stream::BoxStream;
+use object_store::path::Path as ObjectPath;
+use object_store::{
+    GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
+    PutMultipartOptions, PutOptions, PutPayload, PutResult, Result as ObjectStoreResult,
+};
+use quickwit_storage::Storage;
+
+/// Adapts Quickwit's `Storage` trait to DataFusion's `ObjectStore` interface.
+///
+/// Only read operations are implemented since DataFusion only needs to read
+/// parquet files.
+#[derive(Debug)]
+pub struct QuickwitObjectStore {
+    storage: Arc<dyn Storage>,
+}
+
+impl QuickwitObjectStore {
+    pub fn new(storage: Arc<dyn Storage>) -> Self {
+        Self { storage }
+    }
+}
+
+impl std::fmt::Display for QuickwitObjectStore {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "QuickwitObjectStore({})", self.storage.uri())
+    }
+}
+
+fn to_object_store_error(err: quickwit_storage::StorageError) -> object_store::Error {
+    use quickwit_storage::StorageErrorKind;
+    match err.kind() {
+        StorageErrorKind::NotFound => object_store::Error::NotFound {
+            path: String::new(),
+            source: Box::new(err),
+        },
+        _ => object_store::Error::Generic {
+            store: "QuickwitObjectStore",
+            source: Box::new(err),
+        },
+    }
+}
+
+fn object_path_to_std(location: &ObjectPath) -> std::path::PathBuf {
+    std::path::PathBuf::from(location.as_ref())
+}
+
+#[async_trait]
+impl ObjectStore for QuickwitObjectStore {
+    async fn get_opts(
+        &self,
+        location: &ObjectPath,
+        _options: GetOptions,
+    ) -> ObjectStoreResult<GetResult> {
+        let path = object_path_to_std(location);
+        let data = self
+            .storage
+            .get_all(&path)
+            .await
+            .map_err(to_object_store_error)?;
+        let bytes = Bytes::from(data.as_ref().to_vec());
+        let size = bytes.len() as u64;
+        let meta = ObjectMeta {
+            location: location.clone(),
+            last_modified: chrono::Utc::now(),
+            size,
+            e_tag: None,
+            version: None,
+        };
+        Ok(GetResult {
+            payload: GetResultPayload::Stream(Box::pin(futures::stream::once(async {
+                Ok(bytes)
+            }))),
+            meta,
+            range: 0..size,
+            attributes: Default::default(),
+        })
+    }
+
+    async fn get_range(
+        &self,
+        location: &ObjectPath,
+        range: std::ops::Range<u64>,
+    ) -> ObjectStoreResult<Bytes> {
+        let path = object_path_to_std(location);
+        let usize_range = range.start as usize..range.end as usize;
+        let data = self
+            .storage
+            .get_slice(&path, usize_range)
+            .await
+            .map_err(to_object_store_error)?;
+        Ok(Bytes::from(data.as_ref().to_vec()))
+    }
+
+    async fn head(&self, location: &ObjectPath) -> ObjectStoreResult<ObjectMeta> {
+        let path = object_path_to_std(location);
+        let size = self
+            .storage
+            .file_num_bytes(&path)
+            .await
+            .map_err(to_object_store_error)?;
+        Ok(ObjectMeta {
+            location: location.clone(),
+            last_modified: chrono::Utc::now(),
+            size,
+            e_tag: None,
+            version: None,
+        })
+    }
+
+    async fn put_opts(
+        &self,
+        _location: &ObjectPath,
+        _payload: PutPayload,
+        _opts: PutOptions,
+    ) -> ObjectStoreResult<PutResult> {
+        Err(object_store::Error::NotSupported {
+            source: "QuickwitObjectStore is read-only".into(),
+        })
+    }
+
+    async fn put_multipart_opts(
+        &self,
+        _location: &ObjectPath,
+        _opts: PutMultipartOptions,
+    ) -> ObjectStoreResult<Box<dyn MultipartUpload>> {
+        Err(object_store::Error::NotSupported {
+            source: "QuickwitObjectStore is read-only".into(),
+        })
+    }
+
+    async fn delete(&self, _location: &ObjectPath) -> ObjectStoreResult<()> {
+        Err(object_store::Error::NotSupported {
+            source: "QuickwitObjectStore is read-only".into(),
+        })
+    }
+
+    fn list(
+        &self,
+        _prefix: Option<&ObjectPath>,
+    ) -> BoxStream<'static, ObjectStoreResult<ObjectMeta>> {
+        Box::pin(futures::stream::once(async {
+            Err(object_store::Error::NotSupported {
+                source: "QuickwitObjectStore does not support listing".into(),
+            })
+        }))
+    }
+
+    async fn list_with_delimiter(
+        &self,
+        _prefix: Option<&ObjectPath>,
+    ) -> ObjectStoreResult<ListResult> {
+        Err(object_store::Error::NotSupported {
+            source: "QuickwitObjectStore does not support listing".into(),
+        })
+    }
+
+ async fn copy(&self, _from: &ObjectPath, _to: &ObjectPath) -> ObjectStoreResult<()> { + Err(object_store::Error::NotSupported { + source: "QuickwitObjectStore is read-only".into(), + }) + } + + async fn copy_if_not_exists( + &self, + _from: &ObjectPath, + _to: &ObjectPath, + ) -> ObjectStoreResult<()> { + Err(object_store::Error::NotSupported { + source: "QuickwitObjectStore is read-only".into(), + }) + } +} diff --git a/quickwit/quickwit-datafusion/src/substrait.rs b/quickwit/quickwit-datafusion/src/substrait.rs new file mode 100644 index 00000000000..8b398f23930 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/substrait.rs @@ -0,0 +1,278 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Substrait plan consumption for Quickwit data sources. +//! +//! ## How this fits together +//! +//! [`QuickwitSubstraitConsumer`] implements the `SubstraitConsumer` trait from +//! `datafusion-substrait`. It intercepts `ReadRel` nodes in an incoming +//! Substrait plan, routes them to whichever registered [`QuickwitDataSource`] +//! claims them, and falls back to the standard catalog lookup for everything +//! else. +//! +//! ## OSS path — standard Substrait (no custom protos) +//! +//! A producer targeting Quickwit OSS sends a completely vanilla Substrait plan: +//! +//! ```text +//! ReadRel { +//! base_schema: , +//! read_type: NamedTable { names: [""] }, +//! } +//! ``` +//! +//! 
[`MetricsDataSource`][crate::sources::metrics::MetricsDataSource] handles +//! this by resolving the index from the metastore and creating a +//! `MetricsTableProvider` with the schema declared in `base_schema`. No +//! custom protobuf type or type URL is involved. +//! +//! ## Extension path — custom protos (downstream callers) +//! +//! A downstream caller registers its own `QuickwitDataSource` implementation that decodes +//! DD-internal protos (e.g. `ExtensionTable`). The OSS code +//! simply calls the hook; the proto decoding stays in the downstream caller. +//! +//! ## Entry point +//! +//! [`DataFusionSessionBuilder::execute_substrait`][crate::session::DataFusionSessionBuilder::execute_substrait] +//! builds a `QuickwitSubstraitConsumer` from the session state and sources, +//! converts the plan via `from_substrait_plan_with_consumer`, then executes it. + +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use async_trait::async_trait; +use datafusion::catalog::TableProvider; +use datafusion::common::TableReference; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::{FunctionRegistry, SendableRecordBatchStream, SessionState}; +use datafusion::logical_expr::LogicalPlan; +use datafusion_substrait::extensions::Extensions; +use datafusion_substrait::logical_plan::consumer::{ + SubstraitConsumer, from_read_rel, from_substrait_named_struct, + from_substrait_plan_with_consumer, +}; +use datafusion_substrait::substrait::proto::{ + Plan, ReadRel, + read_rel::{ReadType, NamedTable as SubstraitNamedTable}, +}; + +use crate::data_source::QuickwitDataSource; + +/// `SubstraitConsumer` that routes `ReadRel` nodes to registered +/// [`QuickwitDataSource`]s before falling back to the standard catalog path. +/// +/// Constructed by [`DataFusionSessionBuilder::execute_substrait`]. 
+pub struct QuickwitSubstraitConsumer<'a> {
+    extensions: &'a Extensions,
+    state: &'a SessionState,
+    sources: &'a [Arc<dyn QuickwitDataSource>],
+}
+
+impl<'a> QuickwitSubstraitConsumer<'a> {
+    pub fn new(
+        extensions: &'a Extensions,
+        state: &'a SessionState,
+        sources: &'a [Arc<dyn QuickwitDataSource>],
+    ) -> Self {
+        Self { extensions, state, sources }
+    }
+}
+
+#[async_trait]
+impl SubstraitConsumer for QuickwitSubstraitConsumer<'_> {
+    // ── Required boilerplate ─────────────────────────────────────────
+
+    /// Resolve a table reference via the quickwit catalog
+    /// (`quickwit.public.<table>`).
+    async fn resolve_table_ref(
+        &self,
+        table_ref: &TableReference,
+    ) -> DFResult<Option<Arc<dyn TableProvider>>> {
+        let schema = self.state.schema_for_ref(table_ref.clone())?;
+        schema.table(table_ref.table()).await
+    }
+
+    fn get_extensions(&self) -> &Extensions {
+        self.extensions
+    }
+
+    fn get_function_registry(&self) -> &impl FunctionRegistry {
+        self.state
+    }
+
+    // ── Custom ReadRel handling ───────────────────────────────────────
+
+    /// Intercept `ReadRel` nodes and offer them to each registered source.
+    ///
+    /// 1. Convert `ReadRel.base_schema` → Arrow `SchemaRef` (the schema hint
+    ///    the producer declared; sources use this for schema injection rather
+    ///    than the minimal default).
+    /// 2. Call each source's `try_consume_read_rel`. The first source that
+    ///    returns `Some((table_name, provider))` wins.
+    /// 3. If a source claims the rel, build a temporary resolver that returns
+    ///    the provider when `from_read_rel` performs its catalog lookup.
+    ///    If the original rel used `ExtensionTable`, rewrite it to `NamedTable`
+    ///    so `from_read_rel` can apply the standard filter/projection handling.
+    /// 4. If no source claims the rel, fall through to the default path which
+    ///    uses `resolve_table_ref` → quickwit catalog → `QuickwitSchemaProvider`.
+    async fn consume_read(&self, rel: &ReadRel) -> DFResult<LogicalPlan> {
+        // Convert base_schema to Arrow once so every source can use it without
+        // re-parsing the Substrait types.
+        let schema_hint: Option<SchemaRef> = if let Some(ns) = &rel.base_schema {
+            Some(Arc::clone(from_substrait_named_struct(self, ns)?.inner()))
+        } else {
+            None
+        };
+
+        for source in self.sources {
+            if let Some((table_name, provider)) =
+                source.try_consume_read_rel(rel, schema_hint.clone()).await?
+            {
+                // Build a short-lived resolver that returns our provider for
+                // this table name. Everything else (filters, projections,
+                // schema coercion) is handled by `from_read_rel`.
+                let resolver = WithCustomProvider {
+                    extensions: self.extensions,
+                    state: self.state,
+                    table_name: table_name.clone(),
+                    provider: Arc::clone(&provider),
+                };
+
+                // If the rel uses ExtensionTable (custom proto), rewrite it to
+                // NamedTable so `from_read_rel` resolves it via `resolve_table_ref`.
+                let effective_rel = if matches!(rel.read_type, Some(ReadType::ExtensionTable(_))) {
+                    let mut r = rel.clone();
+                    r.read_type = Some(ReadType::NamedTable(SubstraitNamedTable {
+                        names: vec![table_name],
+                        ..Default::default()
+                    }));
+                    r
+                } else {
+                    rel.clone()
+                };
+
+                return from_read_rel(&resolver, &effective_rel).await;
+            }
+        }
+
+        // No source claimed this rel — use the standard path (catalog lookup).
+        from_read_rel(self, rel).await
+    }
+}
+
+/// Short-lived `SubstraitConsumer` that overrides `resolve_table_ref` to
+/// return a specific pre-built `TableProvider` for one table name, then
+/// delegates everything else to the outer consumer's session/extensions.
+///
+/// Used by `QuickwitSubstraitConsumer::consume_read` so that `from_read_rel`
+/// applies standard filter/projection handling against our custom provider.
+struct WithCustomProvider<'a> {
+    extensions: &'a Extensions,
+    state: &'a SessionState,
+    table_name: String,
+    provider: Arc<dyn TableProvider>,
+}
+
+#[async_trait]
+impl SubstraitConsumer for WithCustomProvider<'_> {
+    async fn resolve_table_ref(
+        &self,
+        table_ref: &TableReference,
+    ) -> DFResult<Option<Arc<dyn TableProvider>>> {
+        if table_ref.table() == self.table_name.as_str() {
+            return Ok(Some(Arc::clone(&self.provider)));
+        }
+        // Fall back to catalog for anything else
+        let schema = self.state.schema_for_ref(table_ref.clone())?;
+        schema.table(table_ref.table()).await
+    }
+
+    fn get_extensions(&self) -> &Extensions {
+        self.extensions
+    }
+
+    fn get_function_registry(&self) -> &impl FunctionRegistry {
+        self.state
+    }
+}
+
+/// Convert a Substrait plan to batches using the registered data sources.
+///
+/// This is the entry point for external coordinators that send Substrait plans
+/// to Quickwit. It is called by
+/// [`DataFusionSessionBuilder::execute_substrait`].
+/// All result batches are collected eagerly into memory.
+///
+/// Takes the full `SessionContext` (not just state) so that catalog
+/// registrations made by `build_session()` — including the `quickwit.public`
+/// schema provider — are visible during both plan conversion and execution.
+/// Creating a fresh `SessionContext::new_with_state(state.clone())` loses
+/// those registrations because `register_catalog` lives on the context, not
+/// the state snapshot.
+pub async fn execute_substrait_plan(
+    plan: &Plan,
+    ctx: &datafusion::prelude::SessionContext,
+    sources: &[Arc<dyn QuickwitDataSource>],
+) -> DFResult<Vec<arrow::array::RecordBatch>> {
+    let state = ctx.state();
+    let extensions = Extensions::try_from(&plan.extensions)
+        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+    let consumer = QuickwitSubstraitConsumer::new(&extensions, &state, sources);
+    let logical_plan = from_substrait_plan_with_consumer(&consumer, plan).await?;
+
+    tracing::debug!(
+        plan = %logical_plan.display_indent(),
+        "substrait plan converted to DataFusion logical plan"
+    );
+
+    let df = ctx.execute_logical_plan(logical_plan).await?;
+    let batches = df.collect().await?;
+    tracing::debug!(num_batches = batches.len(), "substrait plan executed");
+    Ok(batches)
+}
+
+/// Convert a Substrait plan to a streaming `RecordBatch` iterator.
+///
+/// Unlike [`execute_substrait_plan`], this function does **not** collect all
+/// results into memory — it returns a [`SendableRecordBatchStream`] that the
+/// caller can poll lazily. This is the preferred path for gRPC streaming
+/// responses and Arrow Flight handlers.
+///
+/// Takes the full `SessionContext` for the same reasons as
+/// `execute_substrait_plan` — catalog registrations live on the context, not
+/// the state snapshot.
+pub async fn execute_substrait_plan_streaming(
+    plan: &Plan,
+    ctx: &datafusion::prelude::SessionContext,
+    sources: &[Arc<dyn QuickwitDataSource>],
+) -> DFResult<SendableRecordBatchStream> {
+    let state = ctx.state();
+    let extensions = Extensions::try_from(&plan.extensions)
+        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+    let consumer = QuickwitSubstraitConsumer::new(&extensions, &state, sources);
+    let logical_plan = from_substrait_plan_with_consumer(&consumer, plan).await?;
+
+    tracing::debug!(
+        plan = %logical_plan.display_indent(),
+        "substrait plan converted to DataFusion logical plan for streaming execution"
+    );
+
+    let df = ctx.execute_logical_plan(logical_plan).await?;
+    let stream = df.execute_stream().await?;
+    Ok(stream)
+}
diff --git a/quickwit/quickwit-datafusion/src/task_estimator.rs b/quickwit/quickwit-datafusion/src/task_estimator.rs
new file mode 100644
index 00000000000..ddbcfe3f768
--- /dev/null
+++ b/quickwit/quickwit-datafusion/src/task_estimator.rs
@@ -0,0 +1,64 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Generic task estimator for distributed execution of parquet-backed queries.
+//!
+//! Uses the number of file groups in a `DataSourceExec` (one per split) to
+//! determine how many distributed tasks to create. No data-source-specific code.
+ +use std::sync::Arc; + +use datafusion::config::ConfigOptions; +use datafusion::datasource::source::DataSourceExec; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_datasource_parquet::source::ParquetSource; +use datafusion_distributed::{PartitionIsolatorExec, TaskEstimation, TaskEstimator}; + +/// Estimates the desired task count for distributed execution by counting +/// the number of parquet file groups (= number of splits) in the plan. +#[derive(Debug)] +pub struct QuickwitTaskEstimator; + +impl TaskEstimator for QuickwitTaskEstimator { + fn task_estimation( + &self, + plan: &Arc, + _cfg: &ConfigOptions, + ) -> Option { + let dse: &DataSourceExec = plan.as_any().downcast_ref()?; + let (file_config, _parquet_source) = dse.downcast_to_file_source::()?; + let num_file_groups = file_config.file_groups.len(); + if num_file_groups == 0 { + return Some(TaskEstimation::maximum(1)); + } + Some(TaskEstimation::desired(num_file_groups)) + } + + fn scale_up_leaf_node( + &self, + plan: &Arc, + task_count: usize, + _cfg: &ConfigOptions, + ) -> Option> { + let dse: &DataSourceExec = plan.as_any().downcast_ref()?; + let (_file_config, _parquet_source) = dse.downcast_to_file_source::()?; + if task_count <= 1 { + return Some(Arc::clone(plan)); + } + Some(Arc::new(PartitionIsolatorExec::new( + Arc::clone(plan), + task_count, + ))) + } +} diff --git a/quickwit/quickwit-datafusion/src/test_utils.rs b/quickwit/quickwit-datafusion/src/test_utils.rs new file mode 100644 index 00000000000..84c713c28a9 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/test_utils.rs @@ -0,0 +1,18 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Re-exports metrics test utilities from `sources::metrics::test_utils`. +//! +//! Integration tests use `quickwit_datafusion::test_utils::make_batch` etc. +pub use crate::sources::metrics::test_utils::*; diff --git a/quickwit/quickwit-datafusion/src/worker.rs b/quickwit/quickwit-datafusion/src/worker.rs new file mode 100644 index 00000000000..d0e8daf52b4 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/worker.rs @@ -0,0 +1,119 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Distributed DataFusion worker session setup. +//! +//! This module is named `worker` because the distributed protocol uses a +//! custom `WorkerService` gRPC (from datafusion-distributed PR #375), not +//! Arrow Flight. The name `flight` would be misleading. +//! +//! `QuickwitWorkerSessionBuilder` prepares each worker session: +//! 1. Applies source contributions (optimizer rules, extension planners, UDFs, +//! UDAFs, codecs) before `SessionStateBuilder::build()`. +//! 2. 
Injects the shared `RuntimeEnv` from the coordinator's +//! `DataFusionSessionBuilder` so that object stores registered at startup +//! are visible on workers without any per-session re-registration. +//! 3. Registers the `QuickwitSchemaProvider` so table references in +//! deserialized plan fragments resolve correctly. +//! 4. Calls `register_for_worker()` for any post-build runtime state. + +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::{CatalogProvider, MemoryCatalogProvider}; +use datafusion::error::DataFusionError; +use datafusion::execution::SessionState; +use datafusion::execution::runtime_env::RuntimeEnv; +use datafusion_distributed::{Worker, WorkerQueryContext, WorkerSessionBuilder}; +use tracing::debug; + +use crate::catalog::QuickwitSchemaProvider; +use crate::data_source::QuickwitDataSource; + +/// `WorkerSessionBuilder` that shares the coordinator's `RuntimeEnv` and +/// applies all source contributions on every new worker session. +#[derive(Clone)] +pub struct QuickwitWorkerSessionBuilder { + sources: Vec>, + /// Shared with the coordinator's `DataFusionSessionBuilder`. + /// Object stores registered at startup (via `init`) or lazily (via `scan`) + /// are immediately visible to workers without any re-registration. + runtime: Arc, +} + +impl QuickwitWorkerSessionBuilder { + pub fn new(sources: Vec>, runtime: Arc) -> Self { + Self { sources, runtime } + } +} + +#[async_trait] +impl WorkerSessionBuilder for QuickwitWorkerSessionBuilder { + async fn build_session_state( + &self, + ctx: WorkerQueryContext, + ) -> Result { + // Phase 1: contributions (rules, planners, UDFs, UDAFs, codecs) + shared env. 
+ let mut combined = crate::data_source::DataSourceContributions::default(); + for source in &self.sources { + combined.merge(source.contributions()); + } + let state = combined + .apply_to_builder(ctx.builder) + .with_runtime_env(Arc::clone(&self.runtime)) + .build(); + + // Phase 2: catalog for table-reference resolution in plan fragments. + // `register_schema` only fails if "public" is already registered, which + // cannot happen here since the catalog is freshly created above. + let schema_provider = Arc::new(QuickwitSchemaProvider::new(self.sources.clone())); + let catalog = Arc::new(MemoryCatalogProvider::new()); + catalog + .register_schema("public", schema_provider) + .map_err(|e| { + DataFusionError::Internal(format!( + "failed to register 'public' schema on worker: {e}" + )) + })?; + state + .catalog_list() + .register_catalog("quickwit".to_string(), catalog); + + // Phase 3: post-build runtime registration (rare — most stores are already + // in the shared RuntimeEnv from startup or lazy scan registration). + for source in &self.sources { + if let Err(err) = source.register_for_worker(&state).await { + debug!( + error = %err, + "data source register_for_worker failed (non-fatal)" + ); + } + } + + Ok(state) + } +} + +/// Build a `Worker` that shares the coordinator's `RuntimeEnv`. +/// +/// Pass `session_builder.runtime()` from the coordinator's +/// `DataFusionSessionBuilder` so that object stores registered at service +/// startup are available to workers without re-registration. 
+pub fn build_quickwit_worker( + sources: &[Arc], + runtime: Arc, +) -> Worker { + let session_builder = QuickwitWorkerSessionBuilder::new(sources.to_vec(), runtime); + Worker::from_session_builder(session_builder) +} diff --git a/quickwit/quickwit-integration-tests/Cargo.toml b/quickwit/quickwit-integration-tests/Cargo.toml index e7f1dab23db..69afe15a7ff 100644 --- a/quickwit/quickwit-integration-tests/Cargo.toml +++ b/quickwit/quickwit-integration-tests/Cargo.toml @@ -20,7 +20,12 @@ sqs-localstack-tests = [ [dev-dependencies] anyhow = { workspace = true } +arrow = { workspace = true } aws-sdk-sqs = { workspace = true } +bytesize = { workspace = true } +datafusion = "52" +datafusion-substrait = "52" +prost = { workspace = true } futures-util = { workspace = true } hyper = { workspace = true } hyper-util = { workspace = true } @@ -39,10 +44,13 @@ quickwit-actors = { workspace = true, features = ["testsuite"] } quickwit-cli = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } quickwit-config = { workspace = true, features = ["testsuite"] } +quickwit-datafusion = { workspace = true, features = ["testsuite"] } +quickwit-search = { workspace = true } quickwit-indexing = { workspace = true, features = ["testsuite"] } quickwit-ingest = { workspace = true, features = ["testsuite"] } quickwit-metastore = { workspace = true, features = ["testsuite"] } quickwit-opentelemetry = { workspace = true, features = ["testsuite"] } +quickwit-parquet-engine = { workspace = true } quickwit-proto = { workspace = true, features = ["testsuite"] } quickwit-rest-client = { workspace = true } quickwit-serve = { workspace = true, features = ["testsuite"] } diff --git a/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs b/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs index a7385ca0946..9cc3d3e4877 100644 --- a/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs +++ 
b/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs @@ -342,6 +342,11 @@ impl ClusterSandbox { .connect_lazy() } + /// Returns the storage resolver used by this sandbox. + pub fn storage_resolver(&self) -> &quickwit_storage::StorageResolver { + &self.storage_resolver + } + /// Returns a client to one of the nodes that runs the specified service pub fn rest_client(&self, service: QuickwitService) -> QuickwitClient { let node_config = self.find_node_for_service(service); diff --git a/quickwit/quickwit-integration-tests/src/tests/metrics_datafusion_tests.rs b/quickwit/quickwit-integration-tests/src/tests/metrics_datafusion_tests.rs new file mode 100644 index 00000000000..2cdcefcfc5f --- /dev/null +++ b/quickwit/quickwit-integration-tests/src/tests/metrics_datafusion_tests.rs @@ -0,0 +1,968 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Integration tests for metrics DataFusion queries — executed in-process. +//! +//! No REST/gRPC transport. Tests build a `DataFusionSessionBuilder` directly +//! with a real metastore and real file-backed storage, then call +//! `session.sql(...)` as any application would. 
+ +use std::collections::HashSet; +use std::sync::Arc; + +use arrow::array::{Array, Float64Array, RecordBatch}; +use quickwit_config::service::QuickwitService; +use quickwit_datafusion::DataFusionSessionBuilder; +use quickwit_datafusion::sources::metrics::MetricsDataSource; +use quickwit_datafusion::test_utils::make_batch; +use quickwit_metastore::{CreateIndexRequestExt, StageMetricsSplitsRequestExt}; +use quickwit_parquet_engine::schema::ParquetSchema; +use quickwit_parquet_engine::split::{MetricsSplitMetadata, SplitId, TimeRange}; +use quickwit_parquet_engine::storage::{ParquetWriter, ParquetWriterConfig}; +use quickwit_proto::metastore::{ + CreateIndexRequest, MetastoreService, MetastoreServiceClient, PublishMetricsSplitsRequest, + StageMetricsSplitsRequest, +}; +use quickwit_proto::types::IndexUid; + +use crate::test_utils::{ClusterSandbox, ClusterSandboxBuilder}; + +// ── Setup ────────────────────────────────────────────────────────── + +async fn start_sandbox() -> (ClusterSandbox, tempfile::TempDir) { + unsafe { std::env::set_var("QW_DISABLE_TELEMETRY", "1"); std::env::set_var("QW_ENABLE_DATAFUSION_ENDPOINT", "true"); } + quickwit_common::setup_logging_for_tests(); + let sandbox = ClusterSandboxBuilder::build_and_start_standalone().await; + let data_dir = tempfile::tempdir().unwrap(); + (sandbox, data_dir) +} + +fn metastore_client(sandbox: &ClusterSandbox) -> MetastoreServiceClient { + let (config, _) = sandbox + .node_configs + .iter() + .find(|(_, svc)| svc.contains(&QuickwitService::Metastore)) + .unwrap(); + let addr = config.grpc_listen_addr; + let channel = tonic::transport::Channel::from_shared(format!("http://{addr}")) + .unwrap() + .connect_lazy(); + MetastoreServiceClient::from_channel(addr, channel, bytesize::ByteSize::mib(20), None) +} + +/// Build a `DataFusionSessionBuilder` wired to the sandbox's real metastore + storage. 
+fn session_builder( + sandbox: &ClusterSandbox, + metastore: MetastoreServiceClient, +) -> DataFusionSessionBuilder { + let source = Arc::new(MetricsDataSource::new( + metastore, + sandbox.storage_resolver().clone(), + )); + DataFusionSessionBuilder::new().with_source(source) +} + +// ── Data helpers ─────────────────────────────────────────────────── + +async fn create_metrics_index( + metastore: &MetastoreServiceClient, + index_id: &str, + data_dir: &std::path::Path, +) -> IndexUid { + let index_uri = format!("file://{}", data_dir.display()); + let index_config: quickwit_config::IndexConfig = + serde_json::from_value(serde_json::json!({ + "version": "0.8", "index_id": index_id, "index_uri": index_uri, + "doc_mapping": { "field_mappings": [] }, + "indexing_settings": {}, "search_settings": {} + })) + .unwrap(); + let resp = metastore + .clone() + .create_index(CreateIndexRequest::try_from_index_config(&index_config).unwrap()) + .await + .unwrap(); + resp.index_uid().clone() +} + +async fn publish_split( + metastore: &MetastoreServiceClient, + index_uid: &IndexUid, + data_dir: &std::path::Path, + split_name: &str, + batch: &RecordBatch, +) { + let schema = ParquetSchema::from_arrow_schema(batch.schema()); + let parquet_bytes = ParquetWriter::new(schema, ParquetWriterConfig::default()) + .write_to_bytes(batch) + .unwrap(); + let size_bytes = parquet_bytes.len() as u64; + std::fs::write(data_dir.join(format!("{split_name}.parquet")), &parquet_bytes).unwrap(); + + let batch_schema = batch.schema(); + let ts_idx = batch_schema.index_of("timestamp_secs").unwrap(); + let ts_col = batch.column(ts_idx) + .as_any().downcast_ref::().unwrap(); + let min_ts = (0..ts_col.len()).map(|i| ts_col.value(i)).min().unwrap_or(0); + let max_ts = (0..ts_col.len()).map(|i| ts_col.value(i)).max().unwrap_or(0); + + let mn_idx = batch_schema.index_of("metric_name").unwrap(); + let dict = batch.column(mn_idx).as_any() + .downcast_ref::>().unwrap(); + let values = 
dict.values().as_any().downcast_ref::().unwrap(); + let metric_names: HashSet = (0..values.len()) + .filter(|i| !values.is_null(*i)) + .map(|i| values.value(i).to_string()) + .collect(); + + let mut builder = MetricsSplitMetadata::builder() + .split_id(SplitId::new(split_name)) + .index_uid(index_uid.to_string()) + .time_range(TimeRange::new(min_ts, max_ts + 1)) + .num_rows(batch.num_rows() as u64) + .size_bytes(size_bytes); + for name in &metric_names { + builder = builder.add_metric_name(name.clone()); + } + + // Extract tag values from the batch and index them in split metadata. + // This mirrors what metrics_ingest_api::build_split_metadata does. + // Without this, metastore tag filters (pushed down from SQL/Substrait + // WHERE clauses) will not match these splits. + for tag_col in &["service", "env", "datacenter", "region", "host"] { + if let Ok(col_idx) = batch_schema.index_of(tag_col) { + let col = batch.column(col_idx); + // Extract unique non-null values from dict or string column + let values: std::collections::HashSet = if let Some(dict) = col.as_any() + .downcast_ref::>() + { + let keys = dict.keys().as_any().downcast_ref::().unwrap(); + let vals = dict.values().as_any().downcast_ref::().unwrap(); + (0..batch.num_rows()) + .filter(|i| !keys.is_null(*i)) + .map(|i| vals.value(keys.value(i) as usize).to_string()) + .collect() + } else { + std::collections::HashSet::new() + }; + for v in values { + builder = builder.add_low_cardinality_tag(tag_col.to_string(), v); + } + } + } + + metastore.clone() + .stage_metrics_splits( + StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[builder.build()]).unwrap() + ).await.unwrap(); + metastore.clone() + .publish_metrics_splits(PublishMetricsSplitsRequest { + index_uid: Some(index_uid.clone().into()), + staged_split_ids: vec![split_name.to_string()], + replaced_split_ids: vec![], + index_checkpoint_delta_json_opt: None, + publish_token_opt: None, + }).await.unwrap(); +} + +/// Execute SQL 
in-process and return batches. +async fn run_sql( + builder: &DataFusionSessionBuilder, + sql: &str, +) -> Vec { + let ctx = builder.build_session().unwrap(); + // Split on ';' — DFParser consumes trailing ';' which breaks multi-stmt parse + let fragments: Vec<&str> = sql.split(';').map(str::trim).filter(|s| !s.is_empty()).collect(); + for fragment in &fragments[..fragments.len().saturating_sub(1)] { + ctx.sql(fragment).await.unwrap().collect().await.unwrap(); + } + ctx.sql(fragments.last().unwrap()).await.unwrap().collect().await.unwrap() +} + +fn total_rows(batches: &[RecordBatch]) -> usize { + batches.iter().map(|b| b.num_rows()).sum() +} + +// ═══════════════════════════════════════════════════════════════════ +// Tests +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_select_all() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "test-select", data_dir.path()).await; + let batch = make_batch("cpu.usage", &[100, 200, 300], &[0.5, 0.8, 0.3], Some("web")); + publish_split(&metastore, &index_uid, data_dir.path(), "split_1", &batch).await; + + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "test-select" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 'test-select'; + SELECT * FROM "test-select""#; + let batches = run_sql(&builder, sql).await; + assert_eq!(total_rows(&batches), 3); + assert_eq!(batches[0].num_columns(), 5); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_metric_name_pruning() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let 
index_uid = create_metrics_index(&metastore, "test-prune", data_dir.path()).await; + publish_split(&metastore, &index_uid, data_dir.path(), "cpu", + &make_batch("cpu.usage", &[100, 200], &[0.5, 0.8], Some("web"))).await; + publish_split(&metastore, &index_uid, data_dir.path(), "mem", + &make_batch("memory.used", &[100, 200], &[1024.0, 2048.0], Some("web"))).await; + + + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "test-prune" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 'test-prune'; + SELECT value FROM "test-prune" WHERE metric_name = 'cpu.usage'"#; + assert_eq!(total_rows(&run_sql(&builder, sql).await), 2); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_aggregation() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "test-agg", data_dir.path()).await; + publish_split(&metastore, &index_uid, data_dir.path(), "agg1", + &make_batch("cpu.usage", &[100, 200], &[10.0, 20.0], Some("web"))).await; + publish_split(&metastore, &index_uid, data_dir.path(), "agg2", + &make_batch("cpu.usage", &[300, 400], &[30.0, 40.0], Some("api"))).await; + + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "test-agg" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 'test-agg'; + SELECT SUM(value) as total FROM "test-agg""#; + let batches = run_sql(&builder, sql).await; + assert_eq!(total_rows(&batches), 1); + let total = batches[0].column(0).as_any().downcast_ref::().unwrap().value(0); + assert!((total - 100.0).abs() < 0.01, "expected 100.0, got {total}"); +} + +/// Time range pruning — exercises the CAST unwrapping fix in predicate.rs. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_time_range_pruning() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "test-time", data_dir.path()).await; + publish_split(&metastore, &index_uid, data_dir.path(), "early", + &make_batch("cpu.usage", &[100, 200, 300], &[0.1, 0.2, 0.3], Some("web"))).await; + publish_split(&metastore, &index_uid, data_dir.path(), "late", + &make_batch("cpu.usage", &[1000, 1100, 1200], &[0.4, 0.5, 0.6], Some("web"))).await; + + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "test-time" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 'test-time'; + SELECT AVG(value) as avg_val FROM "test-time" WHERE timestamp_secs >= 1000"#; + let batches = run_sql(&builder, sql).await; + assert_eq!(total_rows(&batches), 1); + let avg = batches[0].column(0).as_any().downcast_ref::().unwrap().value(0); + let expected = (0.4 + 0.5 + 0.6) / 3.0; + assert!((avg - expected).abs() < 0.01, "expected ~{expected}, got {avg}"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_group_by() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "test-group", data_dir.path()).await; + for (name, svc, ts) in [("g1", "web", [100u64, 200, 300]), ("g2", "api", [400u64, 500, 600])] { + publish_split(&metastore, &index_uid, data_dir.path(), name, + &make_batch("cpu.usage", &ts, &[0.1, 0.2, 0.3], Some(svc))).await; + } + + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "test-group" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT 
NULL, service VARCHAR + ) STORED AS metrics LOCATION 'test-group'; + SELECT service, COUNT(*) as cnt FROM "test-group" GROUP BY service ORDER BY service"#; + assert_eq!(total_rows(&run_sql(&builder, sql).await), 2); +} + +/// REST ingest → in-process DataFusion query. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_rest_ingest_then_in_process_query() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + + // Create the index so the ingest endpoint can find it + create_metrics_index(&metastore, "metrics-e2e", data_dir.path()).await; + + let rest_addr = sandbox + .node_configs + .iter() + .find(|(_, s)| s.contains(&QuickwitService::Indexer)) + .unwrap() + .0 + .rest_config + .listen_addr; + + let metrics_json = serde_json::json!([ + {"metric_name": "cpu.usage", "timestamp_secs": 1700000100, "value": 0.85, "service": "web"}, + {"metric_name": "cpu.usage", "timestamp_secs": 1700000200, "value": 0.92, "service": "web"}, + {"metric_name": "memory.used", "timestamp_secs": 1700000100, "value": 1024.0, "service": "db"}, + {"metric_name": "cpu.usage", "timestamp_secs": 1700000300, "value": 0.45, "service": "api"} + ]); + + let resp = reqwest::Client::new() + .post(format!("http://{rest_addr}/api/v1/metrics-e2e/ingest-metrics")) + .json(&metrics_json) + .send() + .await + .unwrap(); + assert!(resp.status().is_success(), "ingest failed: {}", resp.text().await.unwrap()); + + let builder = session_builder(&sandbox, metastore); + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "metrics-e2e" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, + service VARCHAR, env VARCHAR + ) STORED AS metrics LOCATION 'metrics-e2e'; + SELECT COUNT(*) as cnt FROM "metrics-e2e""#; + let batches = run_sql(&builder, sql).await; + let cnt = batches[0].column(0).as_any() + .downcast_ref::().unwrap().value(0); + assert_eq!(cnt, 4); +} + +/// Verifies that 
CAST-unwrapping in `predicate.rs` causes fewer splits to be scanned +/// when a time filter is applied through the full SQL pipeline. +/// +/// DataFusion emits `CAST(timestamp_secs AS Int64) >= 1000` when comparing a UInt64 +/// column against an Int64 literal. Without CAST unwrapping in `column_name()`, the +/// filter is left in `remaining` and the metastore query has no time range — all splits +/// are returned. With CAST unwrapping, only the late split matches. +/// +/// This test exercises the extraction-to-pruning pipeline end-to-end: the CAST-wrapped +/// filter flows from DataFusion's optimizer through `extract_split_filters` and then +/// prunes the metastore split list. The correctness signal is the query result: if +/// pruning is wrong, early-split values (0.1, 0.2, 0.3) leak into the aggregate. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_cast_unwrapping_prunes_to_late_split_only() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "test-cast-prune", data_dir.path()).await; + // Early split: timestamps 100–300, values 0.1–0.3 + publish_split( + &metastore, + &index_uid, + data_dir.path(), + "early", + &make_batch("cpu.usage", &[100, 200, 300], &[0.1, 0.2, 0.3], Some("web")), + ) + .await; + // Late split: timestamps 1000–1200, values 0.4–0.6 + publish_split( + &metastore, + &index_uid, + data_dir.path(), + "late", + &make_batch("cpu.usage", &[1000, 1100, 1200], &[0.4, 0.5, 0.6], Some("web")), + ) + .await; + + // The direct proof that CAST unwrapping is working lives in the unit tests in + // quickwit-datafusion/src/sources/metrics/predicate.rs + // (test_timestamp_gte_with_cast_column, test_timestamp_lt_with_cast_column, and + // test_metric_name_pruning_prunes_splits_not_just_rows). 
Those tests are
+    // inaccessible here because `predicate` is an internal module.
+    // This integration test verifies functional correctness (parquet-level filtering).
+
+    let sql = r#"
+        CREATE OR REPLACE EXTERNAL TABLE "test-cast-prune" (
+            metric_name VARCHAR NOT NULL, metric_type TINYINT,
+            timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR
+        ) STORED AS metrics LOCATION 'test-cast-prune';
+        SELECT COUNT(*) AS cnt, SUM(value) AS total FROM "test-cast-prune"
+        WHERE timestamp_secs >= 1000"#;
+    let batches = run_sql(&builder, sql).await;
+    assert_eq!(total_rows(&batches), 1);
+    let cnt = batches[0]
+        .column_by_name("cnt")
+        .unwrap()
+        .as_any()
+        .downcast_ref::()
+        .unwrap()
+        .value(0);
+    // Note: this row-count assertion proves functional correctness (parquet-level
+    // filter) but NOT split pruning. Split pruning itself is proven by the unit
+    // tests in predicate.rs referenced above.
+    assert_eq!(cnt, 3, "expected 3 rows from late split only; got {cnt}");
+    let total = batches[0]
+        .column_by_name("total")
+        .unwrap()
+        .as_any()
+        .downcast_ref::()
+        .unwrap()
+        .value(0);
+    let expected = 0.4 + 0.5 + 0.6;
+    assert!(
+        (total - expected).abs() < 0.01,
+        "expected {expected:.2}, got {total:.2} — early-split values must not appear"
+    );
+}
+
+/// Verifies that querying an index with no published splits returns zero rows and does
+/// not panic. This tests that DataFusion handles an empty `FileScanConfig` correctly.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn test_query_empty_index_returns_zero_rows() {
+    let (sandbox, data_dir) = start_sandbox().await;
+    let metastore = metastore_client(&sandbox);
+    let builder = session_builder(&sandbox, metastore.clone());
+
+    // Create the index but publish NO splits. 
+ create_metrics_index(&metastore, "test-empty", data_dir.path()).await; + + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "test-empty" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 'test-empty'; + SELECT COUNT(*) AS cnt FROM "test-empty""#; + let batches = run_sql(&builder, sql).await; + let cnt = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(cnt, 0, "empty index must return 0 rows, got {cnt}"); +} + +/// Verifies that a multi-value IN filter returns rows from ALL matching splits, not +/// just the first. This is the integration-level proof for the multi-value IN fix. +/// +/// Three splits contain different services (web, api, db). A query filtering +/// `service IN ('web', 'api')` must return rows from both the web and api splits. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_in_list_tag_filter_returns_all_matching_rows() { + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "test-inlist", data_dir.path()).await; + publish_split( + &metastore, + &index_uid, + data_dir.path(), + "web_split", + &make_batch("cpu.usage", &[100, 200], &[1.0, 2.0], Some("web")), + ) + .await; + publish_split( + &metastore, + &index_uid, + data_dir.path(), + "api_split", + &make_batch("cpu.usage", &[300, 400], &[3.0, 4.0], Some("api")), + ) + .await; + publish_split( + &metastore, + &index_uid, + data_dir.path(), + "db_split", + &make_batch("cpu.usage", &[500, 600], &[5.0, 6.0], Some("db")), + ) + .await; + + let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "test-inlist" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 
'test-inlist'; + SELECT service, COUNT(*) AS cnt FROM "test-inlist" + WHERE service IN ('web', 'api') + GROUP BY service ORDER BY service"#; + let batches = run_sql(&builder, sql).await; + // Must return 2 rows (one group per service) — both web and api splits were scanned. + assert_eq!( + total_rows(&batches), + 2, + "IN ('web','api') must return rows for both services; got {} groups", + total_rows(&batches) + ); + let total_data_rows: i64 = batches + .iter() + .map(|b| { + b.column_by_name("cnt") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .flatten() + .sum::() + }) + .sum(); + assert_eq!(total_data_rows, 4, "web (2) + api (2) = 4 rows; db must be excluded"); +} + +/// Demonstrates the `sum:metric{filter} by {groups}.rollup(agg, interval)` pattern +/// over wide-format parquet data — no context/points JOIN needed. +/// +/// In Datadog's internal model a query like: +/// `avg:cpu.usage{env:prod} by {service}.rollup(max, 30)` +/// is compiled to SQL over two tables joined on `bhandle` (a tag hash). +/// +/// With our wide-format parquet model every data point carries its own tags +/// as columns, so the same query is a single two-level aggregation: +/// +/// 1. Inner GROUP BY (service, host, time_bin) → MAX(value) per series per bin +/// 2. 
Outer GROUP BY (service, time_bin) → AVG(max) across hosts per bin +/// +/// Three prod series, one staging series (must be filtered out): +/// web / host=web-01: values 1,2,3,4,5,6 at t=0,15,30,45,60,75 +/// web / host=web-02: values 10,20,30,40,50,60 at t=0,15,30,45,60,75 +/// api / host=api-01: values 100,200,300,400,500,600 at t=0,15,30,45,60,75 +/// web / host=web-01 / env=staging (should be excluded by env filter) +/// +/// Expected results (30-second bins, epoch origin): +/// bin t=0: web → avg(max(1,2), max(10,20)) = avg(2, 20) = 11.0 +/// api → avg(max(100,200)) = 200.0 +/// bin t=30: web → avg(max(3,4), max(30,40)) = avg(4, 40) = 22.0 +/// api → avg(max(300,400)) = 400.0 +/// bin t=60: web → avg(max(5,6), max(50,60)) = avg(6, 60) = 33.0 +/// api → avg(max(500,600)) = 600.0 +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_rollup_nested_aggregation() { + use quickwit_datafusion::test_utils::make_batch_with_tags; + + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "rollup-test", data_dir.path()).await; + + // Timestamps span 3 full 30-second bins (0–29, 30–59, 60–89). 
+ let ts: &[u64] = &[0, 15, 30, 45, 60, 75]; + + publish_split(&metastore, &index_uid, data_dir.path(), "web-01-prod", + &make_batch_with_tags("cpu.usage", ts, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + Some("web"), Some("prod"), None, None, Some("web-01"))).await; + + publish_split(&metastore, &index_uid, data_dir.path(), "web-02-prod", + &make_batch_with_tags("cpu.usage", ts, &[10.0, 20.0, 30.0, 40.0, 50.0, 60.0], + Some("web"), Some("prod"), None, None, Some("web-02"))).await; + + publish_split(&metastore, &index_uid, data_dir.path(), "api-01-prod", + &make_batch_with_tags("cpu.usage", ts, &[100.0, 200.0, 300.0, 400.0, 500.0, 600.0], + Some("api"), Some("prod"), None, None, Some("api-01"))).await; + + // Staging split — env filter must exclude all rows from this split. + publish_split(&metastore, &index_uid, data_dir.path(), "web-01-staging", + &make_batch_with_tags("cpu.usage", &[0, 30, 60], &[999.0, 999.0, 999.0], + Some("web"), Some("staging"), None, None, Some("web-01"))).await; + + // The query mirrors the Datadog rollup pattern without a context/points join: + // avg:cpu.usage{env:prod} by {service}.rollup(max, 30) + // + // Step 1 (inner): MAX per series (service + host) per 30-second bin. + // Step 2 (outer): AVG of those per-series maxes, grouped by service. + // + // to_timestamp_seconds() converts the stored epoch-seconds UInt64 to a + // Timestamp so that date_bin() can bucket it into 30-second intervals. 
+ let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "rollup-test" ( + metric_name VARCHAR NOT NULL, + metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, + value DOUBLE NOT NULL, + service VARCHAR, + env VARCHAR, + host VARCHAR + ) STORED AS metrics LOCATION 'rollup-test'; + WITH bin_max AS ( + SELECT + service, + host, + date_bin( + INTERVAL '30 seconds', + to_timestamp_seconds(timestamp_secs) + ) AS time_bin, + MAX(value) AS max_bin_val + FROM "rollup-test" + WHERE metric_name = 'cpu.usage' + AND env = 'prod' + GROUP BY service, host, time_bin + ) + SELECT + service, + time_bin, + AVG(max_bin_val) AS avg_val + FROM bin_max + GROUP BY service, time_bin + ORDER BY time_bin, service + "#; + + let batches = run_sql(&builder, sql).await; + + // 3 bins × 2 services (web, api) = 6 result rows. + assert_eq!(total_rows(&batches), 6, + "expected 6 rows (3 bins × 2 services); staging rows must be excluded"); + + // Collect (service, avg_val) pairs in ORDER BY time_bin, service order. + // After GROUP BY, DataFusion casts dict-encoded strings to plain Utf8. + let results: Vec<(String, f64)> = batches.iter().flat_map(|batch| { + let svc_raw = batch.column_by_name("service").unwrap(); + let avg_col = batch.column_by_name("avg_val").unwrap() + .as_any().downcast_ref::().unwrap(); + (0..batch.num_rows()).map(|i| { + // After GROUP BY, DataFusion 52 may return Utf8View, Utf8, or Dict. 
+ let svc = if let Some(sa) = svc_raw.as_any() + .downcast_ref::<StringViewArray>() { + sa.value(i).to_string() + } else if let Some(sa) = svc_raw.as_any() + .downcast_ref::<StringArray>() { + sa.value(i).to_string() + } else { + let dict = svc_raw.as_any() + .downcast_ref::<DictionaryArray<Int32Type>>() + .unwrap_or_else(|| panic!("service column: unexpected type {:?}", svc_raw.data_type())); + let keys = dict.keys().as_any().downcast_ref::<Int32Array>().unwrap(); + let vals = dict.values().as_any().downcast_ref::<StringArray>().unwrap(); + vals.value(keys.value(i) as usize).to_string() + }; + let avg = avg_col.value(i); + (svc, avg) + }).collect::<Vec<_>>() + }).collect(); + + // Expected: [(api,200), (web,11), (api,400), (web,22), (api,600), (web,33)] + let expected = [ + ("api", 200.0_f64), + ("web", 11.0), + ("api", 400.0), + ("web", 22.0), + ("api", 600.0), + ("web", 33.0), + ]; + + assert_eq!(results.len(), expected.len()); + for (i, ((got_svc, got_avg), (exp_svc, exp_avg))) in + results.iter().zip(expected.iter()).enumerate() + { + assert_eq!(got_svc.as_str(), *exp_svc, "row {i}: wrong service"); + assert!( + (got_avg - exp_avg).abs() < 0.01, + "row {i} ({exp_svc}): expected avg={exp_avg:.2}, got {got_avg:.2}" + ); + } +} + +/// Demonstrates the Substrait query path using standard `NamedTable` read +/// relations — no custom protos, no type URLs. +/// +/// A producer (Pomsky, df-executor, or any Substrait client) builds a plan +/// using vanilla Substrait, naming the index in `NamedTable.names`. The +/// `QuickwitSubstraitConsumer` resolves the index from the metastore, uses the +/// `ReadRel.base_schema` for schema injection, and executes the plan exactly +/// as it would for the SQL DDL path. +/// +/// This test mirrors the rollup test above but drives it via +/// `DataFusionSessionBuilder::execute_substrait` instead of SQL.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_substrait_named_table_query() { + use datafusion_substrait::logical_plan::producer::to_substrait_plan; + use prost::Message; + + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "substrait-test", data_dir.path()).await; + publish_split(&metastore, &index_uid, data_dir.path(), "s1", + &make_batch("cpu.usage", &[100, 200, 300], &[1.0, 2.0, 3.0], Some("web"))).await; + publish_split(&metastore, &index_uid, data_dir.path(), "s2", + &make_batch("memory.used", &[100, 200, 300], &[10.0, 20.0, 30.0], Some("api"))).await; + + // Build the Substrait plan from SQL via DataFusion's producer. + // The plan tree will have a NamedTable ReadRel for "substrait-test". + let ctx = builder.build_session().unwrap(); + + // Register a minimal table so the SQL planner can build the plan + // (the actual schema will come from base_schema when the substrait consumer + // resolves it at execution time). + ctx.sql(r#"CREATE OR REPLACE EXTERNAL TABLE "substrait-test" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 'substrait-test'"#) + .await.unwrap().collect().await.unwrap(); + + let df = ctx.sql( + r#"SELECT metric_name, SUM(value) as total + FROM "substrait-test" + GROUP BY metric_name + ORDER BY metric_name"# + ).await.unwrap(); + + let plan = df.into_optimized_plan().unwrap(); + let substrait_plan = to_substrait_plan(&plan, &ctx.state()).unwrap(); + let plan_bytes = substrait_plan.encode_to_vec(); + + // Execute via the Substrait path — DataFusionSessionBuilder decodes the plan, + // QuickwitSubstraitConsumer routes the NamedTable ReadRel to MetricsDataSource, + // and the query executes against the real parquet files. 
+ let batches = builder.execute_substrait(&plan_bytes).await.unwrap(); + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 2, "expected 2 metric names (cpu.usage, memory.used)"); + + // Verify SUM values: cpu.usage = 1+2+3 = 6, memory.used = 10+20+30 = 60 + let metric_col = batches[0].column_by_name("metric_name").unwrap(); + let total_col = batches[0].column_by_name("total").unwrap() + .as_any().downcast_ref::().unwrap(); + + // metric_name may come back as StringViewArray or StringArray after aggregation + let names: Vec = (0..batches[0].num_rows()).map(|i| { + if let Some(sv) = metric_col.as_any().downcast_ref::() { + sv.value(i).to_string() + } else { + metric_col.as_any().downcast_ref::() + .unwrap().value(i).to_string() + } + }).collect(); + + assert_eq!(names, vec!["cpu.usage", "memory.used"]); + assert!((total_col.value(0) - 6.0).abs() < 0.01, + "cpu.usage SUM expected 6.0, got {}", total_col.value(0)); + assert!((total_col.value(1) - 60.0).abs() < 0.01, + "memory.used SUM expected 60.0, got {}", total_col.value(1)); +} + +/// Executes the user-provided Substrait rollup plan directly against real +/// parquet data in a sandbox cluster. +/// +/// The plan is loaded from `rollup_substrait.json` (committed alongside this +/// file) and targets index `"otel-metrics-v0_9"`. 
It expresses: +/// +/// avg:cpu.usage{env:prod} by {service}.rollup(max, 30s) +/// +/// Plan tree (from the JSON): +/// Sort(time_bin ASC, service ASC) +/// Aggregate → AVG(max_bin_val) [outer: avg per (service, bin)] +/// Aggregate → MAX(value) [inner: groups by (service, time_bin) only — no host] +/// Project → date_bin(30s, to_timestamp_seconds(timestamp_secs)) +/// Filter(metric_name='cpu.usage' AND env='prod') +/// ReadRel("otel-metrics-v0_9") ← resolved by QuickwitSubstraitConsumer +/// +/// Data (same as test_rollup_nested_aggregation): +/// web/web-01/prod : t=0,15,30,45,60,75 values=1,2,3,4,5,6 +/// web/web-02/prod : t=0,15,30,45,60,75 values=10,20,30,40,50,60 +/// api/api-01/prod : t=0,15,30,45,60,75 values=100,200,300,400,500,600 +/// web/web-01/staging (filtered out by env='prod') +/// +/// Expected results (30s bins, ORDER BY time_bin ASC, service ASC). +/// Unlike the SQL rollup test, the plan's inner GROUP BY has no host column, +/// so MAX spans ALL series of a (service, bin) and the outer AVG is a no-op: +/// (api, bin=0s, 200.0) ← max(100,200) +/// (web, bin=0s, 20.0) ← max(1,2,10,20) +/// (api, bin=30s, 400.0) +/// (web, bin=30s, 40.0) +/// (api, bin=60s, 600.0) +/// (web, bin=60s, 60.0) +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_rollup_substrait_from_file() { + use datafusion_substrait::substrait::proto::Plan; + use prost::Message; + use quickwit_datafusion::test_utils::make_batch_with_tags; + + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + // Create index named exactly as the Substrait plan references it.
+ let index_uid = create_metrics_index(&metastore, "otel-metrics-v0_9", data_dir.path()).await; + + let ts: &[u64] = &[0, 15, 30, 45, 60, 75]; + publish_split(&metastore, &index_uid, data_dir.path(), "web-01-prod", + &make_batch_with_tags("cpu.usage", ts, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + Some("web"), Some("prod"), None, None, Some("web-01"))).await; + publish_split(&metastore, &index_uid, data_dir.path(), "web-02-prod", + &make_batch_with_tags("cpu.usage", ts, &[10.0, 20.0, 30.0, 40.0, 50.0, 60.0], + Some("web"), Some("prod"), None, None, Some("web-02"))).await; + publish_split(&metastore, &index_uid, data_dir.path(), "api-01-prod", + &make_batch_with_tags("cpu.usage", ts, &[100.0, 200.0, 300.0, 400.0, 500.0, 600.0], + Some("api"), Some("prod"), None, None, Some("api-01"))).await; + publish_split(&metastore, &index_uid, data_dir.path(), "web-01-staging", + &make_batch_with_tags("cpu.usage", &[0, 30, 60], &[999.0, 999.0, 999.0], + Some("web"), Some("staging"), None, None, Some("web-01"))).await; + + + + // Load the Substrait plan JSON from the file next to this test. + let plan_json = include_str!("rollup_substrait.json"); + let substrait_plan: Plan = serde_json::from_str(plan_json) + .expect("rollup_substrait.json must be valid Substrait JSON"); + let mut plan_bytes = Vec::new(); + substrait_plan.encode(&mut plan_bytes).expect("Substrait plan encode failed"); + + // Execute via the Substrait path — no SQL, no DDL, just the plan. + let batches = builder + .execute_substrait(&plan_bytes) + .await + .expect("Substrait rollup query failed"); + + // Print the plan and results so you can see what ran. + println!("\n=== Substrait rollup results ({} batches, {} rows total) ===", + batches.len(), + batches.iter().map(|b| b.num_rows()).sum::()); + for batch in &batches { + println!("{}", arrow::util::pretty::pretty_format_batches(&[batch.clone()]).unwrap()); + } + + // 3 bins × 2 services (api, web) = 6 rows. 
+ let total: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 6, "expected 6 rows (3 bins × 2 services)"); + + // Expected order: (api,bin0,200), (web,bin0,11), (api,bin30,400), + // (web,bin30,22), (api,bin60,600), (web,bin60,33) + // The inner GROUP BY groups by (service, time_bin) — no host column. + // So MAX is taken across ALL series for a given (service, time_bin): + // web/bin=0s: MAX(web-01:1,2, web-02:10,20) = 20 → AVG(20) = 20 + // api/bin=0s: MAX(api-01:100,200) = 200 → AVG(200) = 200 + let expected_values = [200.0f64, 20.0, 400.0, 40.0, 600.0, 60.0]; + let all_values: Vec = batches.iter().flat_map(|b| { + b.column_by_name("value").unwrap() + .as_any().downcast_ref::().unwrap() + .iter().flatten() + .collect::>() + }).collect(); + + for (i, (got, exp)) in all_values.iter().zip(expected_values.iter()).enumerate() { + assert!( + (got - exp).abs() < 0.01, + "row {i}: expected {exp:.1}, got {got:.1}" + ); + } + + println!("✓ Substrait rollup plan executed correctly"); +} + +/// Verifies that a query works correctly when the DDL schema declares only a +/// SUBSET of the columns present in the parquet files. +/// +/// This is the typical BYOC case: a coordinator generates a Substrait plan +/// that only references the columns it needs for the query (`metric_name`, +/// `timestamp_secs`, `value`, `service`). The parquet files contain many +/// more tag columns (`env`, `host`, `datacenter`, `region`) that the query +/// doesn't reference. +/// +/// DataFusion uses `PhysicalExprAdapterFactory` to project only the declared +/// columns from each parquet file. Undeclared columns are simply not read — +/// no NULLs, no errors, just not present in the output. 
+/// +/// Data layout: +/// Split with wide schema: service='web', env='prod', host='web-01', +/// datacenter='us-east', region='us-east-1' +/// +/// DDL declares only: metric_name, timestamp_secs, value, service +/// +/// Query: SELECT service, SUM(value) FROM index WHERE metric_name='cpu.usage' +/// +/// Expected: correct SUM using only the declared columns. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_query_with_partial_schema_declaration() { + use quickwit_datafusion::test_utils::make_batch_with_tags; + + let (sandbox, data_dir) = start_sandbox().await; + let metastore = metastore_client(&sandbox); + let builder = session_builder(&sandbox, metastore.clone()); + + let index_uid = create_metrics_index(&metastore, "partial-schema", data_dir.path()).await; + + // Write a wide split with ALL tag columns populated. + publish_split( + &metastore, &index_uid, data_dir.path(), "wide", + &make_batch_with_tags( + "cpu.usage", + &[100, 200, 300], + &[1.0, 2.0, 3.0], + Some("web"), // service + Some("prod"), // env + Some("us-east"), // datacenter + Some("us-east-1"), // region + Some("web-01"), // host + ), + ).await; + + // DDL declares only 4 columns — service, env, and host are intentionally + // omitted from the columns the query will project. + // (We include service and env because the WHERE/GROUP BY uses them, + // but NOT host, datacenter, region — the coordinator doesn't need them.) 
+ let sql = r#" + CREATE OR REPLACE EXTERNAL TABLE "partial-schema" ( + metric_name VARCHAR NOT NULL, + metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, + value DOUBLE NOT NULL, + service VARCHAR, + env VARCHAR + ) STORED AS metrics LOCATION 'partial-schema'; + SELECT service, SUM(value) AS total + FROM "partial-schema" + WHERE metric_name = 'cpu.usage' AND env = 'prod' + GROUP BY service + "#; + + let batches = run_sql(&builder, sql).await; + + assert_eq!(total_rows(&batches), 1, "expected 1 row (service=web)"); + + let total = batches[0] + .column_by_name("total") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert!( + (total - 6.0).abs() < 0.01, + "expected SUM(1+2+3)=6.0, got {total:.2} — undeclared columns (host, datacenter, region) \ + must not affect projection or aggregation" + ); + + // Verify the schema of the result contains only the declared columns + // (the undeclared ones — host, datacenter, region — are absent, not NULL). + let schema = batches[0].schema(); + assert!(schema.index_of("host").is_err(), + "host was not declared in DDL — it must not appear in the result schema"); + assert!(schema.index_of("datacenter").is_err(), + "datacenter was not declared in DDL — it must not appear in the result schema"); + assert!(schema.index_of("region").is_err(), + "region was not declared in DDL — it must not appear in the result schema"); +} diff --git a/quickwit/quickwit-integration-tests/src/tests/metrics_distributed_tests.rs b/quickwit/quickwit-integration-tests/src/tests/metrics_distributed_tests.rs new file mode 100644 index 00000000000..d4c1cdc1c72 --- /dev/null +++ b/quickwit/quickwit-integration-tests/src/tests/metrics_distributed_tests.rs @@ -0,0 +1,321 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Distributed DataFusion execution tests — executed in-process. +//! +//! Test 1 (`test_distributed_tasks_not_shuffles`): builds a session with a +//! two-entry `SearcherPool` constructed from the 2-node sandbox addresses. +//! Verifies the physical plan contains `PartitionIsolatorExec` (one per split +//! assigned to a worker) and NOT `NetworkShuffleExec`. Then executes the +//! query to verify correctness — workers are reached via the `WorkerService` +//! gRPC that `quickwit-serve/src/grpc.rs` registers on the same port. +//! +//! Test 2 (`test_null_columns_for_missing_parquet_fields`): verifies that +//! columns declared in the DDL schema but absent from a specific parquet file +//! are filled with NULLs by DataFusion's `PhysicalExprAdapterFactory`. 
+ +use std::collections::HashSet; +use std::sync::Arc; + +use arrow::array::{Array, Float64Array, Int64Array, RecordBatch}; +use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema}; +use quickwit_config::service::QuickwitService; +use quickwit_datafusion::DataFusionSessionBuilder; +use quickwit_datafusion::sources::metrics::MetricsDataSource; +use quickwit_datafusion::test_utils::{make_batch, make_batch_with_tags}; +use quickwit_metastore::StageMetricsSplitsRequestExt; +use quickwit_parquet_engine::schema::ParquetSchema; +use quickwit_parquet_engine::split::{MetricsSplitMetadata, SplitId, TimeRange}; +use quickwit_parquet_engine::storage::{ParquetWriter, ParquetWriterConfig}; +use quickwit_proto::metastore::{ + CreateIndexRequest, MetastoreService, MetastoreServiceClient, PublishMetricsSplitsRequest, + StageMetricsSplitsRequest, +}; +use quickwit_metastore::CreateIndexRequestExt; +use quickwit_proto::types::IndexUid; +use quickwit_search::{SearcherPool, create_search_client_from_grpc_addr}; + +use crate::test_utils::{ClusterSandbox, ClusterSandboxBuilder}; + +// ── Helpers ────────────────────────────────────────────────────────── + +fn metastore_client(sandbox: &ClusterSandbox) -> MetastoreServiceClient { + let (config, _) = sandbox.node_configs.iter() + .find(|(_, svc)| svc.contains(&QuickwitService::Metastore)).unwrap(); + let addr = config.grpc_listen_addr; + let channel = tonic::transport::Channel::from_shared(format!("http://{addr}")) + .unwrap().connect_lazy(); + MetastoreServiceClient::from_channel(addr, channel, bytesize::ByteSize::mib(20), None) +} + +/// Build a RecordBatch with ONLY the 4 required columns — no tag columns. 
+fn make_narrow_batch(metric_name: &str, timestamps: &[u64], values: &[f64]) -> RecordBatch { + let n = timestamps.len(); + let dict = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("metric_name", dict, false), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + ])); + use arrow::array::{DictionaryArray, Float64Array, Int32Array, StringArray, UInt64Array, UInt8Array}; + let keys = Int32Array::from(vec![0i32; n]); + let vals = StringArray::from(vec![metric_name]); + let metric_col = Arc::new(DictionaryArray::::try_new(keys, Arc::new(vals)).unwrap()); + RecordBatch::try_new(schema, vec![ + metric_col as Arc<_>, + Arc::new(UInt8Array::from(vec![0u8; n])), + Arc::new(UInt64Array::from(timestamps.to_vec())), + Arc::new(Float64Array::from(values.to_vec())), + ]).unwrap() +} + +async fn create_metrics_index( + metastore: &MetastoreServiceClient, + index_id: &str, + data_dir: &std::path::Path, +) -> IndexUid { + let index_uri = format!("file://{}", data_dir.display()); + let config: quickwit_config::IndexConfig = serde_json::from_value(serde_json::json!({ + "version": "0.8", "index_id": index_id, "index_uri": index_uri, + "doc_mapping": {"field_mappings": []}, "indexing_settings": {}, "search_settings": {} + })).unwrap(); + metastore.clone() + .create_index(CreateIndexRequest::try_from_index_config(&config).unwrap()) + .await.unwrap().index_uid().clone() +} + +async fn publish_split( + metastore: &MetastoreServiceClient, + index_uid: &IndexUid, + data_dir: &std::path::Path, + split_name: &str, + batch: &RecordBatch, +) { + let schema = ParquetSchema::from_arrow_schema(batch.schema()); + let parquet_bytes = ParquetWriter::new(schema, ParquetWriterConfig::default()) + .write_to_bytes(batch).unwrap(); + let size_bytes = parquet_bytes.len() as u64; + 
std::fs::write(data_dir.join(format!("{split_name}.parquet")), &parquet_bytes).unwrap(); + + let batch_schema = batch.schema(); + let ts_idx = batch_schema.index_of("timestamp_secs").unwrap(); + let ts_col = batch.column(ts_idx).as_any() + .downcast_ref::().unwrap(); + let min_ts = (0..ts_col.len()).map(|i| ts_col.value(i)).min().unwrap_or(0); + let max_ts = (0..ts_col.len()).map(|i| ts_col.value(i)).max().unwrap_or(0); + + let mn_idx = batch_schema.index_of("metric_name").unwrap(); + let dict = batch.column(mn_idx).as_any() + .downcast_ref::>().unwrap(); + let values = dict.values().as_any() + .downcast_ref::().unwrap(); + let metric_names: HashSet = (0..values.len()) + .filter(|i| !values.is_null(*i)).map(|i| values.value(i).to_string()).collect(); + + let mut builder = MetricsSplitMetadata::builder() + .split_id(SplitId::new(split_name)) + .index_uid(index_uid.to_string()) + .time_range(TimeRange::new(min_ts, max_ts + 1)) + .num_rows(batch.num_rows() as u64).size_bytes(size_bytes); + for name in &metric_names { builder = builder.add_metric_name(name.clone()); } + + // Extract tag values for metastore split-level pruning + for tag_col in &["service", "env", "datacenter", "region", "host"] { + if let Ok(col_idx) = batch_schema.index_of(tag_col) { + let col = batch.column(col_idx); + if let Some(dict) = col.as_any().downcast_ref::>() { + let keys = dict.keys().as_any().downcast_ref::().unwrap(); + let vals = dict.values().as_any().downcast_ref::().unwrap(); + let values: std::collections::HashSet = (0..batch.num_rows()) + .filter(|i| !keys.is_null(*i)) + .map(|i| vals.value(keys.value(i) as usize).to_string()) + .collect(); + for v in values { builder = builder.add_low_cardinality_tag(tag_col.to_string(), v); } + } + } + } + + metastore.clone().stage_metrics_splits( + StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[builder.build()]).unwrap() + ).await.unwrap(); + metastore.clone().publish_metrics_splits(PublishMetricsSplitsRequest { + 
index_uid: Some(index_uid.clone().into()), + staged_split_ids: vec![split_name.to_string()], + replaced_split_ids: vec![], + index_checkpoint_delta_json_opt: None, + publish_token_opt: None, + }).await.unwrap(); +} + +async fn run_sql(builder: &DataFusionSessionBuilder, sql: &str) -> Vec { + let ctx = builder.build_session().unwrap(); + let fragments: Vec<&str> = sql.split(';').map(str::trim).filter(|s| !s.is_empty()).collect(); + for fragment in &fragments[..fragments.len().saturating_sub(1)] { + ctx.sql(fragment).await.unwrap().collect().await.unwrap(); + } + ctx.sql(fragments.last().unwrap()).await.unwrap().collect().await.unwrap() +} + +// ═══════════════════════════════════════════════════════════════════ +// Test 1: Tasks, not shuffles +// ═══════════════════════════════════════════════════════════════════ + +/// Builds a 2-searcher pool from the sandbox node gRPC addresses, which is +/// enough for `QuickwitWorkerResolver::get_urls()` to return 2 URLs so the +/// distributed optimizer fires. Workers are reached via the `WorkerService` +/// registered by `grpc.rs`. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn test_distributed_tasks_not_shuffles() { + unsafe { std::env::set_var("QW_DISABLE_TELEMETRY", "1"); std::env::set_var("QW_ENABLE_DATAFUSION_ENDPOINT", "true"); } + quickwit_common::setup_logging_for_tests(); + + let sandbox = ClusterSandboxBuilder::default() + .add_node(QuickwitService::supported_services()) + .add_node([QuickwitService::Searcher]) + .build_and_start().await; + + let data_dir = tempfile::tempdir().unwrap(); + let metastore = metastore_client(&sandbox); + + let index_uid = create_metrics_index(&metastore, "dist-test", data_dir.path()).await; + for (name, metric, ts, vals) in [ + ("split_a", "cpu.usage", [100u64, 200], [0.1f64, 0.2]), + ("split_b", "cpu.usage", [300u64, 400], [0.3f64, 0.4]), + ("split_c", "memory.used", [100u64, 200], [1024.0f64, 2048.0]), + ("split_d", "memory.used", [300u64, 400], [3072.0f64, 4096.0]), + ] { + publish_split(&metastore, &index_uid, data_dir.path(), name, + &make_batch(metric, &ts, &vals, Some("web"))).await; + } + + // Build a SearcherPool with both searcher node addresses so the distributed + // optimizer sees n_workers = 2 and decomposes the plan into tasks. + let pool = SearcherPool::default(); + for (config, services) in &sandbox.node_configs { + if services.contains(&QuickwitService::Searcher) { + let addr = config.grpc_listen_addr; + // Pool value is SearchServiceClient — only the key (addr) matters for + // QuickwitWorkerResolver, which calls pool.keys() to get URLs. 
+ pool.insert(addr, create_search_client_from_grpc_addr(addr, bytesize::ByteSize::mib(20))); } + } + + let source = Arc::new(MetricsDataSource::new( + metastore, + sandbox.storage_resolver().clone(), + )); + let builder = DataFusionSessionBuilder::new() + .with_source(source) + .with_searcher_pool(pool); + + let ddl = r#"CREATE OR REPLACE EXTERNAL TABLE "dist-test" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, service VARCHAR + ) STORED AS metrics LOCATION 'dist-test'"#; + let agg_sql = format!( + "{ddl}; SELECT SUM(value) as total, COUNT(*) as cnt FROM \"dist-test\"" + ); + + // ── Verify plan shape AND execute in the same session ──────────── + let ctx = builder.build_session().unwrap(); + let fragments: Vec<&str> = agg_sql.split(';').map(str::trim).filter(|s| !s.is_empty()).collect(); + ctx.sql(fragments[0]).await.unwrap().collect().await.unwrap(); // DDL + let df = ctx.sql(fragments[1]).await.unwrap(); + // Inspect the physical plan before collecting so plan and execution are the same session. + let plan = df.clone().create_physical_plan().await.unwrap(); + let plan_str = format!("{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true)); + println!("=== Physical plan ===\n{plan_str}"); + + assert!( + plan_str.contains("DistributedExec") && plan_str.contains("PartitionIsolatorExec"), + "expected both DistributedExec and PartitionIsolatorExec in distributed plan:\n{plan_str}" + ); + assert!( + !plan_str.contains("NetworkShuffleExec"), + "expected no shuffle (parquet scans are split-local):\n{plan_str}" + ); + // With 4 splits across 2 workers there should be at least 1 PartitionIsolatorExec + // (one per split partition assigned to a worker). 
+ let isolator_count = plan_str.matches("PartitionIsolatorExec").count(); + assert!( + isolator_count >= 1, + "expected at least 1 PartitionIsolatorExec, got {isolator_count}:\n{plan_str}" + ); + + // Execute in the SAME context that built the plan — guarantees plan and result agree. + let batches = df.collect().await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 1); + let total = batches[0].column_by_name("total").unwrap() + .as_any().downcast_ref::().unwrap().value(0); + let expected = 0.1 + 0.2 + 0.3 + 0.4 + 1024.0 + 2048.0 + 3072.0 + 4096.0; + assert!((total - expected).abs() < 1.0, "expected {expected:.1}, got {total:.1}"); + let cnt = batches[0].column_by_name("cnt").unwrap() + .as_any().downcast_ref::().unwrap().value(0); + assert_eq!(cnt, 8); +} + +// ═══════════════════════════════════════════════════════════════════ +// Test 2: NULL columns for missing parquet fields +// ═══════════════════════════════════════════════════════════════════ + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_null_columns_for_missing_parquet_fields() { + unsafe { std::env::set_var("QW_DISABLE_TELEMETRY", "1"); std::env::set_var("QW_ENABLE_DATAFUSION_ENDPOINT", "true"); } + quickwit_common::setup_logging_for_tests(); + + let sandbox = ClusterSandboxBuilder::build_and_start_standalone().await; + let data_dir = tempfile::tempdir().unwrap(); + let metastore = metastore_client(&sandbox); + + let index_uid = create_metrics_index(&metastore, "null-cols", data_dir.path()).await; + + // split_a: 4 required columns only, no service/env + let batch_a = make_narrow_batch("cpu.usage", &[100, 200], &[0.5, 0.8]); + assert!(batch_a.schema().index_of("service").is_err()); + publish_split(&metastore, &index_uid, data_dir.path(), "narrow", &batch_a).await; + + // split_b: 4 required + service + env + let batch_b = make_batch_with_tags("cpu.usage", &[300, 400], &[0.3, 0.6], + Some("web"), Some("prod"), None, None, None); + 
publish_split(&metastore, &index_uid, data_dir.path(), "wide", &batch_b).await; + + let source = Arc::new(MetricsDataSource::new(metastore, sandbox.storage_resolver().clone())); + let builder = DataFusionSessionBuilder::new().with_source(source); + + // COUNT(col) counts non-NULL values — tests the NULL-fill behavior + let sql_str = r#" + CREATE OR REPLACE EXTERNAL TABLE "null-cols" ( + metric_name VARCHAR NOT NULL, metric_type TINYINT, + timestamp_secs BIGINT NOT NULL, value DOUBLE NOT NULL, + service VARCHAR, env VARCHAR + ) STORED AS metrics LOCATION 'null-cols'; + SELECT COUNT(*) AS total_rows, COUNT(service) AS rows_with_service, + COUNT(env) AS rows_with_env FROM "null-cols""#; + + let batches = run_sql(&builder, sql_str).await; + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 1); + + let total = batches[0].column_by_name("total_rows").unwrap() + .as_any().downcast_ref::().unwrap().value(0); + assert_eq!(total, 4); + + let with_service = batches[0].column_by_name("rows_with_service").unwrap() + .as_any().downcast_ref::().unwrap().value(0); + assert_eq!(with_service, 2, + "split_a has no service col → NULLs; split_b has service='web' → 2 non-null"); + + let with_env = batches[0].column_by_name("rows_with_env").unwrap() + .as_any().downcast_ref::().unwrap().value(0); + assert_eq!(with_env, 2); +} diff --git a/quickwit/quickwit-integration-tests/src/tests/mod.rs b/quickwit/quickwit-integration-tests/src/tests/mod.rs index bbc5dcf814a..1bcb2d3e6b1 100644 --- a/quickwit/quickwit-integration-tests/src/tests/mod.rs +++ b/quickwit/quickwit-integration-tests/src/tests/mod.rs @@ -15,6 +15,8 @@ mod basic_tests; mod ingest_v1_tests; mod ingest_v2_tests; +mod metrics_datafusion_tests; +mod metrics_distributed_tests; mod no_cp_tests; mod otlp_tests; #[cfg(feature = "sqs-localstack-tests")] diff --git a/quickwit/quickwit-integration-tests/src/tests/rollup_substrait.json b/quickwit/quickwit-integration-tests/src/tests/rollup_substrait.json new file mode 100644 
index 00000000000..e871ae43009 --- /dev/null +++ b/quickwit/quickwit-integration-tests/src/tests/rollup_substrait.json @@ -0,0 +1,20 @@ +{ + "extensionUris": [ + {"extensionUriAnchor": 1, "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_comparison.yaml"}, + {"extensionUriAnchor": 2, "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_boolean.yaml"}, + {"extensionUriAnchor": 3, "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_datetime.yaml"}, + {"extensionUriAnchor": 4, "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_aggregate_generic.yaml"}, + {"extensionUriAnchor": 5, "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_aggregate_arithmetic.yaml"} + ], + "extensions": [ + {"extensionFunction": {"extensionUriReference": 1, "functionAnchor": 1, "name": "equal:str_str"}}, + {"extensionFunction": {"extensionUriReference": 2, "functionAnchor": 2, "name": "and:bool"}}, + {"extensionFunction": {"extensionUriReference": 3, "functionAnchor": 3, "name": "date_bin:iday_ts"}}, + {"extensionFunction": {"extensionUriReference": 3, "functionAnchor": 20, "name": "to_timestamp_seconds:i64"}}, + {"extensionFunction": {"extensionUriReference": 4, "functionAnchor": 14, "name": "count:f64"}}, + {"extensionFunction": {"extensionUriReference": 5, "functionAnchor": 10, "name": "sum:f64"}}, + {"extensionFunction": {"extensionUriReference": 5, "functionAnchor": 11, "name": "min:f64"}}, + {"extensionFunction": {"extensionUriReference": 5, "functionAnchor": 12, "name": "max:f64"}}, + {"extensionFunction": {"extensionUriReference": 5, "functionAnchor": 13, "name": "avg:f64"}} + ], + "relations": [{"root": {"input": {"sort": {"input": {"aggregate": {"groupingExpressions": [{"selection": {"directReference": {"structField": {}},"rootReference": {}}},{"selection": {"directReference": {"structField": {"field": 1}},"rootReference": 
{}}}],"groupings": [{"expressionReferences": [0, 1]}],"input": {"aggregate": {"groupingExpressions": [{"selection": {"directReference": {"structField": {"field": 3}},"rootReference": {}}},{"selection": {"directReference": {"structField": {"field": 5}},"rootReference": {}}}],"groupings": [{"expressionReferences": [0, 1]}],"input": {"project": {"expressions": [{"scalarFunction": {"arguments": [{"value": {"literal": {"intervalDayToSecond": {"seconds": 30}}}},{"value": {"scalarFunction": {"arguments": [{"value": {"selection": {"directReference": {"structField": {"field": 1}},"rootReference": {}}}}],"functionReference": 20,"outputType": {"timestamp": {"nullability": "NULLABILITY_NULLABLE"}}}}}],"functionReference": 3,"outputType": {"i64": {"nullability": "NULLABILITY_REQUIRED"}}}}],"input": {"filter": {"condition": {"scalarFunction": {"arguments": [{"value": {"scalarFunction": {"arguments": [{"value": {"selection": {"directReference": {"structField": {}},"rootReference": {}}}},{"value": {"literal": {"string": "cpu.usage"}}}],"functionReference": 1,"outputType": {"bool": {"nullability": "NULLABILITY_REQUIRED"}}}}},{"value": {"scalarFunction": {"arguments": [{"value": {"selection": {"directReference": {"structField": {"field": 4}},"rootReference": {}}}},{"value": {"literal": {"string": "prod"}}}],"functionReference": 1,"outputType": {"bool": {"nullability": "NULLABILITY_REQUIRED"}}}}}],"functionReference": 2,"outputType": {"bool": {"nullability": "NULLABILITY_REQUIRED"}}}},"input": {"read": {"baseSchema": {"names": ["metric_name","timestamp_secs","value","service","env"],"struct": {"nullability": "NULLABILITY_NULLABLE","types": [{"string": {"nullability": "NULLABILITY_NULLABLE"}},{"i64": {"nullability": "NULLABILITY_REQUIRED"}},{"fp64": {"nullability": "NULLABILITY_NULLABLE"}},{"string": {"nullability": "NULLABILITY_NULLABLE"}},{"string": {"nullability": "NULLABILITY_NULLABLE"}}]}},"namedTable": {"names": ["otel-metrics-v0_9"]}}}}}}},"measures": [{"measure": {"arguments": 
[{"value": {"selection": {"directReference": {"structField": {"field": 2}},"rootReference": {}}}}],"functionReference": 12,"invocation": "AGGREGATION_INVOCATION_ALL","outputType": {"fp64": {"nullability": "NULLABILITY_NULLABLE"}},"phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT"}}]}},"measures": [{"measure": {"arguments": [{"value": {"selection": {"directReference": {"structField": {"field": 2}},"rootReference": {}}}}],"functionReference": 13,"invocation": "AGGREGATION_INVOCATION_ALL","outputType": {"fp64": {"nullability": "NULLABILITY_NULLABLE"}},"phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT"}}]}},"sorts": [{"direction": "SORT_DIRECTION_ASC_NULLS_LAST","expr": {"selection": {"directReference": {"structField": {"field": 1}},"rootReference": {}}}},{"direction": "SORT_DIRECTION_ASC_NULLS_LAST","expr": {"selection": {"directReference": {"structField": {}},"rootReference": {}}}}]}},"names": ["service","time_bin","value"]}}]} diff --git a/quickwit/quickwit-proto/build.rs b/quickwit/quickwit-proto/build.rs index 569d9b5315b..2d5afcdcc98 100644 --- a/quickwit/quickwit-proto/build.rs +++ b/quickwit/quickwit-proto/build.rs @@ -206,6 +206,18 @@ fn main() -> Result<(), Box> { &[std::path::PathBuf::from("protos")], )?; + // DataFusion service (Substrait + SQL streaming execution). 
+ let mut prost_config = prost_build::Config::default(); + prost_config.file_descriptor_set_path("src/codegen/quickwit/datafusion_descriptor.bin"); + + tonic_prost_build::configure() + .out_dir("src/codegen/quickwit") + .compile_with_config( + prost_config, + &[std::path::PathBuf::from("protos/quickwit/datafusion.proto")], + &[std::path::PathBuf::from("protos")], + )?; + // Jaeger proto let protos = find_protos("protos/third-party/jaeger"); diff --git a/quickwit/quickwit-proto/protos/quickwit/datafusion.proto b/quickwit/quickwit-proto/protos/quickwit/datafusion.proto new file mode 100644 index 00000000000..35d8ed8e5ed --- /dev/null +++ b/quickwit/quickwit-proto/protos/quickwit/datafusion.proto @@ -0,0 +1,69 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package quickwit.datafusion; + +option java_package = "com.quickwit.datafusion"; + +// Service for executing DataFusion queries over Quickwit data. +// +// This is the OSS entry point for Substrait and SQL execution. +// Pomsky wraps this service inside CloudPremService.SubstraitSearch. +service DataFusionService { + // Execute a Substrait plan and stream results as Arrow IPC batches. + rpc ExecuteSubstrait(ExecuteSubstraitRequest) returns (stream ExecuteSubstraitResponse); + + // Execute one or more SQL statements and stream results as Arrow IPC batches. 
+ // DDL statements (CREATE EXTERNAL TABLE) are executed for side effects; + // the last query statement produces the stream. + rpc ExecuteSql(ExecuteSqlRequest) returns (stream ExecuteSqlResponse); +} + +message ExecuteSubstraitRequest { + // Substrait plan encoded as protobuf bytes (prost::Message::encode). + // Used by Pomsky and other production callers that already hold an encoded plan. + bytes substrait_plan_bytes = 1; + + // Optional per-request session overrides (e.g. target_partitions). + map properties = 2; + + // Substrait plan as proto3 JSON (the format written by DataFusion's + // to_substrait_plan + serde_json::to_string, or the rollup_substrait.json + // format used in integration tests). + // + // Convenience field for dev tooling and grpcurl: pass the JSON string + // directly without encoding to binary protobuf first. + // Exactly one of substrait_plan_bytes or substrait_plan_json must be set. + string substrait_plan_json = 3; +} + +message ExecuteSubstraitResponse { + // One RecordBatch serialized as Arrow IPC stream format. + bytes arrow_ipc_bytes = 1; +} + +message ExecuteSqlRequest { + // One or more semicolon-separated SQL statements. + string sql = 1; + + // Optional per-request session overrides. + map properties = 2; +} + +message ExecuteSqlResponse { + // One RecordBatch serialized as Arrow IPC stream format. + bytes arrow_ipc_bytes = 1; +} diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.datafusion.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.datafusion.rs new file mode 100644 index 00000000000..d95a5903309 --- /dev/null +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.datafusion.rs @@ -0,0 +1,464 @@ +// This file is @generated by prost-build. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ExecuteSubstraitRequest { + /// Substrait plan encoded as protobuf bytes (prost::Message::encode). + /// Used by Pomsky and other production callers that already hold an encoded plan. 
+ #[prost(bytes = "vec", tag = "1")] + pub substrait_plan_bytes: ::prost::alloc::vec::Vec, + /// Optional per-request session overrides (e.g. target_partitions). + #[prost(map = "string, string", tag = "2")] + pub properties: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, + /// Substrait plan as proto3 JSON (the format written by DataFusion's + /// to_substrait_plan + serde_json::to_string, or the rollup_substrait.json + /// format used in integration tests). + /// + /// Convenience field for dev tooling and grpcurl: pass the JSON string + /// directly without encoding to binary protobuf first. + /// Exactly one of substrait_plan_bytes or substrait_plan_json must be set. + #[prost(string, tag = "3")] + pub substrait_plan_json: ::prost::alloc::string::String, +} +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct ExecuteSubstraitResponse { + /// One RecordBatch serialized as Arrow IPC stream format. + #[prost(bytes = "vec", tag = "1")] + pub arrow_ipc_bytes: ::prost::alloc::vec::Vec, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ExecuteSqlRequest { + /// One or more semicolon-separated SQL statements. + #[prost(string, tag = "1")] + pub sql: ::prost::alloc::string::String, + /// Optional per-request session overrides. + #[prost(map = "string, string", tag = "2")] + pub properties: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, +} +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct ExecuteSqlResponse { + /// One RecordBatch serialized as Arrow IPC stream format. + #[prost(bytes = "vec", tag = "1")] + pub arrow_ipc_bytes: ::prost::alloc::vec::Vec, +} +/// Generated client implementations. 
+pub mod data_fusion_service_client { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + use tonic::codegen::http::Uri; + /// Service for executing DataFusion queries over Quickwit data. + /// + /// This is the OSS entry point for Substrait and SQL execution. + /// Pomsky wraps this service inside CloudPremService.SubstraitSearch. + #[derive(Debug, Clone)] + pub struct DataFusionServiceClient { + inner: tonic::client::Grpc, + } + impl DataFusionServiceClient { + /// Attempt to create a new client by connecting to a given endpoint. + pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl DataFusionServiceClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + std::marker::Send + 'static, + ::Error: Into + std::marker::Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> DataFusionServiceClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + , + >>::Error: Into + std::marker::Send + std::marker::Sync, + { + DataFusionServiceClient::new(InterceptedService::new(inner, interceptor)) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. 
+ #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + /// Execute a Substrait plan and stream results as Arrow IPC batches. + pub async fn execute_substrait( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response>, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.datafusion.DataFusionService/ExecuteSubstrait", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.datafusion.DataFusionService", + "ExecuteSubstrait", + ), + ); + self.inner.server_streaming(req, path, codec).await + } + /// Execute one or more SQL statements and stream results as Arrow IPC batches. + /// DDL statements (CREATE EXTERNAL TABLE) are executed for side effects; + /// the last query statement produces the stream. 
+ pub async fn execute_sql( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response>, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.datafusion.DataFusionService/ExecuteSql", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.datafusion.DataFusionService", + "ExecuteSql", + ), + ); + self.inner.server_streaming(req, path, codec).await + } + } +} +/// Generated server implementations. +pub mod data_fusion_service_server { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with DataFusionServiceServer. + #[async_trait] + pub trait DataFusionService: std::marker::Send + std::marker::Sync + 'static { + /// Server streaming response type for the ExecuteSubstrait method. + type ExecuteSubstraitStream: tonic::codegen::tokio_stream::Stream< + Item = std::result::Result< + super::ExecuteSubstraitResponse, + tonic::Status, + >, + > + + std::marker::Send + + 'static; + /// Execute a Substrait plan and stream results as Arrow IPC batches. + async fn execute_substrait( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + /// Server streaming response type for the ExecuteSql method. + type ExecuteSqlStream: tonic::codegen::tokio_stream::Stream< + Item = std::result::Result, + > + + std::marker::Send + + 'static; + /// Execute one or more SQL statements and stream results as Arrow IPC batches. + /// DDL statements (CREATE EXTERNAL TABLE) are executed for side effects; + /// the last query statement produces the stream. 
+ async fn execute_sql( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + } + /// Service for executing DataFusion queries over Quickwit data. + /// + /// This is the OSS entry point for Substrait and SQL execution. + /// Pomsky wraps this service inside CloudPremService.SubstraitSearch. + #[derive(Debug)] + pub struct DataFusionServiceServer { + inner: Arc, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + impl DataFusionServiceServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. 
+ /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> for DataFusionServiceServer + where + T: DataFusionService, + B: Body + std::marker::Send + 'static, + B::Error: Into + std::marker::Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + match req.uri().path() { + "/quickwit.datafusion.DataFusionService/ExecuteSubstrait" => { + #[allow(non_camel_case_types)] + struct ExecuteSubstraitSvc(pub Arc); + impl< + T: DataFusionService, + > tonic::server::ServerStreamingService< + super::ExecuteSubstraitRequest, + > for ExecuteSubstraitSvc { + type Response = super::ExecuteSubstraitResponse; + type ResponseStream = T::ExecuteSubstraitStream; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::execute_substrait(&inner, request) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = ExecuteSubstraitSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = 
grpc.server_streaming(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/quickwit.datafusion.DataFusionService/ExecuteSql" => { + #[allow(non_camel_case_types)] + struct ExecuteSqlSvc(pub Arc); + impl< + T: DataFusionService, + > tonic::server::ServerStreamingService + for ExecuteSqlSvc { + type Response = super::ExecuteSqlResponse; + type ResponseStream = T::ExecuteSqlStream; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::execute_sql(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = ExecuteSqlSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.server_streaming(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => { + Box::pin(async move { + let mut response = http::Response::new( + tonic::body::Body::default(), + ); + let headers = response.headers_mut(); + headers + .insert( + tonic::Status::GRPC_STATUS, + (tonic::Code::Unimplemented as i32).into(), + ); + headers + .insert( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ); + Ok(response) + }) + } + } + } + } + impl Clone for DataFusionServiceServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + 
send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + /// Generated gRPC service name + pub const SERVICE_NAME: &str = "quickwit.datafusion.DataFusionService"; + impl tonic::server::NamedService for DataFusionServiceServer { + const NAME: &'static str = SERVICE_NAME; + } +} diff --git a/quickwit/quickwit-proto/src/datafusion/mod.rs b/quickwit/quickwit-proto/src/datafusion/mod.rs new file mode 100644 index 00000000000..2af07360986 --- /dev/null +++ b/quickwit/quickwit-proto/src/datafusion/mod.rs @@ -0,0 +1,18 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +include!("../codegen/quickwit/quickwit.datafusion.rs"); + +pub const DATAFUSION_FILE_DESCRIPTOR_SET: &[u8] = + include_bytes!("../codegen/quickwit/datafusion_descriptor.bin"); diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index dbe850b55b7..6337c06a02a 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -30,6 +30,7 @@ pub mod cluster; pub mod control_plane; pub use bytes; pub use tonic; +pub mod datafusion; pub mod developer; pub mod error; mod getters; diff --git a/quickwit/quickwit-serve/Cargo.toml b/quickwit/quickwit-serve/Cargo.toml index 2721aa719f3..2a57f09e700 100644 --- a/quickwit/quickwit-serve/Cargo.toml +++ b/quickwit/quickwit-serve/Cargo.toml @@ -12,9 +12,12 @@ license.workspace = true [dependencies] anyhow = { workspace = true } +arrow = { workspace = true } async-trait = { workspace = true } +datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed" } base64 = { workspace = true } bytes = { workspace = true } +hyper = { workspace = true } bytesize = { workspace = true } elasticsearch-dsl = "0.4" flate2 = { workspace = true } @@ -62,6 +65,7 @@ quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } quickwit-config = { workspace = true } quickwit-control-plane = { workspace = true } +quickwit-datafusion = { workspace = true } quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } @@ -70,6 +74,7 @@ quickwit-jaeger = { workspace = true } quickwit-janitor = { workspace = true } quickwit-metastore = { workspace = true } quickwit-opentelemetry = { workspace = true } +quickwit-parquet-engine = { workspace = true } quickwit-proto = { workspace = true } quickwit-query = { workspace = true } quickwit-search = { workspace = true } diff --git a/quickwit/quickwit-serve/src/datafusion_api/grpc_handler.rs 
b/quickwit/quickwit-serve/src/datafusion_api/grpc_handler.rs new file mode 100644 index 00000000000..e87647e440e --- /dev/null +++ b/quickwit/quickwit-serve/src/datafusion_api/grpc_handler.rs @@ -0,0 +1,174 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! gRPC handler that bridges [`quickwit_datafusion::DataFusionService`] to the +//! tonic-generated `DataFusionService` server trait. +//! +//! Each streaming response batch is encoded as Arrow IPC (stream format) using +//! [`arrow::ipc::writer::StreamWriter`] and returned as raw bytes in +//! `ExecuteSubstraitResponse::arrow_ipc_bytes` / +//! `ExecuteSqlResponse::arrow_ipc_bytes`. +//! +//! ## Error mapping +//! +//! `datafusion::error::DataFusionError` is mapped to `tonic::Status`: +//! - Plan / Schema errors → `InvalidArgument` +//! - I/O errors → `Internal` +//! - Everything else → `Internal` + +use std::io::Cursor; +use std::sync::Arc; + +use arrow::array::RecordBatch; +use arrow::ipc::writer::StreamWriter; +use futures::StreamExt; +use quickwit_datafusion::DataFusionService; +use quickwit_proto::datafusion::{ + ExecuteSqlRequest, ExecuteSqlResponse, ExecuteSubstraitRequest, ExecuteSubstraitResponse, + data_fusion_service_server, +}; +use quickwit_proto::tonic; +use tokio_stream::wrappers::ReceiverStream; +use tracing::warn; + +/// Converts a DataFusion error (represented as any `std::error::Error`) to an +/// appropriate `tonic::Status`. 
+/// +/// Plan / schema errors are surfaced as `InvalidArgument`; everything else as +/// `Internal`. The distinction is made by inspecting the `Display` output +/// since we avoid a hard dependency on the `datafusion` crate in quickwit-serve. +fn df_error_to_status(err: impl std::fmt::Display) -> tonic::Status { + let msg = err.to_string(); + // DataFusion plan/schema errors start with "Error during planning:" or + // "Schema error:". Map those to invalid argument; everything else is internal. + if msg.starts_with("Error during planning") || msg.starts_with("Schema error") { + tonic::Status::invalid_argument(msg) + } else { + tonic::Status::internal(msg) + } +} + +/// Serialize a single `RecordBatch` to Arrow IPC stream format bytes. +fn batch_to_ipc_bytes(batch: &RecordBatch) -> Result<Vec<u8>, tonic::Status> { + let mut buf = Vec::with_capacity(batch.get_array_memory_size()); + let mut writer = StreamWriter::try_new(Cursor::new(&mut buf), batch.schema_ref()) + .map_err(|e| tonic::Status::internal(format!("failed to create Arrow IPC writer: {e}")))?; + writer + .write(batch) + .map_err(|e| tonic::Status::internal(format!("failed to write Arrow IPC batch: {e}")))?; + writer + .finish() + .map_err(|e| tonic::Status::internal(format!("failed to finish Arrow IPC stream: {e}")))?; + drop(writer); + Ok(buf) +} + +/// tonic gRPC adapter that wraps [`DataFusionService`]. +/// +/// Implements the tonic-generated `DataFusionService` trait and converts the +/// streaming `RecordBatch` results to Arrow IPC bytes.
+pub struct DataFusionServiceGrpcImpl { + service: Arc<DataFusionService>, +} + +impl DataFusionServiceGrpcImpl { + pub fn new(service: DataFusionService) -> Self { + Self { + service: Arc::new(service), + } + } +} + +#[async_trait::async_trait] +impl data_fusion_service_server::DataFusionService for DataFusionServiceGrpcImpl { + type ExecuteSubstraitStream = ReceiverStream<Result<ExecuteSubstraitResponse, tonic::Status>>; + type ExecuteSqlStream = ReceiverStream<Result<ExecuteSqlResponse, tonic::Status>>; + + async fn execute_substrait( + &self, + request: tonic::Request<ExecuteSubstraitRequest>, + ) -> Result<tonic::Response<Self::ExecuteSubstraitStream>, tonic::Status> { + let req = request.into_inner(); + let service = Arc::clone(&self.service); + + // Route to the appropriate DataFusionService method: + // - substrait_plan_bytes: production path (Pomsky, pre-encoded protobuf) + // - substrait_plan_json: dev/tooling path (grpcurl, rollup JSON files) + let mut stream = if !req.substrait_plan_bytes.is_empty() { + service + .execute_substrait(&req.substrait_plan_bytes) + .await + .map_err(df_error_to_status)? + } else if !req.substrait_plan_json.is_empty() { + service + .execute_substrait_json(&req.substrait_plan_json) + .await + .map_err(df_error_to_status)?
+ } else { + return Err(tonic::Status::invalid_argument( + "either substrait_plan_bytes or substrait_plan_json must be set", + )); + }; + + let (tx, rx) = tokio::sync::mpsc::channel(32); + tokio::spawn(async move { + while let Some(result) = stream.next().await { + let item = match result { + Ok(batch) => batch_to_ipc_bytes(&batch) + .map(|ipc_bytes| ExecuteSubstraitResponse { arrow_ipc_bytes: ipc_bytes }), + Err(err) => Err(tonic::Status::internal(format!("stream error: {err}"))), + }; + if tx.send(item).await.is_err() { + // receiver dropped — client disconnected + break; + } + } + }); + + Ok(tonic::Response::new(ReceiverStream::new(rx))) + } + + async fn execute_sql( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let service = Arc::clone(&self.service); + + let mut stream = service + .execute_sql(&req.sql) + .await + .map_err(|err| { + warn!(error = %err, "DataFusion SQL execution error"); + df_error_to_status(err) + })?; + + let (tx, rx) = tokio::sync::mpsc::channel(32); + tokio::spawn(async move { + while let Some(result) = stream.next().await { + let item = match result { + Ok(batch) => batch_to_ipc_bytes(&batch) + .map(|ipc_bytes| ExecuteSqlResponse { arrow_ipc_bytes: ipc_bytes }), + Err(err) => Err(tonic::Status::internal(format!("stream error: {err}"))), + }; + if tx.send(item).await.is_err() { + // receiver dropped — client disconnected + break; + } + } + }); + + Ok(tonic::Response::new(ReceiverStream::new(rx))) + } +} diff --git a/quickwit/quickwit-serve/src/datafusion_api/mod.rs b/quickwit/quickwit-serve/src/datafusion_api/mod.rs new file mode 100644 index 00000000000..6e332642a22 --- /dev/null +++ b/quickwit/quickwit-serve/src/datafusion_api/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod grpc_handler; +pub use grpc_handler::DataFusionServiceGrpcImpl; diff --git a/quickwit/quickwit-serve/src/grpc.rs b/quickwit/quickwit-serve/src/grpc.rs index 698c9e07d71..a0e2f84e510 100644 --- a/quickwit/quickwit-serve/src/grpc.rs +++ b/quickwit/quickwit-serve/src/grpc.rs @@ -25,7 +25,9 @@ use quickwit_proto::indexing::IndexingServiceClient; use quickwit_proto::jaeger::storage::v1::span_reader_plugin_server::SpanReaderPluginServer; use quickwit_proto::jaeger::storage::v2::trace_reader_server::TraceReaderServer; use quickwit_proto::opentelemetry::proto::collector::logs::v1::logs_service_server::LogsServiceServer; +use quickwit_proto::opentelemetry::proto::collector::metrics::v1::metrics_service_server::MetricsServiceServer; use quickwit_proto::opentelemetry::proto::collector::trace::v1::trace_service_server::TraceServiceServer; +use quickwit_proto::datafusion::data_fusion_service_server::DataFusionServiceServer; use quickwit_proto::search::search_service_server::SearchServiceServer; use quickwit_proto::tonic::codegen::CompressionEncoding; use quickwit_proto::tonic::transport::server::TcpIncoming; @@ -37,6 +39,7 @@ use tonic_reflection::pb::v1::FILE_DESCRIPTOR_SET as REFLECTION_FILE_DESCRIPTOR_ use tonic_reflection::server::v1::{ServerReflection, ServerReflectionServer}; use tracing::*; +use crate::datafusion_api::DataFusionServiceGrpcImpl; use crate::developer_api::DeveloperApiServer; use crate::search_api::GrpcSearchAdapter; use crate::{INDEXING_GRPC_SERVER_METRICS_LAYER, QuickwitServices}; @@ -158,6 +161,18 @@ pub(crate) async fn 
start_grpc_server( None }; // Mount gRPC OpenTelemetry OTLP services if present. + let otlp_metrics_grpc_service = + if let Some(otlp_metrics_service) = services.otlp_metrics_service_opt.clone() { + enabled_grpc_services.insert("otlp-metrics"); + let metrics_service = MetricsServiceServer::new(otlp_metrics_service) + .accept_compressed(CompressionEncoding::Gzip) + .accept_compressed(CompressionEncoding::Zstd) + .max_decoding_message_size(grpc_config.max_message_size.0 as usize) + .max_encoding_message_size(grpc_config.max_message_size.0 as usize); + Some(metrics_service) + } else { + None + }; let otlp_trace_grpc_service = if let Some(otlp_traces_service) = services.otlp_traces_service_opt.clone() { enabled_grpc_services.insert("otlp-traces"); @@ -226,6 +241,11 @@ pub(crate) async fn start_grpc_server( DeveloperServiceClient::new(developer_service) .as_grpc_service(DeveloperApiServer::MAX_GRPC_MESSAGE_SIZE) }; + // DataFusion service descriptor must be pushed before build_reflection_service. + if services.datafusion_session_builder.is_some() { + file_descriptor_sets.push(quickwit_proto::datafusion::DATAFUSION_FILE_DESCRIPTOR_SET); + } + enabled_grpc_services.insert("health"); file_descriptor_sets.push(HEALTH_FILE_DESCRIPTOR_SET); @@ -233,6 +253,36 @@ pub(crate) async fn start_grpc_server( file_descriptor_sets.push(REFLECTION_FILE_DESCRIPTOR_SET); let reflection_service = build_reflection_service(&file_descriptor_sets)?; + // Mount the DataFusion distributed worker gRPC service. + let datafusion_worker_service = + if let Some(ref session_builder) = services.datafusion_session_builder { + enabled_grpc_services.insert("datafusion-worker"); + let worker = quickwit_datafusion::build_quickwit_worker( + session_builder.sources(), + Arc::clone(session_builder.runtime()), + ); + Some(worker.into_worker_server()) + } else { + None + }; + + // Mount DataFusionService for OSS query execution (Substrait + SQL streaming). 
+ let datafusion_grpc_service = if let Some(ref session_builder) = + services.datafusion_session_builder + { + enabled_grpc_services.insert("datafusion"); + + let service = + quickwit_datafusion::DataFusionService::new(Arc::clone(session_builder)); + Some( + DataFusionServiceServer::new(DataFusionServiceGrpcImpl::new(service)) + .max_decoding_message_size(grpc_config.max_message_size.0 as usize) + .max_encoding_message_size(grpc_config.max_message_size.0 as usize), + ) + } else { + None + }; + let server_router = server .add_service(cluster_grpc_service) .add_service(developer_grpc_service) @@ -247,8 +297,11 @@ pub(crate) async fn start_grpc_server( .add_optional_service(jaeger_v2_grpc_service) .add_optional_service(metastore_grpc_service) .add_optional_service(otlp_log_grpc_service) + .add_optional_service(otlp_metrics_grpc_service) .add_optional_service(otlp_trace_grpc_service) - .add_optional_service(search_grpc_service); + .add_optional_service(search_grpc_service) + .add_optional_service(datafusion_grpc_service) + .add_optional_service(datafusion_worker_service); let grpc_listen_addr = tcp_listener.local_addr()?; info!( diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index cc261cec7a2..afbc3c7351c 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -16,6 +16,7 @@ mod build_info; mod cluster_api; +mod datafusion_api; mod decompression; mod delete_task_api; mod developer_api; @@ -92,7 +93,7 @@ use quickwit_janitor::{JanitorService, start_janitor_service}; use quickwit_metastore::{ ControlPlaneMetastore, ListIndexesMetadataResponseExt, MetastoreResolver, }; -use quickwit_opentelemetry::otlp::{OtlpGrpcLogsService, OtlpGrpcTracesService}; +use quickwit_opentelemetry::otlp::{OtlpGrpcLogsService, OtlpGrpcMetricsService, OtlpGrpcTracesService}; use quickwit_proto::control_plane::ControlPlaneServiceClient; use quickwit_proto::indexing::{IndexingServiceClient, ShardPositionsUpdate}; use 
quickwit_proto::ingest::ingester::{ @@ -198,6 +199,7 @@ struct QuickwitServices { pub janitor_service_opt: Option>, pub jaeger_service_opt: Option, pub otlp_logs_service_opt: Option, + pub otlp_metrics_service_opt: Option, pub otlp_traces_service_opt: Option, /// We do have a search service even on nodes that are not running `search`. /// It is only used to serve the rest API calls and will only execute @@ -206,6 +208,10 @@ struct QuickwitServices { pub env_filter_reload_fn: EnvFilterReloadFn, + /// Generic DataFusion session builder (present if searcher role is active). + /// Data sources registered at startup; Pomsky wraps this in SubstraitSearch. + pub datafusion_session_builder: Option>, + /// The control plane listens to various events. /// We must maintain a reference to the subscription handles to continue receiving /// notifications. Otherwise, the subscriptions are dropped. @@ -604,10 +610,14 @@ pub async fn serve_quickwit( let otel_traces_index_config = OtlpGrpcTracesService::index_config(&node_config.default_index_root_uri) .context("failed to load OTEL traces index config")?; + let otel_metrics_index_config = + OtlpGrpcMetricsService::index_config(&node_config.default_index_root_uri) + .context("failed to load OTEL metrics index config")?; for (index_name, index_config) in [ ("OTEL logs", otel_logs_index_config), ("OTEL traces", otel_traces_index_config), + ("OTEL metrics", otel_metrics_index_config), ] { match index_manager.create_index(index_config, false).await { Ok(_) @@ -666,7 +676,7 @@ pub async fn serve_quickwit( )) }; - let (search_job_placer, search_service) = setup_searcher( + let (search_job_placer, search_service, searcher_pool) = setup_searcher( &node_config, cluster.change_stream(), // search remains available without a control plane because not all @@ -678,14 +688,39 @@ pub async fn serve_quickwit( .await .context("failed to start searcher service")?; + // Build the generic DataFusion session builder if this node is a searcher. 
+ // Data sources are registered here; Pomsky wraps build_session() in its + // CloudPrem SubstraitSearch handler — no Pomsky-specific code needed here. + let datafusion_session_builder = if node_config + .is_service_enabled(QuickwitService::Searcher) + && quickwit_common::get_bool_from_env("QW_ENABLE_DATAFUSION_ENDPOINT", false) + { + let metrics_source = Arc::new( + quickwit_datafusion::sources::metrics::MetricsDataSource::new( + metastore_through_control_plane.clone(), + storage_resolver.clone(), + ), + ); + let resolver = quickwit_datafusion::QuickwitWorkerResolver::new(searcher_pool) + .with_tls(node_config.grpc_config.tls.is_some()); + let builder = quickwit_datafusion::DataFusionSessionBuilder::new() + .with_source(metrics_source) + .with_worker_resolver(resolver); + Some(Arc::new(builder)) + } else { + None + }; + // The control plane listens for local shards updates to learn about each shard's ingestion - // throughput. - let local_shards_update_listener_handle_opt = - if node_config.is_service_enabled(QuickwitService::ControlPlane) { - Some(setup_local_shards_update_listener(cluster.clone(), event_broker.clone()).await) - } else { - None - }; + // throughput. Ingesters (routers) do so to update their shard table. 
+ let local_shards_update_listener_handle_opt = if node_config + .is_service_enabled(QuickwitService::ControlPlane) + || node_config.is_service_enabled(QuickwitService::Indexer) + { + Some(setup_local_shards_update_listener(cluster.clone(), event_broker.clone()).await) + } else { + None + }; let report_splits_subscription_handle_opt = // DISCLAIMER: This is quirky here: We base our decision to forward the split report depending @@ -734,6 +769,14 @@ pub async fn serve_quickwit( None }; + let otlp_metrics_service_opt = if node_config.is_service_enabled(QuickwitService::Indexer) + && node_config.indexer_config.enable_otlp_endpoint + { + Some(OtlpGrpcMetricsService::new(ingest_router_service.clone())) + } else { + None + }; + let otlp_traces_service_opt = if node_config.is_service_enabled(QuickwitService::Indexer) && node_config.indexer_config.enable_otlp_endpoint { @@ -765,9 +808,11 @@ pub async fn serve_quickwit( janitor_service_opt, jaeger_service_opt, otlp_logs_service_opt, + otlp_metrics_service_opt, otlp_traces_service_opt, search_service, env_filter_reload_fn, + datafusion_session_builder, }); // Setup and start gRPC server. 
let (grpc_readiness_trigger_tx, grpc_readiness_signal_rx) = oneshot::channel::<()>(); @@ -1111,7 +1156,7 @@ async fn setup_searcher( metastore: MetastoreServiceClient, storage_resolver: StorageResolver, searcher_context: Arc, -) -> anyhow::Result<(SearchJobPlacer, Arc)> { +) -> anyhow::Result<(SearchJobPlacer, Arc, SearcherPool)> { let searcher_pool = SearcherPool::default(); let search_job_placer = SearchJobPlacer::new(searcher_pool.clone()); @@ -1168,7 +1213,7 @@ async fn setup_searcher( }) }); searcher_pool.listen_for_changes(searcher_change_stream); - Ok((search_job_placer, search_service)) + Ok((search_job_placer, search_service, searcher_pool)) } #[allow(clippy::too_many_arguments)] @@ -1650,7 +1695,7 @@ mod tests { let metastore = metastore_for_test(); let (change_stream, change_stream_tx) = ClusterChangeStream::new_unbounded(); let storage_resolver = StorageResolver::unconfigured(); - let (search_job_placer, _searcher_service) = setup_searcher( + let (search_job_placer, _searcher_service, _searcher_pool) = setup_searcher( &node_config, change_stream, metastore,