From d44e8252d8fe516488ac508630f8011ff4d237c9 Mon Sep 17 00:00:00 2001
From: "cong.xie"
Date: Tue, 7 Apr 2026 14:56:39 -0400
Subject: [PATCH 1/2] feat: auto-apply case-insensitive regex for lowercasing
 tokenizers

When a field's tokenizer lowercases indexed terms (e.g. "default",
"raw_lowercase", "lowercase"), regex queries now automatically prepend (?i)
to match case-insensitively. Without this, patterns like `.*ECONNREFUSED.*`
would never match because the inverted index only contains lowercase tokens.

Changes:
- `to_field_and_regex` now returns the tokenizer name as a 4th element
- `build_tantivy_ast_impl` and warmup `visit_regex` prepend (?i) when the
  tokenizer does lowercasing and the regex doesn't already have it
- `TokenizerManager::tokenizer_does_lowercasing` public helper added
- Unit tests for case-insensitive behavior, tokenizer detection, and edge
  cases (already-(?i), raw tokenizer, JSON fields)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../quickwit-doc-mapper/src/query_builder.rs |  15 +-
 .../src/query_ast/regex_query.rs             | 163 ++++++++++++++++--
 .../src/tokenizers/tokenizer_manager.rs      |  10 ++
 3 files changed, 173 insertions(+), 15 deletions(-)

diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs
index 5900b577795..3798c8bac4d 100644
--- a/quickwit/quickwit-doc-mapper/src/query_builder.rs
+++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs
@@ -394,12 +394,25 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
     }
 
     fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> {
-        let (field, path, regex) = match regex_query.to_field_and_regex(self.schema) {
+        let (field, path, regex, tokenizer_name) = match regex_query.to_field_and_regex(self.schema)
+        {
             Ok(res) => res,
             /* the query will be nullified when casting to a tantivy ast */
             Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()),
             Err(e) => return Err(e),
         };
+        // The warmup regex must match what build_tantivy_ast_impl produces.
+        // If the field's tokenizer lowercases indexed terms, the search will
+        // use (?i) to match case-insensitively, so the warmup must too.
+        let does_lowercasing = match tokenizer_name.as_deref() {
+            Some(name) => self.tokenizer_manager.tokenizer_does_lowercasing(name),
+            None => false,
+        };
+        let regex = if !regex.starts_with("(?i)") && does_lowercasing {
+            format!("(?i){regex}")
+        } else {
+            regex
+        };
         self.add_automaton(field, Automaton::Regex(path, regex));
         Ok(())
     }
diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs
index 6e1ab208dbf..adb38751b32 100644
--- a/quickwit/quickwit-query/src/query_ast/regex_query.rs
+++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs
@@ -48,10 +48,11 @@ impl RegexQuery {
 }
 
 impl RegexQuery {
+    /// Returns (field, optional json_path prefix, regex pattern, optional tokenizer name).
     pub fn to_field_and_regex(
         &self,
         schema: &TantivySchema,
-    ) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
+    ) -> Result<(Field, Option<Vec<u8>>, String, Option<String>), InvalidQuery> {
         let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema)
         else {
             return Err(InvalidQuery::FieldDoesNotExist {
@@ -62,22 +63,25 @@ impl RegexQuery {
         match field_type {
             FieldType::Str(text_options) => {
-                text_options.get_indexing_options().ok_or_else(|| {
+                let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| {
                     InvalidQuery::SchemaError(format!(
                         "field {} is not full-text searchable",
                         field_entry.name()
                     ))
                 })?;
+                let tokenizer_name = text_field_indexing.tokenizer().to_string();
 
-                Ok((field, None, self.regex.to_string()))
+                Ok((field, None, self.regex.to_string(), Some(tokenizer_name)))
             }
             FieldType::JsonObject(json_options) => {
-                json_options.get_text_indexing_options().ok_or_else(|| {
-                    InvalidQuery::SchemaError(format!(
-                        "field {} is not full-text searchable",
-                        field_entry.name()
-                    ))
-                })?;
+                let text_field_indexing =
+                    json_options.get_text_indexing_options().ok_or_else(|| {
+                        InvalidQuery::SchemaError(format!(
+                            "field {} is not full-text searchable",
+                            field_entry.name()
+                        ))
+                    })?;
+                let tokenizer_name = text_field_indexing.tokenizer().to_string();
 
                 let mut term_for_path = Term::from_field_json_path(
                     field,
@@ -90,7 +94,12 @@ impl RegexQuery {
                 // We skip the 1st byte which is a marker to tell this is json. This isn't present
                 // in the dictionary
                 let byte_path_prefix = value.as_serialized()[1..].to_owned();
-                Ok((field, Some(byte_path_prefix), self.regex.to_string()))
+                Ok((
+                    field,
+                    Some(byte_path_prefix),
+                    self.regex.to_string(),
+                    Some(tokenizer_name),
+                ))
             }
             _ => Err(InvalidQuery::SchemaError(
                 "trying to run a regex query on a non-text field".to_string(),
@@ -104,7 +113,23 @@ impl BuildTantivyAst for RegexQuery {
         &self,
         context: &BuildTantivyAstContext,
     ) -> Result<TantivyQueryAst, InvalidQuery> {
-        let (field, path, regex) = self.to_field_and_regex(context.schema)?;
+        let (field, path, regex, tokenizer_name) = self.to_field_and_regex(context.schema)?;
+
+        // If the field's tokenizer lowercases indexed terms (e.g. "datadog",
+        // "raw_lowercase", "default") and the regex doesn't already contain a
+        // (?i) flag, automatically make the match case-insensitive. Without
+        // this, an upstream regex like `.*ECONNREFUSED.*` would never match
+        // because the inverted index only contains lowercase tokens.
+        let does_lowercasing = match tokenizer_name.as_deref() {
+            Some(name) => context.tokenizer_manager.tokenizer_does_lowercasing(name),
+            None => false,
+        };
+        let regex = if !regex.starts_with("(?i)") && does_lowercasing {
+            format!("(?i){regex}")
+        } else {
+            regex
+        };
+
         let regex = tantivy_fst::Regex::new(&regex).context("failed to parse regex")?;
         let regex_automaton_with_path = JsonPathPrefix {
             prefix: path.unwrap_or_default(),
@@ -264,7 +289,7 @@ mod tests {
             field: "field".to_string(),
             regex: "abc.*xyz".to_string(),
         };
-        let (field, path, regex) = query.to_field_and_regex(&schema).unwrap();
+        let (field, path, regex, _tokenizer) = query.to_field_and_regex(&schema).unwrap();
         assert_eq!(field, schema.get_field("field").unwrap());
         assert!(path.is_none());
         assert_eq!(regex, query.regex);
@@ -280,7 +305,7 @@ mod tests {
             field: "field.sub.field".to_string(),
             regex: "abc.*xyz".to_string(),
         };
-        let (field, path, regex) = query.to_field_and_regex(&schema).unwrap();
+        let (field, path, regex, _tokenizer) = query.to_field_and_regex(&schema).unwrap();
         assert_eq!(field, schema.get_field("field").unwrap());
         assert_eq!(path.unwrap(), b"sub\x01field\0s");
         assert_eq!(regex, query.regex);
@@ -290,12 +315,122 @@ mod tests {
             field: "field".to_string(),
             regex: "abc.*xyz".to_string(),
         };
-        let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap();
+        let (field, path, regex, _tokenizer) =
+            query_empty_path.to_field_and_regex(&schema).unwrap();
         assert_eq!(field, schema.get_field("field").unwrap());
         assert_eq!(path.unwrap(), b"\0s");
         assert_eq!(regex, query_empty_path.regex);
     }
 
+    #[test]
+    fn test_tokenizer_does_lowercasing() {
+        let tokenizer_manager = crate::tokenizers::create_default_quickwit_tokenizer_manager();
+
+        assert!(tokenizer_manager.tokenizer_does_lowercasing("raw_lowercase"));
+        assert!(tokenizer_manager.tokenizer_does_lowercasing("default"));
+        assert!(tokenizer_manager.tokenizer_does_lowercasing("lowercase"));
+        assert!(!tokenizer_manager.tokenizer_does_lowercasing("raw"));
+        assert!(!tokenizer_manager.tokenizer_does_lowercasing("nonexistent"));
+    }
+
+    #[test]
+    fn test_regex_query_returns_tokenizer_name() {
+        let mut schema_builder = TantivySchema::builder();
+        schema_builder.add_text_field("field", TEXT);
+        let schema = schema_builder.build();
+
+        let query = RegexQuery {
+            field: "field".to_string(),
+            regex: "abc.*xyz".to_string(),
+        };
+        let (_field, _path, _regex, tokenizer_name) = query.to_field_and_regex(&schema).unwrap();
+        // TEXT uses the "default" tokenizer
+        assert_eq!(tokenizer_name.as_deref(), Some("default"));
+    }
+
+    #[test]
+    fn test_regex_case_insensitive_with_lowercasing_tokenizer() {
+        use super::BuildTantivyAstContext;
+        use crate::query_ast::BuildTantivyAst;
+
+        let mut schema_builder = TantivySchema::builder();
+        // TEXT uses the "default" tokenizer which lowercases
+        schema_builder.add_text_field("field", TEXT);
+        let schema = schema_builder.build();
+
+        let context = BuildTantivyAstContext::for_test(&schema);
+
+        let query = RegexQuery {
+            field: "field".to_string(),
+            regex: ".*ECONNREFUSED.*".to_string(),
+        };
+
+        // The query should succeed (regex is valid with (?i) prepended)
+        let result = query.build_tantivy_ast_impl(&context);
+        assert!(result.is_ok(), "regex query should build successfully");
+    }
+
+    #[test]
+    fn test_regex_already_case_insensitive_not_doubled() {
+        use super::BuildTantivyAstContext;
+        use crate::query_ast::BuildTantivyAst;
+
+        let mut schema_builder = TantivySchema::builder();
+        schema_builder.add_text_field("field", TEXT);
+        let schema = schema_builder.build();
+
+        let context = BuildTantivyAstContext::for_test(&schema);
+
+        // Already has (?i), should not be doubled
+        let query = RegexQuery {
+            field: "field".to_string(),
+            regex: "(?i).*ECONNREFUSED.*".to_string(),
+        };
+
+        let result = query.build_tantivy_ast_impl(&context);
+        assert!(
+            result.is_ok(),
+            "regex query with existing (?i) should build successfully"
+        );
+    }
+
+    #[test]
+    fn test_regex_no_case_insensitive_with_raw_tokenizer() {
+        use tantivy::schema::{TextFieldIndexing, TextOptions};
+
+        let mut schema_builder = TantivySchema::builder();
+        // Use "raw" tokenizer which does not lowercase
+        let text_options = TextOptions::default()
+            .set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw"));
+        schema_builder.add_text_field("raw_field", text_options);
+        let schema = schema_builder.build();
+
+        let query = RegexQuery {
+            field: "raw_field".to_string(),
+            regex: "abc.*xyz".to_string(),
+        };
+        let (_field, _path, regex, tokenizer_name) = query.to_field_and_regex(&schema).unwrap();
+        assert_eq!(tokenizer_name.as_deref(), Some("raw"));
+        // The regex should NOT have (?i) since raw doesn't lowercase
+        assert_eq!(regex, "abc.*xyz");
+    }
+
+    #[test]
+    fn test_regex_json_field_returns_tokenizer_name() {
+        let mut schema_builder = TantivySchema::builder();
+        schema_builder.add_json_field("field", TEXT);
+        let schema = schema_builder.build();
+
+        let query = RegexQuery {
+            field: "field.key".to_string(),
+            regex: "abc".to_string(),
+        };
+        let (_field, path, _regex, tokenizer_name) = query.to_field_and_regex(&schema).unwrap();
+        assert!(path.is_some());
+        // JSON field with TEXT also uses "default" tokenizer
+        assert_eq!(tokenizer_name.as_deref(), Some("default"));
+    }
+
     #[test]
     fn test_json_prefix_automaton_empty_path() {
         let regex = Arc::new(Regex::new("e(f|g.*)").unwrap());
diff --git a/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs b/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs
index faf2d72c4de..fc0488ad2ba 100644
--- a/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs
+++ b/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs
@@ -91,6 +91,16 @@ impl TokenizerManager {
         self.get_tokenizer(analyzer)
     }
 
+    /// Returns true if the given tokenizer lowercases its output.
+    pub fn tokenizer_does_lowercasing(&self, tokenizer_name: &str) -> bool {
+        self.is_lowercaser
+            .read()
+            .unwrap()
+            .get(tokenizer_name)
+            .copied()
+            .unwrap_or(false)
+    }
+
     /// Get the inner TokenizerManager
     pub fn tantivy_manager(&self) -> &TantivyTokenizerManager {
         &self.inner

From d6a84604d3d8f5baaa1bb63f63ed881aec961d30 Mon Sep 17 00:00:00 2001
From: "cong.xie"
Date: Tue, 7 Apr 2026 17:03:43 -0400
Subject: [PATCH 2/2] fix: extract ResolvedRegex struct to fix clippy
 type_complexity

Replace the 4-element tuple return from `to_field_and_regex` with a named
`ResolvedRegex` struct to satisfy clippy::type_complexity which is promoted
to a hard error via -D warnings in CI.
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../quickwit-doc-mapper/src/query_builder.rs | 13 ++-
 quickwit/quickwit-query/src/query_ast/mod.rs |  2 +-
 .../src/query_ast/regex_query.rs             | 88 +++++++++++--------
 3 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs
index 3798c8bac4d..8a40463d18b 100644
--- a/quickwit/quickwit-doc-mapper/src/query_builder.rs
+++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs
@@ -394,8 +394,7 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
     }
 
     fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> {
-        let (field, path, regex, tokenizer_name) = match regex_query.to_field_and_regex(self.schema)
-        {
+        let resolved = match regex_query.to_field_and_regex(self.schema) {
             Ok(res) => res,
             /* the query will be nullified when casting to a tantivy ast */
             Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()),
             Err(e) => return Err(e),
@@ -404,16 +403,16 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
         // The warmup regex must match what build_tantivy_ast_impl produces.
         // If the field's tokenizer lowercases indexed terms, the search will
         // use (?i) to match case-insensitively, so the warmup must too.
-        let does_lowercasing = match tokenizer_name.as_deref() {
+        let does_lowercasing = match resolved.tokenizer_name.as_deref() {
             Some(name) => self.tokenizer_manager.tokenizer_does_lowercasing(name),
             None => false,
         };
-        let regex = if !regex.starts_with("(?i)") && does_lowercasing {
-            format!("(?i){regex}")
+        let regex = if !resolved.regex.starts_with("(?i)") && does_lowercasing {
+            format!("(?i){}", resolved.regex)
         } else {
-            regex
+            resolved.regex
         };
-        self.add_automaton(field, Automaton::Regex(path, regex));
+        self.add_automaton(resolved.field, Automaton::Regex(resolved.json_path, regex));
         Ok(())
     }
 }
diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs
index d51e36eb93a..c0dfae9fd88 100644
--- a/quickwit/quickwit-query/src/query_ast/mod.rs
+++ b/quickwit/quickwit-query/src/query_ast/mod.rs
@@ -39,7 +39,7 @@ pub use field_presence::FieldPresenceQuery;
 pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery};
 pub use phrase_prefix_query::PhrasePrefixQuery;
 pub use range_query::RangeQuery;
-pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery};
+pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery, ResolvedRegex};
 use tantivy_query_ast::TantivyQueryAst;
 pub use term_query::TermQuery;
 pub use term_set_query::TermSetQuery;
diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs
index adb38751b32..8603af11e36 100644
--- a/quickwit/quickwit-query/src/query_ast/regex_query.rs
+++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs
@@ -24,6 +24,14 @@ use super::{BuildTantivyAst, BuildTantivyAstContext, QueryAst};
 use crate::query_ast::TantivyQueryAst;
 use crate::{InvalidQuery, find_field_or_hit_dynamic};
 
+/// Result of resolving a `RegexQuery` against a schema.
+pub struct ResolvedRegex {
+    pub field: Field,
+    pub json_path: Option<Vec<u8>>,
+    pub regex: String,
+    pub tokenizer_name: Option<String>,
+}
+
 /// A Regex query
 #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
 pub struct RegexQuery {
@@ -48,11 +56,11 @@ impl RegexQuery {
 }
 
 impl RegexQuery {
-    /// Returns (field, optional json_path prefix, regex pattern, optional tokenizer name).
+    /// Resolves this regex query against the given schema.
     pub fn to_field_and_regex(
         &self,
         schema: &TantivySchema,
-    ) -> Result<(Field, Option<Vec<u8>>, String, Option<String>), InvalidQuery> {
+    ) -> Result<ResolvedRegex, InvalidQuery> {
         let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema)
         else {
             return Err(InvalidQuery::FieldDoesNotExist {
@@ -71,7 +79,12 @@ impl RegexQuery {
                 })?;
                 let tokenizer_name = text_field_indexing.tokenizer().to_string();
 
-                Ok((field, None, self.regex.to_string(), Some(tokenizer_name)))
+                Ok(ResolvedRegex {
+                    field,
+                    json_path: None,
+                    regex: self.regex.to_string(),
+                    tokenizer_name: Some(tokenizer_name),
+                })
             }
             FieldType::JsonObject(json_options) => {
                 let text_field_indexing =
                     json_options.get_text_indexing_options().ok_or_else(|| {
@@ -94,12 +107,12 @@ impl RegexQuery {
                 // We skip the 1st byte which is a marker to tell this is json. This isn't present
                 // in the dictionary
                 let byte_path_prefix = value.as_serialized()[1..].to_owned();
-                Ok((
+                Ok(ResolvedRegex {
                     field,
-                    Some(byte_path_prefix),
-                    self.regex.to_string(),
-                    Some(tokenizer_name),
-                ))
+                    json_path: Some(byte_path_prefix),
+                    regex: self.regex.to_string(),
+                    tokenizer_name: Some(tokenizer_name),
+                })
             }
             _ => Err(InvalidQuery::SchemaError(
                 "trying to run a regex query on a non-text field".to_string(),
@@ -113,30 +126,30 @@ impl BuildTantivyAst for RegexQuery {
         &self,
         context: &BuildTantivyAstContext,
     ) -> Result<TantivyQueryAst, InvalidQuery> {
-        let (field, path, regex, tokenizer_name) = self.to_field_and_regex(context.schema)?;
+        let resolved = self.to_field_and_regex(context.schema)?;
 
-        // If the field's tokenizer lowercases indexed terms (e.g. "datadog",
-        // "raw_lowercase", "default") and the regex doesn't already contain a
+        // If the field's tokenizer lowercases indexed terms (e.g. "default",
+        // "raw_lowercase", "lowercase") and the regex doesn't already contain a
         // (?i) flag, automatically make the match case-insensitive. Without
         // this, an upstream regex like `.*ECONNREFUSED.*` would never match
         // because the inverted index only contains lowercase tokens.
-        let does_lowercasing = match tokenizer_name.as_deref() {
+        let does_lowercasing = match resolved.tokenizer_name.as_deref() {
             Some(name) => context.tokenizer_manager.tokenizer_does_lowercasing(name),
             None => false,
         };
-        let regex = if !regex.starts_with("(?i)") && does_lowercasing {
-            format!("(?i){regex}")
+        let regex = if !resolved.regex.starts_with("(?i)") && does_lowercasing {
+            format!("(?i){}", resolved.regex)
         } else {
-            regex
+            resolved.regex
         };
 
         let regex = tantivy_fst::Regex::new(&regex).context("failed to parse regex")?;
         let regex_automaton_with_path = JsonPathPrefix {
-            prefix: path.unwrap_or_default(),
+            prefix: resolved.json_path.unwrap_or_default(),
             automaton: regex.into(),
         };
         let regex_query_with_path = AutomatonQuery {
-            field,
+            field: resolved.field,
             automaton: Arc::new(regex_automaton_with_path),
         };
         Ok(regex_query_with_path.into())
@@ -289,10 +302,10 @@ mod tests {
             field: "field".to_string(),
             regex: "abc.*xyz".to_string(),
         };
-        let (field, path, regex, _tokenizer) = query.to_field_and_regex(&schema).unwrap();
-        assert_eq!(field, schema.get_field("field").unwrap());
-        assert!(path.is_none());
-        assert_eq!(regex, query.regex);
+        let resolved = query.to_field_and_regex(&schema).unwrap();
+        assert_eq!(resolved.field, schema.get_field("field").unwrap());
+        assert!(resolved.json_path.is_none());
+        assert_eq!(resolved.regex, query.regex);
     }
 
     #[test]
@@ -305,21 +318,20 @@ mod tests {
             field: "field.sub.field".to_string(),
             regex: "abc.*xyz".to_string(),
         };
-        let (field, path, regex, _tokenizer) = query.to_field_and_regex(&schema).unwrap();
-        assert_eq!(field, schema.get_field("field").unwrap());
-        assert_eq!(path.unwrap(), b"sub\x01field\0s");
-        assert_eq!(regex, query.regex);
+        let resolved = query.to_field_and_regex(&schema).unwrap();
+        assert_eq!(resolved.field, schema.get_field("field").unwrap());
+        assert_eq!(resolved.json_path.unwrap(), b"sub\x01field\0s");
+        assert_eq!(resolved.regex, query.regex);
 
         // i believe this is how concatenated field behave
         let query_empty_path = RegexQuery {
             field: "field".to_string(),
             regex: "abc.*xyz".to_string(),
         };
-        let (field, path, regex, _tokenizer) =
-            query_empty_path.to_field_and_regex(&schema).unwrap();
-        assert_eq!(field, schema.get_field("field").unwrap());
-        assert_eq!(path.unwrap(), b"\0s");
-        assert_eq!(regex, query_empty_path.regex);
+        let resolved = query_empty_path.to_field_and_regex(&schema).unwrap();
+        assert_eq!(resolved.field, schema.get_field("field").unwrap());
+        assert_eq!(resolved.json_path.unwrap(), b"\0s");
+        assert_eq!(resolved.regex, query_empty_path.regex);
     }
 
     #[test]
@@ -343,9 +355,9 @@ mod tests {
             field: "field".to_string(),
            regex: "abc.*xyz".to_string(),
         };
-        let (_field, _path, _regex, tokenizer_name) = query.to_field_and_regex(&schema).unwrap();
+        let resolved = query.to_field_and_regex(&schema).unwrap();
         // TEXT uses the "default" tokenizer
-        assert_eq!(tokenizer_name.as_deref(), Some("default"));
+        assert_eq!(resolved.tokenizer_name.as_deref(), Some("default"));
     }
 
     #[test]
@@ -409,10 +421,10 @@ mod tests {
             field: "raw_field".to_string(),
             regex: "abc.*xyz".to_string(),
         };
-        let (_field, _path, regex, tokenizer_name) = query.to_field_and_regex(&schema).unwrap();
-        assert_eq!(tokenizer_name.as_deref(), Some("raw"));
+        let resolved = query.to_field_and_regex(&schema).unwrap();
+        assert_eq!(resolved.tokenizer_name.as_deref(), Some("raw"));
         // The regex should NOT have (?i) since raw doesn't lowercase
-        assert_eq!(regex, "abc.*xyz");
+        assert_eq!(resolved.regex, "abc.*xyz");
     }
 
     #[test]
@@ -425,10 +437,10 @@ mod tests {
         let query = RegexQuery {
             field: "field.key".to_string(),
             regex: "abc".to_string(),
         };
-        let (_field, path, _regex, tokenizer_name) = query.to_field_and_regex(&schema).unwrap();
-        assert!(path.is_some());
+        let resolved = query.to_field_and_regex(&schema).unwrap();
+        assert!(resolved.json_path.is_some());
         // JSON field with TEXT also uses "default" tokenizer
-        assert_eq!(tokenizer_name.as_deref(), Some("default"));
+        assert_eq!(resolved.tokenizer_name.as_deref(), Some("default"));
     }
 
     #[test]
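
A minimal, self-contained Rust sketch of the case-folding rule the two patches implement, for illustration only. `apply_case_folding` is a hypothetical helper standing in for the combination of `TokenizerManager::tokenizer_does_lowercasing` and the (?i) prepending done in `build_tantivy_ast_impl` and the warmup `visit_regex`; it is not part of the Quickwit API.

/// Prepend (?i) only when the inverted index holds lowercase tokens and the
/// pattern is not already case-insensitive.
fn apply_case_folding(regex: &str, tokenizer_lowercases: bool) -> String {
    if tokenizer_lowercases && !regex.starts_with("(?i)") {
        format!("(?i){regex}")
    } else {
        regex.to_string()
    }
}

fn main() {
    // A lowercasing tokenizer (e.g. "default"): the pattern gets the flag.
    assert_eq!(
        apply_case_folding(".*ECONNREFUSED.*", true),
        "(?i).*ECONNREFUSED.*"
    );
    // An explicit (?i) is never doubled.
    assert_eq!(
        apply_case_folding("(?i).*ECONNREFUSED.*", true),
        "(?i).*ECONNREFUSED.*"
    );
    // A non-lowercasing tokenizer (e.g. "raw"): the pattern is left untouched.
    assert_eq!(apply_case_folding("abc.*xyz", false), "abc.*xyz");
}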