diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 5900b577795..8a40463d18b 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -394,13 +394,25 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> { - let (field, path, regex) = match regex_query.to_field_and_regex(self.schema) { + let resolved = match regex_query.to_field_and_regex(self.schema) { Ok(res) => res, /* the query will be nullified when casting to a tantivy ast */ Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()), Err(e) => return Err(e), }; - self.add_automaton(field, Automaton::Regex(path, regex)); + // The warmup regex must match what build_tantivy_ast_impl produces. + // If the field's tokenizer lowercases indexed terms, the search will + // use (?i) to match case-insensitively, so the warmup must too. 
+ let does_lowercasing = match resolved.tokenizer_name.as_deref() { + Some(name) => self.tokenizer_manager.tokenizer_does_lowercasing(name), + None => false, + }; + let regex = if !resolved.regex.starts_with("(?i)") && does_lowercasing { + format!("(?i){}", resolved.regex) + } else { + resolved.regex + }; + self.add_automaton(resolved.field, Automaton::Regex(resolved.json_path, regex)); Ok(()) } } diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index d51e36eb93a..c0dfae9fd88 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -39,7 +39,7 @@ pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; pub use range_query::RangeQuery; -pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery}; +pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery, ResolvedRegex}; use tantivy_query_ast::TantivyQueryAst; pub use term_query::TermQuery; pub use term_set_query::TermSetQuery; diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs index 6e1ab208dbf..8603af11e36 100644 --- a/quickwit/quickwit-query/src/query_ast/regex_query.rs +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -24,6 +24,14 @@ use super::{BuildTantivyAst, BuildTantivyAstContext, QueryAst}; use crate::query_ast::TantivyQueryAst; use crate::{InvalidQuery, find_field_or_hit_dynamic}; +/// Result of resolving a `RegexQuery` against a schema. +pub struct ResolvedRegex { + pub field: Field, + pub json_path: Option<Vec<u8>>, + pub regex: String, + pub tokenizer_name: Option<String>, +} + /// A Regex query #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] pub struct RegexQuery { @@ -48,10 +56,11 @@ impl RegexQuery { } impl RegexQuery { + /// Resolves this regex query against the given schema. 
pub fn to_field_and_regex( &self, schema: &TantivySchema, - ) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> { + ) -> Result<ResolvedRegex, InvalidQuery> { let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema) else { return Err(InvalidQuery::FieldDoesNotExist { @@ -62,22 +71,30 @@ impl RegexQuery { match field_type { FieldType::Str(text_options) => { - text_options.get_indexing_options().ok_or_else(|| { + let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| { InvalidQuery::SchemaError(format!( "field {} is not full-text searchable", field_entry.name() )) })?; + let tokenizer_name = text_field_indexing.tokenizer().to_string(); - Ok((field, None, self.regex.to_string())) + Ok(ResolvedRegex { + field, + json_path: None, + regex: self.regex.to_string(), + tokenizer_name: Some(tokenizer_name), + }) } FieldType::JsonObject(json_options) => { - json_options.get_text_indexing_options().ok_or_else(|| { - InvalidQuery::SchemaError(format!( - "field {} is not full-text searchable", - field_entry.name() - )) - })?; + let text_field_indexing = + json_options.get_text_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + let tokenizer_name = text_field_indexing.tokenizer().to_string(); let mut term_for_path = Term::from_field_json_path( field, @@ -90,7 +107,12 @@ impl RegexQuery { // We skip the 1st byte which is a marker to tell this is json. 
This isn't present // in the dictionary let byte_path_prefix = value.as_serialized()[1..].to_owned(); - Ok((field, Some(byte_path_prefix), self.regex.to_string())) + Ok(ResolvedRegex { + field, + json_path: Some(byte_path_prefix), + regex: self.regex.to_string(), + tokenizer_name: Some(tokenizer_name), + }) } _ => Err(InvalidQuery::SchemaError( "trying to run a regex query on a non-text field".to_string(), @@ -104,14 +126,30 @@ impl BuildTantivyAst for RegexQuery { &self, context: &BuildTantivyAstContext, ) -> Result<TantivyQueryAst, InvalidQuery> { - let (field, path, regex) = self.to_field_and_regex(context.schema)?; + let resolved = self.to_field_and_regex(context.schema)?; + + // If the field's tokenizer lowercases indexed terms (e.g. "default", + // "raw_lowercase", "lowercase") and the regex doesn't already contain a + // (?i) flag, automatically make the match case-insensitive. Without + // this, an upstream regex like `.*ECONNREFUSED.*` would never match + // because the inverted index only contains lowercase tokens. 
+ let does_lowercasing = match resolved.tokenizer_name.as_deref() { + Some(name) => context.tokenizer_manager.tokenizer_does_lowercasing(name), + None => false, + }; + let regex = if !resolved.regex.starts_with("(?i)") && does_lowercasing { + format!("(?i){}", resolved.regex) + } else { + resolved.regex + }; + let regex = tantivy_fst::Regex::new(&regex).context("failed to parse regex")?; let regex_automaton_with_path = JsonPathPrefix { - prefix: path.unwrap_or_default(), + prefix: resolved.json_path.unwrap_or_default(), automaton: regex.into(), }; let regex_query_with_path = AutomatonQuery { - field, + field: resolved.field, automaton: Arc::new(regex_automaton_with_path), }; Ok(regex_query_with_path.into()) @@ -264,10 +302,10 @@ mod tests { field: "field".to_string(), regex: "abc.*xyz".to_string(), }; - let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); - assert_eq!(field, schema.get_field("field").unwrap()); - assert!(path.is_none()); - assert_eq!(regex, query.regex); + let resolved = query.to_field_and_regex(&schema).unwrap(); + assert_eq!(resolved.field, schema.get_field("field").unwrap()); + assert!(resolved.json_path.is_none()); + assert_eq!(resolved.regex, query.regex); } #[test] @@ -280,20 +318,129 @@ mod tests { field: "field.sub.field".to_string(), regex: "abc.*xyz".to_string(), }; - let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); - assert_eq!(field, schema.get_field("field").unwrap()); - assert_eq!(path.unwrap(), b"sub\x01field\0s"); - assert_eq!(regex, query.regex); + let resolved = query.to_field_and_regex(&schema).unwrap(); + assert_eq!(resolved.field, schema.get_field("field").unwrap()); + assert_eq!(resolved.json_path.unwrap(), b"sub\x01field\0s"); + assert_eq!(resolved.regex, query.regex); // i believe this is how concatenated field behave let query_empty_path = RegexQuery { field: "field".to_string(), regex: "abc.*xyz".to_string(), }; - let (field, path, regex) = 
query_empty_path.to_field_and_regex(&schema).unwrap(); - assert_eq!(field, schema.get_field("field").unwrap()); - assert_eq!(path.unwrap(), b"\0s"); - assert_eq!(regex, query_empty_path.regex); + let resolved = query_empty_path.to_field_and_regex(&schema).unwrap(); + assert_eq!(resolved.field, schema.get_field("field").unwrap()); + assert_eq!(resolved.json_path.unwrap(), b"\0s"); + assert_eq!(resolved.regex, query_empty_path.regex); + } + + #[test] + fn test_tokenizer_does_lowercasing() { + let tokenizer_manager = crate::tokenizers::create_default_quickwit_tokenizer_manager(); + + assert!(tokenizer_manager.tokenizer_does_lowercasing("raw_lowercase")); + assert!(tokenizer_manager.tokenizer_does_lowercasing("default")); + assert!(tokenizer_manager.tokenizer_does_lowercasing("lowercase")); + assert!(!tokenizer_manager.tokenizer_does_lowercasing("raw")); + assert!(!tokenizer_manager.tokenizer_does_lowercasing("nonexistent")); + } + + #[test] + fn test_regex_query_returns_tokenizer_name() { + let mut schema_builder = TantivySchema::builder(); + schema_builder.add_text_field("field", TEXT); + let schema = schema_builder.build(); + + let query = RegexQuery { + field: "field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let resolved = query.to_field_and_regex(&schema).unwrap(); + // TEXT uses the "default" tokenizer + assert_eq!(resolved.tokenizer_name.as_deref(), Some("default")); + } + + #[test] + fn test_regex_case_insensitive_with_lowercasing_tokenizer() { + use super::BuildTantivyAstContext; + use crate::query_ast::BuildTantivyAst; + + let mut schema_builder = TantivySchema::builder(); + // TEXT uses the "default" tokenizer which lowercases + schema_builder.add_text_field("field", TEXT); + let schema = schema_builder.build(); + + let context = BuildTantivyAstContext::for_test(&schema); + + let query = RegexQuery { + field: "field".to_string(), + regex: ".*ECONNREFUSED.*".to_string(), + }; + + // The query should succeed (regex is valid with (?i) prepended) + 
let result = query.build_tantivy_ast_impl(&context); + assert!(result.is_ok(), "regex query should build successfully"); + } + + #[test] + fn test_regex_already_case_insensitive_not_doubled() { + use super::BuildTantivyAstContext; + use crate::query_ast::BuildTantivyAst; + + let mut schema_builder = TantivySchema::builder(); + schema_builder.add_text_field("field", TEXT); + let schema = schema_builder.build(); + + let context = BuildTantivyAstContext::for_test(&schema); + + // Already has (?i), should not be doubled + let query = RegexQuery { + field: "field".to_string(), + regex: "(?i).*ECONNREFUSED.*".to_string(), + }; + + let result = query.build_tantivy_ast_impl(&context); + assert!( + result.is_ok(), + "regex query with existing (?i) should build successfully" + ); + } + + #[test] + fn test_regex_no_case_insensitive_with_raw_tokenizer() { + use tantivy::schema::{TextFieldIndexing, TextOptions}; + + let mut schema_builder = TantivySchema::builder(); + // Use "raw" tokenizer which does not lowercase + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw")); + schema_builder.add_text_field("raw_field", text_options); + let schema = schema_builder.build(); + + let query = RegexQuery { + field: "raw_field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let resolved = query.to_field_and_regex(&schema).unwrap(); + assert_eq!(resolved.tokenizer_name.as_deref(), Some("raw")); + // The regex should NOT have (?i) since raw doesn't lowercase + assert_eq!(resolved.regex, "abc.*xyz"); + } + + #[test] + fn test_regex_json_field_returns_tokenizer_name() { + let mut schema_builder = TantivySchema::builder(); + schema_builder.add_json_field("field", TEXT); + let schema = schema_builder.build(); + + let query = RegexQuery { + field: "field.key".to_string(), + regex: "abc".to_string(), + }; + let resolved = query.to_field_and_regex(&schema).unwrap(); + assert!(resolved.json_path.is_some()); + // JSON field with 
TEXT also uses "default" tokenizer + assert_eq!(resolved.tokenizer_name.as_deref(), Some("default")); } #[test] diff --git a/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs b/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs index faf2d72c4de..fc0488ad2ba 100644 --- a/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs +++ b/quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs @@ -91,6 +91,16 @@ impl TokenizerManager { self.get_tokenizer(analyzer) } + /// Returns true if the given tokenizer lowercases its output. + pub fn tokenizer_does_lowercasing(&self, tokenizer_name: &str) -> bool { + self.is_lowercaser + .read() + .unwrap() + .get(tokenizer_name) + .copied() + .unwrap_or(false) + } + /// Get the inner TokenizerManager pub fn tantivy_manager(&self) -> &TantivyTokenizerManager { &self.inner