-
Notifications
You must be signed in to change notification settings - Fork 538
feat: case-insensitive regex for lowercasing tokenizers #6277
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,14 @@ use super::{BuildTantivyAst, BuildTantivyAstContext, QueryAst}; | |
| use crate::query_ast::TantivyQueryAst; | ||
| use crate::{InvalidQuery, find_field_or_hit_dynamic}; | ||
|
|
||
| /// Result of resolving a `RegexQuery` against a schema. | ||
| pub struct ResolvedRegex { | ||
| pub field: Field, | ||
| pub json_path: Option<Vec<u8>>, | ||
| pub regex: String, | ||
| pub tokenizer_name: Option<String>, | ||
| } | ||
|
|
||
| /// A Regex query | ||
| #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] | ||
| pub struct RegexQuery { | ||
|
|
@@ -48,10 +56,11 @@ impl RegexQuery { | |
| } | ||
|
|
||
| impl RegexQuery { | ||
| /// Resolves this regex query against the given schema. | ||
| pub fn to_field_and_regex( | ||
| &self, | ||
| schema: &TantivySchema, | ||
| ) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> { | ||
| ) -> Result<ResolvedRegex, InvalidQuery> { | ||
| let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema) | ||
| else { | ||
| return Err(InvalidQuery::FieldDoesNotExist { | ||
|
|
@@ -62,22 +71,30 @@ impl RegexQuery { | |
|
|
||
| match field_type { | ||
| FieldType::Str(text_options) => { | ||
| text_options.get_indexing_options().ok_or_else(|| { | ||
| let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| { | ||
| InvalidQuery::SchemaError(format!( | ||
| "field {} is not full-text searchable", | ||
| field_entry.name() | ||
| )) | ||
| })?; | ||
| let tokenizer_name = text_field_indexing.tokenizer().to_string(); | ||
|
|
||
| Ok((field, None, self.regex.to_string())) | ||
| Ok(ResolvedRegex { | ||
| field, | ||
| json_path: None, | ||
| regex: self.regex.to_string(), | ||
| tokenizer_name: Some(tokenizer_name), | ||
| }) | ||
| } | ||
| FieldType::JsonObject(json_options) => { | ||
| json_options.get_text_indexing_options().ok_or_else(|| { | ||
| InvalidQuery::SchemaError(format!( | ||
| "field {} is not full-text searchable", | ||
| field_entry.name() | ||
| )) | ||
| })?; | ||
| let text_field_indexing = | ||
| json_options.get_text_indexing_options().ok_or_else(|| { | ||
| InvalidQuery::SchemaError(format!( | ||
| "field {} is not full-text searchable", | ||
| field_entry.name() | ||
| )) | ||
| })?; | ||
| let tokenizer_name = text_field_indexing.tokenizer().to_string(); | ||
|
|
||
| let mut term_for_path = Term::from_field_json_path( | ||
| field, | ||
|
|
@@ -90,7 +107,12 @@ impl RegexQuery { | |
| // We skip the 1st byte which is a marker to tell this is json. This isn't present | ||
| // in the dictionary | ||
| let byte_path_prefix = value.as_serialized()[1..].to_owned(); | ||
| Ok((field, Some(byte_path_prefix), self.regex.to_string())) | ||
| Ok(ResolvedRegex { | ||
| field, | ||
| json_path: Some(byte_path_prefix), | ||
| regex: self.regex.to_string(), | ||
| tokenizer_name: Some(tokenizer_name), | ||
| }) | ||
| } | ||
| _ => Err(InvalidQuery::SchemaError( | ||
| "trying to run a regex query on a non-text field".to_string(), | ||
|
|
@@ -104,14 +126,30 @@ impl BuildTantivyAst for RegexQuery { | |
| &self, | ||
| context: &BuildTantivyAstContext, | ||
| ) -> Result<TantivyQueryAst, InvalidQuery> { | ||
| let (field, path, regex) = self.to_field_and_regex(context.schema)?; | ||
| let resolved = self.to_field_and_regex(context.schema)?; | ||
|
|
||
| // If the field's tokenizer lowercases indexed terms (e.g. "default", | ||
| // "raw_lowercase", "lowercase") and the regex doesn't already contain a | ||
| // (?i) flag, automatically make the match case-insensitive. Without | ||
| // this, an upstream regex like `.*ECONNREFUSED.*` would never match | ||
| // because the inverted index only contains lowercase tokens. | ||
| let does_lowercasing = match resolved.tokenizer_name.as_deref() { | ||
| Some(name) => context.tokenizer_manager.tokenizer_does_lowercasing(name), | ||
| None => false, | ||
| }; | ||
| let regex = if !resolved.regex.starts_with("(?i)") && does_lowercasing { | ||
| format!("(?i){}", resolved.regex) | ||
| } else { | ||
|
Comment on lines
+140
to
+142
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if this is always correct, or if there are other |
||
| resolved.regex | ||
| }; | ||
|
|
||
| let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; | ||
| let regex_automaton_with_path = JsonPathPrefix { | ||
| prefix: path.unwrap_or_default(), | ||
| prefix: resolved.json_path.unwrap_or_default(), | ||
| automaton: regex.into(), | ||
| }; | ||
| let regex_query_with_path = AutomatonQuery { | ||
| field, | ||
| field: resolved.field, | ||
| automaton: Arc::new(regex_automaton_with_path), | ||
| }; | ||
| Ok(regex_query_with_path.into()) | ||
|
|
@@ -264,10 +302,10 @@ mod tests { | |
| field: "field".to_string(), | ||
| regex: "abc.*xyz".to_string(), | ||
| }; | ||
| let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); | ||
| assert_eq!(field, schema.get_field("field").unwrap()); | ||
| assert!(path.is_none()); | ||
| assert_eq!(regex, query.regex); | ||
| let resolved = query.to_field_and_regex(&schema).unwrap(); | ||
| assert_eq!(resolved.field, schema.get_field("field").unwrap()); | ||
| assert!(resolved.json_path.is_none()); | ||
| assert_eq!(resolved.regex, query.regex); | ||
| } | ||
|
|
||
| #[test] | ||
|
|
@@ -280,20 +318,129 @@ mod tests { | |
| field: "field.sub.field".to_string(), | ||
| regex: "abc.*xyz".to_string(), | ||
| }; | ||
| let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); | ||
| assert_eq!(field, schema.get_field("field").unwrap()); | ||
| assert_eq!(path.unwrap(), b"sub\x01field\0s"); | ||
| assert_eq!(regex, query.regex); | ||
| let resolved = query.to_field_and_regex(&schema).unwrap(); | ||
| assert_eq!(resolved.field, schema.get_field("field").unwrap()); | ||
| assert_eq!(resolved.json_path.unwrap(), b"sub\x01field\0s"); | ||
| assert_eq!(resolved.regex, query.regex); | ||
|
|
||
| // i believe this is how concatenated field behave | ||
| let query_empty_path = RegexQuery { | ||
| field: "field".to_string(), | ||
| regex: "abc.*xyz".to_string(), | ||
| }; | ||
| let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap(); | ||
| assert_eq!(field, schema.get_field("field").unwrap()); | ||
| assert_eq!(path.unwrap(), b"\0s"); | ||
| assert_eq!(regex, query_empty_path.regex); | ||
| let resolved = query_empty_path.to_field_and_regex(&schema).unwrap(); | ||
| assert_eq!(resolved.field, schema.get_field("field").unwrap()); | ||
| assert_eq!(resolved.json_path.unwrap(), b"\0s"); | ||
| assert_eq!(resolved.regex, query_empty_path.regex); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_tokenizer_does_lowercasing() { | ||
| let tokenizer_manager = crate::tokenizers::create_default_quickwit_tokenizer_manager(); | ||
|
|
||
| assert!(tokenizer_manager.tokenizer_does_lowercasing("raw_lowercase")); | ||
| assert!(tokenizer_manager.tokenizer_does_lowercasing("default")); | ||
| assert!(tokenizer_manager.tokenizer_does_lowercasing("lowercase")); | ||
| assert!(!tokenizer_manager.tokenizer_does_lowercasing("raw")); | ||
| assert!(!tokenizer_manager.tokenizer_does_lowercasing("nonexistent")); | ||
| } | ||
|
Comment on lines
+337
to
+346
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wrong module, but this test should exist |
||
|
|
||
| #[test] | ||
| fn test_regex_query_returns_tokenizer_name() { | ||
| let mut schema_builder = TantivySchema::builder(); | ||
| schema_builder.add_text_field("field", TEXT); | ||
| let schema = schema_builder.build(); | ||
|
|
||
| let query = RegexQuery { | ||
| field: "field".to_string(), | ||
| regex: "abc.*xyz".to_string(), | ||
| }; | ||
| let resolved = query.to_field_and_regex(&schema).unwrap(); | ||
| // TEXT uses the "default" tokenizer | ||
| assert_eq!(resolved.tokenizer_name.as_deref(), Some("default")); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_regex_case_insensitive_with_lowercasing_tokenizer() { | ||
| use super::BuildTantivyAstContext; | ||
| use crate::query_ast::BuildTantivyAst; | ||
|
|
||
| let mut schema_builder = TantivySchema::builder(); | ||
| // TEXT uses the "default" tokenizer which lowercases | ||
| schema_builder.add_text_field("field", TEXT); | ||
| let schema = schema_builder.build(); | ||
|
|
||
| let context = BuildTantivyAstContext::for_test(&schema); | ||
|
|
||
| let query = RegexQuery { | ||
| field: "field".to_string(), | ||
| regex: ".*ECONNREFUSED.*".to_string(), | ||
| }; | ||
|
|
||
| // The query should succeed (regex is valid with (?i) prepended) | ||
| let result = query.build_tantivy_ast_impl(&context); | ||
| assert!(result.is_ok(), "regex query should build successfully"); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_regex_already_case_insensitive_not_doubled() { | ||
| use super::BuildTantivyAstContext; | ||
| use crate::query_ast::BuildTantivyAst; | ||
|
|
||
| let mut schema_builder = TantivySchema::builder(); | ||
| schema_builder.add_text_field("field", TEXT); | ||
| let schema = schema_builder.build(); | ||
|
|
||
| let context = BuildTantivyAstContext::for_test(&schema); | ||
|
|
||
| // Already has (?i), should not be doubled | ||
| let query = RegexQuery { | ||
| field: "field".to_string(), | ||
| regex: "(?i).*ECONNREFUSED.*".to_string(), | ||
| }; | ||
|
|
||
| let result = query.build_tantivy_ast_impl(&context); | ||
| assert!( | ||
| result.is_ok(), | ||
| "regex query with existing (?i) should build successfully" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would a query with |
||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_regex_no_case_insensitive_with_raw_tokenizer() { | ||
| use tantivy::schema::{TextFieldIndexing, TextOptions}; | ||
|
|
||
| let mut schema_builder = TantivySchema::builder(); | ||
| // Use "raw" tokenizer which does not lowercase | ||
| let text_options = TextOptions::default() | ||
| .set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw")); | ||
| schema_builder.add_text_field("raw_field", text_options); | ||
| let schema = schema_builder.build(); | ||
|
|
||
| let query = RegexQuery { | ||
| field: "raw_field".to_string(), | ||
| regex: "abc.*xyz".to_string(), | ||
| }; | ||
| let resolved = query.to_field_and_regex(&schema).unwrap(); | ||
| assert_eq!(resolved.tokenizer_name.as_deref(), Some("raw")); | ||
| // The regex should NOT have (?i) since raw doesn't lowercase | ||
| assert_eq!(resolved.regex, "abc.*xyz"); | ||
|
Comment on lines
+426
to
+427
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add |
||
| } | ||
|
|
||
| #[test] | ||
| fn test_regex_json_field_returns_tokenizer_name() { | ||
| let mut schema_builder = TantivySchema::builder(); | ||
| schema_builder.add_json_field("field", TEXT); | ||
| let schema = schema_builder.build(); | ||
|
|
||
| let query = RegexQuery { | ||
| field: "field.key".to_string(), | ||
| regex: "abc".to_string(), | ||
| }; | ||
| let resolved = query.to_field_and_regex(&schema).unwrap(); | ||
| assert!(resolved.json_path.is_some()); | ||
| // JSON field with TEXT also uses "default" tokenizer | ||
| assert_eq!(resolved.tokenizer_name.as_deref(), Some("default")); | ||
| } | ||
|
|
||
| #[test] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -91,6 +91,16 @@ impl TokenizerManager { | |
| self.get_tokenizer(analyzer) | ||
| } | ||
|
|
||
| /// Returns true if the given tokenizer lowercases its output. | ||
| pub fn tokenizer_does_lowercasing(&self, tokenizer_name: &str) -> bool { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use this function to simplify get_normalizer()? |
||
| self.is_lowercaser | ||
| .read() | ||
| .unwrap() | ||
| .get(tokenizer_name) | ||
| .copied() | ||
| .unwrap_or(false) | ||
| } | ||
|
|
||
| /// Get the inner TokenizerManager | ||
| pub fn tantivy_manager(&self) -> &TantivyTokenizerManager { | ||
| &self.inner | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.