Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions quickwit/quickwit-doc-mapper/src/query_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -394,13 +394,25 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
}

fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> {
let (field, path, regex) = match regex_query.to_field_and_regex(self.schema) {
let resolved = match regex_query.to_field_and_regex(self.schema) {
Ok(res) => res,
/* the query will be nullified when casting to a tantivy ast */
Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()),
Err(e) => return Err(e),
};
self.add_automaton(field, Automaton::Regex(path, regex));
// The warmup regex must match what build_tantivy_ast_impl produces.
// If the field's tokenizer lowercases indexed terms, the search will
// use (?i) to match case-insensitively, so the warmup must too.
let does_lowercasing = match resolved.tokenizer_name.as_deref() {
Some(name) => self.tokenizer_manager.tokenizer_does_lowercasing(name),
None => false,
};
let regex = if !resolved.regex.starts_with("(?i)") && does_lowercasing {
format!("(?i){}", resolved.regex)
} else {
resolved.regex
};
self.add_automaton(resolved.field, Automaton::Regex(resolved.json_path, regex));
Ok(())
}
}
Expand Down
2 changes: 1 addition & 1 deletion quickwit/quickwit-query/src/query_ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ pub use field_presence::FieldPresenceQuery;
pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery};
pub use phrase_prefix_query::PhrasePrefixQuery;
pub use range_query::RangeQuery;
pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery};
pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery, ResolvedRegex};
use tantivy_query_ast::TantivyQueryAst;
pub use term_query::TermQuery;
pub use term_set_query::TermSetQuery;
Expand Down
197 changes: 172 additions & 25 deletions quickwit/quickwit-query/src/query_ast/regex_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ use super::{BuildTantivyAst, BuildTantivyAstContext, QueryAst};
use crate::query_ast::TantivyQueryAst;
use crate::{InvalidQuery, find_field_or_hit_dynamic};

/// Result of resolving a `RegexQuery` against a schema.
pub struct ResolvedRegex {
pub field: Field,
pub json_path: Option<Vec<u8>>,
pub regex: String,
pub tokenizer_name: Option<String>,
}

/// A Regex query
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
pub struct RegexQuery {
Expand All @@ -48,10 +56,11 @@ impl RegexQuery {
}

impl RegexQuery {
/// Resolves this regex query against the given schema.
pub fn to_field_and_regex(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
pub fn to_field_and_regex(
pub fn to_resolved(

&self,
schema: &TantivySchema,
) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
) -> Result<ResolvedRegex, InvalidQuery> {
let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema)
else {
return Err(InvalidQuery::FieldDoesNotExist {
Expand All @@ -62,22 +71,30 @@ impl RegexQuery {

match field_type {
FieldType::Str(text_options) => {
text_options.get_indexing_options().ok_or_else(|| {
let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| {
InvalidQuery::SchemaError(format!(
"field {} is not full-text searchable",
field_entry.name()
))
})?;
let tokenizer_name = text_field_indexing.tokenizer().to_string();

Ok((field, None, self.regex.to_string()))
Ok(ResolvedRegex {
field,
json_path: None,
regex: self.regex.to_string(),
tokenizer_name: Some(tokenizer_name),
})
}
FieldType::JsonObject(json_options) => {
json_options.get_text_indexing_options().ok_or_else(|| {
InvalidQuery::SchemaError(format!(
"field {} is not full-text searchable",
field_entry.name()
))
})?;
let text_field_indexing =
json_options.get_text_indexing_options().ok_or_else(|| {
InvalidQuery::SchemaError(format!(
"field {} is not full-text searchable",
field_entry.name()
))
})?;
let tokenizer_name = text_field_indexing.tokenizer().to_string();

let mut term_for_path = Term::from_field_json_path(
field,
Expand All @@ -90,7 +107,12 @@ impl RegexQuery {
// We skip the 1st byte which is a marker to tell this is json. This isn't present
// in the dictionary
let byte_path_prefix = value.as_serialized()[1..].to_owned();
Ok((field, Some(byte_path_prefix), self.regex.to_string()))
Ok(ResolvedRegex {
field,
json_path: Some(byte_path_prefix),
regex: self.regex.to_string(),
tokenizer_name: Some(tokenizer_name),
})
}
_ => Err(InvalidQuery::SchemaError(
"trying to run a regex query on a non-text field".to_string(),
Expand All @@ -104,14 +126,30 @@ impl BuildTantivyAst for RegexQuery {
&self,
context: &BuildTantivyAstContext,
) -> Result<TantivyQueryAst, InvalidQuery> {
let (field, path, regex) = self.to_field_and_regex(context.schema)?;
let resolved = self.to_field_and_regex(context.schema)?;

// If the field's tokenizer lowercases indexed terms (e.g. "default",
// "raw_lowercase", "lowercase") and the regex doesn't already contain a
// (?i) flag, automatically make the match case-insensitive. Without
// this, an upstream regex like `.*ECONNREFUSED.*` would never match
// because the inverted index only contains lowercase tokens.
let does_lowercasing = match resolved.tokenizer_name.as_deref() {
Some(name) => context.tokenizer_manager.tokenizer_does_lowercasing(name),
None => false,
};
let regex = if !resolved.regex.starts_with("(?i)") && does_lowercasing {
format!("(?i){}", resolved.regex)
} else {
Comment on lines +140 to +142
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this is always correct, or if there are other (?X) modifiers that could be present and cause us to miss a case-insensitive modifier

resolved.regex
};

let regex = tantivy_fst::Regex::new(&regex).context("failed to parse regex")?;
let regex_automaton_with_path = JsonPathPrefix {
prefix: path.unwrap_or_default(),
prefix: resolved.json_path.unwrap_or_default(),
automaton: regex.into(),
};
let regex_query_with_path = AutomatonQuery {
field,
field: resolved.field,
automaton: Arc::new(regex_automaton_with_path),
};
Ok(regex_query_with_path.into())
Expand Down Expand Up @@ -264,10 +302,10 @@ mod tests {
field: "field".to_string(),
regex: "abc.*xyz".to_string(),
};
let (field, path, regex) = query.to_field_and_regex(&schema).unwrap();
assert_eq!(field, schema.get_field("field").unwrap());
assert!(path.is_none());
assert_eq!(regex, query.regex);
let resolved = query.to_field_and_regex(&schema).unwrap();
assert_eq!(resolved.field, schema.get_field("field").unwrap());
assert!(resolved.json_path.is_none());
assert_eq!(resolved.regex, query.regex);
}

#[test]
Expand All @@ -280,20 +318,129 @@ mod tests {
field: "field.sub.field".to_string(),
regex: "abc.*xyz".to_string(),
};
let (field, path, regex) = query.to_field_and_regex(&schema).unwrap();
assert_eq!(field, schema.get_field("field").unwrap());
assert_eq!(path.unwrap(), b"sub\x01field\0s");
assert_eq!(regex, query.regex);
let resolved = query.to_field_and_regex(&schema).unwrap();
assert_eq!(resolved.field, schema.get_field("field").unwrap());
assert_eq!(resolved.json_path.unwrap(), b"sub\x01field\0s");
assert_eq!(resolved.regex, query.regex);

// i believe this is how concatenated fields behave
let query_empty_path = RegexQuery {
field: "field".to_string(),
regex: "abc.*xyz".to_string(),
};
let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap();
assert_eq!(field, schema.get_field("field").unwrap());
assert_eq!(path.unwrap(), b"\0s");
assert_eq!(regex, query_empty_path.regex);
let resolved = query_empty_path.to_field_and_regex(&schema).unwrap();
assert_eq!(resolved.field, schema.get_field("field").unwrap());
assert_eq!(resolved.json_path.unwrap(), b"\0s");
assert_eq!(resolved.regex, query_empty_path.regex);
}

#[test]
fn test_tokenizer_does_lowercasing() {
let tokenizer_manager = crate::tokenizers::create_default_quickwit_tokenizer_manager();

assert!(tokenizer_manager.tokenizer_does_lowercasing("raw_lowercase"));
assert!(tokenizer_manager.tokenizer_does_lowercasing("default"));
assert!(tokenizer_manager.tokenizer_does_lowercasing("lowercase"));
assert!(!tokenizer_manager.tokenizer_does_lowercasing("raw"));
assert!(!tokenizer_manager.tokenizer_does_lowercasing("nonexistent"));
}
Comment on lines +337 to +346
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong module, but this test should exist.


#[test]
fn test_regex_query_returns_tokenizer_name() {
let mut schema_builder = TantivySchema::builder();
schema_builder.add_text_field("field", TEXT);
let schema = schema_builder.build();

let query = RegexQuery {
field: "field".to_string(),
regex: "abc.*xyz".to_string(),
};
let resolved = query.to_field_and_regex(&schema).unwrap();
// TEXT uses the "default" tokenizer
assert_eq!(resolved.tokenizer_name.as_deref(), Some("default"));
}

#[test]
fn test_regex_case_insensitive_with_lowercasing_tokenizer() {
use super::BuildTantivyAstContext;
use crate::query_ast::BuildTantivyAst;

let mut schema_builder = TantivySchema::builder();
// TEXT uses the "default" tokenizer which lowercases
schema_builder.add_text_field("field", TEXT);
let schema = schema_builder.build();

let context = BuildTantivyAstContext::for_test(&schema);

let query = RegexQuery {
field: "field".to_string(),
regex: ".*ECONNREFUSED.*".to_string(),
};

// The query should succeed (regex is valid with (?i) prepended)
let result = query.build_tantivy_ast_impl(&context);
assert!(result.is_ok(), "regex query should build successfully");
}

#[test]
fn test_regex_already_case_insensitive_not_doubled() {
use super::BuildTantivyAstContext;
use crate::query_ast::BuildTantivyAst;

let mut schema_builder = TantivySchema::builder();
schema_builder.add_text_field("field", TEXT);
let schema = schema_builder.build();

let context = BuildTantivyAstContext::for_test(&schema);

// Already has (?i), should not be doubled
let query = RegexQuery {
field: "field".to_string(),
regex: "(?i).*ECONNREFUSED.*".to_string(),
};

let result = query.build_tantivy_ast_impl(&context);
assert!(
result.is_ok(),
"regex query with existing (?i) should build successfully"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would a query with (?i)(?i)something fail with this test? (Is it actually testing something, or would the test pass anyway without the branch that checks for an already case-insensitive query?)

);
}

#[test]
fn test_regex_no_case_insensitive_with_raw_tokenizer() {
use tantivy::schema::{TextFieldIndexing, TextOptions};

let mut schema_builder = TantivySchema::builder();
// Use "raw" tokenizer which does not lowercase
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw"));
schema_builder.add_text_field("raw_field", text_options);
let schema = schema_builder.build();

let query = RegexQuery {
field: "raw_field".to_string(),
regex: "abc.*xyz".to_string(),
};
let resolved = query.to_field_and_regex(&schema).unwrap();
assert_eq!(resolved.tokenizer_name.as_deref(), Some("raw"));
// The regex should NOT have (?i) since raw doesn't lowercase
assert_eq!(resolved.regex, "abc.*xyz");
Comment on lines +426 to +427
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding (?i) is done in build_tantivy_ast_impl, not in to_field_and_regex, so this test doesn't do what it claims.

}

#[test]
fn test_regex_json_field_returns_tokenizer_name() {
let mut schema_builder = TantivySchema::builder();
schema_builder.add_json_field("field", TEXT);
let schema = schema_builder.build();

let query = RegexQuery {
field: "field.key".to_string(),
regex: "abc".to_string(),
};
let resolved = query.to_field_and_regex(&schema).unwrap();
assert!(resolved.json_path.is_some());
// JSON field with TEXT also uses "default" tokenizer
assert_eq!(resolved.tokenizer_name.as_deref(), Some("default"));
}

#[test]
Expand Down
10 changes: 10 additions & 0 deletions quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ impl TokenizerManager {
self.get_tokenizer(analyzer)
}

/// Returns true if the given tokenizer lowercases its output.
pub fn tokenizer_does_lowercasing(&self, tokenizer_name: &str) -> bool {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use this function to simplify get_normalizer()?
Also, it should probably return None when given an unknown tokenizer.

self.is_lowercaser
.read()
.unwrap()
.get(tokenizer_name)
.copied()
.unwrap_or(false)
}

/// Get the inner TokenizerManager
pub fn tantivy_manager(&self) -> &TantivyTokenizerManager {
&self.inner
Expand Down
Loading