diff --git a/redisvl/utils/token_escaper.py b/redisvl/utils/token_escaper.py
index 04e04cd2..374534fd 100644
--- a/redisvl/utils/token_escaper.py
+++ b/redisvl/utils/token_escaper.py
@@ -9,11 +9,11 @@ class TokenEscaper:
     """
 
     # Characters that RediSearch requires us to escape during queries.
-    # Source: https://redis.io/docs/stack/search/reference/escaping/#the-rules-of-text-field-tokenization
-    DEFAULT_ESCAPED_CHARS = r"[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ]"
+    # Source: https://redis.io/docs/latest/develop/ai/search-and-query/advanced-concepts/escaping/#tokenization-rules-for-text-fields
+    DEFAULT_ESCAPED_CHARS = r"[,.<>{}\[\]\\\"\':;!@#$%^&*()\-+=~\/ ?|]"
 
-    # Same as above but excludes * to allow wildcard patterns
-    ESCAPED_CHARS_NO_WILDCARD = r"[,.<>{}\[\]\\\"\':;!@#$%^&()\-+=~\/ ]"
+    # Same as above but excludes * and ? to allow wildcard patterns
+    ESCAPED_CHARS_NO_WILDCARD = r"[,.<>{}\[\]\\\"\':;!@#$%^&()\-+=~\/ |]"
 
     def __init__(self, escape_chars_re: Optional[Pattern] = None):
         if escape_chars_re:
@@ -27,8 +27,8 @@ def escape(self, value: str, preserve_wildcards: bool = False) -> str:
 
         Args:
             value: The string value to escape.
-            preserve_wildcards: If True, preserves * characters for wildcard
-                matching. Defaults to False.
+            preserve_wildcards: If True, preserves * and ? characters for
+                wildcard matching. Defaults to False.
 
         Returns:
             The escaped string.
diff --git a/tests/unit/test_token_escaper.py b/tests/unit/test_token_escaper.py
index 0adb2d11..2836ed19 100644
--- a/tests/unit/test_token_escaper.py
+++ b/tests/unit/test_token_escaper.py
@@ -19,8 +19,8 @@ def escaper():
         ),
         (
             r"& symbols, like * and ?",
-            r"\&\ symbols\,\ like\ \*\ and\ ?",
-        ),  # TODO: question marks are not caught?
+            r"\&\ symbols\,\ like\ \*\ and\ \?",
+        ),
         # underscores are ignored
         (r"-dashes_and_underscores-", r"\-dashes_and_underscores\-"),
     ],
@@ -52,12 +52,12 @@ def test_escape_text_chars(escaper, test_input, expected):
         # Tags with less common, but legal characters
         ("_underscore_", r"_underscore_"),
         ("dot.tag", r"dot\.tag"),
-        # ("pipe|tag", r"pipe\|tag"), #TODO - pipes are not caught?
+        ("pipe|tag", r"pipe\|tag"),
         # More edge cases with special characters
         ("(parentheses)", r"\(parentheses\)"),
         ("[brackets]", r"\[brackets\]"),
         ("{braces}", r"\{braces\}"),
-        # ("question?mark", r"question\?mark"), #TODO - question marks are not caught?
+        ("question?mark", r"question\?mark"),
         # Unicode characters in tags
         ("你好", r"你好"),  # Assuming non-Latin characters don't need escaping
         ("emoji:😊", r"emoji\:😊"),
@@ -78,9 +78,11 @@ def test_escape_text_chars(escaper, test_input, expected):
         "hyphen",
         "underscore",
         "dot",
+        "pipe",
         "parentheses",
         "brackets",
         "braces",
+        "question",
         "non-latin",
         "emoji",
     ],
@@ -120,3 +122,20 @@ def test_escape_long_string(escaper):
     # Use pytest's benchmark fixture to check performance
     escaped = escaper.escape(long_str)
     assert escaped == expected
+
+
+@pytest.mark.parametrize(
+    ("test_input,expected"),
+    [
+        ("wild*card", r"wild*card"),
+        ("single?char", r"single?char"),
+        ("combo*test?", r"combo*test?"),
+        ("mixed*and|pipe", r"mixed*and\|pipe"),
+        ("question?and|pipe", r"question?and\|pipe"),  # ? is kept as a wildcard; only | is escaped
+    ],
+    ids=["star", "question", "both", "star-only", "question-and-pipe"],
+)
+def test_escape_preserve_wildcards(escaper, test_input, expected):
+    """Test that * and ? are preserved when preserve_wildcards=True."""
+    result = escaper.escape(test_input, preserve_wildcards=True)
+    assert result == expected