1 <?php rcs_id('$Id: TextSearchQuery.php,v 1.11 2004-11-26 18:39:01 rurban Exp $');
3 * A text search query, converting queries to PCRE or SQL matchers.
5 * This represents an enhanced "Google-like" text search query:
7 * <dt> default: case-insensitive glob-style search with special operators OR AND NOT -
9 * <dd> Match strings containing the substring 'wiki', and not containing the
11 * <dt> wiki word or page
12 * <dd> Match strings containing the substring 'wiki' and either the substring
13 * 'word' or the substring 'page'.
14 * <dt> detects regex hints, glob-style or regex-style, and converts them to PCRE or SQL matchers
15 * <dd> "^word$" => EXACT(word)
16 * <dd> "^word" => STARTS_WITH(word)
17 * <dd> "word*" => STARTS_WITH(word)
18 * <dd> "*word" => ENDS_WITH(word)
19 * <dd> "/^word.* /" => REGEX(^word.*)
20 * <dd> "word*word" => REGEX(word.*word)
23 * The full query syntax, in order of precedence, is roughly:
25 * The unary 'NOT' or '-' operator (they are equivalent) negates the
26 * following search clause.
28 * Search clauses may be joined with the (left-associative) binary operators
31 * Two adjoining search clauses are joined with an implicit 'AND'. This has
32 * lower precedence than either an explicit 'AND' or 'OR', so "a b OR c"
33 * parses as "a AND ( b OR c )", while "a AND b OR c" parses as
34 * "( a AND b ) OR c" (due to the left-associativity of 'AND' and 'OR'.)
36 * Search clauses can be grouped with parentheses.
38 * Phrases (or other things which don't look like words) can be forced to
39 * be interpreted as words by quoting them, either with single (') or double (")
40 * quotes. If you wan't to include the quote character within a quoted string,
41 * double-up on the quote character: 'I''m hungry' is equivalent to
44 * Force regex on "re:word" => posix-style, "/word/" => pcre-style
45 * or use regex='glob' to use file wildcard-like matching. (not yet)
47 * The parseed tree is then converted to the needed PCRE (highlight, simple backends)
50 * @author: Jeff Dairiki
51 * @author: Reini Urban (case and regex, enhanced sql callbacks)
53 class TextSearchQuery {
57 * @param $search_query string The query. Syntax is as described above.
58 * Note that an empty $search_query will match anything.
59 * @see TextSearchQuery
60 * TODO: support $regex arg, try to detect regex from $search_query (glob-style)
62 function TextSearchQuery($search_query, $case_exact=false, $regex='auto') {
63 $parser = new TextSearchQuery_Parser;
64 $this->_isregex = $regex; // default: auto
65 $this->_case_exact = $case_exact;
66 $this->_tree = $parser->parse($search_query, $case_exact, $regex);
70 function _optimize() {
71 $this->_tree = $this->_tree->optimize();
75 * Get a PCRE regexp which matches the query.
78 if (!isset($this->_regexp)) {
79 if ($this->_isregex) // TODO: convert glob-style regex to pcre
80 $this->_regexp = '/' . $this->_tree->regexp() . '/'.($this->_case_exact?'':'i').'sS';
82 $this->_regexp = '/^' . $this->_tree->regexp() . '/'.($this->_case_exact?'':'i').'sS';
84 return $this->_regexp;
88 * Match query against string.
90 * @param $string string The string to match.
91 * @return boolean True if the string matches the query.
93 function match($string) {
94 return preg_match($this->asRegexp(), $string);
98 * Get a regular expression suitable for highlighting matched words.
100 * This returns a PCRE regular expression which matches any non-negated
103 * @return string The PCRE regexp.
105 function getHighlightRegexp() {
106 if (!isset($this->_hilight_regexp)) {
107 $words = array_unique($this->_tree->highlight_words());
109 $this->_hilight_regexp = false;
112 foreach ($words as $key => $word)
113 $words[$key] = preg_quote($word, '/');
114 $this->_hilight_regexp = '(?:' . join('|', $words) . ')';
117 return $this->_hilight_regexp;
121 * Make an SQL clause which matches the query.
123 * @param $make_sql_clause_cb WikiCallback
124 * A callback which takes a single word as an argument and
125 * returns an SQL clause which will match exactly those records
126 * containing the word. The word passed to the callback will always
127 * be in all lower case.
129 * TODO: support db-specific extensions, like MATCH AGAINST or REGEX
130 * mysql => 4.0.1 can also do Google: MATCH AGAINST IN BOOLEAN MODE
131 * How? WikiDB backend method?
132 * Case-sensitivity option.
136 * function sql_title_match($word) {
137 * return sprintf("LOWER(title) like '%s'",
138 * addslashes($word));
143 * $query = new TextSearchQuery("wiki -page");
144 * $cb = new WikiFunctionCb('sql_title_match');
145 * $sql_clause = $query->makeSqlClause($cb);
147 * This will result in $sql_clause containing something like
148 * "(LOWER(title) like 'wiki') AND NOT (LOWER(title) like 'page')".
150 * @return string The PCRE regexp.
152 function makeSqlClause($sql_clause_cb) {
153 $this->_sql_clause_cb = $make_sql_clause_cb;
154 return $this->_sql_clause($this->_tree);
157 // get away with the callback and use a db-specific search class instead.
158 // "WikiDB_backend_PearDB_search"
159 // methods named as the op's.
160 function makeSqlClauseObj(&$sql_search_cb) {
161 $this->_sql_clause_cb = $sql_search_cb;
162 return $this->_sql_clause_obj($this->_tree);
165 function _sql_clause($node) {
167 /* case 'EXACT': // word => word
168 return $this->_sql_clause_cb->call($node->word);
169 case 'STARTS_WITH': // word => word%
170 return $this->_sql_clause_cb->call($node->word);
171 case 'ENDS_WITH': // word => %word
172 return $this->_sql_clause_cb->call($node->word);
174 case 'WORD': // word => %word%
175 return $this->_sql_clause_cb->call($node->word);
177 return "NOT (" . $this->_sql_clause($node->leaves[0]) . ")";
180 $subclauses = array();
181 foreach ($node->leaves as $leaf)
182 $subclauses[] = "(" . $this->_sql_clause($leaf) . ")";
183 return join(" $node->op ", $subclauses);
185 assert($node->op == VOID);
190 function _sql_clause_obj($node) {
193 return "NOT (" . $this->_sql_clause_cb->call($node->leaves[0]) . ")";
196 $subclauses = array();
197 foreach ($node->leaves as $leaf)
198 $subclauses[] = "(" . $this->_sql_clause_obj($leaf) . ")";
199 return join(" $node->op ", $subclauses);
203 return $this->_sql_clause_cb->call($node);
208 * Get printable representation of the parse tree.
210 * This is for debugging only.
211 * @return string Printable parse tree.
213 function asString() {
214 return $this->_as_string($this->_tree);
217 function _as_string($node, $indent = '') {
220 return $indent . "WORD: $node->word";
222 return $indent . "VOID";
224 $lines = array($indent . $node->op . ":");
226 foreach ($node->leaves as $leaf)
227 $lines[] = $this->_as_string($leaf, $indent);
228 return join("\n", $lines);
234 * This is a TextSearchQuery which matches nothing.
236 class NullTextSearchQuery extends TextSearchQuery {
238 * Create a new query.
240 * @see TextSearchQuery
242 function NullTextSearchQuery() {}
243 function asRegexp() { return '/^(?!a)a/x'; }
244 function match($string) { return false; }
245 function getHighlightRegexp() { return ""; }
246 function makeSqlClause($make_sql_clause_cb) { return "(1 = 0)"; }
247 function asString() { return "NullTextSearchQuery"; }
251 ////////////////////////////////////////////////////////////////
253 // Remaining classes are private.
255 ////////////////////////////////////////////////////////////////
257 * Virtual base class for nodes in a TextSearchQuery parse tree.
259 * Also servers as a 'VOID' (contentless) node.
261 class TextSearchQuery_node
266 * Optimize this node.
267 * @return object Optimized node.
269 function optimize() {
274 * @return regexp matching this node.
281 * @param bool True if this node has been negated (higher in the parse tree.)
282 * @return array A list of all non-negated words contained by this node.
284 function highlight_words($negated = false) {
290 * A whitespace seperated word.
292 class TextSearchQuery_node_word
293 extends TextSearchQuery_node {
296 function TextSearchQuery_node_word($word) {
301 return '(?=.*' . preg_quote($this->word, '/') . ')';
304 function highlight_words($negated = false) {
305 return $negated ? array() : array($this->word);
309 class TextSearchQuery_node_starts_with
310 extends TextSearchQuery_node_word {
311 var $op = "STARTS_WITH";
312 function regexp() { return '(?=' . preg_quote($this->word, '/') . ')'; }
314 class TextSearchQuery_node_ends_with
315 extends TextSearchQuery_node_word {
316 var $op = "ENDS_WITH";
317 function regexp() { return '(?=' . preg_quote($this->word, '/') . '.*)'; }
319 class TextSearchQuery_node_exact
320 extends TextSearchQuery_node_word {
322 function regexp() { return '(?=\B' . preg_quote($this->word, '/') . '\b)'; }
324 class TextSearchQuery_node_glob
325 extends TextSearchQuery_node_word {
327 function regexp() { return '(?=\B' . preg_quote(glob_to_pcre($this->word), '/') . '\b)'; }
329 class TextSearchQuery_node_regex
330 extends TextSearchQuery_node_word {
332 function regexp() { return '(?=\B' . preg_quote($this->word, '/') . '\b)'; }
335 class TextSearchQuery_node_pcre
336 extends TextSearchQuery_node_word {
338 function regexp() { return '(?=\B' . preg_quote($this->word, '/') . '\b)'; }
344 class TextSearchQuery_node_not
345 extends TextSearchQuery_node
349 function TextSearchQuery_node_not($leaf) {
350 $this->leaves = array($leaf);
353 function optimize() {
354 $leaf = &$this->leaves[0];
355 $leaf = $leaf->optimize();
356 if ($leaf->op == 'NOT')
357 return $leaf->leaves[0]; // ( NOT ( NOT x ) ) -> x
362 $leaf = &$this->leaves[0];
363 return '(?!' . $leaf->regexp() . ')';
366 function highlight_words($negated = false) {
367 return $this->leaves[0]->highlight_words(!$negated);
372 * Virtual base class for 'AND' and 'OR conjoins.
374 class TextSearchQuery_node_binop
375 extends TextSearchQuery_node
377 function TextSearchQuery_node_binop($leaves) {
378 $this->leaves = $leaves;
381 function _flatten() {
382 // This flattens e.g. (AND (AND a b) (OR c d) e)
383 // to (AND a b e (OR c d))
385 foreach ($this->leaves as $leaf) {
386 $leaf = $leaf->optimize();
387 if ($this->op == $leaf->op)
388 $flat = array_merge($flat, $leaf->leaves);
392 $this->leaves = $flat;
395 function optimize() {
397 assert(!empty($this->leaves));
398 if (count($this->leaves) == 1)
399 return $this->leaves[0]; // (AND x) -> x
403 function highlight_words($negated = false) {
405 foreach ($this->leaves as $leaf)
406 array_splice($words,0,0,
407 $leaf->highlight_words($negated));
413 * A (possibly multi-argument) 'AND' conjoin.
415 class TextSearchQuery_node_and
416 extends TextSearchQuery_node_binop
420 function optimize() {
423 // Convert (AND (NOT a) (NOT b) c d) into (AND (NOT (OR a b)) c d).
424 // Since OR's are more efficient for regexp matching:
425 // (?!.*a)(?!.*b) vs (?!.*(?:a|b))
427 // Suck out the negated leaves.
429 foreach ($this->leaves as $key => $leaf) {
430 if ($leaf->op == 'NOT') {
431 $nots[] = $leaf->leaves[0];
432 unset($this->leaves[$key]);
436 // Combine the negated leaves into a single negated or.
438 $node = ( new TextSearchQuery_node_not
439 (new TextSearchQuery_node_or($nots)) );
440 array_unshift($this->leaves, $node->optimize());
443 assert(!empty($this->leaves));
444 if (count($this->leaves) == 1)
445 return $this->leaves[0]; // (AND x) -> x
451 foreach ($this->leaves as $leaf)
452 $regexp .= $leaf->regexp();
458 * A (possibly multi-argument) 'OR' conjoin.
460 class TextSearchQuery_node_or
461 extends TextSearchQuery_node_binop
466 // We will combine any of our direct descendents which are WORDs
467 // into a single (?=.*(?:word1|word2|...)) regexp.
472 foreach ($this->leaves as $leaf) {
473 if ($leaf->op == 'WORD')
474 $words[] = preg_quote($leaf->word, '/');
476 $regexps[] = $leaf->regexp();
480 array_unshift($regexps,
481 '(?=.*' . $this->_join($words) . ')');
483 return $this->_join($regexps);
486 function _join($regexps) {
487 assert(count($regexps) > 0);
489 if (count($regexps) > 1)
490 return '(?:' . join('|', $regexps) . ')';
497 ////////////////////////////////////////////////////////////////
500 // op's (and, or, not) are forced to lowercase in the tokenizer.
502 ////////////////////////////////////////////////////////////////
503 define ('TSQ_TOK_WORD', 1);
504 define ('TSQ_TOK_BINOP', 2);
505 define ('TSQ_TOK_NOT', 4);
506 define ('TSQ_TOK_LPAREN', 8);
507 define ('TSQ_TOK_RPAREN', 16);
508 define ('TSQ_TOK_STARTS_WITH', 32);
509 define ('TSQ_TOK_ENDS_WITH', 64);
510 define ('TSQ_TOK_EXACT', 128);
511 define ('TSQ_TOK_REGEX_POSIX', 256);
512 define ('TSQ_TOK_REGEX_PCRE', 512);
513 define ('TSQ_TOK_REGEX_GLOB', 1024);
515 class TextSearchQuery_Parser
518 * This is a simple recursive descent parser, based on the following grammar:
533 * atom : '(' list ')'
538 * The terminal tokens are:
545 * /[^-()\s][^()\s]* WORD
552 * /regex/ PCRE-style REGEX
553 * re:WORD POSIX-style REGEX
556 function parse ($search_expr, $case_exact=false, $regex='auto') {
557 $this->lexer = new TextSearchQuery_Lexer($search_expr, $case_exact, $regex);
558 $tree = $this->get_list('toplevel');
559 assert($this->lexer->eof());
564 function get_list ($is_toplevel = false) {
567 // token types we'll accept as words (and thus expr's) for the
568 // purpose of error recovery:
569 $accept_as_words = TSQ_TOK_NOT | TSQ_TOK_BINOP;
571 $accept_as_words |= TSQ_TOK_LPAREN | TSQ_TOK_RPAREN;
573 while ( ($expr = $this->get_expr())
574 || ($expr = $this->get_word($accept_as_words)) ) {
581 return new TextSearchQuery_node;
585 return new TextSearchQuery_node_and($list);
588 function get_expr () {
589 if ( !($expr = $this->get_atom()) )
592 $savedpos = $this->lexer->tell();
593 while ( ($op = $this->lexer->get(TSQ_TOK_BINOP)) ) {
594 if ( ! ($right = $this->get_atom()) ) {
599 $expr = new TextSearchQuery_node_and(array($expr, $right));
602 $expr = new TextSearchQuery_node_or(array($expr, $right));
605 $savedpos = $this->lexer->tell();
607 $this->lexer->seek($savedpos);
613 function get_atom() {
614 if ($word = $this->get_word(TSQ_TOK_WORD + TSQ_TOK_STARTS_WITH + TSQ_TOK_ENDS_WITH
615 + TSQ_TOK_EXACT + TSQ_TOK_REGEX_GLOB + TSQ_TOK_REGEX_PCRE
616 + TSQ_TOK_REGEX_POSIX))
619 $savedpos = $this->lexer->tell();
620 if ( $this->lexer->get(TSQ_TOK_LPAREN) ) {
621 if ( ($list = $this->get_list()) && $this->lexer->get(TSQ_TOK_RPAREN) )
624 elseif ( $this->lexer->get(TSQ_TOK_NOT) ) {
625 if ( ($atom = $this->get_atom()) )
626 return new TextSearchQuery_node_not($atom);
628 $this->lexer->seek($savedpos);
632 function get_word($accept = TSQ_TOK_WORD) {
633 if ( $accept & TSQ_TOK_WORD and ($word = $this->lexer->get(TSQ_TOK_WORD)) )
634 return new TextSearchQuery_node_word($word);
635 if ( $accept & TSQ_TOK_STARTS_WITH and ($word = $this->lexer->get(TSQ_TOK_STARTS_WITH)) )
636 return new TextSearchQuery_node_starts_with($word);
637 if ( $accept & TSQ_TOK_ENDS_WITH and ($word = $this->lexer->get(TSQ_TOK_ENDS_WITH)) )
638 return new TextSearchQuery_node_ends_with($word);
639 if ( $accept & TSQ_TOK_EXACT and ($word = $this->lexer->get(TSQ_TOK_EXACT)) )
640 return new TextSearchQuery_node_exact($word);
641 if ( $accept & TSQ_TOK_REGEX_GLOB and ($word = $this->lexer->get(TSQ_TOK_REGEX_GLOB)) )
642 return new TextSearchQuery_node_glob($word);
643 if ( $accept & TSQ_TOK_REGEX_POSIX and ($word = $this->lexer->get(TSQ_TOK_REGEX_POSIX)) )
644 return new TextSearchQuery_node_regex($word);
645 if ( $accept & TSQ_TOK_REGEX_PCRE and ($word = $this->lexer->get(TSQ_TOK_REGEX_PCRE)) )
646 return new TextSearchQuery_node_pcre($word);
651 //TODO: support glob-style regex: $regex='glob'
652 class TextSearchQuery_Lexer {
653 function TextSearchQuery_Lexer ($query_str, $case_exact=false, $regex='auto') {
654 $this->tokens = $this->tokenize($query_str, $case_exact, $regex);
662 function seek($pos) {
667 return $this->pos == count($this->tokens);
670 function tokenize($string, $case_exact=false, $regex='auto') {
672 $buf = $case_exact ? ltrim($string) : strtolower(ltrim($string));
673 while (!empty($buf)) {
674 if (preg_match('/^(and|or)\b\s*/i', $buf, $m)) {
675 $val = strtolower($m[1]);
676 $type = TSQ_TOK_BINOP;
678 elseif (preg_match('/^(-|not\b)\s*/i', $buf, $m)) {
679 $val = strtolower($m[1]);
682 elseif (preg_match('/^([()])\s*/', $buf, $m)) {
684 $type = $m[1] == '(' ? TSQ_TOK_LPAREN : TSQ_TOK_RPAREN;
686 elseif (preg_match('/^re:([^-()][^()\s]*)\s*/', $buf, $m)) {
687 $regex = true; // posix-style
689 $type = TSQ_TOK_REGEX_POSIX;
691 elseif (preg_match('/^\/([^-()][^()\s]*)\/\s*/', $buf, $m)) {
692 $regex = true; // pcre-style
694 $type = TSQ_TOK_REGEX_PCRE;
696 elseif ($regex and preg_match('/^\^([^-()][^()\s]*)\$\s*/', $buf, $m)) {
698 $type = TSQ_TOK_EXACT;
700 elseif ($regex and preg_match('/^\^([^-()][^()\s]*)\s*/', $buf, $m)) {
702 $type = TSQ_TOK_STARTS_WITH;
704 elseif ($regex and preg_match('/^([^-()][^()\s]*)\*\s*/', $buf, $m)) {
706 $type = TSQ_TOK_STARTS_WITH;
708 elseif ($regex and preg_match('/^\*([^-()][^()\s]*)\s*/', $buf, $m)) {
710 $type = TSQ_TOK_ENDS_WITH;
712 elseif (preg_match('/^ " ( (?: [^"]+ | "" )* ) " \s*/x', $buf, $m)) {
713 $val = str_replace('""', '"', $m[1]);
714 $type = TSQ_TOK_WORD;
716 elseif (preg_match("/^ ' ( (?:[^']+|'')* ) ' \s*/x", $buf, $m)) {
717 $val = str_replace("''", "'", $m[1]);
718 $type = TSQ_TOK_WORD;
720 elseif (preg_match('/^([^-()][^()\s]*)\s*/', $buf, $m)) {
722 $type = TSQ_TOK_WORD;
728 $buf = substr($buf, strlen($m[0]));
730 /* refine the simple parsing from above: bla*bla, bla?bla
732 if ($regex and $type == TSQ_TOK_WORD) {
733 if (substr($val,0,1) == "^")
734 $type = TSQ_TOK_STARTS_WITH;
735 elseif (substr($val,0,1) == "*")
736 $type = TSQ_TOK_ENDS_WITH;
737 elseif (substr($val,-1,1) == "*")
738 $type = TSQ_TOK_STARTS_WITH;
740 $tokens[] = array($type, $val);
745 function get($accept) {
746 if ($this->pos >= count($this->tokens))
749 list ($type, $val) = $this->tokens[$this->pos];
750 if (($type & $accept) == 0)
762 // c-hanging-comment-ender-p: nil
763 // indent-tabs-mode: nil