BooleanSearch.php 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. <?php
  2. /**
  3. * Search lib by John-Paul Durrieu
  4. */
// Token types.
// tokenize() returns these as the first element of each (tokentype, token)
// tuple, and parsecriteria() switches on them. The same constants are used as
// the keys of the keyword/operator lookup table inside tokenize().
define("TOKEN_STRING", 0);     // plain search term (not a keyword or operator)
define("TOKEN_AND", 1);        // the word "and" (matched case-insensitively)
define("TOKEN_OR", 2);         // the word "or" (matched case-insensitively)
define("TOKEN_NOT", 3);        // the word "not" (matched case-insensitively)
define("TOKEN_LEFTPAREN", 4);  // "("
define("TOKEN_RIGHTPAREN", 5); // ")"
define("TOKEN_PLUS", 6);       // "+"
define("TOKEN_MINUS", 7);      // "-"
  14. /**
  15. * Tokenize a search criteria string in pseudo-google format
  16. * e.g: this that OR (this and "the other") +this -that \+also
  17. *
  18. * @returns array of (tokentype, token) tuples.
  19. * e.g: ((TOKEN_STRING, "this"),
  20. * (TOKEN_STRING, "that"),
  21. * (TOKEN_OR, "OR"),
  22. * (TOKEN_LEFTPAREN, "("),
  23. * (TOKEN_STRING, "this"),
  24. * (TOKEN_AND, "and"),
  25. * (TOKEN_STRING, "the other"),
  26. * (TOKEN_RIGHTPAREN, ")"),
  27. * (TOKEN_PLUS, "+"),
  28. * (TOKEN_STRING, "this"),
  29. * (TOKEN_MINUS, "-"),
  30. * (TOKEN_STRING, "that"),
  31. * (TOKEN_STRING, "+also"))
  32. *
  33. * Based on example code in the PHP manual on php.net
  34. */
  35. function tokenize($criteria) {
  36. $tokens = array(
  37. TOKEN_STRING => '',
  38. TOKEN_AND => 'and',
  39. TOKEN_OR => 'or',
  40. TOKEN_NOT => 'not',
  41. TOKEN_LEFTPAREN => '(',
  42. TOKEN_RIGHTPAREN => ')',
  43. TOKEN_PLUS => '+',
  44. TOKEN_MINUS => '-');
  45. // automaton [states][chartypes] => actions
  46. // states:
  47. // STATE_SPACE 0
  48. // STATE_UNQUOTED 1
  49. // STATE_DOUBLEQUOTED 2
  50. // STATE_ESCAPED 3
  51. $chart = array(
  52. 0 => array(' '=>'', '"'=>'d', '\\'=>'ue', '+'=>'uaw ', '-'=>'uaw ', '('=>'uaw ', ')'=>'uaw ', 0 =>'ua'),
  53. 1 => array(' '=>'w ', '"'=>'wd', '\\'=>'e', '+'=>'waw', '-'=>'waw', '('=>'waw', ')'=>'waw', 0 =>'a'),
  54. 2 => array(' '=>'a', '"'=>'w ', '\\'=>'e', '+'=>'a', '-'=>'a', '('=>'a', ')'=>'a', 0 =>'a'),
  55. 3 => array(' '=>'ap', '"'=>'ap', '\\'=>'ap', '+'=>'ap', '-'=>'ap', '('=>'ap', ')'=>'ap', 0 =>'ap'));
  56. $state = 0; // STATE_SPACE
  57. $previous = ''; // stores current state when encountering a backslash (which changes $state to STATE_ESCAPED, but has to fall back into the previous $state afterwards)
  58. $out = array(); // the return value
  59. $word = '';
  60. $type = ''; // type of character
  61. for ($i=0; $i<=strlen($criteria); $i++) {
  62. $char = substr($criteria, $i, 1);
  63. $type = $char;
  64. if (!isset($chart[0][$type])) {
  65. $type = 0; //other
  66. // grab all consecutive non word-ending characters
  67. preg_match("/[ \+\-\(\)\"\\\]/", $criteria, $matches, PREG_OFFSET_CAPTURE, $i);
  68. if ($matches) {
  69. $matches = $matches[0];
  70. $char = substr($criteria, $i, $matches[1]-$i); // yep, $char length can be > 1
  71. $i = $matches[1] - 1;
  72. }else{
  73. // no more match on special characters, that must mean this is the last word!
  74. // the .= below is because we *might* be in the middle of a word that just contained special chars
  75. $word .= substr($criteria, $i);
  76. break; // jumps out of the for() loop
  77. }
  78. }
  79. $actions = $chart[$state][$type];
  80. for($j=0; $j<strlen($actions); $j++) {
  81. $act = substr($actions, $j, 1);
  82. if ($act == ' ') $state = 0; //STATE_SPACE
  83. if ($act == 'u') $state = 1; //STATE_UNQUOTED
  84. if ($act == 'd') $state = 2; //STATE_DOUBLEQUOTED
  85. if ($act == 'e') { $previous = $state; $state = 3; } //STATE_ESCAPED
  86. if ($act == 'a') $word .= $char;
  87. if ($act == 'p') $state = $previous;
  88. if ($act == 'w') {
  89. if (!empty($word)) {
  90. $tokentype = TOKEN_STRING;
  91. if ($state == 1) {
  92. //unquoted word, so look for keywords or operators
  93. $tokentype = array_search(strtolower($word), $tokens);
  94. if (!$tokentype) $tokentype = TOKEN_STRING;
  95. }
  96. $out[] = array($tokentype, $word);
  97. $word = '';
  98. }
  99. }
  100. } //for j
  101. } //for i
  102. if (!empty($word)) {
  103. $tokentype = TOKEN_STRING;
  104. if ($state == 1) {
  105. //unquoted word, so look for keywords or operators
  106. $tokentype = array_search(strtolower($word), $tokens);
  107. if (!$tokentype) $tokentype = TOKEN_STRING;
  108. }
  109. $out[] = array($tokentype, $word);
  110. $word = '';
  111. }
  112. return $out;
  113. } //tokenize
  114. /**
  115. * parse the criteria string according to a subset of the Google search syntax:
  116. *
  117. * example: this too AND +this OR (these AND "the other") -notthis
  118. *
  119. * @return array of criterias, each criteria being an array:
  120. * array (0 => operator, ' AND ',' OR ',' AND NOT '
  121. * 1 => value, the criteria's string value
  122. * 2 => wildcard flag, TRUE if wildcard matching, FALSE for strict matching
  123. * 3 => nesting) parentheses nesting level 0..n
  124. *
  125. * for convenience, the operator of the first criteria is blank.
  126. */
  127. function parsecriteria($criteria) {
  128. $results = array();
  129. $tokens = array();
  130. $thisresult = array('','',TRUE,0);
  131. $nesting = 0;
  132. //var_dump($criteria); //@@@
  133. //replace html quoting put there by some browsers, then tokenize
  134. $tokens = tokenize(str_replace ('&quot;', '"', $criteria));
  135. //var_dump($tokens); //@@@
  136. foreach ($tokens as $token) {
  137. switch ($token[0]) {
  138. case TOKEN_AND:
  139. $thisresult[0] = ' AND ';
  140. $thisresult[2] = TRUE; //reset wildcard in case of bad syntax
  141. break;
  142. case TOKEN_OR:
  143. $thisresult[0] = ' OR ';
  144. $thisresult[2] = TRUE; //reset wildcard in case of bad syntax
  145. break;
  146. case TOKEN_PLUS:
  147. $thisresult[2] = FALSE;
  148. break;
  149. case TOKEN_NOT:
  150. case TOKEN_MINUS:
  151. $thisresult[0] .= ' NOT '; //NOT or AND NOT
  152. break;
  153. case TOKEN_LEFTPAREN:
  154. $nesting += 1;
  155. $thisresult[2] = TRUE; //reset just in case of bad syntax
  156. break;
  157. case TOKEN_RIGHTPAREN:
  158. $nesting -= 1;
  159. $thisresult[2] = TRUE; //reset just in case of bad syntax
  160. break;
  161. default:
  162. // anything else -> output "as is"
  163. $thisresult[1] .= $token[1];
  164. $thisresult[3] = $nesting;
  165. $results[] = $thisresult;
  166. $thisresult = array(' AND ','',TRUE,0);
  167. break;
  168. }
  169. } //foreach $tokens
  170. return $results;
  171. } //parsecriteria
  172. function assemble_query ($criteria, $searchfields) {
  173. global $mysql, $username, $search;
  174. $whereCriterias = parsecriteria ($criteria);
  175. //var_dump($whereCriterias); //@@@
  176. $whereData = array();
  177. $columnNumber = 0;
  178. $whereClause = "";
  179. $nesting = 0;
  180. foreach ($whereCriterias as $mycriteria) {
  181. $whereClause .= $mycriteria[0];
  182. $thisnesting = $mycriteria[3];
  183. if ($thisnesting >= $nesting) {
  184. $whereClause .= str_repeat('(', $thisnesting - $nesting);
  185. } else {
  186. $whereClause .= str_repeat(')', $nesting - $thisnesting);
  187. }
  188. $nesting = $thisnesting;
  189. $firstcolumn = TRUE;
  190. $whereClause .= ' (';
  191. foreach ($searchfields as $column) {
  192. if ($firstcolumn) {
  193. $firstcolumn = FALSE;
  194. } else {
  195. $whereClause .= ' OR ';
  196. }
  197. if ($mycriteria[2]) {
  198. $whereClause .= "$column LIKE " . '\'' .'%' . $mysql->escape ($mycriteria[1]) . '%' . '\'' ;
  199. } else {
  200. /* no wildcard, so match exact words using a REGEXP */
  201. $whereClause .= "$column RLIKE " . '\'' . '[[:<:]]' . $mysql->escape ($mycriteria[1]) . '[[:>:]]' . '\'';
  202. }
  203. } //foreach $column
  204. $whereClause .= ')';
  205. } //foreach $whereCriterias
  206. $whereClause .= str_repeat(')', $nesting);
  207. $whereClause = trim ($whereClause);
  208. if ($whereClause != '') {
  209. $query = sprintf ("SELECT bookmark.title,
  210. bookmark.url,
  211. bookmark.description,
  212. UNIX_TIMESTAMP(bookmark.date) AS timestamp,
  213. bookmark.childof,
  214. bookmark.id,
  215. bookmark.favicon,
  216. bookmark.public,
  217. folder.name,
  218. folder.id AS fid,
  219. folder.public AS fpublic
  220. FROM bookmark LEFT JOIN folder ON bookmark.childof=folder.id
  221. WHERE bookmark.user='%s'
  222. AND bookmark.deleted!='1'
  223. AND ( %s )
  224. ORDER BY title",
  225. $mysql->escape ($username),
  226. $whereClause);
  227. }
  228. else {
  229. $query = false;
  230. }
  231. return $query;
  232. }
  233. ?>