基于Sphinx的实例解析

时间:2022-04-11 21:02:31
这里我主要讲重点,第一个是基于discuz的索引配置文件,这个配置文件比较灵活,可以根据不同的需求来配置
  #
  # LinuxTone full-text search index configuration
  #

  # Main source: the first post of every thread in the Discuz posts table.
  # Indexing cdb_posts makes subject, message and author searchable in a
  # single query.
  source lt_posts
  {
      type               = mysql
      sql_host           = 127.0.0.1
      sql_port           = 3306
      sql_user           = root
      sql_pass           =
      sql_db             = lt_bbs

      sql_query_pre      = SET NAMES utf8
      sql_query          = SELECT pid,tid,fid,dateline,subject,message,author FROM cdb_posts where first=1
      sql_query_info     = SELECT * FROM cdb_posts WHERE pid=$id

      sql_attr_uint      = fid
      sql_attr_timestamp = dateline
  }

  # Main index built from the lt_posts source.
  index lt_posts
  {
      source           = lt_posts
      path             = /data/sphinx/data/lt_posts
      docinfo          = extern
      mlock            = 0
      morphology       = none
      min_word_len     = 2
      html_strip       = 1
      ngram_len        = 0
      charset_type     = zh_cn.utf-8
      charset_dictpath = /usr/local/mmseg-3.2.13/etc/
  }

  ########## Delta index ##########

  # Delta source: inherits every setting of lt_posts and overrides only the
  # main query. The delta selects posts newer than the current timestamp
  # minus the chosen interval (here 10 hours).
  source delta : lt_posts
  {
      sql_query = SELECT pid,tid,fid,dateline,subject,message,author FROM cdb_posts where first=1 and dateline > unix_timestamp()-3600*10
  }

  # Delta index: same settings as lt_posts, with its own source and path.
  index delta : lt_posts
  {
      source = delta
      path   = /data/sphinx/data/lt_delta
  }

  indexer
  {
      mem_limit = 32M
  }

  searchd
  {
      port            = 9312
      read_timeout    = 5
      max_children    = 30
      max_matches     = 10000

      log             = /data/sphinx/var/log/searchd.log
      query_log       = /data/sphinx/var/log/query.log
      pid_file        = /data/sphinx/var/log/searchd.pid

      seamless_rotate = 1
      preopen_indexes = 0
      unlink_old      = 1
  }

Sphinx 最主要的就是这个配置文件。增量索引部分可以写一个脚本放到 crontab 里定时执行(见下文的 search-delta.sh)。

下面介绍 Sphinx 的 PHP 调用部分。这里的接口使用 PHP 的 sphinx 扩展,可以通过 pecl 命令(pecl install sphinx)安装,或者从 http://pecl.php.net/package/sphinx 下载源码编译安装。
  <?php
  /**
   * LinuxTone full-text search service.
   *
   * Reads the search phrase from $_GET['q'] and the 1-based page number from
   * $_GET['page'], queries the Sphinx daemon on 127.0.0.1:9312, loads the
   * matching posts from cdb_posts, builds excerpts for subject and message,
   * and finally includes the "lt_search" template.
   *
   * Variables set before the template is included: $q, $page, $pages,
   * $query_totals, $query_time, $data and $multipage.
   */
  define('IN_DISCUZ', TRUE);
  require_once './include/common.inc.php';

  // Accept only a scalar string for q; an array-valued parameter (q[]=...)
  // would otherwise reach strip_tags() and raise a warning.
  $q = isset($_GET['q']) && is_string($_GET['q']) && !empty($_GET['q']) ? $_GET['q'] : '';
  $q = str_replace(array('<','>',' ','\'',','),array('','',' ','',''),strip_tags($q));

  // Page numbers are 1-based; anything missing or non-positive means page 1.
  $page = isset($_GET['page']) ? intval($_GET['page']) : 1;
  if($page < 1){
           $page = 1;
  }
  $perNum = 20;
  $offset = ($page - 1) * $perNum;

  $search = new SphinxClient();
  $search->setServer('127.0.0.1',9312);
  $search->setConnectTimeout(2);
  $search->setArrayResult(true);
  $search->setMatchMode(SPH_MATCH_ANY);
  $search->setRankingMode(SPH_RANK_PROXIMITY_BM25);
  $search->setSortMode(SPH_SORT_EXTENDED,'@relevance desc,@weight desc');
  $search->setLimits($offset,$perNum);
  $search->setFieldWeights(array('subject'=>2000,'message'=>0));

  // Defaults so that every variable exists even when there is no query,
  // no result, or the search daemon cannot be reached.
  $rs = array();
  $pages = 0;
  $multipage = '';
  $query_totals = $query_time = 0;
  if(!empty($q)){
           $rs = $search->Query($q,"*");
           if(is_array($rs)){
                    $pages = ceil($rs['total']/$perNum);

                    $query_totals = $rs['total_found'];
                    $query_time = $rs['time'];
           }else{
                    // Query() returns false on failure; treat it as "no results".
                    $rs = array();
           }
  }

  $data = $title = $content = array();

  if(!empty($rs['matches']) && $page <= $pages){
           // Document ids in the order Sphinx ranked them; cast to int because
           // they are interpolated into the SQL statement below.
           $pids = array();
           foreach($rs['matches'] as $v){
                    $pids[] = intval($v['id']);
           }
           $pid = implode(',',$pids);
           $sql = "select pid,tid,author,authorid,subject,message,dateline from cdb_posts where pid IN($pid) and status ='0' and invisible='0'";

           $rows = array();
           $query = $db->query($sql);
           while($row = $db->fetch_array($query)){
                    $rows[$row['pid']] = $row;
           }

           // The IN() query has no ORDER BY, so its row order is unspecified.
           // Walk the Sphinx ids instead to keep the relevance ranking.
           foreach($pids as $id){
                    if(!isset($rows[$id])){
                             // Indexed post that the status/invisible filter removed.
                             continue;
                    }
                    $row = $rows[$id];
                    $data[] = $row;
                    $title[] = $row['subject'];
                    // Strip BBCode tags one at a time. The tag body stops at the
                    // first "]" so that the text between two tags is preserved.
                    $content[] = preg_replace('/\[\/?(b|img|url|color|s|hr|p|list|i|align|email|u|font|code|hide|table|tr|td|th|attach|indent|float)[^\]\r\n]*\]/','',strip_tags($row['message']));
           }

           if(!empty($data)){
                    // Highlight the search terms.
                    // NOTE(review): both markers are empty, so matched terms are not
                    // visually marked; the intended markup may be missing - confirm.
                    $opts = array();
                    $opts['before_match'] = '';
                    $opts['after_match'] = '';

                    // BuildExcerpts() returns false on failure; in that case keep
                    // the plain subject / tag-stripped message.
                    $excerpts = $search->BuildExcerpts($title,'lt_posts',$q,$opts);
                    if(is_array($excerpts)){
                             $title = $excerpts;
                    }
                    $excerpts = $search->BuildExcerpts($content,'lt_posts',$q,$opts);
                    if(is_array($excerpts)){
                             $content = $excerpts;
                    }

                    foreach($data as $k=>$v){
                             $data[$k]['subject'] = $title[$k];
                             $data[$k]['message'] = $content[$k];
                    }
           }

           $url = "s.php?q=".urlencode($q);
           $multipage = multi($rs['total'], $perNum, $page, $url);
  }

  include template("lt_search");
  ?>

跑主索引的shell脚本search-index.sh
  #!/bin/bash
  #
  # Rebuild the full (main) BBS search index.
  #
  # Indexer output is appended to a log file named after the current
  # date and hour.
  #
  csft_home=/usr/local/csft-3.2.13
  log_file="/data/sphinx/var/$(date "+%Y-%m-%d-%H").log"

  "${csft_home}/bin/indexer" -c "${csft_home}/etc/lt_posts.conf" --rotate lt_posts >> "${log_file}"

跑增量索引的shell脚本search-delta.sh
  #!/bin/bash
  #
  # Rebuild the BBS search delta index.
  #
  csft_home=/usr/local/csft-3.2.13
  conf_file="${csft_home}/etc/lt_posts.conf"

  # Build the delta index.
  "${csft_home}/bin/indexer" -c "${conf_file}" --rotate delta

  # Merge the main index and the delta index (left disabled):
  #"${csft_home}/bin/indexer" --config "${conf_file}" --rotate --merge lt_posts delta

文章来源:http://blog.chinaunix.net/uid-25168129-id-91574.html