oracle模糊查询:全文索引方式(三)

时间:2022-09-22 12:21:46

--test:

DATABASE:dwtest(210开发库)

SQL> desc iquery.lpx_kw_tmp02
Name               Type          Nullable Default Comments
------------------ ------------- -------- ------- --------
KEYWORDS           VARCHAR2(256) Y                        
COUNTRY_ID         VARCHAR2(10)  Y                        
COUNTRY_NAME       VARCHAR2(256) Y                        
REGION_ID          NUMBER        Y                        
REGIN_NAME         VARCHAR2(128) Y                        
PRE_REGIN_ID       NUMBER        Y                        
PRE_REGIN_ENAME    VARCHAR2(128) Y                        
SRH_PV_THIS_5MON   NUMBER        Y                        
CATEGORY_LEAF_ID   NUMBER        Y                        
CATEGORY_ROOT_ID   NUMBER        Y                        
CATEGORY_LEVEL2_ID NUMBER        Y                        
CATEGORY_LEVEL3_ID NUMBER        Y   

--从业务上利用     
CREATE TABLE iquery.lpx_kw_tmp02_bak TABLESPACE TBS_EN_DSS_2009 AS
SELECT keywords,
       REGIN_NAME,
       CATEGORY_ROOT_ID,
       SUM(SRH_PV_THIS_5MON) AS SRH_PV_THIS_5MON
  FROM iquery.lpx_kw_tmp02
 WHERE CATEGORY_ROOT_ID IS NOT NULL
 GROUP BY keywords,
       REGIN_NAME,
       CATEGORY_ROOT_ID;

SQL> desc iquery.lpx_kw_tmp02_bak
Name             Type          Nullable Default Comments
---------------- ------------- -------- ------- --------
KEYWORDS         VARCHAR2(256) Y                        
REGIN_NAME       VARCHAR2(128) Y                        
CATEGORY_ROOT_ID NUMBER        Y                        
SRH_PV_THIS_5MON NUMBER        Y 

SQL> select count(1) from iquery.lpx_kw_tmp02_bak;
 
  COUNT(1)
----------
   7799710
 

--模糊匹配'mechanical'
SQL>   SELECT a.regin_name
  2           ,SUM(a.srh_pv_this_5mon) AS srh_pv_this_5mon
  3      FROM iquery.lpx_kw_tmp02_bak a
  4     WHERE keywords LIKE '%mechanical%'
  5       AND category_root_id = 80
  6     group by regin_name
  7    ;
 
 
REGIN_NAME                                                                       SRH_PV_THIS_5MON
-------------------------------------------------------------------------------- ----------------
Western Europe                                                                                  3
North America                                                                                  10
Middle East                                                                                     1
OTHER                                                                                           1
Asia                                                                                           21
--耗时:2.828S


SQL>   SELECT /*+parallel(a 4) */a.regin_name
  2           ,SUM(a.srh_pv_this_5mon) AS srh_pv_this_5mon
  3      FROM iquery.lpx_kw_tmp02_bak a
  4     WHERE keywords LIKE '%mechanical%'
  5       AND category_root_id = 80
  6     group by regin_name
  7    ;
 
 
REGIN_NAME                                                                       SRH_PV_THIS_5MON
-------------------------------------------------------------------------------- ----------------
North America                                                                                  10
OTHER                                                                                           1
Western Europe                                                                                  3
Middle East                                                                                     1
Asia                                                                                           21

--耗时1.11S

SQL>   EXPLAIN PLAN FOR SELECT a.regin_name
  2           ,SUM(a.srh_pv_this_5mon) AS srh_pv_this_5mon
  3      FROM iquery.lpx_kw_tmp02_bak a
  4     WHERE keywords LIKE '%mechanical%'
  5       AND category_root_id = 80
  6     group by regin_name
  7    ;
 
Explained
 
SQL> select * from table(dbms_xplan.display());
 
PLAN_TABLE_OUTPUT
--------------------------------------------------------------------------------
Plan hash value: 3331152321
--------------------------------------------------------------------------------
| Id  | Operation          | Name             | Rows  | Bytes | Cost (%CPU)| Tim
--------------------------------------------------------------------------------
|   0 | SELECT STATEMENT   |                  |  5390 |  1168K| 60651   (2)| 00:
|   1 |  HASH GROUP BY     |                  |  5390 |  1168K| 60651   (2)| 00:
|*  2 |   TABLE ACCESS FULL| LPX_KW_TMP02_BAK |  5390 |  1168K| 60649   (2)| 00:
--------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
   2 - filter("CATEGORY_ROOT_ID"=80 AND "KEYWORDS" LIKE '%mechanical%')
Note
-----
   - 'PLAN_TABLE' is old version
   - dynamic sampling used for this statement
 
19 rows selected


--创建全文索引:
1、授权
以sys登录
SQL> grant ctxapp to etl;
 
Grant succeeded

以ctxsys登录
grant execute on ctx_ddl to etl;

2、建立索引
以etl连接
SQL> exec ctx_ddl.create_preference('etl_kw1','BASIC_LEXER');
 
PL/SQL procedure successfully completed
--其中:第一个参数可以随便命名,第二个参数必须是oracle规定的几个值之一。
--第二个参数可以是CHINESE_LEXER、chinese_vgram_lexer;CHINESE_LEXER建索引时间长、但查询速度、查询准确度都比chinese_vgram_lexer高

--建立后可以用如下方式查询:
SQL> SELECT * FROM CTX_PREFERENCES;
 
PRE_OWNER                      PRE_NAME                       PRE_CLASS                      PRE_OBJECT
------------------------------ ------------------------------ ------------------------------ ------------------------------
CTXSYS                         URL_DATASTORE                  DATASTORE                      URL_DATASTORE
CTXSYS                         FILE_DATASTORE                 DATASTORE                      FILE_DATASTORE
WKSYS                          WK_DATASTORE                   DATASTORE                      FILE_DATASTORE
CTXSYS                         DEFAULT_DATASTORE              DATASTORE                      DIRECT_DATASTORE
CTXSYS                         DIRECT_DATASTORE               DATASTORE                      DIRECT_DATASTORE
CTXSYS                         AUTO_FILTER                    FILTER                         AUTO_FILTER
CTXSYS                         MAIL_FILTER                    FILTER                         MAIL_FILTER
CTXSYS                         INSO_FILTER                    FILTER                         INSO_FILTER
CTXSYS                         NULL_FILTER                    FILTER                         NULL_FILTER
WKSYS                          WK_FILTER                      FILTER                         NULL_FILTER
CTXSYS                         DEFAULT_EXTRACT_LEXER          LEXER                          AUTO_LEXER
WKSYS                          WK_KOREAN_LEXER                LEXER                          KOREAN_MORPH_LEXER
WKSYS                          WK_LEXER                       LEXER                          MULTI_LEXER
ETL                            ETL_LEXER                      LEXER                          CHINESE_LEXER
ETL                            ETL_LPXUAN                     LEXER                          CHINESE_LEXER
ETL                            ETL_KW                         LEXER                          CHINESE_LEXER
CTXSYS                         MY_LEXER1                      LEXER                          CHINESE_VGRAM_LEXER
WKSYS                          WK_CHINESE_LEXER               LEXER                          CHINESE_VGRAM_LEXER
WKSYS                          WK_JAPANESE_LEXER              LEXER                          JAPANESE_VGRAM_LEXER
CTXSYS                         MY_LEXER                       LEXER                          BASIC_LEXER
 
PRE_OWNER                      PRE_NAME                       PRE_CLASS                      PRE_OBJECT
------------------------------ ------------------------------ ------------------------------ ------------------------------
CTXSYS                         DEFAULT_LEXER                  LEXER                          BASIC_LEXER
CTXSYS                         BASIC_LEXER                    LEXER                          BASIC_LEXER
WKSYS                          WK_BASIC_LEXER                 LEXER                          BASIC_LEXER
CTXSYS                         DEFAULT_WORDLIST               WORDLIST                       BASIC_WORDLIST
CTXSYS                         BASIC_WORDLIST                 WORDLIST                       BASIC_WORDLIST
WKSYS                          WK_WORDLIST                    WORDLIST                       BASIC_WORDLIST
CTXSYS                         MYSTORE                        STORAGE                        BASIC_STORAGE
CTXSYS                         DEFAULT_STORAGE                STORAGE                        BASIC_STORAGE
CTXSYS                         MYSTORE1                       STORAGE                        BASIC_STORAGE
WKSYS                          WK_STORAGE                     STORAGE                        BASIC_STORAGE
CTXSYS                         ENTITY_STORAGE_DR              STORAGE                        ENTITY_STORAGE
CTXSYS                         ENTITY_STORAGE_D               STORAGE                        ENTITY_STORAGE
CTXSYS                         ENTITY_STORAGE_R               STORAGE                        ENTITY_STORAGE
CTXSYS                         ENTITY_STORAGE                 STORAGE                        ENTITY_STORAGE
CTXSYS                         DEFAULT_CLASSIFIER             CLASSIFIER                     RULE_CLASSIFIER
CTXSYS                         DEFAULT_CLUSTERING             CLUSTERING                     KMEAN_CLUSTERING
 
36 rows selected

SQL> create index idx_item_kw ON iquery.lpx_kw_tmp02_bak(keywords) indextype is ctxsys.context parameters('lexer etl_kw1');
 
Index created
--parameters括号中都第一个值lexer是固定的,第二个值是上一步建立的preference名称

--耗时:5056S

3、定期同步索引(同步索引为增量同步)
--context这种类型的索引需要手工同步
--同步:根据新增记录的文本内容更新全文搜索的索引。
SQL> create or replace procedure rel_kw_sync_index as
      begin
      ctx_ddl.sync_index('idx_item_rel_kw');
      end;
      /

Procedure created.

Elapsed: 00:00:00.08
SQL> VARIABLE jobno number;
SQL> BEGIN
2 DBMS_JOB.SUBMIT(:jobno,'rel_kw_sync_index();',
3 SYSDATE, 'SYSDATE + (1/24/4)');
4 commit;
5 END;
6 /

PL/SQL procedure successfully completed.

Elapsed: 00:00:00.27

4、定期优化索引
--优化:根据被删除记录清除全文搜索索引中的垃圾
SQL> create or replace procedure rel_kw_optimize_index as
   2   begin
   3   ctx_ddl.optimize_index('rel_kw_sync_index','FULL');
   4   end;
   5   /

SQL> VARIABLE jobno number;
SQL> BEGIN
2 DBMS_JOB.SUBMIT(:jobno,'rel_kw_optimize_index();',
3 SYSDATE, 'SYSDATE + 1');
4 commit;
5 END;
6 /
Procedure created.



全文检索的缺点是不能实时保证查询的正确性,所以要合理地定期同步索引和优化索引。

--造测试数据
DROP TABLE iquery.lpx_kw_test;
CREATE TABLE iquery.lpx_kw_test TABLESPACE TBS_EN_DSS_2009
AS
SELECT SUBSTR(keywords, INSTR(keywords, ' ', 1)+1, INSTR(keywords, ' ', 1, 2) - INSTR(keywords, ' ', 1) - 1) AS kw, a.* 
  FROM iquery.lpx_kw_tmp02_bak1  a
 WHERE length(keywords) - length(REPLACE(keywords, ' ', '')) > 2 AND ROWNUM < 1000 + 1;


--根据行业和大洲进行匹配
DROP TABLE iquery.lpx_kw_tmp03;
CREATE TABLE iquery.lpx_kw_tmp03(
 KW                                                 VARCHAR2(256)
 ,REGIN_NAME                                         VARCHAR2(128)
 ,CATEGORY_ROOT_ID                                   NUMBER
 ,SRH_PV_THIS_5MON                                   NUMBER
);
set serveroutput ON;

DECLARE 
  cnt               NUMBER;
  start_date        DATE;
  end_date          DATE;
 
CURSOR c1
IS
 select kw, keywords, regin_name, category_root_id FROM iquery.lpx_kw_test;
 
BEGIN

 cnt := 0;
 start_date := SYSDATE;
 
 FOR ref_cursor IN c1
 LOOP 
 INSERT INTO iquery.lpx_kw_tmp03
 SELECT ref_cursor.kw
        ,ref_cursor.regin_name
        ,ref_cursor.category_root_id
        ,SUM(a.srh_pv_this_5mon) AS srh_pv_this_5mon
   FROM iquery.lpx_kw_tmp02_bak a   --全文索引
  WHERE contains(a.keywords, ref_cursor.kw) > 0
    AND a.category_root_id = ref_cursor.category_root_id
    AND a.regin_name = ref_cursor.regin_name
  GROUP BY ref_cursor.kw
        ,ref_cursor.regin_name
        ,ref_cursor.category_root_id;
 COMMIT;
 cnt := cnt + 1;
 
 END LOOP;
 end_date := SYSDATE;
 
 DBMS_OUTPUT.PUT_LINE('total records:'||cnt);
 DBMS_OUTPUT.PUT_LINE('total time(ms):'||ROUND(TO_NUMBER(END_DATE - START_DATE) * 24 * 60 * 60 * 1000));
 DBMS_OUTPUT.PUT_LINE('avg time(ms):'||ROUND(TO_NUMBER(END_DATE - START_DATE) * 24 * 60 * 60 * 1000)/cnt);
END;
/


total records:100
total time(ms):34000
avg time(ms):340

PL/SQL procedure successfully completed.

 

 

Oracle实现全文检索,其机制其实很简单。即通过Oracle专利的词法分析器(lexer),将文章中所有的表意单元(Oracle称为 term)找出来,记录在一组以dr$开头的表中,同时记下该term出现的位置、次数、hash值等信息。检索时,Oracle从这组表中查找相应的 term,并计算其出现频率,根据某个算法来计算每个文档的得分(score),即所谓的‘匹配率’。而lexer则是该机制的核心,它决定了全文检索的 效率。Oracle针对不同的语言提供了不同的lexer,而我们通常能用到其中的三个:
     basic_lexer:针对英语。它能根据空格和标点来将英语单词从句子中分离,还能自动将一些出现频率过高已经失去检索意义的单词作为‘垃圾’处理, 如if , is等,具有较高的处理效率。但该lexer应用于汉语则有很多问题,由于它只认空格和标点,而汉语的一句话中通常不会有空格,因此,它会把整句话作为一 个term,事实上失去检索能力。以‘中国人民站起来了’这句话为例,basic_lexer分析的结果只有一个term ,就是‘中国人民站起来了’。此时若检索‘中国’,将检索不到内容。
     chinese_vgram_lexer:专门的汉语分析器,支持所有汉字字符集 (ZHS16CGB231280ZHS16GBKZHT32EUCZHT16BIG5ZHT32TRISZHT16MSWIN950ZHT16HKSCSUTF8)。 该分析器按字为单元来分析汉语句子。‘中国人民站起来了’这句话,会被它分析成如下几个term: ‘中’,‘中国’,‘国人’,‘人民’,‘民站’,‘站起’,起来’,‘来了’,‘了’。可以看出,这种分析方法,实现算法很简单,并且能实现‘一网打 尽’,但效率则是差强人意。