Dear team,
I hope you are all doing good!
I wanted to ask for some hints to fix the following problem:
We have a huge dataset (>60 mio) of items. Each item has an item number. The item number has the datatype varchar2(50).
The format of the item number vary from vendor to vendor [refer to line 03-07 in the example below].
It is quite hard for a user who is searching for an item to have the correct notation in mind.
To support the searching for item numbers we want to skip all these characters './,=?{}\()[]-;~|$!>*%_&' [refer to line 16 in the example below].
This is our
first solution which is sadly
not working as expected:
------------------------------------------------------------------
/*line: 01*/ /*** PREPARATION ***/
/*line: 02*/ create table test_item_number (item_number varchar2(50));
/*line: 03*/ insert into test_item_number values ('1.234.5');
/*line: 04*/ insert into test_item_number values ('1-25466');
/*line: 05*/ insert into test_item_number values ('001/007');
/*line: 06*/ insert into test_item_number values ('AA.123');
/*line: 07*/ insert into test_item_number values ('ABC.PD');
/*line: 08*/ commit;
/*line: 09*/
/*line: 10*/ /*** DOMAIN PREFERENCES ***/
/*line: 12*/ --grant execute on ctx_ddl to hr (if necessary);
/*line: 13*/ BEGIN
/*line: 14*/ --LEXER
/*line: 15*/ ctx_ddl.create_preference('MyLexer', 'BASIC_LEXER');
/*line: 16*/ ctx_ddl.set_attribute('MyLexer' , 'SKIPJOINS','./,=?{}\()[]-;~|$!>*%_&');
/*line: 17*/
/*line: 18*/ --WORDLIST
/*line: 19*/ ctx_ddl.create_preference('MyWordList', 'BASIC_WORDLIST');
/*line: 20*/ ctx_ddl.set_attribute('MyWordList','PREFIX_INDEX','TRUE');
/*line: 21*/ ctx_ddl.set_attribute('MyWordList','PREFIX_MIN_LENGTH', '2');
/*line: 22*/ ctx_ddl.set_attribute('MyWordList','PREFIX_MAX_LENGTH','50');
/*line: 23*/ ctx_ddl.set_attribute('MyWordList','SUBSTRING_INDEX','YES');
/*line: 24*/ ctx_ddl.set_attribute('MyWordList','REVERSE_INDEX','TRUE');
/*line: 25*/ END;
/*line: 26*/ /*** DOMAIN INDEX ***/
/*line: 27*/ create index IDX_ITEM_NUMBER
/*line: 28*/ on test_item_number(item_number)
/*line: 29*/ indextype is ctxsys.context
/*line: 30*/ parameters ('LEXER MyLexer WORDLIST MyWordList SYNC (ON COMMIT)');
------------------------------------------------------------------
This results in the following search problem:
select * from test_item_number where contains(item_number, 'AA1%') >0; -- 0 rows
select * from test_item_number where contains(item_number, '1234%') >0; -- 0 rows
The create domain index still include the dots ('.'). Here is the created index table '$i' (including my thought what can be the reason for this):
--show index entries
select token_text from dr$idx_item_number$i;
reult:
00
001
0010
00100
001007
001007
1. --because of numjoin?
1.2
1.23
1.234
1.234.
1.234.5
1.234.5
12
123
123
125
1254
12546
125466
125466
AA --because of punctuations?
AA
AB
ABC
ABCP
ABCPD
ABCPD
To aviod dots ('.') in the index I was trying this workarounds:
------------------------------------------------------------------
/***
WORKAROUND 1 (NOT WORKING) ***/
create index IDX_ITEM_NUMBER_REPLACE
on test_item_number(replace(item_number,'.',''))
indextype is ctxsys.context
parameters ('LEXER MyLexer
WORDLIST MyWordList
SYNC (ON COMMIT)');
--error: DRG-11304: function-based indexes are not supported by this indextype.
/***
WORKAROUND 2 (NOT WORKING) ***/
BEGIN
--LEXER
ctx_ddl.set_attribute('MyLexer' , 'NUMJOIN',' ');
ctx_ddl.set_attribute('MyLexer' , 'PUNCTUATIONS',' ');
END;
drop index IDX_ITEM_NUMBER;
create index IDX_ITEM_NUMBER
on test_item_number(item_number)
indextype is ctxsys.context
parameters ('LEXER MyLexer WORDLIST MyWordList SYNC (ON COMMIT)');
--the index entries (dr$idx_item_number$i) are the same like above
------------------------------------------------------------------
Regarding this problem I have the following
questions:
(1) Are my thoughts correct why the domain index includes the dots ('.') - because of numjoin/punctuations?
(2) Is there anything else i can do in the domain index preferences to avoid indexing any dot ('.')?
(3) If there is no solution for question 2, is there another easy workaround (maybe without invisible or virtual columns)?
(4) I can also use a normal index but then the left-truncated wildcard queries will be very slow - or can I also improve this in the normal index?
THANK you very much for you thoughts!Have a great day,
Walter
I reached out to Roger Ford, PM for Oracle Text on this. He had this to say:
Workaround 2 works - setting NUMJOIN to an empty string; shouldn't be necessary to change PUNCTUATIONS in this example.
It's best if you only look at TOKEN_TYPE=0 when listing the contents of $I - saves confusion with the prefix index stuff which is TOKEN_TYPE=6.
The first workaround won't work as-is because you can't create a domain index directly on a function. However, you can use a function as part (or all) of the COLUMNS list in a MULTI_COLUMN_DATASTORE.And provided these test cases:
Case 1
create table test_item_number (item_number varchar2(50));
insert into test_item_number values ('1.234.5');
insert into test_item_number values ('1-25466');
insert into test_item_number values ('001/007');
insert into test_item_number values ('AA.123');
insert into test_item_number values ('ABC.PD');
commit;
/*** DOMAIN PREFERENCES ***/
--grant execute on ctx_ddl to hr (if necessary);
BEGIN
--LEXER
ctx_ddl.create_preference('MyLexer', 'BASIC_LEXER');
ctx_ddl.set_attribute('MyLexer' , 'SKIPJOINS','./,=?{}\()[]-;~|$!>*%_&');
ctx_ddl.set_attribute('MyLexer' , 'NUMJOIN',' ');
--WORDLIST
ctx_ddl.create_preference('MyWordList', 'BASIC_WORDLIST');
ctx_ddl.set_attribute('MyWordList','PREFIX_INDEX','TRUE');
ctx_ddl.set_attribute('MyWordList','PREFIX_MIN_LENGTH', '2');
ctx_ddl.set_attribute('MyWordList','PREFIX_MAX_LENGTH','50');
ctx_ddl.set_attribute('MyWordList','SUBSTRING_INDEX','YES');
-- ctx_ddl.set_attribute('MyWordList','REVERSE_INDEX','TRUE');
END;
/
/*** DOMAIN INDEX ***/
create index IDX_ITEM_NUMBER
on test_item_number(item_number)
indextype is ctxsys.context
parameters ('LEXER MyLexer WORDLIST MyWordList SYNC (ON COMMIT)');
--show index entries
select token_text from dr$idx_item_number$i where token_type = 0;
TOKEN_TEXT
001007
12345
125466
AA123
ABCPD
select * from test_item_number
where contains ( item_number, '1234%' ) > 0;
ITEM_NUMBER
1.234.5
select *
from test_item_number
where contains ( item_number, 'AA1%' ) > 0;
ITEM_NUMBER
AA.123
Case 2
/*** PREPARATION ***/
create table test_item_number (item_number varchar2(50));
insert into test_item_number values ('1.234.5');
insert into test_item_number values ('1-25466');
insert into test_item_number values ('001/007');
insert into test_item_number values ('AA.123');
insert into test_item_number values ('ABC.PD');
commit;
/*** DOMAIN PREFERENCES ***/
--grant execute on ctx_ddl to hr (if necessary);
BEGIN
--LEXER
ctx_ddl.create_preference('MyLexer', 'BASIC_LEXER');
ctx_ddl.set_attribute('MyLexer' , 'SKIPJOINS','./,=?{}\()[]-;~|$!>*%_&');
--WORDLIST
ctx_ddl.create_preference('MyWordList', 'BASIC_WORDLIST');
ctx_ddl.set_attribute('MyWordList','PREFIX_INDEX','TRUE');
ctx_ddl.set_attribute('MyWordList','PREFIX_MIN_LENGTH', '2');
ctx_ddl.set_attribute('MyWordList','PREFIX_MAX_LENGTH','50');
ctx_ddl.set_attribute('MyWordList','SUBSTRING_INDEX','YES');
-- only in recent versions: ctx_ddl.set_attribute('MyWordList','REVERSE_INDEX','TRUE');
ctx_ddl.create_preference('MyDatastore', 'MULTI_COLUMN_DATASTORE');
ctx_ddl.set_attribute('MyDatastore', 'COLUMNS', 'replace(ITEM_NUMBER,''.'','''')');
ctx_ddl.set_attribute('MyDatastore', 'DELIMITER', 'newline'); /* avoid adding a tag */
END;
/
/*** DOMAIN INDEX ***/
create index IDX_ITEM_NUMBER
on test_item_number(item_number)
indextype is ctxsys.context
parameters ('LEXER MyLexer WORDLIST MyWordList DATASTORE MyDatastore SYNC (ON COMMIT)');
--show index entries
select token_text from dr$idx_item_number$i where token_type = 0;
TOKEN_TEXT
001007
12345
125466
AA123
ABCPD
select * from test_item_number
where contains ( item_number, '1234%' ) > 0;
ITEM_NUMBER
1.234.5
select *
from test_item_number
where contains ( item_number, 'AA1%' ) > 0;
ITEM_NUMBER
AA.123