Hi
In our application we file emails from outlook i.e in the process we extract the metadata into our tables and create text index on emails including attachments for search purposes. It has been working well until 11.2.0.1 but since we have upgraded our DB to 11.2.0.4 and 19c (We have both versions running on different servers), word, excel as email attachments are not being indexed. If there is a text file in the attachment it works well. The email body is also indexed properly and returning in search results. THere has been no change apart from DB upgrades. I will try and give as much information as I can:
--Create table
CREATE TABLE "ARTIFACT_FILES"
( "ARTIFACT_ID" VARCHAR2(40 BYTE),
"ORIGINAL_PATH" VARCHAR2(500 BYTE)
);
-- Create policy and filters:
begin
ctx_ddl.create_policy('"ARTIFACTS_AUTO_FILTER_POLICY"',
filter => 'ctxsys.auto_filter',
lexer=>'"ARTS_FILTER_POLICY_LEX"',
wordlist=>'"ARTS_FILTER_POLICY_WDL"',
stoplist=>'"KM_STOPLIST"'
);
end;
/
begin
ctx_ddl.create_policy('"ARTIFACTS_MAIL_FILTER_POLICY"',
filter => 'artifacts_mail_filter',
lexer=>'"ARTS_FILTER_POLICY_LEX"',
wordlist=>'"ARTS_FILTER_POLICY_WDL"',
stoplist=>'"KM_STOPLIST"'
);
end;
/
BEGIN
ctx_ddl.create_preference('artifacts_filter', 'PROCEDURE_FILTER');
ctx_ddl.set_attribute('artifacts_filter', 'procedure', 'kmcs_background.custom_kmcs_filter');
ctx_ddl.set_attribute('artifacts_filter', 'input_type', 'blob');
ctx_ddl.set_attribute('artifacts_filter', 'output_type', 'clob');
ctx_ddl.set_attribute('artifacts_filter', 'rowid_parameter', 'TRUE');
ctx_ddl.set_attribute('artifacts_filter', 'charset_parameter', 'TRUE');
EXCEPTION
WHEN OTHERS THEN
NULL;
END;
/
exec CTX_DDL.CREATE_PREFERENCE('artifacts_mail_filter', 'mail_filter');
exec CTX_DDL.SET_ATTRIBUTE('artifacts_mail_filter', 'INDEX_FIELDS', 'SUBJECT:TO:FROM:CC:BCC:Content-Disposition');
exec CTX_DDL.SET_ATTRIBUTE('artifacts_mail_filter', 'PART_FIELD_STYLE', 'text');
-- create preferences
begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_DST"','FILE_DATASTORE');
end;
/
begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_FIL"','PROCEDURE_FILTER');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','PROCEDURE','KMCS.KMCS_BACKGROUND.CUSTOM_KMCS_FILTER');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','INPUT_TYPE','BLOB');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','OUTPUT_TYPE','CLOB');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','ROWID_PARAMETER','YES');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','CHARSET_PARAMETER','YES');
end;
/
begin
ctx_ddl.create_section_group('"ARTIFACTS_TEXT_INDEX_SGP"','NULL_SECTION_GROUP');
end;
/
begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_LEX"','BASIC_LEXER');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','BASE_LETTER','YES');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','COMPOSITE','GERMAN');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','ALTERNATE_SPELLING','GERMAN');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','NEW_GERMAN_SPELLING','YES');
end;
/
begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_WDL"','BASIC_WORDLIST');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_WDL"','STEMMER','AUTO');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_WDL"','FUZZY_MATCH','GENERIC');
end;
/
begin
ctx_ddl.create_stoplist('"ARTIFACTS_TEXT_INDEX_SPL"','BASIC_STOPLIST');
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','etc');
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','etwa');
.
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','?taient');
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','?tait');
end;
/
begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_STO"','BASIC_STORAGE');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','I_TABLE_CLAUSE','tablespace &tablespace storage (initial 100M next 50M)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','K_TABLE_CLAUSE','tablespace &tablespace storage (initial 10M next 10M)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','R_TABLE_CLAUSE','tablespace &tablespace storage (initial 10M) lob (data) store as (cache)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','N_TABLE_CLAUSE','tablespace &tablespace storage (initial 10M)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','I_INDEX_CLAUSE','tablespace &tablespace storage (initial 10M) compress 2');
end;
/
--save an email with word attachment in the location mentioned in the above insert statement
-- create a text index
create index "ARTIFACTS_TEXT_INDEX"
on "ARTIFACT_FILES"
("ORIGINAL_PATH")
indextype is ctxsys.context
parameters('
datastore "ARTIFACTS_TEXT_INDEX_DST"
filter "ARTIFACTS_TEXT_INDEX_FIL"
section group "ARTIFACTS_TEXT_INDEX_SGP"
lexer "ARTIFACTS_TEXT_INDEX_LEX"
wordlist "ARTIFACTS_TEXT_INDEX_WDL"
stoplist "ARTIFACTS_TEXT_INDEX_SPL"
storage "ARTIFACTS_TEXT_INDEX_STO"
')
/
--Insert data in the table
insert into ARTIFACT_FILES (ARTIFACT_ID,ORIGINAL_PATH)
values ('test', '&file_path_of_the_email_with_attachment');
commit;
--Procedure to sync the inserted email
procedure custom_kmcs_filter (
row_id in rowid
, charset in varchar2
, input in blob
, output in out nocopy clob
)
is
l_intermediate_output blob;
l_artifact_id varchar2(40);
l_artifact_type varchar2(30);
begin
-- get artifact id using row id
select af.artifact_id
into l_artifact_id
from artifact_files af
where rowid = row_id;
-- determine what type of artifact it is
l_artifact_type := kmcs_artifacts.get_artifact_type(l_artifact_id);
-- uncompress file
--l_intermediate_output := UTL_COMPRESS.LZ_UNCOMPRESS(input);
if (l_artifact_type = 'EMAIL') then
-- run mail filter on output
ctx_doc.policy_filter(
policy_name=> 'artifacts_mail_filter_policy',
document => input,
restab => output,
plaintext => true,
format => 'BINARY');
else
-- run auto filter on output, KMCS-1549, if input used does not work on OCRED PDF files
ctx_doc.policy_filter(
policy_name=> 'artifacts_auto_filter_policy',
document => l_intermediate_output,
restab => output,
plaintext => false,
format => 'BINARY');
end if;
exception
when others then
kmcs_logging.error('custom_kmcs_filter error - ' || sqlerrm || ' ' || dbms_utility.format_error_backtrace());
raise;
end custom_kmcs_filter;
--job to index every 30 minutes
exec ctx_ddl.sync_index('artifacts_text_index', '20M');
Running search in the application yields no result. Also ran the following but no joy
SELECT artifact_id from artifact_files WHERE CONTAINS(original_path, 'flabbergastic', 1) > 0;
I am running out ideas and any pointer would be greatly appreciated.
If I am missing any information please let me know.
Thanks in advance
Nil
There was a bug related to this - it's now fixed so ensure you're patched up to the latest version.
If you need further details let us know.