Skip to Main Content
  • Questions
  • Email attachments (Word, exxcel) not being text indexed

Breadcrumb

Question and Answer

Connor McDonald

Thanks for the question, Nilesh.

Asked: January 24, 2022 - 6:11 pm UTC

Last updated: January 27, 2022 - 5:12 pm UTC

Version: 19.3

Viewed 1000+ times

You Asked

Hi

In our application we file emails from outlook i.e in the process we extract the metadata into our tables and create text index on emails including attachments for search purposes. It has been working well until 11.2.0.1 but since we have upgraded our DB to 11.2.0.4 and 19c (We have both versions running on different servers), word, excel as email attachments are not being indexed. If there is a text file in the attachment it works well. The email body is also indexed properly and returning in search results. THere has been no change apart from DB upgrades. I will try and give as much information as I can:

--Create table
CREATE TABLE "ARTIFACT_FILES"
( "ARTIFACT_ID" VARCHAR2(40 BYTE),
"ORIGINAL_PATH" VARCHAR2(500 BYTE)
);



-- Create policy and filters:
begin 
ctx_ddl.create_policy('"ARTIFACTS_AUTO_FILTER_POLICY"',
filter => 'ctxsys.auto_filter',
lexer=>'"ARTS_FILTER_POLICY_LEX"',
wordlist=>'"ARTS_FILTER_POLICY_WDL"',
stoplist=>'"KM_STOPLIST"'
);
end;
/


begin 
ctx_ddl.create_policy('"ARTIFACTS_MAIL_FILTER_POLICY"',
filter => 'artifacts_mail_filter',
lexer=>'"ARTS_FILTER_POLICY_LEX"',
wordlist=>'"ARTS_FILTER_POLICY_WDL"',
stoplist=>'"KM_STOPLIST"'
);
end;
/


BEGIN
ctx_ddl.create_preference('artifacts_filter', 'PROCEDURE_FILTER');
ctx_ddl.set_attribute('artifacts_filter', 'procedure', 'kmcs_background.custom_kmcs_filter');
ctx_ddl.set_attribute('artifacts_filter', 'input_type', 'blob');
ctx_ddl.set_attribute('artifacts_filter', 'output_type', 'clob');
ctx_ddl.set_attribute('artifacts_filter', 'rowid_parameter', 'TRUE');
ctx_ddl.set_attribute('artifacts_filter', 'charset_parameter', 'TRUE');
EXCEPTION
WHEN OTHERS THEN
NULL;
END;
/

exec CTX_DDL.CREATE_PREFERENCE('artifacts_mail_filter', 'mail_filter');
exec CTX_DDL.SET_ATTRIBUTE('artifacts_mail_filter', 'INDEX_FIELDS', 'SUBJECT:TO:FROM:CC:BCC:Content-Disposition');
exec CTX_DDL.SET_ATTRIBUTE('artifacts_mail_filter', 'PART_FIELD_STYLE', 'text');


-- create preferences

begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_DST"','FILE_DATASTORE');
end;
/


begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_FIL"','PROCEDURE_FILTER');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','PROCEDURE','KMCS.KMCS_BACKGROUND.CUSTOM_KMCS_FILTER');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','INPUT_TYPE','BLOB');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','OUTPUT_TYPE','CLOB');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','ROWID_PARAMETER','YES');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_FIL"','CHARSET_PARAMETER','YES');
end;
/

begin
ctx_ddl.create_section_group('"ARTIFACTS_TEXT_INDEX_SGP"','NULL_SECTION_GROUP');
end;
/


begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_LEX"','BASIC_LEXER');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','BASE_LETTER','YES');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','COMPOSITE','GERMAN');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','ALTERNATE_SPELLING','GERMAN');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_LEX"','NEW_GERMAN_SPELLING','YES');
end;
/

begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_WDL"','BASIC_WORDLIST');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_WDL"','STEMMER','AUTO');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_WDL"','FUZZY_MATCH','GENERIC');
end;
/

begin
ctx_ddl.create_stoplist('"ARTIFACTS_TEXT_INDEX_SPL"','BASIC_STOPLIST');
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','etc');
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','etwa');
.
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','?taient');
ctx_ddl.add_stopword('"ARTIFACTS_TEXT_INDEX_SPL"','?tait');
end;
/


begin
ctx_ddl.create_preference('"ARTIFACTS_TEXT_INDEX_STO"','BASIC_STORAGE');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','I_TABLE_CLAUSE','tablespace &tablespace storage (initial 100M next 50M)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','K_TABLE_CLAUSE','tablespace &tablespace storage (initial 10M next 10M)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','R_TABLE_CLAUSE','tablespace &tablespace storage (initial 10M) lob (data) store as (cache)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','N_TABLE_CLAUSE','tablespace &tablespace storage (initial 10M)');
ctx_ddl.set_attribute('"ARTIFACTS_TEXT_INDEX_STO"','I_INDEX_CLAUSE','tablespace &tablespace storage (initial 10M) compress 2');
end;
/


--save an email with word attachment in the location mentioned in the above insert statement

-- create a text index

create index "ARTIFACTS_TEXT_INDEX"
on "ARTIFACT_FILES"
("ORIGINAL_PATH")
indextype is ctxsys.context
parameters('
datastore "ARTIFACTS_TEXT_INDEX_DST"
filter "ARTIFACTS_TEXT_INDEX_FIL"
section group "ARTIFACTS_TEXT_INDEX_SGP"
lexer "ARTIFACTS_TEXT_INDEX_LEX"
wordlist "ARTIFACTS_TEXT_INDEX_WDL"
stoplist "ARTIFACTS_TEXT_INDEX_SPL"
storage "ARTIFACTS_TEXT_INDEX_STO"
')
/


--Insert data in the table

insert into ARTIFACT_FILES (ARTIFACT_ID,ORIGINAL_PATH)
values ('test', '&file_path_of_the_email_with_attachment');
commit;


--Procedure to sync the inserted email

procedure custom_kmcs_filter (
row_id in rowid
, charset in varchar2
, input in blob
, output in out nocopy clob
)
is
l_intermediate_output blob;
l_artifact_id varchar2(40);
l_artifact_type varchar2(30);
begin
-- get artifact id using row id
select af.artifact_id
into l_artifact_id
from artifact_files af
where rowid = row_id;

-- determine what type of artifact it is
l_artifact_type := kmcs_artifacts.get_artifact_type(l_artifact_id);

-- uncompress file
--l_intermediate_output := UTL_COMPRESS.LZ_UNCOMPRESS(input);

if (l_artifact_type = 'EMAIL') then

-- run mail filter on output
ctx_doc.policy_filter(
policy_name=> 'artifacts_mail_filter_policy',
document => input,
restab => output,
plaintext => true,
format => 'BINARY');
else

-- run auto filter on output, KMCS-1549, if input used does not work on OCRED PDF files
ctx_doc.policy_filter(
policy_name=> 'artifacts_auto_filter_policy',
document => l_intermediate_output,
restab => output,
plaintext => false,
format => 'BINARY');

end if;

exception
when others then
kmcs_logging.error('custom_kmcs_filter error - ' || sqlerrm || ' ' || dbms_utility.format_error_backtrace());
raise;
end custom_kmcs_filter;


--job to index every 30 minutes
exec ctx_ddl.sync_index('artifacts_text_index', '20M');

Running search in the application yields no result. Also ran the following but no joy

SELECT artifact_id from artifact_files WHERE CONTAINS(original_path, 'flabbergastic', 1) > 0;


I am running out ideas and any pointer would be greatly appreciated.

If I am missing any information please let me know.

Thanks in advance
Nil

and Chris said...

There was a bug related to this - it's now fixed so ensure you're patched up to the latest version.

If you need further details let us know.

Rating

  (2 ratings)

Is this answer out of date? If it is, please let us know via a Comment

Comments

Thank you

Nilesh, January 27, 2022 - 9:17 pm UTC

Hi Chris

Thank for the response, I did contact oracle support but had no luck.

If you are aware of the bug and the patch, please let me know.

Many thanks

Couldn't find the patch

Nilesh, March 08, 2022 - 6:18 pm UTC

Hi Chris

Can you please provide me link to the patch as I am not able to find on my oracle support?

Many thanks
N

More to Explore

CLOB/BLOB/etc

Complete documentation on Securefiles and Large Objects here