I have some PL/SQL code that takes two collections of numbers and unions them together, putting the result into a third collection. Why is there such a performance difference between casting the collections to tables and performing the union in SQL as opposed to using PL/SQL's MULTISET UNION operator? Doing the union in SQL is 100s of times faster.
CREATE OR REPLACE TYPE number_tab IS TABLE OF NUMBER;
/
DECLARE
lv_set_size NUMBER := 10000;
la_set1 number_tab;
la_set2 number_tab;
la_set3 number_tab;
lv_start TIMESTAMP WITH LOCAL TIME ZONE;
lv_end TIMESTAMP WITH LOCAL TIME ZONE;
-- get the time between 2 timestamps, in seconds
FUNCTION get_elapsed (
pv_start_i TIMESTAMP WITH LOCAL TIME ZONE,
pv_end_i TIMESTAMP WITH LOCAL TIME ZONE
) RETURN NUMBER
IS
lv_elapsed NUMBER;
BEGIN
SELECT EXTRACT(SECOND FROM pv_end_i - pv_start_i) +
(EXTRACT(MINUTE FROM pv_end_i - pv_start_i) * 60) +
(EXTRACT(HOUR FROM pv_end_i - pv_start_i) * 60 * 60) +
(EXTRACT(DAY FROM pv_end_i - pv_start_i) * 60 * 60 * 24)
INTO lv_elapsed
FROM dual;
RETURN lv_elapsed;
END;
BEGIN
-- populate 2 collections
SELECT ROWNUM
BULK COLLECT INTO la_set1
FROM dual
CONNECT BY ROWNUM <= lv_set_size;
SELECT ROWNUM
BULK COLLECT INTO la_set2
FROM dual
CONNECT BY ROWNUM <= lv_set_size;
-- perform union using multiset union
lv_start := systimestamp;
la_set3 := la_set1 MULTISET UNION DISTINCT la_set2;
lv_end := systimestamp;
dbms_output.put_line('elapsed multiset union: ' || get_elapsed(lv_start, lv_end));
-- perform union using sql
lv_start := systimestamp;
SELECT column_value
BULK COLLECT INTO la_set3
FROM TABLE(la_set1)
UNION
SELECT column_value
FROM TABLE(la_set2);
lv_end := systimestamp;
dbms_output.put_line('elapsed sql union: ' || get_elapsed(lv_start, lv_end));
END;
/
it seems to be mostly affected by the number of distinct elements - the plsql operator does not scale very well, consider this modified test:
ops$tkyte%ORA11GR2> CREATE OR REPLACE TYPE number_tab IS TABLE OF NUMBER;
2 /
Type created.
ops$tkyte%ORA11GR2>
ops$tkyte%ORA11GR2> drop table t;
Table dropped.
ops$tkyte%ORA11GR2>
ops$tkyte%ORA11GR2> create table t
2 ( set_size number,
3 num_dist number,
4 plsql_hsecs number,
5 plsql_cpu number,
6 sql_hsecs number,
7 sql_cpu number ,
8 primary key(set_size,num_dist)
9 );
Table created.
ops$tkyte%ORA11GR2>
ops$tkyte%ORA11GR2> create or replace procedure test_union( lv_set_size in number, p_num_dist in number )
2 as
3 la_set1 number_tab;
4 la_set2 number_tab;
5 la_set3 number_tab;
6 la_set4 number_tab;
7 lv_start_hsecs number;
8 lv_end_hsecs number;
9 lv_start_cpu number;
10 lv_end_cpu number;
11 BEGIN
12 -- populate 2 collections
13 SELECT mod(ROWNUM,p_num_dist)
14 BULK COLLECT INTO la_set1
15 FROM dual
16 CONNECT BY ROWNUM <= lv_set_size;
17
18 SELECT mod(ROWNUM,p_num_dist)
19 BULK COLLECT INTO la_set2
20 FROM dual
21 CONNECT BY ROWNUM <= lv_set_size;
22
23 -- perform union using multiset union
24 lv_start_hsecs := dbms_utility.get_time;
25 lv_start_cpu := dbms_utility.get_cpu_time;
26 la_set3 := la_set1 MULTISET UNION DISTINCT la_set2;
27 lv_end_hsecs := dbms_utility.get_time;
28 lv_end_cpu := dbms_utility.get_cpu_time;
29
30 insert into t
31 ( set_size , num_dist, plsql_hsecs , plsql_cpu , sql_hsecs , sql_cpu )
32 values
33 ( lv_set_size, p_num_dist, lv_end_hsecs-lv_start_hsecs, lv_end_cpu-lv_start_cpu, null, null );
34
35 -- perform union using sql
36 lv_start_hsecs := dbms_utility.get_time;
37 lv_start_cpu := dbms_utility.get_cpu_time;
38
39 SELECT column_value
40 BULK COLLECT INTO la_set4
41 FROM TABLE(la_set1)
42 UNION
43 SELECT column_value
44 FROM TABLE(la_set2);
45
46 lv_end_hsecs := dbms_utility.get_time;
47 lv_end_cpu := dbms_utility.get_cpu_time;
48
49 update t set sql_hsecs = lv_end_hsecs-lv_start_hsecs, sql_cpu = lv_end_cpu-lv_start_cpu
50 where set_size = lv_set_size and num_dist = p_num_dist;
51 END;
52 /
Procedure created.
ops$tkyte%ORA11GR2> declare
2 l_sizes number_tab := number_tab( 100, 500, 1000, 2500, 5000, 7500, 10000 );
3 l_num_dist number_tab := number_tab( 100, 250, 500, 1000, 10000 );
4 begin
5 for i in 1 .. l_sizes.count
6 loop
7 for j in 1 .. l_num_dist.count
8 loop
9 test_union(l_sizes(i), l_num_dist(j) );
10 end loop;
11 end loop;
12 end;
13 /
PL/SQL procedure successfully completed.
ops$tkyte%ORA11GR2> select * from t order by set_size, num_dist;
SET_SIZE NUM_DIST PLSQL_HSECS PLSQL_CPU SQL_HSECS SQL_CPU
---------- ---------- ----------- ---------- ---------- ----------
100 100 0 0 0 0
100 250 0 0 0 0
100 500 0 0 0 0
100 1000 0 0 0 0
100 10000 0 0 0 0
500 100 0 1 0 0
500 250 1 1 0 0
500 500 1 1 0 0
500 1000 2 1 0 0
500 10000 2 2 0 0
1000 100 1 1 0 0
1000 250 2 1 0 0
1000 500 3 2 1 0
1000 1000 6 5 0 0
1000 10000 6 5 1 0
2500 100 2 2 1 0
2500 250 3 3 1 1
2500 500 7 6 1 1
2500 1000 13 11 1 1
2500 10000 36 32 0 1
5000 100 3 3 1 1
5000 250 8 7 1 1
5000 500 15 15 1 1
5000 1000 30 28 1 1
5000 10000 147 142 1 2
7500 100 5 5 1 1
7500 250 12 12 2 1
7500 500 24 24 2 1
7500 1000 46 45 2 1
7500 10000 349 343 2 2
10000 100 8 8 2 1
10000 250 18 18 2 1
10000 500 34 33 2 2
10000 1000 67 68 2 2
10000 10000 640 620 2 3
35 rows selected.
I have to be honest, I've never used this language feature myself - I don't often (never yet actually) find the need to union two array's together in PLSQL.
If you look - as long as the arrays are smallish - in the low 100's - or as long as the arrays have small numbers of distinct values - again, low 100's - the PLSQL operator and SQL are fairly equivalent.
When it comes to larger sets, the sql operator - with the database at its beck and call - is definitely far superior.
Reason 312312543 to "if you can do it in sql, do it in sql" I guess :)
but I will point this out to the powers that be and see if it should be looked at. Thanks!