I have some code below, which should generate a random number of rows (10,25) for each employee_id then randomly assigns a location_id for each row. All of that appears to work fine with a caveat where some random numbers in the range aren't being represented.
I wrapped this code into PLSQL loop, for each iteration I am inserting the number of rows associated with each employee_id into the occurrences table so I can manipulate the number of rows being generated.
As you can see the distribution of values is VERY high for counts 17,18, somewhat normal for counts 15,16,19,20, a low or no concentration for the remaining counts 10,11,12,13,14,21,22,23,25,25.
I have run this several times with larger loops, increased employees and locations rows, added a TIMESTAMP to seed the random number generator and every time the largest concentration of numbers appears to be in the middle of the range.
Having thought it maybe an issue with DBMS_RANDOM I put together a second test (see below), which only calls DBMS_RANDOM and stores the results, which looked distributed for each value.
Since DBMS_RANDOM appears to work fine I am suspect the issue must be with the SQL.
Can you explain why the SQL is not creating a distributed amount of values for each count and suggest an alternative to get a better distribution?
I am running this on livesql Oracle Database 19c Enterprise Edition Release 19.0.0.0.0 - Production
Version 19.5.0.0.0
ALTER SESSION SET NLS_DATE_FORMAT = 'MMDDYYYY HH24:MI:SS';
CREATE TABLE employees AS
SELECT level AS employee_id,
'Employee ' || level AS emp_name
FROM dual
CONNECT BY level <= 50;
CREATE TABLE locations AS
SELECT level AS location_id,
'Location ' || level AS location_name
FROM dual
CONNECT BY level <= 25;
ALTER TABLE locations
ADD ( CONSTRAINT locations_pk
PRIMARY KEY (location_id));
CREATE TABLE data(
employee_id NUMBER(6),
location_id NUMBER(4),
access_date date
);
CREATE TABLE occurrences(
cnt NUMBER
);
DECLARE
l_seed VARCHAR2(100);
BEGIN
-- 50 employees x 10 loops creates 500 occurrences rows.
FOR l_counter IN 1..10
LOOP
l_seed := TO_CHAR(SYSTIMESTAMP,'YYYYDDMMHH24MISSFFFF');
DBMS_RANDOM.seed (val => TO_CHAR(SYSTIMESTAMP,'YYYYDDMMHH24MISSFFFF'));
EXECUTE IMMEDIATE 'truncate table data';
INSERT into data(
employee_id,
location_id,
access_date
)
with rws as (
select level rn
from dual
connect by level <= 25 ),
data as (
select e.*,
round (dbms_random.value(10,25)) as n
from employees e )
select
d.employee_id,
l.location_id,
trunc (sysdate) + dbms_random.value (0, 5) AS random_date
from rws r
join data d on r.rn <= d.n
cross apply (
select location_id
from locations
where d.employee_id*0 = location_id*0
order by dbms_random.value( )
fetch first 1 row only
) l
order by d.employee_id;
INSERT INTO occurrences (
cnt
)
SELECT count(*) FROM data
group by employee_id;
END LOOP;
END;
SELECT distinct (cnt), count(*) num_occurrences
FROM occurrences
group by cnt
ORDER BY cnt;
-- without seed.
CNT NUM_OCCURRENCES
12 1
13 1
14 14
15 44
16 79
17 128
18 109
19 78
20 35
21 5
22 5
-- with TIMESTAMP seed. Similar results
CNT NUM_OCCURRENCES
13 1
14 9
15 27
16 84
17 115
18 133
19 80
20 36
21 14
22 1
-- second test only call DBMS_RANDOM with seed.
DECLARE
l_seed VARCHAR2(100);
v_cnt occurrences.cnt%type;
BEGIN
l_seed := TO_CHAR(SYSTIMESTAMP,'YYYYDDMMHH24MISSFFFF');
EXECUTE IMMEDIATE 'truncate table occurrences';
FOR l_counter IN 1..500
LOOP
SELECT round (dbms_random.value(10,25)) into v_cnt from dual;
INSERT INTO occurrences (
cnt
) VALUES (v_cnt);
END LOOP;
END;
SELECT distinct (cnt), count(*) num_occurrences
FROM occurrences
group by cnt
ORDER BY cnt;
-- values appear to look distributed with a value for each count.
CNT NUM_OCCURRENCES
10 18
11 41
12 35
13 36
14 39
15 30
16 24
17 27
18 41
19 38
20 29
21 33
22 29
23 28
24 31
25 21
OK, I approached from a view of a single SQL. I've set the seed to 0 every call for reproduciblity
--
-- get the locs required per emp
--
SQL> with emp_dist as
2 ( select e.*, trunc(dbms_random.value(10,25.9999)) num_recs
3 from employees e
4 )
5 select * from emp_dist;
EMPLOYEE_ID EMP_NAME NUM_RECS
----------- ------------------------------------------------- ----------
1 Employee 1 11
2 Employee 2 23
3 Employee 3 13
4 Employee 4 13
5 Employee 5 15
6 Employee 6 11
7 Employee 7 17
8 Employee 8 25
9 Employee 9 25
10 Employee 10 23
...
43 Employee 43 15
44 Employee 44 15
45 Employee 45 22
46 Employee 46 11
47 Employee 47 20
48 Employee 48 15
49 Employee 49 11
50 Employee 50 18
50 rows selected.
--
-- disttribution seems ok
--
SQL> exec dbms_random.seed(0)
PL/SQL procedure successfully completed.
SQL> with emp_dist as
2 ( select e.*, trunc(dbms_random.value(10,25.9999)) num_recs
3 from employees e
4 )
5 select num_recs, count(*)
6 from emp_dist
7 group by num_recs
8 order by 1;
NUM_RECS COUNT(*)
---------- ----------
10 3
11 7
12 5
13 3
14 2
15 5
16 3
17 4
18 1
19 1
20 1
21 2
22 3
23 5
24 1
25 4
16 rows selected.
--
-- the sum of num_recs tell me I'll 800-ish random locations. Assuming we had a random list of those locations, the I can use num_recs to work out upper/lower bounds to tell me which rows from that set I would choose for each employee
--
SQL> with emp_dist as
2 ( select x.*,
3 sum(num_locs) over ( order by employee_id ) - num_locs + 1 lower_bound,
4 sum(num_locs) over ( order by employee_id ) upper_bound
5 from (
6 select e.*,
7 trunc(dbms_random.value(10,25.9999)) num_locs
8 from employees e
9 ) x
10 )
11 select * from emp_dist;
EMPLOYEE_ID EMP_NAME NUM_LOCS LOWER_BOUND UPPER_BOUND
----------- ------------------------------------------------- ---------- ----------- -----------
1 Employee 1 11 1 11
2 Employee 2 23 12 34
3 Employee 3 13 35 47
4 Employee 4 13 48 60
5 Employee 5 15 61 75
...
...
48 Employee 48 15 786 800
49 Employee 49 11 801 811
50 Employee 50 18 812 829
50 rows selected.
(eg, from a list of 800 random locations, employee 2 would be given entries 12 to 34, to yield 23 random locs)
--
-- so now I actually *need* a list of random locations
--
SQL> with emp_dist as
2 ( select x.*,
3 sum(num_locs) over ( order by employee_id ) - num_locs + 1 lower_bound,
4 sum(num_locs) over ( order by employee_id ) upper_bound
5 from (
6 select e.*,
7 trunc(dbms_random.value(10,25.9999)) num_locs
8 from employees e
9 ) x
10 ),
11 locs_needed as
12 ( select location_id,
13 row_number() over ( order by dbms_random.value ) as seq
14 from locations,
15 ( select 1 from dual
16 connect by level < ( select max(upper_bound) from emp_dist )/(select count(*) from locations )+1
17 )
18 )
19 select * from locs_needed;
LOCATION_ID SEQ
----------- ----------
12 1
11 2
18 3
11 4
5 5
7 6
15 7
6 8
24 9
15 10
10 11
10 12
25 13
18 14
...
...
...
4 845
13 846
18 847
19 848
20 849
12 850
850 rows selected.
--
-- and then finally bring it all together
--
SQL> exec dbms_random.seed(0)
PL/SQL procedure successfully completed.
SQL> with emp_dist as
2 ( select x.*,
3 sum(num_locs) over ( order by employee_id ) - num_locs + 1 lower_bound,
4 sum(num_locs) over ( order by employee_id ) upper_bound
5 from (
6 select e.*,
7 trunc(dbms_random.value(10,25.9999)) num_locs
8 from employees e
9 ) x
10 ),
11 locs_needed as
12 ( select location_id,
13 row_number() over ( order by dbms_random.value ) as seq
14 from locations,
15 ( select 1 from dual
16 connect by level < ( select max(upper_bound) from emp_dist )/(select count(*) from locations )+1
17 )
18 )
19 select emp.employee_id,
20 lo.location_id
21 from locs_needed lo,
22 emp_dist emp
23 where lo.seq between emp.lower_bound and emp.upper_bound
24 order by 1,2;
EMPLOYEE_ID LOCATION_ID
----------- -----------
1 5
1 6
1 7
1 10
1 11
1 11
1 12
1 15
1 15
1 18
1 24
2 1
2 2
2 4
2 4
2 4
2 5
2 5
2 6
2 10
2 10
...
...
...
50 5
50 7
50 7
50 11
50 13
50 18
50 19
50 20
50 22
50 22
50 23
50 25
829 rows selected.