What query can be used to find the duplicates using a group of columns among which one is a varchar type from a million of records. The varchar type column matching must be case insensitive and space insensitive.
In the attached data all rows corresponds to the same name but the group by or aggregate treats them as different record. The grouping should be case insensitive and ignore the spaces. I need to perform the operation on a million records.
create table duplicate_filter(name varchar2(50), age integer, salary number)
;
insert into duplicate_filter(name, age, salary) values ('John Doe', 20, 100)
;
insert into duplicate_filter(name, age, salary) values ('JohnDoe', 20, 100)
;
insert into duplicate_filter(name, age, salary) values ('john Doe', 20, 100)
;
insert into duplicate_filter(name, age, salary) values ('johndoe', 20, 100)
;
insert into duplicate_filter(name, age, salary) values ('john doe', 20, 100)
;
commit
select name, age, salary , count(1) total from duplicate_filter
group by name, age ,salary
;
select name, age, salary,
count(1) over(partition by name, age, salary) total
from duplicate_filter
;
SQL> create table duplicate_filter(name varchar2(50), age integer, salary number);
Table created.
SQL>
SQL> insert into duplicate_filter(name, age, salary) values ('John Doe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('JohnDoe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('john Doe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('johndoe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('john doe', 20, 100);
1 row created.
SQL>
SQL> commit;
Commit complete.
SQL>
SQL> select name, age, salary , count(1) total from duplicate_filter
2 group by name, age ,salary;
NAME AGE SALARY TOTAL
-------------------------------------------------- ---------- ---------- ----------
johndoe 20 100 1
JohnDoe 20 100 1
john Doe 20 100 1
John Doe 20 100 1
john doe 20 100 1
SQL>
SQL> select upper(replace(name,' ')), age, salary , count(1) total from duplicate_filter
2 group by upper(replace(name,' ')), age ,salary;
UPPER(REPLACE(NAME,'')) AGE SALARY TOTAL
-------------------------------------------------- ---------- ---------- ----------
JOHNDOE 20 100 5
SQL>
SQL> select name, age, salary,
2 count(*) over(partition by upper(replace(name,' ')), age, salary) total
3 from duplicate_filter;
NAME AGE SALARY TOTAL
-------------------------------------------------- ---------- ---------- ----------
John Doe 20 100 5
JohnDoe 20 100 5
john doe 20 100 5
johndoe 20 100 5
john Doe 20 100 5
SQL>
SQL> insert into duplicate_filter(name, age, salary) values ('Jane Doe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('JaneDoe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('jane Doe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('janedoe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('jane doe', 20, 100);
1 row created.
SQL>
SQL> insert into duplicate_filter(name, age, salary) values ('Sue Doe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('SueDoe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('sue Doe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('suedoe', 20, 100);
1 row created.
SQL> insert into duplicate_filter(name, age, salary) values ('sue doe', 20, 100);
1 row created.
SQL>
SQL> insert into duplicate_filter
2 select d.* from duplicate_filter d,
3 ( select 1 from dual connect by level <= 100000);
1500000 rows created.
SQL>
SQL> commit;
Commit complete.
SQL> select count(*) from duplicate_filter;
COUNT(*)
----------
1500015
SQL>
SQL> set timing on
SQL> select upper(replace(name,' ')), age, salary , count(1) total from duplicate_filter
2 group by upper(replace(name,' ')), age ,salary;
UPPER(REPLACE(NAME,'')) AGE SALARY TOTAL
-------------------------------------------------- ---------- ---------- ----------
JOHNDOE 20 100 500005
JANEDOE 20 100 500005
SUEDOE 20 100 500005
Elapsed: 00:00:00.28
SQL>
SQL>
SQL>