I have a table of employees which contains about 25 columns. Right now there are a lot of duplicates and I would like to try and get rid of some of these duplicates.
我有一张员工表,其中包含大约25列。现在有很多重复,我想尝试摆脱一些重复。
First, I want to find the duplicates by looking for multiple records that have the same values in first name, last name, employee number, company number and status.
首先,我想通过查找在名字,姓氏,员工编号,公司编号和状态中具有相同值的多个记录来查找重复项。
SELECT
firstname,lastname,employeenumber, companynumber, statusflag
FROM
employeemaster
GROUP BY
firstname,lastname,employeenumber,companynumber, statusflag
HAVING
(COUNT(*) > 1)
This gives me duplicates but my goal is to find and keep the best single record and delete the other records. The "best single record" is defined by the record with the least amount of NULL values in all of the other columns. How can I do this?
这给了我重复,但我的目标是找到并保留最好的单个记录并删除其他记录。 “最佳单条记录”由所有其他列中具有最少NULL值的记录定义。我怎样才能做到这一点?
I am using Microsoft SQL Server 2012 MGMT Studio.
我正在使用Microsoft SQL Server 2012 MGMT Studio。
EXAMPLE:
Red: DELETE Green: KEEP
红色:删除绿色:保持
NOTE: There are a lot more columns in the table than what this table shows.
注意:表中列的列数多于此表所示的列数。
3 个解决方案
#1
2
You can use the sys.columns table to get a list of columns and build a dynamic query. This query will return a 'KeepThese' value for every record you want to keep based on your given criteria.
您可以使用sys.columns表获取列列表并构建动态查询。此查询将根据您给定的条件为您要保留的每条记录返回“KeepThese”值。
-- insert test data
create table EmployeeMaster
(
Record int identity(1,1),
FirstName varchar(50),
LastName varchar(50),
EmployeeNumber int,
CompanyNumber int,
StatusFlag int,
UserName varchar(50),
Branch varchar(50)
);
insert into EmployeeMaster
(
FirstName,
LastName,
EmployeeNumber,
CompanyNumber,
StatusFlag,
UserName,
Branch
)
values
('Jake','Jones',1234,1,1,'JJONES','PHX'),
('Jake','Jones',1234,1,1,NULL,'PHX'),
('Jake','Jones',1234,1,1,NULL,NULL),
('Jane','Jones',5678,1,1,'JJONES2',NULL);
-- get records with most non-null values with dynamic sys.column query
declare @sql varchar(max)
select @sql = '
select e.*,
row_number() over(partition by
e.FirstName,
e.LastName,
e.EmployeeNumber,
e.CompanyNumber,
e.StatusFlag
order by n.NonNullCnt desc) as KeepThese
from EmployeeMaster e
cross apply (select count(n.value) as NonNullCnt from (select ' +
replace((
select 'cast(' + c.name + ' as varchar(50)) as value union all select '
from sys.columns c
where c.object_id = t.object_id
for xml path('')
) + '#',' union all select #','') + ')n)n'
from sys.tables t
where t.name = 'EmployeeMaster'
exec(@sql)
#2
1
Try this.
;WITH cte
AS (SELECT Row_number()
OVER(
partition BY firstname, lastname, employeenumber, companynumber, statusflag
ORDER BY (SELECT NULL)) rn,
firstname,
lastname,
employeenumber,
companynumber,
statusflag,
username,
branch
FROM employeemaster),
cte1
AS (SELECT a.firstname,
a.lastname,
a.employeenumber,
a.companynumber,
a.statusflag,
Row_number()
OVER(
partition BY a.firstname, a.lastname, a.employeenumber, a.companynumber, a.statusflag
ORDER BY (CASE WHEN a.username IS NULL THEN 1 ELSE 0 END +CASE WHEN a.branch IS NULL THEN 1 ELSE 0 END) )rn
-- add the remaining columns in case statement
FROM cte a
JOIN employeemaster b
ON a.firstname = b.firstname
AND a.lastname = b.lastname
AND a.employeenumber = b.employeenumber
AND a.companynumbe = b.companynumber
AND a.statusflag = b.statusflag)
SELECT *
FROM cte1
WHERE rn = 1
#3
1
I test with MySQL and use NULL String concat to found the best record. Because LENGTH ( NULL || 'data') is 0. Only if all column not NULL some length exists. Maybe this is not perfekt.
我用MySQL测试并使用NULL String concat来找到最佳记录。因为LENGTH(NULL ||'data')是0.只有当所有列都不为NULL时才存在一些长度。也许这不是完美的。
create table EmployeeMaster
(
Record int auto_increment,
FirstName varchar(50),
LastName varchar(50),
EmployeeNumber int,
CompanyNumber int,
StatusFlag int,
UserName varchar(50),
Branch varchar(50),
PRIMARY KEY(record)
);
INSERT INTO EmployeeMaster
(
FirstName, LastName, EmployeeNumber, CompanyNumber, StatusFlag, UserName, Branch
) VALUES ('Jake', 'Jones', 1234, 1, 1, 'JJONES', 'PHX'), ('Jake', 'Jones', 1234, 1, 1, NULL, 'PHX'), ('Jake', 'Jones', 1234, 1, 1, NULL, NULL), ('Jane', 'Jones', 5678, 1, 1, 'JJONES2', NULL);
My query idea looks like this
我的查询想法是这样的
SELECT e.*
FROM employeemaster e
JOIN ( SELECT firstname,
lastname,
employeenumber,
companynumber,
statusflag,
MAX( LENGTH ( username || branch ) ) data_quality
FROM employeemaster
GROUP BY firstname, lastname, employeenumber, companynumber, statusflag
HAVING count(*) > 1
) g
ON LENGTH ( username || branch ) = g.data_quality
#1
2
You can use the sys.columns table to get a list of columns and build a dynamic query. This query will return a 'KeepThese' value for every record you want to keep based on your given criteria.
您可以使用sys.columns表获取列列表并构建动态查询。此查询将根据您给定的条件为您要保留的每条记录返回“KeepThese”值。
-- insert test data
create table EmployeeMaster
(
Record int identity(1,1),
FirstName varchar(50),
LastName varchar(50),
EmployeeNumber int,
CompanyNumber int,
StatusFlag int,
UserName varchar(50),
Branch varchar(50)
);
insert into EmployeeMaster
(
FirstName,
LastName,
EmployeeNumber,
CompanyNumber,
StatusFlag,
UserName,
Branch
)
values
('Jake','Jones',1234,1,1,'JJONES','PHX'),
('Jake','Jones',1234,1,1,NULL,'PHX'),
('Jake','Jones',1234,1,1,NULL,NULL),
('Jane','Jones',5678,1,1,'JJONES2',NULL);
-- get records with most non-null values with dynamic sys.column query
declare @sql varchar(max)
select @sql = '
select e.*,
row_number() over(partition by
e.FirstName,
e.LastName,
e.EmployeeNumber,
e.CompanyNumber,
e.StatusFlag
order by n.NonNullCnt desc) as KeepThese
from EmployeeMaster e
cross apply (select count(n.value) as NonNullCnt from (select ' +
replace((
select 'cast(' + c.name + ' as varchar(50)) as value union all select '
from sys.columns c
where c.object_id = t.object_id
for xml path('')
) + '#',' union all select #','') + ')n)n'
from sys.tables t
where t.name = 'EmployeeMaster'
exec(@sql)
#2
1
Try this.
;WITH cte
AS (SELECT Row_number()
OVER(
partition BY firstname, lastname, employeenumber, companynumber, statusflag
ORDER BY (SELECT NULL)) rn,
firstname,
lastname,
employeenumber,
companynumber,
statusflag,
username,
branch
FROM employeemaster),
cte1
AS (SELECT a.firstname,
a.lastname,
a.employeenumber,
a.companynumber,
a.statusflag,
Row_number()
OVER(
partition BY a.firstname, a.lastname, a.employeenumber, a.companynumber, a.statusflag
ORDER BY (CASE WHEN a.username IS NULL THEN 1 ELSE 0 END +CASE WHEN a.branch IS NULL THEN 1 ELSE 0 END) )rn
-- add the remaining columns in case statement
FROM cte a
JOIN employeemaster b
ON a.firstname = b.firstname
AND a.lastname = b.lastname
AND a.employeenumber = b.employeenumber
AND a.companynumbe = b.companynumber
AND a.statusflag = b.statusflag)
SELECT *
FROM cte1
WHERE rn = 1
#3
1
I test with MySQL and use NULL String concat to found the best record. Because LENGTH ( NULL || 'data') is 0. Only if all column not NULL some length exists. Maybe this is not perfekt.
我用MySQL测试并使用NULL String concat来找到最佳记录。因为LENGTH(NULL ||'data')是0.只有当所有列都不为NULL时才存在一些长度。也许这不是完美的。
create table EmployeeMaster
(
Record int auto_increment,
FirstName varchar(50),
LastName varchar(50),
EmployeeNumber int,
CompanyNumber int,
StatusFlag int,
UserName varchar(50),
Branch varchar(50),
PRIMARY KEY(record)
);
INSERT INTO EmployeeMaster
(
FirstName, LastName, EmployeeNumber, CompanyNumber, StatusFlag, UserName, Branch
) VALUES ('Jake', 'Jones', 1234, 1, 1, 'JJONES', 'PHX'), ('Jake', 'Jones', 1234, 1, 1, NULL, 'PHX'), ('Jake', 'Jones', 1234, 1, 1, NULL, NULL), ('Jane', 'Jones', 5678, 1, 1, 'JJONES2', NULL);
My query idea looks like this
我的查询想法是这样的
SELECT e.*
FROM employeemaster e
JOIN ( SELECT firstname,
lastname,
employeenumber,
companynumber,
statusflag,
MAX( LENGTH ( username || branch ) ) data_quality
FROM employeemaster
GROUP BY firstname, lastname, employeenumber, companynumber, statusflag
HAVING count(*) > 1
) g
ON LENGTH ( username || branch ) = g.data_quality