MySQL 数据库 400 万条数据如何去重

使用select count查看一下数据库数据量

mysql> select count(*) from zyads_integral ;
+----------+
| count(*) |
+----------+
|  4130473 |
+----------+
1 row in set (0.01 sec)

使用 desc 查看一下数据表结构

mysql> desc zyads_integral;
+-------+---------+------+-----+---------+----------------+
| Field | Type    | Null | Key | Default | Extra          |
+-------+---------+------+-----+---------+----------------+
| id    | int(11) | NO   | PRI | NULL    | auto_increment |
| hash  | text    | YES  |     | NULL    |                |
| sha1  | text    | NO   |     | NULL    |                |
| name  | text    | NO   |     | NULL    |                |
| index | text    | YES  |     | NULL    |                |
| size  | text    | YES  |     | NULL    |                |
+-------+---------+------+-----+---------+----------------+
6 rows in set (0.01 sec)

样例数据

mysql> select * from zyads_integral limit 1\G
*************************** 1. row ***************************
   id: 6721212
 hash: 0FA565EEFA9E688B1F87640815EE090C7326725D
 sha1: 8c907b045bb7905cf2a63f0b1208eeb3bca857d6
 name: 【无效链接】xxxxxx.html
index: 107
 size: 78110108
1 row in set (0.01 sec)

接下来开始去掉重复数据

mysql> select id, sha1, count(*) from zyads_integral group by sha1 limit 10;
+---------+------------------------------------------+----------+
| id      | sha1                                     | count(*) |
+---------+------------------------------------------+----------+
|    7696 |                                          |        1 |
| 5137851 | 0000000000000000000000000000000005325911 |        2 |
| 5363699 | 00000000000000000000000000000000097ecf88 |        5 |
| 4826139 | 000000000000000000000000000000000fd81983 |        1 |
| 6250586 | 000000000000000000000000000000001b41f909 |        1 |
| 5597063 | 000000000000000000000000000000001d385b7c |        2 |
| 5281295 | 000000000000000000000000000000002a91e078 |        2 |
| 6331972 | 000000000000000000000000000000003488380d |        2 |
| 4774906 | 00000000000000000000000000000000397db43d |        1 |
| 4550736 | 00000000000000000000000000000000494ec71f |        1 |
+---------+------------------------------------------+----------+
10 rows in set (24.71 sec)

mysql> select count(*) from zyads_integral where sha1= '0000000000000000000000000000000005325911';
+----------+
| count(*) |
+----------+
|        2 |
+----------+
1 row in set (1.03 sec)

mysql> select id, count(*) from zyads_integral group by sha1 having count(*) > 1;
+---------+----------+
| id      | count(*) |
+---------+----------+
| 5137851 |        2 |
| 5363699 |        5 |
| 5597063 |        2 |
| 5281295 |        2 |
...
| 4712249 |        6 |
| 1581236 |        3 |
| 5126827 |        2 |
| 1872277 |        7 |
+---------+----------+
836343 rows in set (33.77 sec)
mysql> select id from zyads_integral group by sha1 having count(*) >= 1;
+---------+
| id      |
+---------+
|    7696 |
| 5137851 |
| 5363699 |
| 4826139 |
| 6250586 |
...
| 5126827 |
|  570573 |
| 1872277 |
| 4514446 |
+---------+
2466076 rows in set (3 min 36.80 sec)

删除数据

-- In-place de-duplication: for every distinct sha1 keep the row with the
-- smallest id and delete all other rows.
-- The previous form picked ONE arbitrary id per duplicated sha1, so a group of
-- five copies still had four rows left after a run.
-- The extra derived table (`keeper`) is required: MySQL refuses to DELETE from
-- a table that is read directly in a subquery of the same statement.
-- `id` is the NOT NULL primary key, so MIN(id) is never NULL and NOT IN is safe.
DELETE FROM zyads_integral
WHERE id NOT IN (
    SELECT keeper.keep_id
    FROM (
        SELECT MIN(id) AS keep_id
        FROM zyads_integral
        GROUP BY sha1
    ) AS keeper
);
-- Staging table for the de-duplicated copy of zyads_integral.
-- The UNIQUE KEY on `sha1` makes the server reject a second row with the same
-- digest, so this table can never hold duplicates.
-- NOTE(review): the VARCHAR widths narrow the source TEXT columns; confirm no
-- existing `hash` / `name` / `index` value exceeds them before loading.
-- AUTO_INCREMENT=6756155: newly generated ids start there; presumably just
-- above the source table's current maximum id -- TODO confirm.
CREATE TABLE `zyads_integral_tmp` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `hash` varchar(100),
  `sha1` varchar(100) NOT NULL,
  `name` varchar(1000) NOT NULL,
  `index` varchar(10),
  -- 20 characters hold any 64-bit byte count; 10 characters cap out just
  -- below 10^10 bytes.
  `size` varchar(20),
  PRIMARY KEY (`id`),
  UNIQUE KEY `sha1` (`sha1`)
) ENGINE=MyISAM AUTO_INCREMENT=6756155 DEFAULT CHARSET=gbk;

-- Copy exactly one row per distinct sha1 into the staging table.
-- `keeper` selects the smallest id of every sha1 group; joining back on that
-- id pulls all columns from that same row, so the result is deterministic and
-- valid under ONLY_FULL_GROUP_BY.
-- `id` is not copied: the staging table assigns fresh AUTO_INCREMENT values,
-- and ORDER BY keeps them in the same relative order as the source ids.
INSERT INTO zyads_integral_tmp (`hash`, `sha1`, `name`, `index`, `size`)
SELECT
    src.`hash`,
    src.`sha1`,
    src.`name`,
    src.`index`,
    src.`size`
FROM zyads_integral AS src
INNER JOIN (
    SELECT MIN(id) AS keep_id
    FROM zyads_integral
    GROUP BY sha1
) AS keeper
    ON keeper.keep_id = src.id
ORDER BY src.id;

-- Move the original table out of the way under a backup name.
-- MySQL requires the form RENAME TABLE <old> TO <new>; a bare `rename a b`
-- is a syntax error.
-- NOTE(review): to promote the staging table in the same atomic statement,
-- append `, zyads_integral_tmp TO zyads_integral` -- confirm this is intended.
RENAME TABLE zyads_integral TO zyads_integral_tmp_1;

0 comments

To reply to the article, please log in or register