下面的示例脚本演示如何用 Python 逐行读取以制表符分隔的日志文件,用 re 正则表达式提取各站点的用户 ID,并借助 set 去重后写入结果文件。代码如下所示:
# -*- coding:utf-8 -*-
from datetime import datetime
import re
# Default input/output locations used when Main() is called without arguments.
_DEFAULT_SOURCE_PATH = '/data/u_lx_data/fudan/muying/muying_11yue_all.txt'
_DEFAULT_TARGET_PATH = '/data/u_lx_data/fudan/muying/python/uid_regular_get.txt'

# Extraction rules keyed by the site domain found in column 2 of each record.
# Each value is (progress message, [(column index, required substring or None,
# compiled pattern), ...]).  Patterns are compiled once here instead of being
# looked up again for every input line.
_SITE_RULES = {
    # Babytree
    'babytree.com': ("宝宝树已完成", [
        (3, None, re.compile(r'.*NL=u%02(u\d+)', re.I)),
    ]),
    # Youbaobao
    'youzibuy.com': ("柚宝宝已完成", [
        (4, "yunqi.youzibuy.com/tae_top_notify",
         re.compile(r'.*myuid=(\d+)', re.I)),
    ]),
    # Mamabang
    'mmbang.com': ("妈妈帮已完成", [
        (3, None, re.compile(r'.*uid=(\d+)', re.I)),
    ]),
    # Mama.cn: two independent URL shapes; a record may satisfy both.
    'mama.cn': ("妈妈网已完成", [
        (4, "mapi.mama.cn/feed/users/show",
         re.compile(r'.*friend_uid=(\d+)', re.I)),
        (4, "mamaquan/mmq_thread", re.compile(r'.*uid=(\d+)', re.I)),
    ]),
    # ci123
    'ci123.com': ("育儿网已完成", [
        (3, None, re.compile(r'.*ci123js=([a-zA-Z]+\d+)', re.I)),
    ]),
}


def _extract_uids(rules, fields):
    """Yield the UID captured by every rule in *rules* that applies to *fields*."""
    for column, marker, pattern in rules:
        # Records that are too short for this rule are skipped instead of
        # raising IndexError.
        if column >= len(fields):
            continue
        text = fields[column]
        if marker is not None and marker not in text:
            continue
        # The leading greedy ``.*`` makes the first match capture the LAST
        # occurrence in the text; ``search(...).group(1)`` is therefore the
        # same value the original ``re.findall(...)[0]`` produced.
        match = pattern.search(text)
        if match:
            yield match.group(1)


def Main(source_path=_DEFAULT_SOURCE_PATH, target_path=_DEFAULT_TARGET_PATH):
    """Extract unique user IDs from a tab-separated log and write one per line.

    Each input line is split on tabs.  Column 2 selects the site rules; the
    matching regex is applied to column 3 or 4 (depending on the site) and the
    captured ID is written to the output the first time it is seen.  A single
    set is shared by all sites, so an ID already written for one site is not
    written again for another.

    Args:
        source_path: Path of the tab-separated input file.
        target_path: Path of the output file; it is overwritten.

    Returns:
        None.  Start/finish timestamps and per-record progress messages are
        printed to stdout.
    """
    seen = set()  # IDs already written, for de-duplication
    print("开始。。。。。")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # The source is opened first so that a missing input file does not
    # truncate an existing output file.
    with open(source_path, 'r') as src, open(target_path, 'w') as dst:
        for raw in src:
            # Strip only the line terminator: a plain strip() would also eat
            # leading tabs and shift every column of a record whose first
            # fields are empty.
            fields = raw.rstrip("\r\n").split("\t")
            if len(fields) < 3:
                continue  # blank or truncated record: no domain column
            site = _SITE_RULES.get(fields[2])
            if site is None:
                continue
            message, rules = site
            for uid in _extract_uids(rules, fields):
                if uid not in seen:
                    seen.add(uid)
                    dst.write(uid + "\n")
            # NOTE(review): the listing's indentation was lost; the message is
            # emitted once per record of the site, which is the shallowest
            # placement the if/elif chain allows.  Confirm the intended depth.
            print(message)
    print("完成。。。。。")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    Main()
以上就是使用 Python 读写文件并结合 re 正则表达式与 set 去重的示例的全部内容,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/IBoyMan/article/details/79401596