Python3+Wordcloud 实现单身相亲网站词云分析

时间:2022-01-09 18:50:09

MongoDB封装,实现数据存储
'''
存储文件:MongoClass.py
'''
import pymongo
import traceback

class MongoOpt:
    """Thin wrapper around one fixed MongoDB collection (db 'geng', col 'gerenqingkuang').

    All operations are best-effort: errors are printed via traceback and
    swallowed, so callers never see an exception from this class.
    """

    def __init__(self):
        """Connect to the local MongoDB server and select the target collection."""
        self.__client = None
        self.__db = None
        self.__col = None
        try:
            self.__client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
            self.__db = self.__client['geng']
            self.__col = self.__db['gerenqingkuang']
        except Exception:  # was a bare except:, which also traps SystemExit/KeyboardInterrupt
            traceback.print_exc()

    def add_collection(self, info):
        """Insert one document (dict) into the collection; log errors instead of raising."""
        try:
            self.__col.insert_one(info)
        except Exception:
            traceback.print_exc()

    def get_all_collections(self):
        """Concatenate every field value of every document (excluding _id and 相片)
        into one string, dump it to ./conf/233.txt, and return it.

        Returns None if the query or the file write fails.
        """
        try:
            pieces = []
            for doc in self.__col.find({}, {"_id": 0, "相片": 0}):
                pieces.extend(str(v) for v in doc.values())
            # join once instead of repeated += (quadratic on large dumps)
            text = ''.join(pieces)
            with open("./conf/233.txt", 'w', encoding='utf-8') as txt:
                txt.write(text)
            return text
        except Exception:
            traceback.print_exc()

if __name__ == '__main__':
    # Smoke test: dump every stored document's values to ./conf/233.txt and print them.
    mo = MongoOpt()
    # mo.add_collection({'name': "geng", 'sex': 'm'})
    res = mo.get_all_collections()
    print(res)
爬虫封装,实现数据爬取
'''
文件目录:SpiderClass.py
'''
import requests
from bs4 import BeautifulSoup as bs
import re

class SpiderOpt:
    """Download one profile page and extract labelled key/value details from it."""

    # Compiled once at class level (was rebuilt in every method call).
    # Strips HTML tags such as <span class="s1"> from a fragment.
    _RE_HTML = re.compile('<[^<]+?>')

    def __init__(self, method, url, **kwargs):
        # The HTTP request is performed immediately; **kwargs (headers, timeout, …)
        # pass straight through to requests.request.
        self.__response = requests.request(method, url, **kwargs)

    def get_response(self):
        """Parse the downloaded HTML with BeautifulSoup; must run before the getters."""
        self.__res = self.__response.text
        self.__bs = bs(self.__res, 'html.parser')

    def _extract_pairs(self, css_class):
        """Collect s1 (label) / txt (value) pairs from every *css_class* element.

        Shared by get_person_details and get_family_love, which previously
        duplicated this logic verbatim.
        """
        keys, values = [], []
        for detail in self.__bs.find_all(class_=css_class):
            keys.extend(detail.find_all(class_='s1'))
            values.extend(detail.find_all(class_='txt'))
        # Strip tags, then trim: labels lose everything after ':',
        # values keep only the first whitespace-separated token.
        keys = [self._RE_HTML.sub('', str(k)).split(':')[0] for k in keys]
        values = [self._RE_HTML.sub('', str(v)).split()[0] for v in values]
        return dict(zip(keys, values))

    def get_person_details(self):
        """Return the 个人情况 (personal details) section as a dict."""
        return self._extract_pairs("newshow2")

    def get_family_love(self):
        """Return the 家庭情况 (family details) section as a dict."""
        return self._extract_pairs("newshow3")

    def get_love_request(self):
        """Return the 择偶要求 (partner requirements) table as a dict.

        Table cells alternate: even-indexed <td> is a key, odd-indexed is its value.
        (The original line was corrupted by a pasted URL inside the sub() call.)
        """
        tds = self.__bs.find(class_="tableB").find_all('td')
        keys, values = [], []
        for i, td in enumerate(tds):
            s = self._RE_HTML.sub('', str(td)).split(':')[0]
            (keys if i % 2 == 0 else values).append(s)
        return dict(zip(keys, values))

    def get_jpg_src(self):
        """Return the href of the profile-photo link.

        (The original class_= argument was corrupted by a pasted URL.)
        """
        photo = self.__bs.find(class_="love_photo")
        return photo.find('a')['href']
词云封装,实现词图生成
'''
文件目录:CloudClass.py
'''
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba

class Cloud:
    """Render the collected profile text as a word-cloud PNG."""

    def get_cloud(self, text=None):
        """Segment *text* with jieba and draw/save a word cloud.

        Args:
            text: raw Chinese text to visualise. The original ignored this
                parameter and always re-read ./conf/233.txt; that file read is
                kept as a fallback when text is falsy, so old call sites and
                the intended `get_cloud(mongo_text)` usage both work.

        Side effects: saves ./conf/个人信息.png and displays the figure.
        """
        font = r'./conf/FZSTK.TTF'  # font path; must contain Chinese glyphs
        if not text:
            # Fallback to the dump written by MongoOpt.get_all_collections().
            with open('./conf/233.txt', 'r', encoding='utf-8') as f:
                text = f.read()
        string = ' '.join(jieba.cut(text))  # WordCloud expects space-separated tokens
        print(len(string))
        img_array = np.array(Image.open('./conf/timg.jpg'))  # mask image as ndarray
        # Noise words excluded from the cloud
        stopwords = {'要求': 0, '汉族': 0, '厘米': 0, '公斤': 0, '父亲': 0, '母亲': 0, '父母': 0,
                     '随意': 0, '其他': 0, '退休': 0, '对方': 0, '中文': 0, '普通话': 0, '不能': 0,
                     '以上': 0, '一般': 0, '无神论': 0, '建在': 0, '退休金': 0}
        wc = WordCloud(
            scale=4,                    # render-resolution multiplier
            background_color='white',
            max_words=400,
            width=1000,
            height=800,
            mask=img_array,             # shape the cloud like the mask image
            font_path=font,
            stopwords=stopwords,
        )
        wc.generate_from_text(string)
        # Save first so the PNG exists even if the interactive display fails.
        wc.to_file('./conf/个人信息.png')
        plt.imshow(wc)   # original line was corrupted by a pasted URL
        plt.axis('off')
        # Dropped the stray plt.figure() that opened a second, blank window.
        plt.show()

if __name__ == '__main__':
    # The original called cloud.get_cloud() with no argument, which raised
    # TypeError because the signature requires `text`. Pass the dump file's
    # contents explicitly — the exact text the method is meant to visualise.
    cloud = Cloud()
    with open('./conf/233.txt', encoding='utf-8') as f:
        cloud.get_cloud(f.read())

主函数
'''
文件目录:Spider.py
'''
from yc.MongoClass import MongoOpt
from yc.SpiderClass import SpiderOpt
from yc.CloudClass import Cloud

class Spider:
    """Crawl one profile page by numeric id and store the extracted fields in MongoDB."""

    def spider(self, num):
        """Fetch profile *num* from love.yc123.com and persist its 择偶要求 fields.

        Returns:
            True on success, False on any network/parse/storage failure.
        """
        try:
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Connection": "keep-alive",
                "Cookie": "PHPSESSID=adgqumos4shf5pj3tkvfuohhs6; yc123_loveyun_loveloginMember=AAIGDhpVVQIHBggHWl1RUVMFAwcHAF8EUgEKUAFSXVZXV1BXVw%3D%3D; yc123_loveyun_showhxtoday=NA%3D%3D; yc123_loveyun_showweixinpushtoday=NA%3D%3D;",
                # Host header and request URL below were corrupted by pasted spam URLs.
                "Host": "love.yc123.com",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
            }
            spider = SpiderOpt('GET', 'https://love.yc123.com/show.php?id=' + str(num), headers=headers)
            spider.get_response()
            person_details = spider.get_person_details()
            # family_love = spider.get_family_love()   # 家庭情况 (optional)
            love_request = spider.get_love_request()
            # jpg_src = spider.get_jpg_src()           # 相片 (optional)
            result = {}
            # result.update(person_details)            # 个人情况
            # result.update(family_love)               # 家庭情况
            result.update(love_request)                # 择偶要求
            # result.update({"相片": jpg_src})
            mongo = MongoOpt()
            mongo.add_collection(result)
            return True
        except Exception:
            # Any failure (bad id, network error, missing table, DB down)
            # just marks this page as invalid for the caller's tally.
            return False

if __name__ == '__main__':
    # Pipeline: (optionally) crawl profile ids, then turn everything stored
    # in MongoDB into a word cloud.
    spider = Spider()
    mongo = MongoOpt()
    cloud = Cloud()
    # Crawl pass (disabled; ids 3001-4719 were already fetched):
    # x, y = 0, 0
    # for i in range(3001, 4720):
    #     if not spider.spider(i):
    #         x += 1
    #         print(i, "爬取无效")
    #         continue
    #     y += 1
    #     print(i, "爬取完成")
    # print(x, y, sep=" ")
    text = mongo.get_all_collections()
    cloud.get_cloud(text)