基于xpath选择器、PyQuery、正则表达式的格式清理工具详解

时间:2022-09-18 09:03:26

1,使用xpath清理不必要的标签元素,以及无内容标签

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from lxml import etree
 
def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Strip unwanted elements and content-free tags from an HTML string.

    :param text: html_content to clean
    :param xpath_dict: extra ``{label: xpath}`` removal rules; only the values
        are evaluated and the mapping itself is left unmodified
    :return: string type html_content (pretty-printed serialization)
    '''

    def drop(node):
        # lxml keeps the text that FOLLOWS an element in ``node.tail`` and
        # throws it away together with the element, so hand it over to the
        # preceding sibling (or the parent) before detaching the node.
        parent = node.getparent()
        if parent is None:
            # The document root has nothing to be removed from.
            return
        if node.tail:
            previous = node.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + node.tail
            else:
                parent.text = (parent.text or '') + node.tail
        parent.remove(node)

    # Copy first: updating the caller's dict in place would leak the default
    # rules into it on every call.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()

    # Rules that are always applied; these tags are unwanted except in
    # extreme cases.
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # Regular removal pass: delete every element matched by a rule.
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            # with_tail=False: the tail text is kept in the document, so it
            # should not show up in the "removed" log entry.
            bad_string = etree.tostring(bad, encoding='utf-8',
                                        pretty_print=True,
                                        with_tail=False).decode()
            logger.debug(f"clean article content : {bad_string}")
            drop(bad)

    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Visit every element outside the keep-list and delete the ones that hold
    # neither visible text nor a kept descendant (image / table structure).
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        has_kept_child = bool(p.xpath(f".//*[{skip_tip}]"))
        has_text = bool(re.sub(r'\s', '', p.xpath('string(.)')))
        if has_kept_child or has_text:
            continue
        if p.getparent() is None:
            # An empty root element must stay so the result can be serialized.
            continue

        bad_p = etree.tostring(p, encoding='utf-8',
                               pretty_print=True,
                               with_tail=False).decode()
        logger.debug(f"clean p tag : {bad_p}")
        drop(p)

    return etree.tostring(selector, encoding='utf-8',
                          pretty_print=True).decode()

2,使用pyquery清理标签属性,并返回处理后源码和纯净文本

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python
# -*-coding:utf-8-*-
 
from pyquery import PyQuery as pq
 
def pyquery_clean(self, text, url, pq_dict) -> tuple:
    '''
    Remove caller-selected tags and normalize the attributes of the rest.

    :param text: html_content to process
    :param url: base url used to complete relative image links
    :param pq_dict: ``{label: selector}`` mapping of PyQuery selectors whose
        matches are removed; only the values are used
    :return: ``(plain_text, html)`` of the processed document
    '''
    # Selectors of the tags to delete.
    remove_by_pq = pq_dict if pq_dict else dict()
    # Attributes that are always kept (table layout).
    attr_white_list = ('rowspan', 'colspan')
    # Attributes that may carry an image link.
    img_key_list = ('src', 'data-echo', 'data-src', 'data-original')
    # Build the PyQuery document.
    dom = pq(text)

    # Delete unwanted tags, logging what is about to go.
    for bad_tag in remove_by_pq.values():
        for bad in dom(bad_tag):
            bad_string = pq(bad).html()
            logger.debug(f"clean article content : {bad_string}")
        dom.remove(bad_tag)

    # Attribute handling for every remaining element.
    for tag in dom('*'):
        node = pq(tag)
        # Iterate over a snapshot: the loop body deletes attributes from the
        # very element whose attributes are being walked.
        for key, value in list(tag.attrib.items()):
            # Keep rowspan / colspan untouched.
            if key in attr_white_list:
                continue
            if key in img_key_list:
                # Complete a partial image url and store it under ``src``.
                img_url = self.absolute_url(url, value)
                node.remove_attr(key)
                node.attr('src', img_url)
                node.attr('alt', '')
            elif key == 'alt':
                # ``alt`` is kept but blanked.
                node.attr(key, '')
            else:
                # Every other attribute is dropped.
                node.remove_attr(key)

    # html() may yield None when there is nothing to serialize; hand back an
    # empty string so string-only post-processing does not crash on it.
    return dom.text(), dom.html() or ''

3,使用正则表达式清理空格以及换行符内容

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python
# -*-coding:utf-8-*-
 
import re 
 
def regular_clean(self, str1: str, str2: str):
    '''
    Normalize whitespace in both strings and simplify the markup of the second.

    :param str1: content (plain text)
    :param str2: html_content
    :return: ``(str1, str2)`` after processing
    '''

    def new_line(text):
        # Canonicalize every <br> spelling first; otherwise the generic
        # self-closing-tag pattern below would delete ``<br/>`` as well.
        text = re.sub(r'<br\s?/?>', '<br>', text)
        # Drop attribute-free inline/wrapper tags and any bare self-closing tag.
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '',
            text)
        text = text.replace('\n', '')
        # Headings are flattened into paragraphs.
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        # One paragraph per line; <br> is emitted in its self-closing form.
        return text.replace('</p>', '</p>\n').replace('<br>', '<br/>')

    # TODO: handle blank-line issues
    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)

    # TODO: html_content handling - 1) remove superfluous/unusable tags and
    # tags that disturb rendering 2) handle and replace line breaks
    str2 = new_line(text=str2)

    return str1, str2

结尾部分,各个方法封装类代码展示

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/usr/bin/env python
# -*-coding:utf-8-*-
'''
author: szhan
date:2020-08-17
summary: 清理html_content以及获取纯净数据格式
'''
 
import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
 
from loguru import logger
 
 
class CleanArticle:
    '''
    Clean an article's html_content and extract its plain text.

    :meth:`run` applies three passes in order: :meth:`xpath_clean`,
    :meth:`pyquery_clean` and :meth:`regular_clean`.
    '''

    # XPath rules applied on every run in addition to caller-supplied ones;
    # these tags are unwanted except in extreme cases.
    _DEFAULT_REMOVE_XPATHS = {
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style',
    }

    # XPath predicate naming the elements that are never deleted for being
    # empty, and whose presence protects their ancestors from deletion.
    _KEEP_WHEN_EMPTY = (
        "name()='img' or name()='tr' or "
        "name()='th' or name()='tbody' or "
        "name()='thead' or name()='table'"
    )

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        '''
        :param text: html_content to clean
        :param url: base url used to complete relative image links
        :param xpath_dict: extra ``{label: xpath}`` removal rules
        :param pq_dict: extra ``{label: selector}`` PyQuery removal rules
        '''
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a url.

        :param baseurl: url the target is resolved against
        :param url: target url
        :return: ``url`` unchanged when it already has a scheme, otherwise
            ``url`` joined onto ``baseurl``
        '''
        if urlsplit(url).scheme:
            return url
        return urljoin(baseurl, url)

    @staticmethod
    def clean_blank(text):
        '''
        Whitespace handling.

        :param text: string to tidy
        :return: ``text`` without ``&#13;``, ideographic spaces, tabs and
            non-breaking spaces, with newline runs collapsed to one newline,
            other whitespace runs removed, and both ends stripped
        '''
        for token in ('&#13;', '\u3000', '\t', '\xa0'):
            text = text.replace(token, '')
        # Collapse newline runs BEFORE deleting whitespace runs: in the
        # opposite order the ``\s{2,}`` pass swallows every newline run and
        # the collapse can never match, gluing paragraphs together.
        text = re.sub(r'\n{2,}', '\n', text)
        text = re.sub(r'\s{2,}', '', text)
        return text.strip('\n').strip()

    @staticmethod
    def _drop_element(node) -> None:
        '''Detach ``node`` from its parent while keeping the text after it.'''
        parent = node.getparent()
        if parent is None:
            # The document root has nothing to be removed from.
            return
        # lxml keeps the text that FOLLOWS an element in ``node.tail`` and
        # discards it together with the element, so move it to the preceding
        # sibling (or the parent) first.
        if node.tail:
            previous = node.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + node.tail
            else:
                parent.text = (parent.text or '') + node.tail
        parent.remove(node)

    def run(self):
        '''
        Run the three cleaning passes on ``self.text``.

        :return: processed ``(content, html_content)``
        :raises ValueError: when ``self.text`` is empty or not a ``str``
        '''
        if not self.text or not isinstance(self.text, str):
            raise ValueError('html_content has a bad type value')
        # First pass: xpath removal of blank text, comments and of iframe,
        # button, form, script, style, video ... tags.
        text = self.xpath_clean(self.text, self.xpath_dict)

        # Second pass: attribute-level details with pyquery.
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)

        # Final pass: regular-expression clean-up.
        content, html_content = self.regular_clean(str1, str2)

        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        Strip unwanted elements and content-free tags with xpath.

        :param text: html_content
        :param xpath_dict: extra ``{label: xpath}`` removal rules; only the
            values are evaluated and the mapping is left unmodified
        :return: string type html_content (pretty-printed serialization)
        '''
        # Copy first: updating the caller's dict in place would leak the
        # default rules into it (and into ``self.xpath_dict``) on every call.
        remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()
        remove_by_xpath.update(CleanArticle._DEFAULT_REMOVE_XPATHS)

        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)

        # Regular removal pass: delete every element matched by a rule.
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                # with_tail=False: the tail text stays in the document, so it
                # should not show up in the "removed" log entry.
                bad_string = etree.tostring(bad, encoding='utf-8',
                                            pretty_print=True,
                                            with_tail=False).decode()
                logger.debug(f"clean article content : {bad_string}")
                CleanArticle._drop_element(bad)

        skip_tip = CleanArticle._KEEP_WHEN_EMPTY
        # Visit every element outside the keep-list and delete the ones that
        # hold neither visible text nor a kept descendant.
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            has_kept_child = bool(p.xpath(f".//*[{skip_tip}]"))
            has_text = bool(re.sub(r'\s', '', p.xpath('string(.)')))
            if has_kept_child or has_text:
                continue
            if p.getparent() is None:
                # An empty root must stay so the result can be serialized.
                continue

            bad_p = etree.tostring(p, encoding='utf-8',
                                   pretty_print=True,
                                   with_tail=False).decode()
            logger.debug(f"clean p tag : {bad_p}")
            CleanArticle._drop_element(p)

        return etree.tostring(selector, encoding='utf-8',
                              pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> tuple:
        '''
        Remove caller-selected tags and normalize the attributes of the rest.

        :param text: html_content to process
        :param url: base url used to complete relative image links
        :param pq_dict: ``{label: selector}`` mapping of PyQuery selectors
            whose matches are removed; only the values are used
        :return: ``(plain_text, html)`` of the processed document
        '''
        # Selectors of the tags to delete.
        remove_by_pq = pq_dict if pq_dict else dict()
        # Attributes that are always kept (table layout).
        attr_white_list = ('rowspan', 'colspan')
        # Attributes that may carry an image link.
        img_key_list = ('src', 'data-echo', 'data-src', 'data-original')
        # Build the PyQuery document.
        dom = pq(text)

        # Delete unwanted tags, logging what is about to go.
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.debug(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Attribute handling for every remaining element.
        for tag in dom('*'):
            node = pq(tag)
            # Iterate over a snapshot: the loop body deletes attributes from
            # the very element whose attributes are being walked.
            for key, value in list(tag.attrib.items()):
                # Keep rowspan / colspan untouched.
                if key in attr_white_list:
                    continue
                if key in img_key_list:
                    # Complete a partial image url and store it under ``src``.
                    img_url = self.absolute_url(url, value)
                    node.remove_attr(key)
                    node.attr('src', img_url)
                    node.attr('alt', '')
                elif key == 'alt':
                    # ``alt`` is kept but blanked.
                    node.attr(key, '')
                else:
                    # Every other attribute is dropped.
                    node.remove_attr(key)

        # html() may yield None when there is nothing to serialize; hand back
        # an empty string so clean_blank() downstream does not crash on it.
        return dom.text(), dom.html() or ''

    def regular_clean(self, str1: str, str2: str):
        '''
        Regular-expression pass over the extracted text and markup.

        :param str1: content (plain text)
        :param str2: html_content
        :return: ``(str1, str2)`` after processing
        '''

        def new_line(text):
            # Canonicalize every <br> spelling first; otherwise the generic
            # self-closing-tag pattern below would delete ``<br/>`` as well.
            text = re.sub(r'<br\s?/?>', '<br>', text)
            # Drop attribute-free inline/wrapper tags and any bare
            # self-closing tag.
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '',
                text)
            text = text.replace('\n', '')
            # Headings are flattened into paragraphs.
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            # One paragraph per line; <br> is emitted self-closing.
            return text.replace('</p>', '</p>\n').replace('<br>', '<br/>')

        # TODO: handle blank-line issues
        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)

        # TODO: html_content handling - 1) remove superfluous/unusable tags
        # and tags that disturb rendering 2) handle and replace line breaks
        str2 = new_line(text=str2)

        return str1, str2
 
if __name__ == '__main__':
    # Demo entry point: clean a local HTML file and print the resulting markup.
    with open('html_content.html', 'r', encoding='utf-8') as f:
        # read() yields the same text as concatenating readlines(), without
        # building the string one line at a time.
        html = f.read()
    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)

总结

到此这篇关于基于xpath选择器、PyQuery、正则表达式的格式清理工具详解的文章就介绍到这了,更多相关PyQuery、正则表达式的格式清理工具内容请搜索服务器之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持服务器之家!

原文链接:https://blog.csdn.net/weixin_37128372/article/details/108340853