1,使用xpath清理不必要的标签元素,以及无内容标签
?| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
from lxml import etree
def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements and content-free elements from HTML via XPath.

    :param text: html_content to clean
    :param xpath_dict: extra XPath expressions (the dict values) whose matches
        are removed as well; may be empty or None and is never modified
    :return: string type html_content (pretty-printed); an empty string when
        the parser produces no root element
    '''
    def drop(node):
        # Detach ``node`` but keep its tail text: lxml's remove() would
        # otherwise silently discard the text that follows the element.
        parent = node.getparent()
        if parent is None:
            # Root element, or a node that was already detached.
            return
        if node.tail:
            previous = node.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + node.tail
            else:
                parent.text = (parent.text or '') + node.tail
        parent.remove(node)

    # Work on a copy so the caller's dict is not polluted with the defaults.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()
    # Elements that are removed in practically every case.
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style',
    })
    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)
    if selector is None:
        return ''

    # Regular removal pass: drop every element matched by a target XPath.
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8',
                                        pretty_print=True).decode()
            logger.debug(f"clean article content : {bad_string}")
            drop(bad)

    # Images and table structure are meaningful even without any text.
    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Remove every remaining element that carries neither text nor one of
    # the elements listed above.
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        if p.xpath(f".//*[{skip_tip}]") or p.xpath('string(.)').strip():
            continue
        bad_p = etree.tostring(p, encoding='utf-8',
                               pretty_print=True).decode()
        logger.debug(f"clean p tag : {bad_p}")
        drop(p)
    return etree.tostring(selector, encoding='utf-8',
                          pretty_print=True).decode()
|
2,使用pyquery清理标签属性,并返回处理后源码和纯净文本
?| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
#!/usr/bin/env python
# -*-coding:utf-8-*-
from pyquery import PyQuery as pq
def pyquery_clean(self, text, url, pq_dict) -> tuple:
    '''
    Remove selected nodes and normalize tag attributes with PyQuery.

    Every attribute is dropped except ``rowspan``/``colspan`` (kept as is),
    ``alt`` (kept but emptied) and the source attributes ``src``,
    ``data-echo``, ``data-src`` and ``data-original``, which are folded into
    one absolute ``src``.

    :param text: html_content to process
    :param url: base URL used to complete relative source links
    :param pq_dict: PyQuery selectors (the dict values) whose matches are
        removed; may be empty or None
    :return: ``(plain_text, html)``; both items are always strings
    '''
    # Selectors of nodes to delete.
    remove_by_pq = pq_dict if pq_dict else dict()
    # Attribute whitelist (table layout).
    attr_white_list = ('rowspan', 'colspan')
    # Attributes that may carry the link of an image.
    img_key_list = ('src', 'data-echo', 'data-src', 'data-original')
    dom = pq(text)

    # Delete useless nodes, logging what is about to be dropped.
    for bad_tag in remove_by_pq.values():
        for bad in dom(bad_tag):
            bad_string = pq(bad).html()
            logger.debug(f"clean article content : {bad_string}")
        dom.remove(bad_tag)

    # Attribute handling for every remaining tag.
    for tag in dom('*'):
        node = pq(tag)
        source = None
        # Iterate over a snapshot because attributes are edited in the loop.
        for key, value in list(tag.attrib.items()):
            if key in attr_white_list:
                continue
            if key in img_key_list:
                node.remove_attr(key)
                # A lazy-load attribute always beats a plain ``src`` (usually
                # a placeholder), independent of attribute order; empty
                # values are ignored instead of resolving to the page URL.
                if value and (key != 'src' or source is None):
                    source = value
            elif key == 'alt':
                # Keep the alt attribute, but empty.
                node.attr(key, '')
            else:
                node.remove_attr(key)
        if source is not None:
            node.attr('src', self.absolute_url(url, source))
            node.attr('alt', '')
    # html() yields None for a node without content; normalize to ''.
    return dom.text() or '', dom.html() or ''
|
3,正则表达清理空格以及换行符内容
?| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
#!/usr/bin/env python
# -*-coding:utf-8-*-
import re
def regular_clean(self, str1: str, str2: str):
    '''
    Final regex-based formatting of the extracted text and markup.

    :param str1: content (plain text)
    :param str2: html_content (markup)
    :return: ``(content, html_content)`` after cleanup
    '''
    def blank_outside_tags(markup):
        # Clean whitespace only in the text between tags. Running
        # clean_blank over the whole markup would also delete the spaces
        # inside tags and glue attributes to the tag name.
        pieces = re.split(r'(<[^<>]+>)', markup)
        # With one capture group, odd indexes hold the tags themselves.
        return ''.join(
            piece if index % 2 else self.clean_blank(piece)
            for index, piece in enumerate(pieces)
        )

    def new_line(text):
        # Normalize <br> variants first so the self-closing-tag rule below
        # does not delete them.
        text = re.sub(r'<br\s?/?>', '<br>', text)
        # Drop inline/wrapper tags and attribute-less self-closing tags.
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '',
            text)
        text = text.replace('\n', '')
        # Headings are rendered as ordinary paragraphs.
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        # One paragraph per line; emit <br/> in its self-closing form.
        return text.replace('</p>', '</p>\n').replace('<br>', '<br/>')

    str1 = self.clean_blank(str1)
    str2 = new_line(text=blank_outside_tags(str2))
    return str1, str2
|
结尾部分,各个方法封装类代码展示
?| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
#!/usr/bin/env python
# -*-coding:utf-8-*-
'''
author: szhan
date:2020-08-17
summary: 清理html_content以及获取纯净数据格式
'''
import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
from loguru import logger
class CleanArticle:
    '''
    Clean article HTML and extract its plain text.

    run() chains three stages: XPath-based element removal, PyQuery-based
    attribute normalization and regex-based whitespace/tag cleanup.
    '''

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        '''
        :param text: raw html_content of the article
        :param url: page URL, used as base for relative source links
        :param xpath_dict: extra XPath expressions (dict values) to remove
        :param pq_dict: extra PyQuery selectors (dict values) to remove
        '''
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a possibly relative URL.

        :param baseurl: URL the target is resolved against
        :param url: target url
        :return: ``url`` unchanged when it already has a scheme, otherwise
            ``url`` joined onto ``baseurl``
        '''
        if urlsplit(url).scheme:
            return url
        return urljoin(baseurl, url)

    @staticmethod
    def clean_blank(text):
        '''
        Strip blanks from a piece of text.

        Spaces, ideographic spaces, tabs and non-breaking spaces are deleted,
        every whitespace run containing a line break becomes one ``\\n``,
        other runs of two or more whitespace characters are deleted and the
        result is stripped.

        :param text: text to clean
        :return: cleaned text
        '''
        text = text.translate(str.maketrans('', '', ' \u3000\t\xa0'))
        # Collapse blank lines BEFORE deleting whitespace runs; in the
        # opposite order the runs (line breaks included) are deleted first
        # and neighbouring lines get merged.
        text = re.sub(r'\s*\n\s*', '\n', text)
        text = re.sub(r'\s{2,}', '', text)
        return text.strip()

    def run(self):
        '''
        Run the whole cleaning pipeline on ``self.text``.

        :return: processed ``(content, html_content)``
        :raises ValueError: when ``self.text`` is empty or not a string
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Step 1: XPath removes blank text, comments and unwanted elements
        # (iframe, button, form, script, style, video, ...).
        text = self.xpath_clean(self.text, self.xpath_dict)
        # Step 2: PyQuery takes care of the attribute details.
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)
        # Step 3: final regex cleanup.
        content, html_content = self.regular_clean(str1, str2)
        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        Remove unwanted elements and content-free elements via XPath.

        :param text: html_content to clean
        :param xpath_dict: extra XPath expressions (the dict values) whose
            matches are removed as well; may be empty or None and is never
            modified
        :return: string type html_content (pretty-printed); an empty string
            when the parser produces no root element
        '''
        def drop(node):
            # Detach ``node`` but keep its tail text: lxml's remove() would
            # otherwise silently discard the text that follows the element.
            parent = node.getparent()
            if parent is None:
                # Root element, or a node that was already detached.
                return
            if node.tail:
                previous = node.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or '') + node.tail
                else:
                    parent.text = (parent.text or '') + node.tail
            parent.remove(node)

        # Work on a copy so self.xpath_dict / the caller's dict is not
        # polluted with the defaults.
        remove_by_xpath = dict(xpath_dict) if xpath_dict else dict()
        # Elements that are removed in practically every case.
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style',
        })
        parser = etree.HTMLParser(remove_blank_text=True,
                                  remove_comments=True)
        selector = etree.HTML(text, parser=parser)
        if selector is None:
            return ''

        # Regular removal pass: drop every element matched by a target XPath.
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8',
                                            pretty_print=True).decode()
                logger.debug(f"clean article content : {bad_string}")
                drop(bad)

        # Images and table structure are meaningful even without any text.
        skip_tip = "name()='img' or name()='tr' or " \
                   "name()='th' or name()='tbody' or " \
                   "name()='thead' or name()='table'"
        # Remove every remaining element that carries neither text nor one
        # of the elements listed above.
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            if p.xpath(f".//*[{skip_tip}]") or p.xpath('string(.)').strip():
                continue
            bad_p = etree.tostring(p, encoding='utf-8',
                                   pretty_print=True).decode()
            logger.debug(f"clean p tag : {bad_p}")
            drop(p)
        return etree.tostring(selector, encoding='utf-8',
                              pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> tuple:
        '''
        Remove selected nodes and normalize tag attributes with PyQuery.

        Every attribute is dropped except ``rowspan``/``colspan`` (kept as
        is), ``alt`` (kept but emptied) and the source attributes ``src``,
        ``data-echo``, ``data-src`` and ``data-original``, which are folded
        into one absolute ``src``.

        :param text: html_content to process
        :param url: base URL used to complete relative source links
        :param pq_dict: PyQuery selectors (the dict values) whose matches
            are removed; may be empty or None
        :return: ``(plain_text, html)``; both items are always strings
        '''
        # Selectors of nodes to delete.
        remove_by_pq = pq_dict if pq_dict else dict()
        # Attribute whitelist (table layout).
        attr_white_list = ('rowspan', 'colspan')
        # Attributes that may carry the link of an image.
        img_key_list = ('src', 'data-echo', 'data-src', 'data-original')
        dom = pq(text)

        # Delete useless nodes, logging what is about to be dropped.
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.debug(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Attribute handling for every remaining tag.
        for tag in dom('*'):
            node = pq(tag)
            source = None
            # Iterate over a snapshot because attributes are edited here.
            for key, value in list(tag.attrib.items()):
                if key in attr_white_list:
                    continue
                if key in img_key_list:
                    node.remove_attr(key)
                    # A lazy-load attribute always beats a plain ``src``
                    # (usually a placeholder), independent of attribute
                    # order; empty values are ignored instead of resolving
                    # to the page URL.
                    if value and (key != 'src' or source is None):
                        source = value
                elif key == 'alt':
                    # Keep the alt attribute, but empty.
                    node.attr(key, '')
                else:
                    node.remove_attr(key)
            if source is not None:
                node.attr('src', self.absolute_url(url, source))
                node.attr('alt', '')
        # html() yields None for a node without content; normalize to ''.
        return dom.text() or '', dom.html() or ''

    def regular_clean(self, str1: str, str2: str):
        '''
        Final regex-based formatting of the extracted text and markup.

        :param str1: content (plain text)
        :param str2: html_content (markup)
        :return: ``(content, html_content)`` after cleanup
        '''
        def blank_outside_tags(markup):
            # Clean whitespace only in the text between tags. Running
            # clean_blank over the whole markup would also delete the spaces
            # inside tags and glue attributes to the tag name.
            pieces = re.split(r'(<[^<>]+>)', markup)
            # With one capture group, odd indexes hold the tags themselves.
            return ''.join(
                piece if index % 2 else self.clean_blank(piece)
                for index, piece in enumerate(pieces)
            )

        def new_line(text):
            # Normalize <br> variants first so the self-closing-tag rule
            # below does not delete them.
            text = re.sub(r'<br\s?/?>', '<br>', text)
            # Drop inline/wrapper tags and attribute-less self-closing tags.
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '',
                text)
            text = text.replace('\n', '')
            # Headings are rendered as ordinary paragraphs.
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            # One paragraph per line; emit <br/> in its self-closing form.
            return text.replace('</p>', '</p>\n').replace('<br>', '<br/>')

        str1 = self.clean_blank(str1)
        str2 = new_line(text=blank_outside_tags(str2))
        return str1, str2
if __name__ == '__main__':
    # Demo entry point: clean a local HTML file and print the cleaned markup.
    with open('html_content.html', 'r', encoding='utf-8') as f:
        # Read the file in one call instead of joining lines with ``+=``.
        html = f.read()
    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)
|
总结
到此,这篇关于基于xpath选择器、PyQuery、正则表达式的格式清理工具详解的文章就介绍到这了。更多相关PyQuery、正则表达式的格式清理工具内容,请搜索服务器之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持服务器之家!
原文链接:https://blog.csdn.net/weixin_37128372/article/details/108340853








发表评论
◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。