Python数据预处理（二）- 清洗文本数据_技术笔记

扶云归 2023-09-03

清洗HTML数据

算法思路：

·分析html文本信息

·导入正则：re.I、re.L、re.M、re.S...

·清洗HTML标签：DOCTYPE、CDATA、Script、style

·HTML标签、注释、换行等处理：re.compile

·实现正则清洗HTML数据

import re

"""
re.I    使匹配对大小写不敏感
re.L    做本地化识别（locale-aware）匹配
re.M    多行匹配，影响^和$
re.S    使.匹配包括换行在内的所有字符
re.U    根据Unicode字符集解析字符。这个标志影响\w，\W，\b，\B
re.X    该标志通过给予你更灵活的格式，以便你将正则表达式写得更易于理解
"""
def filter_tags(htmlstr):
    """Strip markup from an HTML string and return the remaining text.

    The input is first squeezed onto a single line (every run of
    whitespace becomes one space), then the following are removed in
    order: the DOCTYPE declaration, ``//<![CDATA[ ... //]]>`` wrappers,
    ``<script>`` and ``<style>`` elements including their bodies, HTML
    comments, all remaining tags, and ``http(s)://....html`` links.
    ``<br>`` tags are turned into newlines; those are the only line
    breaks in the result.

    :param htmlstr: HTML source text.
    :return: the cleaned text, stripped of leading/trailing whitespace.
    """
    # Collapse all whitespace up front so every pattern below works on a
    # single line; the only newlines afterwards are the ones <br> inserts.
    res = ' '.join(htmlstr.split())

    # (pattern, flags, replacement) applied in order.  Order matters:
    # script/style/comment bodies must go before the generic tag pattern,
    # otherwise their contents would be left behind as plain text.
    steps = (
        (r'<!DOCTYPE\s.*?>', re.S | re.I, ''),
        (r'//<!\[CDATA\[.*?//\]\]>', re.S | re.I, ''),
        # Non-greedy ".*?" instead of "[^<]*" so bodies containing "<"
        # (e.g. "a < b" in JavaScript) are still removed.
        (r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.S | re.I, ''),
        (r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.S | re.I, ''),
        (r'<!--.*?-->', re.S, ''),
        (r'<br\s*/?>', re.I, '\n'),
        (r'</?\w+[^>]*>', 0, ''),
        # Non-greedy and whitespace-free: a greedy ".+" would swallow
        # everything between the first URL and the last "html" in the text.
        (r'https?://\S+?\.html', re.I, ''),
        # Tidy up: one newline per break, no padding around it, and no
        # runs of spaces left behind by removed elements.
        (r'\s*\n\s*', 0, '\n'),
        (r' {2,}', 0, ' '),
    )
    for pattern, flags, replacement in steps:
        res = re.sub(pattern, replacement, res, flags=flags)

    return res.strip()

def readFile(path):
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()
if __name__=='__main__':
    # Runs only when the file is executed as a script: read the sample
    # file, pass its text through filter_tags, and print whatever
    # filter_tags returns.
    str_doc=readFile(r'./htmldome.txt')
    res = filter_tags(str_doc)
    print(res)

0赞 · 0采集

akabla 2021-03-26

2.script的清洗
3.style的清洗

截图
0赞 · 0采集
akabla 2021-03-26

1.re.S

截图
0赞 · 0采集

higandawn 2020-02-09

"""
Description:正则清洗HTML数据
Author:
Prompt: code in python3 env
"""
"""
   re.I   使匹配对大小写不敏感
   re.L   做本地化识别（locale-aware）匹配
   re.M   多行匹配，影响^(开头)和$(结尾)
   re.S   匹配包含换行在内的所有字符
   re.U   根据Unicode字符集解析字符，这个标志影响 \w, \W, \b, \B
   re.X   该标志通过给予你更灵活的格式以便你将正则表达式写得更加易于理解
"""
import re

# 处理HTML标签文本
# @param htmlstr html字符串


def filter_tags(htmlstr):
    """Remove markup from an HTML string, leaving space-separated text.

    The input is squeezed onto one line, the DOCTYPE declaration is
    dropped, and then each of the following is replaced by a single
    space, in order: ``//<![CDATA[ ... //]]>`` wrappers, ``<script>`` and
    ``<style>`` elements with their bodies, HTML comments, ``<br>`` tags,
    all other tags, character entities such as ``&nbsp;`` or ``&#160;``,
    and ``http(s)://....html`` links.  Runs of whitespace are collapsed
    to one space at the end; the result is not stripped, so it may start
    or end with a single space.

    :param htmlstr: HTML source text.
    :return: the cleaned text.
    """
    # Collapse whitespace first so every pattern operates on one line.
    res = ' '.join(htmlstr.split())
    res = re.sub(r'<!DOCTYPE\s.*?>', '', res, flags=re.S | re.I)

    # (pattern, flags) pairs applied in order; every match becomes ' '.
    re_mate = [
        (r'//<!\[CDATA\[.*?//\]\]>', re.S | re.I),
        # ".*?" rather than "[^<]*": script/style bodies may contain "<".
        (r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.S | re.I),
        (r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.S | re.I),
        (r'<!--.*?-->', re.S),
        (r'<br\s*/?>', re.I),
        (r'</?\w[^>]*>', 0),
        # Only well-formed entities; a bare "&.*?;" would also delete
        # ordinary text between a stray "&" and the next ";".
        (r'&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);', re.I),
        # Non-greedy and whitespace-free: a greedy ".+" would erase
        # everything from the first URL to the last "html" in the text.
        (r'https?://\S+?\.html', re.I),
        # Whitespace goes last so gaps left by earlier steps collapse too.
        (r'\s+', 0),
    ]
    for pattern, flags in re_mate:
        res = re.sub(pattern, ' ', res, flags=flags)
    return res

def read_file(read_path):
    """Return the full contents of the UTF-8 text file at ``read_path``."""
    with open(read_path, mode='r', encoding='utf-8') as source:
        return source.read()


if __name__ == '__main__':
    # Runs only when the file is executed as a script: read 're.html'
    # from the working directory, pass it through filter_tags, and print
    # the result.  All three statements share one indentation level; the
    # mixed 3/4-space indent previously raised IndentationError.
    str_doc = read_file(r're.html')
    res = filter_tags(str_doc)
    print(res)


#   with open(r'../data/html/test.html', 'w', encoding='utf-8') as f:
#      f.write(res)
#   print('No Exception') # 我是通过另一个编辑器进行打开预览的

这是我做的笔记，和老师的有些不一样但是效果一样的

1赞 · 1采集

数据加载中...