python网络编程基础,第四版
pycharm实现,python版本2.7.5
二、解析html和xhtml
第七章 解析Html 和XHtml p151-p168
1.提取标题
代码:
#coding=utf-8
from HTMLParser import HTMLParser
import sys
class TitleParser(HTMLParser):
    """Collect the text that appears inside the document's <title> element.

    This first version overrides neither handle_entityref() nor
    handle_charref(), so entity and character references are left to the
    base parser's default handling.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Title text gathered so far; handle_data() appends to it.
        self.title = ''
        # True only while the parser is between <title> and </title>.
        self.readingtitle = False

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a <title> tag opens."""
        if tag == 'title':
            self.readingtitle = True

    def handle_data(self, data):
        """Append *data* to the title while inside <title>."""
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop collecting text when </title> is reached."""
        if tag == 'title':
            self.readingtitle = False

    def gettitle(self):
        """Return the title text collected so far ('' if none was seen)."""
        return self.title
# Parse the HTML file named by the first command-line argument and print
# its title.  The with-block closes the file as soon as it has been read.
with open(sys.argv[1]) as fd:
    tp = TitleParser()
    tp.feed(fd.read())
print("Title is: %s" % tp.gettitle())
运行结果:
D:\python\python.exe E:/code/python/unit7/basic_title.py
E:/code/python/unit7/faqs.html
Title is: Appendix?B. MySQL 5.6 Frequently Asked Questions
Process finished with exit code 0
注:从表中摘取数据,<TR>或<TD>
2.改进
代码:
#coding=utf-8
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
import sys
class TitleParser(HTMLParser):
    """Collect the <title> text, expanding named entity references."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Title text gathered so far; handle_data() appends to it.
        self.title = ''
        # True only while the parser is between <title> and </title>.
        self.readingtitle = False

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a <title> tag opens."""
        if tag == 'title':
            self.readingtitle = True

    def handle_data(self, data):
        """Append *data* to the title while inside <title>."""
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop collecting text when </title> is reached."""
        if tag == 'title':
            self.readingtitle = False

    def handle_entityref(self, name):
        """Feed the replacement text for the entity &name; to handle_data().

        Known entities are looked up in entitydefs; unknown ones are passed
        through literally as '&name;'.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def gettitle(self):
        """Return the title text collected so far ('' if none was seen)."""
        return self.title
# Parse the HTML file named by the first command-line argument and print
# its title.  The with-block closes the file as soon as it has been read.
with open(sys.argv[1]) as fd:
    tp = TitleParser()
    tp.feed(fd.read())
print("Title is: %s" % tp.gettitle())
etitle.html
<!DOCTYPE html>
<html >
<head>
<title>Document Title &Intro</title>
</head>
<body>
this is my text.
</body>
</html>
运行结果一:
D:\python\python.exe E:/code/python/unit7/basic_title.py
E:/code/python/unit7/etitle.html
Title is: Document Title Intro
Process finished with exit code 0
运行结果二:
D:\python\python.exe E:/code/python/unit7/etitle.py
E:/code/python/unit7/etitle.html
Title is: Document Title &Intro
Process finished with exit code 0
当一个实体出现时,代码检查该实体是否可以识别:可以识别,则转换为相应的值;否则原样使用输入流中的文字。
3.转换字符参考
代码:
#coding=utf-8
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
import sys
class TitleParser(HTMLParser):
    """Collect the <title> text, expanding entity and character references."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Title text gathered so far; handle_data() appends to it.
        self.title = ''
        # True only while the parser is between <title> and </title>.
        self.readingtitle = False

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a <title> tag opens."""
        if tag == 'title':
            self.readingtitle = True

    def handle_data(self, data):
        """Append *data* to the title while inside <title>."""
        if self.readingtitle:
            self.title += data

    def handle_endtag(self, tag):
        """Stop collecting text when </title> is reached."""
        if tag == 'title':
            self.readingtitle = False

    def handle_entityref(self, name):
        """Feed the replacement text for the entity &name; to handle_data().

        Known entities are looked up in entitydefs; unknown ones are passed
        through literally as '&name;'.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Convert a numeric character reference to its character.

        *name* is the text between '&#' and ';': decimal digits, or
        'x'/'X' followed by hexadecimal digits.  References that do not
        parse, or whose value lies outside 1..255, are dropped.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        # Upper bound is 255 (the largest single-byte value), not 225.
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))

    def gettitle(self):
        """Return the title text collected so far ('' if none was seen)."""
        return self.title
# Parse the HTML file named by the first command-line argument and print
# its title.  The with-block closes the file as soon as it has been read.
with open(sys.argv[1]) as fd:
    tp = TitleParser()
    tp.feed(fd.read())
print("Title is: %s" % tp.gettitle())
4.处理不均衡的标签
代码:
#coding=utf-8
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
import sys,re
class TitleParser(HTMLParser):
    """Report the title and list items of a document with unbalanced tags.

    A stack of open tags (taglevels) is kept so that a missing end tag can
    be inferred: opening the same tag again, or closing an outer tag,
    implicitly closes whatever is still open inside it.  Results are
    printed to stdout as they are completed.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Stack of currently open tags, innermost last.
        self.taglevels = []
        # Only these tags have their text collected and reported.
        self.handledtags = ['title', 'ul', 'li']
        # Tag whose text is currently being collected, or None.
        self.processing = None
        # Text collected for the tag in self.processing.
        self.data = ''
        # Last completed <title> text, returned by gettitle().
        self.title = ''

    def handle_starttag(self, tag, attrs):
        """Push *tag*, first closing an identical tag that is still open."""
        if self.taglevels and self.taglevels[-1] == tag:
            # e.g. <li> directly after an unclosed <li>: close the first.
            self.handle_endtag(tag)
        self.taglevels.append(tag)
        if tag in self.handledtags:
            self.data = ''
            self.processing = tag
            if tag == 'ul':
                print("List start")

    def handle_data(self, data):
        """Collect text while a handled tag is being processed."""
        if self.processing:
            self.data += data

    def handle_endtag(self, tag):
        """Pop the stack down to *tag*, finishing every handled tag popped.

        An end tag with no matching open tag is ignored.
        """
        if tag not in self.taglevels:
            return
        while self.taglevels:
            starttag = self.taglevels.pop()
            if starttag in self.handledtags:
                self.finishprocessing(starttag)
            if starttag == tag:
                break

    def cleanse(self):
        """Collapse every run of whitespace in the collected text to one space."""
        self.data = re.sub(r'\s+', ' ', self.data)

    def finishprocessing(self, tag):
        """Print the result for a handled tag that has just been closed."""
        self.cleanse()
        if tag == 'title' and tag == self.processing:
            # Remember the title so gettitle() has something to return.
            self.title = self.data
            print("Dom title %s" % self.data)
        elif tag == 'ul':
            print("List ended")
        elif tag == 'li' and tag == self.processing:
            print("List item %s" % self.data)
        self.processing = None

    def gettitle(self):
        """Return the last completed title text ('' if none was seen)."""
        return self.title

    # Special values: use the mapped value when the entity table has an
    # entry for the name, otherwise fall back to the literal text.
    def handle_entityref(self, name):
        """Feed the replacement text for the entity &name; to handle_data()."""
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Convert a numeric character reference to its character.

        *name* is decimal digits, or 'x'/'X' followed by hexadecimal
        digits.  References that do not parse, or whose value lies outside
        1..255, are dropped.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))
# Parse the HTML file named by the first command-line argument; the parser
# prints its findings while feeding.  The with-block closes the file.
with open(sys.argv[1]) as fd:
    tp = TitleParser()
    tp.feed(fd.read())
运行结果:
D:\python\python.exe E:/code/python/unit7/4un.py
E:/code/python/unit7/4un.html
Dom title DOCTYPE Title & Intro?
List start
List item First List item
List item second list item
List item second list item
List ended
Process finished with exit code 0
5.一个可以实际工作的例子
三、XML和XML-RPC p169-p190
表示XML文档有两种方式:树(tree)和事件(event)。基于事件的解析器在扫描文档的过程中,每遇到一个元素就产生一个事件,由程序对事件作出响应;基于树的方式则把整个文档解析成一棵节点树。
8.2 使用Dom
代码:
#coding=utf-8
from xml.dom import minidom,Node
def scanNode(node, level=0):
    """Print the class name of *node* and, recursively, of all descendants.

    node:  a DOM node (document, element, text, ...).
    level: nesting depth of *node*; each level adds four indent units to
           the printed line.

    Element nodes additionally show their tag name.
    """
    msg = node.__class__.__name__
    if node.nodeType == Node.ELEMENT_NODE:
        msg += ",tag" + node.tagName
    print("%s %s" % (" " * level * 4, msg))
    # hasChildNodes is a method; it has to be called to get a real answer.
    if node.hasChildNodes():
        for child in node.childNodes:
            scanNode(child, level + 1)
# Load the sample document from the working directory and dump its node
# tree, starting at nesting level 0.
doc = minidom.parse("Sample.xml")
scanNode(doc, 0)
运行结果:
D:\python\python.exe E:/code/python/unit8/un1.py
Document
> Element,tagbook
> Text
> Element,tagtitle
> Text
> Text
> Element,tagauthor
> Text
> Element,tagname
> Text
> Element,tagfirst
> Text
> Text
> Element,taglast
> Text
> Text
> Text
> Element,tagaffiliation
> Text
> Text
> Text
> Element,tagchapter
> Text
> Element,tagtitle
> Text
> Text
> Element,tagpara
> Text
> Element,tagcompany
> Text
> Text
> Text
> Text
Process finished with exit code 0
sample.xml
<?xml version="1.0" encoding="UTF-8"?>
<book>
<title> Sample XML Thing </title>
<author>
<name>
<first>Benjamin</first>
<last>Smith</last>
</name>
<affiliation>Springy Widgets,Inc.</affiliation>
</author>
<chapter number = "1">
<title>First chapter</title>
<para>
I think widgets are great.you should buy lots
of them from <company>Springy widgets,Inc</company>
</para>
</chapter>
</book>
2.使用dom完全解析
代码:
#coding=utf-8
"""
将XML以文本形式重新格式化输出
1.使用Node的节点类型,判断下一步如何处理
2.对不同的节点名(tagName)进行相应的处理
"""
from xml.dom import minidom, Node
import re, textwrap
class SampleScanner:
    """Print a plain-text report of a <book> document held in a DOM tree.

    Constructing the scanner walks the document immediately and prints the
    book title, the author details and every chapter to stdout.
    """

    def __init__(self, doc):
        """Scan *doc* (a parsed DOM document) and report each <book> element."""
        for child in doc.childNodes:
            # Only <book> elements are processed.
            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "book":
                self.handleBook(child)

    def gettext(self, nodelist):
        """Return the text contained in *nodelist*, whitespace-normalised.

        Text nodes contribute their own data; any other node that has
        children is searched recursively.  Every run of whitespace in the
        result is collapsed to a single space.
        """
        retlist = []
        for node in nodelist:
            if node.nodeType == Node.TEXT_NODE:
                # Use this node's own data: wholeText also includes the
                # text of adjacent text nodes, which this loop visits
                # anyway and would therefore add twice.
                retlist.append(node.data)
            elif node.hasChildNodes():
                retlist.append(self.gettext(node.childNodes))
        return re.sub(r"\s+", " ", "".join(retlist))

    def handleBook(self, node):
        """Report the children of a <book> element.

        Non-element children are skipped.  <title> is printed directly;
        <author> and <chapter> are delegated to their handlers.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "title":
                print("Book title is : %s" % self.gettext(child.childNodes))
            elif child.tagName == "author":
                self.handleAuthor(child)
            elif child.tagName == "chapter":
                self.handleChapter(child)

    def handleAuthor(self, node):
        """Report the children of an <author> element.

        Non-element children are skipped.  <name> is delegated to
        handleAuthorName; the text of <affiliation> is printed.
        """
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "name":
                self.handleAuthorName(child)
            elif child.tagName == "affiliation":
                print("Author affiliation: %s" % self.gettext([child]))

    def handleAuthorName(self, node):
        """Print the author's name, surname first.

        The <last> and <first> descendants are located with
        getElementsByTagName and their text is extracted with gettext.
        """
        surname = self.gettext(node.getElementsByTagName("last"))
        givenname = self.gettext(node.getElementsByTagName("first"))
        print("Author Name:%s %s " % (surname, givenname))

    def handleChapter(self, node):
        """Print a chapter heading, then each <para> child of the chapter.

        The heading shows the chapter's "number" attribute and the text of
        its <title> descendants.  Non-element children are skipped.
        """
        print("*** Start of Chapter %s,%s" % (
            node.getAttribute("number"),
            self.gettext(node.getElementsByTagName("title"))))
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "para":
                self.handlePara(child)

    def handlePara(self, node):
        """Print the text of a <para> element, re-wrapped by textwrap.fill."""
        paratext = self.gettext([node])
        paratext = textwrap.fill(paratext)
        print(paratext)
# Parse the sample file from the working directory; constructing the
# scanner walks the document and prints the report.
doc = minidom.parse("Sample.xml")
SampleScanner(doc)
运行结果:
D:\python\python.exe E:/code/python/unit8/un2.py
Book title is : Sample XML Thing
Author Name:Smith Benjamin
Author affiliation: Springy Widgets,Inc.
*** Start of Chapter 1,First chapter
I think widgets are great.you should buy lots of them from Springy
widgets,Inc
Process finished with exit code 0
3.使用Dom产生文档
代码:
#coding=utf-8
"""
使用minidom生成XML
1.创建Element,createElement
2.添加子节点,appendChild
3.创建Text,createTextNode
4.创建属性,createAttribute
"""
from xml.dom import minidom,Node
# Build the sample document: each element is created, attached to its
# parent (appendChild returns the node it attached), and then filled in.
doc = minidom.Document()

# Root element <book>.
book = doc.appendChild(doc.createElement("book"))

# <book>/<title>
title = book.appendChild(doc.createElement("title"))
text = title.appendChild(doc.createTextNode("Sample XML Thing"))

# <book>/<author>/<name> holding <first> and <last>, in that order.
author = book.appendChild(doc.createElement("author"))
name = author.appendChild(doc.createElement("name"))
first = name.appendChild(doc.createElement("first"))
first.appendChild(doc.createTextNode("Benjamin"))
last = name.appendChild(doc.createElement("last"))
last.appendChild(doc.createTextNode("Smith"))

# <book>/<chapter number="1"> with its own <title> and one <para>.
chapter = book.appendChild(doc.createElement("chapter"))
chapter.setAttribute("number", "1")
title = chapter.appendChild(doc.createElement("title"))
title.appendChild(doc.createTextNode("Fisrt Chapter"))

# The paragraph is mixed content: leading text followed by <company>.
para = chapter.appendChild(doc.createElement("para"))
para.appendChild(doc.createTextNode(
    "I think widgets are great.you should buy lots "
    "of them from"))
company = para.appendChild(doc.createElement("company"))
company.appendChild(doc.createTextNode("Springy widgets,Inc"))

# Serialise the finished tree with pretty-printing.
print(doc.toprettyxml(indent=" "))
运行结果:
D:\python\python.exe E:/code/python/unit8/un3.py
<?xml version="1.0" ?> <book> <title>Sample XML Thing</title> <author> <name> <first>Benjamin</first> <last>Smith</last> </name> </author> <chapter number="1"> <title>Fisrt Chapter</title> <para> I think widgets are great.you should buy lots of them from <company>Springy widgets,Inc</company> </para> </chapter> </book>
Process finished with exit code 0
4.dom类型参考
8.3使用xml-rpc
5.
代码:
#coding=utf-8
import xmlrpclib
# NOTE(review): this URL looks like a blog page rather than an XML-RPC
# endpoint; the NewsSource example later in these notes talks to
# http://www.oreillynet.com/meerkat/xml-rpc/server.php -- confirm which
# server is intended before relying on this listing.
url = 'http://liandesinian.blog.51cto.com/7737219/1565474'
s = xmlrpclib.ServerProxy(url)
# Remote call: fetch the category records from the server.
catdata = s.meerkat.getCategories()
# Print the category titles in sorted order, one per line.
cattiles = sorted(item['title'] for item in catdata)
for item in cattiles:
    print(item)
运行结果:
D:\python\python.exe E:/code/python/unit8/un6.py
Process finished with exit code 0
6.
代码:
#coding=utf-8
import xmlrpclib,sys,textwrap
class NewsCat:
    """One news category record, ordered by its title."""

    def __init__(self, catdata):
        """Copy the 'id' and 'title' entries out of the mapping *catdata*."""
        self.id = catdata['id']
        self.title = catdata['title']

    def __cmp__(self, other):
        """Three-way comparison on title: returns -1, 0 or 1.

        Written with plain comparison operators so it does not rely on the
        cmp() builtin.
        """
        return (self.title > other.title) - (self.title < other.title)
class NewsSource:
    """Interactive text-mode browser for a Meerkat XML-RPC news server."""

    def __init__(self, url='http://www.oreillynet.com/meerkat/xml-rpc/server.php'):
        """Connect to the XML-RPC server at *url* and load its categories."""
        self.s = xmlrpclib.ServerProxy(url)
        self.loadcats()

    def loadcats(self):
        """Fetch the category list from the server into self.cats, sorted."""
        print("Loading categories....")
        catdata = self.s.meerkat.getCategories()
        self.cats = [NewsCat(item) for item in catdata]
        self.cats.sort()

    def displaycats(self):
        """Write the numbered category titles to stdout, three per row."""
        count = 0
        for count, item in enumerate(self.cats, 1):
            sys.stdout.write("%2d:%20.20s" % (count, item.title))
            if count % 3 == 0:
                sys.stdout.write("\n")
        # Terminate a partially filled last row; a full row already ended
        # with its own newline.
        if count % 3 != 0:
            sys.stdout.write("\n")

    def promotcat(self):
        """Show the categories and read the user's choice from stdin.

        Returns the zero-based index of the chosen category.  Entering 'q'
        exits the program; non-numeric input raises ValueError.
        """
        self.displaycats()
        sys.stdout.write("select a catgory or q to quit")
        selection = sys.stdin.readline().strip()
        if selection == 'q':
            sys.exit(0)
        return int(selection) - 1

    def dispact(self, cat):
        """Fetch up to 15 items for category *cat* and let the user browse.

        NOTE(review): *cat* is sent to the server unchanged as the
        'category' value, while promotcat() returns a list index rather
        than a NewsCat.id -- confirm which one the server expects.
        """
        items = self.s.meerkat.getItems({'category': cat,
                                         'ids': 1,
                                         'descriptions': 1,
                                         'categories': 1,
                                         'channels': 1,
                                         # Request flag for item dates;
                                         # dispitem() reads item['date'].
                                         'dates': 1,
                                         'num_items': 15})
        if not items:
            print("Sorry,no items in that category.")
            sys.stdout.write("Press Enter to continue:")
            sys.stdin.readline()
            return
        while True:
            self.dispitemsummary(items)
            sys.stdout.write("select a catgory or q to quit")
            selection = sys.stdin.readline().strip()
            if selection == 'q':
                return
            self.dispitem(items[int(selection) - 1])

    def dispitemsummary(self, items):
        """Print one numbered line (starting at 1) with each item's title."""
        for number, item in enumerate(items, 1):
            print("%2d:%s" % (number, item['title']))

    def dispitem(self, item):
        """Print the details of one item, then wait for the user to press Enter."""
        print("---%s---" % (item['title'],))
        print("Posted on %s" % (item['date'],))
        print("Description:")
        print(textwrap.fill(item['description']))
        print("\nlink: %s" % (item['link'],))
        sys.stdout.write("\nPress Enter to continue: ")
        sys.stdin.readline()
# Main loop: ask for a category, browse it, repeat.  The loop only ends
# when promotcat() exits the program on 'q'.
n = NewsSource()
while True:
    cat = n.promotcat()
    n.dispact(cat)