[python-chinese] a wrapper for BeautifulSoup
cry
zyqmail在tom.com
星期五 九月 1 17:44:56 HKT 2006
python,您好!
作了一个小玩意,BeautifulSoup的wrapper。呵呵,学习加玩乐。如果谁知道有人已经做过了,请告诉我。谢谢。
可以让你按照层次取得想要的东西。
比如:
soup = mysoup.MySoup(doc)
# soup里只包含了<body>数据(其它的从BeautifulSoup可以轻易得到)。
# soup里有一个Body成员,一切都从这里开始。(里面所有的成员都以大写字母开头,且只包含Tag)
text1 = soup.Body.Table[1].Tr[1].Td[2].content.string
text2 = soup.Body.Table[1].Tr[1].Td[3].content.contents[0].string
#其中,content是BeautifulSoup的Tag.
mysoup.py : 这个小玩意
mysoup_test.py : 一个应用的例子,取在线字典的数据,使用:python mysoup_test.py 要查的词
root$python mysoup_test.py python
python:
n. 丹舌,大蟒,巨蟒
mysoup.py ---------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
MySoup : a wrapper of BeautifulSoup
Author : Robin Zhang
Date : 2006/9/1
Example:
soup = MySoup(''.join(s))
text1 = soup.Body.Table[1].Tr[1].Td[0].Table[0].Tr[1].Td[0].Table[0].Tr[0].Td[0].Big[0].B[0].Font[0].content.string
text2 = soup.Body.Table[1].Tr[1].Td[0].Table[0].Tr[1].Td[0].Table[0].Tr[0].Td[0].Big[1].Font[0].content.contents[0].string
text3 = soup.Body.Table[1].Tr[1].Td[0].Table[0].Tr[1].Td[0].Table[0].Tr[0].Td[0].Big[1].Font[0].content.contents[2].string
"""
from BeautifulSoup import BeautifulSoup
class LayerTag:
    """One node of an attribute-navigable tree of wrapped tags.

    The wrapped object is kept in ``content``. Children are exposed as
    list attributes named after the capitalized child tag name, so the
    second ``td`` child of a node is reached as ``node.Td[1]``.
    """

    def __init__(self, Tag):
        """Wrap *Tag* and keep it available as ``self.content``."""
        self.content = Tag

    def AddSubTag(self, SubTagName, SubLayerTag):
        """Append *SubLayerTag* to the child list for *SubTagName*.

        The list is stored in the attribute ``SubTagName.capitalize()``
        and is created the first time a child with that name is added.
        """
        attr_name = SubTagName.capitalize()
        children = getattr(self, attr_name, None)
        # Identity test: an existing (possibly empty) list must be reused,
        # only a genuinely missing attribute triggers creation.
        if children is None:
            children = []
            setattr(self, attr_name, children)
        children.append(SubLayerTag)
class MySoup(BeautifulSoup):
    """BeautifulSoup subclass that also exposes its tags as a LayerTag tree.

    After construction ``self.Body`` is the LayerTag built from the
    document's ``<body>``. Nested tags are reached through capitalized
    list attributes, e.g. ``soup.Body.Table[1].Tr[0].Td[2].content``.
    """

    def __init__(self, *args, **kwargs):
        """Parse the document, then build ``self.Body``.

        All arguments are passed unchanged to ``BeautifulSoup.__init__``.
        """
        BeautifulSoup.__init__(self, *args, **kwargs)
        # BeautifulSoup attribute navigation yields None for a missing tag
        # (per the Beautiful Soup 3 docs), so ``self.html.body`` would raise
        # AttributeError on a fragment without <html>. Fall back to the
        # document root in that case so fragments can still be navigated.
        html = self.html
        body = None
        if html is not None:
            body = html.body
        if body is None:
            body = self
        self.Body = self._create_tag_layer(body)

    def _create_tag_layer(self, Tag):
        """Return a LayerTag for *Tag* with all nested tags attached.

        Walks ``Tag.contents`` recursively. Only children whose class is
        named ``Tag`` are wrapped; every other kind of child is skipped.
        """
        layertag = LayerTag(Tag)
        for child in Tag.contents:
            # Class-name comparison kept from the original: it selects
            # plain Tag nodes without needing Tag imported in this module.
            if child.__class__.__name__ == "Tag":
                layertag.AddSubTag(child.name, self._create_tag_layer(child))
        return layertag
mysoup_test.py ---------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
MySoup_test : test MySoup
Author : Robin Zhang
Date : 2006/9/1
"""
import mysoup
import urllib2,sys
if __name__ == '__main__':
    # This example visits an online dictionary site to look up a word.
    # print(...) with a single argument and sys.exc_info() are used so the
    # block parses identically on every Python version.
    import urllib  # local import: only needed to escape the query word

    url = "http://sh.dict.cn/search/?q="
    if len(sys.argv) < 2:
        print("Usage : mysoup new_word")
        sys.exit()
    try:
        # Escape the word so spaces and non-ASCII bytes form a valid URL.
        page = urllib2.urlopen("%s%s" % (url, urllib.quote(sys.argv[1])))
        try:
            # The result cell sits deep inside nested layout tables.
            soup = mysoup.MySoup(page).Body.Table[1].Tr[1].Td[0].Table[0].Tr[1].Td[0].Table[0].Tr[0].Td[0]
        finally:
            # The document is fully parsed (or parsing failed): release
            # the HTTP connection either way.
            page.close()
        # First <big> holds the headword.
        print(soup.Big[0].B[0].Font[0].content.string.strip("\r\n"))
        # Second <big> holds the definitions; print only its non-Tag
        # children and skip the Tag children between them.
        for item in soup.Big[1].Font[0].content.contents:
            if item.__class__.__name__ != "Tag":
                print(item.string.strip("\r\n"))
    except Exception:
        # Top-level boundary of a demo script: report and exit normally.
        msg = sys.exc_info()[1]
        print("Error: %s" % msg)
---------------------------------------------------------------------------------
致
礼!
cry
zyqmail在tom.com
关于邮件列表 python-chinese 的更多信息