# Sophie
#
# Sophie
#
# distrib > Mandriva > 2010.2 > i586 > media > contrib-release > by-pkgid > 6009aaa8e5ab2df861ebfa6faf6af1ce > files > 48
#
# python-parsing-1.5.2-2mdv2010.1.noarch.rpm

import urllib

from pyparsing import *

anchorStart,anchorEnd = makeHTMLTags("a")

# read HTML from a web page
serverListPage = urllib.urlopen( "http://www.yahoo.com" )
htmlText = serverListPage.read()
serverListPage.close()

anchor = anchorStart + SkipTo(anchorEnd).setResultsName("body") + anchorEnd


for tokens,start,end in anchor.scanString(htmlText):
    print tokens.body,'->',tokens.href