Thursday, June 9, 2011

Crawling the web with SGMLParser

In this example we will use SGMLParser in order to build a simple web crawler.
import urllib
from random import choice
from sgmllib import SGMLParser

class LinkExplorer(SGMLParser): 
 def reset(self):                              
  SGMLParser.reset(self) 
  self.links = [] # list with the urls

 def start_a(self, attrs):
  """ fill the links with the links in the page """
  for k in attrs:
   if k[0] == 'href' and k[1].startswith('http'): 
    self.links.append(k[1])

def explore(parser,s_url,maxvisit=10,iter=0):
 """ pick a random link in the page s_url
     and follow its links recursively """
 if iter < maxvisit: # it will stop after maxvisit iteration
  print '(',iter,') I am in',s_url
  usock = urllib.urlopen(s_url) # download the page
  parser.reset() # reset the list
  parser.feed(usock.read()) # parse the current page
  if len(parser.links) > 0:
   explore(parser,choice(parser.links),maxvisit,iter+1)
  else: # if the page has no links to follow
   print 'the page has no links'

# test the crawler starting from the python's website
parser = LinkExplorer()
explore(parser,"http://www.python.org/")
Let's go!
( 0 ) I am in http://www.python.org/
( 1 ) I am in http://wiki.python.org/moin/NumericAndScientific
( 2 ) I am in http://numpy.scipy.org/
( 3 ) I am in http://sphinx.pocoo.org/
( 4 ) I am in http://www.bitbucket.org/birkenfeld/sphinx/issues/
( 5 ) I am in http://blog.bitbucket.org
( 6 ) I am in http://haproxy.1wt.eu/
( 7 ) I am in http://www.olivepeak.com/blog/posts/read/free-your-port-80-with-haproxy
( 8 ) I am in http://www.olivepeak.com/peaknotes/
( 9 ) I am in http://notes.olivepeak.com/account/create

2 comments:

  1. SGMLlib is now deprecated and has been outright removed from Python 3. :(

    ReplyDelete