Python Web Crawler Script

Here’s a simple web crawling script that starts from one URL and finds all the pages it links to, up to a predefined depth. Web crawling is of course the lowest-level tool used by Google to create its multi-billion dollar business. You may not be able to compete with Google’s search technology, but being able to crawl your own sites, or those of your competitors, can be very valuable.

You could, for instance, routinely check your websites to make sure that they are live and that all the links are working. It could notify you of any 404 errors. By adding in a page rank check you could identify better linking strategies to boost your page rank scores. And you could identify possible leaks – paths a user could take that take them away from where you want them to go.

Here’s the script:

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
from urllib2 import urlopen
from urlparse import urljoin
 
class Spider(HTMLParser):
    """Breadth-first link crawler built on HTMLParser.

    Starting from ``starting_url``, each call to :meth:`crawl` fetches the
    pages queued for the current depth level, collects the ``href`` of every
    ``<a>`` tag and queues the newly discovered links for the next level.

    Attributes:
        url: URL of the page currently being parsed (base for relative links).
        db: Maps each absolute link to the number of times it was counted.
        node: Links queued for the next depth level.
        depth: Number of depth levels to scan.
        max_span: Maximum number of new links collected per page.
        links_found: New links collected so far from the current page.
    """

    def __init__(self, starting_url, depth, max_span):
        """Set up the crawler.

        Args:
            starting_url: Absolute URL the crawl starts from.
            depth: Number of depth levels to scan.
            max_span: Maximum number of new links taken from a single page.
        """
        HTMLParser.__init__(self)
        self.url = starting_url
        self.db = {self.url: 1}
        self.node = [self.url]

        self.depth = depth  # recursion depth max
        self.max_span = max_span  # max links obtained per url
        self.links_found = 0

    def handle_starttag(self, tag, attrs):
        """Record the target of an ``<a href=...>`` tag.

        Called by HTMLParser for every start tag. Once ``max_span`` new links
        have been collected from the current page, further anchors on that
        page are ignored entirely (they are neither queued nor counted).
        """
        if self.links_found >= self.max_span or tag != 'a':
            return

        # Look the href up by name: it is not guaranteed to be the first
        # attribute, and an anchor may have no href (or an empty one) at all.
        href = next((value for name, value in attrs if name == 'href'), None)
        if not href or not href.strip():
            return

        # Resolve relative references against the page being parsed.
        link = urljoin(self.url, href.strip())
        # Only HTTP(S) targets can be crawled; drop mailto:, javascript:, etc.
        if not link.lower().startswith(('http://', 'https://')):
            return

        if link not in self.db:
            print("new link ---> %s" % link)
            self.links_found += 1
            self.node.append(link)
        self.db[link] = self.db.get(link, 0) + 1

    def crawl(self):
        """Scan ``depth`` levels of links and report how often each was seen.

        Pages that cannot be fetched or parsed are reported and skipped.

        Returns:
            A list of ``(count, link)`` tuples sorted in descending order.
        """
        for depth in range(self.depth):
            print("*" * 70 + ("\nScanning depth %d web\n" % (depth + 1)) + "*" * 70)
            pending = self.node[:]
            self.node = []
            for url in pending:
                # handle_starttag reads self.url to resolve relative links.
                self.url = url
                self.links_found = 0
                try:
                    response = urlopen(url)
                    try:
                        page = response.read()
                    finally:
                        response.close()
                    self.feed(page)
                except Exception as err:
                    # Best effort: one broken page must not abort the crawl.
                    print("error while crawling %s: %s" % (url, err))
                finally:
                    # Drop any unparsed leftovers so they cannot leak into
                    # the next page fed to the parser.
                    self.reset()
        print("*" * 40 + "\nRESULTS\n" + "*" * 40)
        return sorted(
            ((count, link) for link, count in self.db.items()),
            reverse=True,
        )
 
if __name__ == "__main__":
    # Demo run: crawl five levels deep, taking at most ten new links per page.
    spidey = Spider(starting_url='http://www.7cerebros.com.ar', depth=5, max_span=10)
    result = spidey.crawl()
    for (n, link) in result:
        # Compare by value: "is not" tests object identity, which is not a
        # reliable way to compare integers.
        print("%s was found %d time%s." % (link, n, "s" if n != 1 else ""))