Tag Archives: urllib2

I got an email the other day from Frank Kern, who was pimping another make-money-online product from his cousin Trey. The Number Effect is a DVD containing the results of an experiment where he created an affiliate link to every one of the 12,000 products for sale on ClickBank, sent paid (PPV) traffic to all of those links, and tracked which ones were profitable. Out of the 12,000, he found 54 niches with profitable campaigns.

Trey went on to talk about the software he had written for this experiment. Apparently it took his outsourced programmer a fair bit of work to get it going.

I thought it would be fun to try implementing the same script myself. It took about an hour to program the whole thing.

So if you want to create your own ClickBank affiliate link for every ClickBank product for sale, here’s a script that will do it. Keep in mind that I never did any work to make this thing fast, and it takes about 8 hours to scrape all 13,000 products, create the affiliate links, and resolve the URLs they point to. Sure, I could make it faster, but I’m lazy.

Here’s the Python script to do it:

#!/usr/bin/env python
# encoding: utf-8
"""
ClickBankMarketScrape.py
 
Created by Matt Warren on 2010-09-07.
Copyright (c) 2010 HalOtis.com. All rights reserved.
 
"""
 
 
 
import re
import urllib2

from BeautifulSoup import BeautifulSoup

CLICKBANK_URL = 'http://www.clickbank.com'
MARKETPLACE_URL = CLICKBANK_URL + '/marketplace.htm'
AFF_LINK_FORM = CLICKBANK_URL + '/info/jmap.htm'

AFFILIATE = 'mfwarren'
 
product_links = []
product_codes = []
pages_to_scrape = []
 
def get_category_urls():
	request = urllib2.Request(MARKETPLACE_URL, None)
	urlfile = urllib2.urlopen(request)
	page = urlfile.read()
	urlfile.close()
 
	soup = BeautifulSoup(page)
	parentCatLinks = [x['href'] for x in soup.findAll('a', {'class':'parentCatLink'})]
	return parentCatLinks
 
def get_products():
 
	fout = open('ClickBankLinks.csv', 'w')
 
	while len(pages_to_scrape) > 0:
 
		url = pages_to_scrape.pop()
		request = urllib2.Request(url, None)
		urlfile = urllib2.urlopen(request)
		page = urlfile.read()
		urlfile.close()
 
		soup = BeautifulSoup(page)
 
		results = [x.find('a') for x in soup.findAll('tr', {'class':'result'})]
 
		nextLink = soup.find('a', title='Next page')
		if nextLink:
			pages_to_scrape.append(nextLink['href'])
 
		for product in results:
			try:
				product_code = str(product).split('.')[1]
				product_codes.append(product_code)
				m = re.search('^<(.*)>(.*)<', str(product))
				title = m.group(2)
				my_link = get_hoplink(product_code)
				request = urllib2.Request(my_link)
				urlfile = urllib2.urlopen(request)
				display_url = urlfile.url
				#page = urlfile.read()  #continue here if you want to scrape keywords etc from landing page
 
				print my_link, display_url
				product_links.append({'code':product_code, 'aff_link':my_link, 'dest_url':display_url})
				fout.write(product_code + ', ' + my_link + ', ' + display_url + '\n')
				fout.flush()
			except:
				continue  # handle cases where destination url is offline
 
	fout.close()
 
def get_hoplink(vendor):
	request = urllib2.Request(AFF_LINK_FORM + '?affiliate=' + AFFILIATE + '&promocode=&submit=Create&vendor='+vendor+'&results=', None)
	urlfile = urllib2.urlopen(request)
	page = urlfile.read()
	urlfile.close()
	soup = BeautifulSoup(page)
	link = soup.findAll('input', {'class':'special'})[0]['value']
	return link
 
if __name__=='__main__':
	urls = get_category_urls()
	for url in urls:
		pages_to_scrape.append(CLICKBANK_URL+url)
	get_products()
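Since the slow part is resolving each hoplink one at a time, one way to speed it up would be to resolve the destination URLs concurrently with a thread pool. Here's a rough sketch of that idea; the resolve() and resolve_all() helpers and the pool size are my own assumptions and not part of the original script:

import urllib2
from multiprocessing.dummy import Pool  # thread pool from the standard library

def resolve(hoplink):
    """Follow a hoplink redirect and return (hoplink, destination url), or None on error."""
    try:
        urlfile = urllib2.urlopen(hoplink)
        return hoplink, urlfile.url
    except Exception:
        return None  # destination offline or unreachable

def resolve_all(hoplinks, workers=10):
    """Resolve a list of hoplinks (e.g. built with get_hoplink) using worker threads."""
    pool = Pool(workers)
    try:
        return [r for r in pool.map(resolve, hoplinks) if r is not None]
    finally:
        pool.close()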

Here’s a simple web crawling script that will start from one URL and find all the pages it links to, up to a pre-defined depth. Web crawling is of course the lowest-level tool used by Google to build its multi-billion dollar business. You may not be able to compete with Google’s search technology, but being able to crawl your own sites, or those of your competitors, can be very valuable.

You could, for instance, routinely check your websites to make sure they are live and all the links are working; the crawler could notify you of any 404 errors (there’s a small sketch of that after the script below). By adding in a PageRank check you could identify better linking strategies to boost your PageRank scores. And you could identify possible leaks: paths a user could take that lead them away from where you want them to go.

Here’s the script:

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
from urllib2 import urlopen
 
class Spider(HTMLParser):
    def __init__(self, starting_url, depth, max_span):
        HTMLParser.__init__(self)
        self.url = starting_url
        self.db = {self.url: 1}
        self.node = [self.url]
 
        self.depth = depth # recursion depth max
        self.max_span = max_span # max links obtained per url
        self.links_found = 0
 
    def handle_starttag(self, tag, attrs):
        if self.links_found < self.max_span and tag == 'a' and attrs:
            # look up the href explicitly instead of assuming it is the first attribute
            link = dict(attrs).get('href')
            if not link:
                return
            if link[:4] != "http":
                # resolve a relative link against the current url's scheme and host
                link = '/'.join(self.url.split('/')[:3])+('/'+link).replace('//','/')
 
            if link not in self.db:
                print "new link ---> %s" % link
                self.links_found += 1
                self.node.append(link)
            self.db[link] = (self.db.get(link) or 0) + 1
 
    def crawl(self):
        for depth in xrange(self.depth):
            print "*"*70+("\nScanning depth %d web\n" % (depth+1))+"*"*70
            context_node = self.node[:]
            self.node = []
            for self.url in context_node:
                self.links_found = 0
                try:
                    req = urlopen(self.url)
                    res = req.read()
                    self.feed(res)
                except:
                    self.reset()
        print "*"*40 + "\nRESULTS\n" + "*"*40
        zorted = [(v,k) for (k,v) in self.db.items()]
        zorted.sort(reverse = True)
        return zorted
 
if __name__ == "__main__":
    spidey = Spider(starting_url = 'http://www.7cerebros.com.ar', depth = 5, max_span = 10)
    result = spidey.crawl()
    for (n,link) in result:
        print "%s was found %d time%s." %(link,n, "s" if n is not 1 else "")

In yet another of my series of web scrapers, this time I’m posting some code that will scrape links from delicious.com. This is a pretty cool way of finding links that other people have found relevant, and it could be used to generate useful content for your visitors.

You could easily add this to a WordPress blogging robot script so that the newest links are posted in a weekly digest post (there’s a small sketch of that after the script below). This type of promotion will get noticed by the people you link to and spreads some of that link love. It will hopefully result in some reciprocal links for your site.

Another idea would be to create a link directory and seed it with links gathered from delicious. Or you could create a widget of the hottest links in your niche that automatically gets updated.

This script makes use of the BeautifulSoup library for parsing the HTML pages.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# (C) 2009 HalOtis Marketing
# written by Matt Warren
# http://halotis.com/
"""
Scraper for Del.icio.us SERP.
 
This pulls the results for a match for a query on http://del.icio.us.
"""
 
import urllib2
import re
 
from BeautifulSoup import BeautifulSoup
 
def get_delicious_results(query, page_limit=10):
 
    page = 1
    links = []
 
    while page < page_limit:
        url = 'http://delicious.com/search?p=' + '%20'.join(query.split()) + '&context=all&lc=1&page=' + str(page)
        req = urllib2.Request(url)
        HTML = urllib2.urlopen(req).read()
        soup = BeautifulSoup(HTML)
 
        next = soup.find('a', attrs={'class':re.compile('.*next$', re.I)})
 
        #links is a list of (url, title) tuples
        links +=   [(link['href'], ''.join(link.findAll(text=True)) ) for link in soup.findAll('a', attrs={'class':re.compile('.*taggedlink.*', re.I)}) ]
 
        if next :
            page = page+1
        else :
            break
 
    return links
 
if __name__=='__main__':
    links = get_delicious_results('halotis marketing')
    print links
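Following up on the weekly digest idea above, here's a small sketch that formats the (url, title) tuples returned by get_delicious_results() as an HTML list ready to paste into a post. The make_digest() helper is my own example, not part of the scraper:

import cgi

def make_digest(links):
    """Turn a list of (url, title) tuples into a simple HTML unordered list."""
    items = ['<li><a href="%s">%s</a></li>'
             % (cgi.escape(url, quote=True), cgi.escape(title))
             for url, title in links]
    return '<ul>\n' + '\n'.join(items) + '\n</ul>'

# Example usage (in the same file as get_delicious_results):
# links = get_delicious_results('halotis marketing')
# print make_digest(links)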

Sometimes it’s useful to know where all the back-links to a website are coming from.

It can give you information about how your competitors are promoting their sites. By finding out who is linking to your competitors, you can shortcut the process of finding good places to get links from, and discover who might be a client or a good contact for your business.

If you’re buying or selling a website, the number and quality of back-links help determine its value. Checking the links to a site should be on the checklist you use when buying a website.

With that in mind I wrote a short script that scrapes the links to a particular domain from the list that Alexa provides.

import urllib2
 
from BeautifulSoup import BeautifulSoup
 
def get_alexa_linksin(domain):
 
    page = 0
    linksin = []
 
    while True :
        url='http://www.alexa.com/site/linksin;'+str(page)+'/'+domain
        req = urllib2.Request(url)
        HTML = urllib2.urlopen(req).read()
        soup = BeautifulSoup(HTML)
 
        next = soup.find(id='linksin').find('a', attrs={'class':'next'})
 
        linksin += [(link['href'], link.string) for link in soup.find(id='linksin').findAll('a')]
 
        if next:
            page = page + 1
        else:
            break
 
    return linksin
 
if __name__=='__main__':
    linksin = get_alexa_linksin('halotis.com')
    print linksin
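As a quick example of the competitor research angle, here's a sketch that compares the back-links Alexa lists for your domain against a competitor's and returns the sites that link to them but not to you. The linking_gap() helper and the competitor domain are my own placeholders:

def linking_gap(my_domain, competitor_domain):
    """Return urls Alexa lists as linking to the competitor but not to my_domain."""
    mine = set(url for url, title in get_alexa_linksin(my_domain))
    theirs = set(url for url, title in get_alexa_linksin(competitor_domain))
    return sorted(theirs - mine)

# Example usage (in the same file as get_alexa_linksin):
# for url in linking_gap('halotis.com', 'competitor-example.com'):
#     print url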