Digg is by far the most popular social news site on the internet. With its simple “thumbs up” system, the users of the site promote the most interesting, high-quality stories, and the best of those make it to the front page. What you end up with is a filtered view of the most interesting stuff.
It’s a great site and one that I visit every day.
I wanted to write a script that makes use of the search feature on Digg so that I could scrape out and re-purpose the best stuff to use elsewhere. The first step in writing that larger (top secret) program was a scraper for Digg search.
The short Python script I came up with returns the search results from Digg in a standard Python data structure, so it’s simple to use. It parses out the title, destination, comment count, Digg link, Digg count, and summary for the top 100 search results.
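For reference, each entry in the returned list is a plain dictionary along these lines (the values here are made up purely for illustration; note that digg_count comes back as a string because it’s taken straight from the page):

{'title': 'Example Story Title',
 'destination': 'http://example.com/some-story',
 'comment_count': 42,
 'digg_link': 'http://digg.com/example_story',
 'digg_count': '512',
 'summary': 'A short summary pulled from the search result...'}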
You can perform advanced searches on Digg by using a number of different flags (a couple of combined examples follow the list):
- +b Add to see buried stories
- +p Add to see only promoted stories
- +np Add to see only unpromoted stories
- +u Add to see only upcoming stories
- Put terms in “quotes” for an exact search
- -d Remove the domain from the search
- Add -term to exclude a term from your query (e.g. apple -iphone)
- Begin your query with site: to only display stories from that URL.
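These flags can be combined in the query string you pass to digg_search. A couple of illustrative (made up) queries:

digg_search('"social media" -twitter +u')      # exact phrase, exclude a term, upcoming only
digg_search('site:youtube.com apple -iphone')  # one site only, exclude a term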
This script also allows the search results to be sorted:
from DiggSearch import digg_search
digg_search('twitter', sort='newest')  # sort by newest first
digg_search('twitter', sort='digg')    # sort by number of diggs
digg_search('twitter -d')              # sort by best match
Here’s the Python code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# (C) 2009 HalOtis Marketing
# written by Matt Warren
# http://halotis.com/

import urllib, urllib2
import re

from BeautifulSoup import BeautifulSoup

USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'

def remove_extra_spaces(data):
    p = re.compile(r'\s+')
    return p.sub(' ', data)

def digg_search(query, sort=None, pages=10):
    """Returns a list of the information I need from a digg query
    sort can be one of [None, 'digg', 'newest']
    """
    digg_results = []
    for page in range(1, pages + 1):  # pages 1..pages, roughly 10 results per page
        # create the URL
        address = "http://digg.com/search?s=%s" % (urllib.quote_plus(query))
        if sort:
            address = address + '&sort=' + sort
        if page > 1:
            address = address + '&page=' + str(page)

        # GET the page
        request = urllib2.Request(address, None, {'User-Agent': USER_AGENT})
        urlfile = urllib2.urlopen(request)
        html = urlfile.read(200000)
        urlfile.close()

        # scrape it
        soup = BeautifulSoup(html)
        links = soup.findAll('h3', id=re.compile(r"title\d"))
        comments = soup.findAll('a', attrs={'class': 'tool comments'})
        diggs = soup.findAll('strong', id=re.compile(r"diggs-strong-\d"))
        body = soup.findAll('a', attrs={'class': 'body'})
        for i in range(len(links)):
            item = {'title': remove_extra_spaces(' '.join(links[i].findAll(text=True))).strip(),
                    'destination': links[i].find('a')['href'],
                    'comment_count': int(comments[i].string.split()[0]),
                    'digg_link': comments[i]['href'],
                    'digg_count': diggs[i].string,
                    'summary': body[i].find(text=True),
                    }
            digg_results.append(item)

        # last page early exit
        if len(links) < 10:
            break

    return digg_results

if __name__ == '__main__':
    # for testing
    results = digg_search('twitter -d', 'digg', 2)
    for r in results:
        print r
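As a quick usage sketch (not part of the original script), here’s one way you might work with the returned list, for example pulling out the ten most-dugg results:

from DiggSearch import digg_search

results = digg_search('twitter', sort='digg', pages=3)
# digg_count comes back as a string (e.g. '512'), so take the leading number before sorting
top = sorted(results, key=lambda r: int(r['digg_count'].split()[0]), reverse=True)[:10]
for r in top:
    print r['digg_count'], r['title']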
You can grab the source code from the Bitbucket repository.