EDIT: Latest version of this script is available on my Purdue CS webspace.
There! Code that is free from repetition. I do suppose that well-written code has its uses. It looks beautiful :). It accepts command-line arguments too. All you need to do is install the BeautifulSoup module for Python and type "python getstuff.py [extension]" (if you save this file as getstuff.py on your computer). That means if you want to download all the mp3 files on a webpage, type "python getstuff.py .mp3" and sit back and relax :).
#newproj.py
#Author: Shriphani Palakodety (PSP)
#Usage: python getstuff.py [extension]
import sys
import urllib
from urlparse import urljoin, urlparse, urlunparse

from BeautifulSoup import BeautifulSoup
def linksFromSoup(soup, extension, base_url=None):
    """Iterate over the links on a parsed page whose filename has *extension*.

    Args:
        soup: Parsed page; must provide ``findAll('a')`` returning tags that
            support ``tag['href']`` (raising KeyError when absent).
        extension: Filename suffix to keep, e.g. ``".mp3"``.
        base_url: URL of the page the soup came from, used to resolve
            relative links. When omitted, the module-level ``url`` set by
            the script entry point is used, so existing two-argument calls
            keep working.

    Yields:
        ``(absolute_link, filename)`` tuples, one per matching anchor.
    """
    if base_url is None:
        # Backward-compatible fallback: the script entry point stores the
        # page address in the module-level name `url`.
        base_url = url
    for link in soup.findAll('a'):
        try:
            link_url = link['href']
        except KeyError:
            # Anchor without an href (e.g. a named anchor): nothing to fetch.
            continue
        possible_link, filename = parseLink(base_url, link_url)
        # An empty filename (link ending in '/') cannot be used as a local
        # save path, so skip it even if the extension test would pass.
        if filename and filename.endswith(extension):
            yield possible_link, filename
def parseLink(src_url, link_url):
    """Resolve a link found on a page into an absolute URL and a filename.

    Args:
        src_url: URL of the page the link was found on.
        link_url: Raw ``href`` value. May be absolute, scheme-relative
            (``//host/path``), root-relative (``/path``) or relative
            (``file``, ``../file``).

    Returns:
        A ``(possible_link, filename)`` tuple: the absolute URL to fetch and
        the last segment of the link's path (``''`` when the path is empty
        or ends in ``'/'``).
    """
    # urljoin applies the standard reference-resolution rules, so
    # root-relative paths, '..' segments and scheme-relative links all
    # resolve correctly instead of being glued onto the page's directory.
    possible_link = urljoin(src_url, link_url)
    # Element 2 of the parse result is the path; query string and fragment
    # are therefore never part of the filename.
    link_path = urlparse(link_url)[2]
    filename = link_path.split('/')[-1]
    return possible_link, filename
if __name__ == '__main__':
    # Script entry point: download every file linked from a page whose
    # name ends with the extension given on the command line.
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaces as a bare IndexError.
        sys.exit("Usage: python getstuff.py [extension]")
    extension = sys.argv[1]
    # `url` must remain a module-level name: linksFromSoup reads it when
    # resolving the links it finds.
    url = raw_input("URL you want to download things from: ")
    html = urllib.urlopen(url)
    try:
        # Single-argument parenthesised form is valid both as a print
        # statement and as a print function call.
        print(repr(html))
        soup = BeautifulSoup(html)
    finally:
        # The page has been consumed by the parser; release the connection.
        html.close()
    for linkurl, filename in linksFromSoup(soup, extension):
        # Saves each match into the current working directory.
        urllib.urlretrieve(linkurl, filename)



No Comments so far ↓
There are no comments yet... Kick things off by filling out the form below.