Here is the 1st script which pertains to downloads. It will download all pdf files from a given url. I wrote this for a reason. Have a look at the huge amount of repetition. I was talking to this guy on IRC who told me the importance of neatly written code. I will put up the better looking version sometime later.
#!/usr/bin/python
#newproj.py
import urllib
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup
url = raw_input("URL you want to download things from: ")
html = urllib.urlopen(url)
print repr(html)
a = BeautifulSoup(html)
for link in a.findAll('a'):
if url.endswith('.html') == True and urlparse(link['href']).hostname:
b = link['href']
print b
if b.endswith('.pdf') == True:
c=urlparse(b).path.split('/')[-1]
urllib.urlretrieve(b,c)
elif url.endswith('.html') == True and not urlparse(link['href']).hostname:
b = '/'.join(url.rsplit('/', 1), link['href'])
print b
if link['href'].endswith('.pdf') == True:
c=urlparse(link['href']).path.split('/')[-1]
urllib.urlretrieve(b)
elif url.endswith('.html') == False and not urlparse(link['href']).hostname:
b = url + "/" + link['href']
print b
if b.endswith('.pdf') == True:
c=urlparse(link['href']).path.split('/')[-1]
urllib.urlretrieve(b, c)
print " I have downloaded all available pdfs."



2 comments ↓
THANKS
you’re welcome
Leave a Comment