Download all pdf files from a website using Python
Solution 1
Check out the following implementation. I've used the requests module instead of urllib to do the download. Moreover, I've used the .select() method instead of .find_all() to avoid using re.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
#If there is no such folder, the script will create one automatically
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):os.mkdir(folder_location)
response = requests.get(url)
soup= BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
#Name the pdf files using the last portion of each link which are unique in this case
filename = os.path.join(folder_location,link['href'].split('/')[-1])
with open(filename, 'wb') as f:
f.write(requests.get(urljoin(url,link['href'])).content)
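If some of the PDFs are large or some links are dead, a streamed download with a status check is a bit more robust. This is a minimal variant of the loop above rather than part of the original answer; it reuses url, soup, and folder_location from the snippet, and the timeout and chunk size are arbitrary choices.

for link in soup.select("a[href$='.pdf']"):
    pdf_url = urljoin(url, link['href'])
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    # stream=True writes large PDFs in chunks instead of loading them fully into memory
    with requests.get(pdf_url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            print("Skipping", pdf_url, "->", response.status_code)
            continue
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)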
Solution 2
A couple of the links already contained the server address, which caused the 404 Not Found. Also, you should not remove the .pdf from the filename, as that would save the file without an extension.
from urllib import request
from bs4 import BeautifulSoup
import re
import os

# connect to website and get list of all pdfs
url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'\.pdf'))

# clean the pdf link names
url_list = []
for el in links:
    if el['href'].startswith('http'):
        url_list.append(el['href'])
    else:
        url_list.append("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href'])
print(url_list)

# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join(r'E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", ""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)
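As a side note (not part of the original answer), the absolute-versus-relative check above can also be handled with urllib.parse.urljoin, which returns absolute URLs unchanged and resolves relative hrefs against the page URL. A small sketch, using illustrative hrefs taken from this course page:

from urllib.parse import urljoin

page_url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
# one relative href and one that already contains the server address
hrefs = ["ml1-2016/cribsheet.pdf",
         "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf"]

# urljoin resolves relative links against the page and leaves absolute links alone,
# so no startswith('http') branch is needed
url_list = [urljoin(page_url, href) for href in hrefs]
print(url_list)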
user11128
Updated on July 09, 2022

Comments
-
user11128 almost 2 years
I have followed several online guides in an attempt to build a script that can identify and download all pdfs from a website to save me from doing it manually. Here is my code so far:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib

# connect to website and get list of all pdfs
url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))

# clean the pdf link names
url_list = []
for el in links:
    url_list.append(("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href']))
#print(url_list)

# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", "").replace(".pdf",""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)
The code can appear to find all the pdfs (uncomment the print(url_list) to see this). However, it fails at the download stage. In particular I get this error and I am not able to understand what's gone wrong:

E:\webscraping>python get_pdfs.py
http://www.gatsby.ucl.ac.uk/teaching/courses/http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf
E:\webscraping\http://www.gatsby.ucl.ac.uk/teaching/courses/cribsheet
Traceback (most recent call last):
  File "get_pdfs.py", line 26, in <module>
    request.urlretrieve(url, fullfilename)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 248, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Can somebody help me please?
-
user11128 about 5 years
Thanks. This is short and clean. This is my first experience of web scraping - is it usually quite slow? It's taking several seconds for each file. Thanks.
-
mLstudent33 almost 5 years
@SIM, how can I name the downloaded PDFs in Asian characters (part of the URL)? I saw this but am not sure how to fit it into the above code: qiita.com/mix/items/87d094414e46f857de45
-
Jabernet over 4 years
@SIM this works for the link provided. I'm trying on a different page where I know there are several links to pdf documents. I'm only getting 2.
-
Amine Chadi about 3 years
Hi, I know it's a bit late. I tried the code on this URL = "covidmaroc.ma/Pages/LESINFOAR.aspx" but it didn't work, and I have no idea why since I am not familiar with web scraping at all. Any help please.
-
x89 almost 3 years
Does this method download all pdfs from a webpage, or from the whole website domain if we give the base url?