HTTP Error 999: Request denied


Solution 1

Try setting a User-Agent header. Add this line after op.set_handle_robots(False):

op.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36')]
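
For context, a minimal sketch of the full mechanize setup with the header in place (Python 2; the URL below is a placeholder, and LinkedIn may still deny automated requests even with a browser-like User-Agent):

    import mechanize

    url = "https://www.example.com/jobs?page_num=2"  # placeholder URL

    op = mechanize.Browser()
    op.set_handle_robots(False)  # tell mechanize not to consult robots.txt
    op.addheaders = [('User-Agent',
                      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36')]

    response = op.open(url)
    print response.read()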

Edit: If you want to scrape a website, first check whether it offers an API, or whether there is a library that wraps that API.

Solution 2

You should be using the LinkedIn REST API, either directly or via python-linkedin. It gives you direct access to the data instead of attempting to scrape the JavaScript-heavy website.
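
A rough sketch of the python-linkedin flow, based on its README (the consumer key, secret, redirect URL and access token are placeholders you obtain from a LinkedIn developer application, and exact method names may vary between library versions):

    from linkedin import linkedin

    # Placeholders: register a LinkedIn developer application to get these.
    authentication = linkedin.LinkedInAuthentication(
        'CONSUMER_KEY', 'CONSUMER_SECRET',
        'http://localhost:8000/callback', ['r_basicprofile'])
    print authentication.authorization_url  # open this URL to authorize the app

    # Once you have an OAuth access token, query the API directly
    # instead of scraping the HTML pages.
    application = linkedin.LinkedInApplication(token='ACCESS_TOKEN')
    print application.get_profile()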

Author: Deepayan

Updated on July 25, 2022

Comments

  • Deepayan, almost 2 years ago:

    I am trying to scrape some web pages from LinkedIn using BeautifulSoup and I keep getting the error "HTTP Error 999: Request denied". Is there a way around this error? If you look at my code, I have tried both mechanize and urllib2, and both give me the same error.

    from __future__ import unicode_literals
    from bs4 import BeautifulSoup
    import urllib2
    import csv
    import os
    import re
    import requests
    import pandas as pd
    import urlparse
    import urllib
    import pdb
    import codecs
    import webbrowser
    from urlgrabber import urlopen
    from urlgrabber.grabber import URLGrabber
    import mechanize
    
    fout5 = codecs.open('data.csv','r', encoding='utf-8', errors='replace')
    
    for y in range(2,10,1):
    
    
        url = "https://www.linkedin.com/job/analytics-%2b-data-jobs-united-kingdom/?sort=relevance&page_num=1"
    
        params = {'page_num':y}
    
        url_parts = list(urlparse.urlparse(url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        query.update(params)
    
        url_parts[4] = urllib.urlencode(query)
        y = urlparse.urlunparse(url_parts)
        #print y
    
    
    
        #url = urllib2.urlopen(y)
        #f = urllib2.urlopen(y)
    
        op = mechanize.Browser() # use mechanize's browser
        op.set_handle_robots(False) # tell mechanize not to honour robots.txt
        j = op.open(y)
        #print op.title()
    
    
        #g = URLGrabber()
        #data = g.urlread(y)
        #data = fo.read()
        #print data
    
        #html = response.read()
        soup1 = BeautifulSoup(j.read()) # parse the fetched page, not the URL string
        print soup1