Scrapy crawl with next page
22,182
Your rule is not used because you don't use a CrawlSpider.
So you have to create the next page requests
manually like so:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(scrapy.Spider):
    """Crawl Craigslist non-profit job listings, paginating manually.

    ``Rule`` objects are only honored by ``CrawlSpider``; with a plain
    ``scrapy.Spider`` the next-page request must be yielded from
    ``parse`` itself, which is what this spider does. (The original
    snippet also carried an unused ``Rules`` attribute — removed here,
    since a plain Spider never reads it.)
    """
    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    def parse(self, response):
        """Extract result rows from one page, then queue the next page.

        Yields a ``scrapy.Request`` for the next page when a
        "next" button is present; stops otherwise.
        """
        # response.text replaces the deprecated body_as_unicode()
        site = html.fromstring(response.text)
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        print(len(titles), 'AAAA')  # Python 3 print function

        # Follow the next-page link, if any.
        next_page = response.xpath('.//a[@class="button next"]/@href').extract()
        if next_page:
            # urljoin resolves the (relative) href against the response URL,
            # avoiding a hard-coded scheme/host prefix.
            yield scrapy.Request(url=response.urljoin(next_page[0]))
Or use the CrawlSpider
like so:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(CrawlSpider):
    """Crawl Craigslist non-profit job listings via CrawlSpider rules.

    Pagination is handled declaratively: the rule extracts the
    "next" button link on every page and follows it.
    """
    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    # NOTE: the callback must NOT be named "parse" — CrawlSpider uses
    # parse() internally to apply the rules, so overriding it would
    # break rule processing.
    rules = (
        Rule(
            LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
            callback="parse_page",
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Extract result rows from one listing page."""
        # response.text replaces the deprecated body_as_unicode()
        site = html.fromstring(response.text)
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        print(len(titles), 'AAAA')  # Python 3 print function
Author by
Mirza Delic
Software Engineer with strong experience with Python, Django, Flask, MySQL, PostgreSQL, RESTful API services, Vue.js, AngularJS. A lot of experience building scheduling, tracking, POS and ticketing software. More than 6 years of experience developing software and 3 years working remotely. Links: https://www.linkedin.com/in/mirzadelic/ https://github.com/mirzadelic/
Updated on July 09, 2022 · Comments
-
Mirza Delic, almost 2 years ago
I have this code for scrapy framework:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html

class Scrapy1Spider(scrapy.Spider):
    name = "scrapy1"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse", follow= True),)

    def parse(self, response):
        site = html.fromstring(response.body_as_unicode())
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        print len(titles), 'AAAA'
But the problem is that I get 100 results; it doesn't go to the next pages.
What is wrong here?