How to web-scrape multiple pages with Selenium (Python)


Try the code below. It can loop through every page, not just the first 5: it checks whether a "next" button is available and, if so, moves on to the next page; otherwise it breaks out of the while loop. (A guard is included so it stops after page 5, as in your question; remove it to scrape the whole site.)

import time

import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("window-size=1400,600")

# Randomize the user agent so the scraper is less likely to be blocked
ua = UserAgent()
user_agent = ua.random
print(user_agent)
options.add_argument(f'user-agent={user_agent}')

driver = webdriver.Chrome('/Users/raduulea/Documents/chromedriver', options=options)
driver.get('https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre')

time.sleep(10)  # give the first page time to load

Title = []
address = []
price = []
surface = []
desc = []
page = 2  # the next results page to request
while True:
    time.sleep(10)  # wait for the listings to render
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all("div", {"class": "result-xl"})  # one div per listing
    for result in results:
        Title.append(result.find("div", {"class": "title-bar-left"}).get_text().strip())
        address.append(result.find("span", {"class": "result-adress"}).get_text().strip())
        price.append(result.find("div", {"class": "xl-price rangePrice"}).get_text().strip())
        surface.append(result.find("div", {"class": "xl-surface-ch"}).get_text().strip())
        desc.append(result.find("div", {"class": "xl-desc"}).get_text().strip())
    if len(driver.find_elements_by_css_selector("a.next")) > 0:
        # A "next" link exists, so request the next results page
        url = "https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre/?page={}".format(page)
        driver.get(url)
        page += 1
        # The question only asks for pages 1 to 5; remove this if block to scrape every page
        if page > 5:
            break
    else:
        break  # no "next" link, so this was the last page

df = pd.DataFrame({"Title": Title, "Address": address, "Price": price, "Surface": surface, "Description": desc})
df.to_csv("output.csv")
Author: mr-kim

Updated on June 04, 2022

Comments

  • mr-kim, almost 2 years ago

    I've seen several solutions for scraping multiple pages from a website, but couldn't make them work in my code.

    At the moment I have the code below, which works for scraping the first page. I would like to wrap it in a loop that scrapes every page of the site (pages 1 to 5):

    import time

    import pandas as pd
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument("window-size=1400,600")

    ua = UserAgent()
    user_agent = ua.random
    print(user_agent)
    options.add_argument(f'user-agent={user_agent}')

    driver = webdriver.Chrome('/Users/raduulea/Documents/chromedriver', options=options)
    driver.get('https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre/liege/4000?page=1')

    time.sleep(10)
    
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    
    results = soup.find_all("div", {"class": "result-xl"})
    title = []
    address = []
    price = []
    surface = []
    desc = []
    
    for result in results:
        title.append(result.find("div", {"class":"title-bar-left"}).get_text().strip())
        address.append(result.find("span", {"class": "result-adress"}).get_text().strip())
        price.append(result.find("div", {"class":"xl-price rangePrice"}).get_text().strip())
        surface.append(result.find("div", {"class":"xl-surface-ch"}).get_text().strip())
        desc.append(result.find("div", {"class":"xl-desc"}).get_text().strip())
    
    
    df = pd.DataFrame({"Title": title, "Address": address, "Price": price, "Surface": surface, "Description": desc})
    df.to_csv("output.csv")