import requests
import time
from bs4 import BeautifulSoup
from random import sample
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import glob as gb
from selenium_stealth import stealth
Web-scraping Zillow Data
Web-scraping at a Given Time
Import Libraries
- The requests library allows you to send an HTTP request in Python to a specific URL. In our case we send HTTP requests to Zillow.
- The time module allows you to handle time-related tasks, including formatting dates, waiting, and representing time.
- The random module allows you to generate random numbers and draw random samples.
- The bs4 module (BeautifulSoup) allows you to pull data from an HTML document after you get a response to an HTTP request.
- The os module allows you to interact with the operating system, including changing the working directory.
- The selenium module allows you to automate interaction with a web browser, including sending URL requests and extracting the HTML document in the response.
- The remaining imports support the workflow below: datetime time-stamps the output file name, pandas organizes the scraped results and exports them to CSV, glob lists the CSV outputs so they can be compiled, and selenium_stealth configures the Selenium driver so automated requests look more like a regular browser.
Set Path
- Identify your destination folder.
- Use os.chdir to set your destination directory as the default. That is where all outputs will be exported to.
= "../webscraping_outputs-Z"
path os.chdir(path)
Create a file name
- Create an output file name. I called mine ZillowSelenium and formatted it with a date-time stamp of the current time, down to the current second.
- Note: If you are scraping multiple times in a day, then you need to format the time stamp with hours or finer resolution so that you don't overwrite already exported data.
= "ZillowSelenium" + "_" + "{:%Y_%h_%d_%H-%M-%S}".format(datetime.now()) +".csv"
finalfile finalfile
'ZillowSelenium_2023_Dec_21_13-45-20.csv'
Main Webscraping
- Sets up Selenium webdrivers with selenium-stealth and searches for specific HTML tag classes. It is recommended to check these every so often in case the tags or their classes change (a quick way to do so is sketched after this list).
- Outputs the results of the realtor pages obtained and the information successfully extracted.
- Output is written to a file with the finalfile name.
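Because Zillow's obfuscated class names (e.g. Text-c11n-8-84-3__...) do change from time to time, it can help to confirm the selectors still match something before a long run. This is only a minimal sketch, assuming the driver and the selector variables defined in the Code cell below; the helper name check_selectors is hypothetical.

# Hypothetical helper: report how many elements each CSS selector currently matches,
# so you can tell quickly when Zillow changes its class names.
def check_selectors(driver, url, selectors):
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    for name, css in selectors.items():
        print(f"{name}: {len(soup.select(css))} match(es)")

# Example usage (the driver and selector variables come from the Code cell below):
# check_selectors(driver, "https://www.zillow.com/philadelphia-pa/rentals/",
#                 {"title": titleSelector, "link": linkSelector})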
Code
#Create a list that will hold the results
page_count = 20
results = []

zillow_placeholders = ["This property accepts Zillow applications", "Zillow last checked:"]

# selectors
titleSelector = "h1.Text-c11n-8-84-3__sc-aiai24-0"
descSelector = "div.building-description"
linkSelector = "div.StyledPropertyCardDataWrapper-c11n-8-84-3__sc-1omp4c3-0"
saleSelector = "div.Text-c11n-8-84-3__sc-aiai24-0"
sale2Selector = "p.Text-c11n-8-84-3__sc-aiai24-0"
nhoodSelector = "h4.Text-c11n-8-84-3__sc-aiai24-0"
nhood2Selector = "h2.styledComponents__BuildingCardTitle-sc-1bj2ydz-8"
driver = webdriver.Chrome()
textDriver = webdriver.Chrome()

stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

stealth(textDriver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

url = "https://www.zillow.com/philadelphia-pa/rentals/"
url2 = "https://www.zillow.com/philadelphia-pa/for_sale/"
# Inspect the Zillow website and figure out the number of pages of rental ads.
# In the Charlotte example, there are a total of 20 pages, so I set page_count = 20 and the range ends at 21.
for page in range(1, page_count + 1, 1):
    print("This is page: " + str(page))
    # Identify the Zillow URL of your city; it should follow this format:
    # 1. Default Zillow url: https://www.zillow.com/
    # 2. Name of your city: e.g. charlotte-nc, atlanta-ga
    # 3. Pass the page number
    # 4. Add the "_p" that is a default thing with the Zillow website
    # 5. A sample URL for page 15, for example, would be: https://www.zillow.com/charlotte-nc/rentals/15_p/
    page = str(page) + '_p/'

    # Here we are going to utilize Selenium. To automate the interaction behavior of a web browser you
    # need a web driver. Each browser has a webdriver; in my case I am using Google Chrome, so I downloaded the driver
    # from this website: "https://chromedriver.storage.googleapis.com/index.html?path=98.0.4758.80/"
    # After downloading and extracting the web driver (chromedriver.exe), use the webdriver.Chrome() method to launch
    # the Chrome browser and pass the path where the driver is saved.
    # CraiglistBrowser.maximize_window()
    # After the browser has been launched, use get() to pass the URL.
    print(f"Urls:\n")
    page_links = []
    # use a separate loop variable so the rentals URL is not overwritten between pages
    for listing_url in [url, url2]:  # getListingType():
        print(f"\t\t{listing_url + page}\n")
        browser = driver.get(listing_url + page)
        html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, 'html.parser')

        for item in soup.select(linkSelector):
            l = item.select("a")[0].attrs["href"]
            if not l.startswith("https://"):
                l = "https://www.zillow.com" + l
            page_links.append(l)
    for link in page_links:
        ovPage = textDriver.get(link)
        textSoup = BeautifulSoup(textDriver.page_source, "html.parser")

        # skip pages that come back as a captcha challenge
        if len(textSoup.select("div.px-captcha-container")) > 0:
            time.sleep(0.3)
            continue
        else:
            title = textSoup.select(titleSelector)[0].text
            nh1 = textSoup.select(nhoodSelector)
            nh2 = textSoup.select(nhood2Selector)
            nhood = None

            # get neighborhood from among header tags
            if len(nh1) > 0:
                for blurb in nh1:
                    if "neighborhood:" in blurb.text.lower():
                        nhood = blurb.text.split(":")[1][1:]
                        # print(blurb.text.split(":")[1][1:])
                        break
            elif len(nh2) > 0 and nhood is None:
                for blurb in nh2:
                    if "neighborhood:" in blurb.text.lower():
                        nhood = blurb.text.split(":")[1][1:]
                        # print(blurb.text.split(":")[1][1:])
                        break

            # getting address from title
            address = None
            for w in range(len(title)):
                if title[w].isnumeric():
                    address = title[w:]
                    break

            # keep the first description candidate that is long enough and is not a Zillow placeholder
            if len(textSoup.select(descSelector)) > 0 and len(textSoup.select(descSelector)[0].text) > 70 and not any(holder in textSoup.select(descSelector)[0].text for holder in zillow_placeholders):
                text = textSoup.select(descSelector)[0].text
            elif len(textSoup.select(sale2Selector)) > 0 and len(textSoup.select(sale2Selector)[0].text) > 70 and not any(holder in textSoup.select(sale2Selector)[0].text for holder in zillow_placeholders):
                text = textSoup.select(sale2Selector)[0].text
            elif len(textSoup.select(saleSelector)) > 0 and len(textSoup.select(saleSelector)[0].text) > 70 and not any(holder in textSoup.select(saleSelector)[0].text for holder in zillow_placeholders):
                text = textSoup.select(saleSelector)[0].text
            else:
                text = ""

            results.append({"title": title,
                            "address": address,
                            "neighborhood": nhood,
                            "description": text,
                            "url": link
                            })
            print(f"title: {title}\t\taddress: {address}\t\tneighborhood: {nhood}\nlink: {link}\n\tdescription: {text}")
        time.sleep(0.3)
    time.sleep(0.5)

Zillowdata = pd.DataFrame(results)
Zillowdata.to_csv(finalfile, index=False)
Save current results to a file if the run ends prematurely due to an error
Code
Zillowdata = pd.DataFrame(results)
Zillowdata.to_csv(finalfile, index=False)
Full Data Compilation
Get a list of all CSV outputs
- Use the glob method to generate a list of all your output CSVs in your directory.
= "../data/webscraping_outputs-Z"
path
os.chdir(path)= gb.glob(path + "/*.csv")
All All
Concatenate CSV
- Combine all files into a single CSV by concatenating them. To do this:
- Loop through the list of CSVs in your path and use the pandas.read_csv method to read them. This creates a generator.
- Afterward, use the pd.concat method to concatenate all your CSV files.
if len(All) > 0:
    Zillow = (pd.read_csv(file) for file in All)
    FinalZillow = pd.concat(Zillow, ignore_index=True)
else:
    import warnings
    warnings.warn(f"There are no data in {path}, try using 'Webscraping Zillow Data.ipynb'.", UserWarning, stacklevel=2)

FinalUnique = FinalZillow.drop_duplicates()
Change directory for the output file and export the final output as a CSV file
Change the output directory for your final CSV
Since the cleaning (including concatenating) will be done multiple times for any additional webscraping, you have to either
- export the final clean file to a different folder,
OR
- delete the old version using the commented portion in the cell below.
= path+"/clean"
outPath
os.chdir(outPath)# if os.path.exists("ZillowUnique.csv"):
# os.remove(("ZillowUnique.csv"))
"ZillowUnique.csv", index=False,)
FinalUnique.to_csv(print("compilation complete"))
("../"+path) os.chdir(