options(tigris_use_cache = TRUE) # cache tigris downloads to save time on future calls

# Philadelphia ZCTAs: zctas() returns every ZIP in the US (it cannot filter
# by state), so keep only codes whose first three digits are "191".
zcta <- zctas(year = 2021) %>%
  rename(zip_code = GEOID20) %>%
  st_transform("EPSG:2272") %>% # re-project (EPSG:2272 = NAD83 / PA South, ftUS)
  filter(substr(ZCTA5CE20, start = 1, stop = 3) == "191") %>%
  erase_water(area_threshold = 0.5)

# Background ZCTAs for map context: everything inside the Philadelphia
# extent buffered by 5000 (units of EPSG:2272, i.e. feet).
zcta_bg <- zctas(year = 2021) %>%
  rename(zip_code = GEOID20) %>%
  st_transform("EPSG:2272") %>%
  st_crop(st_bbox(zcta %>% st_buffer(5000))) %>%
  erase_water(area_threshold = 0.5)

# Zip codes to include in the Yelp API query
zip_list <- zcta$ZCTA5CE20

# Dissolved Philadelphia boundary (union of all kept ZCTAs)
phl_bound <- st_union(zcta) %>%
  st_as_sf()

# Primary/secondary roads, cropped to the same buffered extent.
# Both PA and NJ are fetched because the buffer crosses the Delaware River.
roads_pa <- primary_secondary_roads(state = "PA", year = 2021) %>%
  st_transform("EPSG:2272") %>%
  st_crop(st_bbox(zcta %>% st_buffer(5000))) %>%
  erase_water()

roads_nj <- primary_secondary_roads(state = "NJ", year = 2021) %>%
  st_transform("EPSG:2272") %>%
  st_crop(st_bbox(zcta %>% st_buffer(5000))) %>%
  erase_water()

roads <- rbind(roads_nj, roads_pa)
# Obtaining Yelp API Data ----

# Setup ----
# Libraries ----
# Tigris boundaries ----
# Yelp Fusion API ----

# get_yelp() function ----
#### Yelp API call function ####
url <- "https://api.yelp.com/v3/businesses/search"

# Query the Yelp Fusion business-search endpoint for one page of results.
#
# Args:
#   category:   search term (e.g. "restaurants")
#   zip_code:   ZCTA to search in (passed as Yelp's `location` parameter)
#   offset_num: number of results to skip (Yelp pages are 50 results max)
#
# Returns a data.frame with columns name, rating, address, lat, lon, and
# comma-collapsed category alias/title (an empty, zero-row frame with the
# same columns when the page has no businesses).
get_yelp <- function(category, zip_code, offset_num) {
  queryString <- list(
    location = zip_code,
    term = category,
    sort_by = "distance", # sort by distance
    limit = 50,           # 50 is the max for the Yelp Fusion API
    offset = offset_num
  )

  # SECURITY: an API key was previously hard-coded (and therefore committed)
  # here. That key must be revoked; supply a fresh one via the YELP_API_KEY
  # environment variable instead.
  response <- VERB("GET", url, query = queryString,
                   add_headers(Authorization = paste("Bearer", Sys.getenv("YELP_API_KEY"))),
                   content_type("application/octet-stream"),
                   accept("application/json"))

  # Parse the JSON response body
  yelp.json <- content(response, "parsed", flatten = TRUE, simplify = TRUE)

  # Zero-row template for empty pages. All columns are zero-length: the old
  # fallback mixed length-1 "" with numeric(0), which makes data.frame()
  # error with "arguments imply differing number of rows".
  empty_df <- data.frame(name = character(0), rating = numeric(0),
                         address = character(0), lat = numeric(0),
                         lon = numeric(0), alias = character(0),
                         title = character(0), stringsAsFactors = FALSE)

  # Guard before extracting columns: past the end of the result set (or on
  # an API error) `businesses` is missing entirely.
  if (is.null(yelp.json$businesses) || length(yelp.json$businesses) == 0) {
    return(empty_df)
  }

  # Retrieve the columns we keep from the parsed JSON
  yelp.df <- data.frame(
    name    = yelp.json$businesses$name,
    rating  = yelp.json$businesses$rating,
    address = yelp.json$businesses$location.address1,
    lat     = yelp.json$businesses$coordinates.latitude,
    lon     = yelp.json$businesses$coordinates.longitude
  )

  # Each business may carry several categories; collapse each business's
  # alias/title lists to single comma-separated strings (cuisine information)
  cuisine <- yelp.json$businesses$categories
  cuis.df <- map_dfr(cuisine, function(x) {
    tibble(
      alias = paste(x$alias, collapse = ", "),
      title = paste(x$title, collapse = ", ")) %>%
      as.data.frame()
  })

  yelp.df <- cbind(yelp.df, cuis.df)

  if (nrow(yelp.df) == 0) {
    yelp.df <- empty_df
  }
  return(yelp.df)
}
# API call / data collection ----
# Initialize a named list of empty dataframes, one per zip code.
# Each element is a zero-row frame with the columns get_yelp() fills in,
# so bind_rows() later works even for zips that returned no results.
initialize_named_dfs <- function(zips) {
  empty_df <- data.frame(name = character(0), rating = numeric(0),
                         address = character(0), lat = numeric(0),
                         lon = numeric(0))
  named_list <- lapply(zips, function(zip) empty_df)
  names(named_list) <- zips
  named_list
}
# Master list of per-page result stores. Yelp caps each query at 50 results,
# so we collect 11 pages (offsets 0, 50, ..., 500) of up to 50 businesses
# per zip code. Each element is a named list of empty per-zip dataframes.
# (The original spelled out biz_list_0 .. biz_list_10 by hand; those names
# were never used again — only this combined list is.)
n_pages <- 11
offset_list <- lapply(seq_len(n_pages), function(i) initialize_named_dfs(zip_list))
# Loop through each offset (think of each offset as a page of results:
# page i covers results (i-1)*50+1 .. i*50).
# NOTE(review): the original iterated i over 1:length(zip_list) — which
# indexes past the 11-element offset_list — and used offset <- i - 1, with a
# dead `offset <- offset + 50` that was overwritten on the next iteration,
# so every "page" skipped only 0..10 results. The offset must advance by 50.
for (i in seq_along(offset_list)) {
  # Number of results to skip for this page
  offset <- (i - 1) * 50
  # Position within zip_list, used only for progress printing
  zipnum <- 1
  # Loop through each zip code
  for (zip_code in zip_list) {
    print(paste("batch ", i, ", ", "zip ", zipnum, ": ", zip_code, sep = ""))
    # Fetch Yelp data for this zip code/page and store it in the master list
    offset_list[[i]][[as.character(zip_code)]] <-
      get_yelp("historic landmarks and museums", as.character(zip_code), offset)
    zipnum <- zipnum + 1 # iterate zipnum each loop
  }
}
# Data preparation ----
# Store response as dataframe ----
# Combine every page's dataframes into one, remove duplicate businesses
# (the same business can appear for adjacent zips), drop rows with no
# coordinates, then build point geometry and project to the working CRS.
biz.sf <- map_dfr(offset_list, ~ bind_rows(.x)) %>%
  unique() %>%
  filter(!is.na(lat)) %>%
  st_as_sf(crs = 4326, coords = c("lon", "lat")) %>% # Yelp coords arrive as lon/lat WGS84
  st_transform("EPSG:2272")
# Write geojson file ----
st_write(biz.sf, "historic_landmarks.geojson", driver = "geojson")
# Write boundary files ----
# Philadelphia city boundary from Census place polygons (state 42 = PA FIPS)
phl <- places(state = 42, year = 2022) %>%
  filter(NAME == "Philadelphia")

st_write(phl, "phl_bound.geojson", driver = "geojson")