Obtaining Yelp API Data

Author

Anna Duan

Published

December 23, 2023

Setup

Libraries

Tigris boundaries

# Cache downloaded TIGER/Line files so later calls can reuse them
options(tigris_use_cache = TRUE)

# National ZCTA layer, read and re-projected ONCE and shared by both layers
# below. The 2021 ZCTA file cannot be filtered by state, so the whole US file
# is loaded; building it a second time would repeat the slowest step of setup.
zcta_us <- zctas(year = 2021) %>%
  rename(zip_code = GEOID20) %>%
  st_transform("EPSG:2272") # re-project

# Study-area ZCTAs: ZIP codes starting with "191", with water areas erased
zcta <- zcta_us %>%
  filter(substr(ZCTA5CE20, start = 1, stop = 3) == "191") %>%
  erase_water(area_threshold = 0.5)

# Background ZCTAs: everything inside the bounding box of the study area
# buffered by 5000 CRS units, with water areas erased
zcta_bg <- zcta_us %>%
  st_crop(st_bbox(zcta %>% st_buffer(5000))) %>%
  erase_water(area_threshold = 0.5)


# ZIP codes that will be sent to the API, one query location per ZCTA
zip_list <- zcta[["ZCTA5CE20"]]

# Study-area boundary: all ZCTA polygons dissolved into one geometry,
# wrapped back into an sf object
phl_bound <- st_as_sf(st_union(zcta))

# Bounding box of the study-area ZCTAs buffered by 5000 CRS units; roads are
# clipped to this extent
roads_bbox <- st_bbox(st_buffer(zcta, 5000))

# Download one state's 2021 primary/secondary roads, re-project them, clip
# them to the study extent and erase water areas
clip_state_roads <- function(state_abbr) {
  state_roads <- primary_secondary_roads(state = state_abbr, year = 2021)
  state_roads <- st_transform(state_roads, "EPSG:2272")
  state_roads <- st_crop(state_roads, roads_bbox)
  erase_water(state_roads)
}

roads_pa <- clip_state_roads("PA")
roads_nj <- clip_state_roads("NJ")

# Single road layer covering both states
roads <- rbind(roads_nj, roads_pa)

Yelp Fusion API

get_yelp() function

#### Yelp api call function ####
url <- "https://api.yelp.com/v3/businesses/search"

# Request one page of Yelp Fusion business-search results and return it as a
# flat data frame.
#
# Args:
#   category:   search term, sent as the `term` query parameter
#   zip_code:   search location, sent as the `location` query parameter
#   offset_num: number of results to skip, sent as `offset`
#   api_key:    Yelp API key. Defaults to the YELP_API_KEY environment
#               variable so the secret is never stored in the source file.
#
# Returns:
#   A data frame with columns name, rating, address, lat, lon, alias, title
#   (alias/title are the business's categories collapsed with ", ").
#   A zero-row data frame with the same columns is returned when the request
#   fails (with a warning) or when the page contains no businesses.
get_yelp <- function(category, zip_code, offset_num,
                     api_key = Sys.getenv("YELP_API_KEY")) {
  if (length(api_key) != 1L || is.na(api_key) || !nzchar(api_key)) {
    stop(
      "Yelp API key not found: set the YELP_API_KEY environment variable ",
      "or pass `api_key`.",
      call. = FALSE
    )
  }

  # Zero-row result with the full column set, so callers can always bind pages
  empty_result <- data.frame(
    name = character(0),
    rating = numeric(0),
    address = character(0),
    lat = numeric(0),
    lon = numeric(0),
    alias = character(0),
    title = character(0),
    stringsAsFactors = FALSE
  )

  query_string <- list(
    location = zip_code,
    term = category,
    sort_by = "distance", # sort by dist
    limit = 50, # 50 is the max for yelp fusion api, any higher and it won't work
    offset = offset_num
  )

  # use "GET" verb to request information from url
  response <- VERB(
    "GET", url,
    query = query_string,
    add_headers(Authorization = paste("Bearer", api_key)),
    content_type("application/octet-stream"),
    accept("application/json")
  )

  # A failed request has no `businesses` payload; report it and return an
  # empty page instead of failing later with an unrelated column-name error
  if (http_error(response)) {
    warning(
      "Yelp request failed for location ", zip_code,
      " (offset ", offset_num, "): HTTP ", status_code(response),
      call. = FALSE
    )
    return(empty_result)
  }

  # parse the JSON body; nested fields are flattened into dotted column names
  yelp_json <- content(response, "parsed", flatten = TRUE, simplify = TRUE)
  businesses <- yelp_json$businesses

  # An empty page does not parse to a data frame, so handle it up front
  if (!is.data.frame(businesses) || nrow(businesses) == 0) {
    return(empty_result)
  }

  n_biz <- nrow(businesses)

  # Fetch a column by exact name, falling back to NA when the API omitted it
  pull_col <- function(col, na_value) {
    if (col %in% names(businesses)) {
      businesses[[col]]
    } else {
      rep(na_value, n_biz)
    }
  }

  yelp_df <- data.frame(
    name = pull_col("name", NA_character_),
    rating = pull_col("rating", NA_real_),
    address = pull_col("location.address1", NA_character_),
    lat = pull_col("coordinates.latitude", NA_real_),
    lon = pull_col("coordinates.longitude", NA_real_),
    stringsAsFactors = FALSE
  )

  # add in category alias/title (this will give us cuisine information);
  # each business carries its own set of categories, collapsed to one string
  categories <- businesses[["categories"]]
  if (is.null(categories)) {
    categories <- vector("list", n_biz)
  }

  collapse_field <- function(field) {
    vapply(
      categories,
      function(x) paste(x[[field]], collapse = ", "),
      character(1)
    )
  }

  yelp_df$alias <- collapse_field("alias")
  yelp_df$title <- collapse_field("title")

  yelp_df
}

API call / data collection

# Build a list holding one empty results data frame per zip code; the list
# elements are named after the zip codes they belong to
initialize_named_dfs <- function(zips) {
  template <- data.frame(
    name = character(0),
    rating = numeric(0),
    address = character(0),
    lat = numeric(0),
    lon = numeric(0)
  )
  placeholders <- rep(list(template), length(zips))
  setNames(placeholders, zips)
}

# Placeholder results: one list of empty per-zip data frames for each page of
# results. The query limit is 50, so page k is requested with offset = 50 * k
# (offset 50 returns results 51-100). 11 pages x 50 = up to 550 results per
# zip code.
biz_list_0 <- initialize_named_dfs(zip_list)
biz_list_1 <- initialize_named_dfs(zip_list)
biz_list_2 <- initialize_named_dfs(zip_list)
biz_list_3 <- initialize_named_dfs(zip_list)
biz_list_4 <- initialize_named_dfs(zip_list)
biz_list_5 <- initialize_named_dfs(zip_list)
biz_list_6 <- initialize_named_dfs(zip_list)
biz_list_7 <- initialize_named_dfs(zip_list)
biz_list_8 <- initialize_named_dfs(zip_list)
biz_list_9 <- initialize_named_dfs(zip_list)
biz_list_10 <- initialize_named_dfs(zip_list)

# master list to store the dataframes (one element per page)
offset_list <- list(biz_list_0, biz_list_1, biz_list_2, biz_list_3,
                    biz_list_4, biz_list_5, biz_list_6, biz_list_7,
                    biz_list_8, biz_list_9, biz_list_10)

# Results per page; must match the `limit` that get_yelp() sends to the API
page_size <- 50

# Loop through each page of results. The loop is driven by offset_list (the
# pages), not by the number of zip codes, so it can never index past the
# master list.
for (i in seq_along(offset_list)) {

  # Number of results to skip for this page: 0, 50, 100, ...
  offset <- (i - 1) * page_size

  # Loop through each zip code; zipnum tracks the position in zip_list
  for (zipnum in seq_along(zip_list)) {
    zip_code <- zip_list[[zipnum]]
    print(paste0("batch ", i, ", ", "zip", zipnum, ": ", zip_code))

    # Fetch Yelp data for the zip code. A failure on one page must not throw
    # away everything collected so far, so it is reported as a warning and the
    # empty placeholder for that zip code is kept.
    page <- tryCatch(
      get_yelp("historic landmarks and museums", as.character(zip_code), offset),
      error = function(e) {
        warning(
          "No results stored for zip ", zip_code, " at offset ", offset, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )

    if (!is.null(page)) {
      offset_list[[i]][[as.character(zip_code)]] <- page
    }
  }
}

Data preparation

Store response as dataframe

# Stack every page's per-zip data frames into one table, drop exact duplicate
# rows and rows without a latitude, then build point geometries from lon/lat
# (EPSG:4326) and re-project them to EPSG:2272
biz.sf <- offset_list %>%
  map(bind_rows) %>%
  bind_rows() %>%
  unique() %>%
  filter(!is.na(lat)) %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  st_transform("EPSG:2272")

Write geojson file

# Write the business points to GeoJSON. delete_dsn = TRUE removes a file left
# by a previous run first; without it st_write() stops when the target file
# already exists, so the script could not be re-run.
st_write(biz.sf, "historic_landmarks.geojson", driver = "geojson",
         delete_dsn = TRUE)

Write boundary files

# Census "place" polygon for Philadelphia (state FIPS 42), 2022 vintage
phl <- places(state = 42, year = 2022) %>%
  filter(NAME == "Philadelphia")

# Write the boundary to GeoJSON. delete_dsn = TRUE removes a file left by a
# previous run first; without it st_write() stops when the target file already
# exists, so the script could not be re-run.
st_write(phl, "phl_bound.geojson", driver = "geojson", delete_dsn = TRUE)