import nltk
import string

import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from sklearn.cluster import KMeans
import altair as alt

# import copy
# import math

# later cluster analysis
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import StandardScaler
Code
def getNHood_or_Count(df, searchL, switchCol="neighborhood"):
    """Annotate each listing with a neighborhood name or an amenity-term count.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string "description" column. Mutated in place.
    searchL : list of str
        Substrings to search for in each description (neighborhood names
        or amenity words).
    switchCol : str, default "neighborhood"
        Name of the column to add. When "neighborhood", each row receives
        the FIRST term from searchL found in its description ("" if none
        match). Otherwise each row receives the number of DISTINCT terms
        that appear (repeated occurrences of one term count once).

    Returns
    -------
    pandas.DataFrame
        The same df object, with the new column attached.
    """
    results = []
    # iterate the column directly instead of slicing df[i:i+1].squeeze()
    for descText in df["description"]:
        if switchCol == "neighborhood":
            # first matching neighborhood wins; "" when nothing matches
            results.append(next((term for term in searchL if term in descText), ""))
        else:
            # count how many distinct search terms are mentioned at all
            # (was a local named `sum`, which shadowed the builtin)
            results.append(sum(1 for term in searchL if term in descText))
    df[switchCol] = results
    return df
Code
def getAmenityTitles(df, col="title"):
    """Return the unique, lower-cased titles from a Yelp business frame.

    Titles containing ", " (e.g. "Pizza Palace, Bar") are split into their
    comma-separated parts before deduplication, so each part becomes its
    own searchable amenity word/phrase.

    Parameters
    ----------
    df : DataFrame (or GeoDataFrame)
        Frame whose `col` column holds business title strings.
    col : str, default "title"
        Column to read the titles from.

    Returns
    -------
    list of str
        Unique lower-cased titles/parts, in arbitrary (set) order — the
        result is used for membership tests, so order does not matter.
    """
    # Flatten every title into its comma-separated parts, lower-cased,
    # then deduplicate in one pass (replaces the previous index-juggling
    # with O(n) list deletions).
    parts = []
    for raw in df[col]:
        parts.extend(raw.lower().split(", "))
    return list(set(parts))
Gathering Neighborhood and Amenity Mentions
Using the created data from the Yelp API and webscraped Zillow listing descriptions, we are quickly able to organize the relative locations and highly regarded amenities of the property listed. The neighborhood designation offers a reasonable alternative to geocoding when the listing title does not have an address. The results of this data cleaning (with the exception of geocoding) are viewable for the first 8 records below.
From the different individual descriptors of Yelp amenity groups, we can associate an approximate measure of importance of the category from the words the realtor chose. Ideally, these added counts will make a referencable cluster analysis of Philadelphia neighborhoods possible to compare with aggregated point data and ratings from Yelp.
Code
# Prefer the cleaned, de-duplicated listing file; fall back to the raw scrape.
# (the path variable was previously named `input`, shadowing the builtin)
# zillow_path = "../webscraping_outputs-Z/clean/ZillowUnique.csv"
zillow_path = "../data/webscraping_outputs-Z/clean/ZillowUnique_v1.csv"
try:
    zillowFull = pd.read_csv(zillow_path)
except FileNotFoundError:
    zillowFull = pd.read_csv("../data/webscraping_outputs-Z/best_pre_neighborhood/ZillowSelenium_2023_Dec_10_19-09-38")

# handle nans: drop listings with no description (could fill instead, but
# empty strings would distort the word frequencies)
zillow = zillowFull.drop(zillowFull.loc[zillowFull["description"].isna()].index)

# get philly neighborhoods and amenities too
phl_nhoods = gpd.read_file("https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson")
phl_bound = gpd.read_file("../data/Yelp API/phl_bound.geojson").to_crs(2272)

restaurant = getAmenityTitles(gpd.read_file("../data/Yelp API/restaurants.geojson"))
park = getAmenityTitles(gpd.read_file("../data/Yelp API/parks.geojson"))
edu = getAmenityTitles(gpd.read_file("../data/Yelp API/education.geojson"))
grocery = getAmenityTitles(gpd.read_file("../data/Yelp API/grocery.geojson"))
entertainment = getAmenityTitles(gpd.read_file("../data/Yelp API/entertainment.geojson"))
nightlife = getAmenityTitles(gpd.read_file("../data/Yelp API/nightlife.geojson"))
shopping = getAmenityTitles(gpd.read_file("../data/Yelp API/shopping.geojson"))
healthcare = getAmenityTitles(gpd.read_file("../data/Yelp API/healthcare.geojson"))
historic = getAmenityTitles(gpd.read_file("../data/Yelp API/historic_landmarks.geojson"))
kids = getAmenityTitles(gpd.read_file("../data/Yelp API/kids.geojson"))
arts = getAmenityTitles(gpd.read_file("../data/Yelp API/perf_arts.geojson"))
beauty = getAmenityTitles(gpd.read_file("../data/Yelp API/beauty.geojson"))

# master amenity list
amenity_list = restaurant.copy()
for amenL in [park, edu, grocery, entertainment, nightlife, shopping,
              healthcare, historic, kids, arts, beauty]:
    amenity_list.extend(amenL)
amenity_types = 12

# Neighborhood names, plus the leading word of hyphen/comma compounds so
# e.g. "Fishtown - Lower Kensington" also matches a plain "Fishtown" mention.
nhoods = list(phl_nhoods["listname"])
toAdd = []
for n in nhoods:
    if len(n.split("-")) > 1:
        toAdd.append(n.split("-")[0].strip())
    if len(n.split(",")) > 1:
        toAdd.append(n.split(",")[0].strip())
nhoods.extend(toAdd)

# Tag each listing with its neighborhood, then with a mention count per
# amenity category (replaces the previous 13-deep nested function call;
# order matches the original innermost-to-outermost sequence).
zillow = getNHood_or_Count(zillow, nhoods)
for colName, searchL in [
    ("restaurants", restaurant), ("parks", park), ("education", edu),
    ("grocery", grocery), ("entertainment", entertainment),
    ("nightlife", nightlife), ("shopping", shopping),
    ("healthcare", healthcare), ("historic", historic),
    ("kids", kids), ("arts", arts), ("beauty", beauty),
]:
    zillow = getNHood_or_Count(zillow, searchL, switchCol=colName)
zillow.head(8)
title
description
url
neighborhood
restaurants
parks
education
grocery
entertainment
nightlife
shopping
healthcare
historic
kids
arts
beauty
0
The Legacy at Powelton Village
Our University City student apartments are des...
https://www.zillow.com/apartments/philadelphia...
Powelton
1
0
1
0
1
0
1
1
0
2
0
2
1
The Piazza
A vibrant hub centered in the fashionable dist...
https://www.zillow.com/apartments/philadelphia...
Northern Liberties
1
0
1
0
1
0
1
1
0
2
0
2
2
The Aster, 4550 Mitchell St, Philadelphia, PA ...
The building has a variety of studio, 1, 2, an...
https://www.zillow.com/homedetails/4550-Mitche...
1
0
1
0
2
0
2
1
0
2
0
3
3
The Latham at Seventeenth
Located just one block to Rittenhouse Square, ...
https://www.zillow.com/apartments/philadelphia...
Rittenhouse
1
0
1
0
1
0
0
1
0
1
0
1
4
The Irvine - Modern Luxury Apartments In Spruc...
Recycled, Renovated, & Reimagined. The Irvine ...
https://www.zillow.com/apartments/philadelphia...
2
2
1
0
3
3
1
1
0
4
0
2
5
1600 Chestnut Apartments
1600 Chestnut is the perfect residence for tho...
https://www.zillow.com/apartments/philadelphia...
2
0
1
0
2
1
1
1
0
3
0
2
6
1327 N Marston, 1327 N Marston St, Philadelphi...
Welcome to The Milk Depot! Modern, new-constru...
https://www.zillow.com/homedetails/1327-N-Mars...
Brewerytown
1
0
1
0
1
0
1
1
0
2
0
2
7
1327 N Marston
Welcome to The Milk Depot! Modern, new-constru...
https://www.zillow.com/apartments/philadelphia...
Brewerytown
1
0
1
0
1
0
1
1
0
2
0
2
Code
# Sets give O(1) membership tests in the stop-word/punctuation filters below
# (these were lists, making every filter test a linear scan).
punc = set(string.punctuation)
nltk.download("stopwords");
stop_words = set(nltk.corpus.stopwords.words("english"))

# Flatten the listing descriptions into lower-cased words, stripping one
# trailing punctuation character (e.g. "park." -> "park").
desc = zillow["description"].str.strip().tolist()
flat_w = []
for d in desc:
    for w in d.split():
        if w[-1] in string.punctuation:
            w = w[:-1]
        flat_w.append(w.lower())
no_stop = [w for w in flat_w if w not in stop_words and w not in punc]

# now for neighborhoods
neighborhoods = list(zillow["neighborhood"])
no_stop_nhoods = [w for w in neighborhoods if w not in stop_words and w not in punc]

# Frequency table of words, restricted to known amenity terms
words = (pd.DataFrame({"words": no_stop})
         .groupby("words", as_index=False).size()
         .sort_values("size", ascending=False, ignore_index=True))
words = words.loc[words["words"].isin(amenity_list)]
words = words.drop(words.loc[words["words"] == ""].index)

# Frequency table of neighborhood mentions; drop the "no match" rows
nhoodFreq = (pd.DataFrame({"neighborhood": no_stop_nhoods})
             .groupby("neighborhood", as_index=False).size()
             .sort_values("size", ascending=False, ignore_index=True))
nhoodFreq.drop(nhoodFreq.loc[nhoodFreq["neighborhood"] == ""].index, inplace=True)

## display the words dataframe if you wish
# words.head(15)
Code
## save files for easy import across notebooks
outputs = {
    "../data/cleaned_for_notebooks/words.csv": words,
    "../data/cleaned_for_notebooks/frequent_neighborhood1.csv": nhoodFreq,
    "../data/cleaned_for_notebooks/zillow_mentions.csv": zillow,
}
for path, frame in outputs.items():
    frame.to_csv(path, index=False)
Code
'''If additional words should be removed, create a list here'''
to_remove = ["home", "room", "kitchen", "property", "floor",
             "bedrooms", "bedroom", "rooms", ""]
# Keep only the rows whose word is NOT on the removal list
words = words.loc[~words["words"].isin(to_remove)]
print(words.head(25))
# and save to csv if desired
# words.to_csv("../words.csv", index = False)
Amenity Wordcloud from Zillow Listing Descriptions
Code
# Join the amenity words into a single blob of text
text = ' '.join(words["words"])

# Create the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Display the word cloud using matplotlib
# Format
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
ax.patch.set_edgecolor('black')
ax.patch.set_linewidth(1)
plt.show()
# plt.savefig(fname="../images/zillow_amenity_word_cloud.jpg",pad_inches=2,dpi=150)
From this word cloud, it is quickly apparent that words that might be related not only to houses, but also housing good stores like ‘appliances’ are relatively common. These add some specific consideration for words to be put in a user-defined removal list in future word frequency explorations and visualizations like bar charts of specific words. Otherwise, restaurants is the most common followed by similar food related words like ‘grocery’ and ‘french’ suggesting that restaurants are the most common amenity type. This is supported by our exploratory analysis of Yelp data.
Given the unique association to a business type, the counts of these words do not appear especially problematic for cluster analysis and listing counts. Though major differences in group representation can help clarify.
Neighborhood Wordcloud from Zillow Listing Descriptions
Code
# Join the neighborhood names into one lower-cased blob of text
text = ' '.join(nhoodFreq["neighborhood"]).lower()

# Create the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Display the word cloud using matplotlib
# Format
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()
ax.patch.set_edgecolor('black')
ax.patch.set_linewidth(1)
plt.show()
# plt.savefig(fname="../images/zillow_neighborhood_word_cloud.jpg",pad_inches=2,dpi=150)
This word cloud gives a quick glance of the most often mentioned neighborhoods in the Zillow listings. There are unfortunately problems in accounting for words where the neighborhood name is split and the presence of common words like park, east, and hill (18, 12, & 8 neighborhoods include those words in their names, respectively). However, we can see that ‘city’ neighborhoods (i.e., Center City, University City, and Old City), PennyPack, Wissahickon, and other specific neighborhoods are relatively common. In consideration of amenity clusters, future geocoding of the location and cluster analysis of the amenity frequencies will likely provide clearer pictures of the businesses and services near those homes while others will not have as reliable profiles using Zillow alone.
It is important to note that the listings change often, and that results depicted here will not be the same as those reproduced.
The data used to create this word cloud comes from 20 webpages of listings from December 10, 2023 at 07:09:38 PM.
Code
# List the neighborhood names containing common words that confound the
# word cloud ("park", "east", "city"); one parameterized loop replaces
# three copy-pasted ones, with identical printed output (a "\n\n" spacer
# after each group except the last).
for term in ["park", "east", "city"]:
    for n in nhoods:
        if term in n.lower():
            print(n)
    if term != "city":
        print("\n\n")
Sample Cluster Analysis of Neighborhood Amenity Typologies
Using the amenity mention counts and neighborhood assignments obtained earlier, we organize a simple 6 cluster analysis similar to that general amenity cluster shown in Exploratory Analysis based on the amenity mention counts.
This graph shows no immediate relationship between the clusters and individual amenity counts or geographic features like the square mile area of the neighborhood. Slight determination of cluster by the number of restaurant mentions is possible, and exceptions can likely be explained by other largely contributing amenity types.
Code
# Calculate the mean values of the amenities for each cluster label
amenity_cols = ["arts", "beauty", "education", "entertainment", "grocery",
                "healthcare", "historic", "kids", "nightlife", "parks",
                "restaurants", "shopping"]
mean_values = phl_nhoods.groupby("label", as_index=False)[amenity_cols].mean().round(0)

# Calculate the counts for each cluster
counts = phl_nhoods['label'].value_counts().rename_axis('label').reset_index(name='neighborhoods')

# Merge the mean values and counts into one DataFrame
cluster_profile = pd.merge(mean_values, counts, on='label').reset_index()
cluster_profile = cluster_profile.rename(columns={"index": "cluster"})

# NOTE(review): these descriptive labels are hard-coded to one particular
# clustering run; re-running KMeans would invalidate them — confirm before reuse.
cluster_profile["cluster"] = [
    "15 mentions : 35 Neighborhoods",
    "40 mentions : 3 Neighborhoods",
    "125 mentions : 4 Neighborhoods",
    "48 mentions : 11 Neighborhoods",
    "229 mentions : 1 Neighborhood",
    "72 mentions : 5 Neighborhoods",
    "86 mentions : 1 Neighborhood",
]

# Display the merged DataFrame
cluster_profile
cluster
label
arts
beauty
education
entertainment
grocery
healthcare
historic
kids
nightlife
parks
restaurants
shopping
neighborhoods
0
15 mentions : 35 Neighborhoods
0
0.0
3.0
1.0
3.0
0.0
1.0
0.0
3.0
1.0
0.0
2.0
1.0
35
1
40 mentions : 3 Neighborhoods
1
1.0
5.0
4.0
7.0
0.0
3.0
1.0
6.0
3.0
2.0
5.0
3.0
3
2
125 mentions : 4 Neighborhoods
2
0.0
25.0
12.0
25.0
1.0
12.0
0.0
25.0
9.0
2.0
20.0
13.0
4
3
48 mentions : 11 Neighborhoods
3
0.0
9.0
4.0
8.0
1.0
4.0
0.0
9.0
2.0
0.0
6.0
5.0
11
4
229 mentions : 1 Neighborhood
4
2.0
47.0
18.0
35.0
4.0
20.0
0.0
38.0
9.0
2.0
27.0
27.0
1
5
72 mentions : 5 Neighborhoods
5
1.0
11.0
5.0
12.0
2.0
5.0
0.0
14.0
4.0
3.0
9.0
6.0
5
6
86 mentions : 1 Neighborhood
6
2.0
11.0
8.0
14.0
3.0
6.0
1.0
15.0
4.0
6.0
11.0
5.0
1
This dataframe of mean amenity type mentions makes it clear that Zillow descriptions (at least at the time of retrieval) are heavily skewed towards mention of nearby businesses and services for kids, education, restaurants, and shopping, while also raising the question of whether arts and beauty mentions are truly in reference to the areas around a property or to the property itself.
A few neighborhoods like that in clusters 4 and 6 have many more mentions of amenities than those elsewhere with the following map identifying cluster 4’s neighborhood as Frankford. There is difficulty in telling if given neighborhoods are more descriptive due to higher house price or the number of businesses, but the comparison of individual point data or Yelp clusters needs more integration to be viable.
Code
## Map of Clusters
# Interactive choropleth of the neighborhood clusters; timestamp columns
# are dropped because .explore cannot serialize them.
phl_nhoods.drop(columns=["updated_at", "created_at"]).explore(
    column="label",
    cmap="Set2",
    categorical=True,
    tiles="CartoDB positron",
)
Make this Notebook Trusted to load map: File -> Trust Notebook
This map of the neighborhoods by cluster group highlights the high number of neighborhoods missing from the description alone, but also shows the difficulty of differentiating clusters whose mean amenity-type ratios are fairly similar. More granularity — which geocoding could provide — would serve this analysis well.