Data Preparation

Author

Anna Duan

Code
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import altair as alt
from sklearn.cluster import KMeans
import re
from wordcloud import WordCloud


# Show all rows and columns in dataframes, with full column width
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

np.seterr(invalid="ignore");
Code
# Philadelphia boundary, neighborhoods, and census tracts
phl_bound = gpd.read_file("data/Yelp API/phl_bound.geojson")
phl_neigh = gpd.read_file("https://raw.githubusercontent.com/opendataphilly/open-geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson")
phl_tract = gpd.read_file("data/Yelp API/census-tracts.geojson")
Code
phl_bound_proj = phl_bound.to_crs(2272)
phl_neigh_proj = phl_neigh.to_crs(2272)
phl_tract_proj = phl_tract.to_crs(2272)
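
The area calculations below depend on the linear unit of this projection, so it's worth confirming what EPSG:2272 (NAD83 / Pennsylvania South) actually measures in. A quick check via the pyproj CRS object that geopandas exposes:

Code
# Confirm the linear unit of the projected CRS (should report US survey feet)
for axis in phl_tract_proj.crs.axis_info:
    print(axis.name, axis.unit_name)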

I concatenate neighborhood names and tract IDs to create 384 sub-neighborhoods that still have recognizable neighborhood names.

Code
# Copying the DataFrame
tract = phl_tract_proj.copy()

# Concatenating 'neighborhood_name' and 'tract_id' into a new column 'nb_name'
tract['nb_name'] = tract['neighborhood_name'] + " " + tract['tract_id'].astype(str)

tract["nb_name"].head()
0    Center City East 01
1    Center City East 02
2    Center City West 01
3    Center City West 02
4    Center City West 03
Name: nb_name, dtype: object
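
As a quick sanity check, the number of unique sub-neighborhood names should match the 384 mentioned above:

Code
# Expect 384, one name per sub-neighborhood
tract['nb_name'].nunique()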

I calculate the area of each sub-neighborhood so that I can later compute amenity counts per square mile.

Code
tract = tract.copy()
# Calculate area in square feet (EPSG:2272 uses US survey feet)
tract['area_ft2'] = tract['geometry'].area

# Convert area to square miles (1 square mile = 27,878,400 square feet)
tract['area_mi2'] = tract['area_ft2'] / 27878400
Code
tract = tract[["nb_name", "area_mi2", "geometry"]]
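
A quick plausibility check on the unit conversion: the sub-neighborhood areas should sum to roughly Philadelphia's land area of about 134 square miles (somewhat more if the tracts include water).

Code
# Total area across all sub-neighborhoods, in square miles
round(tract['area_mi2'].sum(), 1)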

I repeat the process for neighborhoods, first selecting and renaming the relevant columns from the projected neighborhood object.

Code
neigh = phl_neigh_proj[["mapname", "geometry"]].copy()
neigh["nb_name"] = neigh["mapname"]

I then calculate the area of each neighborhood in the same way.

Code
# Calculate area in square feet (EPSG:2272 uses US survey feet)
neigh['area_ft2'] = neigh['geometry'].area

# Convert area to square miles (1 square mile = 27,878,400 square feet)
neigh['area_mi2'] = neigh['area_ft2'] / 27878400

neigh = neigh[["area_mi2", "nb_name", "geometry"]]

Read in amenity files

Code
# amenities
restaurants = gpd.read_file("data/Yelp API/restaurants.geojson")
parks = gpd.read_file("data/Yelp API/parks.geojson")
education = gpd.read_file("data/Yelp API/education.geojson")
grocery = gpd.read_file("data/Yelp API/grocery.geojson")
entertainment = gpd.read_file("data/Yelp API/entertainment.geojson")
nightlife = gpd.read_file("data/Yelp API/nightlife.geojson")
shopping = gpd.read_file("data/Yelp API/shopping.geojson")
healthcare = gpd.read_file("data/Yelp API/healthcare.geojson")
historic = gpd.read_file("data/Yelp API/historic_landmarks.geojson")
kids = gpd.read_file("data/Yelp API/kids.geojson")
arts = gpd.read_file("data/Yelp API/perf_arts.geojson")
beauty = gpd.read_file("data/Yelp API/beauty.geojson")

Data Cleaning

Restaurants

Code
# Create a copy to avoid SettingWithCopyWarning
food_desc = restaurants.copy()

# Split the alias into up to three description columns
food_desc["desc_1"] = food_desc['alias'].str.split(',').str[0].str.strip().str.lower()
food_desc["desc_2"] = food_desc['alias'].str.split(',').str[1].str.strip().str.lower()
food_desc["desc_3"] = food_desc['alias'].str.split(',').str[2].str.strip().str.lower()
Code
len(food_desc)
2697

First word in “alias”

Code
# Group by 'desc_1' and count, then convert to DataFrame
food_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
108 pizza 320
36 chinese 198
118 restaurants 182
45 delis 161
124 seafood 101
17 breakfast_brunch 88
77 hotdogs 84
100 newamerican 81
122 sandwiches 77
58 fooddeliveryservices 75

Second word in “alias”

Code
# Group by 'desc_2' and count, then convert to DataFrame
food_desc.groupby("desc_2").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_2 count
111 sandwiches 120
12 breakfast_brunch 86
64 hotdogs 79
17 burgers 78
112 seafood 73
28 chicken_wings 66
131 tradamerican 65
72 italian 44
98 pizza 42
27 cheesesteaks 41

Third word in “alias”

Code
# Group by 'desc_3' and count, then convert to DataFrame
food_desc.groupby("desc_3").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_3 count
83 sandwiches 124
97 tradamerican 61
20 chicken_wings 41
50 hotdogs 39
10 breakfast_brunch 38
13 burgers 33
25 coffee 30
31 delis 26
84 seafood 26
49 hotdog 23

Replace description 1 with description 2 when description 1 is irrelevant

Terms like "restaurants" and "fooddeliveryservices" don't tell us what kind of food is served. Let's replace these generic descriptions with something more informative.

To do this, we find restaurants whose first alias term isn't descriptive and shift the remaining terms up: first desc_2 is replaced with desc_3, then desc_1 with desc_2.

Code
# replace desc_2 with desc_3 where desc_2 is generic
condition1 = food_desc['desc_2'].isin(["food trucks", "foodtrucks", "fooddeliveryservices", "delis", "bars", "grocery", "pubs", "catering", "convenience",
                                       "sportsbars", "popuprestaurants", "restaurants", "foodstands", "comfortfood", "food_court", "lounges", "breweries",
                                       "wine_bars", "bowling", "meats", "newamerican", "tradamerican", "american", "buffets"])

condition2 = food_desc['desc_3'].notna()

# Update desc_2 where conditions are met
food_desc.loc[condition1 & condition2, 'desc_2'] = food_desc['desc_3']


# replace desc_1 with desc_2 where desc_1 is generic
condition1 = food_desc['desc_1'].isin(["food trucks", "foodtrucks", "fooddeliveryservices", "delis", "bars", "grocery", "pubs", "catering", "convenience",
                                       "sportsbars", "popuprestaurants", "restaurants", "foodstands", "comfortfood", "food_court", "lounges", "breweries",
                                       "wine_bars", "bowling", "meats", "newamerican", "tradamerican", "american", "buffets"])

condition2 = food_desc['desc_2'].notna()

# Update desc_1 where conditions are met
food_desc.loc[condition1 & condition2, 'desc_1'] = food_desc['desc_2']

food_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head()
desc_1 count
111 pizza 339
37 chinese 203
124 restaurants 182
128 sandwiches 150
130 seafood 126

Unfortunately this didn't replace "restaurants": businesses whose alias begins with "restaurants" usually have no other descriptive terms, which suggests "restaurants" is the default label. This calls for another approach.

Code
food_desc[food_desc["desc_1"] == "restaurants"].head(5)
name rating address alias title geometry desc_1 desc_2 desc_3
8 Smitty's Restaurant 0.0 5405 Chester Ave restaurants Restaurants POINT (2676904.285 231029.670) restaurants NaN NaN
10 Kan Yuen Ping 0.0 5428 Chester Ave restaurants Restaurants POINT (2676791.167 230669.291) restaurants NaN NaN
11 Silver Place 0.0 1500 S 58th St restaurants Restaurants POINT (2673976.157 231063.740) restaurants NaN NaN
14 Yuan East Restaurant 0.0 1832 S 58th St restaurants Restaurants POINT (2675345.361 229583.321) restaurants NaN NaN
17 Wei Hua 0.0 5140 Chester Ave restaurants Restaurants POINT (2677863.415 231724.048) restaurants NaN NaN

By inspecting the names of restaurants lacking descriptions, we can parse out some common categories.

Code
restaurant_cloud = food_desc[food_desc["desc_1"] == "restaurants"]

# Concatenate all text in the column
text = ' '.join(restaurant_cloud['name'].dropna())

# Create the word cloud
wordcloud = WordCloud(width=800, height=400, background_color ='white').generate(text)

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Code
# Define the keyword-to-category mapping (searched against restaurant names)
keyword_mapping = {
    'yuen': 'chinese',
    'ping': 'chinese',
    'zhong': 'chinese',
    'china': 'chinese',
    'chinese': 'chinese',
    'canton': 'chinese',
    'wok': 'chinese',
    'lee': 'chinese',
    'zhang': 'chinese',
    'chung': 'chinese',
    'dragon': 'chinese',
    'meng': 'chinese',
    'xin': 'chinese',
    'hua': 'chinese',
    'yi': 'chinese',
    'xi': 'chinese',
    'lam': 'chinese',
    'wei': 'chinese',
    'salad': 'salad',
    'pizza': 'pizza',
    'grill': 'grill',
    'steak': 'grill',
    'sushi': 'japanese',
    'chicken': 'chicken',
    'pizzeria': 'pizza',
    'bbq': 'bbq',
    'hoagie': 'sandwich',
    'steaks': 'sandwich',
    'coffee': 'cafe',
    'cafe': 'cafe',
    'diner': 'american',
    'luncheonette': 'american',
    'wings': 'wings',
    'deli': 'deli',
    'mexican': 'latin',
    'guadalupana': 'latin',
    'guatemalteco': 'latin',
    'fish': 'seafood',
    'casa': 'latin'}

# Function to find the keyword and return the corresponding desc_2 value
def map_keyword_to_desc(name, mapping):
    for keyword, desc in mapping.items():
        if re.search(re.escape(keyword), name, re.IGNORECASE):
            return desc
    return None

# Define the list of specific values
specific_values = ["food trucks", "fooddeliveryservices", "bars", "grocery", "restaurants"]

# Define the combined condition: desc_2 is NA or desc_1 is one of the generic values
condition_combined = food_desc['desc_2'].isna() | food_desc['desc_1'].isin(specific_values)

# Apply the function to update desc_1 based on the combined condition
food_desc.loc[condition_combined, 'desc_1'] = food_desc.loc[condition_combined, 'name'].apply(lambda x: map_keyword_to_desc(x, keyword_mapping))
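
For example, applying the function to two of the unlabeled names shown earlier:

Code
# Example lookups against the keyword mapping
print(map_keyword_to_desc("Wei Hua", keyword_mapping))        # chinese
print(map_keyword_to_desc("Kan Yuen Ping", keyword_mapping))  # chinese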

Identify fast food/chain restaurants

Override desc_1 with “fastfood” for major chain restaurants.

Code
food_desc.groupby("name").size().sort_values(ascending = False).head(10)
name
Wawa                         28
McDonald's                   25
Popeyes Louisiana Kitchen    19
Wendy's                      17
Philly Pretzel Factory       13
KFC                          13
Subway                       10
Wingstop                      9
Taco Bell                     9
Crown Fried Chicken           8
dtype: int64
Code
fast_food = [
    "McDonald's",
    "Wawa", 
    "Wingstop",
    "MrBeast Burger",
    "Chick-fil-A",
    "Chipotle Mexican Grill",
    "Checkers",
    "Wendy's",
    "Subway",
    "Popeyes Louisiana Kitchen",
    "Taco Bell",
    "KFC",
    "Burger King",
]
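
Note that this matches Yelp's canonical names exactly, so spelling variants would slip through. A quick check for one chain (a sketch, not part of the cleaning pipeline):

Code
# Names containing "mcdonald" that the exact-match list above would miss
food_desc.loc[~food_desc["name"].isin(fast_food) &
              food_desc["name"].str.contains("mcdonald", case=False, na=False), "name"].unique()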
Code
food_desc.loc[food_desc["name"].isin(fast_food), "desc_1"] = "fastfood"

Replace descriptions that appear fewer than 6 times with the second term in the alias

Code
desc_counts = food_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)

# Filter for 'desc_1' values where count is less than 6
desc_less_than_six = desc_counts[desc_counts['count'] < 6]['desc_1']

# Convert the filtered results to a list
desc_list = desc_less_than_six.tolist()

# replace desc_1 with desc_2 where desc_1 is in the list of descriptions with frequency < 6
food_desc.loc[food_desc['desc_1'].isin(desc_list), 'desc_1'] = food_desc['desc_2']
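
One side effect worth checking: where desc_2 is also missing, this replacement sets desc_1 to NaN, and those rows are dropped by the category filter below.

Code
# Rows whose desc_1 became NaN because desc_2 was missing
food_desc['desc_1'].isna().sum()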
Code
food_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
desc_1 count
83 pizza 327
29 chinese 179
47 fastfood 162
95 sandwiches 117
96 seafood 92
11 breakfast_brunch 81
38 deli 70
15 burgers 55
17 cafe 52
65 italian 42
27 chicken_wings 38
51 grill 37
94 salad 37
97 soulfood 37
5 bakeries 36
31 coffee 33
76 mexican 30
66 japanese 27
111 tradamerican 24
18 cafes 24
25 cheesesteaks 24
57 hotdogs 23
75 mediterranean 23
23 caribbean 22
104 sushi 20
26 chicken 19
53 halal 19
42 diners 18
7 bbq 17
68 juicebars 16
72 latin 16
30 cocktailbars 15
40 desserts 14
61 indpak 13
69 korean 13
116 vietnamese 13
115 venues 13
4 asianfusion 12
60 icecream 12
3 american 12
87 pretzels 12
39 delis 12
77 mideastern 11
102 steak 11
109 thai 11
50 gastropubs 10
114 vegetarian 9
9 beerbar 9
1 african 9
41 dimsum 9
105 tacos 9
19 cajun 9
113 vegan 8
90 ramen 8
120 wings 7
45 donuts 7
81 noodles 7
28 chickenshop 6
80 newamerican 6
101 sportsbars 4
22 cantonese 3
93 russian 3
8 beer_and_wine 3
74 markets 2
100 spanish 2
119 whiskeybars 2
79 musicvenues 2
82 pancakes 2
89 pubs 2
84 polish 2
0 acaibowls 2
121 wraps 2
49 french 2
24 cheese 2
70 kosher 2
63 irish 2
37 danceclubs 2
43 divebars 2
59 iberian 2
56 hotdog 2
46 falafel 2
103 streetvendors 1
21 candy 1
106 tapas 1
107 tapasmallplates 1
108 tex-mex 1
20 cambodian 1
16 burmese 1
110 themedcafes 1
33 convenience 1
112 ukrainian 1
14 buffets 1
13 bubbletea 1
12 brewpubs 1
10 brazilian 1
117 waffles 1
118 waterstores 1
6 bars 1
2 airportlounges 1
32 comfortfood 1
71 laotian 1
34 cookingclasses 1
52 haitian 1
73 lounges 1
67 jazzandblues 1
64 irish_pubs 1
62 intlgrocery 1
78 modern_european 1
58 hotpot 1
55 healthmarkets 1
54 hawaiian 1
85 popuprestaurants 1
99 southern 1
86 popupshops 1
88 publicmarkets 1
48 fooddeliveryservices 1
91 reiki 1
92 restaurants 1
44 dominican 1
35 creperies 1
98 soup 1
36 customcakes 1
Code
food_cats = food_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
top_food_cats = list(food_cats[food_cats["count"] >= 12]["desc_1"])
food_desc = food_desc[food_desc["desc_1"].isin(top_food_cats)]

Sort into larger categories

Code
# Define the mapping dictionary
desc_mapping = {
    "burgers": "sandwiches/delis",
    "hotdogs": "sandwiches/delis",
    "chicken_wings": "chicken",
    "tradamerican": "american",
    "newamerican": "american",
    "wings": "chicken",
    "hotdog": "diner/grill",
    "gastropubs": "diner/grill",
    "diners": "diner/grill",
    "steak": "diner/grill",
    "grills": "diner/grill",
    "grill": "diner/grill",
    "coffee": "bakeries/cafes",
    'cafe': "bakeries/cafes",
    "cafes": "bakeries/cafes",
    'bagels': "bakeries/cafes",
    'wraps': "bakeries/cafes",
    'soups': "bakeries/cafes",
    'soup': "bakeries/cafes",
    "cupcakes": "bakeries/cafes",
    "creperies": "bakeries/cafes",
    "bakeries": "bakeries/cafes",
    "juicebars": "bakeries/cafes",
    'donuts': "bakeries/cafes",
    'icecream': "bakeries/cafes",
    "ice cream": 'bakeries/cafes',
    "desserts": "bakeries/cafes",
    "waffles": "bakeries/cafes",
    "cakeshop": "bakeries/cafes",
    'customcakes': 'bakeries/cafes',
    "cheesesteaks": "sandwiches/delis",
    "sandwich": "sandwiches/delis",
    "sandwiches": "sandwiches/delis",
    "delis": "sandwiches/delis",
    "deli": "sandwiches/delis",
    "chickenshop": "chicken",
    "sushi": "japanese",
    "ramen": "japanese",
    'hotpot': 'chinese',
    "szechuan": "chinese",
    "cantonese": "chinese",
    "dimsum": "chinese",
    "shanghainese": "chinese",
    "pretzels": "bakeries/cafes",
    "mexican": "latin",
    "honduran": "latin",
    "nicaraguan": "latin",
    "spanish": "latin",
    "guatemalan": "latin",
    "tacos": "latin",
    "brazilian": "latin",
    "portuguese": "latin",
    "dominican": "caribbean",
    "haitian": "caribbean",
    "seafoodmarket,": "seafood",
    "seafoodmarkets,": "seafood",
    "greek": "mediterranean",
    "mideastern": "halal"
}

# Replace desc_1 values using the mapping dictionary
food_desc['desc_1'] = food_desc['desc_1'].replace(desc_mapping)
Code
food_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
desc_1 count
18 pizza 327
20 sandwiches/delis 301
2 bakeries/cafes 199
7 chinese 179
10 fastfood 162
21 seafood 92
4 breakfast_brunch 81
6 chicken 57
9 diner/grill 55
14 japanese 47
16 latin 46
13 italian 42
22 soulfood 37
19 salad 37
0 american 36
17 mediterranean 23
5 caribbean 22
11 halal 19
3 bbq 17
8 cocktailbars 15
23 venues 13
12 indpak 13
15 korean 13
24 vietnamese 13
1 asianfusion 12
Code
len(food_desc)
1858

Parks

Code
# Create a copy to avoid SettingWithCopyWarning
parks_desc = parks.copy()

# Split the alias into up to three description columns
parks_desc["desc_1"] = parks_desc['alias'].str.split(',').str[0].str.strip().str.lower()
parks_desc["desc_2"] = parks_desc['alias'].str.split(',').str[1].str.strip().str.lower()
parks_desc["desc_3"] = parks_desc['alias'].str.split(',').str[2].str.strip().str.lower()

First word in ‘alias’

Code
# Group by 'desc_1' and count, then convert to DataFrame
parks_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(100)
desc_1 count
19 parks 150
15 landmarks 36
21 playgrounds 25
7 dog_parks 23
13 hiking 17
24 recreation 9
12 gardens 7
25 skate_parks 6
28 tennis 4
11 football 4
3 basketballcourts 2
6 discgolf 2
14 kids_activities 2
17 museums 2
18 nonprofit 2
23 rafting 2
20 pets 1
26 stadiumsarenas 1
22 publicart 1
27 swimmingpools 1
0 amateursportsteams 1
16 localflavor 1
1 bars 1
10 fishing 1
9 farms 1
8 farmersmarket 1
5 climbing 1
4 bikes 1
2 baseballfields 1
29 theater 1

Filter for the most common categories

Code
park_desc_1 = parks_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(9)
top_park_desc = list(park_desc_1["desc_1"])
# remove historical landmarks
top_park_desc.remove('landmarks')

# filter for obs in the list of top descriptions
parks_desc = parks_desc[parks_desc["desc_1"].isin(top_park_desc)]
parks_desc.head(10)
name rating address alias title geometry desc_1 desc_2 desc_3
0 Cedar Park 4.5 50th St & Baltimore Ave parks Parks POINT (2677006.643 234133.514) parks NaN NaN
1 Chester Avenue Dog Park 3.5 801 S 48th St dog_parks Dog Parks POINT (2678024.964 234174.704) dog_parks NaN NaN
2 Malcolm X Park 4.0 51ST And Pine St parks Parks POINT (2676187.013 235721.062) parks NaN NaN
3 Barkan Park 3.0 4936 Spruce St parks, playgrounds Parks, Playgrounds POINT (2677109.530 236278.545) parks playgrounds NaN
4 Clark Park Dog Bowl 4.5 43rd & Chester dog_parks Dog Parks POINT (2680815.785 234312.734) dog_parks NaN NaN
5 Clark Park 4.5 43RD And Baltimore parks Parks POINT (2680756.007 234729.865) parks NaN NaN
6 Greys Ferry Cresent Skatepark 5.0 3600 Grays Ferry Ave skate_parks Skate Parks POINT (2683004.637 231400.562) skate_parks NaN NaN
7 Grays Ferry Crescent 4.0 parks Parks POINT (2682813.782 232182.835) parks NaN NaN
8 Lanier Dog Park 4.0 2911 Tasker St dog_parks Dog Parks POINT (2685221.027 229056.524) dog_parks NaN NaN
9 Saunder's Green 3.5 300-50 Saunders Ave parks Parks POINT (2683400.920 238616.685) parks NaN NaN

Education

Code
# Create a copy to avoid SettingWithCopyWarning
edu_desc = education.copy()

# Split the alias into up to three description columns
edu_desc["desc_1"] = edu_desc['alias'].str.split(',').str[0].str.strip().str.lower()
edu_desc["desc_2"] = edu_desc['alias'].str.split(',').str[1].str.strip().str.lower()
edu_desc["desc_3"] = edu_desc['alias'].str.split(',').str[2].str.strip().str.lower()

First word in alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
edu_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(20)
desc_1 count
14 childcare 91
29 elementaryschools 86
66 preschools 62
17 collegeuniv 55
43 highschools 33
26 driving_schools 31
28 educationservices 25
75 specialtyschools 23
81 theater 17
6 artschools 15
19 cosmetology_schools 14
83 tutoring 13
3 artclasses 13
27 education 12
22 dance_schools 10
86 vocation 9
23 dancestudio 8
68 privatetutors 7
49 montessori 7
20 cprclasses 7

Filter for the most common categories

Code
edu_desc_1 = edu_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(18)
top_edu_desc = list(edu_desc_1["desc_1"])
# remove theater, which is not an education amenity
top_edu_desc.remove('theater')

# filter for obs in the list of top descriptions
edu_desc = edu_desc[edu_desc["desc_1"].isin(top_edu_desc)]
edu_desc.head(10)
name rating address alias title geometry desc_1 desc_2 desc_3
0 BellyWise 5.0 NaN midwives, specialtyschools Midwives, Specialty Schools POINT (2676169.939 231988.984) midwives specialtyschools NaN
1 Limitless CPR 5.0 NaN cprclasses CPR Classes POINT (2677042.522 231394.414) cprclasses NaN NaN
2 Michele Judge 5.0 4919 Pentridge St jewelryrepair, artclasses, jewelry Jewelry Repair, Art Classes, Jewelry POINT (2677304.635 233898.273) jewelryrepair artclasses jewelry
3 Kipp West Philadelphia Prep Charter 1.0 5900 Baltimore Ave specialed Special Education POINT (2671666.162 232636.864) specialed NaN NaN
4 Beulah Baptist Christian Day School 2.0 5001 Spruce St elementaryschools, preschools Elementary Schools, Preschools POINT (2677065.563 236331.076) elementaryschools preschools NaN
5 The Academy of Industrial Arts 5.0 6328 Paschall Ave vocation, electricians Vocational & Technical School, Electricians POINT (2674851.743 225884.662) vocation electricians NaN
6 TYLII 3.0 5124 Walnut St preschools, childcare Preschools, Child Care & Day Care POINT (2676415.568 237130.841) preschools childcare NaN
7 University of the Sciences 3.0 600 S 43rd St collegeuniv Colleges & Universities POINT (2681062.408 233555.127) collegeuniv NaN NaN
8 All Around This World 5.0 4336 Pine St educationservices Educational Services POINT (2680347.395 235381.376) educationservices NaN NaN
9 Jubilee School 3.0 4211 Chester Ave highschools Middle Schools & High Schools POINT (2681043.809 234475.721) highschools NaN NaN

Grocery

Code
# Create a copy to avoid SettingWithCopyWarning
grocery_desc = grocery.copy()

# Split the alias into up to three description columns
grocery_desc["desc_1"] = grocery_desc['alias'].str.split(',').str[0].str.strip().str.lower()
grocery_desc["desc_2"] = grocery_desc['alias'].str.split(',').str[1].str.strip().str.lower()
grocery_desc["desc_3"] = grocery_desc['alias'].str.split(',').str[2].str.strip().str.lower()
Code
grocery_desc.head()
name rating address alias title geometry desc_1 desc_2 desc_3
0 Kim A Grocery & Deli 1.0 1840 S 58th St grocery Grocery POINT (2675396.135 229529.164) grocery NaN NaN
1 S and J Seafood 4.0 713 S 52nd St seafoodmarkets, grocery Seafood Markets, Grocery POINT (2675793.250 234207.384) seafoodmarkets grocery NaN
2 Mariposa Food Co-op 4.0 4824 Baltimore Ave grocery, healthmarkets Grocery, Health Markets POINT (2677563.153 234052.277) grocery healthmarkets NaN
3 Jennifer Grocery 2.5 4824 Chester Ave grocery, convenience Grocery, Convenience Stores POINT (2678754.671 232771.760) grocery convenience NaN
4 Fu-Wah Mini Market 4.5 810 S 47th St convenience, sandwiches, grocery Convenience Stores, Sandwiches, Grocery POINT (2678571.989 234207.924) convenience sandwiches grocery

Nightlife

Code
# Create a copy to avoid SettingWithCopyWarning
night_desc = nightlife.copy()

# Split the alias into up to three description columns
night_desc["desc_1"] = night_desc['alias'].str.split(',').str[0].str.strip().str.lower()
night_desc["desc_2"] = night_desc['alias'].str.split(',').str[1].str.strip().str.lower()
night_desc["desc_3"] = night_desc['alias'].str.split(',').str[2].str.strip().str.lower()
Code
night_desc.head()
name rating address alias title geometry desc_1 desc_2 desc_3
0 Pentridge Station Beer Garden 4.5 5116 Pentridge St beergardens Beer Gardens POINT (2676581.781 233167.788) beergardens NaN NaN
1 720 Bistro 4.0 720 S.52nd st seafood, breakfast_brunch, cocktailbars Seafood, Breakfast & Brunch, Cocktail Bars POINT (2675620.054 234102.746) seafood breakfast_brunch cocktailbars
2 Bayou 4.5 5025 Baltimore Ave lounges Lounges POINT (2676525.799 234061.840) lounges NaN NaN
3 Booker's Restaurant and Bar 3.5 5021 Baltimore Ave bars, breakfast_brunch, tradamerican Bars, Breakfast & Brunch, American POINT (2676559.222 234070.085) bars breakfast_brunch tradamerican
4 The Wine Garden 4.5 5019 Baltimore Ave wine_bars Wine Bars POINT (2676563.551 234060.148) wine_bars NaN NaN

Alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
night_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
6 bars 172
86 sportsbars 78
75 pubs 71
34 divebars 53
61 lounges 49
100 tradamerican 49
67 newamerican 47
23 cocktailbars 39
47 hookah_bars 28
66 musicvenues 28
Code
# Group by 'desc_2' and count, then convert to DataFrame
night_desc.groupby("desc_2").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_2 count
7 bars 93
103 tradamerican 61
93 sportsbars 48
25 cocktailbars 43
69 newamerican 39
62 lounges 28
86 seafood 28
68 musicvenues 25
80 pubs 24
11 beerbar 19
Code
# Group by 'desc_3' and count, then convert to DataFrame
night_desc.groupby("desc_3").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_3 count
17 cocktailbars 51
70 tradamerican 39
3 bars 24
8 breakfast_brunch 21
6 beerbar 20
72 venues 17
10 burgers 17
56 sandwiches 15
48 musicvenues 12
62 sportsbars 12

Replace desc_1 with more relevant alias terms

Code
# replace desc_2 with desc_3 where desc_2 is a food term rather than a nightlife category
condition1 = night_desc['desc_2'].isin(["newamerican", "seafood", "breakfast_brunch", "tradamerican", "burgers", "sandwiches", "pizza", "mexican", "italian",
                                       "french", "ramen", "tapasmallplates", "french", "ramen", "mideastern", "comfortfood", "steak", "mediterranean", "caribbean",
                                       "bbq", "peruvian", "japanese", "ethiopian", "tex-mex", "southern", "juicebars", "cajun", "soulfood", "portugeuse", "sushi",
                                       "vegan", "cafes", "diners", "chicken-wings", "chinese", "salad", "cheesesteaks", "african", "german", "desserts", "indpak",
                                       "thai", "tacos", "falafel", "vietnamese", "waffles", "delis", "coffee", "korean", "hotdog", "moroccan", "newmexican", "cuban",
                                       "argentine", "gluten_free", "russian", "bakeries", "portuguese", "spanish", "latin", "catering", "gastropubs", 
                                       "breweries", "gaybars", "whiskeybars", "tikibars", "tobaccoshops", "beer_and_wine", "breweries", "venues",
                                       "poolhalls", "jazzandblues", "airportlounges", "cigarbars", "speakeasies", "theater", "nightlife"])

condition2 = night_desc['desc_3'].notna()

# Update desc_2 where conditions are met
night_desc.loc[condition1 & condition2, 'desc_2'] = night_desc['desc_3']


# replace desc_1 with desc_2 where desc_1 is a food term rather than a nightlife category
condition1 = night_desc['desc_1'].isin(["newamerican", "seafood", "breakfast_brunch", "tradamerican", "burgers", "sandwiches", "pizza", "mexican", "italian",
                                       "french", "ramen", "tapasmallplates", "french", "ramen", "mideastern", "comfortfood", "steak", "mediterranean", "caribbean",
                                       "bbq", "peruvian", "japanese", "ethiopian", "tex-mex", "southern", "juicebars", "cajun", "soulfood", "portugeuse", "sushi",
                                       "vegan", "cafes", "diners", "chicken-wings", "chinese", "salad", "cheesesteaks", "african", "german", "desserts", "indpak",
                                       "thai", "tacos", "falafel", "vietnamese", "waffles", "delis", "coffee", "korean", "hotdog", "moroccan", "newmexican", "cuban",
                                       "argentine", "gluten_free", "russian", "bakeries", "portuguese", "spanish", "latin", "catering", "gastropubs", 
                                       "breweries", "gaybars", "whiskeybars", "tikibars", "tobaccoshops", "beer_and_wine", "breweries", "venues",
                                       "poolhalls", "jazzandblues", "airportlounges", "cigarbars", "speakeasies", "theater", "nightlife"])

condition2 = night_desc['desc_2'].notna()

# Update desc_1 where conditions are met
night_desc.loc[condition1 & condition2, 'desc_1'] = night_desc['desc_2']

night_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(15)
desc_1 count
4 bars 271
51 sportsbars 114
12 cocktailbars 105
45 pubs 86
39 lounges 64
20 divebars 60
6 beerbar 44
61 wine_bars 42
40 musicvenues 42
31 hookah_bars 36
17 danceclubs 32
34 irish_pubs 14
37 karaoke 13
7 beergardens 12
0 airportlounges 9

Filter for the most common categories

Code
night_desc_1 = night_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(9)
top_night_desc = list(night_desc_1["desc_1"])

# filter for obs in the list of top descriptions
night_desc = night_desc[night_desc["desc_1"].isin(top_night_desc)]
night_desc.head(10)
name rating address alias title geometry desc_1 desc_2 desc_3
1 720 Bistro 4.0 720 S.52nd st seafood, breakfast_brunch, cocktailbars Seafood, Breakfast & Brunch, Cocktail Bars POINT (2675620.054 234102.746) cocktailbars cocktailbars cocktailbars
2 Bayou 4.5 5025 Baltimore Ave lounges Lounges POINT (2676525.799 234061.840) lounges NaN NaN
3 Booker's Restaurant and Bar 3.5 5021 Baltimore Ave bars, breakfast_brunch, tradamerican Bars, Breakfast & Brunch, American POINT (2676559.222 234070.085) bars tradamerican tradamerican
4 The Wine Garden 4.5 5019 Baltimore Ave wine_bars Wine Bars POINT (2676563.551 234060.148) wine_bars NaN NaN
5 Carbon Copy 4.5 701 S 50th St breweries, pizza, bars Breweries, Pizza, Bars POINT (2676997.957 233927.765) bars bars bars
6 Dock Street Cannery and Tasting Room 4.0 705 S 50th St beerbar, lounges Beer Bar, Lounges POINT (2677046.168 233912.738) beerbar lounges NaN
7 Trendsetters Bar & Lounge 3.5 5301 Woodland Ave burgers, lounges, wraps Burgers, Lounges, Wraps POINT (2678439.957 230236.502) lounges lounges wraps
8 The Barn 3.5 4901 Catharine St poolhalls, sportsbars, divebars Pool Halls, Sports Bars, Dive Bars POINT (2677365.738 234212.623) sportsbars sportsbars divebars
9 Eris Temple 4.0 602 S 52nd St musicvenues Music Venues POINT (2675702.055 234852.608) musicvenues NaN NaN
10 Dahlak 3.5 4708 Baltimore Ave ethiopian, divebars Ethiopian, Dive Bars POINT (2678487.076 234234.642) divebars divebars NaN

Entertainment

Code
# Create a copy to avoid SettingWithCopyWarning
entertain_desc = entertainment.copy()

# Split the alias into up to three description columns
entertain_desc["desc_1"] = entertain_desc['alias'].str.split(',').str[0].str.strip().str.lower()
entertain_desc["desc_2"] = entertain_desc['alias'].str.split(',').str[1].str.strip().str.lower()
entertain_desc["desc_3"] = entertain_desc['alias'].str.split(',').str[2].str.strip().str.lower()
Code
entertain_desc.head()
name rating address alias title geometry desc_1 desc_2 desc_3
0 Noam Osband 5.0 NaN musicians Musicians POINT (2676083.283 232370.886) musicians NaN NaN
1 Ceramic Concept 5.0 5015 Baltimore Ave galleries Art Galleries POINT (2676620.711 234054.592) galleries NaN NaN
2 Eris Temple 4.0 602 S 52nd St musicvenues Music Venues POINT (2675702.055 234852.608) musicvenues NaN NaN
3 Baltimore Avenue Dollar Stroll 5.0 4800 Baltimore Ave unofficialyelpevents, festivals Unofficial Yelp Events, Festivals POINT (2677842.004 234135.971) unofficialyelpevents festivals NaN
4 Curio Theatre Company 4.5 815 S 48th St theater Performing Arts POINT (2678192.432 234160.959) theater NaN NaN

Alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
entertain_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
130 theater 127
53 galleries 96
50 festivals 69
86 museums 42
42 djs 28
89 musicvenues 28
105 psychics 23
85 movietheaters 20
117 social_clubs 19
47 eventplanning 16
Code
entertain_desc.groupby("desc_2").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_2 count
45 galleries 32
80 musicvenues 25
116 theater 24
119 venues 19
35 dancestudio 14
39 eventplanning 11
34 danceclubs 10
75 movietheaters 10
4 arcades 9
14 bars 8
Code
entertain_desc.groupby("desc_3").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_3 count
59 musicvenues 13
92 venues 9
25 dance_schools 6
27 dancestudio 5
58 musicians 5
34 eventplanning 5
49 kids_activities 5
2 arcades 4
3 artclasses 4
89 theater 4

Replace desc_1 with more relevant alias terms

Code
# replace desc_2 with desc_3 where desc_2 is generic
condition1 = entertain_desc['desc_2'].isin(["arts", "musicians", "kids_activities", "venues", "bars", "nonprofit", "lounges"])

condition2 = entertain_desc['desc_3'].notna()

# Update desc_2 where conditions are met
entertain_desc.loc[condition1 & condition2, 'desc_2'] = entertain_desc['desc_3']


# replace desc_1 with desc_2 where desc_1 is generic
condition1 = entertain_desc['desc_1'].isin(["arts", "musicians", "kids_activities", "venues", "bars", "nonprofit", "lounges"])

condition2 = entertain_desc['desc_2'].notna()

# Update desc_1 where conditions are met
entertain_desc.loc[condition1 & condition2, 'desc_1'] = entertain_desc['desc_2']

entertain_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(40)
desc_1 count
131 theater 130
55 galleries 98
52 festivals 70
87 museums 42
90 musicvenues 34
43 djs 29
118 social_clubs 24
106 psychics 23
86 movietheaters 21
48 eventplanning 17
39 danceclubs 16
35 countryclubs 11
57 gardens 10
7 artmuseums 10
104 psychic_astrology 9
40 dancestudio 9
95 paintandsip 9
125 stadiumsarenas 9
8 arts 8
17 bowling 8
5 arcades 8
100 photoboothrentals 8
124 sportsteams 7
6 artclasses 7
37 culturalcenter 7
32 comedyclubs 6
105 psychicmediums 6
91 newamerican 6
82 magicians 5
71 jazzandblues 5
27 catering 5
26 casinos 5
15 bingo 5
54 framing 4
122 specialtyschools 4
123 sportsbars 4
0 4
98 partyequipmentrentals 4
28 clowns 4
9 artsandcrafts 3

Filter for categories where frequency ≥ 5

Code
entertain_desc_1 = entertain_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
entertain_desc_1 = entertain_desc_1[entertain_desc_1['count'] >= 5]

top_entertain_desc = list(entertain_desc_1["desc_1"])


# remove psychics, which we don't count as entertainment
top_entertain_desc.remove('psychics')

# filter for obs in the list of top descriptions
entertain_desc = entertain_desc[entertain_desc["desc_1"].isin(top_entertain_desc)]
Code
entertain_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
desc_1 count
31 theater 130
16 galleries 98
15 festivals 70
21 museums 42
22 musicvenues 34
13 djs 29
28 social_clubs 24
20 movietheaters 21
14 eventplanning 17
11 danceclubs 16
9 countryclubs 11
2 artmuseums 10
17 gardens 10
12 dancestudio 9
24 paintandsip 9
30 stadiumsarenas 9
26 psychic_astrology 9
25 photoboothrentals 8
5 bowling 8
3 arts 8
0 arcades 8
10 culturalcenter 7
1 artclasses 7
29 sportsteams 7
23 newamerican 6
27 psychicmediums 6
8 comedyclubs 6
19 magicians 5
7 catering 5
6 casinos 5
18 jazzandblues 5
4 bingo 5
Code
# Define the mapping dictionary
desc_mapping = {
    "djs": "nightclubs",
    "danceclubs": "nightclubs",
    "artmuseums": "museums",
    "sportsteams": "sports",
    "stadiumsarenas": "sports",
    "paintandsip": "specialtyclasses",
    "dancestudio": "specialtyclasses",
    "bowling": "games/arcade",
    "bingo": "games/arcade",
    "arcades": "games/arcade",
    "artclasses": "specialtyclasses"
}

# Replace desc_1 values using the mapping dictionary
entertain_desc.loc[:, 'desc_1'] = entertain_desc['desc_1'].replace(desc_mapping)
Code
entertain_cat = entertain_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
entertain_cat_top = list(entertain_cat[entertain_cat["count"] >= 11]["desc_1"])
# filter for desc_1
entertain_desc = entertain_desc[entertain_desc["desc_1"].isin(entertain_cat_top)]
entertain_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
desc_1 count
12 theater 130
3 galleries 98
2 festivals 70
6 museums 52
8 nightclubs 45
7 musicvenues 34
10 specialtyclasses 25
9 social_clubs 24
4 games/arcade 21
5 movietheaters 21
1 eventplanning 17
11 sports 16
0 countryclubs 11

Shopping

Code
# Create a copy to avoid SettingWithCopyWarning
shop_desc = shopping.copy()

# Split the alias into up to three description columns
shop_desc["desc_1"] = shop_desc['alias'].str.split(',').str[0].str.strip().str.lower()
shop_desc["desc_2"] = shop_desc['alias'].str.split(',').str[1].str.strip().str.lower()
shop_desc["desc_3"] = shop_desc['alias'].str.split(',').str[2].str.strip().str.lower()
Code
shop_desc.head()
name rating address alias title geometry desc_1 desc_2 desc_3
0 Rite Aid 2.0 5214-30 Baltimore Ave drugstores, convenience Drugstores, Convenience Stores POINT (2675264.630 233732.096) drugstores convenience NaN
1 Ajah Creative Sweets And Treats & More 5.0 NaN desserts, cupcakes, giftshops Desserts, Cupcakes, Gift Shops POINT (2677939.712 232732.831) desserts cupcakes giftshops
2 Ceramic Concept 5.0 5015 Baltimore Ave galleries Art Galleries POINT (2676620.711 234054.592) galleries NaN NaN
3 VIX Emporium 4.5 5009 Baltimore Ave artsandcrafts, childcloth, jewelry Arts & Crafts, Children's Clothing, Jewelry POINT (2676676.410 234091.659) artsandcrafts childcloth jewelry
4 Michele Judge 5.0 4919 Pentridge St jewelryrepair, artclasses, jewelry Jewelry Repair, Art Classes, Jewelry POINT (2677304.635 233898.273) jewelryrepair artclasses jewelry

Alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
shop_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
32 drugstores 234
142 womenscloth 114
69 jewelry 90
31 discountstore 87
49 galleries 84
48 furniture 74
113 shoes 68
29 deptstores 56
86 menscloth 50
11 bookstores 41
Code
# Group by 'desc_2' and count, then convert to DataFrame
shop_desc.groupby("desc_2").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_2 count
24 convenience 95
0 accessories 67
139 womenscloth 66
81 menscloth 51
48 furniture 38
65 jewelry 35
100 pharmacy 34
35 discountstore 32
49 galleries 29
61 homedecor 27
Code
# Group by 'desc_3' and count, then convert to DataFrame
shop_desc.groupby("desc_3").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_3 count
21 convenience 55
0 accessories 30
115 womenscloth 28
54 homedecor 26
79 pharmacy 23
95 sportswear 18
92 shoes 18
56 jewelry 18
15 childcloth 16
108 vintage 15

Filter for most common labels

Code
shop_desc_1 = shop_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(20)
top_shop_desc = list(shop_desc_1["desc_1"])
top_shop_desc
['drugstores',
 'womenscloth',
 'jewelry',
 'discountstore',
 'galleries',
 'furniture',
 'shoes',
 'deptstores',
 'menscloth',
 'bookstores',
 'accessories',
 'thrift_stores',
 'antiques',
 'grocery',
 'giftshops',
 'homedecor',
 'selfstorage',
 'childcloth',
 'sportswear',
 'vintage']
Code
# remove drugstores and grocery
top_shop_desc.remove('drugstores')
top_shop_desc.remove('grocery')

# filter for obs in the list of top descriptions
shop_desc = shop_desc[shop_desc["desc_1"].isin(top_shop_desc)]
Code
# Define the mapping dictionary
desc_mapping = {
    "mattresses": "furniture",
    "kitchenandbath": "furniture",
    "rugs": "homedecor",
    "hats": "accessories",
    "watch_repair": "accessories"
}

# Replace desc_1 values using the mapping dictionary
shop_desc.loc[:, 'desc_1'] = shop_desc['desc_1'].replace(desc_mapping)
shop_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(20)
desc_1 count
17 womenscloth 114
10 jewelry 90
5 discountstore 87
7 galleries 84
6 furniture 74
13 shoes 68
4 deptstores 56
11 menscloth 50
2 bookstores 41
0 accessories 40
15 thrift_stores 36
1 antiques 31
8 giftshops 24
12 selfstorage 23
9 homedecor 23
3 childcloth 21
14 sportswear 20
16 vintage 19

Healthcare

Code
# Create a copy to avoid SettingWithCopyWarning
health_desc = healthcare.copy()

# Split the alias into up to three description columns
health_desc["desc_1"] = health_desc['alias'].str.split(',').str[0].str.strip().str.lower()
health_desc["desc_2"] = health_desc['alias'].str.split(',').str[1].str.strip().str.lower()
health_desc["desc_3"] = health_desc['alias'].str.split(',').str[2].str.strip().str.lower()

Alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
health_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
38 laboratorytesting 33
80 urgent_care 26
33 homehealthcare 25
43 medcenters 23
29 generaldentistry 21
35 hospitals 20
37 internalmed 19
21 drugstores 19
26 familydr 16
11 chiropractors 14

Filter for most common labels

Code
health_desc_1 = health_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(20)
top_health_desc = list(health_desc_1["desc_1"])
top_health_desc
Code
# filter for obs in the list of top descriptions
health_desc = health_desc[health_desc["desc_1"].isin(top_health_desc)]

Historic landmarks

Code
# Create a copy to avoid SettingWithCopyWarning
historic_desc = historic.copy()

# Split the alias into up to three description columns
historic_desc["desc_1"] = historic_desc['alias'].str.split(',').str[0].str.strip().str.lower()
historic_desc["desc_2"] = historic_desc['alias'].str.split(',').str[1].str.strip().str.lower()
historic_desc["desc_3"] = historic_desc['alias'].str.split(',').str[2].str.strip().str.lower()

Alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
historic_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
1 landmarks 46
4 museums 25
0 artmuseums 2
2 libraries 2
9 venues 2
3 municipality 1
5 nonprofit 1
6 postoffices 1
7 publicart 1
8 religiousorgs 1

Filter for most common labels

Code
historic_desc_1 = historic_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(3)
top_historic_desc = list(historic_desc_1["desc_1"])
top_historic_desc
['landmarks', 'museums', 'artmuseums']
Code
# filter for obs in the list of top descriptions
historic_desc = historic_desc[historic_desc["desc_1"].isin(top_historic_desc)]
Code
# Define the mapping dictionary
desc_mapping = {
    "artmuseums": "museums"
}

# Replace desc_1 values using the mapping dictionary
historic_desc['desc_1'] = historic_desc['desc_1'].replace(desc_mapping)
historic_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(20)
desc_1 count
0 landmarks 46
1 museums 27

Kids activities

Code
kids_desc = kids.copy()

# Split the alias into up to three description columns
kids_desc["desc_1"] = kids_desc['alias'].str.split(',').str[0].str.strip().str.lower()
kids_desc["desc_2"] = kids_desc['alias'].str.split(',').str[1].str.strip().str.lower()
kids_desc["desc_3"] = kids_desc['alias'].str.split(',').str[2].str.strip().str.lower()

kids_desc.head()
name rating address alias title geometry desc_1 desc_2 desc_3
0 Makono African Hair Braiding 3.0 5529 Baltimore Ave hairstylists, cosmetics Hair Stylists, Cosmetics & Beauty Supply POINT (2673677.520 233714.490) hairstylists cosmetics NaN
1 Carbon Copy 4.5 701 S 50th St breweries, pizza, bars Breweries, Pizza, Bars POINT (2676997.957 233927.765) breweries pizza bars
2 Firehouse Bicycles 4.5 701 S 50th St bikes, bike_repair_maintenance Bikes, Bike Repair/Maintenance POINT (2677005.367 233951.502) bikes bike_repair_maintenance NaN
3 Common Beat Music 4.0 4916 Baltimore Ave vinyl_records, electronicsrepair, musicinstrumentservices Vinyl Records, Electronics Repair, Musical Instrument Services POINT (2677188.738 233998.088) vinyl_records electronicsrepair musicinstrumentservices
4 Cedar Park 4.5 50th St & Baltimore Ave parks Parks POINT (2677006.643 234133.514) parks NaN NaN

Alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
kids_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
45 childcare 92
232 parks 53
131 generaldentistry 39
87 drugstores 36
141 gyms 34
255 preschools 29
311 theater 26
95 elementaryschools 26
189 libraries 26
107 festivals 25
Code
kids_desc = kids_desc[kids_desc["desc_1"].isin(["childcare", "parks", "preschools", "theater", "libraries", "festivals", "museums", 
                                                "martialarts", "churches", "playgrounds", "musicalinstrumentsandteachers", "landmarks", 
                                                "kids_activities"])]
Code
kids_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False)
desc_1 count
0 childcare 92
9 parks 53
11 preschools 29
5 libraries 26
12 theater 26
2 festivals 25
7 museums 19
1 churches 18
6 martialarts 18
10 playgrounds 18
8 musicalinstrumentsandteachers 17
4 landmarks 15
3 kids_activities 13

Art and music

Code
arts_desc = arts.copy()

# Split the alias into up to three description columns
arts_desc["desc_1"] = arts_desc['alias'].str.split(',').str[0].str.strip().str.lower()
arts_desc["desc_2"] = arts_desc['alias'].str.split(',').str[1].str.strip().str.lower()
arts_desc["desc_3"] = arts_desc['alias'].str.split(',').str[2].str.strip().str.lower()

arts_desc.head()
name rating address alias title geometry desc_1 desc_2 desc_3
0 Curio Theatre Company 4.5 815 S 48th St theater Performing Arts POINT (2678192.432 234160.959) theater NaN NaN
1 Studio 34 4.5 4522 Baltimore Ave yoga, pilates, theater Yoga, Pilates, Performing Arts POINT (2679553.196 234477.061) yoga pilates theater
2 Painted Bride Art Center 3.5 5212 Market St theater, venues Performing Arts, Venues & Event Spaces POINT (2676029.704 238278.040) theater venues NaN
3 The Rotunda 4.5 4014 Walnut St musicvenues, theater Music Venues, Performing Arts POINT (2682266.656 236480.866) musicvenues theater NaN
4 Penn Live Arts 4.0 3680 Walnut St theater, jazzandblues, musicvenues Performing Arts, Jazz & Blues, Music Venues POINT (2684326.329 236037.832) theater jazzandblues musicvenues

Alias

Code
# Group by 'desc_1' and count, then convert to DataFrame
arts_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
27 theater 163
7 dancestudio 11
18 musicalinstrumentsandteachers 11
12 galleries 9
20 musicvenues 6
19 musicians 5
11 festivals 5
6 dance_schools 5
3 artschools 4
1 artmuseums 4

Regroup into large categories

Code
# Define the mapping dictionary
desc_mapping = {
    "specialtyschools": "classes",
    "dance_schools": "classes",
    "artschools": "classes",
    "musicalinstrumentsandteachers": "classes",
    "dancestudio": "dancestudios",
    "galleries": "gallery/museum",
    "museums": "gallery/museum",
    "artmuseums": "gallery/museum",
    "festivals": "events/shows",
    "jazzandblues": "events/shows",
    "djs": "events/shows",
    "musicvenues": "events/shows",
    "comedyclubs": "events/shows",
    "magicians": "events/shows",
    "musicians": "events/shows"
}

# Replace desc_1 values using the mapping dictionary
arts_desc['desc_1'] = arts_desc['desc_1'].replace(desc_mapping)
arts_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(20)
desc_1 count
16 theater 163
7 events/shows 24
2 classes 22
8 gallery/museum 15
4 dancestudios 11
3 culturalcenter 2
0 aerialfitness 1
12 opera 1
17 venues 1
15 summer_camps 1
14 social_clubs 1
13 preschools 1
9 hair 1
11 nonprofit 1
10 language_schools 1
1 arts 1
6 educationservices 1
5 dinnertheater 1
18 yoga 1

Filter for most common labels

Code
arts_desc_1 = arts_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(20)
top_arts_desc = list(arts_desc_1["desc_1"])

# remove education-related categories
top_arts_desc.remove('summer_camps')
top_arts_desc.remove('preschools')
top_arts_desc.remove('language_schools')
top_arts_desc.remove('educationservices')

top_arts_desc
['theater',
 'events/shows',
 'classes',
 'gallery/museum',
 'dancestudios',
 'culturalcenter',
 'aerialfitness',
 'opera',
 'venues',
 'social_clubs',
 'hair',
 'nonprofit',
 'arts',
 'dinnertheater',
 'yoga']
Code
arts_desc = arts_desc[arts_desc["desc_1"].isin(top_arts_desc)]

Beauty

Code
beauty_desc = beauty.copy()

# Split the alias into up to three description columns
beauty_desc["desc_1"] = beauty_desc['alias'].str.split(',').str[0].str.strip().str.lower()
beauty_desc["desc_2"] = beauty_desc['alias'].str.split(',').str[1].str.strip().str.lower()
beauty_desc["desc_3"] = beauty_desc['alias'].str.split(',').str[2].str.strip().str.lower()

beauty_desc.tail()
name rating address alias title geometry desc_1 desc_2 desc_3
2156 Lips and Drips by Erica Marie 5.0 2342 S Broad St medicalspa, ivhydration Medical Spas, IV Hydration POINT (2691839.535 224599.252) medicalspa ivhydration NaN
2157 The Lash Lounge Wyndmoor - 909 Willow 4.0 909 E Willow Grove Ave eyelashservice, eyebrowservices, threadingservices Eyelash Service, Eyebrow Services, Threading Services POINT (2685044.968 283383.097) eyelashservice eyebrowservices threadingservices
2158 Hair Expo 3.5 7721 Crittenden St hairstylists Hair Stylists POINT (2683447.588 279731.355) hairstylists NaN NaN
2159 The Cut Bar 4.0 532 S 4th St barbers Barbers POINT (2697625.976 232400.348) barbers NaN NaN
2160 House of Hair Lounge 3.0 259 S 60th St hairstylists Hair Stylists POINT (2671622.181 237231.748) hairstylists NaN NaN
Code
# Group by 'desc_1' and count, then convert to DataFrame
beauty_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending = False).head(10)
desc_1 count
85 hair 378
146 othersalons 342
12 barbers 149
89 hairstylists 87
43 cosmetics 82
119 massage 74
70 florists 53
117 makeupartists 49
86 hair_extensions 35
188 tattoo 33
Code
remove_cat = ["florists", "jewelry", "drugstores", "generaldentistry", "discountstore", "womenscloth",
              "deptstores", "furniture", "", "bookstores", "galleries", "giftshops", "artsandcrafts",
              "gardening", "fabricstores", "antiques", "menscloth", "orthodontists", "vintage",
              "thriftstores", "stationary", "bikes", "beautysvc", "accessories", "optometrists",
              "framing", "interiordesign", "tobaccoshops", "homedecor"]

# Grouping, counting, and filtering beauty_desc
beauty_desc_1 = beauty_desc.groupby("desc_1").size().reset_index(name='count').sort_values("count", ascending=False)
top_beauty_desc = beauty_desc_1[beauty_desc_1['count'] > 6]

# Filtering top_beauty_desc to remove unwanted categories
top_beauty_desc = top_beauty_desc[~top_beauty_desc["desc_1"].isin(remove_cat)]["desc_1"]

beauty_desc = beauty_desc[beauty_desc["desc_1"].isin(top_beauty_desc)]

Combine all amenities

Code
# Create explicit copies of the DataFrame slices
food_desc = food_desc.copy()
parks_desc = parks_desc.copy()
edu_desc = edu_desc.copy()
grocery_desc = grocery_desc.copy()
entertain_desc = entertain_desc.copy()
night_desc = night_desc.copy()
shop_desc = shop_desc.copy()
health_desc = health_desc.copy()
historic_desc = historic_desc.copy()
arts_desc = arts_desc.copy()
beauty_desc = beauty_desc.copy()
kids_desc = kids_desc.copy()

# Now perform the operations
food_desc.loc[:, 'type'] = 'restaurant'
parks_desc.loc[:, 'type'] = 'parks'
edu_desc.loc[:, 'type'] = 'education'
grocery_desc.loc[:, 'type'] = 'grocery'
entertain_desc.loc[:, 'type'] = 'entertainment'
night_desc.loc[:, 'type'] = 'nightlife'
shop_desc.loc[:, 'type'] = 'shopping'
health_desc.loc[:, 'type'] = 'healthcare'
historic_desc.loc[:, 'type'] = 'historic'
arts_desc.loc[:, 'type'] = 'arts'
beauty_desc.loc[:, 'type'] = 'beauty'
kids_desc.loc[:, 'type'] = 'kids'
Code
# combine gdfs
amenities = gpd.GeoDataFrame(pd.concat([food_desc, parks_desc, edu_desc, grocery_desc, entertain_desc, night_desc, shop_desc,
                                       health_desc, historic_desc, arts_desc, beauty_desc, kids_desc], ignore_index=True))

Clip to Philly bounds
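
Because within() compares raw coordinates, the combined amenity layer and the city boundary must share a CRS. A minimal sanity check (if this prints False, reproject one layer with to_crs first):

Code
# within() assumes both layers share a CRS; this should print True
print(amenities.crs == phl_bound_proj.crs)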

Code
phl_bound_squeezed = phl_bound_proj.squeeze().geometry

amenities_phl = amenities.within(phl_bound_squeezed)
amenities_phl_gdf = amenities[amenities_phl]
amenities_phl_gdf.explore(tiles='CartoDB dark_matter', legend=True, column='type', cmap='Paired')
[Interactive map: amenities across Philadelphia, colored by type]
Code
# count by type
amenities_grouped = amenities_phl_gdf.groupby('type').size().reset_index(name='count')
Code
amenities_grouped.to_csv('amenities_grouped.csv', index=False)
amenities_phl_gdf.to_file('amenities_phl_gdf.geojson', driver='GeoJSON', index=False)

Spatial patterns of amenities

Code
# Extract unique business types
business_types = amenities_phl_gdf['type'].unique()

# Determine the number of rows and columns for the subplots
n_rows = len(business_types) // 3 + (len(business_types) % 3 > 0)
fig, axes = plt.subplots(n_rows, 3, figsize=(15, n_rows * 4))

# Flatten the axes array for easy looping
axes = axes.flatten()

# Create a map for each business type
for i, business_type in enumerate(business_types):
    # Filter data for the current business type
    subset = amenities_phl_gdf[amenities_phl_gdf['type'] == business_type]

    # Get count for the current business type
    count = amenities_grouped[amenities_grouped['type'] == business_type]['count'].values[0]

    # Plot the points for this business type
    subset.plot(ax=axes[i], color='navy', markersize=1, alpha=1)

    # Set title with count (n = count)
    axes[i].set_title(f"{business_type.capitalize()} (n = {count})")

    # Customizations: Remove boxes, axis ticks, and labels
    axes[i].set_axis_off()

# Remove unused subplots
for j in range(i+1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout
plt.tight_layout()

# Display the panel of maps
plt.show()

Intersect with neighborhoods and tracts

Code
amenities_neighborhood = gpd.sjoin(amenities_phl_gdf, neigh, how = "left", predicate = "intersects")
amenities_tract = gpd.sjoin(amenities_phl_gdf, tract, how = "left", predicate = "intersects")
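
In a left spatial join, points that fall outside every polygon get NaN for nb_name; a quick check of how many amenities that affects:

Code
# Amenities that did not intersect any neighborhood or tract
print(amenities_neighborhood['nb_name'].isna().sum(),
      amenities_tract['nb_name'].isna().sum())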
Code
amenities_neigh_group = amenities_neighborhood.groupby(["nb_name", "type"]).size().reset_index(name='count')
amenities_tract_group = amenities_tract.groupby(["nb_name", "type"]).size().reset_index(name='count')
Code
amenities_tract_group.head()
nb_name type count
0 Airport-Navy Yard 01 entertainment 2
1 Airport-Navy Yard 01 kids 2
2 Airport-Navy Yard 01 nightlife 1
3 Airport-Navy Yard 01 restaurant 3
4 Airport-Navy Yard 02 beauty 1
Code
amenities_neigh_total = amenities_neigh_group.groupby("nb_name")['count'].sum().reset_index(name='total_amenities')
amenities_tract_total = amenities_tract_group.groupby("nb_name")['count'].sum().reset_index(name='total_amenities')
Code
amenities_tract_total.head()
nb_name total_amenities
0 Airport-Navy Yard 01 8
1 Airport-Navy Yard 02 54
2 Airport-Navy Yard 03 43
3 Airport-Navy Yard 04 42
4 Airport-Navy Yard 05 48
Code
amenities_neigh_group = amenities_neigh_group.merge(amenities_neigh_total, on="nb_name", how="left")
amenities_tract_group = amenities_tract_group.merge(amenities_tract_total, on="nb_name", how="left")
Code
amenities_tract_group.head()
nb_name type count total_amenities
0 Airport-Navy Yard 01 entertainment 2 8
1 Airport-Navy Yard 01 kids 2 8
2 Airport-Navy Yard 01 nightlife 1 8
3 Airport-Navy Yard 01 restaurant 3 8
4 Airport-Navy Yard 02 beauty 1 54
Code
amenities_neigh_group["pct_share"] = (100*amenities_neigh_group["count"]/amenities_neigh_group["total_amenities"]).round(1)
amenities_tract_group["pct_share"] = (100*amenities_tract_group["count"]/amenities_tract_group["total_amenities"]).round(1)
Code
amenities_tract_group.head()
nb_name type count total_amenities pct_share
0 Airport-Navy Yard 01 entertainment 2 8 25.0
1 Airport-Navy Yard 01 kids 2 8 25.0
2 Airport-Navy Yard 01 nightlife 1 8 12.5
3 Airport-Navy Yard 01 restaurant 3 8 37.5
4 Airport-Navy Yard 02 beauty 1 54 1.9
Code
amenities_neigh_group_gdf = amenities_neigh_group.merge(neigh, on = "nb_name", how = "left")
amenities_tract_group_gdf = amenities_tract_group.merge(tract, on = "nb_name", how = "left")
Code
amenities_neigh = gpd.GeoDataFrame(amenities_neigh_group_gdf, geometry='geometry')
amenities_tract = gpd.GeoDataFrame(amenities_tract_group_gdf, geometry='geometry')
Code
amenities_nb = amenities_neigh[["nb_name", "type", "count", "total_amenities", "pct_share", "geometry", "area_mi2"]]
amenities_tr = amenities_tract[["nb_name", "type", "count", "total_amenities", "pct_share", "geometry", "area_mi2"]]
Code
amenities_nb = amenities_nb.copy()
amenities_nb.loc[:, 'count_per_mi2'] = amenities_nb['count'] / amenities_nb['area_mi2']
amenities_tr = amenities_tr.copy()
amenities_tr.loc[:, 'count_per_mi2'] = amenities_tr['count'] / amenities_tr['area_mi2']
Code
amenities_tr[amenities_tr["type"] == "kids"].explore(tiles='CartoDB dark_matter', legend=True, column = "pct_share", cmap = "magma")
[Interactive map: kids amenities' share of all amenities, by sub-neighborhood]
Code
# Extract unique types
amenity_types = amenities_nb['type'].unique()

# Determine the number of rows and columns for the subplots
n_rows = len(amenity_types) // 3 + (len(amenity_types) % 3 > 0)
fig, axes = plt.subplots(n_rows, 3, figsize=(15, n_rows * 4))

# Flatten the axes array for easy looping
axes = axes.flatten()

# Create a choropleth map for each amenity type
for i, amenity_type in enumerate(amenity_types):
    # Filter data for the current amenity type
    subset = amenities_nb[amenities_nb['type'] == amenity_type]

    # Plotting
    subset.plot(column='pct_share', ax=axes[i], legend=True,
                legend_kwds={'label': "share of all amenities"},
                cmap='YlGnBu')

    # Set title
    axes[i].set_title(amenity_type.capitalize())
    
    # Remove boxes, axis ticks, and axis labels
    axes[i].set_axis_off()

# Remove unused subplots
for j in range(i+1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout
plt.tight_layout()

# Display the panel of maps
plt.show()

Code
amenities_nb.to_file('amenities_neigh.geojson', driver='GeoJSON', index=False)
amenities_tr.to_file('amenities_tract.geojson', driver='GeoJSON', index=False)

Correlation analysis

Code
# Spread the data to wide format: one column per amenity type
amenities_nb_wide = amenities_nb.pivot_table(index='nb_name', columns='type', values='pct_share', aggfunc='mean').fillna(0)
Code
# Calculating the correlation matrix
correlation_matrix = amenities_nb_wide.corr()

# Reset index to convert the index into a column for Altair
heatmap_data = correlation_matrix.reset_index().melt('type', var_name='type2', value_name='correlation')

# Create the heatmap
heatmap = alt.Chart(heatmap_data).mark_rect().encode(
    x='type:N',
    y='type2:N',
    color='correlation:Q'
)

# Add text to each cell
text = heatmap.mark_text(baseline='middle').encode(
    text=alt.Text('correlation:Q', format='.2f'),
    color=alt.condition(
        alt.datum.correlation > 0.75, 
        alt.value('white'),
        alt.value('black')
    )
)

# Display the chart
chart = (heatmap + text).properties(width=600, height=600)
chart
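To pull out the strongest relationships without scanning the heatmap cell by cell, the off-diagonal pairs can be ranked directly; a small sketch (my addition):

Code
# Rank off-diagonal correlations; each pair appears twice since the matrix is symmetric
mask = ~np.eye(len(correlation_matrix), dtype=bool)
pairs = correlation_matrix.where(mask).stack().sort_values(ascending=False)
print(pairs.head(6))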

Cluster analysis

Code
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

Overall (seven amenity types)

I cluster areas on the shares of seven amenity types: arts, education, kids, nightlife, parks, restaurants, and shopping.

Code
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
Code
amenities_nb_wide = amenities_nb.pivot_table(index='nb_name', columns='type', values='pct_share', fill_value=0).reset_index()
Code
amenities_nb_scaled = scaler.fit_transform(amenities_nb_wide[["arts", "education", "kids", "nightlife", "parks", "restaurant", "shopping"]])
Code
amenities_nb_scaled[:5]
array([[-0.57794679,  0.14926393, -0.15135435, -0.01048566, -0.4437252 ,
        -0.06703338, -0.08815405],
       [-0.57794679, -0.92493936, -0.69537104,  0.69895844, -0.4437252 ,
         2.00822975, -0.02153764],
       [-0.57794679, -0.0868247 , -0.47776436, -0.54499833, -0.16425187,
         1.1136301 ,  0.06411203],
       [-0.57794679,  0.31452598, -0.05464027,  0.12557212, -0.4437252 ,
         0.42707689, -0.44978598],
       [-0.57794679, -0.92493936, -0.69537104, -0.89486118, -0.4437252 ,
         3.71421046, -0.95416737]])
Code
amenities_nb_scaled.mean(axis=0)
array([-3.50596745e-17,  2.92163954e-17,  6.28152501e-17,  1.75298372e-17,
        2.48339361e-17, -5.84327908e-17, -7.30409885e-17])
Code
amenities_nb_scaled.std(axis=0)
array([1., 1., 1., 1., 1., 1., 1.])
Code
# Perform the fit
kmeans.fit(amenities_nb_scaled)

# Extract the labels
amenities_nb_wide['label'] = kmeans.labels_
Code
(
    alt.Chart(amenities_nb_wide)
    .mark_circle()
    .encode(
        alt.X("restaurant:Q", scale=alt.Scale(zero=False)),
        alt.Y("education:Q", scale=alt.Scale(zero=False)),
        color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
        tooltip=list(amenities_nb_wide.columns),
    )
    .properties(width=800, height=600)
    .interactive()
)
Code
amenities_nb_wide.groupby("label").size()
label
0    80
1    27
2     4
3    35
4     6
dtype: int64
Code
# Average restaurant share (% of all amenities) per cluster
amenities_nb_wide.groupby("label")['restaurant'].mean().sort_values()
label
2     0.000000
1     8.829630
4     9.066667
3    18.377143
0    42.618750
Name: restaurant, dtype: float64
Code
# Average parks share per cluster
amenities_nb_wide.groupby("label")['parks'].mean().sort_values()
label
2     0.000000
0     1.318750
3     1.442857
1     3.729630
4    29.616667
Name: parks, dtype: float64
Code
amenities_clusters = amenities_nb_wide.merge(neigh, how = "left", on = "nb_name")
amenities_clusters_gdf = gpd.GeoDataFrame(amenities_clusters, geometry = "geometry")
Code
amenities_clusters_gdf.explore(
    column="label",
    cmap="Set3",
    categorical=True,
    legend=True,
    tiles="CartoDB positron"
)
[Interactive map: amenity cluster labels by neighborhood]
Code
amenities_clusters_gdf.to_file('clusters.geojson', driver='GeoJSON', index=False)
Code
cluster_profile = amenities_nb_wide.groupby("label")[["arts", "education", "kids", "nightlife", "parks", "restaurant", "shopping"]].mean()

cluster_profile
type arts education kids nightlife parks restaurant shopping
label
0 0.805000 6.900000 4.276250 7.538750 1.318750 42.618750 4.962500
1 7.096296 12.351852 5.318519 16.866667 3.729630 8.829630 11.200000
2 0.000000 30.825000 42.075000 0.000000 0.000000 0.000000 0.000000
3 1.008571 5.054286 3.668571 9.342857 1.442857 18.377143 23.082857
4 0.983333 0.883333 15.316667 2.350000 29.616667 9.066667 2.783333
Code
# Number of clusters to try out
n_clusters = list(range(2, 15))

# Run kmeans for each value of k
inertias = []
for k in n_clusters:
    
    # Initialize and run
    kmeans = KMeans(n_clusters=k, n_init=20)
    kmeans.fit(amenities_nb_scaled)
    
    # Save the "inertia"
    inertias.append(kmeans.inertia_)
    
# Plot it!
plt.plot(n_clusters, inertias, marker='o', ms=10, mfc='white', lw=4, mew=3);

from kneed import KneeLocator

# Initialize the knee algorithm
kn = KneeLocator(n_clusters, inertias, curve='convex', direction='decreasing')

# Print out the knee 
print(kn.knee)
6
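As a complementary check on the choice of k (not part of the original analysis), the mean silhouette score can be computed for each candidate; higher values indicate better-separated clusters.

Code
from sklearn.metrics import silhouette_score

# Mean silhouette score for a few candidate values of k
for k in range(2, 8):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(amenities_nb_scaled)
    print(k, round(silhouette_score(amenities_nb_scaled, labels), 3))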

Restaurants

Code
kmeans = KMeans(n_clusters=8, random_state=42, n_init=10)
Code
food_neigh = food_desc.sjoin(neigh, how = "left")
Code
restaurant = food_desc[["name", "desc_1"]]
nb_gdf = amenities_neighborhood[["nb_name", "name", "rating"]]

# Join on business name (note: chains sharing a name will match multiple rows)
restaurants_nb_gdf = restaurant.merge(nb_gdf, on = "name", how = "left")
Code
restaurants_nb_grouped = restaurants_nb_gdf.groupby(["nb_name", "desc_1"]).size().reset_index(name="count")

# Sort within each neighborhood group by count in descending order
restaurants_nb_grouped = restaurants_nb_grouped.sort_values(by=["nb_name", "count"], ascending=[True, False])
restaurants_nb_grouped.head()
nb_name desc_1 count
1 Academy Gardens fastfood 8
0 Academy Gardens chinese 4
2 Academy Gardens pizza 1
3 Academy Gardens seafood 1
4 Academy Gardens venues 1
Code
restaurants_nb_wide = restaurants_nb_grouped.pivot(index='nb_name', columns='desc_1', values='count').reset_index().fillna(0)

# Calculating the correlation matrix (numeric columns only; 'nb_name' is a string)
correlation_matrix = restaurants_nb_wide.drop(columns='nb_name').corr()

# Reset index to convert the index into a column for Altair
heatmap_data = correlation_matrix.reset_index().melt('desc_1', var_name='desc_2', value_name='correlation')
Code
# Calculating the total restaurants for each neighborhood
restaurants_total = restaurants_nb_wide.set_index('nb_name').sum(axis=1)

# Distribution of total restaurant counts across neighborhoods
restaurants_total.hist()
<Axes: >

Code
# Using 'map' to align and assign the total restaurants back to the original DataFrame
restaurants_nb_wide["total_restaurants"] = restaurants_nb_wide['nb_name'].map(restaurants_total)

# Keep only neighborhoods with more than 10 restaurants
restaurants_nb_wide = restaurants_nb_wide[restaurants_nb_wide["total_restaurants"] > 10].copy()

restaurants_nb_wide.head()
desc_1 nb_name american asianfusion bakeries/cafes bbq breakfast_brunch caribbean chicken chinese cocktailbars diner/grill fastfood halal indpak italian japanese korean latin mediterranean pizza salad sandwiches/delis seafood soulfood venues vietnamese total_restaurants
0 Academy Gardens 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 15.0
1 Airport 2.0 0.0 18.0 0.0 4.0 0.0 2.0 0.0 0.0 3.0 46.0 0.0 0.0 4.0 2.0 0.0 5.0 3.0 4.0 7.0 8.0 7.0 0.0 2.0 0.0 117.0
2 Allegheny West 0.0 0.0 2.0 2.0 0.0 1.0 8.0 6.0 0.0 0.0 86.0 0.0 0.0 0.0 6.0 0.0 0.0 0.0 5.0 0.0 3.0 4.0 1.0 0.0 0.0 124.0
3 Andorra 5.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 31.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 11.0 0.0 0.0 0.0 0.0 50.0
6 Brewerytown 0.0 0.0 6.0 0.0 1.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 0.0 2.0 0.0 2.0 0.0 0.0 17.0
Code
# Columns representing different cuisines, plus the total
cuisine_columns = ['bakeries/cafes',
                   'chicken', 'chinese', 'diner/grill',
                   'fastfood', "korean",
                   'pizza',
                   'sandwiches/delis', 'vietnamese', 'total_restaurants']

# Convert cuisine counts to percentages of the total restaurants
# (this turns total_restaurants into a constant 100 for every neighborhood,
# so it carries no information in the clustering below)
for column in cuisine_columns:
    restaurants_nb_wide[column] = ((restaurants_nb_wide[column] / restaurants_nb_wide['total_restaurants']) * 100).round(2)

# Check the updated DataFrame
restaurants_nb_wide.head()
desc_1 nb_name american asianfusion bakeries/cafes bbq breakfast_brunch caribbean chicken chinese cocktailbars diner/grill fastfood halal indpak italian japanese korean latin mediterranean pizza salad sandwiches/delis seafood soulfood venues vietnamese total_restaurants
0 Academy Gardens 0.0 0.0 0.00 0.0 0.0 0.0 0.00 26.67 0.0 0.00 53.33 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.67 0.0 0.00 1.0 0.0 1.0 0.0 100.0
1 Airport 2.0 0.0 15.38 0.0 4.0 0.0 1.71 0.00 0.0 2.56 39.32 0.0 0.0 4.0 2.0 0.0 5.0 3.0 3.42 7.0 6.84 7.0 0.0 2.0 0.0 100.0
2 Allegheny West 0.0 0.0 1.61 2.0 0.0 1.0 6.45 4.84 0.0 0.00 69.35 0.0 0.0 0.0 6.0 0.0 0.0 0.0 4.03 0.0 2.42 4.0 1.0 0.0 0.0 100.0
3 Andorra 5.0 1.0 0.00 0.0 0.0 0.0 0.00 2.00 0.0 0.00 62.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 1.0 22.00 0.0 0.0 0.0 0.0 100.0
6 Brewerytown 0.0 0.0 35.29 0.0 1.0 0.0 0.00 17.65 0.0 0.00 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 17.65 0.0 11.76 0.0 2.0 0.0 0.0 100.0
Code
# Create the heatmap (note: the correlations were computed on raw counts, before the percentage conversion above)
heatmap = alt.Chart(heatmap_data).mark_rect().encode(
    x='desc_1:N',
    y='desc_2:N',
    color='correlation:Q'
)

# Add text to each cell
text = heatmap.mark_text(baseline='middle').encode(
    text=alt.Text('correlation:Q', format='.2f'),
    color=alt.condition(
        alt.datum.correlation > 0.5, 
        alt.value('white'),
        alt.value('black')
    )
)

# Display the chart
chart = (heatmap + text).properties(width=1000, height=1000)
chart
Code
restaurants_nb_scaled = scaler.fit_transform(restaurants_nb_wide[['bakeries/cafes', 
                   'chicken', 'chinese', 'diner/grill', 
                   'fastfood', "korean",
                   'pizza', 
                   'sandwiches/delis', 'vietnamese', 'total_restaurants']])
Code
restaurants_nb_scaled[:5]
# Note: the trailing 0 is total_restaurants, which the percentage step made a constant
restaurants_nb_scaled.std(axis=0)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 0.])
Code
restaurants_nb_scaled.mean(axis=0)
array([-3.77948264e-17, -5.19678863e-17,  1.86021411e-17,  1.00392508e-16,
       -1.02754684e-16,  2.42123106e-17, -1.25195362e-16,  1.27557539e-16,
        4.72435330e-18,  0.00000000e+00])
Code
# Perform the fit
kmeans.fit(restaurants_nb_scaled)

# Extract the labels
restaurants_nb_wide['label'] = kmeans.labels_
Code
(
    alt.Chart(restaurants_nb_wide)
    .mark_circle()
    .encode(
        alt.X("bakeries/cafes:Q", scale=alt.Scale(zero=False)),
        alt.Y("chinese:Q", scale=alt.Scale(zero=False)),
        color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
        tooltip=list(restaurants_nb_wide.columns),
    )
    .properties(width=800, height=600)
    .interactive()
)
Code
restaurants_nb_wide.groupby("label").size()
label
0    21
1     2
2     4
3    43
4     4
5     1
6    15
7     4
dtype: int64
Code
# Average bakeries/cafes share per cluster
restaurants_nb_wide.groupby("label")['bakeries/cafes'].mean().sort_values()
label
1     0.000000
3     2.634419
7     3.032500
0     5.221429
4     6.295000
5    11.110000
2    11.255000
6    25.364667
Name: bakeries/cafes, dtype: float64
Code
restaurants_clusters = restaurants_nb_wide.merge(neigh, how = "left", on = "nb_name")
restaurants_clusters_gdf = gpd.GeoDataFrame(restaurants_clusters, geometry = "geometry")
Code
restaurants_clusters_gdf.explore(
    column="label",
    cmap="Set3",
    categorical=True,
    legend=True,
    tiles="CartoDB positron"
)
[Interactive map: restaurant cluster labels by neighborhood]
Code
food_profile = restaurants_nb_wide.groupby("label")[['bakeries/cafes', 
                   'chicken', 'chinese', 'diner/grill', 
                   'fastfood', "korean",
                   'pizza', 
                   'sandwiches/delis', 'vietnamese']].mean().round()

food_profile
desc_1 bakeries/cafes chicken chinese diner/grill fastfood korean pizza sandwiches/delis vietnamese
label
0 5.0 3.0 4.0 3.0 38.0 0.0 9.0 13.0 0.0
1 0.0 66.0 7.0 0.0 0.0 0.0 9.0 2.0 0.0
2 11.0 5.0 10.0 2.0 19.0 0.0 11.0 15.0 3.0
3 3.0 1.0 5.0 0.0 69.0 0.0 5.0 7.0 0.0
4 6.0 1.0 39.0 0.0 0.0 0.0 19.0 19.0 0.0
5 11.0 0.0 0.0 6.0 39.0 6.0 0.0 0.0 0.0
6 25.0 2.0 4.0 0.0 20.0 0.0 13.0 9.0 0.0
7 3.0 8.0 5.0 6.0 0.0 0.0 25.0 32.0 0.0
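To summarize the profile table, each cluster can be labeled by its largest cuisine share (an illustrative helper, not in the original notebook):

Code
# Dominant cuisine category per cluster
print(food_profile.idxmax(axis=1))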
Code
restaurants_clusters_gdf.to_file('food_clusters.geojson', driver='GeoJSON', index=False)
Code
# Number of clusters to try out
n_clusters = list(range(2, 15))

# Run kmeans for each value of k
inertias = []
for k in n_clusters:
    
    # Initialize and run
    kmeans = KMeans(n_clusters=k, n_init=10)
    kmeans.fit(restaurants_nb_scaled)
    
    # Save the "inertia"
    inertias.append(kmeans.inertia_)
    
# Plot it!
plt.plot(n_clusters, inertias, marker='o', ms=10, mfc='white', lw=4, mew=3);

from kneed import KneeLocator

# Initialize the knee algorithm
kn = KneeLocator(n_clusters, inertias, curve='convex', direction='decreasing')

# Print out the knee 
print(kn.knee)
7

Nightlife

Code
# Note: this reassigns 'nightlife', replacing the GeoDataFrame loaded earlier
nightlife = night_desc[["name", "desc_1"]]
nb_gdf = amenities_neighborhood[["nb_name", "name", "rating"]]

nightlife_nb_gdf = nightlife.merge(nb_gdf, on = "name", how = "left")
Code
nightlife_nb_grouped = nightlife_nb_gdf.groupby(["nb_name", "desc_1"]).size().reset_index(name="count")

# Sort within each neighborhood group by count in descending order
nightlife_nb_grouped = nightlife_nb_grouped.sort_values(by=["nb_name", "count"], ascending=[True, False])
nightlife_nb_grouped.head(20)
nb_name desc_1 count
0 Academy Gardens bars 1
1 Academy Gardens pubs 1
5 Airport wine_bars 14
4 Airport sportsbars 9
2 Airport bars 4
3 Airport cocktailbars 3
6 Allegheny West bars 2
7 Allegheny West divebars 1
9 Andorra sportsbars 14
8 Andorra pubs 1
12 Bella Vista cocktailbars 4
13 Bella Vista divebars 3
14 Bella Vista wine_bars 3
10 Bella Vista bars 2
11 Bella Vista beerbar 1
15 Brewerytown bars 2
16 Brewerytown sportsbars 2
17 Bridesburg bars 3
18 Bridesburg divebars 1
19 Bridesburg pubs 1
Code
nightlife_nb_wide = nightlife_nb_grouped.pivot(index='nb_name', columns='desc_1', values='count').fillna(0).reset_index()
Code
nightlife_nb_scaled = scaler.fit_transform(nightlife_nb_wide[['bars', 'beerbar', 'cocktailbars', 'divebars', 'lounges', 'musicvenues', 'pubs',
                                                             'sportsbars', 'wine_bars']])
Code
nightlife_nb_scaled[:5]
nightlife_nb_scaled.std(axis=0)
array([1., 1., 1., 1., 1., 1., 1., 1., 1.])
Code
nightlife_nb_scaled.mean(axis=0)
array([ 5.39833214e-17, -2.13896179e-17,  3.11295332e-17, -4.07421293e-17,
        1.01855323e-17, -1.68061284e-17,  6.11131940e-18,  8.65770249e-18,
        3.25937035e-17])
Code
# Use an explicit k: the elbow loop above left 'kmeans' at k=14,
# which is what produced the 14 cluster labels shown below
kmeans = KMeans(n_clusters=14, random_state=42, n_init=10)

# Perform the fit
kmeans.fit(nightlife_nb_scaled)

# Extract the labels
nightlife_nb_wide['label'] = kmeans.labels_
Code
(
    alt.Chart(nightlife_nb_wide)
    .mark_circle()
    .encode(
        alt.X("bars:Q", scale=alt.Scale(zero=False)),
        alt.Y("musicvenues:Q", scale=alt.Scale(zero=False)),
        color=alt.Color("label:N", scale=alt.Scale(scheme="dark2")),
        tooltip=list(nightlife_nb_wide.columns),
    )
    .properties(width=800, height=600)
    .interactive()
)
Code
nightlife_nb_wide.groupby("label").size()
label
0      9
1     53
2      1
3      1
4      2
5      1
6      1
7      2
8      5
9      3
10    10
11     1
12     9
13    11
dtype: int64
Code
# Merge on the neighborhood layer: the nb_name values here are neighborhood
# names, so merging on 'tract' left NaN geometries and made explore() fail
# with "ValueError: Location values cannot contain NaNs."
nightlife_clusters = nightlife_nb_wide.merge(neigh, how = "left", on = "nb_name")
nightlife_clusters_gdf = gpd.GeoDataFrame(nightlife_clusters, geometry = "geometry")
Code
nightlife_clusters_gdf.explore(
    column="label",
    cmap="Set3",
    categorical=True,
    legend=True,
    tiles="CartoDB positron"
)

District profiles

Code
clusters = gpd.read_file("data/clusters.geojson")
districts = gpd.read_file("data/Planning_Districts.geojson")[["DIST_NAME", "geometry"]]
districts_proj = districts.to_crs(2272)

# Project to the districts' CRS before taking centroids, so the centroids
# are computed in feet and the spatial join CRSs match
cluster_ctr = clusters.to_crs(2272).copy()
cluster_ctr.geometry = cluster_ctr.geometry.centroid
Code
tract_dist = cluster_ctr.sjoin(districts_proj)
Code
district_profile = tract_dist.groupby("DIST_NAME")[['arts', 'beauty', 'education', 'entertainment', 'grocery', 
                                   'healthcare', 'historic', 'kids', 'nightlife', 'parks', 
                                   'restaurant', 'shopping']].mean()

# Keep the index so DIST_NAME is written to the CSV
district_profile.to_csv('district_profile.csv')

district_profile
arts beauty education entertainment grocery healthcare historic kids nightlife parks restaurant shopping
DIST_NAME
Central 5.331579 11.152632 9.994737 12.452632 7.284211 0.094737 1.942105 5.894737 12.963158 3.842105 14.989474 14.063158
Central Northeast 0.625000 28.050000 4.475000 0.850000 4.925000 0.000000 0.000000 9.450000 11.250000 1.575000 32.325000 6.500000
Lower Far Northeast 0.500000 20.558333 10.858333 2.366667 3.150000 0.000000 0.000000 7.000000 8.858333 0.000000 36.966667 9.716667
Lower North 1.781818 19.118182 9.100000 4.800000 8.163636 0.000000 2.418182 4.263636 5.790909 3.509091 34.509091 6.572727
Lower Northeast 0.000000 27.800000 2.800000 0.300000 13.020000 0.000000 0.000000 3.340000 2.300000 0.000000 31.580000 18.840000
Lower Northwest 0.466667 22.522222 9.966667 3.811111 7.566667 1.388889 0.000000 4.655556 11.611111 1.922222 27.311111 8.777778
Lower South 0.366667 8.100000 1.766667 14.600000 1.100000 0.000000 0.000000 15.133333 11.966667 6.133333 37.266667 3.533333
Lower Southwest 0.414286 10.914286 2.228571 1.557143 8.800000 0.000000 0.000000 0.985714 13.357143 0.414286 52.242857 9.028571
North 0.209091 23.318182 6.372727 1.763636 3.781818 0.000000 0.000000 1.309091 12.609091 0.418182 37.890909 12.372727
North Delaware 1.220000 24.700000 4.840000 2.320000 6.420000 0.500000 0.000000 4.560000 10.400000 1.980000 37.180000 5.880000
River Wards 2.480000 20.820000 5.580000 6.220000 8.260000 0.000000 0.000000 2.940000 14.580000 1.540000 27.220000 10.340000
South 4.315385 19.176923 4.876923 7.392308 11.361538 0.223077 0.315385 6.500000 14.200000 3.815385 20.215385 7.607692
University Southwest 2.072727 12.481818 9.654545 3.872727 5.663636 0.318182 4.545455 11.063636 7.063636 6.618182 27.900000 8.754545
Upper Far Northeast 1.100000 27.266667 3.833333 19.966667 4.066667 0.000000 0.000000 2.766667 3.033333 8.900000 22.300000 6.733333
Upper North 0.475000 20.425000 10.575000 1.350000 10.650000 0.000000 0.000000 2.100000 2.100000 0.000000 42.362500 9.925000
Upper Northwest 2.200000 22.281818 8.554545 4.463636 6.809091 0.300000 0.581818 7.018182 3.054545 4.818182 30.345455 9.581818
West 0.133333 34.511111 11.833333 2.255556 9.477778 0.000000 0.000000 8.433333 4.422222 3.900000 19.388889 5.611111
West Park 4.180000 12.520000 3.920000 15.880000 5.440000 0.000000 1.600000 5.780000 11.420000 3.200000 17.240000 18.820000
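One way to read this table (a sketch I add here): normalize each district's shares by the citywide average and pick the most over-represented amenity per district.

Code
# Each district's most over-represented amenity relative to the average district
relative = district_profile / district_profile.mean()
print(relative.idxmax(axis=1))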

Write files

Code
tract.to_file('tract.geojson', driver='GeoJSON', index=False)