04a 01 Data Merging Basics - HannaAA17/Data-Scientist-With-Python-datacamp GitHub Wiki

Inner Join

# Merge the taxi_owners and taxi_veh tables setting a suffix
taxi_own_veh = taxi_owners.merge(taxi_veh, on='vid', suffixes=('_own','_veh'))

# Print the most popular fuel_type
print(taxi_own_veh['fuel_type'].value_counts())

output

<script.py> output:
    HYBRID                    2792
    GASOLINE                   611
    FLEX FUEL                   89
    COMPRESSED NATURAL GAS      27
    Name: fuel_type, dtype: int64

One-to-many relationships

# Merge the licenses and biz_owners table on account
licenses_owners = licenses.merge(biz_owners,on="account")

# Group the results by title then count the number of accounts
counted_df = licenses_owners.groupby('title').agg({'account':'count'})

# Sort the counted_df in desending order
sorted_df = counted_df.sort_values('account',ascending=False)

# Use .head() method to print the first few rows of sorted_df
print(sorted_df.head())

output

                 account
title                   
PRESIDENT           6259
SECRETARY           5205
SOLE PROPRIETOR     1658
OTHER               1200
VICE PRESIDENT       970

Merge multiple tables

df1.merge(df2, on='col')\
   .merge(df3, on='col)
# Merge licenses and zip_demo, on zip; and merge the wards on ward
licenses_zip_ward = licenses.merge(zip_demo,on='zip') \
            			.merge(wards,on='ward')

# Print the results by alderman and show median income
print(licenses_zip_ward.groupby('alderman').agg({'income':'median'}))
⚠️ **GitHub.com Fallback** ⚠️