from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import geopandas as gpd

import warnings
import os
import gc

# Define file paths
taxi_zones_PATH = "/content/drive/MyDrive/Assignment3/taxi_zone"
DATA_PATH = "/content/drive/MyDrive/Assignment3/tripdata"

# Read the Parquet dataframes
yellow_jan_df = pd.read_parquet(os.path.join(DATA_PATH, 'yellow_tripdata_2023-01.parquet'))
yellow_mar_df = pd.read_parquet(os.path.join(DATA_PATH, 'yellow_tripdata_2023-03.parquet'))
yellow_jun_df = pd.read_parquet(os.path.join(DATA_PATH, 'yellow_tripdata_2023-06.parquet'))

# Load taxi zone lookup data
zones_lookup_df = pd.read_csv("/content/drive/MyDrive/Assignment3/taxi_zone_lookup.csv")
zone_shp = gpd.read_file(os.path.join(taxi_zones_PATH, 'taxi_zones.shp'))

# Define a list of dataframes
dfs = [yellow_jan_df, yellow_mar_df, yellow_jun_df]

# Define the column names
cols = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
        'Passenger_count', 'Trip_distance','RateCodeID', 'store_and_fwd_flag',
        'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount',
        'extra', 'MTA_tax','tip_amount' , 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'congestion_surcharge', 'Airport_fee']

# Rename columns in each dataframe
for df in dfs:
    df.columns = cols

yellow_jan_df.head()

yellow_jan_df.info(show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               3066766 non-null  int64         
 1   tpep_pickup_datetime   3066766 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  3066766 non-null  datetime64[ns]
 3   Passenger_count        2995023 non-null  float64       
 4   Trip_distance          3066766 non-null  float64       
 5   RateCodeID             2995023 non-null  float64       
 6   store_and_fwd_flag     2995023 non-null  object        
 7   PULocationID           3066766 non-null  int64         
 8   DOLocationID           3066766 non-null  int64         
 9   payment_type           3066766 non-null  int64         
 10  fare_amount            3066766 non-null  float64       
 11  extra                  3066766 non-null  float64       
 12  MTA_tax                3066766 non-null  float64       
 13  tip_amount             3066766 non-null  float64       
 14  tolls_amount           3066766 non-null  float64       
 15  improvement_surcharge  3066766 non-null  float64       
 16  total_amount           3066766 non-null  float64       
 17  congestion_surcharge   2995023 non-null  float64       
 18  Airport_fee            2995023 non-null  float64       
dtypes: datetime64[ns](2), float64(12), int64(4), object(1)
memory usage: 444.6+ MB

# Extract passenger counts for each month
passenger_counts_jan = (yellow_jan_df['Passenger_count']).sum()
passenger_counts_mar = (yellow_mar_df['Passenger_count']).sum()
passenger_counts_jun = (yellow_jun_df['Passenger_count']).sum()

# Set the style
sns.set(style="white")
plt.figure(figsize=(10,3))

ax1 = sns.barplot(x=[passenger_counts_jan, passenger_counts_mar, passenger_counts_jun], y= ['January', 'March', 'June'], palette=['#00b894', '#00b894','#00b894'], orient= "h", width = 0.3)
ax1.set_xlim([0,5000000])
for bars_group in ax1.containers:
    ax1.bar_label(bars_group, padding=3, fontsize=15)


# Set labels and title
sns.despine()
plt.xlabel('Passenger Count')
plt.title('Passenger Count Comparison for January, March, and June');

# List of dataframes for the three months
dataframes = [yellow_jan_df, yellow_mar_df, yellow_jun_df]
months = ['January', 'March', 'June']

# Create a figure with three subplots in one row and three columns
fig, axes = plt.subplots(1, 3, figsize=(12, 5))

# Main title for the entire set of subplots
fig.suptitle('Distribution of Passenger Count', fontsize=14)

# Iterate over the dataframes and months
for i, df in enumerate(dataframes):
    ax = axes[i]
    sns.histplot(df['Passenger_count'], kde=False, ax=ax, bins=10, color='#00b894')
    ax.set_title(months[i])  # Set the subtitle for each subplot
    ax.set_xlabel('Passenger Count')
    ax.set_ylabel('Frequency')

# Adjust layout
sns.despine()
plt.tight_layout()
plt.subplots_adjust(top=0.85)  # Adjust the position of the main title
plt.show()

# List of columns related to pricing and fare to compare
columns_to_compare = ['total_amount', 'fare_amount', 'extra', 'improvement_surcharge']

# Create a figure with four subplots (2 rows and 2 columns)
fig, axes = plt.subplots(2, 2, figsize=(8, 8))

colors = sns.color_palette(['#00b894','#FFE8D1','#568EA3'])

# Iterate over the columns
for i, column in enumerate(columns_to_compare):
    # Calculate the row and column indices for the subplot
    row_index = i // 2  # Integer division to determine the row
    col_index = i % 2   # Modulus to determine the column

    # Create a dataset with data from all three months for the current column
    data = pd.concat([yellow_jan_df[column], yellow_mar_df[column], yellow_jun_df[column]], axis=1)
    data.columns = ['January', 'March', 'June']

    # Create a boxplot for the current column in the appropriate subplot
    sns.boxplot(data=data, palette= colors, ax=axes[row_index, col_index])
    axes[row_index, col_index].set_title(f'{column.upper()}')
    if column in ['total_amount', 'fare_amount']:
        axes[row_index, col_index].set_ylim([0,100])
    elif column == "improvement_surcharge":
        axes[row_index, col_index].set_ylim([0,15])

# Adjust layout
plt.tight_layout()
plt.show()

# List of dataframes for the three months
dataframes = [yellow_jan_df, yellow_mar_df, yellow_jun_df]
months = ['January', 'March', 'June']

# Define the order of days of the week
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Create a figure with three rows and two columns
fig, axes = plt.subplots(3, 2, figsize=(15, 10))


# Iterate over the dataframes and months
for i, (df, month) in enumerate(zip(dataframes, months)):
    row_index = i  # Determine the row index

    # Calculate day of the week for pickups and dropoffs
    df['pickup_day'] = df['tpep_pickup_datetime'].dt.day_name()
    df['dropoff_day'] = df['tpep_dropoff_datetime'].dt.day_name()

    # Plot the number of pickups for each day of the week
    sns.countplot(y='pickup_day', data=df, ax=axes[row_index, 0], order=days_order, color="#00b894", width = 0.3)
    axes[row_index, 0].set_title(f'Pickups in {month}')
    axes[row_index, 0].set_xlabel('')
    axes[row_index, 0].set_ylabel('')

    # Plot the number of dropoffs for each day of the week
    sns.countplot(y='dropoff_day', data=df, ax=axes[row_index, 1], order=days_order, color="#fdcb6e", width = 0.3)
    axes[row_index, 1].set_title(f'Dropoffs in {month}')
    axes[row_index, 1].set_xlabel('')
    axes[row_index, 1].set_ylabel('')

# Adjust layout
sns.despine()
plt.tight_layout()
plt.show()

# Del dataframe
del yellow_mar_df, yellow_jun_df

# Run garbage collection
gc.collect()

15753

# Check missing values
yellow_jan_df.isnull().sum()

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
Passenger_count          71743
Trip_distance                0
RateCodeID               71743
store_and_fwd_flag       71743
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
MTA_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71743
Airport_fee              71743
pickup_day                   0
dropoff_day                  0
dtype: int64

# Distribution of RateCodeID
yellow_jan_df["RateCodeID"].value_counts()

1.0     2839305
2.0      114239
5.0       15043
99.0      13106
3.0        8958
4.0        4366
6.0           6
Name: RateCodeID, dtype: int64

# Distribution of store_and_fwd_flag
yellow_jan_df["store_and_fwd_flag"].value_counts()

N    2975020
Y      20003
Name: store_and_fwd_flag, dtype: int64

# Impute missing values with the median
columns_to_impute = ["Passenger_count", "congestion_surcharge", "Airport_fee"]

for column in columns_to_impute:
    median_value = yellow_jan_df[column].median()
    yellow_jan_df[column].fillna(median_value, inplace=True)

# Replace missing values in PULocationID and DOLocationID with the most frequent instance
yellow_jan_df['RateCodeID'].fillna(1.0, inplace=True)
yellow_jan_df['store_and_fwd_flag'].fillna("N", inplace=True)

# Verify that missing values have been imputed
yellow_jan_df.isnull().sum()

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
Passenger_count          0
Trip_distance            0
RateCodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
MTA_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
Airport_fee              0
pickup_day               0
dropoff_day              0
dtype: int64

sns.boxplot(yellow_jan_df['Trip_distance'])
plt.title("The box plot of Trip_distance")
plt.xlabel("Trip_distance");
plt.ylabel("Miles");

# Filter the DataFrame to get instances where Trip_distance is greater than 1000
filtered_df = yellow_jan_df[yellow_jan_df['Trip_distance'] > 1000]

# Print the trip distances and their value counts
print(filtered_df['Trip_distance'].value_counts().sum())

67

yellow_jan_df["Passenger_count"].value_counts()

1.0    2333143
2.0     451536
3.0     106353
4.0      53745
0.0      51164
5.0      42681
6.0      28124
8.0         13
7.0          6
9.0          1
Name: Passenger_count, dtype: int64

def clean_data(df):
    # Drop negative or zero values of trip_distance, fare_amount, total_amount, passenger_count
    df = df[(df['Trip_distance'] > 0) & (df['fare_amount'] > 0) & (df['total_amount'] > 0) & (df['Passenger_count'] > 0)]

    # Drop row with passenger_count > 6, trip_distance > 1000
    df = df[(df['Passenger_count'] <= 6) & (df['Trip_distance'] <= 1000)]

    # Drop rows with same pickup and dropoff time
    df = df[df['tpep_pickup_datetime'] != df['tpep_dropoff_datetime']]

    print("Finish Cleaning")
    return df

yellow_jan_df = clean_data(yellow_jan_df)

Finish Cleaning

# Select the numerical columns for the scatter plot matrix
numerical_columns = ['Passenger_count', 'Trip_distance', 'fare_amount',
                     'extra', 'MTA_tax', 'improvement_surcharge', 'tip_amount', 'tolls_amount',
                     'total_amount', 'congestion_surcharge', 'Airport_fee']

# Create a DataFrame with only the selected numerical columns
numerical_data = yellow_jan_df[numerical_columns]

#Get correlation data
correlation_matrix = numerical_data.corr()
new_correlation_matrix = correlation_matrix.iloc[1:, :-1]
# ones_like can build a matrix of booleans (True, False) with the same shape as our data
ones_corr = np.ones_like(correlation_matrix, dtype = bool)

# return only upper triangle matrix
mask = np.triu(ones_corr)
new_mask = mask[1:,0:-1]

fig, ax = plt.subplots(figsize=(8,6))

cmap = sns.diverging_palette(0, 230, 90, 60, as_cmap = True)
sns.heatmap(data=new_correlation_matrix, mask = new_mask,
           annot=True, fmt='.2f', vmin=-1, vmax = 1, cmap = cmap,
           linecolor='white', linewidths = 0.5);

yticks = [i.upper() for i in new_correlation_matrix.index]
xticks = [i.upper() for i in new_correlation_matrix.columns]

ax.set_yticklabels(yticks, rotation=0);
ax.set_xticklabels(xticks, rotation = 90);
title = 'CORRELATION MATRIX\n'
ax.set_title(title, loc='left', fontsize = 18);

yellow_jan_df['pickup_hour']=yellow_jan_df['tpep_pickup_datetime'].dt.hour
yellow_jan_df['dropoff_hour']=yellow_jan_df['tpep_dropoff_datetime'].dt.hour

# Define the rush-hour time periods
morning_rush_hours = range(7, 10)
evening_rush_hours = range(16, 19)

# Create a function to check if the pickup hour is during rush hour
def is_rush_hour(pickup_hour):
    if pickup_hour in morning_rush_hours or pickup_hour in evening_rush_hours:
        return 1  # It's rush hour
    else:
        return 0  # It's not rush hour

# Apply the function to create the rush-hour flag column
yellow_jan_df['is_rush_hour'] = yellow_jan_df['pickup_hour'].apply(is_rush_hour)

yellow_jan_df[["pickup_hour", "is_rush_hour"]].value_counts()

pickup_hour  is_rush_hour
18           1               208189
17           1               201467
15           0               188477
16           1               188025
19           0               186133
14           0               183876
13           0               171525
12           0               163163
20           0               160027
21           0               156261
11           0               148100
22           0               142542
10           0               137917
9            1               125976
8            1               112273
23           0               110426
7            1                83247
0            0                81716
1            0                57402
6            0                41841
2            0                40156
3            0                26025
5            0                16891
4            0                16634
dtype: int64

def merge_taxi_with_zones(taxi_df, zones_df, zone_shp):
    """
    Merges a New York City taxi dataframe with a zones dataframe and zone shape file based on the pickup or dropoff location ID.
    :param taxi_df: New York City taxi dataframe with pickup and dropoff location IDs.
    :param zones_df: NYC taxi zone lookup table dataframe.
    :param zone_shp: zone shape file for the pickup or dropoff location ID in the taxi dataframe.
    :return: Merged dataframe with the additional zone, borough, service zone, shape area, shape length, geometry columns.
    """
    # Merge the taxi dataframe with the look_up dataframe based on the location ID column
    # For pickup
    merged_df = pd.merge(taxi_df, zones_df, left_on='PULocationID', right_on='LocationID')
    merged_df = merged_df.drop(columns='LocationID')
    merged_df = merged_df.rename(columns={'Borough': 'PU_borough', 'Zone': 'PU_zone', 'service_zone': 'PU_service_zone'})
    # For dropoff
    merged_df = pd.merge(merged_df, zones_df, left_on='DOLocationID', right_on='LocationID')
    merged_df = merged_df.drop(columns='LocationID')
    merged_df = merged_df.rename(columns={'Borough': 'DO_borough', 'Zone': 'DO_zone', 'service_zone': 'DO_service_zone'})

    # Merge the taxi dataframe with the shape dataframe based on the location ID column
    # For pickup
    merged_df = pd.merge(merged_df, zone_shp, left_on='PULocationID', right_on='LocationID')
    merged_df = merged_df.drop(columns=['LocationID','OBJECTID', 'zone', 'borough'])
    merged_df = merged_df.rename(columns={'Shape_Leng': 'PU_Shape_Leng', 'Shape_Area': 'PU_Shape_Area', 'geometry': 'PU_geometry'})

    # For pickup
    merged_df = pd.merge(merged_df, zone_shp, left_on='DOLocationID', right_on='LocationID')
    merged_df = merged_df.drop(columns=['LocationID','OBJECTID', 'zone', 'borough'])
    merged_df = merged_df.rename(columns={'Shape_Leng': 'DO_Shape_Leng', 'Shape_Area': 'DO_Shape_Area', 'geometry': 'DO_geometry'})

    return merged_df

merged_df = merge_taxi_with_zones(yellow_jan_df, zones_lookup_df, zone_shp)

merged_df.head()

# Structure of merged_df
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2897101 entries, 0 to 2897100
Data columns (total 36 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   Passenger_count        float64       
 4   Trip_distance          float64       
 5   RateCodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  MTA_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee            float64       
 19  pickup_day             object        
 20  dropoff_day            object        
 21  pickup_hour            int64         
 22  dropoff_hour           int64         
 23  is_rush_hour           int64         
 24  PU_borough             object        
 25  PU_zone                object        
 26  PU_service_zone        object        
 27  DO_borough             object        
 28  DO_zone                object        
 29  DO_service_zone        object        
 30  PU_Shape_Leng          float64       
 31  PU_Shape_Area          float64       
 32  PU_geometry            geometry      
 33  DO_Shape_Leng          float64       
 34  DO_Shape_Area          float64       
 35  DO_geometry            geometry      
dtypes: datetime64[ns](2), float64(16), geometry(2), int64(7), object(9)
memory usage: 817.8+ MB

# Calculate straight-line distance for the entire DataFrame
merged_df['straight_distance'] = merged_df.apply(
    lambda row: row['PU_Shape_Leng'] * 0.00062137
    if row['PULocationID'] == row['DOLocationID']
    else row['PU_geometry'].centroid.distance(row['DO_geometry'].centroid) * 0.00062137,
    axis=1
)

merged_df.head()

# Calculating the "distance_complexity"
merged_df["distance_complexity"] = round(abs(merged_df["Trip_distance"] - merged_df["straight_distance"]), 3)

merged_df.head()

pickups_by_zone = merged_df.groupby(["PU_zone"]).size().reset_index(name='count')
pickups_by_zone = pickups_by_zone.sort_values(['count'], ascending=[False])


dropoffs_by_zone = merged_df.groupby(["DO_zone"]).size().reset_index(name='count')
dropoffs_by_zone = dropoffs_by_zone.sort_values(['count'], ascending=[False])

pickups_by_zone.head(10)

dropoffs_by_zone.head(10)

# Extract the top 10 pickup and dropoff zones
top_10_pickup_zones = pickups_by_zone.head(10)
top_10_dropoff_zones = dropoffs_by_zone.head(10)

# Create subplots for pickup and dropoff zones
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Create bar plots for top 10 pickup zones
sns.barplot(x='count', y='PU_zone', data=top_10_pickup_zones, ax=axes[0], color = "#00b894", width=0.3)
axes[0].set_title('Top 10 Pickup Zones')
axes[0].set_xlabel('')
axes[0].set_ylabel('')

# Create bar plots for top 10 dropoff zones
sns.barplot(x='count', y='DO_zone', data=top_10_dropoff_zones, ax=axes[1], color = "#fdcb6e", width=0.3)
axes[1].set_title('Top 10 Dropoff Zones')
axes[1].set_xlabel('')
axes[1].set_ylabel('')

sns.despine()
plt.tight_layout()
plt.show()

# Count the number of trips for each vendor
vendor_counts = merged_df['VendorID'].value_counts().reset_index()
vendor_counts.columns = ['VendorID', 'Trip Count']

# Sort the vendors by popularity (trip count)
vendor_counts = vendor_counts.sort_values(by='Trip Count', ascending=False)

# Create a bar plot to visualize the rankings
plt.figure(figsize=(10, 3))
ax_vendors = sns.barplot(x='Trip Count', y='VendorID', data=vendor_counts, color = "#00b894", orient = 'h', width = 0.3)
for bars_group in ax_vendors.containers:
    ax_vendors.bar_label(bars_group, padding=3, fontsize=15)
plt.xlabel('Trip Count')
plt.ylabel('VendorID')
plt.title('Vendor Popularity Rankings')
sns.despine()
plt.show()

# Count the number of trips for pickup_hour
pickup_hours_counts = merged_df['pickup_hour'].value_counts().reset_index()
pickup_hours_counts.columns = ['pickup_hour', 'Trip Count']

# Sort the pickup hours by popularity (trip count)
pickup_hours_counts = pickup_hours_counts.sort_values(by='pickup_hour')

# Count the number of trips for dropoff_hour
dropoff_hours_counts = merged_df['dropoff_hour'].value_counts().reset_index()
dropoff_hours_counts.columns = ['dropoff_hour', 'Trip Count']

# Sort the dropoff hours by popularity (trip count)
dropoff_hours_counts = dropoff_hours_counts.sort_values(by='dropoff_hour')

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Plot for pickup hours
sns.barplot(x='pickup_hour', y='Trip Count', data=pickup_hours_counts, ax=axes[0], color="#00b894")
axes[0].set_xlabel('')
axes[0].set_ylabel('Trip Count')
axes[0].set_title('Pickup Hours')

# Plot for dropoff hours
sns.barplot(x='dropoff_hour', y='Trip Count', data=dropoff_hours_counts, ax=axes[1], color="#fdcb6e")
axes[1].set_xlabel('')
axes[1].set_ylabel('')
axes[1].set_title('Dropoff Hours')

# Adjust layout
sns.despine()
plt.tight_layout()
plt.show()

# Create new feature weekday_or_weekend
merged_df['weekday_or_weekend'] = merged_df['pickup_day'].apply(lambda x: 1 if x in ["Saturday", "Sunday"] else 0)

# Group by 'weekday_or_weekend' and calculate the average trip distance
average_distance_by_day = merged_df.groupby('weekday_or_weekend')['Trip_distance'].mean().reset_index()

# Create a bar plot
plt.figure(figsize=(10, 3))
ax_trip_distance = sns.barplot(x="Trip_distance", y="weekday_or_weekend", data=average_distance_by_day, palette=['#00b894', '#fdcb6e'], orient = "h", width = 0.3)
ax_trip_distance.set_yticklabels(['Weekday', 'Weekend'])
ax_trip_distance.set_ylabel('Day of the Week')
ax_trip_distance.set_xlabel('Average Trip Distance (miles)')
ax_trip_distance.set_title('Average Trip Distance on Weekdays vs. Weekends')

# Add labels to the bars
for bars_group in ax_trip_distance.containers:
    ax_trip_distance.bar_label(bars_group, padding=3, fontsize=15)

sns.despine()
plt.show()

# Group by 'weekday_or_weekend' and calculate the average trip distance
average_pc_by_day = merged_df.groupby('weekday_or_weekend')['Passenger_count'].mean().reset_index()

# Create a bar plot
plt.figure(figsize=(10, 3))
ax_passenger_count = sns.barplot(x="Passenger_count", y="weekday_or_weekend", data=average_pc_by_day, palette=['#00b894', '#fdcb6e'], orient = "h", width = 0.3)
ax_passenger_count.set_yticklabels(['Weekday', 'Weekend'])
ax_passenger_count.set_ylabel('Day of the Week')
ax_passenger_count.set_xlabel('Average Passenger Count')
ax_passenger_count.set_title('Average Passenger Count on Weekdays vs. Weekends')

# Add labels to the bars
for bars_group in ax_passenger_count.containers:
    ax_passenger_count.bar_label(bars_group, padding=3, fontsize=15)

sns.despine()
plt.show()

# Get correlation data for these columns
fare_tip_corr = yellow_jan_df[['fare_amount', 'tip_amount']].corr()

fare_tip_corr

# Get correlation data for these columns
fare_pass_count_corr = yellow_jan_df[['fare_amount', 'Passenger_count']].corr()

fare_pass_count_corr

Field	Description
VendorID	A code indicating the TPEP provider that provided the record. 1 for Creative Mobile Technologies, LLC and 2 for VeriFone Inc.
tpep_pickup_datetime	The date and time when the meter was engaged.
tpep_dropoff_datetime	The date and time when the meter was disengaged.
Passenger_count	The number of passengers in the vehicle.
Trip_distance	The distance of the trip in miles.
PULocationID	The pickup location ID, corresponding to the taxi zone where the taximeter was engaged.
DOLocationID	The dropoff location ID, corresponding to the taxi zone where the taximeter was disengaged.
RatecodeID	The rate code for the trip. 1=Standard rate, 2=JFK, 3=Newark, 4=Nassau or Westchester, 5=Negotiated fare, 6=Group ride
store_and_fwd_flag	Indicates whether the trip record was held in vehicle memory before sending to the vendor. Y=store and forward; N=not a store and forward trip.
payment_type	A numeric code indicating the payment method: 1= Credit card, 2= Cash, 3= No charge, 4= Dispute, 5= Unknown, 6= Voided trip.
fare_amount	The time-and-distance fare calculated by the meter.
extra	Extra charges. Currently, this only includes the 0.5 dollars and 1 dollar rush hour and overnight charges.
MTA_tax	The 0.50 dollars MTA tax that is automatically triggered based on the metered rate in use.
improvement_surcharge	0.30 dollars improvement surcharge assessed trips at the flag drop. The improvement surcharge began being levied in 2015.
tip_amount	Tip amount – This field is automatically populated for credit card tips. Cash tips are not included.
tolls_amount	Total amount of all tolls paid in trip.
total_amount	The total amount charged to passengers. This field includes the metered fare, extra charges, MTA tax, tip_amount, and tolls_amount plus any improvement_surcharge or ehail_fee.
congestion_surcharge	2.75 dollars congestion surcharge assessed trips in yellow and green taxis in Manhattan south of 96th St. The surcharge began being levied in 2019.
Airport_fee	1.25 dollars for pick up only at LaGuardia and John F. Kennedy Airports

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	Passenger_count	Trip_distance	RateCodeID	store_and_fwd_flag	PULocationID	DOLocationID	payment_type	...	PU_service_zone	DO_borough	DO_zone	DO_service_zone	PU_Shape_Leng	PU_Shape_Area	PU_geometry	DO_Shape_Leng	DO_Shape_Area	DO_geometry
0	2	2023-01-01 00:32:10	2023-01-01 00:40:36	1.0	0.97	1.0	N	161	141	2	...	Yellow Zone	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...
1	2	2023-01-01 00:21:43	2023-01-01 00:27:52	1.0	2.00	1.0	N	161	141	2	...	Yellow Zone	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...
2	2	2023-01-01 00:11:44	2023-01-01 00:20:38	2.0	1.83	1.0	N	161	141	1	...	Yellow Zone	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...
3	2	2023-01-01 00:35:52	2023-01-01 00:43:38	2.0	1.41	1.0	N	161	141	2	...	Yellow Zone	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...
4	1	2023-01-01 00:51:12	2023-01-01 00:59:38	1.0	1.80	1.0	N	161	141	2	...	Yellow Zone	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	Passenger_count	Trip_distance	RateCodeID	store_and_fwd_flag	PULocationID	DOLocationID	payment_type	...	DO_borough	DO_zone	DO_service_zone	PU_Shape_Leng	PU_Shape_Area	PU_geometry	DO_Shape_Leng	DO_Shape_Area	DO_geometry	straight_distance
0	2	2023-01-01 00:32:10	2023-01-01 00:40:36	1.0	0.97	1.0	N	161	141	2	...	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452
1	2	2023-01-01 00:21:43	2023-01-01 00:27:52	1.0	2.00	1.0	N	161	141	2	...	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452
2	2	2023-01-01 00:11:44	2023-01-01 00:20:38	2.0	1.83	1.0	N	161	141	1	...	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452
3	2	2023-01-01 00:35:52	2023-01-01 00:43:38	2.0	1.41	1.0	N	161	141	2	...	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452
4	1	2023-01-01 00:51:12	2023-01-01 00:59:38	1.0	1.80	1.0	N	161	141	2	...	Manhattan	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452

	VendorID	tpep_pickup_datetime	tpep_dropoff_datetime	Passenger_count	Trip_distance	RateCodeID	store_and_fwd_flag	PULocationID	DOLocationID	payment_type	...	DO_zone	DO_service_zone	PU_Shape_Leng	PU_Shape_Area	PU_geometry	DO_Shape_Leng	DO_Shape_Area	DO_geometry	straight_distance	distance_complexity
0	2	2023-01-01 00:32:10	2023-01-01 00:40:36	1.0	0.97	1.0	N	161	141	2	...	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452	2.737
1	2	2023-01-01 00:21:43	2023-01-01 00:27:52	1.0	2.00	1.0	N	161	141	2	...	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452	1.707
2	2	2023-01-01 00:11:44	2023-01-01 00:20:38	2.0	1.83	1.0	N	161	141	1	...	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452	1.877
3	2	2023-01-01 00:35:52	2023-01-01 00:43:38	2.0	1.41	1.0	N	161	141	2	...	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452	2.297
4	1	2023-01-01 00:51:12	2023-01-01 00:59:38	1.0	1.80	1.0	N	161	141	2	...	Lenox Hill West	Yellow Zone	0.035804	0.000072	POLYGON ((991081.026 214453.698, 990952.644 21...	0.041514	0.000077	POLYGON ((994839.073 216123.698, 994786.740 21...	3.707452	1.907

	PU_zone	count
117	JFK Airport	147508
225	Upper East Side South	143123
224	Upper East Side North	133630
150	Midtown Center	130550
175	Penn Station/Madison Sq West	105353
151	Midtown East	101806
131	Lincoln Square East	96676
218	Times Sq/Theatre District	95251
127	LaGuardia Airport	85880
159	Murray Hill	85253

	DO_zone	count
229	Upper East Side North	141274
230	Upper East Side South	127618
154	Midtown Center	111398
223	Times Sq/Theatre District	85627
163	Murray Hill	85295
232	Upper West Side South	84752
135	Lincoln Square East	84694
134	Lenox Hill West	84414
155	Midtown East	79378
45	Clinton East	73937

	fare_amount	tip_amount
fare_amount	1.000000	0.610648
tip_amount	0.610648	1.000000

	fare_amount	Passenger_count
fare_amount	1.000000	0.027862
Passenger_count	0.027862	1.000000

New York City Taxi Data Analysis using Python

Name : Phan Vinh Phu

TABLE OF CONTENTS¶

Introduction¶

1. Data Loading¶

2. Data Exploration and Pre-processing¶

3. Featurization¶

4. Data Analysis¶

5. Conclusion¶

Introduction¶

Data Dictionary¶

1. Data Loading ¶

1.1 Install & Import Dependencies ¶

Mount drive¶

1.2 Load the dataset to dataframe ¶

1.3 Compare data of 3 months ¶

The total numer of passenger by month¶

The distribution of passenger_count for each month¶

Pricing and Fare¶

The distribution of pickup and dropoff day of the week¶

2. Data Exploration and Pre-processing ¶

Delete the march and june taxi dataframe to free the memory¶

2.1 Handling missing values ¶

2.2 Identify two columns that have “noisy” (erroneous) values. ¶

Trip_distance¶

Passenger Count¶

2.3 Identify 2 columns that are highly correlated and explain their correlation ¶

Data Preprocessing¶

Exam the correlation¶

3. Featurization¶

3.1 Create a feature which is a flag indicating if the trip is in rush-hour or not.¶

Extract and create pickup_hour and dropoff_hour from this datetime¶

Create a feature which is a flag indicating if the trip is in rush-hour or not.¶

3.2 Create a feature that encodes the “complexity” of the trip.¶

Merge the yellow taxi dataframe with taxi lookup csv file¶

Define a function to merge data frame with lookup_table and shape file¶

Create a feature for the straight-line distance of the trip.¶

3.3 Calculate the pickup and drop-off frequency in each taxi zone.¶

Visualise the top 10 Pickup and Dropoff Zones¶

4. Data Analysis¶

4.1 Rank the vendors by popularity¶

4.2 What are the peak travel hours?¶

4.3 What is the average distance of the trips on weekdays and weekends?¶

4.4 What is the average number of passengers in a trip on weekdays and weekends?¶

4.5 What is the correlation between the fare amount and the tip??¶

4.6 What is the correlation between the fare amount and the number of passengers?¶

5. Conclusion¶

5.1 Summary¶

5.2 Challenges¶

5.3 Possible Next Steps¶

Introduction ¶

1. Data Loading ¶

2. Data Exploration and Pre-processing ¶

3. Featurization ¶

4. Data Analysis ¶

5. Conclusion ¶

Extract and create `pickup_hour` and `dropoff_hour` from this datetime¶