# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides any deprecation warnings raised
# by the libraries used below -- consider narrowing it while developing.
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive inside the Colab runtime so that files stored there can
# be read through the '/content/drive' path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

# Load the FoodHub orders dataset from the mounted Google Drive
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/MIT - Applied Data Science/Foundations - Python & Statistics/Project - FoodHub/foodhub_order.csv'
)

# Preview the first 5 rows
df.head(n=5)

# Dimensions of the dataset as (rows, columns)
df.shape

(1898, 9)

# Print a concise summary of the DataFrame: the index range, every column with
# its non-null count and dtype, and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB

# Number of missing values in each column
df.isnull().sum()

# Descriptive statistics of the numeric columns, shown with one row per column
df.describe().transpose()

# Orders that were never rated carry the placeholder "Not given" in the rating
# column; count how many rows hold that value
df["rating"].eq("Not given").sum()

736

# cuisine_type is a categorical variable, so a count plot shows how many
# orders each cuisine received
sns.countplot(x="cuisine_type", data=df)
# Rotate the cuisine names so that they do not overlap
plt.xticks(rotation=90)
plt.show()

# cost_of_the_order is a numerical variable: draw a histogram (with a KDE
# curve) followed by a boxplot, each in its own figure, to see both the shape
# of the distribution and its spread
for draw_plot, extra_options in ((sns.histplot, {"kde": True}), (sns.boxplot, {})):
    draw_plot(data=df, x="cost_of_the_order", **extra_options)
    plt.show()

# day_of_the_week and rating are both categorical variables, so each one gets
# its own count plot showing the number of orders per category
for categorical_column in ("day_of_the_week", "rating"):
    sns.countplot(data=df, x=categorical_column)
    plt.show()

# food_preparation_time and delivery_time are both numerical variables: for
# each of them draw a histogram with one-unit-wide bins and then a boxplot
for time_column in ("food_preparation_time", "delivery_time"):
    sns.histplot(data=df, x=time_column, binwidth=1)
    plt.show()
    sns.boxplot(data=df, x=time_column)
    plt.show()

# Count the orders received by each restaurant and keep the 5 largest counts
orders_per_restaurant = df["restaurant_name"].value_counts()
orders_per_restaurant.nlargest(5)

# To find the most popular cuisine type on weekends, plot the number of orders
# per cuisine and split every bar by day_of_the_week through the hue parameter
sns.countplot(x="cuisine_type", hue="day_of_the_week", data=df)
plt.xticks(rotation=90)
plt.show()

# Total number of orders in the dataset
total = len(df)

# Number of orders that cost more than 20 dollars
is_above_20 = df["cost_of_the_order"] > 20
above_20 = len(df[is_above_20])

# Share of the orders that cost more than 20 dollars, as a percentage
percentage_above_20 = (above_20 / total) * 100

# Display the percentage rounded to the nearest integer
round(percentage_above_20)

29

# Mean order delivery time, rounded to the nearest integer
mean_delivery_time = df["delivery_time"].mean()
round(mean_delivery_time)

24

# Count the orders placed by each customer and keep the 3 largest counts
orders_per_customer = df["customer_id"].value_counts()
orders_per_customer.nlargest(3)

# Correlation table between the most relevant numeric variables
numeric_columns = ["cost_of_the_order", "food_preparation_time", "delivery_time"]
corr = df[numeric_columns].corr()

corr

# Show the correlation table as an annotated heatmap to make the
# relationships between these variables easier to read
sns.heatmap(corr, cmap="YlGnBu", annot=True)
plt.show()

# Compare the delivery time distribution across day_of_the_week to see on
# which days the orders reach the customers faster
sns.boxplot(data=df, x="day_of_the_week", y="delivery_time", palette="Set2")
plt.title("Delivery Time by Day of the Week")
plt.xlabel("Day of the Week")
plt.ylabel("Delivery Time (minutes)")
plt.show()

# Compare the food preparation time distribution across day_of_the_week to
# see whether preparation takes longer on weekdays or on weekends
prep_time_ax = sns.boxplot(
    data=df,
    y="food_preparation_time",
    x="day_of_the_week",
    palette="Set2",
)
prep_time_ax.set(
    title="Food Preparation Time by Day of the Week",
    xlabel="Day of the Week",
    ylabel="Food Preparation Time (minutes)",
)
plt.show()

# Compare the order cost distribution of every cuisine type to see how much
# the costs differ from one cuisine to another
plt.figure(figsize=(12, 6))
cost_ax = sns.boxplot(
    data=df,
    y="cost_of_the_order",
    x="cuisine_type",
    palette="Set3",
)
cost_ax.set_title("Cost of the Order by Cuisine Type")
cost_ax.set_xlabel("Cuisine Type")
cost_ax.set_ylabel("Cost of the Order")
# Rotate the cuisine names so that they do not overlap
plt.xticks(rotation=90)
plt.show()

# Relationship between rating and delivery time: do customers tend to give
# higher ratings when their orders are delivered faster?
# The title follows the "<y variable> by <x variable>" wording of the other
# plots, and the y label carries the same units as the other delivery plots.
sns.boxplot(data=df, x="rating", y="delivery_time", palette="Set2")
plt.title("Delivery Time by Rating")
plt.xlabel("Rating")
plt.ylabel("Delivery Time (minutes)")
plt.show()

# Relationship between rating and order cost: does the amount a customer pays
# for an order influence the rating given?
# The title follows the "<y variable> by <x variable>" wording of the other
# plots, and the y label matches the one used in the cost-by-cuisine plot.
sns.boxplot(data=df, x="rating", y="cost_of_the_order", palette="Set3")
plt.title("Cost of the Order by Rating")
plt.xlabel("Rating")
plt.ylabel("Cost of the Order")
plt.show()

# The rating column stores the placeholder "Not given" for unrated orders.
# Replace that placeholder with NaN and convert the column to a numeric
# dtype so that rating counts and averages can be computed.
df["rating"] = pd.to_numeric(df["rating"].replace({"Not given": np.nan}))

# Confirm that the rating column now has a numeric dtype
df.info()

# Check that the "Not given" values were replaced by NaN
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1162 non-null   float64
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 133.6+ KB

# For every restaurant count how often each rating value occurs, and show the
# 10 largest counts
rating_value_counts = df.groupby(["restaurant_name"])["rating"].value_counts()
rating_value_counts.nlargest(10)

# Number of ratings and average rating of each restaurant
ratings_by_restaurant = df.groupby("restaurant_name")["rating"]
stats = ratings_by_restaurant.agg(["count", "mean"])

# Keep only the restaurants that match the promotional criteria: more than 50
# ratings and an average rating above 4
has_enough_ratings = stats["count"] > 50
has_high_average = stats["mean"] > 4
target_restaurants = stats[has_enough_ratings & has_high_average]

target_restaurants

# Revenue the company earns on each order:
#   - 25% of the order cost when the cost is above 20 dollars
#   - 15% of the order cost when the cost is above 5 and up to 20 dollars
#   - nothing for the remaining orders
# np.select applies the first matching condition of each row, so the two
# tiers never overlap. It also produces a float column directly, which avoids
# writing float values into an integer column (an implicit upcast that recent
# pandas versions deprecate).
order_cost = df["cost_of_the_order"]
df["company_revenue"] = np.select(
    [order_cost > 20, order_cost > 5],
    [order_cost * 0.25, order_cost * 0.15],
    default=0.0,
)

# Total net revenue across all orders
net_revenue = df["company_revenue"].sum()

round(net_revenue, 2)

6166.3

# Total time from order placement to delivery: the food has to be prepared
# and then delivered
df["total_time"] = df["food_preparation_time"] + df["delivery_time"]

# Orders that take more than 60 minutes in total
above_60 = df[df["total_time"] > 60]

# Count those orders once and reuse the count, so that the percentage and the
# absolute number shown below are always derived from the same value
total_above_60 = len(above_60)
percentage_above_60 = (total_above_60 / len(df)) * 100

# Show the percentage (one decimal place) and the absolute number of orders
# to give more perspective
print(round(percentage_above_60, 1))
print(total_above_60)

10.5
200

# Plot the delivery times by day of the week again to visualise these two
# variables before comparing their means
sns.boxplot(data=df, x="day_of_the_week", y="delivery_time", palette="Set2")
plt.title("Delivery Time by Day of the Week")
plt.xlabel("Day of the Week")
plt.ylabel("Delivery Time (minutes)")
plt.show()

# Mean delivery time for each value of day_of_the_week (weekdays vs weekends)
delivery_time_by_day = df.groupby("day_of_the_week")["delivery_time"]
delivery_mean = delivery_time_by_day.mean()
delivery_mean

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

	count	mean	std	min	25%	50%	75%	max
order_id	1898.0	1.477496e+06	548.049724	1476547.00	1477021.25	1477495.50	1.477970e+06	1478444.00
customer_id	1898.0	1.711685e+05	113698.139743	1311.00	77787.75	128600.00	2.705250e+05	405334.00
cost_of_the_order	1898.0	1.649885e+01	7.483812	4.47	12.08	14.14	2.229750e+01	35.41
food_preparation_time	1898.0	2.737197e+01	4.632481	20.00	23.00	27.00	3.100000e+01	35.00
delivery_time	1898.0	2.416175e+01	4.972637	15.00	20.00	25.00	2.800000e+01	33.00

	count
restaurant_name
Shake Shack	219
The Meatball Shop	132
Blue Ribbon Sushi	119
Blue Ribbon Fried Chicken	96
Parm	68

	cost_of_the_order	food_preparation_time	delivery_time
cost_of_the_order	1.000000	0.041527	-0.029949
food_preparation_time	0.041527	1.000000	0.011094
delivery_time	-0.029949	0.011094	1.000000

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	NaN	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	NaN	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5.0	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3.0	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4.0	25	24

	count	mean
restaurant_name
Blue Ribbon Fried Chicken	64	4.328125
Blue Ribbon Sushi	73	4.219178
Shake Shack	133	4.278195
The Meatball Shop	84	4.511905

	count
customer_id
52832	13
47440	10
83287	9

Project Foundations for Data Science: FoodHub Data Analysis

Context

Objective

Data Description

Data Dictionary

Let us start by importing the required libraries

Understanding the structure of the data

Observations:

Question 1: How many rows and columns are present in the data? [0.5 mark]

Observations:

Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used) [0.5 mark]

Observations:

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]

Observations:

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]

Observations:

Question 5: How many orders are not rated? [1 mark]

Observations:

Exploratory Data Analysis (EDA)

Univariate Analysis

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Question 7: Which are the top 5 restaurants in terms of the number of orders received? [1 mark]

Observations:

Question 8: Which is the most popular cuisine on weekends? [1 mark]

Observations:

Question 9: What percentage of the orders cost more than 20 dollars? [2 marks]

Observations:

Question 10: What is the mean order delivery time? [1 mark]

Observations:

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]

Observations:

Multivariate Analysis

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]

Observations:

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]

Observations:

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]

Observations:

Conclusion and Recommendations

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]

Conclusions:

Recommendations: