Used Cars Price Prediction¶
Problem Definition¶
The Context:¶
There's great demand for used cars in the Indian market. As sales of new cars have slowed down recently, the used car market has continued to grow and has surpassed the new car market. Cars4U is a tech start-up whose objective is to find gaps in this market and take advantage of them to drive bigger sales.
For a quick look at the numbers, the new car market sold 3.6 million units in 2018/19, as compared to around 4 million units for the used car market.
The bigger challenge with this growing market of used cars is determining the price of the vehicles: unlike new cars, whose values are determined and managed by OEMs, prices in the used car market carry large uncertainties because of variables like mileage, year, and ownership that influence the value of every car. It's also difficult to predict and guarantee the supply. Coming up with a solution that facilitates pricing is of pivotal importance, not only for owners but for dealers as well.
The objective:¶
Build a pricing model that effectively predicts the price of used cars and that can help our business come up with profitable strategies using differential pricing.
The key questions:¶
- How do the multiple variables affect the price of the cars?
- For predicting the price, can we rule out some of the variables?
- What are the most important features for predicting the price?
- What are the least relevant features for the prediction?
The problem formulation:¶
Our goal is to develop a robust predictive model that can:
- Estimate the price of a used car based on its features (age, mileage, brand, etc.).
- Provide a data-driven pricing strategy for sellers and dealerships.
- Reduce uncertainty in used car pricing, making the market more transparent.
We will explore multiple machine learning techniques to identify the best-performing model based on key evaluation metrics such as R² Score & RMSE, ensuring accurate and reliable price predictions.
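As a quick reference for those metrics, here is a minimal sketch (on toy arrays, not our dataset) of how R² and RMSE are computed with scikit-learn:

```python
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Toy ground-truth prices and predictions (illustrative values only)
y_true = np.array([3.5, 5.6, 9.9, 12.5])
y_pred = np.array([4.0, 5.0, 10.5, 11.8])

rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # penalizes large errors more heavily
r2 = r2_score(y_true, y_pred)                       # share of variance explained by the model
print(f"RMSE: {rmse:.3f}, R2: {r2:.3f}")
```

We'll use RMSE on the original price scale so the error is interpretable in lakhs, and R² to compare models on variance explained.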
Data Dictionary¶
S.No. : Serial Number
Name : Name of the car which includes Brand name and Model name
Location : The location in which the car is being sold or is available for purchase (Cities)
Year : Manufacturing year of the car
Kilometers_driven : The total kilometers driven in the car by the previous owner(s) in KM
Fuel_Type : The type of fuel used by the car (Petrol, Diesel, Electric, CNG, LPG)
Transmission : The type of transmission used by the car (Automatic / Manual)
Owner : Type of ownership
Mileage : The standard mileage offered by the car company in kmpl or km/kg
Engine : The displacement volume of the engine in CC
Power : The maximum power of the engine in bhp
Seats : The number of seats in the car
New_Price : The price of a new car of the same model in INR 100,000
Price : The price of the used car in INR 100,000 (Target Variable)
Loading libraries¶
# Importing libraries for data manipulation
import numpy as np
import pandas as pd
# Importing libraries for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import ProbPlot
import scipy.stats as stats
# Importing libraries for building linear regression model
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
# Importing libraries for tree-based models
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
# Importing libraries for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Importing libraries for model evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Importing library for splitting data
from sklearn.model_selection import train_test_split
# Importing library for data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Importing filter to ignore deprecation warnings
import warnings
warnings.filterwarnings("ignore")
# Removing the limit from the number of displayed columns and rows.
pd.set_option("display.max_columns", None)
Let us load the data¶
# Letting Colab access my Google Drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Using the pd.read_csv() function to load the dataset
df = pd.read_csv("/content/drive/MyDrive/MIT - Applied Data Science/Projects/Capstone/used_cars.csv")
Data Overview¶
- Observations
- Sanity checks
# Looking at the top 5 rows of the dataset to start building some intuition
df.head()
| | S.No. | Name | Location | Year | Kilometers_Driven | Fuel_Type | Transmission | Owner_Type | Mileage | Engine | Power | Seats | New_price | Price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Maruti Wagon R LXI CNG | Mumbai | 2010 | 72000 | CNG | Manual | First | 26.60 | 998.0 | 58.16 | 5.0 | NaN | 1.75 |
| 1 | 1 | Hyundai Creta 1.6 CRDi SX Option | Pune | 2015 | 41000 | Diesel | Manual | First | 19.67 | 1582.0 | 126.20 | 5.0 | NaN | 12.50 |
| 2 | 2 | Honda Jazz V | Chennai | 2011 | 46000 | Petrol | Manual | First | 18.20 | 1199.0 | 88.70 | 5.0 | 8.61 | 4.50 |
| 3 | 3 | Maruti Ertiga VDI | Chennai | 2012 | 87000 | Diesel | Manual | First | 20.77 | 1248.0 | 88.76 | 7.0 | NaN | 6.00 |
| 4 | 4 | Audi A4 New 2.0 TDI Multitronic | Coimbatore | 2013 | 40670 | Diesel | Automatic | Second | 15.20 | 1968.0 | 140.80 | 5.0 | NaN | 17.74 |
The first thing I notice from the first rows of our data is that 4 out of the 5 rows have null values in the New_Price feature. I'll have to address this issue and treat the missing values accordingly after some further analysis, to see which method will be the most effective for our modeling.
For the Name Feature, we can see that it combines the brand and model of the car, which might not be directly usable in its current form for modeling. We might want to extract the Brand and possibly Model separately as additional categorical features.
The Year variable (year of manufacture) could be transformed into a more meaningful feature, such as "age of the car", which may better represent car depreciation.
Although the features Mileage, Engine and Power are numerical, they might contain units implicitly: Mileage in kmpl, Engine in cc, Power in bhp. We should double-check that all the values are consistent and formatted numerically.
For the categorical variables, Location, Fuel_Type, Transmission and Owner_Type, these will likely need to be encoded using techniques like one-hot encoding or label encoding.
The Seats feature appears clean at first glance, but later I might check whether unusually high or low seat counts exist.
The Price variable looks clean and numeric. We'll later examine its distribution to look for skewness or outliers.
I'll now run some other functions and methods to inspect the full dataset and see if there are any more features or issues that need attention.
# Checking the size of the dataset using the .shape method
df.shape
(7253, 14)
- Our dataset has 7253 rows and 14 columns.
# Displaying basic information about the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7253 entries, 0 to 7252
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   S.No.              7253 non-null   int64
 1   Name               7253 non-null   object
 2   Location           7253 non-null   object
 3   Year               7253 non-null   int64
 4   Kilometers_Driven  7253 non-null   int64
 5   Fuel_Type          7253 non-null   object
 6   Transmission       7253 non-null   object
 7   Owner_Type         7253 non-null   object
 8   Mileage            7251 non-null   float64
 9   Engine             7207 non-null   float64
 10  Power              7078 non-null   float64
 11  Seats              7200 non-null   float64
 12  New_price          1006 non-null   float64
 13  Price              6019 non-null   float64
dtypes: float64(6), int64(3), object(5)
memory usage: 793.4+ KB
- After examining the .info() output, we can see that we have missing values in the following features:
Mileage - 7251 non-null entries, so 2 missing values
Engine - 7207 non-null entries, so 46 missing values
Power - 7078 non-null entries, so 175 missing values
Seats - 7200 non-null entries, so 53 missing values
New_price - 1006 non-null entries, so 6247 missing values
Price - 6019 non-null entries, so 1234 missing values
New_Price has the highest number of missing values (6247 out of 7253 rows). This suggests we may need to drop this column unless we find a reliable way to impute these values.
Price (Target Variable) has 1234 missing values. Since this is what we are predicting, we need to remove these rows before training the model because we can't predict without a target.
Mileage, Engine, Power, and Seats have relatively fewer missing values, so I will impute them rather than drop rows.
- Regarding the datatypes, we have 9 numerical features, 3 integers and 6 floats, plus 5 object (string) variables:
Integers - S.No., Year and Kilometers_Driven
Floats - Mileage, Engine, Power, Seats, New_price and Price
Objects - Name, Location, Fuel_Type, Transmission and Owner_Type
Kilometers_Driven is an integer, but we should check for extreme values such as unrealistic kilometer readings.
Year is an integer but might be better represented as "Age of Car", as mentioned before, for better modeling.
Mileage, Engine and Power should be checked for unit consistency (kmpl, cc and bhp respectively).
- For the categorical features (object type):
Name contains both brand & model, and we should extract Brand as a separate feature.
Location, Fuel_Type, Transmission, and Owner_Type will require encoding.
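As a preview of that encoding step, here is a minimal sketch on a toy frame (the column values are illustrative, not from our dataset):

```python
import pandas as pd

# Toy frame mimicking two of our categorical columns
toy = pd.DataFrame({
    "Fuel_Type": ["Petrol", "Diesel", "CNG"],
    "Transmission": ["Manual", "Automatic", "Manual"],
})

# One-hot encode Fuel_Type; drop_first removes one redundant dummy column
encoded = pd.get_dummies(toy, columns=["Fuel_Type"], drop_first=True)
print(encoded.columns.tolist())
```

With drop_first=True the first category (alphabetically, CNG here) becomes the implicit baseline, which avoids the dummy-variable trap in linear models.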
# Checking for missing values with .isnull().sum(), which returns a count of missing values per column
df.isnull().sum()
| 0 | |
|---|---|
| S.No. | 0 |
| Name | 0 |
| Location | 0 |
| Year | 0 |
| Kilometers_Driven | 0 |
| Fuel_Type | 0 |
| Transmission | 0 |
| Owner_Type | 0 |
| Mileage | 2 |
| Engine | 46 |
| Power | 175 |
| Seats | 53 |
| New_price | 6247 |
| Price | 1234 |
# Running .isna().sum() as well; .isna() is an alias of .isnull(), so the counts should match
df.isna().sum()
| 0 | |
|---|---|
| S.No. | 0 |
| Name | 0 |
| Location | 0 |
| Year | 0 |
| Kilometers_Driven | 0 |
| Fuel_Type | 0 |
| Transmission | 0 |
| Owner_Type | 0 |
| Mileage | 2 |
| Engine | 46 |
| Power | 175 |
| Seats | 53 |
| New_price | 6247 |
| Price | 1234 |
- Price (Target Variable) has 1234 missing values
- Since Price is our dependent variable, we must drop these rows before modeling.
- I'll remove them right before training the model, ensuring we don't lose useful data during EDA.
- New_Price has 6247 missing values (~86% of the data)
- This feature is mostly empty and likely not useful for modeling.
Two options:
Drop it entirely, since it doesn't add much information.
Try imputing it based on category (Brand/Model) (if we find a strong pattern).
- Power (175 missing values), Engine (46 missing values), and Mileage (2 missing values)
These are important numerical features and we should not drop them.
I'll use mean/median imputation based on similar car types (e.g., impute by Brand, Model, or Fuel Type).
- Seats has 53 missing values
- Seat count is usually fixed per car model, likely safe to impute using mode (most frequent value).
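Two of these treatments are straightforward; a minimal sketch on a toy frame (values are made up) of dropping missing-target rows and mode-imputing Seats:

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "Seats": [5.0, np.nan, 7.0, 5.0],
    "Price": [1.75, 12.50, np.nan, 4.50],
})

# Seats: fill with the most frequent value (mode)
toy["Seats"] = toy["Seats"].fillna(toy["Seats"].mode()[0])

# Price is the target: rows without it can't be used for supervised training
toy_model = toy.dropna(subset=["Price"])
print(len(toy_model), toy["Seats"].isna().sum())
```

The same two calls apply to the real frame later; the row drop is deferred until just before modeling so EDA can still use those rows.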
# I'm now running the .duplicated().sum() function, to know if there are any duplicated records on the dataset
df.duplicated().sum()
0
- No duplicated rows in our dataset, so there's no need to drop any duplicate data.
Exploratory Data Analysis¶
- EDA is an important part of any project involving data.
- It is important to investigate and understand the data better before building a model with it.
- A few questions have been mentioned below which will help you approach the analysis in the right manner and generate insights from the data.
- A thorough analysis of the data, in addition to the questions mentioned below, should be done.
Questions:
- What is the summary statistics of the data? Explore summary statistics for numerical variables and the categorical variables
- Find out number of unique observations in each category of categorical columns? Write your findings/observations/insights
- Check the extreme values in different columns of the given data and write down the observations. Remove the data where the values are unrealistic
- What is the summary statistics of the data? Explore summary statistics for numerical variables and the categorical variables
# In order to start building some more intuition on our data, I'm now using the .describe() function, which will return a statistical summary of our columns
df.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| S.No. | 7253.0 | 3626.000000 | 2093.905084 | 0.00 | 1813.000 | 3626.00 | 5439.0000 | 7252.00 |
| Year | 7253.0 | 2013.365366 | 3.254421 | 1996.00 | 2011.000 | 2014.00 | 2016.0000 | 2019.00 |
| Kilometers_Driven | 7253.0 | 58699.063146 | 84427.720583 | 171.00 | 34000.000 | 53416.00 | 73000.0000 | 6500000.00 |
| Mileage | 7251.0 | 18.141580 | 4.562197 | 0.00 | 15.170 | 18.16 | 21.1000 | 33.54 |
| Engine | 7207.0 | 1616.573470 | 595.285137 | 72.00 | 1198.000 | 1493.00 | 1968.0000 | 5998.00 |
| Power | 7078.0 | 112.765214 | 53.493553 | 34.20 | 75.000 | 94.00 | 138.1000 | 616.00 |
| Seats | 7200.0 | 5.280417 | 0.809277 | 2.00 | 5.000 | 5.00 | 5.0000 | 10.00 |
| New_price | 1006.0 | 22.779692 | 27.759344 | 3.91 | 7.885 | 11.57 | 26.0425 | 375.00 |
| Price | 6019.0 | 9.479468 | 11.187917 | 0.44 | 3.500 | 5.64 | 9.9500 | 160.00 |
- Year (Manufacturing Year)
- Mean: 2013, Min: 1996, Max: 2019. Data includes cars ranging from 1996 to 2019.
- Some very old cars (pre-2000s) might be outliers or rare cases in the dataset.
- I'm going to convert Year into Car Age (Current Year - Manufacturing Year).
- Kilometers Driven
- Mean: ~58,699 km, but Max: 6,500,000 km. Huge outlier.
- Standard deviation (~84,428 km) is much larger than the mean, suggesting extreme values.
- Minimum: 171 km. Possibly a listing error or nearly new cars.
- I'll log-transform this variable to handle the skewness.
- I'll also cap extreme outliers (e.g., above the 99th percentile).
- Mileage (KMPL)
- Mean: 18.14 KMPL, but Min: 0.00. Indicates missing or incorrect data.
- Max: 33.54 KMPL. Seems realistic.
- I'll replace 0.00 values with mean/median based on Brand and Fuel Type.
- Engine (CC)
- Mean: 1616 CC, but Min: 72 CC, Max: 5998 CC.
- The 72 CC value seems highly unrealistic (possibly an error).
- I'll examine and replace low values using median by Brand/Model.
- Power (BHP)
- Mean: 112 BHP, Min: 34.2 BHP, Max: 616 BHP. High-end sports cars present.
- Possible data entry errors for very low power cars.
- I'll check for incorrect formatting (some datasets store Power as text like "110 bhp").
- I'm replacing missing values using mean/median by Brand/Model.
- Seats
- Mean: 5.28, Mostly 5-seaters, Min: 2, Max: 10.
- Higher values (7-10 seats) seem to be SUVs, Vans, or Buses.
- I'm going to impute missing values using mode (most common seat count per car type).
- New_Price
- Mean: 22.77 INR Lakhs (2.27M INR), but only 1006 values available (out of 7253), so 86% missing.
- I'll likely drop this column, unless we find a strong category-based imputation method.
- Used Car Price (Target Variable)
- Mean: 9.47 INR Lakhs (947,000 INR), but Max: 160 Lakhs (16M INR).
- High standard deviation (11.18) suggests a wide range in prices.
- I'll apply a log transformation to Price for better prediction accuracy.
- I'll keep the original Price for final evaluation (R² & RMSE).
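Of the treatments listed above, the 99th-percentile cap for Kilometers_Driven can be sketched like this (on synthetic readings; the exact threshold is still an open choice):

```python
import numpy as np
import pandas as pd

# Synthetic kilometer readings, plus one extreme value like the 6,500,000 km max we observed
rng = np.random.default_rng(42)
km = pd.Series(rng.integers(1_000, 150_000, size=1_000).astype(float))
km.iloc[0] = 6_500_000.0

cap = km.quantile(0.99)          # 99th-percentile threshold
km_capped = km.clip(upper=cap)   # values above the cap are pulled down to it
print(km_capped.max(), cap)
```

Capping (winsorizing) keeps the row while limiting the leverage of a single absurd reading; log-transforming afterwards further compresses the remaining spread.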
- Find out number of unique observations in each category of categorical columns? Write your findings/observations/insights
# Checking for unique values in categorical columns
categorical_columns = df.select_dtypes(include=["object"]).columns
print("\n**Unique Values in Categorical Columns:**")
for col in categorical_columns:
print(f"{col}: {df[col].nunique()} unique values")
**Unique Values in Categorical Columns:**
Name: 2041 unique values
Location: 11 unique values
Fuel_Type: 5 unique values
Transmission: 2 unique values
Owner_Type: 4 unique values
- Name (2041 unique values)
This confirms that each car model is highly unique so directly using this column won't be effective.
I'll extract the Brand from Name (e.g., "Maruti Wagon R" → Brand = "Maruti"), then drop the full Name column afterward, since individual model names won't be useful.
- Location (11 unique values)
Since 11 is a manageable number, we can apply get_dummies() encoding to handle locations properly.
I'll use One-Hot Encoding.
- Fuel_Type (5 unique values)
- This is a low number of categories, so we can use One-Hot Encoding. Categories: Petrol, Diesel, Electric, CNG, LPG.
- Transmission (2 unique values: Automatic, Manual)
Since it's binary (only 2 categories), we can use Label Encoding (0 = Manual, 1 = Automatic).
This prevents adding unnecessary dimensions to the dataset.
- Owner_Type (4 unique values: First, Second, Third, Fourth & Above)
- We have 4 distinct categories, One-Hot Encoding is the best approach to retain interpretability.
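The binary encoding for Transmission is just a mapping; a minimal sketch on toy values:

```python
import pandas as pd

toy = pd.DataFrame({"Transmission": ["Manual", "Automatic", "Manual"]})

# 0 = Manual, 1 = Automatic, as planned above
toy["Transmission_Encoded"] = toy["Transmission"].map({"Manual": 0, "Automatic": 1})
print(toy["Transmission_Encoded"].tolist())
```

A single 0/1 column carries the same information as two dummy columns would, without adding an extra dimension.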
Feature Engineering¶
The Name column contains both the brand and model, but the model names are too unique (2041 unique values).
By extracting the brand, we get a more generalizable categorical feature.
# Extracting the brand from the Name column. I'll extract the first word (brand name)
df["Brand"] = df["Name"].str.split().str[0]
# Dropping the original Name column
df.drop(columns=["Name"], inplace=True)
# Displaying the unique brands
print("Unique Brands in the Dataset:")
print(df["Brand"].nunique(), "unique brands")
print(df["Brand"].unique())
Unique Brands in the Dataset: 33 unique brands ['Maruti' 'Hyundai' 'Honda' 'Audi' 'Nissan' 'Toyota' 'Volkswagen' 'Tata' 'Land' 'Mitsubishi' 'Renault' 'Mercedes-Benz' 'BMW' 'Mahindra' 'Ford' 'Porsche' 'Datsun' 'Jaguar' 'Volvo' 'Chevrolet' 'Skoda' 'Mini' 'Fiat' 'Jeep' 'Smart' 'Ambassador' 'Isuzu' 'ISUZU' 'Force' 'Bentley' 'Lamborghini' 'Hindustan' 'OpelCorsa']
33 brands is a manageable number, meaning we can apply One-Hot Encoding if needed.
Some brand names appear inconsistently (e.g., "ISUZU" vs "Isuzu"). I should standardize brand names (convert everything to uppercase/lowercase).
Some brands have extra words (e.g., "Land" instead of "Land Rover"). I should inspect if any brands need corrections.
Standardizing Brand Names
# Standardizing brand names (convert all to uppercase)
df["Brand"] = df["Brand"].str.upper()
# Displaying unique brands
print("Unique Brands After Standardization:")
print(df["Brand"].nunique(), "unique brands")
print(df["Brand"].unique())
Unique Brands After Standardization: 32 unique brands ['MARUTI' 'HYUNDAI' 'HONDA' 'AUDI' 'NISSAN' 'TOYOTA' 'VOLKSWAGEN' 'TATA' 'LAND' 'MITSUBISHI' 'RENAULT' 'MERCEDES-BENZ' 'BMW' 'MAHINDRA' 'FORD' 'PORSCHE' 'DATSUN' 'JAGUAR' 'VOLVO' 'CHEVROLET' 'SKODA' 'MINI' 'FIAT' 'JEEP' 'SMART' 'AMBASSADOR' 'ISUZU' 'FORCE' 'BENTLEY' 'LAMBORGHINI' 'HINDUSTAN' 'OPELCORSA']
All names are now uppercase, so I've eliminated inconsistencies like ISUZU vs Isuzu.
The number of brands reduced from 33 → 32, meaning a duplicate or inconsistency was resolved.
One potential correction: "LAND" might actually be LAND ROVER, I will verify if that's correct.
# Checking all car names containing "LAND"
print(df[df["Brand"] == "LAND"]["Brand"].value_counts())
print(df[df["Brand"] == "LAND"])
Brand
LAND 67
Name: count, dtype: int64
S.No. Location Year Kilometers_Driven Fuel_Type Transmission \
13 13 Delhi 2014 72000 Diesel Automatic
14 14 Pune 2012 85000 Diesel Automatic
191 191 Coimbatore 2018 36091 Diesel Automatic
311 311 Delhi 2017 44000 Diesel Automatic
399 399 Hyderabad 2012 56000 Diesel Automatic
... ... ... ... ... ... ...
6434 6434 Kochi 2012 89190 Diesel Automatic
6717 6717 Kochi 2018 23342 Diesel Automatic
6857 6857 Mumbai 2011 87000 Diesel Automatic
7157 7157 Hyderabad 2015 49000 Diesel Automatic
7198 7198 Hyderabad 2012 147202 Diesel Automatic
Owner_Type Mileage Engine Power Seats New_price Price Brand
13 First 12.70 2179.0 187.70 5.0 NaN 27.00 LAND
14 Second 0.00 2179.0 115.00 5.0 NaN 17.50 LAND
191 First 12.70 2179.0 187.70 5.0 NaN 55.76 LAND
311 First 12.70 2179.0 187.70 5.0 NaN 44.00 LAND
399 First 12.70 2179.0 187.70 5.0 NaN 30.00 LAND
... ... ... ... ... ... ... ... ...
6434 Second 11.40 2993.0 245.41 7.0 NaN NaN LAND
6717 First 12.83 2179.0 147.50 5.0 NaN NaN LAND
6857 First 0.00 2179.0 115.00 5.0 NaN NaN LAND
7157 Second 12.70 2179.0 187.70 5.0 NaN NaN LAND
7198 First 11.80 2993.0 241.60 7.0 NaN NaN LAND
[67 rows x 14 columns]
We know from real-world knowledge that the only major brand that starts with LAND is LAND ROVER.
There is no separate ROVER brand in the dataset, which further confirms that LAND is likely an incorrect truncation.
Consistent engine sizes (2179 cc, 2993 cc) match known LAND ROVER models.
Power values (187.70 bhp, 147.50 bhp) are similar to LAND ROVER vehicles.
All cars labeled as LAND have high-end diesel engines, matching LAND ROVER's lineup.
# Correcting LAND to LAND ROVER
df["Brand"] = df["Brand"].replace("LAND", "LAND ROVER")
# Verifying the correction
print(df["Brand"].unique())
['MARUTI' 'HYUNDAI' 'HONDA' 'AUDI' 'NISSAN' 'TOYOTA' 'VOLKSWAGEN' 'TATA' 'LAND ROVER' 'MITSUBISHI' 'RENAULT' 'MERCEDES-BENZ' 'BMW' 'MAHINDRA' 'FORD' 'PORSCHE' 'DATSUN' 'JAGUAR' 'VOLVO' 'CHEVROLET' 'SKODA' 'MINI' 'FIAT' 'JEEP' 'SMART' 'AMBASSADOR' 'ISUZU' 'FORCE' 'BENTLEY' 'LAMBORGHINI' 'HINDUSTAN' 'OPELCORSA']
Missing value treatment¶
- I'll first treat the easiest missing values:
Seats - Imputing with mode, since seats are fixed per car type.
- For the features Mileage, Power, Engine, I'll check their distributions in EDA first in order to choose the most appropriate way to treat them.
# Imputing Seats with the mode; assignment avoids the inplace fillna on a column slice, which pandas is deprecating
df["Seats"] = df["Seats"].fillna(df["Seats"].mode()[0])
# Verifying missing values again
print("Missing Values After Seats Imputation:")
print(df.isnull().sum())
Missing Values After Seats Imputation:
S.No.                    0
Location                 0
Year                     0
Kilometers_Driven        0
Fuel_Type                0
Transmission             0
Owner_Type               0
Mileage                  2
Engine                  46
Power                  175
Seats                    0
New_price             6247
Price                 1234
Brand                    0
dtype: int64
- Seats has now 0 missing values.
I'm now deciding to drop the New_price field for the following reasons:
86% missing values (6247 out of 7253 rows): too much missing data to impute reliably.
Used car prices are largely independent of new car prices; dealers set used car prices based on market conditions, not just the original price.
This feature isn't essential for modeling, as it doesn't directly serve our price prediction goal.
Keeping it adds unnecessary complexity, while dropping it simplifies the dataset.
# Dropping the New_price column
df.drop(columns=["New_price"], inplace=True)
# Verifying that it's gone
print("Columns after dropping 'New_price':")
print(df.columns)
Columns after dropping 'New_price':
Index(['S.No.', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
'Price', 'Brand'],
dtype='object')
- As we can confirm from the resulting list of the columns of our dataset, the New_price field is no longer present in our data.
Univariate Analysis¶
Questions:
- Do univariate analysis for numerical and categorical variables?
- Check the distribution of the different variables. Are the distributions skewed?
- Do we need a log transformation? If so, for which variables?
- Perform the log transformation (if needed) and write down your observations.
- Do univariate analysis for numerical and categorical variables.
Let's start by analysing the numerical variables first
# Defining numerical columns to analyze
numerical_features = ["Year", "Kilometers_Driven", "Mileage", "Engine", "Power", "Seats", "Price"]
# Plotting histograms for numerical features
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_features, 1):
plt.subplot(3, 3, i) # Adjusting rows and columns based on number of features
sns.histplot(df[col], bins=30, kde=True)
plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.show()
- Year (Manufacturing Year)
Left-skewed (most cars are from recent years).
No need for log transformation.
Instead, we should convert this to "Car Age".
- Kilometers Driven
- Highly right-skewed with extreme outliers.
- Log transformation to make it more normally distributed.
- Mileage (KMPL)
- Looks roughly normal but has some low and high extremes.
- No log transformation needed.
- I'll handle missing values based on Fuel Type & Brand.
- Engine (CC)
- Right-skewed, showing different peaks for different car segments.
- Log transformation to normalize it.
- Power (BHP)
- Right-skewed with multiple peaks (different categories of vehicles).
- Log transformation to reduce skewness.
- Seats
- Categorical in nature (5-seaters dominate).
- No log transformation needed.
- Price (Target Variable)
- Highly right-skewed
- Log transformation needed.
- I'll keep the original Price for final R² and RMSE evaluation.
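Evaluating on the original scale then means undoing the transform with np.expm1, the inverse of np.log1p; a minimal sketch with illustrative prices:

```python
import numpy as np

# Illustrative original-scale prices (lakh INR)
price_actual = np.array([1.75, 12.50, 4.50])

price_log = np.log1p(price_actual)   # scale the model would be trained on
price_back = np.expm1(price_log)     # inverse transform before computing final metrics
print(np.round(price_back, 2))
```

log1p/expm1 are used instead of log/exp so that a price of 0 maps cleanly to 0 and back.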
As noted earlier, the Year variable (year of manufacture) can be transformed into a more meaningful feature, "age of the car", which better represents depreciation.
# Converting the Year column to Car Age and dropping the Year field
df["Car_Age"] = 2024 - df["Year"]
df.drop(columns=["Year"], inplace=True)
Now I'll apply the Log Transformations on the required features.
# Applying log transformation to skewed numerical features
df["Kilometers_Driven_Log"] = np.log1p(df["Kilometers_Driven"])
df["Engine_Log"] = np.log1p(df["Engine"])
df["Power_Log"] = np.log1p(df["Power"])
df["Price_Log"] = np.log1p(df["Price"]) # Target variable, but keep original
# Dropping original versions of transformed features (except for Price)
df.drop(columns=["Kilometers_Driven", "Engine", "Power"], inplace=True)
# Verifying changes
print(df.head())
S.No. Location Fuel_Type Transmission Owner_Type Mileage Seats Price \
0 0 Mumbai CNG Manual First 26.60 5.0 1.75
1 1 Pune Diesel Manual First 19.67 5.0 12.50
2 2 Chennai Petrol Manual First 18.20 5.0 4.50
3 3 Chennai Diesel Manual First 20.77 7.0 6.00
4 4 Coimbatore Diesel Automatic Second 15.20 5.0 17.74
Brand Car_Age Kilometers_Driven_Log Engine_Log Power_Log Price_Log
0 MARUTI 14 11.184435 6.906755 4.080246 1.011601
1 HYUNDAI 9 10.621352 7.367077 4.845761 2.602690
2 HONDA 13 10.736418 7.090077 4.496471 1.704748
3 MARUTI 12 11.373675 7.130099 4.497139 1.945910
4 AUDI 11 10.613271 7.585281 4.954418 2.930660
Bivariate Analysis¶
Questions:
- Plot a scatter plot for the log transformed values(if log_transformation done in previous steps)?
- What can we infer from the correlation heatmap? Is there correlation between the dependent and independent variables?
- Plot a box plot for target variable and categorical variable 'Location' and write your observations?
- Plot a scatter plot for the log transformed values(if log_transformation done in previous steps)?
# Plotting Scatter Plots for Log-Transformed Features
numerical_features = ["Kilometers_Driven_Log", "Engine_Log", "Power_Log", "Car_Age"]
plt.figure(figsize=(12, 10))
for i, col in enumerate(numerical_features, 1):
plt.subplot(2, 2, i)
sns.scatterplot(x=df[col], y=df["Price_Log"], alpha=0.5)
plt.title(f"Scatter Plot: Price_Log vs {col}")
plt.tight_layout()
plt.show()
- Price_Log vs Kilometers_Driven_Log
Weak negative correlation, as Kilometers_Driven_Log increases, Price_Log slightly decreases.
This makes sense, as cars with higher mileage usually have lower resale value. But there's a lot of spread, meaning mileage alone isn't a strong predictor.
- Price_Log vs Engine_Log
- Strong positive correlation, bigger engines tend to have higher prices.
- This aligns with expectations: luxury and performance cars usually have larger engines.
- Price_Log vs Power_Log
- Strongest positive correlation among all features
- More power directly influences price since powerful cars are more expensive.
- I'll have to check if Power_Log and Engine_Log are highly correlated (multicollinearity risk).
- Price_Log vs Car_Age
- Clear negative correlation, older cars have lower prices.
- What can we infer from the correlation heatmap? Is there correlation between the dependent and independent variables?
# Plotting the correlation heatmap
numerical_df = df.select_dtypes(include=["number"])
plt.figure(figsize=(12, 8))
sns.heatmap(numerical_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()
- Price_Log (Target Variable)
Strong positive correlation with Power_Log (0.80), higher power leads to higher prices.
High correlation with Engine_Log (0.70), bigger engines also lead to higher prices, but slightly less than power.
Moderate negative correlation with Car_Age (-0.47), older cars have lower prices.
Weak correlation with Kilometers_Driven_Log (-0.20), Mileage affects Price, but not as strongly as power and engine size.
- Power_Log & Engine_Log (0.88)
- These two are highly correlated, meaning we might have multicollinearity. I'll need to check Variance Inflation Factor when preparing data for modeling.
- Mileage has a moderate negative correlation with Power and Engine
- Mileage is negatively correlated with Power_Log (-0.55) and Engine_Log (-0.59). This makes sense since cars with higher mileage tend to be less powerful
- Plot a box plot for target variable and categorical variable 'Location' and write your observations.
# Plotting Box Plot for Price_Log vs Location
plt.figure(figsize=(15, 6))
sns.boxplot(x=df["Location"], y=df["Price_Log"])
plt.xticks(rotation=90)
plt.title("Box Plot: Price_Log vs Location")
plt.show()
- Price Variation Across Locations
Bangalore and Coimbatore have the highest median Price_Log values.
Kolkata, Hyderabad, and Jaipur have the lowest median used car prices.
This suggests that used car prices are higher in certain cities, possibly due to demand, purchasing power, or local market trends.
- Presence of Outliers
All locations have significant outliers on the higher end.
This is expected because luxury and high-performance cars exist in every city, but they are not the majority.
We might need to handle extreme outliers carefully when training the model.
- Overall Distribution Similarity
Most locations have a similar Interquartile Range, meaning price distribution is somewhat consistent across cities.
However, Bangalore and Coimbatore show greater price variation, suggesting a mix of both affordable and luxury vehicles.
Missing value treatment (Part 2)¶
- Mileage (Missing: 2 values)
Since there are only 2 missing values, we can impute them using the median Mileage of the same Fuel_Type.
Because fuel type affects mileage (Diesel cars have better mileage than Petrol cars).
- Engine_Log (Missing: 46 values)
I'll impute using the median Engine_Log based on Brand.
Because different brands manufacture cars with different engine sizes.
- Power_Log (Missing: 175 values)
- Similar to Engine_Log, we'll impute missing values using the median Power_Log based on Brand.
# Imputing Mileage based on Fuel_Type; assignment avoids the inplace fillna on a column slice
df["Mileage"] = df["Mileage"].fillna(df.groupby("Fuel_Type")["Mileage"].transform("median"))
# Imputing Engine_Log based on Brand
df["Engine_Log"] = df["Engine_Log"].fillna(df.groupby("Brand")["Engine_Log"].transform("median"))
# Imputing Power_Log based on Brand
df["Power_Log"] = df["Power_Log"].fillna(df.groupby("Brand")["Power_Log"].transform("median"))
# Verifying that all missing values are handled
print("Missing Values After Imputation:\n", df.isnull().sum())
Missing Values After Imputation:
S.No.                       0
Location                    0
Fuel_Type                   0
Transmission                0
Owner_Type                  0
Mileage                     2
Seats                       0
Price                    1234
Brand                       0
Car_Age                     0
Kilometers_Driven_Log       0
Engine_Log                  0
Power_Log                   2
Price_Log                1234
dtype: int64
Mileage is still missing (2 values). The group-median imputation should have filled these, so I'll double-check why it didn't and reapply if necessary.
Power_Log still has 2 missing values. I'll investigate whether these cars belong to a brand where all values were missing (so no median could be computed).
Price and Price_Log still have 1234 missing values; I'll drop these rows before modeling.
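The leftover NaNs can be reproduced in isolation. A minimal sketch (synthetic data; the column names mirror the notebook's, but the values are made up) showing that `transform("median")` returns NaN for a group whose values are all missing, so the fill is a no-op for that group:

```python
import pandas as pd
import numpy as np

# Toy frame: the "Electric" group has no observed Mileage at all,
# so its group median is NaN and the fillna below cannot fill it.
toy = pd.DataFrame({
    "Fuel_Type": ["Petrol", "Petrol", "Diesel", "Electric", "Electric"],
    "Mileage":   [17.0,     np.nan,   22.0,     np.nan,     np.nan],
})

group_median = toy.groupby("Fuel_Type")["Mileage"].transform("median")
toy["Mileage"] = toy["Mileage"].fillna(group_median)

# The Petrol NaN is filled with 17.0, but the two Electric rows stay NaN.
print(toy["Mileage"].isnull().sum())  # 2
```

This is exactly why the Electric-car Mileage values survive the first imputation pass.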
# Re-checking Mileage and Power_Log for missing values
print(df[df["Mileage"].isnull()]) # Check the rows where Mileage is still missing
print(df[df["Power_Log"].isnull()]) # Check missing Power_Log rows
S.No. Location Fuel_Type Transmission Owner_Type Mileage Seats Price \
4446 4446 Chennai Electric Automatic First NaN 5.0 13.00
4904 4904 Mumbai Electric Automatic First NaN 5.0 12.75
Brand Car_Age Kilometers_Driven_Log Engine_Log Power_Log \
4446 MAHINDRA 8 10.819798 4.290459 3.737670
4904 TOYOTA 13 10.691968 7.494986 4.304065
Price_Log
4446 2.639057
4904 2.621039
S.No. Location Fuel_Type Transmission Owner_Type Mileage Seats Price \
915 915 Pune Diesel Automatic Second 0.0 2.0 3.0
6216 6216 Pune Diesel Manual Second 14.1 5.0 NaN
Brand Car_Age Kilometers_Driven_Log Engine_Log Power_Log \
915 SMART 16 11.542494 6.684612 NaN
6216 HINDUSTAN 28 11.082158 7.598900 NaN
Price_Log
915 1.386294
6216 NaN
- Mileage (2 missing values), both cars are Electric (Mahindra & Toyota)
- Since electric cars don't have a "Mileage" value in the traditional sense, we can impute them with the median of other Electric cars (if available). If not, we can use a default estimate.
- Power_Log (2 missing values), both cars belong to rare brands (SMART and HINDUSTAN).
- These brands likely had very few entries, so their median was NaN. I'll impute them with the median Power_Log of all cars instead.
# Fixing Mileage for Electric cars
# Note: if every Electric car is missing Mileage, this group median is itself NaN,
# so the fill below is a no-op and those rows stay missing.
electric_mask = df["Fuel_Type"] == "Electric"
electric_median_mileage = df.loc[electric_mask, "Mileage"].median()
df.loc[electric_mask, "Mileage"] = df.loc[electric_mask, "Mileage"].fillna(electric_median_mileage)
# Fixing Power_Log for rare brands using overall median
overall_median_power = df["Power_Log"].median()
df["Power_Log"] = df["Power_Log"].fillna(overall_median_power)
# Verifying if all missing values are gone
print("Missing Values After Final Fix:\n", df.isnull().sum())
Missing Values After Final Fix:
S.No.                    0
Location                 0
Fuel_Type                0
Transmission             0
Owner_Type               0
Mileage                  2
Seats                    0
Price                 1234
Brand                    0
Car_Age                  0
Kilometers_Driven_Log    0
Engine_Log               0
Power_Log                0
Price_Log             1234
dtype: int64
- Two Mileage values are still missing even after the logical imputation attempt (the Electric-car median was itself NaN), so I'll drop those rows; losing only 2 rows will not affect the dataset significantly.
# Dropping rows where Mileage is still missing
df.dropna(subset=["Mileage"], inplace=True)
# Dropping rows where Price is missing
df.dropna(subset=["Price"], inplace=True)
# Verifying that all missing values are gone
print("Final Missing Value Check:\n", df.isnull().sum())
Final Missing Value Check:
S.No.                    0
Location                 0
Fuel_Type                0
Transmission             0
Owner_Type               0
Mileage                  0
Seats                    0
Price                    0
Brand                    0
Car_Age                  0
Kilometers_Driven_Log    0
Engine_Log               0
Power_Log                0
Price_Log                0
dtype: int64
Important Insights from EDA and Data Preprocessing¶
What are the most important observations and insights from the data, based on the EDA and Data Preprocessing performed?
- Price Drivers: What Affects Car Prices the Most?
Power_Log & Engine_Log strongly correlate with Price_Log
Cars with higher power and bigger engines tend to have higher prices. (Correlation: Power_Log = 0.80, Engine_Log = 0.70 with Price_Log)
Car Age has a significant negative correlation with Price
Older cars are cheaper, depreciation plays a key role. (Correlation: Car_Age = -0.47 with Price_Log)
Kilometers Driven has only a weak negative correlation with Price
Higher mileage slightly reduces price, but not as much as age or engine power. (Correlation: Kilometers_Driven_Log = -0.20 with Price_Log)
Fuel Type & Transmission influence pricing
Diesel & Automatic cars generally have higher prices than Petrol & Manual cars.
- Market Trends: How Do Prices Vary Across Locations?
Bangalore & Coimbatore have the highest used car prices.
Suggests higher demand or luxury vehicle preference in these cities.
Kolkata, Hyderabad, and Jaipur have the lowest median prices.
Local market conditions, affordability, and demand could be factors.
- Data Quality & Fixes
We've handled missing values accordingly:
Used median values grouped by relevant features (Fuel_Type, Brand).
For extreme cases (rare brands, electric cars), we used a logical approach.
Dropped New_Price as it had too many missing values.
Log transformation improved the feature distributions.
Price, Kilometers Driven, Engine, and Power were highly right-skewed, and the log transform brought them much closer to normal.
Final dataset is 100% clean and ready for modeling.
Building Various Models¶
- What we want to predict is the "Price". Although we created a normalized version (Price_Log), we will train on the original Price and keep Price_Log for diagnostics.
- Before we proceed to the model, we'll have to encode categorical features. We will drop categorical features like Name.
- We'll split the data into train and test, to be able to evaluate the model that we build on the train data.
- Build Regression models using train data.
- Evaluate the model performance.
Encoding¶
Since we have categorical features (Location, Fuel_Type, Transmission, Owner_Type, Brand), we need to convert them into numerical values so models can understand them.
We'll use One-Hot Encoding (pd.get_dummies()) with drop_first=True for all of these categorical variables.
# One-Hot Encoding for categorical variables
df_encoded = pd.get_dummies(df, columns=["Location", "Fuel_Type", "Transmission", "Owner_Type", "Brand"], drop_first=True)
# Verify the new dataset
print("Encoded DataFrame Shape:", df_encoded.shape)
print("First Rows After Encoding:\n", df_encoded.head())
Encoded DataFrame Shape: (6017, 55)
First Rows After Encoding:
S.No. Mileage Seats Price Car_Age Kilometers_Driven_Log Engine_Log \
0 0 26.60 5.0 1.75 14 11.184435 6.906755
1 1 19.67 5.0 12.50 9 10.621352 7.367077
2 2 18.20 5.0 4.50 13 10.736418 7.090077
3 3 20.77 7.0 6.00 12 11.373675 7.130099
4 4 15.20 5.0 17.74 11 10.613271 7.585281
Power_Log Price_Log Location_Bangalore Location_Chennai \
0 4.080246 1.011601 False False
1 4.845761 2.602690 False False
2 4.496471 1.704748 False True
3 4.497139 1.945910 False True
4 4.954418 2.930660 False False
Location_Coimbatore Location_Delhi Location_Hyderabad Location_Jaipur \
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 True False False False
Location_Kochi Location_Kolkata Location_Mumbai Location_Pune \
0 False False True False
1 False False False True
2 False False False False
3 False False False False
4 False False False False
Fuel_Type_Diesel Fuel_Type_LPG Fuel_Type_Petrol Transmission_Manual \
0 False False False True
1 True False False True
2 False False True True
3 True False False True
4 True False False False
Owner_Type_Fourth & Above Owner_Type_Second Owner_Type_Third Brand_AUDI \
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False True False True
Brand_BENTLEY Brand_BMW Brand_CHEVROLET Brand_DATSUN Brand_FIAT \
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
Brand_FORCE Brand_FORD Brand_HONDA Brand_HYUNDAI Brand_ISUZU \
0 False False False False False
1 False False False True False
2 False False True False False
3 False False False False False
4 False False False False False
Brand_JAGUAR Brand_JEEP Brand_LAMBORGHINI Brand_LAND ROVER \
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
Brand_MAHINDRA Brand_MARUTI Brand_MERCEDES-BENZ Brand_MINI \
0 False True False False
1 False False False False
2 False False False False
3 False True False False
4 False False False False
Brand_MITSUBISHI Brand_NISSAN Brand_PORSCHE Brand_RENAULT Brand_SKODA \
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
Brand_SMART Brand_TATA Brand_TOYOTA Brand_VOLKSWAGEN Brand_VOLVO
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
The dataset now has 55 columns, which means our categorical features have been effectively converted into numerical form.
One-Hot Encoding worked correctly:
Location, Fuel_Type, Transmission, Owner_Type, and Brand have been transformed into binary indicator columns (True/False).
drop_first=True ensured that we avoided the dummy variable trap (perfectly collinear, redundant indicator columns).
All categorical variables are now numerical.
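A tiny sketch (a synthetic one-column frame, not the notebook's df) of how drop_first=True collapses a 2-category column into a single baseline-relative indicator:

```python
import pandas as pd

toy = pd.DataFrame({"Transmission": ["Manual", "Automatic", "Manual"]})

# With drop_first=True, a 2-category column becomes one indicator column:
# Transmission_Manual. "Automatic" is the implied baseline (all False),
# which avoids the dummy variable trap (perfectly collinear columns).
encoded = pd.get_dummies(toy, columns=["Transmission"], drop_first=True)
print(encoded.columns.tolist())  # ['Transmission_Manual']
```

The same logic applies to Brand and Location: one category per variable is absorbed into the baseline.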
Split the Data¶
Question:
- Why should we drop 'Name', 'Price', 'Price_Log', and 'Kilometers_Driven' from X before splitting?
- Name (Already Dropped Earlier)
This column contained both Brand and Model names.
We've extracted Brand as a separate categorical feature.
Model names were too unique (~2000 values), making them useless for prediction.
Already dropped earlier in Feature Engineering.
- Price (Target Variable)
We're predicting Price, so it must be in y, NOT in X.
Including it in X would leak the target into the model, making predictions meaningless.
- Price_Log (Transformed Target Variable)
We've applied log transformation on Price to normalize it.
But we only use it for model performance analysis, like checking normality, not as an input feature.
We train the model on the original Price (y); if we ever model Price_Log instead, predictions would need to be converted back to the original scale.
- Kilometers_Driven (Replaced by Kilometers_Driven_Log)
We've log-transformed Kilometers_Driven into Kilometers_Driven_Log to reduce skewness.
The original Kilometers_Driven is no longer useful, as the model should use the transformed version.
Already dropped earlier in Feature Engineering.
# Defining features and target
X = df_encoded.drop(columns=["Price", "Price_Log"])
y = df_encoded["Price"] # Target variable
# Train-test split (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Verifying split
print(f"Training Set: {X_train.shape}, Test Set: {X_test.shape}")
Training Set: (4813, 53), Test Set: (1204, 53)
For Regression Problems, some of the algorithms used are :
1) Linear Regression
2) Ridge / Lasso Regression
3) Decision Trees
4) Random Forest
1) Linear Regression
# Initializing the Linear Regression model
lin_reg = LinearRegression()
# Training the model
lin_reg.fit(X_train, y_train)
# Making predictions
y_train_pred = lin_reg.predict(X_train)
y_test_pred = lin_reg.predict(X_test)
# Evaluating model performance
# R2
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
# RMSE
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
# MAE
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
# MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
# Displaying results
print("Linear Regression Model Performance:")
print(f"R² Score (Train): {train_r2:.4f}")
print(f"R² Score (Test) : {test_r2:.4f}")
print(f"RMSE (Train) : {train_rmse:.4f}")
print(f"RMSE (Test) : {test_rmse:.4f}")
print(f"MAE (Train) : {train_mae:.4f}")
print(f"MAE (Test) : {test_mae:.4f}")
print(f"MAPE (Train) : {train_mape:.2f}%")
print(f"MAPE (Test) : {test_mape:.2f}%")
Linear Regression Model Performance:
R² Score (Train): 0.7557
R² Score (Test) : 0.7588
RMSE (Train) : 5.5733
RMSE (Test) : 5.3205
MAE (Train) : 3.0761
MAE (Test) : 3.0943
MAPE (Train) : 60.80%
MAPE (Test) : 65.63%
- R² Score (Train: 0.7557, Test: 0.7588)
This means our model explains ~75.6% of the variance in car prices.
The train and test R² are very close, meaning no overfitting.
- RMSE (Train: 5.5733, Test: 5.3205)
On average, our model predicts used car prices with an error of ~5.3 lakh.
This is quite reasonable, but we'll try reducing the error further using other models.
- MAE (Train: 3.08, Test: 3.09)
On average, our model predicts car prices within about 3.1 lakh of the actual price.
This is reasonable for a baseline model, but we'll aim to reduce it.
- MAPE (Train: 60.80%, Test: 65.63%)
This means that, on average, our model's predictions are ~60-65% off from the actual prices.
This is quite high, indicating that a linear model may not fully capture complex pricing patterns.
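The gap between a modest MAE (~3 lakh) and a large MAPE (~65%) is typical when many cars are cheap, because MAPE divides each error by the true price. A minimal illustration with made-up prices:

```python
import numpy as np

# MAPE divides each error by the true value, so the same absolute error
# on a cheap car contributes far more than on an expensive one.
y_true = np.array([1.0, 20.0])   # prices in lakh: one cheap car, one expensive
y_pred = np.array([2.0, 21.0])   # both predictions off by exactly 1 lakh

ape = np.abs((y_true - y_pred) / y_true) * 100
print(ape)          # [100.   5.] -- same 1-lakh error, wildly different %
print(ape.mean())   # 52.5 -- the cheap car dominates the MAPE
```

So a high MAPE here partly reflects the price distribution, not only model quality.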
Checking Linear Regression Assumptions¶
Checking the mean of the residuals¶
# Predicting on train data to get residuals
y_train_pred = lin_reg.predict(X_train)
residuals = y_train - y_train_pred
# Checking the Mean of Residuals
mean_residuals = np.mean(residuals)
print(f"Mean of Residuals: {mean_residuals:.5f}")
Mean of Residuals: 0.00000
- Mean of Residuals = 0.00000, which confirms that the residuals are centered around zero. This is expected for OLS with an intercept and indicates the model is unbiased on the training data.
Homoscedasticity Check¶
# Homoscedasticity Check - Residuals vs Predicted
plt.figure(figsize=(6, 4))
sns.scatterplot(x=y_train_pred, y=residuals, alpha=0.5)
plt.axhline(y=0, color="r", linestyle="--", linewidth=2)
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Homoscedasticity Check: Residuals vs Predicted")
plt.show()
The residuals do not appear to be randomly scattered around the zero line.
Instead, there is a visible funnel shape, meaning that the variance of residuals increases with higher predicted values.
This indicates heteroscedasticity, which means our model's errors are not constant across all predictions.
The model might not be capturing variance well, especially for higher-priced cars.
This violates the assumption of constant variance and suggests that some transformations or alternative modeling techniques might improve performance.
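The funnel shape can also be checked numerically rather than only visually, e.g. by comparing residual spread across prediction quantiles. A sketch on synthetic heteroscedastic residuals (made-up data, not the notebook's):

```python
import numpy as np

rng = np.random.default_rng(42)

# Synthetic heteroscedastic fit: residual spread grows with the prediction,
# mimicking the funnel shape seen in the plot above.
y_pred = rng.uniform(1, 40, 2000)             # predicted prices (lakh)
residuals = rng.normal(0, 0.15 * y_pred)      # noise scale grows with prediction

# Compare residual std in the cheapest vs priciest quartile of predictions.
lo, hi = np.quantile(y_pred, [0.25, 0.75])
std_low = residuals[y_pred <= lo].std()
std_high = residuals[y_pred >= hi].std()
print(std_low < std_high)  # True -- variance rises with the prediction
```

Applied to the real residuals, a large ratio between the two stds would confirm the visual diagnosis.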
Linearity of Variables¶
# Plotting a Q-Q plot to check whether the residuals align with a normal distribution
plt.figure(figsize=(6, 4))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title("Q-Q Plot for Linearity Check")
plt.show()
The residuals should lie on the red diagonal line if they follow a normal distribution.
However, our residuals deviate significantly at both tails, especially at higher quantiles (right side).
The S-shape pattern indicates that the errors are not normally distributed.
This suggests the presence of non-linearity in the relationship between features and the target variable.
The model may be underestimating or overestimating extreme values.
This violates the assumption of normality of residuals.
Normality of Error Terms¶
# Plotting a histogram to check if the errors follow a normal distribution
plt.figure(figsize=(6, 4))
sns.histplot(residuals, bins=30, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Normality of Residuals")
plt.show()
The histogram shows that most residuals are clustered around zero, but there is a sharp peak in the center, which indicates high kurtosis (a heavier central concentration).
The distribution also has long tails, suggesting outliers and non-normality.
The right tail is significantly stretched, which means some predictions are much higher than expected.
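Sample skewness and excess kurtosis give numbers to back up the histogram's visual diagnosis. A hedged sketch using synthetic heavy-tailed residuals (Student's t) in place of the notebook's:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)

# Synthetic heavy-tailed residuals (Student's t, df=3) standing in for the
# model residuals. For a normal distribution, skewness and excess kurtosis
# are both ~0; a sharp peak with long tails pushes kurtosis well above 0.
heavy = rng.standard_t(df=3, size=5000)

print("skewness       :", stats.skew(heavy))
print("excess kurtosis:", stats.kurtosis(heavy))  # >> 0 for heavy tails
```

Running the same two statistics on the real residuals would quantify the "sharp peak, long tails" observation above.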
How is the model performing after cross-validation?¶
# Cross-Validation performance check
# Performing 5-Fold Cross-Validation (scoring based on R²)
cv_scores = cross_val_score(lin_reg, X_train, y_train, cv=5, scoring='r2')
# Displaying cross-validation results
print(f"\nCross-Validation Results:")
print(f"Mean R² Score: {cv_scores.mean():.4f}")
print(f"Standard Deviation of R²: {cv_scores.std():.4f}")
print(f"All R² Scores: {cv_scores}")
Cross-Validation Results:
Mean R² Score: 0.7358
Standard Deviation of R²: 0.0281
All R² Scores: [0.74678498 0.76442222 0.70710684 0.76301225 0.69773929]
Mean R² Score: 0.7358
The model explains about 73.58% of the variance in the held-out folds on average.
This is consistent with our initial test R² score (0.7588), confirming that our linear regression model is relatively stable.
Standard Deviation of R²: 0.0281
A lower standard deviation indicates that the model's performance is fairly consistent across different validation folds. However, 0.0281 is not negligible, meaning some variation exists across different train-test splits.
The individual R² scores across the 5 folds range from 0.6977 to 0.7644, showing some fluctuation in predictive power.
The lower score (0.6977) suggests that in some folds, the model struggles with certain subsets of the data.
2) Ridge / Lasso Regression
I'll now train Ridge and Lasso Regression models.
Ridge Regression adds L2 regularization, which helps reduce overfitting by penalizing large coefficients.
Lasso Regression adds L1 regularization, which shrinks some coefficients to zero, effectively performing feature selection.
Both models help with multicollinearity, which matters here since Power_Log and Engine_Log are highly correlated.
# Training Ridge Regression
ridge = Ridge(alpha=1.0) # Alpha is the regularization strength
ridge.fit(X_train, y_train)
y_train_pred_ridge = ridge.predict(X_train)
y_test_pred_ridge = ridge.predict(X_test)
# Training Lasso Regression
lasso = Lasso(alpha=0.01) # Alpha should be small for Lasso to avoid aggressive feature elimination
lasso.fit(X_train, y_train)
y_train_pred_lasso = lasso.predict(X_train)
y_test_pred_lasso = lasso.predict(X_test)
# Evaluating both models
def evaluate_model(model_name, y_train, y_train_pred, y_test, y_test_pred):
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
print(f"\n{model_name} Model Performance:")
print(f"R² Score (Train): {train_r2:.4f}")
print(f"R² Score (Test) : {test_r2:.4f}")
print(f"RMSE (Train) : {train_rmse:.4f}")
print(f"RMSE (Test) : {test_rmse:.4f}")
print(f"MAE (Train) : {train_mae:.4f}")
print(f"MAE (Test) : {test_mae:.4f}")
print(f"MAPE (Train) : {train_mape:.2f}%")
print(f"MAPE (Test) : {test_mape:.2f}%")
# Printing results
evaluate_model("Ridge Regression", y_train, y_train_pred_ridge, y_test, y_test_pred_ridge)
evaluate_model("Lasso Regression", y_train, y_train_pred_lasso, y_test, y_test_pred_lasso)
Ridge Regression Model Performance:
R² Score (Train): 0.7519
R² Score (Test) : 0.7596
RMSE (Train) : 5.6158
RMSE (Test) : 5.3118
MAE (Train) : 3.1204
MAE (Test) : 3.1185
MAPE (Train) : 61.73%
MAPE (Test) : 66.54%

Lasso Regression Model Performance:
R² Score (Train): 0.7469
R² Score (Test) : 0.7580
RMSE (Train) : 5.6724
RMSE (Test) : 5.3294
MAE (Train) : 3.1705
MAE (Test) : 3.1508
MAPE (Train) : 62.94%
MAPE (Test) : 67.32%
R² Scores (Train & Test) are nearly identical across all models.
Linear Regression: Train: 0.7557, Test: 0.7588
Ridge Regression: Train: 0.7519, Test: 0.7596
Lasso Regression: Train: 0.7469, Test: 0.7580
Conclusion: Regularization (Ridge & Lasso) didn't significantly improve generalization.
RMSE, MAE, and MAPE are also very close.
Lasso performs slightly worse in Train R² (0.7469) because Lasso shrinks some coefficients to zero, meaning it's removing some features.
Ridge performs almost exactly like Linear Regression.
MAPE is still high (~60-67%) in all models.
This suggests that a purely linear model may not fully capture the complexities of used car pricing.
- Ridge and Lasso didn't improve much.
- Since our linear models are hitting a performance ceiling, let's train Decision Trees & Random Forest, which handle non-linear relationships better.
- We should check feature importance in Lasso: it removes features by shrinking their coefficients to zero, so let's print which features were eliminated.
# Getting feature names
feature_names = X_train.columns
# Getting Lasso coefficients
lasso_coeffs = lasso.coef_
# Identifying features with zero coefficients (dropped by Lasso)
dropped_features = feature_names[lasso_coeffs == 0]
# Printing dropped Features
print("Features Dropped by Lasso Regression:")
print(dropped_features)
Features Dropped by Lasso Regression:
Index(['Location_Pune', 'Fuel_Type_Diesel', 'Fuel_Type_LPG',
'Owner_Type_Fourth & Above', 'Brand_BENTLEY', 'Brand_DATSUN',
'Brand_FIAT', 'Brand_FORCE', 'Brand_ISUZU', 'Brand_JEEP',
'Brand_MITSUBISHI', 'Brand_SMART', 'Brand_TOYOTA'],
dtype='object')
Most of the dropped features are categorical indicators (Location, Fuel_Type, Owner_Type, and specific Brands).
Some brands (e.g., Bentley, Jeep, Mitsubishi) were dropped, which means their effect on price is either negligible or already explained by other features.
Location_Pune was dropped, possibly because location isn't a strong predictor of price once features like Power, Engine, and Car Age are present.
Fuel_Type_Diesel and Fuel_Type_LPG were removed, which suggests that fuel type doesn't significantly impact used car prices when other variables are considered.
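Rather than hand-picking alpha=0.01, `LassoCV` selects the regularization strength by cross-validation, which also changes which features get zeroed out. A sketch on synthetic data (the feature counts and noise level are illustrative, not the notebook's):

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV

# Synthetic stand-in: 20 features, only 5 of them informative.
X, y = make_regression(n_samples=400, n_features=20, n_informative=5,
                       noise=5.0, random_state=42)

# LassoCV picks alpha by cross-validation instead of a hand-set 0.01.
lasso_cv = LassoCV(cv=5, random_state=42).fit(X, y)
print(lasso_cv.alpha_)              # data-driven regularization strength
print((lasso_cv.coef_ == 0).sum())  # how many features were zeroed out
```

A CV-chosen alpha makes the "which features did Lasso drop" analysis above less sensitive to an arbitrary regularization setting.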
3) Decision Trees
# Training Decision Tree Model
dt_model = DecisionTreeRegressor(random_state=42, max_depth=10) # Limiting depth to prevent overfitting
dt_model.fit(X_train, y_train)
# Making predictions
y_train_pred_dt = dt_model.predict(X_train)
y_test_pred_dt = dt_model.predict(X_test)
# Evaluating model performance with the evaluate_model helper defined in the Ridge/Lasso section
# Print Decision Tree Results
evaluate_model("Decision Tree", y_train, y_train_pred_dt, y_test, y_test_pred_dt)
Decision Tree Model Performance:
R² Score (Train): 0.9717
R² Score (Test) : 0.7441
RMSE (Train) : 1.8960
RMSE (Test) : 5.4799
MAE (Train) : 1.0258
MAE (Test) : 2.0590
MAPE (Train) : 14.22%
MAPE (Test) : 26.36%
- R² Score (Train: 0.9717, Test: 0.7441)
- The model fits the training data extremely well (~97% of variance explained). But there's a gap between Train & Test R², suggesting some overfitting.
- RMSE (Train: 1.8960, Test: 5.4799)
- The training error is very low, but the test error is higher, indicating potential overfitting.
- MAE & MAPE are significantly better than Linear Models.
MAE dropped from ~3.1 (Linear Regression) to ~2.06 on the test set, a clear improvement over the linear models.
MAPE is much lower (~26.36% vs ~65% for Linear Regression), so the predictions are now much closer to actual prices.
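Besides limiting max_depth (and the min_samples settings tuned below), scikit-learn trees also support cost-complexity pruning via ccp_alpha, which trims branches whose complexity isn't worth their impurity gain. A sketch on synthetic data (the ccp_alpha value is illustrative for this data scale):

```python
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Synthetic stand-in for the car data.
X, y = make_regression(n_samples=600, n_features=8, noise=15.0, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# An unconstrained tree grows one leaf per training sample for continuous
# targets; cost-complexity pruning (ccp_alpha > 0) collapses weak branches.
full = DecisionTreeRegressor(random_state=42).fit(X_tr, y_tr)
pruned = DecisionTreeRegressor(random_state=42, ccp_alpha=50.0).fit(X_tr, y_tr)

print("leaves (full)  :", full.get_n_leaves())
print("leaves (pruned):", pruned.get_n_leaves())
```

Pruning is an alternative (or complement) to the depth/leaf-size grid search that follows.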
Hyperparameter Tuning: Decision Tree¶
# Defining parameter grid
param_grid = {
"max_depth": [5, 7, 10, 15],
"min_samples_split": [5, 10, 20],
"min_samples_leaf": [5, 10, 15]
}
# Initializing decision tree model
dt = DecisionTreeRegressor(random_state=42)
# Running GridSearchCV to find the best parameters
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring="r2", n_jobs=-1)
grid_search.fit(X_train, y_train)
# Best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
# Training Decision Tree with best parameters
best_dt = DecisionTreeRegressor(**best_params, random_state=42)
best_dt.fit(X_train, y_train)
# Making Predictions
y_train_pred_best_dt = best_dt.predict(X_train)
y_test_pred_best_dt = best_dt.predict(X_test)
# Evaluating Tuned Model
evaluate_model("Tuned Decision Tree", y_train, y_train_pred_best_dt, y_test, y_test_pred_best_dt)
Best Parameters: {'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 20}
Tuned Decision Tree Model Performance:
R² Score (Train): 0.9208
R² Score (Test) : 0.8195
RMSE (Train) : 3.1737
RMSE (Test) : 4.6028
MAE (Train) : 1.4059
MAE (Test) : 1.9617
MAPE (Train) : 14.97%
MAPE (Test) : 24.66%
R² Score (Test: 0.8195 vs 0.7441 before), big improvement.
The model is now explaining ~82% of the variance in car prices.
Overfitting is reduced (Train R² went from 0.9717 to 0.9208, meaning the model is no longer memorizing the training data as much).
RMSE (Test: 4.6028 vs 5.4799 before), means Lower Error.
Our model's average price prediction error dropped by almost 90,000 INR.
MAE (Test: 1.9617) and MAPE (Test: 24.66%), meaning Lower Errors.
MAE improved slightly (from 2.06 to 1.96 lakh).
MAPE dropped from 26.36% to 24.66%, so the model's percentage error fell by about 1.7 points.
# Plotting the Decision Tree
# Setting figure size
plt.figure(figsize=(20, 10))
# Plot the decision tree (limiting depth for better visualization)
plot_tree(best_dt, feature_names=X_train.columns, filled=True, rounded=True, max_depth=4) # Adjust max_depth if needed
# Showing the plot
plt.title("Decision Tree Visualization (Pruned)")
plt.show()
Feature Importance
# Extracting feature importances from the tuned decision tree model
feature_importances = best_dt.feature_importances_
# Creating a DataFrame for better visualization
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
# Sorting by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)
# Displaying the top 15 features
plt.figure(figsize=(12, 6))
plt.barh(importance_df['Feature'][:15], importance_df['Importance'][:15], color='skyblue')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Top 15 Feature Importances - Decision Tree")
plt.gca().invert_yaxis()
plt.show()
# Displaying the DataFrame
print(importance_df)
                      Feature  Importance
6                   Power_Log    0.697919
3                     Car_Age    0.162960
5                  Engine_Log    0.035850
4       Kilometers_Driven_Log    0.030605
38           Brand_LAND ROVER    0.015809
1                     Mileage    0.012793
11         Location_Hyderabad    0.009526
20        Transmission_Manual    0.009494
2                       Seats    0.006371
41        Brand_MERCEDES-BENZ    0.004749
42                 Brand_MINI    0.003954
9         Location_Coimbatore    0.002448
24                 Brand_AUDI    0.001512
32                Brand_HONDA    0.001210
0                       S.No.    0.001210
33              Brand_HYUNDAI    0.000649
50               Brand_TOYOTA    0.000544
19           Fuel_Type_Petrol    0.000490
10             Location_Delhi    0.000486
14           Location_Kolkata    0.000339
40               Brand_MARUTI    0.000193
49                 Brand_TATA    0.000186
22          Owner_Type_Second    0.000166
26                  Brand_BMW    0.000144
15            Location_Mumbai    0.000089
39             Brand_MAHINDRA    0.000080
7          Location_Bangalore    0.000058
27            Brand_CHEVROLET    0.000040
31                 Brand_FORD    0.000036
51           Brand_VOLKSWAGEN    0.000034
17           Fuel_Type_Diesel    0.000021
8            Location_Chennai    0.000010
16              Location_Pune    0.000010
12            Location_Jaipur    0.000008
13             Location_Kochi    0.000005
46              Brand_RENAULT    0.000000
45              Brand_PORSCHE    0.000000
44               Brand_NISSAN    0.000000
43           Brand_MITSUBISHI    0.000000
48                Brand_SMART    0.000000
47                Brand_SKODA    0.000000
23           Owner_Type_Third    0.000000
25              Brand_BENTLEY    0.000000
37          Brand_LAMBORGHINI    0.000000
36                 Brand_JEEP    0.000000
35               Brand_JAGUAR    0.000000
34                Brand_ISUZU    0.000000
18              Fuel_Type_LPG    0.000000
30                Brand_FORCE    0.000000
29                 Brand_FIAT    0.000000
28               Brand_DATSUN    0.000000
21  Owner_Type_Fourth & Above    0.000000
52                Brand_VOLVO    0.000000
Power_Log (69.8%) - By far the most influential factor in predicting car prices. This aligns well with our expectations: higher-power engines typically correlate with higher car prices.
Car_Age (16.3%) - The second most important feature. Newer cars tend to retain more value, while older ones depreciate.
Engine_Log (3.6%) and Kilometers_Driven_Log (3.1%) - These features still hold some significance, likely because engine size impacts performance and perceived value, while mileage affects depreciation.
Brand_LAND ROVER (1.6%) - This makes sense, as luxury brands like Land Rover command higher resale values.
Mileage, Location_Hyderabad, Transmission_Manual - These features contribute slightly to price prediction but are significantly less impactful than power and age.
Several categorical variables have little to no impact - Some brands and fuel types have near-zero importance, meaning they don't significantly influence the decision tree's splits.
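One caveat: impurity-based importances from trees can be biased toward continuous and high-cardinality features (note S.No., a plain row index, receiving nonzero weight above). Permutation importance on held-out data is a common cross-check; a sketch on synthetic data standing in for the car features:

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# Synthetic stand-in: 6 features, only 2 actually drive the target.
X, y = make_regression(n_samples=500, n_features=6, n_informative=2,
                       random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(n_estimators=50, random_state=42).fit(X_tr, y_tr)

# Permutation importance shuffles one feature at a time on held-out data,
# so it measures what the model actually relies on to generalize.
result = permutation_importance(rf, X_te, y_te, n_repeats=5, random_state=42)
print(result.importances_mean.round(3))
```

Run against the real test split, this would also flag S.No. as pure noise and justify dropping it.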
4) Random Forest
# Initializing and training the random forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Predictions
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)
# Performance Evaluation
rf_train_r2 = r2_score(y_train, y_train_pred_rf)
rf_test_r2 = r2_score(y_test, y_test_pred_rf)
rf_train_rmse = mean_squared_error(y_train, y_train_pred_rf) ** 0.5
rf_test_rmse = mean_squared_error(y_test, y_test_pred_rf) ** 0.5
rf_train_mae = mean_absolute_error(y_train, y_train_pred_rf)
rf_test_mae = mean_absolute_error(y_test, y_test_pred_rf)
rf_train_mape = np.mean(np.abs((y_train - y_train_pred_rf) / y_train)) * 100
rf_test_mape = np.mean(np.abs((y_test - y_test_pred_rf) / y_test)) * 100
# Displaying results
print(f"Random Forest Model Performance:")
print(f"R² Score (Train): {rf_train_r2:.4f}")
print(f"R² Score (Test) : {rf_test_r2:.4f}")
print(f"RMSE (Train) : {rf_train_rmse:.4f}")
print(f"RMSE (Test) : {rf_test_rmse:.4f}")
print(f"MAE (Train) : {rf_train_mae:.4f}")
print(f"MAE (Test) : {rf_test_mae:.4f}")
print(f"MAPE (Train) : {rf_train_mape:.2f}%")
print(f"MAPE (Test) : {rf_test_mape:.2f}%")
Random Forest Model Performance:
R² Score (Train): 0.9851
R² Score (Test) : 0.8839
RMSE (Train) : 1.3759
RMSE (Test) : 3.6912
MAE (Train) : 0.5495
MAE (Test) : 1.4871
MAPE (Train) : 5.95%
MAPE (Test) : 20.21%
- High R² Score on Test Data (0.8839)
This means 88.39% of the variance in car prices is explained by the model, which is a huge improvement over the previous models.
Compared to Decision Tree (81.95% after tuning), this is a major step up.
- RMSE & MAE Scores Are Lower
Test RMSE = 3.6912, lower than Decision Tree (4.6028), meaning better predictions.
Test MAE = 1.4871, the average absolute error is just 1.49 lakh, which is pretty good for car price prediction.
- MAPE (Mean Absolute Percentage Error)
- Test MAPE = 20.21%, meaning on average, predictions are within +/- 20.21% of the actual price.
- A significant improvement over the Decision Tree (24.66%).
- Train vs Test Performance
R² Score (Train) = 0.9851 is an almost perfect fit, which is slightly concerning.
R² Score (Test) = 0.8839 shows good generalization, but some overfitting is present.
MAPE (Train) = 5.95% vs Test = 20.21% confirms the overfitting, which we'll try to control with hyperparameter tuning.
Hyperparameter Tuning: Random Forest¶
# Defining the parameter grid
rf_param_grid = {
"n_estimators": [100, 200], # Reducing tree count options
"max_depth": [10, 15], # Keeping it within a reasonable range
"min_samples_split": [10, 20], # Higher values prevent overfitting
"min_samples_leaf": [5, 10] # Prevents too small leaves
}
# Initializing random forest regressor
rf_model = RandomForestRegressor(random_state=42)
# Using RandomizedSearchCV for faster tuning
rf_random_search = RandomizedSearchCV(
estimator=rf_model,
param_distributions=rf_param_grid,
n_iter=10, # Runs only 10 combinations instead of all
cv=3, # Reduced cross-validation folds
scoring="r2",
verbose=2,
n_jobs=-1 # Uses all available CPU cores
)
# Fitting the model
rf_random_search.fit(X_train, y_train)
# Getting the best parameters
best_rf_params = rf_random_search.best_params_
print("Best Parameters:", best_rf_params)
# Training Random Forest with the best parameters
best_rf = RandomForestRegressor(**best_rf_params, random_state=42)
best_rf.fit(X_train, y_train)
# Making predictions
y_train_pred_best_rf = best_rf.predict(X_train)
y_test_pred_best_rf = best_rf.predict(X_test)
# Evaluating the tuned Random Forest model
best_rf_train_r2 = r2_score(y_train, y_train_pred_best_rf)
best_rf_test_r2 = r2_score(y_test, y_test_pred_best_rf)
best_rf_train_rmse = mean_squared_error(y_train, y_train_pred_best_rf) ** 0.5
best_rf_test_rmse = mean_squared_error(y_test, y_test_pred_best_rf) ** 0.5
best_rf_train_mae = mean_absolute_error(y_train, y_train_pred_best_rf)
best_rf_test_mae = mean_absolute_error(y_test, y_test_pred_best_rf)
best_rf_train_mape = np.mean(np.abs((y_train - y_train_pred_best_rf) / y_train)) * 100
best_rf_test_mape = np.mean(np.abs((y_test - y_test_pred_best_rf) / y_test)) * 100
# Displaying results
print("\n**Tuned Random Forest Model Performance:**")
print(f"R² Score (Train): {best_rf_train_r2:.4f}")
print(f"R² Score (Test) : {best_rf_test_r2:.4f}")
print(f"RMSE (Train) : {best_rf_train_rmse:.4f}")
print(f"RMSE (Test) : {best_rf_test_rmse:.4f}")
print(f"MAE (Train) : {best_rf_train_mae:.4f}")
print(f"MAE (Test) : {best_rf_test_mae:.4f}")
print(f"MAPE (Train) : {best_rf_train_mape:.2f}%")
print(f"MAPE (Test) : {best_rf_test_mape:.2f}%")
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 15}
**Tuned Random Forest Model Performance:**
R² Score (Train): 0.9369
R² Score (Test) : 0.8571
RMSE (Train) : 2.8318
RMSE (Test) : 4.0956
MAE (Train) : 1.1745
MAE (Test) : 1.6715
MAPE (Train) : 12.83%
MAPE (Test) : 22.21%
- Better Generalization:
Test R² Score decreased slightly to 0.8571 (was 0.8839 before tuning).
Train R² Score dropped to 0.9369 (was 0.9851 before tuning), narrowing the train-test gap.
This means less overfitting—the model is more balanced and should perform more consistently on new data.
- Error Metrics (RMSE, MAE, MAPE):
RMSE (Test) rose to 4.0956 (was 3.6912 before tuning).
MAE (Test) is now 1.6715, meaning predictions are off by about 1.67 lakh on average.
MAPE (Test) rose to 22.21% (was 20.21%): still strong, but we sacrificed a little point accuracy to reduce overfitting.
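To make the trade-off easier to see, the test-set figures reported above for the base and tuned models can be laid side by side (numbers copied from the two output cells):

```python
import pandas as pd

# Test-set metrics reported earlier for the base and tuned Random Forest
comparison = pd.DataFrame(
    {
        "Base RF":  [0.8839, 3.6912, 1.4871, 20.21],
        "Tuned RF": [0.8571, 4.0956, 1.6715, 22.21],
    },
    index=["R² (Test)", "RMSE (Test)", "MAE (Test)", "MAPE (Test) %"],
)
print(comparison)
```

Every test error is slightly worse for the tuned model, but the earlier train-side numbers show the train-test gap shrank considerably, which is the point of the tuning.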
Feature Importance¶
# Extracting feature importances from the tuned Random forest model
rf_feature_importances = best_rf.feature_importances_
# Creating a DataFrame for better visualization
rf_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_feature_importances})
# Sorting by importance in descending order
rf_importance_df = rf_importance_df.sort_values(by='Importance', ascending=False)
# Displaying the top 15 features
plt.figure(figsize=(12, 6))
plt.barh(rf_importance_df['Feature'][:15], rf_importance_df['Importance'][:15], color='lightgreen')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Top 15 Feature Importances - Random Forest")
plt.gca().invert_yaxis() # Invert y-axis for better readability
plt.show()
# Displaying the DataFrame
print(rf_importance_df)
                      Feature    Importance
6                   Power_Log  7.141844e-01
3                     Car_Age  1.634773e-01
4       Kilometers_Driven_Log  3.420207e-02
5                  Engine_Log  2.767288e-02
1                     Mileage  1.625237e-02
0                       S.No.  8.523266e-03
20        Transmission_Manual  6.785359e-03
2                       Seats  4.636363e-03
41        Brand_MERCEDES-BENZ  4.201282e-03
38           Brand_LAND ROVER  3.752296e-03
42                 Brand_MINI  2.693046e-03
24                 Brand_AUDI  1.726941e-03
9         Location_Coimbatore  1.332947e-03
26                  Brand_BMW  1.289749e-03
11         Location_Hyderabad  1.231823e-03
50               Brand_TOYOTA  1.189698e-03
32                Brand_HONDA  1.009778e-03
17           Fuel_Type_Diesel  7.312384e-04
15            Location_Mumbai  6.892281e-04
19           Fuel_Type_Petrol  6.132026e-04
36                 Brand_JEEP  4.755899e-04
22          Owner_Type_Second  4.492650e-04
14           Location_Kolkata  4.293241e-04
7          Location_Bangalore  4.158671e-04
39             Brand_MAHINDRA  3.284861e-04
10             Location_Delhi  3.168373e-04
35               Brand_JAGUAR  2.922101e-04
33              Brand_HYUNDAI  2.051417e-04
40               Brand_MARUTI  1.729231e-04
13             Location_Kochi  1.652453e-04
47                Brand_SKODA  1.293168e-04
49                 Brand_TATA  1.259220e-04
31                 Brand_FORD  7.929768e-05
16              Location_Pune  5.555644e-05
27            Brand_CHEVROLET  4.188421e-05
8            Location_Chennai  3.876059e-05
51           Brand_VOLKSWAGEN  2.992249e-05
46              Brand_RENAULT  2.395319e-05
12            Location_Jaipur  1.434947e-05
23           Owner_Type_Third  8.711058e-06
43           Brand_MITSUBISHI  4.234619e-06
44               Brand_NISSAN  1.936499e-06
18              Fuel_Type_LPG  1.418812e-08
48                Brand_SMART  0.000000e+00
45              Brand_PORSCHE  0.000000e+00
28               Brand_DATSUN  0.000000e+00
29                 Brand_FIAT  0.000000e+00
21  Owner_Type_Fourth & Above  0.000000e+00
37          Brand_LAMBORGHINI  0.000000e+00
34                Brand_ISUZU  0.000000e+00
25              Brand_BENTLEY  0.000000e+00
30                Brand_FORCE  0.000000e+00
52                Brand_VOLVO  0.000000e+00
Power_Log is by far the most critical factor in predicting car prices.
Car_Age follows, which makes total sense, as older cars tend to be less expensive.
Kilometers_Driven_Log is also key, indicating that the more a car has been driven, the lower its price tends to be.
Engine_Log and Mileage contribute but with smaller effects.
Some locations and brands hold value, with premium brands, like Mercedes-Benz, Land Rover, Audi, BMW, appearing in the ranking.
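Impurity-based importances like the ones above can be biased toward high-cardinality features, so a useful sanity check is permutation importance. The sketch below demonstrates the technique on a small synthetic regression problem standing in for the car-price data (the dataset and feature counts are illustrative, not our actual `X_train`):

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

# Tiny synthetic problem: 4 features, only 2 of which carry signal
X, y = make_regression(n_samples=200, n_features=4, n_informative=2, random_state=42)
rf = RandomForestRegressor(n_estimators=50, random_state=42).fit(X, y)

# Permutation importance: how much the score drops when each feature is shuffled
result = permutation_importance(rf, X, y, n_repeats=5, random_state=42)
for i in np.argsort(result.importances_mean)[::-1]:
    print(f"feature {i}: {result.importances_mean[i]:.4f}")
```

Running the same check against `best_rf` and our real `X_test` would confirm (or challenge) the Power_Log/Car_Age ranking reported above.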
Conclusions and Recommendations¶
1. Comparison of various techniques and their relative performance based on chosen Metric (Measure of success):
- How do different techniques perform? Which one is performing relatively better? Is there scope to improve the performance further?
Linear models (Linear, Ridge, Lasso) performed consistently but weakly. They struggled to capture complex relationships, yielding high errors and low R² scores.
Decision Tree (Base Model) overfitted massively (R² = 0.9717 on Train, 0.7441 on Test). After hyperparameter tuning, the overfitting reduced, and performance improved to R² = 0.8195.
Random Forest outperformed all models, delivering the best test accuracy with R² = 0.8839 and the lowest RMSE (3.6912) & MAPE (20.21%). The tuned Random Forest gave up a little test accuracy (R² dropped from 0.8839 to 0.8571) in exchange for a much smaller train-test gap, i.e. better generalization.
There is scope for further improvements by:
Using more hyperparameter tuning (grid search with more granular settings)
Using ensemble techniques like stacking or boosting (XGBoost, LightGBM)
More feature engineering (possibly polynomial features)
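As a concrete illustration of the boosting route mentioned above, here is a minimal sketch using scikit-learn's GradientBoostingRegressor (a stand-in for XGBoost/LightGBM, which follow the same fit/predict pattern) on a synthetic dataset; the hyperparameters shown are illustrative defaults, not tuned values:

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Synthetic stand-in for the car-price features/target
X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)

# Boosting fits trees sequentially, each correcting the previous ensemble's errors
gbr = GradientBoostingRegressor(
    n_estimators=200, max_depth=3, learning_rate=0.1, random_state=42
).fit(X_tr, y_tr)

print(f"Test R²: {r2_score(y_te, gbr.predict(X_te)):.4f}")
```

On our real data, this model would slot directly into the same evaluation code used for the Decision Tree and Random Forest above.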
2. Refined insights:
- What are the most meaningful insights relevant to the problem?
Most Important Features Driving Car Prices:
- Power (Horsepower) is the single most influential factor.
- Car Age plays a significant role in depreciation.
- Kilometers Driven & Engine Size also impact price significantly.
- Brand & Location impact price but are secondary factors.
Trends & Market Insights:
Luxury brands (BMW, Mercedes, Land Rover, Audi) retain value better than economy brands.
Automatic transmission vehicles have a slight price premium over manual ones.
Cars in locations like Bangalore, Delhi, and Mumbai tend to have higher prices.
Depreciation is significant after 5-10 years of usage.
Final Recommendations for a Buyer/Seller in the Used Car Market:
If buying a used car: Focus on Power, Age, and Brand to find value-for-money deals.
If selling a used car: Maintain your car in good condition with low mileage, and consider selling before it reaches 10+ years old for maximum resale value.
For dealerships/platforms: Pricing algorithms should heavily weight power and age while considering location trends.
3. Proposal for the final solution design:
- What model do you propose to be adopted? Why is this the best solution to adopt?
Best Model to Adopt: Tuned Random Forest
Best Generalization: While base Random Forest had the best test R² (0.8839), the tuned version still maintained a strong R² (0.8571) while reducing overfitting.
Reliable Error Metrics: Although its test RMSE and MAPE are slightly higher than the untuned model's, the smaller train-test gap makes its reported errors a more trustworthy estimate of real-world performance.
Handles Non-Linear Relationships: Unlike linear models, Random Forest captures complex interactions between variables.
Feature Importance Explainability: We can rank the most important features, making it a transparent, interpretable model.
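If the tuned Random Forest is adopted, it should be serialized once and reused by the pricing service rather than retrained per request. A minimal sketch with the standard-library pickle module, using a small synthetic stand-in for `best_rf` (the filename `tuned_rf.pkl` is illustrative):

```python
import pickle

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# Stand-in model; in the notebook this would be best_rf from the tuning step
X, y = make_regression(n_samples=100, n_features=5, random_state=42)
model = RandomForestRegressor(n_estimators=50, random_state=42).fit(X, y)

# Serialize to disk so a pricing service can load it without retraining
with open("tuned_rf.pkl", "wb") as f:
    pickle.dump(model, f)

with open("tuned_rf.pkl", "rb") as f:
    restored = pickle.load(f)

# The restored model reproduces the original predictions exactly
print((restored.predict(X[:5]) == model.predict(X[:5])).all())
```

In production, joblib.dump/load is the scikit-learn-recommended alternative for large NumPy-backed models, and the pickle must be loaded with the same scikit-learn version it was saved with.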