Welcome to the project on regression. We will use the Boston house price dataset for this project.
Objective¶
The problem at hand is to predict the housing prices of a town or a suburb based on the features of the locality provided to us. In the process, we need to identify the most important features affecting the price of the house. We need to employ techniques of data preprocessing and build a linear regression model that predicts the prices for the unseen data.
Dataset¶
Each record in the database describes a Boston suburb or town. The data was drawn from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970. Detailed attribute information can be found below:
Attribute Information:
- CRIM: Per capita crime rate by town
- ZN: Proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS: Proportion of non-retail business acres per town
- CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: Nitric Oxide concentration (parts per 10 million)
- RM: The average number of rooms per dwelling
- AGE: Proportion of owner-occupied units built before 1940
- DIS: Weighted distances to five Boston employment centers
- RAD: Index of accessibility to radial highways
- TAX: Full-value property-tax rate per 10,000 dollars
- PTRATIO: Pupil-teacher ratio by town
- LSTAT: % lower status of the population
- MEDV: Median value of owner-occupied homes in 1000 dollars
Importing the necessary libraries¶
# Importing libraries for data manipulation
import numpy as np
import pandas as pd
# Importing libraries for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import ProbPlot
import scipy.stats as stats
# Importing libraries for building linear regression model
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
# Importing libraries for model evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Importing library for splitting data
from sklearn.model_selection import train_test_split
# Importing library for data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Importing filter to ignore deprecation warnings
import warnings
warnings.filterwarnings("ignore")
Mounting Drive¶
# Let colab access my google drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Loading the dataset¶
# Using the pd.read_csv() function to load the dataset
df = pd.read_csv("/content/drive/MyDrive/MIT - Applied Data Science/Projects/Elective Project/Boston.csv")
Data Overview¶
- Observations
- Sanity checks
# Looking at the top 5 rows of the dataset to start building some intuition
df.head()
| | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | LSTAT | MEDV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 5.33 | 36.2 |
So it appears that our dataset only has numerical values, with CHAS being the only categorical variable (a 0/1 dummy).
The variable MEDV is our variable of interest, or dependent variable. All the others are independent variables that influence the values of the MEDV target field and will be used by the predictive model to generate predictions.
# Checking the size of the dataset using the .shape method
df.shape
(506, 13)
- Our dataset has 506 rows and 13 columns.
# Using the .info() function to get information about the data types and also to gain some intuition about missing values
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64
 9   TAX      506 non-null    int64
 10  PTRATIO  506 non-null    float64
 11  LSTAT    506 non-null    float64
 12  MEDV     506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 51.5 KB
We can see that the dataset is composed of only numerical datatypes:
10 columns have float64 and 3 columns have int64 datatypes.
We can also see that every column has 506 non-null entries, meaning there should be no missing values in the dataset.
# Just to double-check that in fact there are no missing values, let's use the .isnull().sum() function, that will return us a count of the missing values in our data
df.isnull().sum()
| | 0 |
|---|---|
| CRIM | 0 |
| ZN | 0 |
| INDUS | 0 |
| CHAS | 0 |
| NOX | 0 |
| RM | 0 |
| AGE | 0 |
| DIS | 0 |
| RAD | 0 |
| TAX | 0 |
| PTRATIO | 0 |
| LSTAT | 0 |
| MEDV | 0 |
- As we can see there are 0 missing values in all of our columns.
# Just to be absolutely sure, I'm now going to run the .isna().sum() function (an alias of .isnull()), which will return a count of any fields that have NaN values
df.isna().sum()
| | 0 |
|---|---|
| CRIM | 0 |
| ZN | 0 |
| INDUS | 0 |
| CHAS | 0 |
| NOX | 0 |
| RM | 0 |
| AGE | 0 |
| DIS | 0 |
| RAD | 0 |
| TAX | 0 |
| PTRATIO | 0 |
| LSTAT | 0 |
| MEDV | 0 |
- As we can observe, there are also no NaN values in our data. This will save us a lot of time, since we don't have to do any missing-value treatment on the dataset.
# I'm now running the .duplicated().sum() function, to know if there are any duplicated records on the dataset
df.duplicated().sum()
0
- No duplicate rows in the data, so there is no need to drop any records.
# In order to start building some more intuition on our data, I'm now using the .describe() function, which will return a statistical summary of our columns
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CRIM | 506.0 | 3.613524 | 8.601545 | 0.00632 | 0.082045 | 0.25651 | 3.677083 | 88.9762 |
| ZN | 506.0 | 11.363636 | 23.322453 | 0.00000 | 0.000000 | 0.00000 | 12.500000 | 100.0000 |
| INDUS | 506.0 | 11.136779 | 6.860353 | 0.46000 | 5.190000 | 9.69000 | 18.100000 | 27.7400 |
| CHAS | 506.0 | 0.069170 | 0.253994 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 1.0000 |
| NOX | 506.0 | 0.554695 | 0.115878 | 0.38500 | 0.449000 | 0.53800 | 0.624000 | 0.8710 |
| RM | 506.0 | 6.284634 | 0.702617 | 3.56100 | 5.885500 | 6.20850 | 6.623500 | 8.7800 |
| AGE | 506.0 | 68.574901 | 28.148861 | 2.90000 | 45.025000 | 77.50000 | 94.075000 | 100.0000 |
| DIS | 506.0 | 3.795043 | 2.105710 | 1.12960 | 2.100175 | 3.20745 | 5.188425 | 12.1265 |
| RAD | 506.0 | 9.549407 | 8.707259 | 1.00000 | 4.000000 | 5.00000 | 24.000000 | 24.0000 |
| TAX | 506.0 | 408.237154 | 168.537116 | 187.00000 | 279.000000 | 330.00000 | 666.000000 | 711.0000 |
| PTRATIO | 506.0 | 18.455534 | 2.164946 | 12.60000 | 17.400000 | 19.05000 | 20.200000 | 22.0000 |
| LSTAT | 506.0 | 12.653063 | 7.141062 | 1.73000 | 6.950000 | 11.36000 | 16.955000 | 37.9700 |
| MEDV | 506.0 | 22.532806 | 9.197104 | 5.00000 | 17.025000 | 21.20000 | 25.000000 | 50.0000 |
CRIM (Per capita crime rate) - A mean value of 3.61 but a max value of 88.97 suggests that there are probably some towns with extreme crime rates. These may generate potential outliers in our data.
ZN (Proportion of residential land zoned for lots over 25,000 sq.ft.) - Although the mean is 11.36, the 25th and 50th percentiles are 0. This implies that many suburbs have no large residential lots. This also has the potential to generate outliers in the data.
RAD (Index of accessibility to radial highways) - With a mean of 9.55 and a 75th percentile of 24 (also the maximum), the distribution looks bimodal: towns tend to be either highly accessible or not.
TAX (Full-value property-tax rate) - Here the mean value is 408 while the max value is 711, which indicates high variation of tax rates between towns.
LSTAT (% Lower status of the population) - This feature may correlate strongly with our target variable: the higher the LSTAT value, the lower the house price. I'll analyse this further ahead with a correlation matrix.
RM (Average number of rooms per dwelling) - With a mean value of 6.28 in a range between 3.56 and 8.78, these values tell us that most houses have several rooms. This may also correlate strongly with our target, MEDV.
PTRATIO (Pupil-teacher ratio) - Another variable that may have an important influence on MEDV: a higher PTRATIO means fewer teachers per student, which tends to lower house prices.
NOX (Nitric Oxide concentration) - This variable might also negatively affect housing prices.
DIS (Weighted distances to five Boston employment centers) - A higher value means the suburb is farther from employment centers, which may also have a negative effect on house prices.
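The heavy right tails noted for CRIM and ZN can be quantified with pandas' `.skew()`. A minimal sketch using a few illustrative values (in the notebook this would run on the full `df`):

```python
import pandas as pd

# Illustrative values only; in the notebook this would run on the full df
sample = pd.DataFrame({
    "CRIM": [0.006, 0.027, 0.032, 0.069, 88.976],  # one extreme crime rate
    "ZN":   [18.0, 0.0, 0.0, 0.0, 100.0],
})

# pandas' .skew() returns sample skewness; values well above ~1 flag a heavy right tail
skewness = sample.skew()
print(skewness.round(2))
```

A skewness well above 1, as CRIM shows here, is the numeric counterpart of the "mean far below max" pattern seen in the describe() table.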
Exploratory Data Analysis (EDA)¶
- EDA is an important part of any project involving data.
- It is important to investigate and understand the data better before building a model with it.
- A few questions have been mentioned below which will help you approach the analysis in the right manner and generate insights from the data.
- A thorough analysis of the data, in addition to the questions mentioned below, should be done.
Questions:
- What does the distribution of 'MEDV' look like?
- What can we infer from the correlation heatmap? Is there correlation between the dependent and independent variables?
- What are all the inferences that can be found by doing univariate analysis for different variables?
- Do bivariate analysis to visualize the relationship between the features having significant correlations (>= 0.7 or <= -0.7)
1. What does the distribution of 'MEDV' look like?¶
# So let's start the EDA, plotting a histogram for our target variable, MEDV, to check how its data is distributed
# I'll focus on the distribution type (normal or skewed) and extreme values that may be potential outliers
sns.histplot(df['MEDV'], kde=True, bins=30)
plt.title("Distribution of MEDV (House Prices)")
plt.xlabel("MEDV (in $1000)")
plt.ylabel("Frequency")
plt.show()
We can't say that our target variable is normally distributed, since it has a tail to the right, making it a right-skewed distribution.
I shall perform a log transformation on this feature and then plot it again to confirm that the distribution is now closer to normal.
# Using the np.log() function to transform the variable distribution
df['MEDV_log'] = np.log(df['MEDV'])
sns.histplot(df['MEDV_log'], kde=True, bins=30)
plt.title("Distribution of MEDV_log (House Prices)")
plt.xlabel("MEDV_log (in $1000)")
plt.ylabel("Frequency")
plt.show()
Now our target variable seems to have a more normal distribution, without such a long tail. It's not perfectly normal, but I'll assume it will be sufficient for our analysis.
I'll drop the original MEDV field when preparing the data for modeling, so the two versions of the target don't become redundant and influence our further analysis.
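The effect of the log transform on skewness can also be checked numerically. A small sketch with illustrative right-skewed prices (the notebook would use df['MEDV']):

```python
import numpy as np
import pandas as pd

# Illustrative right-skewed prices; the notebook would use df['MEDV']
prices = pd.Series([10.0, 12.0, 15.0, 18.0, 22.0, 30.0, 50.0])

# np.log compresses the long right tail, pulling skewness toward zero
skew_before = prices.skew()
skew_after = np.log(prices).skew()
print(f"skew before log: {skew_before:.2f}")
print(f"skew after log : {skew_after:.2f}")
```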
2. What can we infer from the correlation heatmap? Is there correlation between the dependent and independent variables?¶
# Now I'm plotting a heatmap to analyse the feature correlations and check relationships between them
# I'll focus on which independent variables have a high correlation with our target variable MEDV
# I'll also check for the presence of multicollinearity among our features
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()
The heatmap shows strong correlations between MEDV and:
- RM (+0.7, positive): More rooms typically mean higher house prices.
- LSTAT (-0.74, negative): Higher percentage of lower-status population leads to lower house prices.
- PTRATIO (-0.51, negative): A higher student-to-teacher ratio negatively impacts house prices.
- NOX (-0.43, negative): Higher pollution levels slightly reduce house prices.
There is also significant multicollinearity between some independent variables:
- INDUS, NOX, and TAX have high positive correlations (above 0.7).
- DIS has a strong negative correlation with NOX (-0.77) and AGE (-0.75), meaning suburbs farther from employment centers tend to be newer and have lower pollution.
- RAD and TAX (correlation: +0.91): This suggests that areas with high accessibility to highways often have higher property taxes.
We'll need to handle multicollinearity before building our regression model, by removing redundant features or using regularization techniques like Ridge/Lasso.
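The Ridge regularization mentioned above shrinks the coefficients of near-collinear features toward each other instead of dropping them. A minimal sketch on synthetic data (the feature names, seed, and alpha value are illustrative, not from the project):

```python
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge

# Two nearly collinear synthetic features standing in for pairs like RAD/TAX
rng = np.random.default_rng(0)
n = 200
x1 = rng.normal(size=n)
x2 = x1 + rng.normal(scale=0.05, size=n)   # almost a copy of x1
X = np.column_stack([x1, x2])
y = 3 * x1 + rng.normal(scale=0.5, size=n)

ols = LinearRegression().fit(X, y)
ridge = Ridge(alpha=10.0).fit(X, y)        # alpha is illustrative

# OLS splits the shared signal between the twins unstably;
# ridge shrinks the split toward an even division
print("OLS coefficients  :", ols.coef_.round(2))
print("Ridge coefficients:", ridge.coef_.round(2))
```

The combined effect (the sum of the two coefficients) stays near 3 under both fits; only the unstable allocation between the collinear twins is regularized away.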
Univariate Analysis¶
3. What are all the inferences that can be found by doing univariate analysis for different variables?¶
# Now I'm going to analyse the variables independently so I'm able to do some inferences
# I'm focusing on checking the distribution in the data as well as trying to detect potential outliers present in each feature
features = df.columns
for feature in features:
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(df[feature], kde=True, bins=30)
    plt.title(f"Distribution of {feature}")
    plt.subplot(1, 2, 2)
    sns.boxplot(y=df[feature])
    plt.title(f"Boxplot of {feature}")
    plt.show()
- CRIM: Shows outliers with very high crime values in some towns.
- ZN: Most values are 0, suggesting many towns don’t have large residential lots.
- TAX: Wide range, indicating large tax differences between towns.
- RM: Mostly between 5-7 rooms, but some larger houses exist.
- LSTAT: Right-skewed, with some areas having very high poverty levels.
I might have to handle outliers for variables like CRIM, ZN, and TAX.
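A common way to flag the outliers mentioned above is Tukey's 1.5×IQR rule, which the boxplots use for their whiskers. A sketch on a few illustrative CRIM-like values (in the notebook this would use df['CRIM']):

```python
import pandas as pd

# Tukey's 1.5*IQR rule on illustrative CRIM-like values (sorted for clarity)
crim = pd.Series([0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0, 9.0, 88.9])

q1, q3 = crim.quantile([0.25, 0.75])
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr   # points beyond this fence are flagged

outliers = crim[crim > upper_fence]
print(f"upper fence: {upper_fence:.2f}, points flagged: {len(outliers)}")
```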
Bivariate Analysis¶
4. Do bivariate analysis to visualize the relationship between the features having significant correlations (>= 0.7 or <= -0.7)¶
# I'm now plotting Scatter plots between the variables that show a strong correlation
# Focusing on positive and negative correlations with values between >= 0.7 or <= -0.7
sns.pairplot(df, vars=['MEDV', 'RM', 'LSTAT', 'NOX', 'DIS', 'INDUS', 'TAX', 'AGE', 'RAD'])
plt.show()
Scatter plots confirm strong relationships:
RM vs. MEDV (Positive correlation, 0.7): More rooms - Higher price.
LSTAT vs. MEDV (Negative correlation, -0.74): Higher lower-status % - Lower price.
NOX vs. DIS (Negative correlation, -0.77): Closer to industrial areas - Higher pollution.
TAX vs. RAD (Strong positive correlation, 0.91): Towns with higher highway accessibility tend to have higher property taxes.
AGE vs. DIS (Negative correlation, -0.75): Older houses are usually closer to city centers.
Data Preprocessing¶
- Missing value treatment
- Log transformation of dependent variable if skewed
- Feature engineering (if needed)
- Outlier detection and treatment (if needed)
- Preparing data for modeling
- Any other preprocessing steps (if needed)
Outlier Treatment¶
As we have observed, some outliers in our data tend to inflate the relationship between variables. I'm now removing these extreme values so they don't distort our data. After removing the outliers, I'll check the correlation value again to see whether it has decreased.
# Importing the required function
from scipy.stats import pearsonr
# Removing the outliers in TAX
df = df[df["TAX"] < 600]
# Checking the new correlation between TAX and RAD
corr_tax_rad = pearsonr(df["TAX"], df["RAD"])[0]
print(f"The correlation between TAX and RAD after outlier removal: {corr_tax_rad:.4f}")
The correlation between TAX and RAD after outlier removal: 0.2498
- We can confirm that the correlation between TAX and RAD has dropped significantly. The extremely high tax rates might be due to some other reason rather than accessibility to highways.
Splitting the Dataset¶
# Defining the independent variables (X) and the target variable (Y)
Y = df['MEDV_log']
X = df.drop(columns = {'MEDV', 'MEDV_log'})
# Add the intercept term
X = sm.add_constant(X)
# Splitting the data into train and test (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30, random_state = 1)
Handling Multicollinearity¶
We've observed that multicollinearity is present in our data. I'll now handle this issue by calculating the Variance Inflation Factor (VIF) and dropping features that show a score higher than 5.
# Defining the function to calculate VIF
def calculate_vif(train):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = train.columns
    vif_data["VIF"] = [variance_inflation_factor(train.values, i) for i in range(len(train.columns))]
    return vif_data
# Displaying VIF scores for training data
print(calculate_vif(X_train))
      Feature         VIF
0       const  708.370336
1        CRIM    2.872608
2          ZN    2.583947
3       INDUS    2.964985
4        CHAS    1.104334
5         NOX    5.631089
6          RM    2.566053
7         AGE    2.718199
8         DIS    3.433860
9         RAD    1.196240
10        TAX    1.661921
11    PTRATIO    1.653781
12      LSTAT    3.052094
- Analysing the VIF scores, we can see that only the NOX feature is higher than 5. Since its value is very close to the threshold, I'm opting to keep it in the dataset, as it can be useful for prediction. (The large VIF on const reflects the intercept term, not a feature, and can be ignored.)
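Had any feature exceeded the threshold decisively, a common recipe is to iteratively drop the highest-VIF feature and recompute until everything is under 5. A sketch on synthetic data (features a, b, c are illustrative):

```python
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Synthetic features: `a` and `b` are strongly collinear, `c` is independent
rng = np.random.default_rng(42)
n = 100
a = rng.normal(size=n)
X = pd.DataFrame({"a": a, "b": a + rng.normal(scale=0.1, size=n), "c": rng.normal(size=n)})

# Drop the worst offender and recompute until every VIF is under the threshold
while True:
    vifs = pd.Series(
        [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
        index=X.columns,
    )
    if vifs.max() <= 5:
        break
    X = X.drop(columns=vifs.idxmax())

print("surviving features:", list(X.columns))
```

Recomputing after every drop matters: removing one member of a collinear pair is usually enough to bring its twin back under the threshold.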
Feature Scaling¶
Since we're building a regression model, I'm considering scaling the features using Z-score normalization, to make the coefficients more interpretable and to ensure that our features are centered with unit variance, which should also bring stability to our model.
# Initializing StandardScaler
scaler = StandardScaler()
# Applying scaling only on the independent variables (X_train and X_test)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Converting back to DataFrame to retain feature names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
# Display a preview of scaled data
print("Scaled Training Data:\n", X_train_scaled.head())
Scaled Training Data:
const CRIM ZN INDUS CHAS NOX RM AGE \
0 0.0 -0.518552 1.079000 -0.836244 -0.30532 -0.713256 0.543155 -1.080120
1 0.0 -0.561877 0.325751 -0.589128 -0.30532 -0.820128 0.525468 -0.984791
2 0.0 -0.537923 -0.615810 -0.841465 -0.30532 -0.208043 0.938161 0.879417
3 0.0 0.012980 -0.615810 -0.355935 -0.30532 -0.062308 1.168089 0.653453
4 0.0 -0.523104 -0.615810 -0.871049 -0.30532 -0.489797 -0.740612 -1.256654
DIS RAD TAX PTRATIO LSTAT
0 0.937553 0.329909 1.237076 -1.162043 -0.987817
1 0.424121 -0.286878 -0.520378 0.552429 -0.891426
2 -0.517479 -1.520452 -0.685608 0.011017 -0.857689
3 -0.615493 2.180271 -0.129832 -0.169454 -0.719530
4 0.335576 -0.286878 1.717747 -0.395042 -0.139580
Now that we've successfully scaled our data we can observe that:
- The mean values are centered - most values are around 0.
- Unit variance is accomplished and the values are now standardized.
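These two properties can be verified numerically. A sketch on synthetic data (in the notebook, the same checks would run on X_train_scaled):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for X_train (arbitrary location and spread)
rng = np.random.default_rng(1)
X = rng.normal(loc=50.0, scale=12.0, size=(200, 3))

scaled = StandardScaler().fit_transform(X)

# After z-scoring, each column has mean ~0 and (population) std ~1
print("column means:", scaled.mean(axis=0).round(6))
print("column stds :", scaled.std(axis=0).round(6))
```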
Model Building - Linear Regression¶
# I'm first training a linear regression model using sklearn.LinearRegression() to make the predictions
# Initializing the Linear Regression model
lin_reg = LinearRegression()
# Training the model on the scaled training data
lin_reg.fit(X_train_scaled, y_train)
# Making predictions on the training and test sets
y_train_pred = lin_reg.predict(X_train_scaled)
y_test_pred = lin_reg.predict(X_test_scaled)
# I'll now train the sm.OLS() model to check the statistical summary of the model
# Ensuring indices are aligned
y_train_aligned = y_train.reset_index(drop=True)
X_train_aligned = X_train_scaled.drop(columns=["const"], errors="ignore").reset_index(drop=True) # Remove 'const' if present
# Fitting the OLS model using the corrected training data
ols_model = sm.OLS(y_train_aligned, sm.add_constant(X_train_aligned)).fit()
# Displaying the full statistical summary
print(ols_model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: MEDV_log R-squared: 0.869
Model: OLS Adj. R-squared: 0.862
Method: Least Squares F-statistic: 134.9
Date: Mon, 17 Feb 2025 Prob (F-statistic): 1.44e-100
Time: 23:09:19 Log-Likelihood: 193.70
No. Observations: 258 AIC: -361.4
Df Residuals: 245 BIC: -315.2
Df Model: 12
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 3.1531 0.007 432.132 0.000 3.139 3.167
CRIM -0.0077 0.012 -0.619 0.536 -0.032 0.017
ZN 0.0115 0.012 0.977 0.330 -0.012 0.035
INDUS 0.0090 0.013 0.713 0.476 -0.016 0.034
CHAS 0.0104 0.008 1.357 0.176 -0.005 0.026
NOX -0.0439 0.017 -2.534 0.012 -0.078 -0.010
RM 0.1869 0.012 15.994 0.000 0.164 0.210
AGE -0.0549 0.012 -4.566 0.000 -0.079 -0.031
DIS -0.0781 0.014 -5.779 0.000 -0.105 -0.052
RAD 0.0178 0.008 2.233 0.026 0.002 0.034
TAX -0.0388 0.009 -4.128 0.000 -0.057 -0.020
PTRATIO -0.0607 0.009 -6.473 0.000 -0.079 -0.042
LSTAT -0.0552 0.013 -4.328 0.000 -0.080 -0.030
==============================================================================
Omnibus: 15.519 Durbin-Watson: 1.965
Prob(Omnibus): 0.000 Jarque-Bera (JB): 18.382
Skew: 0.497 Prob(JB): 0.000102
Kurtosis: 3.849 Cond. No. 6.20
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Key insights from model performance
R² Score is 0.869 - This means our model explains ~86.9% of the variance in house prices.
Adjusted R² is 0.862 - Also high, indicating our features are genuinely adding value to the model.
F-statistic: 134.9, p-value ≈ 0 - The model as a whole is highly significant.
Durbin-Watson = 1.965 → No autocorrelation issues.
Condition Number = 6.20 → Multicollinearity is now well-controlled.
Key insights from p-values
What do the p-values tell us? A p-value < 0.05 means the feature is statistically significant (it contributes meaningfully to the model).
Highly significant features (p < 0.05, strong impact on house prices):
- RM (+0.1869, p = 0.000) - More rooms increase house price significantly.
- AGE (-0.0549, p = 0.000) - Older houses decrease house price significantly.
- DIS (-0.0781, p = 0.000) - Farther distances from job centers lower house prices.
- TAX (-0.0388, p = 0.000) - Higher property taxes reduce house prices.
- PTRATIO (-0.0607, p = 0.000) - Higher student-to-teacher ratio decreases prices.
- LSTAT (-0.0552, p = 0.000) - Higher lower-status population percentage lowers prices.
- NOX (-0.0439, p = 0.012) - Higher pollution negatively affects house prices.
- RAD (+0.0178, p = 0.026) - Surprisingly, accessibility to highways slightly increases prices.
Not significant (p > 0.05, weak or no impact on house prices)
- CRIM (-0.0077, p = 0.536) - Crime rate has no statistically significant effect.
- ZN (+0.0115, p = 0.330) - Zoning has no statistically significant effect.
- INDUS (+0.0090, p = 0.476) - Industrial land zoning has no statistically significant effect.
- CHAS (+0.0104, p = 0.176) - Surprisingly, being near the Charles River has no significant effect.
I'll now try to tune the model by training without the features that have a low impact on the prediction, to see if there are any improvements in our model.
# Defining the features to keep (removing CRIM, ZN, INDUS, CHAS)
X_train_refined = X_train_scaled.drop(columns=["CRIM", "ZN", "INDUS", "CHAS"])
X_test_refined = X_test_scaled.drop(columns=["CRIM", "ZN", "INDUS", "CHAS"])
# Ensuring indices are aligned
y_train_aligned = y_train.reset_index(drop=True)
X_train_refined = X_train_refined.reset_index(drop=True)
# Fitting the refined OLS model
ols_model_refined = sm.OLS(y_train_aligned, sm.add_constant(X_train_refined)).fit()
# Displaying the new OLS summary
print(ols_model_refined.summary())
OLS Regression Results
==============================================================================
Dep. Variable: MEDV_log R-squared: 0.866
Model: OLS Adj. R-squared: 0.862
Method: Least Squares F-statistic: 202.0
Date: Mon, 17 Feb 2025 Prob (F-statistic): 3.00e-104
Time: 20:38:49 Log-Likelihood: 191.68
No. Observations: 258 AIC: -365.4
Df Residuals: 249 BIC: -333.4
Df Model: 8
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 3.1531 0.007 432.235 0.000 3.139 3.167
const -2.58e-17 5.07e-18 -5.088 0.000 -3.58e-17 -1.58e-17
NOX -0.0473 0.014 -3.497 0.001 -0.074 -0.021
RM 0.1888 0.011 16.655 0.000 0.167 0.211
AGE -0.0569 0.012 -4.812 0.000 -0.080 -0.034
DIS -0.0775 0.012 -6.619 0.000 -0.101 -0.054
RAD 0.0157 0.008 2.058 0.041 0.001 0.031
TAX -0.0360 0.008 -4.320 0.000 -0.052 -0.020
PTRATIO -0.0638 0.009 -7.362 0.000 -0.081 -0.047
LSTAT -0.0533 0.013 -4.242 0.000 -0.078 -0.029
==============================================================================
Omnibus: 12.087 Durbin-Watson: 2.003
Prob(Omnibus): 0.002 Jarque-Bera (JB): 13.670
Skew: 0.426 Prob(JB): 0.00108
Kurtosis: 3.738 Cond. No. 2.80e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.05e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
As we can see from the summary statistics, there aren't really any significant improvements in the new model:
R² decreased slightly (original: 0.869, refined: 0.866), while adjusted R² is unchanged at 0.862.
p-values & significance remain almost identical.
The refined model raises multicollinearity warnings; these stem from the all-zero scaled const column being carried alongside the added intercept, visible as the duplicated const row in the summary.
Since the new model doesn't improve performance, we're keeping our first model.
Model Performance Check¶
- How is the model performing? Check using R-squared, RMSE, MAE, MAPE
- Is there multicollinearity? Check using VIF
- How does the model perform after cross-validation?
1. How is the model performing? Check using R-squared, RMSE, MAE, MAPE¶
# Evaluating model performance with R-squared
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print("**Linear Regression Model Performance with R-squared**")
print(f"R² Score (Train): {train_r2:.4f}")
print(f"R² Score (Test) : {test_r2:.4f}")
**Linear Regression Model Performance with R-squared**
R² Score (Train): 0.8686
R² Score (Test) : 0.8567
R² Score (Train = 0.8686, Test = 0.8567)
This means our model explains ~86% of the variance in house prices.
The fact that train and test R² are very close suggests no overfitting.
# Evaluating model performance with MAE
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
print("**Linear Regression Model Performance with MAE**")
print(f"Mean Absolute Error (Train): {train_mae:.4f}")
print(f"Mean Absolute Error (Test) : {test_mae:.4f}")
**Linear Regression Model Performance with MAE**
Mean Absolute Error (Train): 0.0858
Mean Absolute Error (Test) : 0.0788
Mean Absolute Error (Train = 0.0858, Test = 0.0788)
- On average, our predictions are only ~0.08 log-units off from actual values. That's a small error margin, meaning our model is making very accurate predictions.
# Evaluating model performance with RMSE
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("**Linear Regression Model Performance with RMSE**")
print(f"Root Mean Squared Error (Train): {train_rmse:.4f}")
print(f"Root Mean Squared Error (Test) : {test_rmse:.4f}")
**Linear Regression Model Performance with RMSE**
Root Mean Squared Error (Train): 0.1142
Root Mean Squared Error (Test) : 0.1051
Root Mean Squared Error (Train = 0.1142, Test = 0.1051)
Since RMSE penalizes larger errors more than MAE, this confirms that our model isn't making extreme mistakes.
Train and test RMSE are close, again suggesting no overfitting.
# Evaluating model performance with MAPE
# Defining a function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # Convert to percentage
# Calculating MAPE for train and test sets
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
print("**Linear Regression Model Performance with MAPE**")
print(f"Mean Absolute Percentage Error (Train): {train_mape:.2f}%")
print(f"Mean Absolute Percentage Error (Test) : {test_mape:.2f}%")
**Linear Regression Model Performance with MAPE**
Mean Absolute Percentage Error (Train): 2.77%
Mean Absolute Percentage Error (Test) : 2.52%
MAPE Score (Train = 2.77%, Test = 2.52%)
Our model's average prediction error is only ~2.5% to 2.8% (on the log scale).
This shows our model is not only well-fitted but also generalizing well to unseen data.
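One caveat: because the target is log-transformed, these MAPE values are on the log scale. Converting predictions back with np.exp gives errors on the original price scale, which are typically larger. A sketch with hypothetical values (stand-ins for y_test and y_test_pred):

```python
import numpy as np

# Hypothetical log-scale targets and predictions (stand-ins for y_test / y_test_pred)
y_log_true = np.array([3.0, 3.2, 2.9, 3.5])
y_log_pred = np.array([3.05, 3.15, 2.95, 3.40])

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# The same predictions look worse once mapped back to the price scale
log_mape = mape(y_log_true, y_log_pred)
price_mape = mape(np.exp(y_log_true), np.exp(y_log_pred))
print(f"log-scale MAPE  : {log_mape:.2f}%")
print(f"price-scale MAPE: {price_mape:.2f}%")
```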
2. Is there multicollinearity? Check using VIF¶
# Double-checking multicollinearity using VIF
# Computing VIF on the final feature set
final_features = X_train_scaled.columns
vif_data = pd.DataFrame()
vif_data["Feature"] = final_features
vif_data["VIF"] = [variance_inflation_factor(X_train_scaled.values, i) for i in range(len(final_features))]
# Display VIF scores
print("**Final Multicollinearity Check (VIF Scores):**")
print(vif_data)
**Final Multicollinearity Check (VIF Scores):**
Feature VIF
0 const NaN
1 CRIM 2.872608
2 ZN 2.583947
3 INDUS 2.964985
4 CHAS 1.104334
5 NOX 5.631089
6 RM 2.566053
7 AGE 2.718199
8 DIS 3.433860
9 RAD 1.196240
10 TAX 1.661921
11 PTRATIO 1.653781
12 LSTAT 3.052094
All VIF values are below 5, except for NOX (5.63), which is only slightly above the threshold; I've decided not to drop it, to preserve the predictive power of the model. (The NaN for const is expected: after scaling, the constant column is all zeros.)
No extreme multicollinearity detected, meaning our features are independent enough for regression.
The model is stable, and there's no need to remove any features.
3. How does the model perform after cross-validation?¶
# Cross-Validation performance check
# Initializing Linear Regression model
lin_reg = LinearRegression()
# Performing 5-Fold Cross-Validation (scoring based on R²)
cv_scores = cross_val_score(lin_reg, X_train_scaled, y_train, cv=5, scoring='r2')
# Displaying cross-validation results
print("**Cross-Validation Results:**")
print(f"Mean R² Score: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation of R²: {np.std(cv_scores):.4f}")
print(f"All R² Scores: {cv_scores}")
**Cross-Validation Results:**
Mean R² Score: 0.8497
Standard Deviation of R²: 0.0148
All R² Scores: [0.85566448 0.82114747 0.85066249 0.85944711 0.86178792]
Mean R² Score = 0.8497 - This is very close to our test R² (~0.86), suggesting our model is consistent and reliable.
Low Standard Deviation (0.0148) - Minimal variation across splits indicates the model is stable and not sensitive to a particular choice of training data.
All R² scores are in the 0.82 - 0.86 range - No major outliers, meaning the model performs well across all folds of cross-validation.
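cross_val_score can also report error metrics directly; since scikit-learn maximizes scores, error metrics come back negated. A sketch with synthetic data standing in for the scaled training set:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Synthetic regression data standing in for X_train_scaled / y_train
rng = np.random.default_rng(7)
X = rng.normal(size=(150, 4))
y = X @ np.array([0.2, -0.1, 0.05, 0.3]) + rng.normal(scale=0.1, size=150)

# Error-based scorers are negated so that "higher is better" still holds
neg_rmse = cross_val_score(LinearRegression(), X, y, cv=5,
                           scoring="neg_root_mean_squared_error")
print(f"Mean RMSE across folds: {-neg_rmse.mean():.4f}")
```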
Checking Linear Regression Assumptions¶
- In order to make statistical inferences from a linear regression model, it is important to ensure that the assumptions of linear regression are satisfied.
Checking the mean of the residuals¶
# We're aiming for a mean value close to 0
residuals = y_test - y_test_pred
mean_residuals = np.mean(residuals)
print(f"Mean of Residuals: {mean_residuals:.6f}")
Mean of Residuals: -0.005186
A value of -0.005186 confirms that our model doesn't have systematic bias, meaning predictions are evenly distributed around the actual values.
This assumption is satisfied.
Homoscedasticity Check¶
# Plotting the residuals vs. predicted values
plt.figure(figsize=(8,5))
sns.scatterplot(x=y_test_pred, y=residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs. Predicted Values (Homoscedasticity Check)")
plt.show()
Residuals are randomly scattered around the 0 line across all predicted values.
There is no clear pattern or funnel shape, meaning the variance is constant across all levels of prediction.
This confirms that the model satisfies the homoscedasticity assumption.
Linearity of Variables¶
# Plotting a Q-Q plot to verify whether residuals align with a normal distribution
plt.figure(figsize=(8,5))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title("Q-Q Plot of Residuals")
plt.show()
Residuals follow the diagonal line closely, suggesting the errors are approximately normally distributed.
The slight S-shape indicates minor deviations in the tails, which is not uncommon in real-world data.
Strictly speaking, a Q-Q plot assesses the normality of residuals rather than linearity; linearity itself is supported by the absence of curvature in the residuals-vs-predicted plot above. Both assumptions can be considered satisfied.
Normality of Error Terms¶
# Plotting a histogram to check whether the errors follow a normal distribution
plt.figure(figsize=(8,5))
sns.histplot(residuals, bins=30, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Histogram of Residuals (Normality Check)")
plt.show()
The roughly bell-shaped distribution confirms that the residuals are approximately normal.
The slight left skew is not a major issue, especially since the log-transformed target (MEDV_log) had a similar distribution.
This confirms that the model meets the normality assumption; together with the earlier checks, all linear regression assumptions are reasonably satisfied.
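The histogram can likewise be complemented by a formal normality test. A sketch with scipy's Shapiro-Wilk test, where the synthetic sample stands in for the notebook's `residuals`:

```python
# Sketch of a Shapiro-Wilk normality test on a synthetic residual sample.
import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
sample = rng.normal(size=150)  # stand-in for the model residuals
stat, p_value = stats.shapiro(sample)
print(f"Shapiro-Wilk statistic: {stat:.4f}, p-value: {p_value:.4f}")
# A p-value above 0.05 gives no evidence against normality.
```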
Final Model¶
# Extracting coefficients from the fitted OLS model
coef = ols_model.params
# Printing the regression equation term by term
print("log (Price) =", end=" ")
for i in range(len(coef)):
    term = f"({coef.iloc[i]:.4f}) * {coef.index[i]}"  # .iloc avoids deprecated positional [] on a Series
    if i == len(coef) - 1:  # No '+' after the last term
        print(term, end=" ")
    else:
        print(term, "+", end=" ")
log (Price) = (3.1531) * const + (-0.0077) * CRIM + (0.0115) * ZN + (0.0090) * INDUS + (0.0104) * CHAS + (-0.0439) * NOX + (0.1869) * RM + (-0.0549) * AGE + (-0.0781) * DIS + (0.0178) * RAD + (-0.0388) * TAX + (-0.0607) * PTRATIO + (-0.0552) * LSTAT
Model Equation¶
log (Price) = (3.1531) * const + (-0.0077) * CRIM + (0.0115) * ZN + (0.0090) * INDUS + (0.0104) * CHAS + (-0.0439) * NOX + (0.1869) * RM + (-0.0549) * AGE + (-0.0781) * DIS + (0.0178) * RAD + (-0.0388) * TAX + (-0.0607) * PTRATIO + (-0.0552) * LSTAT
RM (Number of Rooms) has the strongest positive effect (+0.1869) - More rooms increase house prices significantly.
NOX (Pollution), AGE (Older houses), DIS (Distance to jobs), TAX, PTRATIO, and LSTAT all have negative impacts - Higher values decrease house prices.
CRIM, ZN, INDUS, and CHAS have weak predictive influence, and their impact is not statistically significant.
Predictions Visualization¶
Next, I'll create a DataFrame with the predicted and actual values so we can compare how well the model performs, and then a scatter plot of actual vs. predicted prices to make the comparison easier to see.
# Creating a DataFrame with actual vs predicted values
predictions_df = pd.DataFrame({
"Actual MEDV_log": y_test,
"Predicted MEDV_log": y_test_pred,
"Actual MEDV": np.exp(y_test), # Convert log values back to original price
"Predicted MEDV": np.exp(y_test_pred), # Convert log predictions back
"Residuals": y_test - y_test_pred # Difference between actual and predicted
})
# Displaying the first few rows of the dataset
print("**Predictions DataFrame:**")
print(predictions_df.head())
# Scatter Plot: Actual vs. Predicted Values
plt.figure(figsize=(8, 6))
sns.scatterplot(x=predictions_df["Actual MEDV"], y=predictions_df["Predicted MEDV"], alpha=0.7)
plt.plot([min(predictions_df["Actual MEDV"]), max(predictions_df["Actual MEDV"])],
[min(predictions_df["Actual MEDV"]), max(predictions_df["Actual MEDV"])],
color='red', linestyle='--') # Perfect prediction line
plt.xlabel("Actual House Prices")
plt.ylabel("Predicted House Prices")
plt.title("Actual vs. Predicted House Prices")
plt.show()
**Predictions DataFrame:**
     Actual MEDV_log  Predicted MEDV_log  Actual MEDV  Predicted MEDV  Residuals
302         3.273364            3.357222         26.4       28.709338  -0.083858
122         3.020425            3.007894         20.5       20.244712   0.012531
228         3.843744            3.749873         46.7       42.515696   0.093871
257         3.912023            3.995833         50.0       54.371121  -0.083810
102         2.923162            3.053223         18.6       21.183506  -0.130061
In the dataframe:
We have both log-transformed and actual house prices (so we can compare results in both scales).
Residuals (errors) are relatively small, meaning our model is predicting well.
Predicted MEDV values are close to Actual MEDV values, confirming our model is performing solidly.
In the scatterplot:
Points align closely with the diagonal, meaning the model is making accurate predictions.
There are no major deviations or unusual patterns, so no systematic errors are apparent.
Performance is strong across all house price ranges.
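Since the model predicts log prices, error metrics can also be reported on the original dollar scale by exponentiating first. A sketch using just the five test rows shown in the predictions DataFrame above (the full test set would give a more reliable figure):

```python
# Sketch: MAE on the original MEDV scale, computed from the five sample rows
# of the predictions DataFrame shown above.
import numpy as np
from sklearn.metrics import mean_absolute_error

y_log_actual = np.array([3.273364, 3.020425, 3.843744, 3.912023, 2.923162])
y_log_pred = np.array([3.357222, 3.007894, 3.749873, 3.995833, 3.053223])
# Converting back from log scale to MEDV (1000s of dollars)
mae_dollars = mean_absolute_error(np.exp(y_log_actual), np.exp(y_log_pred))
print(f"MAE on the original scale: {mae_dollars:.2f} (in $1000s)")
```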
Actionable Insights and Recommendations¶
Key Factors Influencing House Prices
Rooms per dwelling (RM) has the strongest positive effect (+0.1869). More rooms = Higher house prices. Real Estate Developers should focus on properties with spacious layouts for higher value.
Pollution levels (NOX) negatively impact prices (-0.0439). Higher pollution lowers property value. Urban Planners & Environmental Agencies should work on reducing pollution in high-density areas.
Accessibility to highways (RAD) has a slight positive effect (+0.0178). Better transportation access = More valuable properties. Government Policies should invest in infrastructure near suburbs to increase demand.
Higher property tax (TAX) reduces home values (-0.0388). Heavy taxation discourages buyers, leading to lower prices. Local Authorities should evaluate tax policies to maintain a competitive housing market.
Homes farther from employment centers (DIS) are less valuable (-0.0781). Proximity to jobs = Higher demand. Companies & Businesses should consider expanding offices into suburban areas to drive up home prices.
Business Recommendations
For Real Estate Investors & Buyers:
- Target homes with more rooms (RM) & lower pollution (NOX) for long-term value. Invest in properties close to highways & job hubs (RAD, DIS).
For Urban Planners & Government Officials:
- Improve transportation networks (RAD) to increase home values.
- Reduce pollution in dense housing areas (NOX) to make locations more desirable.
For Tax Authorities & Policy Makers:
- High property tax (TAX) reduces market demand - Consider tax adjustments for affordability.