{
"cells": [
{
"cell_type": "markdown",
"id": "c8e6a45e-ec06-46bf-b672-9cc0d8c0f16d",
"metadata": {},
"source": [
"### Toyota Car Prices"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ab356672-aea9-4fc5-86a7-ea2e9088f3f8",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
"from sklearn import metrics\n",
"from itertools import combinations\n",
"import statsmodels.formula.api as sm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "67fe6d71-ba69-4a83-971e-98e1af9e351f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Id \n",
" Model \n",
" Price \n",
" Age_08_04 \n",
" Mfg_Month \n",
" Mfg_Year \n",
" KM \n",
" Fuel_Type \n",
" HP \n",
" Met_Color \n",
" ... \n",
" Powered_Windows \n",
" Power_Steering \n",
" Radio \n",
" Mistlamps \n",
" Sport_Model \n",
" Backseat_Divider \n",
" Metallic_Rim \n",
" Radio_cassette \n",
" Parking_Assistant \n",
" Tow_Bar \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors \n",
" 13500 \n",
" 23 \n",
" 10 \n",
" 2002 \n",
" 46986 \n",
" Diesel \n",
" 90 \n",
" 1 \n",
" ... \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 2 \n",
" TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors \n",
" 13750 \n",
" 23 \n",
" 10 \n",
" 2002 \n",
" 72937 \n",
" Diesel \n",
" 90 \n",
" 1 \n",
" ... \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 3 \n",
" TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors \n",
" 13950 \n",
" 24 \n",
" 9 \n",
" 2002 \n",
" 41711 \n",
" Diesel \n",
" 90 \n",
" 1 \n",
" ... \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 4 \n",
" TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors \n",
" 14950 \n",
" 26 \n",
" 7 \n",
" 2002 \n",
" 48000 \n",
" Diesel \n",
" 90 \n",
" 0 \n",
" ... \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 5 \n",
" TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors \n",
" 13750 \n",
" 30 \n",
" 3 \n",
" 2002 \n",
" 38500 \n",
" Diesel \n",
" 90 \n",
" 0 \n",
" ... \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
5 rows × 39 columns
\n",
"
"
],
"text/plain": [
" Id Model Price Age_08_04 \\\n",
"0 1 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13500 23 \n",
"1 2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750 23 \n",
"2 3 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13950 24 \n",
"3 4 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 14950 26 \n",
"4 5 TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 13750 30 \n",
"\n",
" Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color ... Powered_Windows \\\n",
"0 10 2002 46986 Diesel 90 1 ... 1 \n",
"1 10 2002 72937 Diesel 90 1 ... 0 \n",
"2 9 2002 41711 Diesel 90 1 ... 0 \n",
"3 7 2002 48000 Diesel 90 0 ... 0 \n",
"4 3 2002 38500 Diesel 90 0 ... 1 \n",
"\n",
" Power_Steering Radio Mistlamps Sport_Model Backseat_Divider \\\n",
"0 1 0 0 0 1 \n",
"1 1 0 0 0 1 \n",
"2 1 0 0 0 1 \n",
"3 1 0 0 0 1 \n",
"4 1 0 1 0 1 \n",
"\n",
" Metallic_Rim Radio_cassette Parking_Assistant Tow_Bar \n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"\n",
"[5 rows x 39 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Location of the ToyotaCorolla CSV file.\n",
"# NOTE(review): this is an absolute, machine-specific path; it must be adjusted before running elsewhere.\n",
"toyota_csv_path = r'/Users/patriciaxufre/Documents/SBE - Disciplinas/2957 | ABA/2024-25/Datasets Examples/ToyotaCorolla.csv'\n",
"\n",
"# Read the file into a DataFrame and preview its first rows.\n",
"df = pd.read_csv(toyota_csv_path)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b2a104a-27d8-451f-bfe7-f8c48d2db922",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',\n",
" 'Fuel_Type', 'HP', 'Met_Color', 'Color', 'Automatic', 'CC', 'Doors',\n",
" 'Cylinders', 'Gears', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee',\n",
" 'BOVAG_Guarantee', 'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2',\n",
" 'Airco', 'Automatic_airco', 'Boardcomputer', 'CD_Player',\n",
" 'Central_Lock', 'Powered_Windows', 'Power_Steering', 'Radio',\n",
" 'Mistlamps', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',\n",
" 'Radio_cassette', 'Parking_Assistant', 'Tow_Bar'],\n",
" dtype='object')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the column labels of the loaded DataFrame.\n",
"df.columns"
]
},
{
"cell_type": "markdown",
"id": "72795da5-086a-4986-b5ef-c4e108648605",
"metadata": {},
"source": [
"#### 1st Linear Model"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0aceb23c-2f41-44f4-b8cd-b140007879bd",
"metadata": {},
"outputs": [],
"source": [
"# The first model uses every column except the output variable, the row identifier and the text columns.\n",
"outcome = 'Price'\n",
"exclude = ('Price', 'Id', 'Model', 'Fuel_Type', 'Color')\n",
"\n",
"# Keep the remaining columns in their original order.\n",
"predictors = list(filter(lambda column: column not in exclude, df.columns))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "31cb914e-6a01-42df-a80c-b8e1519763ad",
"metadata": {},
"outputs": [],
"source": [
"# Partition the data: 40% of the rows are held out as the test set (fixed seed for repeatability).\n",
"X, y = df[predictors], df[outcome]\n",
"train_X, test_X, train_y, test_y = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.4,\n",
"    random_state=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "2b8a8dfa-655a-4872-ac98-d1c59a4acb52",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"LinearRegression() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Linear Regression Model (only in the train set)\n",
"reg = LinearRegression().fit(train_X, train_y)\n",
"\n",
"# Echo the fitted estimator as the cell output.\n",
"reg"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "65bceac3-9142-404c-b2b8-335fe934e0b9",
"metadata": {},
"outputs": [],
"source": [
"# Residuals (actual minus predicted price) of the first model, labelled by partition.\n",
"order = ['training', 'test']\n",
"\n",
"pred_error_train = (train_y - reg.predict(train_X)).to_frame('residual')\n",
"pred_error_train['data set'] = 'training'\n",
"\n",
"pred_error_test = (test_y - reg.predict(test_X)).to_frame('residual')\n",
"pred_error_test['data set'] = 'test'\n",
"\n",
"# Stack both partitions and turn the label into an ordered categorical (training first).\n",
"boxdata_df = pd.concat([pred_error_train, pred_error_test], ignore_index=True)\n",
"boxdata_df['data set'] = pd.Categorical(boxdata_df['data set'], categories=order, ordered=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2c3b5589-9e9f-4c70-ad22-5900d46e93e3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Residual distributions of the first model: one histogram per partition plus a boxplot grouped by partition.\n",
"fig, axes = plt.subplots(nrows=1, ncols=3)\n",
"fig.set_size_inches(9, 4)\n",
"common = {'bins': 100, 'range': [-6500, 6500]}\n",
"\n",
"panels = [(pred_error_train, \"Training\"), (pred_error_test, \"Test\")]\n",
"for ax, (errors, panel_title) in zip(axes, panels):\n",
"    errors.hist(ax=ax, **common)\n",
"    ax.set_title(panel_title)\n",
"    ax.set_ylabel(\"\")\n",
"\n",
"boxdata_df.boxplot(ax=axes[2], by='data set')\n",
"axes[2].set_title(\"\")\n",
"\n",
"# Blank the figure-level title, then lay out and render.\n",
"fig.suptitle(\"\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d51a14fc-7991-4734-8a41-1c55b52bb52b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Performance Measures - Training Set\n",
"Mean Absolute Error: 811.68\n",
"Mean Absolute Percentage Error: 8.01%\n",
"Root Mean Squared Error: 1121.06\n",
"\n",
"Performance Measures - Test Set\n",
"Mean Absolute Error: 880.14\n",
"Mean Absolute Percentage Error: 8.87%\n",
"Root Mean Squared Error: 1382.04\n"
]
}
],
"source": [
"# Evaluating Performance\n",
"# MAE, MAPE and RMSE of the first model, reported for the training set and then the test set.\n",
"for header, part_X, part_y in (\n",
"    ('Performance Measures - Training Set', train_X, train_y),\n",
"    ('\\nPerformance Measures - Test Set', test_X, test_y),\n",
"):\n",
"    part_pred = reg.predict(part_X)  # predict once per partition and reuse for all three metrics\n",
"    mae = metrics.mean_absolute_error(part_y, part_pred)\n",
"    mape = metrics.mean_absolute_percentage_error(part_y, part_pred)\n",
"    mse = metrics.mean_squared_error(part_y, part_pred)\n",
"    rmse = np.sqrt(mse)\n",
"    print(header)\n",
"    print(f'Mean Absolute Error: {mae:.2f}')\n",
"    print(f'Mean Absolute Percentage Error: {mape*100:.2f}%')\n",
"    print(f'Root Mean Squared Error: {rmse:.2f}')"
]
},
{
"cell_type": "markdown",
"id": "6b6aa2ce-7240-4484-bed9-2e6d60c014ad",
"metadata": {},
"source": [
"#### 2nd Linear Model"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "183b91af-e593-440e-94b4-c049858a24d2",
"metadata": {},
"outputs": [],
"source": [
"# Second model: a smaller, hand-picked predictor set that includes Fuel_Type.\n",
"outcome = 'Price'\n",
"predictors = [\n",
"    'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color',\n",
"    'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight',\n",
"]\n",
"\n",
"# Dummy-encode the predictors (dropping the first level), then re-partition with a 40% test set and a new seed.\n",
"y = df[outcome]\n",
"X = pd.get_dummies(df[predictors], drop_first=True)\n",
"train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=123)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "5c4108c8-a202-4227-9983-fc762b160b04",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Predictor coefficient\n",
"0 Age_08_04 -113.244942\n",
"1 KM -0.015739\n",
"2 HP 2.325408\n",
"3 Met_Color -72.350023\n",
"4 Automatic -149.178152\n",
"5 CC -0.038289\n",
"6 Doors -251.428537\n",
"7 Quarterly_Tax 14.975890\n",
"8 Weight 32.255569\n",
"9 Fuel_Type_Diesel -1307.813608\n",
"10 Fuel_Type_Petrol 2785.721080\n"
]
}
],
"source": [
"# Fit the second linear model on the training partition.\n",
"lm = LinearRegression().fit(train_X, train_y)\n",
"\n",
"# coefficients: one row per design-matrix column with its fitted coefficient\n",
"coef_table = pd.DataFrame({'Predictor': X.columns, 'coefficient': lm.coef_})\n",
"print(coef_table)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f2bbbfc3-e42c-40f0-af7a-55694bcedef3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Performance Measures - Training Set\n",
"Mean Absolute Error: 922.12\n",
"Mean Absolute Percentage Error: 9.31%\n",
"Root Mean Squared Error: 1190.13\n",
"\n",
"Performance Measures - Test Set\n",
"Mean Absolute Error: 1018.69\n",
"Mean Absolute Percentage Error: 9.86%\n",
"Root Mean Squared Error: 1605.91\n"
]
}
],
"source": [
"# Evaluating Performance\n",
"# MAE, MAPE and RMSE of the second model, reported for the training set and then the test set.\n",
"partitions = (\n",
"    ('Performance Measures - Training Set', train_X, train_y),\n",
"    ('\\nPerformance Measures - Test Set', test_X, test_y),\n",
")\n",
"for header, part_X, part_y in partitions:\n",
"    part_pred = lm.predict(part_X)  # predict once per partition and reuse for all three metrics\n",
"    mae = metrics.mean_absolute_error(part_y, part_pred)\n",
"    mape = metrics.mean_absolute_percentage_error(part_y, part_pred)\n",
"    mse = metrics.mean_squared_error(part_y, part_pred)\n",
"    rmse = np.sqrt(mse)\n",
"    print(header)\n",
"    print(f'Mean Absolute Error: {mae:.2f}')\n",
"    print(f'Mean Absolute Percentage Error: {mape*100:.2f}%')\n",
"    print(f'Root Mean Squared Error: {rmse:.2f}')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "24fa6746-d6f2-4d9f-a56d-bd8f203349a1",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Residuals\n",
"# Actual minus predicted price of the second model, labelled by partition.\n",
"pred_error_train = (train_y - lm.predict(train_X)).to_frame('residual').assign(**{'data set': 'training'})\n",
"pred_error_test = (test_y - lm.predict(test_X)).to_frame('residual').assign(**{'data set': 'test'})\n",
"\n",
"# One histogram per partition, sharing the same bins and range.\n",
"fig, axes = plt.subplots(nrows=1, ncols=2)\n",
"fig.set_size_inches(9, 4)\n",
"common = {'bins': 100, 'range': [-6500, 6500]}\n",
"\n",
"panels = [(pred_error_train, \"Training\"), (pred_error_test, \"Test\")]\n",
"for ax, (errors, panel_title) in zip(axes, panels):\n",
"    errors.hist(ax=ax, **common)\n",
"    ax.set_title(panel_title)\n",
"    ax.set_ylabel(\"\")\n",
"\n",
"fig.suptitle(\"\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "fb27e5a6-9f98-414a-a11f-23a7d25a3268",
"metadata": {},
"outputs": [],
"source": [
"## Performance Measures\n",
"# Function to compute Adjusted R²\n",
"def adjusted_r2_score(y_true, y_pred, model):\n",
"    \"\"\"Return R² penalised for the number of fitted coefficients.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    y_true : observed target values.\n",
"    y_pred : predictions for the same observations.\n",
"    model : fitted estimator exposing ``coef_``; ``len(model.coef_)`` is taken\n",
"        as the number of predictors p.\n",
"\n",
"    Returns\n",
"    -------\n",
"    1 - (1 - R²) * (n - 1) / (n - p - 1), with n = ``len(y_true)``.\n",
"    \"\"\"\n",
"    n_obs = len(y_true)\n",
"    n_predictors = len(model.coef_)\n",
"    unexplained = 1 - metrics.r2_score(y_true, y_pred)\n",
"    return 1 - (unexplained * (n_obs - 1) / (n_obs - n_predictors - 1))\n",
" \n",
"# Function to compute AIC (Akaike Information Criterion)\n",
"def AIC_score(y_true, y_pred, model):\n",
"    \"\"\"Return n * log(RSS / n) + 2 * (p + 1) for a fitted model.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    y_true : observed target values.\n",
"    y_pred : predictions for the same observations.\n",
"    model : fitted estimator exposing ``coef_``; ``len(model.coef_)`` is p.\n",
"\n",
"    RSS is the sum of squared differences between ``y_true`` and ``y_pred`` and\n",
"    n is ``len(y_true)``.\n",
"    \"\"\"\n",
"    residuals = y_true - y_pred\n",
"    n_obs = len(y_true)\n",
"    penalty = 2 * (len(model.coef_) + 1)\n",
"    sse = np.sum(residuals ** 2)  # residual sum of squares\n",
"    return n_obs * np.log(sse / n_obs) + penalty"
]
},
{
"cell_type": "markdown",
"id": "79418881-c7f0-4839-ac40-33a59f483906",
"metadata": {},
"source": [
"#### Exhaustive Feature Selection"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5469cf22-f89e-4319-ab7d-1e188508a088",
"metadata": {},
"outputs": [],
"source": [
"# Exhaustive Search Function\n",
"def exhaustive_search(variables, train_fn, score_fn):\n",
"    \"\"\"Train and score a model for every non-empty subset of ``variables``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    variables : sequence of candidate variable names.\n",
"    train_fn : callable taking a tuple of variables and returning a model.\n",
"    score_fn : callable taking ``(model, subset)`` and returning a sortable score.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of dict\n",
"        One entry per subset with keys ``'n'``, ``'score'``, ``'model'`` and\n",
"        ``'variables'``, ordered by descending score.\n",
"        NOTE(review): with a scorer that returns the negated adjusted R², this\n",
"        order puts the lowest-adjusted-R² subsets first - confirm that is intended.\n",
"    \"\"\"\n",
"    candidates = [\n",
"        subset\n",
"        for size in range(1, len(variables) + 1)\n",
"        for subset in combinations(variables, size)\n",
"    ]\n",
"    evaluated = []\n",
"    for subset in candidates:\n",
"        fitted = train_fn(subset)\n",
"        evaluated.append({\n",
"            'n': len(subset),\n",
"            'score': score_fn(fitted, subset),\n",
"            'model': fitted,\n",
"            'variables': subset,\n",
"        })\n",
"    # list.sort is stable, so equal scores keep their generation order\n",
"    evaluated.sort(key=lambda entry: entry['score'], reverse=True)\n",
"    return evaluated\n",
" \n",
"# Function to train a model with selected variables\n",
"def train_model(variables):\n",
"    \"\"\"Fit a LinearRegression on the ``variables`` columns of the global ``train_X`` against ``train_y``.\"\"\"\n",
"    columns = list(variables)\n",
"    return LinearRegression().fit(train_X[columns], train_y)\n",
"\n",
"# Function to score the model using Adjusted R²\n",
"def score_model(model, variables):\n",
"    \"\"\"Return the negated adjusted R² of ``model`` on the global training partition.\n",
"\n",
"    Predictions are made on the ``variables`` columns of ``train_X`` and compared with ``train_y``.\n",
"    \"\"\"\n",
"    columns = list(variables)\n",
"    fitted_values = model.predict(train_X[columns])\n",
"    adj_r2 = adjusted_r2_score(train_y, fitted_values, model)\n",
"    return -adj_r2"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "de9b3396-57c4-445b-8583-ac32bcdadb04",
"metadata": {},
"outputs": [],
"source": [
"# Define predictors\n",
"allVariables = train_X.columns\n",
"\n",
"# Perform exhaustive feature search\n",
"results = exhaustive_search(allVariables, train_model, score_model)\n",
"\n",
"# Store results in a DataFrame\n",
"# One record per subset: its size, adjusted R² (the score with the sign flipped back), AIC on the\n",
"# training partition, and one membership flag per candidate variable.\n",
"data = [\n",
"    {\n",
"        'n': result['n'],\n",
"        'r2adj': -result['score'],\n",
"        'AIC': AIC_score(\n",
"            train_y,\n",
"            result['model'].predict(train_X[list(result['variables'])]),\n",
"            result['model'],\n",
"        ),\n",
"        **{var: var in result['variables'] for var in allVariables},\n",
"    }\n",
"    for result in results\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7db67835-428d-49e7-9fa6-914d8f7b662a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" n \n",
" r2adj \n",
" AIC \n",
" Age_08_04 \n",
" Automatic \n",
" CC \n",
" Doors \n",
" Fuel_Type_Diesel \n",
" Fuel_Type_Petrol \n",
" HP \n",
" KM \n",
" Met_Color \n",
" Quarterly_Tax \n",
" Weight \n",
" \n",
" \n",
" \n",
" \n",
" 1024 \n",
" 1 \n",
" 0.765863 \n",
" 12807.545515 \n",
" True \n",
" False \n",
" False \n",
" False \n",
" False \n",
" False \n",
" False \n",
" False \n",
" False \n",
" False \n",
" False \n",
" \n",
" \n",
" 1208 \n",
" 2 \n",
" 0.800835 \n",
" 12669.255092 \n",
" True \n",
" False \n",
" False \n",
" False \n",
" False \n",
" False \n",
" True \n",
" False \n",
" False \n",
" False \n",
" False \n",
" \n",
" \n",
" 1671 \n",
" 3 \n",
" 0.855124 \n",
" 12396.234593 \n",
" True \n",
" False \n",
" False \n",
" False \n",
" False \n",
" True \n",
" False \n",
" False \n",
" False \n",
" False \n",
" True \n",
" \n",
" \n",
" 1878 \n",
" 4 \n",
" 0.871117 \n",
" 12296.514559 \n",
" True \n",
" False \n",
" False \n",
" False \n",
" False \n",
" True \n",
" False \n",
" True \n",
" False \n",
" False \n",
" True \n",
" \n",
" \n",
" 1990 \n",
" 5 \n",
" 0.878224 \n",
" 12248.669683 \n",
" True \n",
" False \n",
" False \n",
" False \n",
" False \n",
" True \n",
" False \n",
" True \n",
" False \n",
" True \n",
" True \n",
" \n",
" \n",
" 2022 \n",
" 6 \n",
" 0.881719 \n",
" 12224.591431 \n",
" True \n",
" False \n",
" False \n",
" True \n",
" False \n",
" True \n",
" False \n",
" True \n",
" False \n",
" True \n",
" True \n",
" \n",
" \n",
" 2046 \n",
" 7 \n",
" 0.883439 \n",
" 12212.969194 \n",
" True \n",
" False \n",
" False \n",
" True \n",
" True \n",
" True \n",
" False \n",
" True \n",
" False \n",
" True \n",
" True \n",
" \n",
" \n",
" 2045 \n",
" 8 \n",
" 0.883431 \n",
" 12214.014649 \n",
" True \n",
" True \n",
" False \n",
" True \n",
" True \n",
" True \n",
" False \n",
" True \n",
" False \n",
" True \n",
" True \n",
" \n",
" \n",
" 2044 \n",
" 9 \n",
" 0.883388 \n",
" 12215.322797 \n",
" True \n",
" True \n",
" False \n",
" True \n",
" True \n",
" True \n",
" False \n",
" True \n",
" True \n",
" True \n",
" True \n",
" \n",
" \n",
" 2037 \n",
" 10 \n",
" 0.883281 \n",
" 12217.098365 \n",
" True \n",
" True \n",
" False \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" \n",
" \n",
" 2031 \n",
" 11 \n",
" 0.883173 \n",
" 12218.884757 \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" True \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" n r2adj AIC Age_08_04 Automatic CC Doors \\\n",
"1024 1 0.765863 12807.545515 True False False False \n",
"1208 2 0.800835 12669.255092 True False False False \n",
"1671 3 0.855124 12396.234593 True False False False \n",
"1878 4 0.871117 12296.514559 True False False False \n",
"1990 5 0.878224 12248.669683 True False False False \n",
"2022 6 0.881719 12224.591431 True False False True \n",
"2046 7 0.883439 12212.969194 True False False True \n",
"2045 8 0.883431 12214.014649 True True False True \n",
"2044 9 0.883388 12215.322797 True True False True \n",
"2037 10 0.883281 12217.098365 True True False True \n",
"2031 11 0.883173 12218.884757 True True True True \n",
"\n",
" Fuel_Type_Diesel Fuel_Type_Petrol HP KM Met_Color \\\n",
"1024 False False False False False \n",
"1208 False False True False False \n",
"1671 False True False False False \n",
"1878 False True False True False \n",
"1990 False True False True False \n",
"2022 False True False True False \n",
"2046 True True False True False \n",
"2045 True True False True False \n",
"2044 True True False True True \n",
"2037 True True True True True \n",
"2031 True True True True True \n",
"\n",
" Quarterly_Tax Weight \n",
"1024 False False \n",
"1208 False False \n",
"1671 False True \n",
"1878 False True \n",
"1990 True True \n",
"2022 True True \n",
"2046 True True \n",
"2045 True True \n",
"2044 True True \n",
"2037 True True \n",
"2031 True True "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create DataFrame with results\n",
"column_order = ['n', 'r2adj', 'AIC', *sorted(allVariables)]\n",
"results_df = pd.DataFrame(data, columns=column_order)\n",
"\n",
"# Get the best model for each subset size: the row label with the highest adjusted R² per value of n\n",
"best_row_labels = results_df.groupby('n')['r2adj'].idxmax()\n",
"best_models = results_df.loc[best_row_labels]\n",
"\n",
"# Sort by number of features\n",
"best_models.sort_values(by='n')"
]
},
{
"cell_type": "markdown",
"id": "bb55159c-0fa1-48d5-887c-f717e75fc17f",
"metadata": {},
"source": [
"#### Iterative Feature Selection"
]
},
{
"cell_type": "markdown",
"id": "fd11aa92-b061-4c6b-84de-2d19c11ea6db",
"metadata": {},
"source": [
"#### Forward Selection"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "36a898ec-59fa-4ad2-b0d9-5bf1032d2f61",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Added Feature: Age_08_04, Adjusted R²: 0.7659\n",
"Added Feature: HP, Adjusted R²: 0.8008\n",
"Added Feature: Weight, Adjusted R²: 0.8327\n",
"Added Feature: KM, Adjusted R²: 0.8641\n",
"Added Feature: Fuel_Type_Petrol, Adjusted R²: 0.8724\n",
"Added Feature: Quarterly_Tax, Adjusted R²: 0.8794\n",
"Added Feature: Doors, Adjusted R²: 0.8823\n",
"Added Feature: Fuel_Type_Diesel, Adjusted R²: 0.8834\n",
"\n",
"Selected Features: ['Age_08_04', 'HP', 'Weight', 'KM', 'Fuel_Type_Petrol', 'Quarterly_Tax', 'Doors', 'Fuel_Type_Diesel']\n",
"\n",
"Final Model Performance:\n",
"Adjusted R² (Training Set): 0.8834\n",
"Adjusted R² (Test Set): 0.8217\n"
]
}
],
"source": [
"def forward_feature_selection(X, y):\n",
"    \"\"\"Greedy forward selection of predictors by adjusted R².\n",
"\n",
"    Starting from no predictors, each round fits one LinearRegression per\n",
"    remaining column (added to the current selection), scores it on ``X``/``y``\n",
"    themselves with ``adjusted_r2_score`` and keeps the best candidate. The\n",
"    search stops when no candidate strictly raises the adjusted R² or when no\n",
"    columns remain. Every accepted feature is printed.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : DataFrame of candidate predictors.\n",
"    y : target values aligned with ``X``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        Selected column names in the order they were added. An empty list (not\n",
"        ``None``) is returned when ``X`` has no columns.\n",
"    \"\"\"\n",
"    selected_features = []\n",
"    remaining_features = list(X.columns)\n",
"    best_score = -np.inf  # Start with a very low Adjusted R²\n",
"\n",
"    while remaining_features:\n",
"        # Score every one-feature extension of the current selection.\n",
"        scores = []\n",
"        for feature in remaining_features:\n",
"            temp_features = selected_features + [feature]\n",
"            model = LinearRegression().fit(X[temp_features], y)\n",
"            adj_r2 = adjusted_r2_score(y, model.predict(X[temp_features]), model)\n",
"            scores.append((feature, adj_r2))\n",
"\n",
"        # Candidate with the highest adjusted R²; max() keeps the first of any ties.\n",
"        best_new_feature, best_new_score = max(scores, key=lambda item: item[1])\n",
"\n",
"        # Stop as soon as the best candidate does not strictly improve adjusted R².\n",
"        if best_new_score <= best_score:\n",
"            break\n",
"\n",
"        selected_features.append(best_new_feature)\n",
"        remaining_features.remove(best_new_feature)\n",
"        best_score = best_new_score\n",
"\n",
"        print(f\"Added Feature: {best_new_feature}, Adjusted R²: {best_score:.4f}\")\n",
"\n",
"    return selected_features\n",
"\n",
"# Run forward feature selection\n",
"best_selected_features = forward_feature_selection(train_X, train_y)\n",
"print(\"\\nSelected Features:\", best_selected_features)\n",
"\n",
"# Refit on the training set using only the selected columns\n",
"final_model = LinearRegression().fit(train_X[best_selected_features], train_y)\n",
"\n",
"# Predictions for the training and test partitions\n",
"train_pred, test_pred = (\n",
"    final_model.predict(part[best_selected_features]) for part in (train_X, test_X)\n",
")\n",
"\n",
"# Adjusted R² on each partition\n",
"train_adj_r2 = adjusted_r2_score(train_y, train_pred, final_model)\n",
"test_adj_r2 = adjusted_r2_score(test_y, test_pred, final_model)\n",
"\n",
"print(\"\\nFinal Model Performance:\")\n",
"print(f\"Adjusted R² (Training Set): {train_adj_r2:.4f}\")\n",
"print(f\"Adjusted R² (Test Set): {test_adj_r2:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b8eb5230-2a5e-49da-b066-4a0d186b9cb3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Performance Measures - Test Set\n",
"Mean Absolute Error: 1016.99\n",
"Mean Absolute Percentage Error: 9.83%\n",
"Root Mean Squared Error: 1592.14\n"
]
}
],
"source": [
"# Evaluating Performance\n",
"# Test Set: error measures for the predictions stored in test_pred\n",
"mse = metrics.mean_squared_error(test_y, test_pred)\n",
"rmse = np.sqrt(mse)\n",
"mae = metrics.mean_absolute_error(test_y, test_pred)\n",
"mape = metrics.mean_absolute_percentage_error(test_y, test_pred)\n",
"\n",
"report_lines = [\n",
"    '\\nPerformance Measures - Test Set',\n",
"    f'Mean Absolute Error: {mae:.2f}',\n",
"    f'Mean Absolute Percentage Error: {mape*100:.2f}%',\n",
"    f'Root Mean Squared Error: {rmse:.2f}',\n",
"]\n",
"print('\\n'.join(report_lines))"
]
},
{
"cell_type": "markdown",
"id": "71be37ef-5039-4c56-a163-c9ec792b56e8",
"metadata": {},
"source": [
"#### Backward Selection"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "8c9ae645-cb54-4014-9a3a-48d4318ed91a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Removed Feature: CC, Adjusted R²: 0.8833\n",
"Removed Feature: HP, Adjusted R²: 0.8834\n",
"Removed Feature: Met_Color, Adjusted R²: 0.8834\n",
"Removed Feature: Automatic, Adjusted R²: 0.8834\n",
"\n",
"Selected Features: ['Age_08_04', 'KM', 'Doors', 'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol']\n",
"\n",
"Final Model Performance:\n",
"Adjusted R² (Training Set): 0.8834\n",
"Adjusted R² (Test Set): 0.8188\n"
]
}
],
"source": [
"def backward_feature_selection(X, y):\n",
"    \"\"\"Greedy backward elimination of predictors by adjusted R².\n",
"\n",
"    Starts from all columns of ``X`` and repeatedly drops the single column\n",
"    whose removal gives the highest adjusted R² (scored on ``X``/``y``\n",
"    themselves with ``adjusted_r2_score``), as long as that value is strictly\n",
"    higher than the current one and more than one column remains. Every\n",
"    removed feature is printed.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : DataFrame of candidate predictors.\n",
"    y : target values aligned with ``X``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        Names of the retained columns, in their original column order.\n",
"    \"\"\"\n",
"    selected_features = list(X.columns)  # Start with all features\n",
"\n",
"    # Initial Adjusted R²: fit the full model once and reuse it for prediction and scoring.\n",
"    full_model = LinearRegression().fit(X, y)\n",
"    best_score = adjusted_r2_score(y, full_model.predict(X), full_model)\n",
"\n",
"    while len(selected_features) > 1:\n",
"        # Score the model obtained by leaving out each remaining feature in turn.\n",
"        scores = []\n",
"        for feature in selected_features:\n",
"            temp_features = [f for f in selected_features if f != feature]\n",
"            model = LinearRegression().fit(X[temp_features], y)\n",
"            adj_r2 = adjusted_r2_score(y, model.predict(X[temp_features]), model)\n",
"            scores.append((feature, adj_r2))\n",
"\n",
"        # The feature to drop is the one whose removal leaves the highest adjusted R²;\n",
"        # max() keeps the first of any ties.\n",
"        worst_feature, best_new_score = max(scores, key=lambda item: item[1])\n",
"\n",
"        # Stop if Adjusted R² does not improve\n",
"        if best_new_score <= best_score:\n",
"            break\n",
"\n",
"        selected_features.remove(worst_feature)\n",
"        best_score = best_new_score\n",
"        print(f\"Removed Feature: {worst_feature}, Adjusted R²: {best_score:.4f}\")\n",
"\n",
"    return selected_features\n",
"\n",
"# Run backward feature selection\n",
"best_selected_features = backward_feature_selection(train_X, train_y)\n",
"print(\"\\nSelected Features:\", best_selected_features)\n",
"\n",
"# Refit on the training set using only the retained columns\n",
"train_selected = train_X[best_selected_features]\n",
"test_selected = test_X[best_selected_features]\n",
"final_model = LinearRegression().fit(train_selected, train_y)\n",
"\n",
"# Predictions for the training and test partitions\n",
"train_pred = final_model.predict(train_selected)\n",
"test_pred = final_model.predict(test_selected)\n",
"\n",
"# Adjusted R² on each partition\n",
"train_adj_r2 = adjusted_r2_score(train_y, train_pred, final_model)\n",
"test_adj_r2 = adjusted_r2_score(test_y, test_pred, final_model)\n",
"\n",
"print(\"\\nFinal Model Performance:\")\n",
"print(f\"Adjusted R² (Training Set): {train_adj_r2:.4f}\")\n",
"print(f\"Adjusted R² (Test Set): {test_adj_r2:.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "fe089eca-55c5-42d7-9ade-f4eddac792eb",
"metadata": {},
"source": [
"#### Stepwise Regression Function (Forward + Backward Selection)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "e616bae3-0ff3-4a1c-9a2e-001c14724526",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Added Feature: Age_08_04, Adjusted R²: 0.7659\n",
"Added Feature: HP, Adjusted R²: 0.8008\n",
"Added Feature: Weight, Adjusted R²: 0.8327\n",
"Added Feature: KM, Adjusted R²: 0.8641\n",
"Added Feature: Fuel_Type_Petrol, Adjusted R²: 0.8724\n",
"Added Feature: Quarterly_Tax, Adjusted R²: 0.8794\n",
"Added Feature: Doors, Adjusted R²: 0.8823\n",
"Added Feature: Fuel_Type_Diesel, Adjusted R²: 0.8834\n",
"\n",
"Selected Features: ['Age_08_04', 'HP', 'Weight', 'KM', 'Fuel_Type_Petrol', 'Quarterly_Tax', 'Doors', 'Fuel_Type_Diesel']\n",
"\n",
"Final Model Performance:\n",
"Adjusted R² (Training Set): 0.8834\n",
"Adjusted R² (Test Set): 0.8217\n"
]
}
],
"source": [
"def stepwise_regression(X, y):\n",
"    \"\"\"Stepwise (forward + backward) selection of predictors by adjusted R².\n",
"\n",
"    Each pass first tries to add the remaining column that gives the highest\n",
"    adjusted R², then tries to remove the selected column whose removal gives\n",
"    the highest adjusted R². A change is accepted only when it strictly raises\n",
"    the best adjusted R² seen so far, and the search ends after a pass with no\n",
"    accepted change. Scores are computed on ``X``/``y`` themselves with\n",
"    ``adjusted_r2_score``. Every accepted change is printed.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : DataFrame of candidate predictors.\n",
"    y : target values aligned with ``X``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        Selected column names (empty when ``X`` has no columns).\n",
"    \"\"\"\n",
"\n",
"    def subset_score(features):\n",
"        # Adjusted R² of a LinearRegression fitted on the given columns.\n",
"        model = LinearRegression().fit(X[features], y)\n",
"        return adjusted_r2_score(y, model.predict(X[features]), model)\n",
"\n",
"    selected_features = []  # Start with an empty set of features\n",
"    remaining_features = list(X.columns)\n",
"    best_score = -np.inf\n",
"    improved = True  # Flag to track if improvement happens\n",
"\n",
"    while improved:\n",
"        improved = False  # Reset improvement flag\n",
"\n",
"        # FORWARD STEP: Try adding a new feature\n",
"        forward_scores = [\n",
"            (feature, subset_score(selected_features + [feature]))\n",
"            for feature in remaining_features\n",
"        ]\n",
"        if forward_scores:\n",
"            best_new_feature, best_new_score = max(forward_scores, key=lambda item: item[1])\n",
"            if best_new_score > best_score:  # Add feature if Adjusted R² improves\n",
"                selected_features.append(best_new_feature)\n",
"                remaining_features.remove(best_new_feature)\n",
"                best_score = best_new_score\n",
"                improved = True\n",
"                print(f\"Added Feature: {best_new_feature}, Adjusted R²: {best_score:.4f}\")\n",
"\n",
"        # BACKWARD STEP: Try removing a feature\n",
"        backward_scores = []\n",
"        for feature in selected_features:\n",
"            temp_features = [f for f in selected_features if f != feature]\n",
"            if not temp_features:  # Ensure at least one feature is selected\n",
"                continue\n",
"            backward_scores.append((feature, subset_score(temp_features)))\n",
"\n",
"        if backward_scores:\n",
"            # Pick the removal that leaves the HIGHEST adjusted R². Taking the lowest one\n",
"            # would target the most useful feature, and the improvement test below\n",
"            # would then practically never pass.\n",
"            worst_feature, best_new_score = max(backward_scores, key=lambda item: item[1])\n",
"            if best_new_score > best_score:  # Remove feature if Adjusted R² improves\n",
"                selected_features.remove(worst_feature)\n",
"                remaining_features.append(worst_feature)\n",
"                best_score = best_new_score\n",
"                improved = True\n",
"                print(f\"Removed Feature: {worst_feature}, Adjusted R²: {best_score:.4f}\")\n",
"\n",
"    return selected_features\n",
"\n",
"# Run stepwise regression\n",
"best_selected_features = stepwise_regression(train_X, train_y)\n",
"print(\"\\nSelected Features:\", best_selected_features)\n",
"\n",
"# Refit on the training set using only the selected columns\n",
"final_model = LinearRegression()\n",
"final_model.fit(train_X.loc[:, best_selected_features], train_y)\n",
"\n",
"# Predictions for the training and test partitions\n",
"train_pred = final_model.predict(train_X.loc[:, best_selected_features])\n",
"test_pred = final_model.predict(test_X.loc[:, best_selected_features])\n",
"\n",
"# Adjusted R² on each partition\n",
"train_adj_r2 = adjusted_r2_score(train_y, train_pred, final_model)\n",
"test_adj_r2 = adjusted_r2_score(test_y, test_pred, final_model)\n",
"\n",
"summary_lines = [\n",
"    \"\\nFinal Model Performance:\",\n",
"    f\"Adjusted R² (Training Set): {train_adj_r2:.4f}\",\n",
"    f\"Adjusted R² (Test Set): {test_adj_r2:.4f}\",\n",
"]\n",
"print(\"\\n\".join(summary_lines))"
]
},
{
"cell_type": "markdown",
"id": "1b52b955-5636-4a68-a3d9-28dc6d582acd",
"metadata": {},
"source": [
"#### REGULARIZATION"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "28cb0ca7-a200-45df-a90c-6a0bf5f442ef",
"metadata": {},
"outputs": [],
"source": [
"# Standardize features (since regularization methods are sensitive to feature scaling)\n",
"# The scaler is fitted on the training partition only and then applied to both partitions,\n",
"# so no information from the test set enters the scaling.\n",
"scaler = StandardScaler()\n",
"train_X_scaled = scaler.fit_transform(train_X)\n",
"test_X_scaled = scaler.transform(test_X)"
]
},
{
"cell_type": "markdown",
"id": "01163fbc-5066-46ed-bf9e-cac402f896d3",
"metadata": {},
"source": [
"#### Lasso Method"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "01f016c7-ca0c-4ccf-a33e-a89ae5d35c31",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Performance Measures - Test Set\n",
"Mean Absolute Error: 1017.99\n",
"Mean Absolute Percentage Error: 9.85%\n",
"Root Mean Squared Error: 1602.96\n"
]
}
],
"source": [
"# Fit Lasso model: given alpha\n",
"# NOTE(review): the alpha behind the saved output is not recorded in this notebook;\n",
"# 1.0 (the scikit-learn default) is assumed here - confirm the intended value.\n",
"lasso = Lasso(alpha=1.0)\n",
"lasso.fit(train_X_scaled, train_y)\n",
"\n",
"# Test Set\n",
"mae = metrics.mean_absolute_error(test_y,lasso.predict(test_X_scaled))\n",
"mape = metrics.mean_absolute_percentage_error(test_y, lasso.predict(test_X_scaled))\n",
"mse = metrics.mean_squared_error(test_y, lasso.predict(test_X_scaled))\n",
"rmse = np.sqrt(mse)\n",
"print('\\nPerformance Measures - Test Set')\n",
"print(f'Mean Absolute Error: {mae:.2f}')\n",
"print(f'Mean Absolute Percentage Error: {mape*100:.2f}%')\n",
"print(f'Root Mean Squared Error: {rmse:.2f}')"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "fc2937a5-5b72-4c0a-9a5e-f0195bc5c25b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Performance Measures - Test Set\n",
"Mean Absolute Error: 1004.52\n",
"Mean Absolute Percentage Error: 9.71%\n",
"Root Mean Squared Error: 1527.54\n",
"Lasso-CV chosen regularization: 32.8906\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Feature \n",
" Lasso_Coefficient \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" Age_08_04 \n",
" -2064.078027 \n",
" \n",
" \n",
" 1 \n",
" KM \n",
" -605.185323 \n",
" \n",
" \n",
" 2 \n",
" HP \n",
" 122.425179 \n",
" \n",
" \n",
" 3 \n",
" Met_Color \n",
" -0.000000 \n",
" \n",
" \n",
" 4 \n",
" Automatic \n",
" -0.000000 \n",
" \n",
" \n",
" 5 \n",
" CC \n",
" -0.000000 \n",
" \n",
" \n",
" 6 \n",
" Doors \n",
" -151.531573 \n",
" \n",
" \n",
" 7 \n",
" Quarterly_Tax \n",
" 408.213102 \n",
" \n",
" \n",
" 8 \n",
" Weight \n",
" 1346.496654 \n",
" \n",
" \n",
" 9 \n",
" Fuel_Type_Diesel \n",
" -246.661435 \n",
" \n",
" \n",
" 10 \n",
" Fuel_Type_Petrol \n",
" 716.945506 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Feature Lasso_Coefficient\n",
"0 Age_08_04 -2064.078027\n",
"1 KM -605.185323\n",
"2 HP 122.425179\n",
"3 Met_Color -0.000000\n",
"4 Automatic -0.000000\n",
"5 CC -0.000000\n",
"6 Doors -151.531573\n",
"7 Quarterly_Tax 408.213102\n",
"8 Weight 1346.496654\n",
"9 Fuel_Type_Diesel -246.661435\n",
"10 Fuel_Type_Petrol 716.945506"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit Lasso model: use cross-validation to determine alpha\n",
"# NOTE(review): the estimator was never constructed in this cell, so `lassoCV` was undefined\n",
"# on a fresh kernel. 5-fold CV (scikit-learn's default) is assumed -- confirm against the\n",
"# original run.\n",
"lassoCV = LassoCV(cv=5)\n",
"lassoCV.fit(train_X_scaled, train_y)\n",
"\n",
"# Test Set: predict once and reuse the predictions for every metric\n",
"lassoCV_test_pred = lassoCV.predict(test_X_scaled)\n",
"mae = metrics.mean_absolute_error(test_y, lassoCV_test_pred)\n",
"mape = metrics.mean_absolute_percentage_error(test_y, lassoCV_test_pred)\n",
"mse = metrics.mean_squared_error(test_y, lassoCV_test_pred)\n",
"rmse = np.sqrt(mse)\n",
"print('\\nPerformance Measures - Test Set')\n",
"print(f'Mean Absolute Error: {mae:.2f}')\n",
"print(f'Mean Absolute Percentage Error: {mape*100:.2f}%')\n",
"print(f'Root Mean Squared Error: {rmse:.2f}')\n",
"print(f'Lasso-CV chosen regularization: {lassoCV.alpha_:.4f}')\n",
"\n",
"# The model was fit on train_X_scaled, so feature names are taken from train_X's columns\n",
"lasso_coef_df = pd.DataFrame({\n",
"    \"Feature\": train_X.columns,\n",
"    \"Lasso_Coefficient\": lassoCV.coef_\n",
"})\n",
"lasso_coef_df"
]
},
{
"cell_type": "markdown",
"id": "1760bbc0-5fb0-4575-8f69-06145d6432bb",
"metadata": {},
"source": [
"#### Ridge Method"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "cccd8ecf-a9cf-41ad-8f9b-94d0b7632e34",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Performance Measures - Test Set\n",
"Mean Absolute Error: 1018.24\n",
"Mean Absolute Percentage Error: 9.85%\n",
"Root Mean Squared Error: 1602.86\n"
]
}
],
"source": [
"# Fit Ridge model: given alpha\n",
"# NOTE(review): the fitting code was missing from this cell, so `ridge` was undefined on a\n",
"# fresh kernel. The alpha behind the recorded output is not shown in this cell; 1.0\n",
"# (scikit-learn's default) is assumed here -- confirm and adjust if needed.\n",
"ridge = Ridge(alpha=1.0)\n",
"ridge.fit(train_X_scaled, train_y)\n",
"\n",
"# Test Set: predict once and reuse the predictions for every metric\n",
"ridge_test_pred = ridge.predict(test_X_scaled)\n",
"mae = metrics.mean_absolute_error(test_y, ridge_test_pred)\n",
"mape = metrics.mean_absolute_percentage_error(test_y, ridge_test_pred)\n",
"mse = metrics.mean_squared_error(test_y, ridge_test_pred)\n",
"rmse = np.sqrt(mse)\n",
"print('\\nPerformance Measures - Test Set')\n",
"print(f'Mean Absolute Error: {mae:.2f}')\n",
"print(f'Mean Absolute Percentage Error: {mape*100:.2f}%')\n",
"print(f'Root Mean Squared Error: {rmse:.2f}')"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "39119203-7877-4998-a7b9-cfcda9a581c0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Performance Measures - Test Set\n",
"Mean Absolute Error: 1017.84\n",
"Mean Absolute Percentage Error: 9.85%\n",
"Root Mean Squared Error: 1600.13\n",
"Bayesian Ridge chosen regularization: 1.9180\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Feature \n",
" Bayesian_Ridge_Coefficient \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" Age_08_04 \n",
" -2043.218002 \n",
" \n",
" \n",
" 1 \n",
" KM \n",
" -595.632852 \n",
" \n",
" \n",
" 2 \n",
" HP \n",
" 44.540875 \n",
" \n",
" \n",
" 3 \n",
" Met_Color \n",
" -33.249048 \n",
" \n",
" \n",
" 4 \n",
" Automatic \n",
" -33.543001 \n",
" \n",
" \n",
" 5 \n",
" CC \n",
" -19.911395 \n",
" \n",
" \n",
" 6 \n",
" Doors \n",
" -233.314366 \n",
" \n",
" \n",
" 7 \n",
" Quarterly_Tax \n",
" 592.973300 \n",
" \n",
" \n",
" 8 \n",
" Weight \n",
" 1504.044958 \n",
" \n",
" \n",
" 9 \n",
" Fuel_Type_Diesel \n",
" -400.972781 \n",
" \n",
" \n",
" 10 \n",
" Fuel_Type_Petrol \n",
" 892.664894 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Feature Bayesian_Ridge_Coefficient\n",
"0 Age_08_04 -2043.218002\n",
"1 KM -595.632852\n",
"2 HP 44.540875\n",
"3 Met_Color -33.249048\n",
"4 Automatic -33.543001\n",
"5 CC -19.911395\n",
"6 Doors -233.314366\n",
"7 Quarterly_Tax 592.973300\n",
"8 Weight 1504.044958\n",
"9 Fuel_Type_Diesel -400.972781\n",
"10 Fuel_Type_Petrol 892.664894"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit Ridge model: use Bayesian method to determine alpha\n",
"# NOTE(review): the estimator was never constructed in this cell, so `ridgeB` was undefined\n",
"# on a fresh kernel. Default BayesianRidge hyper-parameters are assumed -- confirm against\n",
"# the original run.\n",
"ridgeB = BayesianRidge()\n",
"ridgeB.fit(train_X_scaled, train_y)\n",
"\n",
"# Test Set: predict once and reuse the predictions for every metric\n",
"ridgeB_test_pred = ridgeB.predict(test_X_scaled)\n",
"mae = metrics.mean_absolute_error(test_y, ridgeB_test_pred)\n",
"mape = metrics.mean_absolute_percentage_error(test_y, ridgeB_test_pred)\n",
"mse = metrics.mean_squared_error(test_y, ridgeB_test_pred)\n",
"rmse = np.sqrt(mse)\n",
"print('\\nPerformance Measures - Test Set')\n",
"print(f'Mean Absolute Error: {mae:.2f}')\n",
"print(f'Mean Absolute Percentage Error: {mape*100:.2f}%')\n",
"print(f'Root Mean Squared Error: {rmse:.2f}')\n",
"# Reported strength is the ratio of the two fitted precisions: lambda_ (weights) / alpha_ (noise)\n",
"regularization_strength = ridgeB.lambda_ / ridgeB.alpha_\n",
"print(f'Bayesian Ridge chosen regularization: {regularization_strength:.4f}')\n",
"\n",
"# The model was fit on train_X_scaled, so feature names are taken from train_X's columns\n",
"ridge_coef_df = pd.DataFrame({\n",
"    \"Feature\": train_X.columns,\n",
"    \"Bayesian_Ridge_Coefficient\": ridgeB.coef_\n",
"})\n",
"ridge_coef_df"
]
},
{
"cell_type": "markdown",
"id": "7d3a79f6-3b19-4114-b9c5-3f817136f3e3",
"metadata": {},
"source": [
"### OLS REGRESSION"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "2bd73a22-a02d-4df0-b52b-94ce8866ce91",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 67,
"id": "8d1c84d2-a782-4569-9ef1-7de66a96c9c4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"OLS Regression Results \n",
"\n",
" Dep. Variable: Price R-squared: 0.869 \n",
" \n",
"\n",
" Model: OLS Adj. R-squared: 0.868 \n",
" \n",
"\n",
" Method: Least Squares F-statistic: 859.6 \n",
" \n",
"\n",
" Date: Mon, 17 Mar 2025 Prob (F-statistic): 0.00 \n",
" \n",
"\n",
" Time: 18:50:19 Log-Likelihood: -12347. \n",
" \n",
"\n",
" No. Observations: 1436 AIC: 2.472e+04 \n",
" \n",
"\n",
" Df Residuals: 1424 BIC: 2.478e+04 \n",
" \n",
"\n",
" Df Model: 11 \n",
" \n",
"\n",
" Covariance Type: nonrobust \n",
" \n",
"
\n",
"\n",
"\n",
" coef std err t P>|t| [0.025 0.975] \n",
" \n",
"\n",
" Intercept -7325.9928 1231.633 -5.948 0.000 -9742.002 -4909.984 \n",
" \n",
"\n",
" Fuel_Type_Diesel[T.True] 627.9746 375.764 1.671 0.095 -109.136 1365.085 \n",
" \n",
"\n",
" Fuel_Type_Petrol[T.True] 2420.3110 368.314 6.571 0.000 1697.815 3142.807 \n",
" \n",
"\n",
" Age_08_04 -123.1200 2.596 -47.421 0.000 -128.213 -118.027 \n",
" \n",
"\n",
" KM -0.0169 0.001 -12.901 0.000 -0.019 -0.014 \n",
" \n",
"\n",
" HP 23.8514 3.466 6.881 0.000 17.052 30.651 \n",
" \n",
"\n",
" Met_Color 36.2887 74.966 0.484 0.628 -110.767 183.344 \n",
" \n",
"\n",
" Automatic 258.8277 157.775 1.640 0.101 -50.669 568.324 \n",
" \n",
"\n",
" CC -0.0627 0.091 -0.692 0.489 -0.241 0.115 \n",
" \n",
"\n",
" Doors -71.6120 39.658 -1.806 0.071 -149.407 6.183 \n",
" \n",
"\n",
" Quarterly_Tax 12.3125 1.650 7.463 0.000 9.076 15.549 \n",
" \n",
"\n",
" Weight 19.3566 1.218 15.894 0.000 16.968 21.746 \n",
" \n",
"
\n",
"\n",
"\n",
" Omnibus: 203.998 Durbin-Watson: 1.594 \n",
" \n",
"\n",
" Prob(Omnibus): 0.000 Jarque-Bera (JB): 2183.722 \n",
" \n",
"\n",
" Skew: -0.248 Prob(JB): 0.00 \n",
" \n",
"\n",
" Kurtosis: 9.021 Cond. No. 2.79e+06 \n",
" \n",
"
Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.79e+06. This might indicate that there are strong multicollinearity or other numerical problems."
],
"text/latex": [
"\\begin{center}\n",
"\\begin{tabular}{lclc}\n",
"\\toprule\n",
"\\textbf{Dep. Variable:} & Price & \\textbf{ R-squared: } & 0.869 \\\\\n",
"\\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.868 \\\\\n",
"\\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 859.6 \\\\\n",
"\\textbf{Date:} & Mon, 17 Mar 2025 & \\textbf{ Prob (F-statistic):} & 0.00 \\\\\n",
"\\textbf{Time:} & 18:50:19 & \\textbf{ Log-Likelihood: } & -12347. \\\\\n",
"\\textbf{No. Observations:} & 1436 & \\textbf{ AIC: } & 2.472e+04 \\\\\n",
"\\textbf{Df Residuals:} & 1424 & \\textbf{ BIC: } & 2.478e+04 \\\\\n",
"\\textbf{Df Model:} & 11 & \\textbf{ } & \\\\\n",
"\\textbf{Covariance Type:} & nonrobust & \\textbf{ } & \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"\\begin{tabular}{lcccccc}\n",
" & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\\n",
"\\midrule\n",
"\\textbf{Intercept} & -7325.9928 & 1231.633 & -5.948 & 0.000 & -9742.002 & -4909.984 \\\\\n",
"\\textbf{Fuel\\_Type\\_Diesel[T.True]} & 627.9746 & 375.764 & 1.671 & 0.095 & -109.136 & 1365.085 \\\\\n",
"\\textbf{Fuel\\_Type\\_Petrol[T.True]} & 2420.3110 & 368.314 & 6.571 & 0.000 & 1697.815 & 3142.807 \\\\\n",
"\\textbf{Age\\_08\\_04} & -123.1200 & 2.596 & -47.421 & 0.000 & -128.213 & -118.027 \\\\\n",
"\\textbf{KM} & -0.0169 & 0.001 & -12.901 & 0.000 & -0.019 & -0.014 \\\\\n",
"\\textbf{HP} & 23.8514 & 3.466 & 6.881 & 0.000 & 17.052 & 30.651 \\\\\n",
"\\textbf{Met\\_Color} & 36.2887 & 74.966 & 0.484 & 0.628 & -110.767 & 183.344 \\\\\n",
"\\textbf{Automatic} & 258.8277 & 157.775 & 1.640 & 0.101 & -50.669 & 568.324 \\\\\n",
"\\textbf{CC} & -0.0627 & 0.091 & -0.692 & 0.489 & -0.241 & 0.115 \\\\\n",
"\\textbf{Doors} & -71.6120 & 39.658 & -1.806 & 0.071 & -149.407 & 6.183 \\\\\n",
"\\textbf{Quarterly\\_Tax} & 12.3125 & 1.650 & 7.463 & 0.000 & 9.076 & 15.549 \\\\\n",
"\\textbf{Weight} & 19.3566 & 1.218 & 15.894 & 0.000 & 16.968 & 21.746 \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"\\begin{tabular}{lclc}\n",
"\\textbf{Omnibus:} & 203.998 & \\textbf{ Durbin-Watson: } & 1.594 \\\\\n",
"\\textbf{Prob(Omnibus):} & 0.000 & \\textbf{ Jarque-Bera (JB): } & 2183.722 \\\\\n",
"\\textbf{Skew:} & -0.248 & \\textbf{ Prob(JB): } & 0.00 \\\\\n",
"\\textbf{Kurtosis:} & 9.021 & \\textbf{ Cond. No. } & 2.79e+06 \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"%\\caption{OLS Regression Results}\n",
"\\end{center}\n",
"\n",
"Notes: \\newline\n",
" [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline\n",
" [2] The condition number is large, 2.79e+06. This might indicate that there are \\newline\n",
" strong multicollinearity or other numerical problems."
],
"text/plain": [
"\n",
"\"\"\"\n",
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: Price R-squared: 0.869\n",
"Model: OLS Adj. R-squared: 0.868\n",
"Method: Least Squares F-statistic: 859.6\n",
"Date: Mon, 17 Mar 2025 Prob (F-statistic): 0.00\n",
"Time: 18:50:19 Log-Likelihood: -12347.\n",
"No. Observations: 1436 AIC: 2.472e+04\n",
"Df Residuals: 1424 BIC: 2.478e+04\n",
"Df Model: 11 \n",
"Covariance Type: nonrobust \n",
"============================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -7325.9928 1231.633 -5.948 0.000 -9742.002 -4909.984\n",
"Fuel_Type_Diesel[T.True] 627.9746 375.764 1.671 0.095 -109.136 1365.085\n",
"Fuel_Type_Petrol[T.True] 2420.3110 368.314 6.571 0.000 1697.815 3142.807\n",
"Age_08_04 -123.1200 2.596 -47.421 0.000 -128.213 -118.027\n",
"KM -0.0169 0.001 -12.901 0.000 -0.019 -0.014\n",
"HP 23.8514 3.466 6.881 0.000 17.052 30.651\n",
"Met_Color 36.2887 74.966 0.484 0.628 -110.767 183.344\n",
"Automatic 258.8277 157.775 1.640 0.101 -50.669 568.324\n",
"CC -0.0627 0.091 -0.692 0.489 -0.241 0.115\n",
"Doors -71.6120 39.658 -1.806 0.071 -149.407 6.183\n",
"Quarterly_Tax 12.3125 1.650 7.463 0.000 9.076 15.549\n",
"Weight 19.3566 1.218 15.894 0.000 16.968 21.746\n",
"==============================================================================\n",
"Omnibus: 203.998 Durbin-Watson: 1.594\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 2183.722\n",
"Skew: -0.248 Prob(JB): 0.00\n",
"Kurtosis: 9.021 Cond. No. 2.79e+06\n",
"==============================================================================\n",
"\n",
"Notes:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The condition number is large, 2.79e+06. This might indicate that there are\n",
"strong multicollinearity or other numerical problems.\n",
"\"\"\""
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 113,
"id": "0362501b-bfeb-4927-a873-81543963923e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age_08_04 \n",
" KM \n",
" HP \n",
" Quarterly_Tax \n",
" Weight \n",
" Fuel_Type_Petrol \n",
" Price \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 23 \n",
" 46986 \n",
" 90 \n",
" 210 \n",
" 1165 \n",
" False \n",
" 13500 \n",
" \n",
" \n",
" 1 \n",
" 23 \n",
" 72937 \n",
" 90 \n",
" 210 \n",
" 1165 \n",
" False \n",
" 13750 \n",
" \n",
" \n",
" 2 \n",
" 24 \n",
" 41711 \n",
" 90 \n",
" 210 \n",
" 1165 \n",
" False \n",
" 13950 \n",
" \n",
" \n",
" 3 \n",
" 26 \n",
" 48000 \n",
" 90 \n",
" 210 \n",
" 1165 \n",
" False \n",
" 14950 \n",
" \n",
" \n",
" 4 \n",
" 30 \n",
" 38500 \n",
" 90 \n",
" 210 \n",
" 1170 \n",
" False \n",
" 13750 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age_08_04 KM HP Quarterly_Tax Weight Fuel_Type_Petrol Price\n",
"0 23 46986 90 210 1165 False 13500\n",
"1 23 72937 90 210 1165 False 13750\n",
"2 24 41711 90 210 1165 False 13950\n",
"3 26 48000 90 210 1165 False 14950\n",
"4 30 38500 90 210 1170 False 13750"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"# NOTE(review): `predictors` and `df1` were not defined in this cell even though its recorded\n",
"# output is df1.head(). They are rebuilt here from the columns shown in that output and in the\n",
"# reduced OLS summary that follows; this assumes X is the full predictor DataFrame and y the\n",
"# matching Price values -- confirm against the cell that originally created them.\n",
"predictors = ['Age_08_04', 'KM', 'HP', 'Quarterly_Tax', 'Weight', 'Fuel_Type_Petrol']\n",
"df1 = X[predictors].assign(Price=y)\n",
"\n",
"# Formula string for statsmodels: Price regressed on the selected predictors\n",
"formula = 'Price ~' + '+'.join(predictors)\n",
"\n",
"# Display the modelling frame, matching the output recorded for this cell\n",
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "c832c55b-e417-4063-9a58-d13eb774bd7f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"OLS Regression Results \n",
"\n",
" Dep. Variable: Price R-squared: 0.868 \n",
" \n",
"\n",
" Model: OLS Adj. R-squared: 0.868 \n",
" \n",
"\n",
" Method: Least Squares F-statistic: 1569. \n",
" \n",
"\n",
" Date: Mon, 17 Mar 2025 Prob (F-statistic): 0.00 \n",
" \n",
"\n",
" Time: 19:01:35 Log-Likelihood: -12352. \n",
" \n",
"\n",
" No. Observations: 1436 AIC: 2.472e+04 \n",
" \n",
"\n",
" Df Residuals: 1429 BIC: 2.475e+04 \n",
" \n",
"\n",
" Df Model: 6 \n",
" \n",
"\n",
" Covariance Type: nonrobust \n",
" \n",
"
\n",
"\n",
"\n",
" coef std err t P>|t| [0.025 0.975] \n",
" \n",
"\n",
" Intercept -7162.2958 1186.423 -6.037 0.000 -9489.613 -4834.979 \n",
" \n",
"\n",
" Fuel_Type_Petrol[T.True] 1940.6025 250.039 7.761 0.000 1450.119 2431.086 \n",
" \n",
"\n",
" Age_08_04 -122.4599 2.571 -47.626 0.000 -127.504 -117.416 \n",
" \n",
"\n",
" KM -0.0172 0.001 -13.208 0.000 -0.020 -0.015 \n",
" \n",
"\n",
" HP 21.2704 3.095 6.872 0.000 15.199 27.342 \n",
" \n",
"\n",
" Quarterly_Tax 12.1300 1.650 7.351 0.000 8.893 15.367 \n",
" \n",
"\n",
" Weight 19.5809 1.074 18.240 0.000 17.475 21.687 \n",
" \n",
"
\n",
"\n",
"\n",
" Omnibus: 209.991 Durbin-Watson: 1.588 \n",
" \n",
"\n",
" Prob(Omnibus): 0.000 Jarque-Bera (JB): 2292.692 \n",
" \n",
"\n",
" Skew: -0.271 Prob(JB): 0.00 \n",
" \n",
"\n",
" Kurtosis: 9.166 Cond. No. 2.68e+06 \n",
" \n",
"
Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.68e+06. This might indicate that there are strong multicollinearity or other numerical problems."
],
"text/latex": [
"\\begin{center}\n",
"\\begin{tabular}{lclc}\n",
"\\toprule\n",
"\\textbf{Dep. Variable:} & Price & \\textbf{ R-squared: } & 0.868 \\\\\n",
"\\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.868 \\\\\n",
"\\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 1569. \\\\\n",
"\\textbf{Date:} & Mon, 17 Mar 2025 & \\textbf{ Prob (F-statistic):} & 0.00 \\\\\n",
"\\textbf{Time:} & 19:01:35 & \\textbf{ Log-Likelihood: } & -12352. \\\\\n",
"\\textbf{No. Observations:} & 1436 & \\textbf{ AIC: } & 2.472e+04 \\\\\n",
"\\textbf{Df Residuals:} & 1429 & \\textbf{ BIC: } & 2.475e+04 \\\\\n",
"\\textbf{Df Model:} & 6 & \\textbf{ } & \\\\\n",
"\\textbf{Covariance Type:} & nonrobust & \\textbf{ } & \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"\\begin{tabular}{lcccccc}\n",
" & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\\n",
"\\midrule\n",
"\\textbf{Intercept} & -7162.2958 & 1186.423 & -6.037 & 0.000 & -9489.613 & -4834.979 \\\\\n",
"\\textbf{Fuel\\_Type\\_Petrol[T.True]} & 1940.6025 & 250.039 & 7.761 & 0.000 & 1450.119 & 2431.086 \\\\\n",
"\\textbf{Age\\_08\\_04} & -122.4599 & 2.571 & -47.626 & 0.000 & -127.504 & -117.416 \\\\\n",
"\\textbf{KM} & -0.0172 & 0.001 & -13.208 & 0.000 & -0.020 & -0.015 \\\\\n",
"\\textbf{HP} & 21.2704 & 3.095 & 6.872 & 0.000 & 15.199 & 27.342 \\\\\n",
"\\textbf{Quarterly\\_Tax} & 12.1300 & 1.650 & 7.351 & 0.000 & 8.893 & 15.367 \\\\\n",
"\\textbf{Weight} & 19.5809 & 1.074 & 18.240 & 0.000 & 17.475 & 21.687 \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"\\begin{tabular}{lclc}\n",
"\\textbf{Omnibus:} & 209.991 & \\textbf{ Durbin-Watson: } & 1.588 \\\\\n",
"\\textbf{Prob(Omnibus):} & 0.000 & \\textbf{ Jarque-Bera (JB): } & 2292.692 \\\\\n",
"\\textbf{Skew:} & -0.271 & \\textbf{ Prob(JB): } & 0.00 \\\\\n",
"\\textbf{Kurtosis:} & 9.166 & \\textbf{ Cond. No. } & 2.68e+06 \\\\\n",
"\\bottomrule\n",
"\\end{tabular}\n",
"%\\caption{OLS Regression Results}\n",
"\\end{center}\n",
"\n",
"Notes: \\newline\n",
" [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline\n",
" [2] The condition number is large, 2.68e+06. This might indicate that there are \\newline\n",
" strong multicollinearity or other numerical problems."
],
"text/plain": [
"\n",
"\"\"\"\n",
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: Price R-squared: 0.868\n",
"Model: OLS Adj. R-squared: 0.868\n",
"Method: Least Squares F-statistic: 1569.\n",
"Date: Mon, 17 Mar 2025 Prob (F-statistic): 0.00\n",
"Time: 19:01:35 Log-Likelihood: -12352.\n",
"No. Observations: 1436 AIC: 2.472e+04\n",
"Df Residuals: 1429 BIC: 2.475e+04\n",
"Df Model: 6 \n",
"Covariance Type: nonrobust \n",
"============================================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -7162.2958 1186.423 -6.037 0.000 -9489.613 -4834.979\n",
"Fuel_Type_Petrol[T.True] 1940.6025 250.039 7.761 0.000 1450.119 2431.086\n",
"Age_08_04 -122.4599 2.571 -47.626 0.000 -127.504 -117.416\n",
"KM -0.0172 0.001 -13.208 0.000 -0.020 -0.015\n",
"HP 21.2704 3.095 6.872 0.000 15.199 27.342\n",
"Quarterly_Tax 12.1300 1.650 7.351 0.000 8.893 15.367\n",
"Weight 19.5809 1.074 18.240 0.000 17.475 21.687\n",
"==============================================================================\n",
"Omnibus: 209.991 Durbin-Watson: 1.588\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 2292.692\n",
"Skew: -0.271 Prob(JB): 0.00\n",
"Kurtosis: 9.166 Cond. No. 2.68e+06\n",
"==============================================================================\n",
"\n",
"Notes:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"[2] The condition number is large, 2.68e+06. This might indicate that there are\n",
"strong multicollinearity or other numerical problems.\n",
"\"\"\""
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `sm` is statsmodels.formula.api in this notebook, so `ols` takes a formula plus a DataFrame\n",
"reduced_ols_spec = sm.ols(formula=formula, data=df1)\n",
"OLS_model = reduced_ols_spec.fit()\n",
"OLS_model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"id": "ca4faf6a-629f-4d71-8276-9b24a9c2dd8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Performance Measures\n",
"Mean Absolute Error: 975.97\n",
"Mean Absolute Percentage Error: 9.64%\n",
"Root Mean Squared Error: 1316.14\n"
]
}
],
"source": [
"# Score the fitted OLS model against X / y; predictions are computed once and reused\n",
"ols_pred = OLS_model.predict(X)\n",
"mae = metrics.mean_absolute_error(y, ols_pred)\n",
"mape = metrics.mean_absolute_percentage_error(y, ols_pred)\n",
"mse = metrics.mean_squared_error(y, ols_pred)\n",
"rmse = np.sqrt(mse)\n",
"\n",
"print('\\nPerformance Measures')\n",
"print(f'Mean Absolute Error: {mae:.2f}')\n",
"print(f'Mean Absolute Percentage Error: {mape*100:.2f}%')\n",
"print(f'Root Mean Squared Error: {rmse:.2f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fa74bcf-3fbe-4a93-b7a7-5be1bd326ffe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:base] *",
"language": "python",
"name": "conda-base-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}