Instead of building our model with linear regression alone, we should compare it against several other algorithms and use whichever performs best. The program for this is given below. It fits our data to a number of algorithms and prints each one's Score and RMSE values, from which we can pick the best.
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Note: Ridge/Lasso's normalize=, RandomForest's criterion='mse' and
# GradientBoosting's loss='ls' follow older scikit-learn releases;
# these names were renamed or removed from scikit-learn 1.0/1.2 onwards.

# Load the training data and move 'SalePrice' to the last column.
df = pd.read_csv('./training_data.csv')
i = list(df.columns.values)
i.pop(i.index('SalePrice'))
df0 = df[i + ['SalePrice']]

# Keep only the numeric columns; the last one ('SalePrice') is the target.
df = df0.select_dtypes(include=['integer', 'float'])
X = df[list(df.columns)[:-1]]
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y)

def linear():
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def ridge():
    regressor = Ridge(alpha=.3, normalize=True)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def lasso():
    regressor = Lasso(alpha=0.00009, normalize=True)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def elasticnet():
    regressor = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def randomforest():
    regressor = RandomForestRegressor(n_estimators=15, min_samples_split=15, criterion='mse', max_depth=None)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for RandomForest", regressor.feature_importances_)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def perceptron():
    regressor = MLPRegressor(hidden_layer_sizes=(5000,), activation='relu', solver='adam', max_iter=1000)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Co-efficients of Perceptron", regressor.coefs_)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def decisiontree():
    regressor = DecisionTreeRegressor(min_samples_split=30, max_depth=None)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for DecisionTrees", regressor.feature_importances_)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def adaboost():
    regressor = AdaBoostRegressor(random_state=8, loss='exponential')
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Adaboost", regressor.feature_importances_)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def extratrees():
    regressor = ExtraTreesRegressor(n_estimators=50)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Extratrees", regressor.feature_importances_)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

def gradientboosting():
    regressor = GradientBoostingRegressor(loss='ls', n_estimators=500, min_samples_split=15)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Gradientboosting", regressor.feature_importances_)
    return (regressor.score(X_test, y_test), sqrt(mean_squared_error(y_test, y_predictions)))

print("Score, RMSE values")
print("Linear = ", linear())
print("Ridge = ", ridge())
print("Lasso = ", lasso())
print("ElasticNet = ", elasticnet())
print("RandomForest = ", randomforest())
print("Perceptron = ", perceptron())
print("DecisionTree = ", decisiontree())
print("AdaBoost = ", adaboost())
print("ExtraTrees = ", extratrees())
print("GradientBoosting = ", gradientboosting())
The output of the program:
Score, RMSE values
Linear = (0.7437086925668539, 40067.32048747698)
Ridge = (0.7426559924644496, 40149.523137601194)
Lasso = (0.7437086997392647, 40067.31992682729)
ElasticNet = (0.7427716507607811, 40140.499909601196)
RandomForest = (0.7816174352942802, 36985.57224959144)
Perceptron = (0.7090884723574984, 42687.80529374248)
DecisionTree = (0.7205230305007451, 41840.45264436496)
AdaBoost = (0.7405881117926998, 40310.51057481991)
ExtraTrees = (0.8112271823246542, 34386.90514804029)
GradientBoosting = (0.770865727419495, 37885.095662535474)
Selected Features for RandomForest [0.61070268 0.04279095 0.04336447 0.17066371 0.01107406 0.01329107
0.0065515 0.03938371 0.02458596 0.02051551 0.01707638]
Selected Features for DecisionTrees [0.75618387 0.03596786 0.02304119 0.13037245 0.0022674 0. 0.00739768 0.01056845 0.01184136 0.01171254 0.01064719]
Selected Features for Adaboost [0.38413232 0.18988447 0.03844386 0.12826885 0.03857277 0.03995005
0.01059839 0.08066205 0.05036717 0.01473333 0.02438674]
Selected Features for Extratrees [0.33168574 0.04675749 0.05913052 0.11159271 0.05178125 0.02947481
0.03966461 0.16786223 0.06241882 0.05316226 0.04646956]
Selected Features for Gradientboosting [0.04426232 0.16359645 0.14768597 0.25403034 0.02119119 0.04361512
0.01825781 0.01626673 0.15891844 0.07188963 0.06028599]
Co-efficients of Perceptron [array([[ 2.83519650e-01, 7.33024272e-03, 2.80373628e-01, …, -1.43939606e-03, -3.84913926e-02],
[ 1.34495184e-01, 1.31687141e-02, 1.72078666e-04, …,1.70666499e-23, -2.31494718e-02, -1.08758545e-02],
[ 9.44490485e-02, -2.34835375e-02, 2.37798999e-02, …, -1.74549692e-02, -2.70192753e-02, -3.67706290e-02],
…,
[ 1.59527225e-01, -3.19744701e-02, -1.22884400e-01, …, -2.35994429e-26, -3.03880584e-02, -2.85251050e-02],
[-3.63149939e-01, -4.05674884e-02, 2.66679331e-01, …, -1.73628910e-02, 7.40224353e-03, -6.89871249e-03],
[-4.30743882e-01, 7.07948777e-03, 3.34518179e-01, …, -1.74075111e-02, 3.47755293e-02, -2.64627071e-02]]),
array([[ 0.16789784],[-0.01864141],[ 0.20432696],…,[ 0.01739125],[-0.02779454],[-0.00476935]])]
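
One caveat: a single train_test_split gives a somewhat noisy estimate, since the scores above depend on which rows happen to land in the test set. A minimal k-fold cross-validation sketch, reusing X and y from above (the model subset and cv=5 here are illustrative choices, not part of the original script):

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor

# Illustrative subset of the models compared above; X and y as defined earlier.
models = {'Linear': LinearRegression(),
          'ExtraTrees': ExtraTreesRegressor(n_estimators=50)}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5)  # five R^2 scores, one per fold
    print(name, round(scores.mean(), 4), "+/-", round(scores.std(), 4))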
To declare one model the best, we must take into account not only its Score and RMSE value but also factors such as the Threshold Limit and Sensitivity. We will look at these, and at each of the algorithms mentioned above, in detail later. Some of the algorithms above report which features they used for their predictions, but linear, ridge, lasso and elasticnet do not have this property. For such algorithms we must therefore select features ourselves, using the RFE technique, and pass those in, as sketched below. We will cover this in the next section, 'feature selection'.
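
As a preview, a minimal RFE sketch using scikit-learn's feature_selection module, assuming the same X_train and y_train from above (keeping 5 features is an illustrative choice):

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Rank features by recursively eliminating the weakest ones,
# then keep the top 5 (an illustrative number).
selector = RFE(LinearRegression(), n_features_to_select=5)
selector.fit(X_train, y_train)
print("Selected columns:", list(X_train.columns[selector.support_]))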