# Linear Regression with Stats
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 3 21:43:32 2023
@author: Syed Kamran Bukhari
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Linear regression on 'train.csv' with RMSE, R2 and Durbin-Watson statistics.
# Steps: load the CSV, drop three SaleCondition categories, integer-encode
# SaleCondition, split 80/20, impute missing values, fit a LinearRegression
# and print evaluation statistics for the held-out split.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.stattools import durbin_watson

# Read CSV file
dataset = pd.read_csv('train.csv')

# Boolean mask for the rows to be deleted (excluded sale conditions)
mask = dataset['SaleCondition'].isin(['AdjLand', 'Alloca', 'Family'])
# Keep only rows that don't match. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
dataset = dataset[~mask].copy()

# Select relevant features
selected_features = ["OverallQual", "GarageCars", "YearBuilt", "PoolArea", "SaleCondition"]

# Encode the SaleCondition categories as integers.
# NOTE(review): label encoding gives the categories an arbitrary integer order
# that a linear model treats as a numeric scale; one-hot encoding may be more
# appropriate -- confirm intent.
label_encoder = LabelEncoder()
dataset['SaleCondition'] = label_encoder.fit_transform(dataset['SaleCondition'])

X = dataset[selected_features]
# The target is taken positionally as the last column of the CSV
# (presumably the sale price -- verify against the data file).
# It is reshaped to a single column because SimpleImputer expects 2-D input.
Y = dataset.iloc[:, -1].values.reshape(-1, 1)

# Split BEFORE imputing so the imputation statistics are learned from the
# training rows only; fitting the imputers on the full data would leak
# information from the test rows into the model inputs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

imputer_1 = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imputer_2 = SimpleImputer(missing_values=np.nan, strategy="median")
X_train = imputer_1.fit_transform(X_train)
X_test = imputer_1.transform(X_test)
# NOTE(review): imputing a missing *target* is unusual; dropping such rows may
# be preferable -- confirm whether the target column can contain NaN at all.
Y_train = imputer_2.fit_transform(Y_train)
Y_test = imputer_2.transform(Y_test)

# Fit the regressor
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Prediction values for the held-out split
Y_pred = regressor.predict(X_test)

# Signed residuals. The sign is kept on purpose: squaring makes it irrelevant
# for RMSE, but Durbin-Watson is defined on signed residuals and abs() would
# distort it.
residual = Y_test - Y_pred

# RMSE score: square ROOT of the mean squared error (without the root the
# value is the MSE, in squared target units).
rmse = np.sqrt(np.mean(residual ** 2))
print('The RMSE score is =', rmse)

# R2 score
R2 = r2_score(Y_test, Y_pred)
print('The R2 score is =', R2)

# Durbin Watson Statistics, computed on the flattened signed residuals so the
# result is a single number.
# NOTE(review): the statistic depends on row order, and train_test_split
# shuffles the rows by default, so interpret this value with care.
DW = durbin_watson(residual.ravel())
print('Durbin Watson Statistics =', DW)
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 8 10:11:48 2023
@author: pc
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Random forest regression on 'Position_Salaries.csv': fit on all rows,
# predict a single value, draw three diagnostic plots and print in-sample
# RMSE, R2 and Durbin-Watson statistics.
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from statsmodels.stats.stattools import durbin_watson

# Import CSV File
dataset = pd.read_csv('Position_Salaries.csv')
x = dataset.iloc[:, 1:2].values  # second column, sliced so it stays 2-D (n, 1)
y = dataset.iloc[:, -1].values   # last column as a 1-D target

reg = RandomForestRegressor(n_estimators=90, random_state=0)
reg.fit(x, y)

y_pred = reg.predict([[6.5]])
print('predicted salary is=', y_pred)

# Predictions for the training rows, computed once and reused by the plots
# and the metrics below.
y_fit = reg.predict(x)

plt.scatter(x, y, color='red')
plt.plot(x, y_fit, color='blue')
plt.xlabel('Position')
plt.ylabel('Salary')
plt.title('Position vs Salary (Low resolution)')
plt.show()

# High resolution: evaluate the model on a 0.1-step grid.
# x.min()/x.max() return scalars; the builtin min()/max() on a 2-D array
# return size-1 arrays, and passing those to np.arange relies on an implicit
# array-to-scalar conversion that NumPy has deprecated.
x_grid = np.arange(x.min(), x.max(), 0.1).reshape(-1, 1)
plt.scatter(x, y, color='red')
plt.plot(x_grid, reg.predict(x_grid), color='blue')
plt.xlabel('Position')
plt.ylabel('Salary')
plt.title('Position vs Salary (High resolution)')
plt.show()

# Original vs predicted, with the identity line as the ideal reference.
# A straight line only needs its two endpoints, so there is no need to
# allocate one point per unit of the target range.
y_ideal = np.array([y.min(), y.max()])
plt.scatter(y, y_fit, color='red')
plt.plot(y_ideal, y_ideal, color='blue')
plt.xlabel('origanl value')
plt.ylabel('predicted value')
plt.title('orignal vs predicted')
plt.show()

# Signed in-sample residuals. These metrics are computed on the same rows the
# model was fitted on, so they are optimistic estimates of real performance.
residuals = y - y_fit

# rmse value should be equal to 0 for best model.
# RMSE is the root of the mean SQUARED error; averaging absolute errors
# before the root does not give RMSE.
rmse = sqrt(np.mean(residuals ** 2))
print('RMSE score is= ', rmse)

# r2 score should be equal to 1 for best model
R2 = r2_score(y, y_fit)
print('r2_score= ', R2)

# Durbin-Watson is defined on signed residuals; taking abs() first would
# distort the statistic.
db = durbin_watson(residuals)
print('Durbin watson= ', db)