Linear Regression with Stats

 # -*- coding: utf-8 -*-

"""

Created on Fri Nov  3 21:43:32 2023


@author: Syed Kamran Bukhari

"""


import numpy as np

import pandas as pd

import matplotlib.pyplot as plt


# Read the training data from the CSV file.
dataset = pd.read_csv('train.csv')

# Sale conditions whose rows are removed before modelling.
excluded_conditions = ['AdjLand', 'Alloca', 'Family']

# Boolean mask marking the rows to be deleted.
mask = dataset['SaleCondition'].isin(excluded_conditions)

# Keep only the rows that do not match. The explicit copy makes `dataset` an
# independent DataFrame, so the column assignment done further down writes to
# this frame instead of to a slice of the frame that was read from disk
# (assigning into such a slice raises pandas' SettingWithCopyWarning).
dataset = dataset[~mask].copy()


from sklearn.preprocessing import LabelEncoder

# Columns used as model inputs.
selected_features = [
    "OverallQual",
    "GarageCars",
    "YearBuilt",
    "PoolArea",
    "SaleCondition",
]

# Replace the SaleCondition strings with integer category codes.
label_encoder = LabelEncoder()
sale_condition_codes = label_encoder.fit_transform(dataset['SaleCondition'])
dataset['SaleCondition'] = sale_condition_codes

# Feature matrix: the selected columns only.
X = dataset[selected_features]

# Target: the last column of the file, shaped as an (n_rows, 1) column vector.
Y = dataset.iloc[:, -1].values.reshape(-1, 1)


from sklearn.impute import SimpleImputer

# Features: NaN entries are filled with each column's most frequent value.
imputer_1 = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# Target: NaN entries are filled with the median.
imputer_2 = SimpleImputer(missing_values=np.nan, strategy="median")

# NOTE(review): both imputers are fitted on the complete data set, i.e. before
# the train/test split that follows - confirm this is intended.
# The imputed features are wrapped in a new DataFrame; the target stays as the
# array returned by the imputer.
X = pd.DataFrame(imputer_1.fit_transform(X))
Y = imputer_2.fit_transform(Y)


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

# Fit a linear regression model on the training portion.
regressor = LinearRegression().fit(X_train, Y_train)

# Predicted target values for the held-out rows.
Y_pred = regressor.predict(X_test)


# RMSE score: square root of the mean squared prediction error.

# Absolute prediction error for every held-out row.
residual = abs(Y_test - Y_pred)

# Mean of the squared errors (this is the MSE).
ss = (residual ** 2).sum() / len(Y_test)

# Take the root so the printed value really is the RMSE named in the label
# and is expressed in the same units as the target.
rmse = np.sqrt(ss)

print('The RMSE score is =', rmse)


# R2 score (coefficient of determination) on the held-out rows.
from sklearn.metrics import r2_score

R2 = r2_score(y_true=Y_test, y_pred=Y_pred)
print('The R2 score is =', R2)


# Durbin Watson Statistics
from statsmodels.stats.stattools import durbin_watson

# The statistic is built from the differences between consecutive residuals,
# so it must be given the signed errors, not their absolute values.
# Flattening to 1-D makes the result a single number instead of a
# one-element array.
signed_residual = np.ravel(Y_test - Y_pred)

DW = durbin_watson(signed_residual)
print('Durbin Watson Statistics =', DW)

Comments

  1. # -*- coding: utf-8 -*-
    """
    Created on Wed Nov 8 10:11:48 2023

    @author: pc
    """

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    from sklearn.ensemble import RandomForestRegressor

    # Load the CSV file.
    dataset = pd.read_csv('Position_Salaries.csv')

    # Second column as a 2-D feature array, last column as the 1-D target.
    x = dataset.iloc[:, 1:2].values
    y = dataset.iloc[:, -1].values

    # Random forest with 90 trees; random_state fixes the seed.
    reg = RandomForestRegressor(n_estimators=90, random_state=0).fit(x, y)

    # Prediction for a single input value of 6.5.
    y_pred = reg.predict([[6.5]])
    print('predicted salary is=', y_pred)

    # Low resolution: the model is evaluated only at the training inputs.
    fitted = reg.predict(x)
    plt.scatter(x, y, color='red')       # observed values
    plt.plot(x, fitted, color='blue')    # model predictions
    plt.title('Position vs Salary (Low resolution)')
    plt.xlabel('Position')
    plt.ylabel('Salary')
    plt.show()

    # High resolution: evaluate the model on a fine grid of inputs so the
    # prediction curve is also drawn between the training points.
    # x is 2-D, so the built-in min()/max() would hand one-element arrays to
    # np.arange; the array's own reductions give plain scalar bounds.
    x_grid = np.arange(x.min(), x.max(), 0.1).reshape(-1, 1)
    plt.scatter(x, y, color='red')
    plt.plot(x_grid, reg.predict(x_grid), color='blue')
    plt.xlabel('Position')
    plt.ylabel('Salary')
    plt.title('Position vs Salary (High resolution)')
    plt.show()

    # Original vs predicted values, with the line y = x as the ideal reference.
    # A straight line only needs its two end points; using min(y) and max(y)
    # directly avoids generating one point per unit of the target range and
    # lets the line reach max(y) itself.
    y_ideal = np.array([min(y), max(y)])
    plt.scatter(y, reg.predict(x), color='red')
    plt.plot(y_ideal, y_ideal, color='blue')
    plt.xlabel('origanl value')
    plt.ylabel('predicted value')
    plt.title('orignal vs predicted')
    plt.show()


    # RMSE: square root of the mean of the squared prediction errors.
    from math import sqrt

    errors = y - reg.predict(x)
    # The errors are squared before averaging; averaging absolute errors and
    # taking the root would not give the RMSE.
    ss = (errors ** 2).sum() / len(y)
    # An RMSE of 0 means the predictions match the targets exactly.
    rmse = sqrt(ss)
    print('RMSE score is= ', rmse)

    # R2 score of the predictions on the training inputs; 1 is the best
    # possible value.
    from sklearn.metrics import r2_score
    predicted = reg.predict(x)
    R2 = r2_score(y_true=y, y_pred=predicted)
    print('r2_score= ', R2)

    # Durbin-Watson statistic of the prediction errors.
    from statsmodels.stats.stattools import durbin_watson
    # The statistic is built from differences between consecutive residuals,
    # so it is given the signed errors rather than their absolute values.
    db = durbin_watson(y - reg.predict(x))
    print('Durbin watson= ', db)
















    ReplyDelete

Post a Comment

Popular posts from this blog

PAKISTAN ANSWERS BACK TO INDIA

PRIME MINISTER OF PAKISTAN WINNING THE HEARTS OF THE POOR PAKISTANI PEOPLE.

LOCAL GOVERNMENT SYSTEM OF ISLAMABAD THE CAPITAL