import pandas as pd
import numpy as np


df = pd.read_csv('loan_prediction.csv')


for col in df.columns:
    counts = df[col].value_counts()
    print(f'df[{col}]')
    print(counts)
    print('\n')

df[Loan_ID]
LP001002    1
LP002328    1
LP002305    1
LP002308    1
LP002314    1
           ..
LP001692    1
LP001693    1
LP001698    1
LP001699    1
LP002990    1
Name: Loan_ID, Length: 614, dtype: int64


df[Gender]
Male      489
Female    112
Name: Gender, dtype: int64


df[Married]
Yes    398
No     213
Name: Married, dtype: int64


df[Dependents]
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64


df[Education]
Graduate        480
Not Graduate    134
Name: Education, dtype: int64


df[Self_Employed]
No     500
Yes     82
Name: Self_Employed, dtype: int64


df[ApplicantIncome]
2500    9
4583    6
6000    6
2600    6
3333    5
       ..
3244    1
4408    1
3917    1
3992    1
7583    1
Name: ApplicantIncome, Length: 505, dtype: int64


df[CoapplicantIncome]
0.0       273
2500.0      5
2083.0      5
1666.0      5
2250.0      3
         ... 
2791.0      1
1010.0      1
1695.0      1
2598.0      1
240.0       1
Name: CoapplicantIncome, Length: 287, dtype: int64


df[LoanAmount]
120.0    20
110.0    17
100.0    15
160.0    12
187.0    12
         ..
240.0     1
214.0     1
59.0      1
166.0     1
253.0     1
Name: LoanAmount, Length: 203, dtype: int64


df[Loan_Amount_Term]
360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64


df[Credit_History]
1.0    475
0.0     89
Name: Credit_History, dtype: int64


df[Property_Area]
Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64


df[Loan_Status]
Y    422
N    192
Name: Loan_Status, dtype: int64


df.isnull().sum() * 100 / len(df)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64


df = df.drop('Loan_ID', axis=1)


df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)
df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace=True)


(df['LoanAmount']).hist(bins=20);


print(df['ApplicantIncome'].value_counts())
(df['ApplicantIncome']).hist(bins=20);

2500    9
4583    6
6000    6
2600    6
3333    5
       ..
3244    1
4408    1
3917    1
3992    1
7583    1
Name: ApplicantIncome, Length: 505, dtype: int64


print(df['CoapplicantIncome'].value_counts())
(df['CoapplicantIncome']).hist(bins=20);

0.0       273
2500.0      5
2083.0      5
1666.0      5
2250.0      3
         ... 
2791.0      1
1010.0      1
1695.0      1
2598.0      1
240.0       1
Name: CoapplicantIncome, Length: 287, dtype: int64


import seaborn as sns
print(df['Gender'].value_counts())
sns.countplot(data=df, x='Gender');

Male      502
Female    112
Name: Gender, dtype: int64


print(df['Married'].value_counts())
sns.countplot(x='Married', data=df);

Yes    401
No     213
Name: Married, dtype: int64


print(df['Dependents'].value_counts())
sns.countplot(x='Dependents', data=df);

0     360
1     102
2     101
3+     51
Name: Dependents, dtype: int64


print(df['Property_Area'].value_counts())
sns.countplot(x='Property_Area', data=df);

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64


df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0}).astype('int')
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0}).astype('int')
df['Education'] = df['Education'].map({'Graduate':1,'Not Graduate':0}).astype('int')
df['Self_Employed'] = df['Self_Employed'].map({'Yes':1,'No':0}).astype('int')
df['Property_Area'] = df['Property_Area'].map({'Rural':0,'Semiurban':1,'Urban':2}).astype('int')
df['Loan_Status'] = df['Loan_Status'].map({'Y':1,'N':0}).astype('int')


df['Dependents'].value_counts()

0     360
1     102
2     101
3+     51
Name: Dependents, dtype: int64


df['Dependents'] = df['Dependents'].replace(to_replace='3+', value='4')


X, y = df.drop('Loan_Status', axis=1), df['Loan_Status']


X.sample(5)


from sklearn.preprocessing import StandardScaler

cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
X[cols] = StandardScaler().fit_transform(X[cols])


from sklearn.model_selection import train_test_split, cross_val_score

model_df = {}
def model_evaluation(model, train_data = X, test_data = y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    model.fit(X_train, y_train)
    print(f"The accuracy of the {model} model is {model.score(X_test, y_test)}")
    print(f"The average cross-validation score for the {model} model is {np.mean(cross_val_score(model, X, y))}")
    model_df[model] = round(np.mean(cross_val_score(model, X, y)) * 100, 2)


from sklearn.linear_model import LogisticRegression
model_evaluation(LogisticRegression(), X, y)

The accuracy of the LogisticRegression() model is 0.8536585365853658
The average cross-validation score for the LogisticRegression() model is 0.8045715047314408


from sklearn import svm
model_evaluation(svm.SVC(kernel='rbf'), X, y)

The accuracy of the SVC() model is 0.7804878048780488
The average cross-validation score for the SVC() model is 0.7964147674263627


from sklearn.tree import DecisionTreeClassifier
model_evaluation(DecisionTreeClassifier(), X, y)

The accuracy of the DecisionTreeClassifier() model is 0.6829268292682927
The average cross-validation score for the DecisionTreeClassifier() model is 0.7133813141410104


from sklearn.ensemble import GradientBoostingClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
clf = GradientBoostingClassifier().fit(X_train, y_train)
model_evaluation(GradientBoostingClassifier(), X, y)

The accuracy of the GradientBoostingClassifier() model is 0.7723577235772358
The average cross-validation score for the GradientBoostingClassifier() model is 0.7784886045581768


feature_imp = dict(zip(clf.feature_names_in_, clf.feature_importances_))
feature_imp = {k: v for k,v in sorted(feature_imp.items(), key = lambda x:x[1], reverse=True)}
feature_imp

{'Credit_History': 0.38792505774796615,
 'ApplicantIncome': 0.23985943994908085,
 'LoanAmount': 0.1733795397901463,
 'CoapplicantIncome': 0.1130300041415762,
 'Loan_Amount_Term': 0.025064253010718747,
 'Property_Area': 0.02052473749821235,
 'Education': 0.01358987565877404,
 'Married': 0.012135131235704743,
 'Dependents': 0.01155673288696885,
 'Self_Employed': 0.002902933066350774,
 'Gender': 3.229501450091258e-05}


from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = RandomForestClassifier().fit(X_train, y_train)
model_evaluation(RandomForestClassifier(), X, y)

The accuracy of the RandomForestClassifier() model is 0.7967479674796748
The average cross-validation score for the RandomForestClassifier() model is 0.7834066373450619


feature_imp = dict(zip(clf.feature_names_in_, clf.feature_importances_))
feature_imp = {k: v for k,v in sorted(feature_imp.items(), key = lambda x: x[1], reverse=True)}
feature_imp

{'Credit_History': 0.2656112007962421,
 'ApplicantIncome': 0.21205330923882804,
 'LoanAmount': 0.1788106892969859,
 'CoapplicantIncome': 0.11051792898545215,
 'Loan_Amount_Term': 0.05071574201710987,
 'Property_Area': 0.04779461976279416,
 'Dependents': 0.047117472920864524,
 'Married': 0.028515989927750004,
 'Education': 0.020860133123977196,
 'Self_Employed': 0.0199838135254959,
 'Gender': 0.018019100404500027}


model_df

{LogisticRegression(): 80.46,
 SVC(): 79.64,
 DecisionTreeClassifier(): 72.32,
 GradientBoostingClassifier(): 77.85,
 RandomForestClassifier(): 78.66}


from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define the parameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Type of regularization
    'solver': ['liblinear', 'saga'],  # Solver algorithm
}

# Create the Logistic Regression model
logistic_regression = LogisticRegression(max_iter=1000)

# Create GridSearchCV with cross-validation
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')

# Fit the model to the data and perform hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best accuracy score
best_accuracy = grid_search.best_score_
print("Best Accuracy Score:", best_accuracy)

Best Hyperparameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Best Accuracy Score: 0.824860853432282


import joblib

logistic_regression = LogisticRegression(C = 0.01,
                                        penalty = 'l1',
                                        solver = 'liblinear',
                                        max_iter = 1000)
logistic_regression.fit(X_train, y_train)
joblib.dump(logistic_regression,'loan_status_predict')
best_model = joblib.load('loan_status_predict')


new_df = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':2889,
    'CoapplicantIncome':0.0,
    'LoanAmount':45,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':1
},index=[0])
result = best_model.predict(new_df)
if result == 0:
    print("Loan is not approved")
else:
    print("Loan is approved")

Loan is not approved


from tkinter import *
import joblib
import pandas as pd
def show_entry():
    
    p1 = float(e1.get())
    p2 = float(e2.get())
    p3 = float(e3.get())
    p4 = float(e4.get())
    p5 = float(e5.get())
    p6 = float(e6.get())
    p7 = float(e7.get())
    p8 = float(e8.get())
    p9 = float(e9.get())
    p10 = float(e10.get())
    p11 = float(e11.get())
    
    model = joblib.load('loan_status_predict')
    df = pd.DataFrame({
    'Gender':p1,
    'Married':p2,
    'Dependents':p3,
    'Education':p4,
    'Self_Employed':p5,
    'ApplicantIncome':p6,
    'CoapplicantIncome':p7,
    'LoanAmount':p8,
    'Loan_Amount_Term':p9,
    'Credit_History':p10,
    'Property_Area':p11
},index=[0])
    result = model.predict(df)
    
    if result == 1:
        Label(master, text="Loan approved").grid(row=31)
    else:
        Label(master, text="Loan Not Approved").grid(row=31)
        
    
master =Tk()
master.title("Loan Status Prediction Using Machine Learning")
label = Label(master,text = "Loan Status Prediction",bg = "black",
               fg = "white").grid(row=0,columnspan=2)

Label(master,text = "Gender [1:Male ,0:Female]").grid(row=1)
Label(master,text = "Married [1:Yes,0:No]").grid(row=2)
Label(master,text = "Dependents [1,2,3,4]").grid(row=3)
Label(master,text = "Education ['Graduate':1,'Not Graduate':0]").grid(row=4)
Label(master,text = "Self_Employed ['Yes':1,'No':0]").grid(row=5)
Label(master,text = "ApplicantIncome").grid(row=6)
Label(master,text = "CoapplicantIncome").grid(row=7)
Label(master,text = "LoanAmount").grid(row=8)
Label(master,text = "Loan_Amount_Term").grid(row=9)
Label(master,text = "Credit_History [Credit history meets guidelines ('Yes': 1, 'No': 0)]").grid(row=10)
Label(master,text = "Property_Area ['Rural':0,'Semiurban':1,'Urban':2]").grid(row=11)


e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)
e10 = Entry(master)
e11 = Entry(master)


e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)
e7.grid(row=7,column=1)
e8.grid(row=8,column=1)
e9.grid(row=9,column=1)
e10.grid(row=10,column=1)
e11.grid(row=11,column=1)

Button(master,text="Predict",command=show_entry).grid()

mainloop()

Loan Prediction Project¶

1. Dataset Information¶

2. Handling Missing Values¶

Percentage of Missing Values in each column¶

Filling the missing values in columns with the mode¶

3. Data Exploratory¶

Loan Amount Histogram¶

Applicant Income Histogram¶

Coapplicant Income¶

Countplot of Gender¶

Countplot of Married People¶

Countplot of Dependents¶

Countplot of Property Area¶

4. Label Encoding¶

Mapping to Numerical Values¶

Changing Categorical Entries to Numerical Entries¶

8. Save the feature columns as vector X and the target column as vector y.¶

9. Feature Scaling¶

Standardizing Entries that have big values¶

10. Dividing the dataset into a training set and a test set and then applying K-Fold Cross Validation.¶

11. Logistic Regression¶

12. Support Vector Classifier¶

13. Decision Tree Classifier¶

14. Gradient Boosting Classifier¶

Analyzing The Features Importance¶

15. Random Forest Classifier¶

Analyzing The Feature Importance¶

16. Hyperparameter Tuning¶

17. Predicting New Data Using The Best Model¶

Saving The Best Model¶

Predicting using The Best Model¶

18. Application-based Machine Learning Model¶

	Gender	Married	Education	Self_Employed	ApplicantIncome	CoapplicantIncome	LoanAmount	Loan_Amount_Term	Credit_History	Property_Area
15	1	0	1	0	4950	0.0	125.0	360.0	1.0	2
521	1	0	1	0	2500	0.0	55.0	360.0	1.0	1
25	1	1	1	1	9560	0.0	191.0	360.0	1.0	1
553	1	1	0	0	2454	2333.0	181.0	360.0	0.0	2
88	1	0	1	0	8566	0.0	210.0	360.0	1.0	2