import pandas as pd
df = pd.read_csv('income.csv')
for col in df.columns:
    counts = df[col].value_counts()
    print(f'dataframe[{col}]')
    print(counts)
    print('\n')
dataframe[age]
36 1348
35 1337
33 1335
23 1329
31 1325
...
88 6
85 5
87 3
89 2
86 1
Name: age, Length: 74, dtype: int64
dataframe[workclass]
Private 33906
Self-emp-not-inc 3862
Local-gov 3136
? 2799
State-gov 1981
Self-emp-inc 1695
Federal-gov 1432
Without-pay 21
Never-worked 10
Name: workclass, dtype: int64
dataframe[fnlwgt]
203488 21
190290 19
120277 19
125892 18
126569 18
..
188488 1
285290 1
293579 1
114874 1
257302 1
Name: fnlwgt, Length: 28523, dtype: int64
dataframe[education]
HS-grad 15784
Some-college 10878
Bachelors 8025
Masters 2657
Assoc-voc 2061
11th 1812
Assoc-acdm 1601
10th 1389
7th-8th 955
Prof-school 834
9th 756
12th 657
Doctorate 594
5th-6th 509
1st-4th 247
Preschool 83
Name: education, dtype: int64
dataframe[educational-num]
9 15784
10 10878
13 8025
14 2657
11 2061
7 1812
12 1601
6 1389
4 955
15 834
5 756
8 657
16 594
3 509
2 247
1 83
Name: educational-num, dtype: int64
dataframe[marital-status]
Married-civ-spouse 22379
Never-married 16117
Divorced 6633
Separated 1530
Widowed 1518
Married-spouse-absent 628
Married-AF-spouse 37
Name: marital-status, dtype: int64
dataframe[occupation]
Prof-specialty 6172
Craft-repair 6112
Exec-managerial 6086
Adm-clerical 5611
Sales 5504
Other-service 4923
Machine-op-inspct 3022
? 2809
Transport-moving 2355
Handlers-cleaners 2072
Farming-fishing 1490
Tech-support 1446
Protective-serv 983
Priv-house-serv 242
Armed-Forces 15
Name: occupation, dtype: int64
dataframe[relationship]
Husband 19716
Not-in-family 12583
Own-child 7581
Unmarried 5125
Wife 2331
Other-relative 1506
Name: relationship, dtype: int64
dataframe[race]
White 41762
Black 4685
Asian-Pac-Islander 1519
Amer-Indian-Eskimo 470
Other 406
Name: race, dtype: int64
dataframe[gender]
Male 32650
Female 16192
Name: gender, dtype: int64
dataframe[capital-gain]
0 44807
15024 513
7688 410
7298 364
99999 244
...
1111 1
7262 1
22040 1
1639 1
2387 1
Name: capital-gain, Length: 123, dtype: int64
dataframe[capital-loss]
0 46560
1902 304
1977 253
1887 233
2415 72
...
2465 1
2080 1
155 1
1911 1
2201 1
Name: capital-loss, Length: 99, dtype: int64
dataframe[hours-per-week]
40 22803
50 4246
45 2717
60 2177
35 1937
...
69 1
87 1
94 1
82 1
79 1
Name: hours-per-week, Length: 96, dtype: int64
dataframe[native-country]
United-States 43832
Mexico 951
? 857
Philippines 295
Germany 206
Puerto-Rico 184
Canada 182
El-Salvador 155
India 151
Cuba 138
England 127
China 122
South 115
Jamaica 106
Italy 105
Dominican-Republic 103
Japan 92
Guatemala 88
Poland 87
Vietnam 86
Columbia 85
Haiti 75
Portugal 67
Taiwan 65
Iran 59
Greece 49
Nicaragua 49
Peru 46
Ecuador 45
France 38
Ireland 37
Hong 30
Thailand 30
Cambodia 28
Trinadad&Tobago 27
Laos 23
Yugoslavia 23
Outlying-US(Guam-USVI-etc) 23
Scotland 21
Honduras 20
Hungary 19
Holand-Netherlands 1
Name: native-country, dtype: int64
dataframe[income]
<=50K 37155
>50K 11687
Name: income, dtype: int64
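The value_counts listings above show `?` placeholders standing in for missing values: 2799 in workclass, 2809 in occupation, and 857 in native-country. A quick way to tally these placeholders across columns, sketched here on a tiny made-up frame since the real CSV is not reproduced inline:

```python
import pandas as pd

# Synthetic stand-in rows; '?' marks missing values as in the real dataset.
demo = pd.DataFrame({
    'workclass': ['Private', '?', 'Local-gov', '?'],
    'occupation': ['Sales', 'Sales', '?', 'Craft-repair'],
})

# Count '?' placeholders per column.
question_marks = (demo == '?').sum()
print(question_marks.to_dict())  # {'workclass': 2, 'occupation': 1}
```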
From the results above, we can summarize the dataset.
The dataset contains the following features:
age: continuous.
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
fnlwgt: continuous.
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
education-num: continuous.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
sex: Female, Male.
capital-gain: continuous.
capital-loss: continuous.
hours-per-week: continuous.
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
class: >50K, <=50K.
The class feature is the target variable, with two possible values: >50K and <=50K.
fnlwgt Column
The fnlwgt column ("final weight") is a sampling weight: it estimates how many people in the population each row represents. It is not directly related to the target variable income, so it adds no predictive value and can be dropped from the dataset.
df = df.drop('fnlwgt', axis=1)
From the dataset description, the categorical columns and their possible values are:
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
sex: Female, Male.
class: >50K, <=50K.
Encoding Multi-Class Columns
df = pd.concat([df.drop('workclass', axis=1), pd.get_dummies(df['workclass']).add_prefix('workclass_')], axis=1)
df = pd.concat([df.drop('occupation', axis=1), pd.get_dummies(df['occupation']).add_prefix('occupation_')], axis=1)
df = df.drop('education', axis=1)
df = pd.concat([df.drop('marital-status', axis=1), pd.get_dummies(df['marital-status']).add_prefix('marital-status_')], axis=1)
df = pd.concat([df.drop('relationship', axis=1), pd.get_dummies(df['relationship']).add_prefix('relationship_')], axis=1)
df = pd.concat([df.drop('race', axis=1), pd.get_dummies(df['race']).add_prefix('race_')], axis=1)
df = pd.concat([df.drop('native-country', axis=1), pd.get_dummies(df['native-country']).add_prefix('native-country_')], axis=1)
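The drop/concat/get_dummies pattern above can also be written in a single call: `pd.get_dummies` accepts a `columns=` list, drops the originals, and prefixes each dummy with the source column name, giving the same `workclass_Private`-style names. A minimal sketch on made-up rows:

```python
import pandas as pd

# Two synthetic rows standing in for the real data.
demo = pd.DataFrame({
    'workclass': ['Private', 'State-gov'],
    'race': ['White', 'Black'],
    'age': [25, 40],
})

# get_dummies with columns= drops the originals and prefixes automatically.
encoded = pd.get_dummies(demo, columns=['workclass', 'race'])
print(sorted(encoded.columns))
```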
Encoding Binary-Class Columns
df['gender'] = df['gender'].apply(lambda x: 1 if x == 'Male' else 0)
df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)
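An equivalent way to express the two binary encodings is `map` with an explicit dictionary, shown here on a toy series. Unlike the lambda, `map` yields NaN for any unexpected value instead of silently encoding it as 0, which can be a useful sanity check:

```python
import pandas as pd

s = pd.Series(['Male', 'Female', 'Male'])
# An explicit dict mirrors the lambda above for the two known values.
codes = s.map({'Male': 1, 'Female': 0}).tolist()
print(codes)  # [1, 0, 1]
```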
df.shape
(48842, 91)
Income Column
Since the dataset now has 91 columns, we want to identify the top 20% of columns most correlated with the income column. To do this, we compute the absolute correlation between each column and income, then keep the columns with the highest coefficients.
income_corr = df.corr()['income'].abs()
sorted_income_corr = income_corr.sort_values()
num_cols_to_drop = int(0.8 * len(df.columns))
cols_to_keep = sorted_income_corr.iloc[num_cols_to_drop : ].index
df_most_corr = df[cols_to_keep]
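The slicing logic above (sort ascending, skip the lowest 80%) can be checked on a toy correlation series with hypothetical values:

```python
import pandas as pd

# Toy stand-in for income_corr: five absolute correlations.
corr = pd.Series({'a': 0.05, 'b': 0.40, 'c': 0.10, 'd': 0.70, 'e': 0.22})
sorted_corr = corr.sort_values()
n_drop = int(0.8 * len(corr))               # drop the lowest 80% -> 4 of 5
keep = list(sorted_corr.iloc[n_drop:].index)
print(keep)  # ['d'] -- only the most correlated column survives
```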
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize = (15, 10))
sns.heatmap(df_most_corr.corr(), annot=True, cmap='coolwarm');
Based on the heatmap, we can see which features are most strongly correlated with the income column. This information is useful for feature selection and for building a predictive model for income.
Since the multi-class columns are already one-hot encoded and the binary columns binary-encoded, a decision tree is a natural model for predicting income. However, single decision trees tend to overfit the training data. Random Forest, an ensemble of decision trees, mitigates this overfitting and yields more robust predictions, so it is likely to give the best results here.
from sklearn.model_selection import train_test_split
X = df.drop('income', axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)
0.8496263691268298
Based on this score, the model predicts income with an accuracy of about 84.96%. (Since train_test_split was called without a fixed random_state, the exact score will vary between runs.)
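One caveat: the income classes are imbalanced (about 76% of rows are <=50K), so a model that always predicted <=50K would already score around 0.76 accuracy. Precision and recall on the >50K class are therefore worth checking alongside accuracy. A pure-Python sketch on hypothetical labels (not the real test-set predictions):

```python
# Hypothetical labels for illustration; 1 stands for '>50K'.
y_true = [0, 0, 0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 1, 0, 0, 1, 0]

tp = sum(t == 1 and p == 1 for t, p in zip(y_true, y_pred))  # true positives
fp = sum(t == 0 and p == 1 for t, p in zip(y_true, y_pred))  # false positives
fn = sum(t == 1 and p == 0 for t, p in zip(y_true, y_pred))  # false negatives

precision = tp / (tp + fp)  # of predicted >50K, how many truly are
recall = tp / (tp + fn)     # of true >50K, how many were found
print(round(precision, 3), round(recall, 3))
```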
feature_imp = dict(zip(clf.feature_names_in_, clf.feature_importances_))
feature_imp = {k: v for k,v in sorted(feature_imp.items(), key = lambda x: x[1], reverse=True)}
feature_imp
{'age': 0.22322233494447943,
'educational-num': 0.1311826117279202,
'hours-per-week': 0.11350407412031566,
'capital-gain': 0.10854257519814174,
'marital-status_Married-civ-spouse': 0.07285802131856205,
'relationship_Husband': 0.054071865666143,
'capital-loss': 0.03623408938158132,
'marital-status_Never-married': 0.02570174793814733,
'occupation_Exec-managerial': 0.021618843393805664,
'occupation_Prof-specialty': 0.017875105317731954,
'gender': 0.013889273602117912,
'relationship_Not-in-family': 0.010563681520294483,
'workclass_Private': 0.009653923427601437,
'relationship_Own-child': 0.008426776216139093,
'workclass_Self-emp-not-inc': 0.008300231271496155,
'relationship_Wife': 0.00795649380744919,
'occupation_Other-service': 0.007734290828360036,
'native-country_United-States': 0.0064937340627310445,
'marital-status_Divorced': 0.0063637295755381166,
'race_White': 0.006223073287283066,
'occupation_Sales': 0.006051784217098677,
'workclass_Self-emp-inc': 0.00601461315741399,
'occupation_Craft-repair': 0.005939820440652321,
'relationship_Unmarried': 0.005749606215869169,
'workclass_Local-gov': 0.005398525105848354,
'occupation_Adm-clerical': 0.005145963881619278,
'workclass_Federal-gov': 0.005086183145137154,
'race_Black': 0.004796416481660459,
'occupation_Farming-fishing': 0.0047604968012297825,
'workclass_State-gov': 0.004412370339855017,
'occupation_Tech-support': 0.0040771021673870105,
'occupation_Machine-op-inspct': 0.004061759162365423,
'occupation_Transport-moving': 0.003942784452512743,
'occupation_Handlers-cleaners': 0.0033955311356353925,
'race_Asian-Pac-Islander': 0.00293520185458387,
'native-country_?': 0.0028921094649814356,
'occupation_Protective-serv': 0.0026519234782636236,
'native-country_Mexico': 0.0026285800906803384,
'marital-status_Separated': 0.0020013038539824536,
'relationship_Other-relative': 0.0019994140634655012,
'occupation_?': 0.0019458118810433975,
'workclass_?': 0.0016426835806499277,
'marital-status_Widowed': 0.00157219247753646,
'native-country_Canada': 0.0013575423796474877,
'race_Amer-Indian-Eskimo': 0.0013434018787387844,
'native-country_Philippines': 0.0011953237624546384,
'race_Other': 0.0010785055357013873,
'native-country_Germany': 0.0010094294037617407,
'native-country_England': 0.0009540105100897136,
'marital-status_Married-spouse-absent': 0.0009186510261147557,
'native-country_India': 0.0008537887570261623,
'native-country_Italy': 0.0007804564594581538,
'native-country_Cuba': 0.0006789765423287363,
'native-country_Japan': 0.0006178291065667383,
'native-country_China': 0.0006116247447849556,
'native-country_Poland': 0.0006085195896047573,
'native-country_South': 0.000571696217536573,
'native-country_Puerto-Rico': 0.0005669439950076394,
'native-country_Jamaica': 0.0005244447392335239,
'native-country_Ireland': 0.0004891875712529776,
'native-country_Iran': 0.00047676226896973395,
'native-country_Portugal': 0.00043557398118890494,
'native-country_Greece': 0.00042426214741192475,
'native-country_France': 0.0003981770948688261,
'native-country_Cambodia': 0.000367642110478976,
'marital-status_Married-AF-spouse': 0.000341186603122011,
'native-country_Taiwan': 0.0003137402012422082,
'native-country_Columbia': 0.00031055692301654435,
'native-country_Yugoslavia': 0.0002969385079017758,
'native-country_El-Salvador': 0.00028787357778905197,
'native-country_Dominican-Republic': 0.00027879151115686035,
'native-country_Vietnam': 0.0002737884854932835,
'native-country_Peru': 0.00021806625005251905,
'native-country_Ecuador': 0.0002097423646966491,
'native-country_Hungary': 0.00019569670018537144,
'native-country_Haiti': 0.00019330228615587253,
'occupation_Priv-house-serv': 0.00018751479833655982,
'native-country_Hong': 0.00013972805660000248,
'native-country_Nicaragua': 0.000136058734038263,
'native-country_Guatemala': 0.00013347680168018442,
'native-country_Scotland': 0.00012227616818582268,
'native-country_Trinadad&Tobago': 0.00011930826974066256,
'workclass_Without-pay': 0.00011617143550319518,
'native-country_Laos': 9.009014864171938e-05,
'native-country_Thailand': 8.255386512834968e-05,
'occupation_Armed-Forces': 7.978436171451954e-05,
'native-country_Outlying-US(Guam-USVI-etc)': 5.077876426342873e-05,
'native-country_Honduras': 3.551547200466243e-05,
'workclass_Never-worked': 5.655835812791772e-06,
'native-country_Holand-Netherlands': 0.0}
The code above calculates and displays the importance of each feature in the model: in simpler terms, which characteristics of the data matter most for its predictions.
Age (importance: 22.32%): a person's age is the most important factor, meaning it has the largest impact on the predicted income level.
Educational number (importance: 13.12%): the numeric encoding of education level is the second most important factor; more education tends to shift the model's prediction.
Hours per week (importance: 11.35%): the number of hours a person works per week is the third most important factor, suggesting working hours play a significant role in predicting income.
Capital gain (importance: 10.85%): profit from selling investments is the fourth most important factor, indicating that financial gains contribute to income predictions.
Marital status (Married-civ-spouse) (importance: 7.29%): being in a married-civilian-spouse relationship is the fifth most important factor, so this marital-status category has a notable impact on income predictions.
The importance percentages next to each feature indicate how much each feature contributes to the model's ability to predict income. Features with higher importance percentages are more influential in making accurate predictions.
In summary, this information helps us understand which aspects of the data are crucial for the model's predictions. It can be valuable for decision-makers and data analysts to focus on these top features when trying to explain or improve the model's performance.
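As a quick quantitative check on the printed importances (which sum to 1 in a random forest), the top four features alone account for well over half of the total:

```python
# Rounded importances copied from the output above.
top_four = {
    'age': 0.2232,
    'educational-num': 0.1312,
    'hours-per-week': 0.1135,
    'capital-gain': 0.1085,
}
print(round(sum(top_four.values()), 4))  # 0.5764
```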
Utilizing GridSearchCV from sklearn.model_selection, we aim to determine the optimal parameter values for the Random Forest Classification model. Specifically, we are seeking the best values for the following parameters within the RandomForestClassifier: n_estimators, max_depth, min_samples_split, and max_features.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [200, 300, 400],          # number of trees in the forest
    'max_depth': [None, 20, 30, 40],          # maximum depth of each tree
    'min_samples_split': [5, 8, 11],          # minimum samples required to split a node
    'max_features': ['auto', 'sqrt', 'log2']  # note: 'auto' is deprecated (removed in scikit-learn 1.3); use 'sqrt' on newer versions
}
# Create the RandomForestClassifier
rf_classifier = RandomForestClassifier()
# Create the GridSearchCV object
grid_search = GridSearchCV(rf_classifier, param_grid=param_grid, verbose=10, n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
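The "540 fits" reported above follows directly from the grid sizes: 3 × 4 × 3 × 3 = 108 parameter combinations, each scored with GridSearchCV's default 5-fold cross-validation:

```python
# Grid sizes from param_grid above.
n_candidates = 3 * 4 * 3 * 3  # n_estimators x max_depth x min_samples_split x max_features
n_fits = n_candidates * 5     # default cv=5 folds
print(n_candidates, n_fits)   # 108 540
```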
grid_search.best_params_
best_estimator = grid_search.best_estimator_
best_estimator.score(X_test, y_test)