In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
In [2]:
data = fetch_openml(name='adult', version=2, as_frame=True)
df = data.frame
In [3]:
df.head()
Out[3]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country class
0 25 Private 226802 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K
1 38 Private 89814 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K
2 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K
3 44 Private 160323 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K
4 18 NaN 103497 Some-college 10 Never-married NaN Own-child White Female 0 0 30 United-States <=50K
In [4]:
df['class'].value_counts()
Out[4]:
class
<=50K    37155
>50K     11687
Name: count, dtype: int64
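
The split is roughly 76% / 24%, so a majority-class baseline already scores about 76% accuracy — useful context for the random-forest numbers later. A quick check (a sketch, not part of the original run):

In [ ]:
# Class balance as fractions; always predicting <=50K would be ~76% accurate
print(df['class'].value_counts(normalize=True))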
In [5]:
df = df.dropna()
X = df.drop('class', axis=1)
y = df['class'].apply(lambda x: 1 if x == '>50K' else 0)  # binary target

# Separate categorical and continuous columns
categorical_cols = X.select_dtypes(include='category').columns
numerical_cols = X.select_dtypes(include='number').columns
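
One caveat: select_dtypes(include='category') only works here because fetch_openml(..., as_frame=True) returns the string columns as pandas Categoricals; if they arrived as plain object dtype, the list would come back empty. A cheap sanity check (sketch):

In [ ]:
# Every feature column should land in exactly one of the two groups
assert len(categorical_cols) + len(numerical_cols) == X.shape[1]
print(X.dtypes.value_counts())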
In [6]:
categorical_cols
Out[6]:
Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')
In [7]:
numerical_cols
Out[7]:
Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

linear regression

drop_first has no effect on accuracy/R², but the coefficients differ substantially

GPT recommends using drop_first=True with linear regression / logistic regression
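
The recommendation comes from the dummy-variable trap: with drop_first=False, the dummies of each categorical feature sum to 1 in every row, so together they reproduce the intercept column and the design matrix is rank-deficient. A minimal check (a sketch, not run as part of the original notebook):

In [ ]:
# For any single feature, the full dummy set sums to exactly 1 per row,
# i.e. it is perfectly collinear with the intercept
sex_dummies = pd.get_dummies(X['sex'])
print(sex_dummies.sum(axis=1).eq(1).all())  # expect True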

In [8]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 7. Fit a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# 8. Evaluate
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("(drop_first = False) R^2 Score on test set:", r2)
# 9. Print coefficients with feature names
coef_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Coefficient': model.coef_
})

# Sort by absolute coefficient size for better readability
coef_df['Abs(Coefficient)'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='Abs(Coefficient)', ascending=False).drop(columns='Abs(Coefficient)')

print(coef_df)
(drop_first = False) R^2 Score on test set: 0.36291365326169656
                              Feature   Coefficient
56                  relationship_Wife  1.966319e-01
73              native-country_France  1.790582e-01
78  native-country_Holand-Netherlands -1.354470e-01
88                native-country_Laos -1.300259e-01
40         occupation_Exec-managerial  1.259097e-01
..                                ...           ...
4                        capital-loss  9.223598e-05
45         occupation_Priv-house-serv  3.893058e-05
3                        capital-gain  7.686164e-06
1                              fnlwgt  8.849648e-08
8              workclass_Never-worked -5.449460e-14

[105 rows x 2 columns]
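
With that rank-deficient design, scikit-learn's LinearRegression returns a minimum-norm least-squares solution, so the dummy coefficients above are only identified up to a constant shift per feature (absorbed by the intercept) and shouldn't be read individually. A sketch of that invariance, assuming the cell above has run (the +0.5 shift is arbitrary):

In [ ]:
import numpy as np
# Shift every sex_* coefficient by +0.5 and compensate in the intercept:
# since the sex dummies sum to 1 per row, predictions are unchanged
shift = 0.5
mask = X_encoded.columns.str.startswith('sex_')
coef_shifted = model.coef_ + shift * mask
pred_shifted = X_test.to_numpy(dtype=float) @ coef_shifted + (model.intercept_ - shift)
print(np.allclose(model.predict(X_test), pred_shifted))  # expect True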
In [9]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 7. Fit a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# 8. Evaluate
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("(drop_first = True) R^2 Score on test set:", r2)
# 9. Print coefficients with feature names
coef_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Coefficient': model.coef_
})

# Sort by absolute coefficient size for better readability
coef_df['Abs(Coefficient)'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='Abs(Coefficient)', ascending=False).drop(columns='Abs(Coefficient)')

print(coef_df)
(drop_first = True) R^2 Score on test set: 0.3629136532620578
                              Feature   Coefficient
12              workclass_Without-pay -2.200538e-01
70  native-country_Holand-Netherlands -2.064746e-01
80                native-country_Laos -2.010535e-01
89            native-country_Scotland -1.927850e-01
47         relationship_Not-in-family -1.799546e-01
..                                ...           ...
21                education_Bachelors -4.688160e-04
4                        capital-loss  9.223598e-05
3                        capital-gain  7.686164e-06
1                              fnlwgt  8.849648e-08
7              workclass_Never-worked -1.129652e-13

[97 rows x 2 columns]
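
The matching R² (down to floating-point noise) is expected: both encodings span the same column space once the intercept is included, so the fitted predictions coincide and only the parameterization of the coefficients changes. A direct check (sketch, fitting on the full data for brevity):

In [ ]:
import numpy as np
X_full = pd.get_dummies(X, columns=categorical_cols, drop_first=False)
X_drop = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
m_full = LinearRegression().fit(X_full, y)
m_drop = LinearRegression().fit(X_drop, y)
# Same predictions from different coefficient vectors
print(np.allclose(m_full.predict(X_full), m_drop.predict(X_drop)))  # expect True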

random forest

Accuracy differs by only a tiny amount, most likely due to incidental randomness; and since drop_first=True shouldn't be used with random forests in the first place (a tree can test one dummy per split, so the dropped level becomes visible only through every other dummy of that feature being 0), there is no need to dig further into the code below.
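
One way to confirm the gap is just seed noise would be to average test accuracy over several random seeds for both encodings (a sketch, not run here since it trains ten forests):

In [ ]:
import numpy as np
for drop in (False, True):
    X_enc = pd.get_dummies(X, columns=categorical_cols, drop_first=drop)
    accs = []
    for seed in range(5):
        X_tr, X_te, y_tr, y_te = train_test_split(X_enc, y, test_size=0.2, random_state=seed)
        rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
        accs.append(rf.fit(X_tr, y_tr).score(X_te, y_te))
    print(f"drop_first={drop}: mean={np.mean(accs):.4f}, std={np.std(accs):.4f}")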

In [10]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=False)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.852072968490879

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      6842
           1       0.73      0.62      0.67      2203

    accuracy                           0.85      9045
   macro avg       0.81      0.77      0.79      9045
weighted avg       0.85      0.85      0.85      9045

In [11]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.852294085129906

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      6842
           1       0.73      0.62      0.67      2203

    accuracy                           0.85      9045
   macro avg       0.81      0.77      0.79      9045
weighted avg       0.85      0.85      0.85      9045
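
confusion_matrix is imported at the top but never used; for completeness, the raw counts behind the report above (a sketch, assuming the previous cell has run):

In [ ]:
# Rows = true class, columns = predicted class (0 = <=50K, 1 = >50K)
print(confusion_matrix(y_test, y_pred))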
