In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
In [2]:
data = fetch_openml(name='adult', version=2, as_frame=True)
df = data.frame
In [3]:
df.head()
Out[3]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country class
0 25 Private 226802 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K
1 38 Private 89814 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K
2 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K
3 44 Private 160323 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K
4 18 NaN 103497 Some-college 10 Never-married NaN Own-child White Female 0 0 30 United-States <=50K
In [4]:
df['class'].value_counts()
Out[4]:
class
<=50K    37155
>50K     11687
Name: count, dtype: int64
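
The split is roughly 76% / 24%, so a majority-class baseline already scores about 76% accuracy — useful context for the random-forest numbers later. A quick check (a sketch, not part of the original run):

In [ ]:
# Class balance as fractions; always predicting <=50K would be ~76% accurate
print(df['class'].value_counts(normalize=True))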
In [5]:
df = df.dropna()
X = df.drop('class', axis=1)
y = df['class'].apply(lambda x: 1 if x == '>50K' else 0)  # binary target

# Separate categorical and continuous columns
categorical_cols = X.select_dtypes(include='category').columns
numerical_cols = X.select_dtypes(include='number').columns
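
One caveat: select_dtypes(include='category') only works here because fetch_openml(..., as_frame=True) returns the string columns as pandas Categoricals; if they arrived as plain object dtype, the list would come back empty. A cheap sanity check (sketch):

In [ ]:
# Every feature column should land in exactly one of the two groups
assert len(categorical_cols) + len(numerical_cols) == X.shape[1]
print(X.dtypes.value_counts())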
In [6]:
categorical_cols
Out[6]:
Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')
In [7]:
numerical_cols
Out[7]:
Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

linear regression

drop_first has no effect on accuracy/R², but the coefficients differ substantially

GPT recommends using drop_first=True with linear regression / logistic regression
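
The recommendation comes from the dummy-variable trap: with drop_first=False, the dummies of each categorical feature sum to 1 in every row, so together they reproduce the intercept column and the design matrix is rank-deficient. A minimal check (a sketch, not run as part of the original notebook):

In [ ]:
# For any single feature, the full dummy set sums to exactly 1 per row,
# i.e. it is perfectly collinear with the intercept
sex_dummies = pd.get_dummies(X['sex'])
print(sex_dummies.sum(axis=1).eq(1).all())  # expect True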

In [8]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 7. Fit a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# 8. Evaluate
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("(drop_first = False) R^2 Score on test set:", r2)
# 9. Print coefficients with feature names
coef_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Coefficient': model.coef_
})

# Sort by absolute coefficient size for better readability
coef_df['Abs(Coefficient)'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='Abs(Coefficient)', ascending=False).drop(columns='Abs(Coefficient)')

print(coef_df)
(drop_first = False) R^2 Score on test set: 0.36291365326169656
                              Feature   Coefficient
56                  relationship_Wife  1.966319e-01
73              native-country_France  1.790582e-01
78  native-country_Holand-Netherlands -1.354470e-01
88                native-country_Laos -1.300259e-01
40         occupation_Exec-managerial  1.259097e-01
..                                ...           ...
4                        capital-loss  9.223598e-05
45         occupation_Priv-house-serv  3.893058e-05
3                        capital-gain  7.686164e-06
1                              fnlwgt  8.849648e-08
8              workclass_Never-worked -5.449460e-14

[105 rows x 2 columns]
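
With that rank-deficient design, scikit-learn's LinearRegression returns a minimum-norm least-squares solution, so the dummy coefficients above are only identified up to a constant shift per feature (absorbed by the intercept) and shouldn't be read individually. A sketch of that invariance, assuming the cell above has run (the +0.5 shift is arbitrary):

In [ ]:
import numpy as np
# Shift every sex_* coefficient by +0.5 and compensate in the intercept:
# since the sex dummies sum to 1 per row, predictions are unchanged
shift = 0.5
mask = X_encoded.columns.str.startswith('sex_')
coef_shifted = model.coef_ + shift * mask
pred_shifted = X_test.to_numpy(dtype=float) @ coef_shifted + (model.intercept_ - shift)
print(np.allclose(model.predict(X_test), pred_shifted))  # expect True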
In [9]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 7. Fit a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# 8. Evaluate
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("(drop_first = True) R^2 Score on test set:", r2)
# 9. Print coefficients with feature names
coef_df = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Coefficient': model.coef_
})

# Sort by absolute coefficient size for better readability
coef_df['Abs(Coefficient)'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='Abs(Coefficient)', ascending=False).drop(columns='Abs(Coefficient)')

print(coef_df)
(drop_first = True) R^2 Score on test set: 0.3629136532620578
                              Feature   Coefficient
12              workclass_Without-pay -2.200538e-01
70  native-country_Holand-Netherlands -2.064746e-01
80                native-country_Laos -2.010535e-01
89            native-country_Scotland -1.927850e-01
47         relationship_Not-in-family -1.799546e-01
..                                ...           ...
21                education_Bachelors -4.688160e-04
4                        capital-loss  9.223598e-05
3                        capital-gain  7.686164e-06
1                              fnlwgt  8.849648e-08
7              workclass_Never-worked -1.129652e-13

[97 rows x 2 columns]
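
The matching R² (down to floating-point noise) is expected: both encodings span the same column space once the intercept is included, so the fitted predictions coincide and only the parameterization of the coefficients changes. A direct check (sketch, fitting on the full data for brevity):

In [ ]:
import numpy as np
X_full = pd.get_dummies(X, columns=categorical_cols, drop_first=False)
X_drop = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
m_full = LinearRegression().fit(X_full, y)
m_drop = LinearRegression().fit(X_drop, y)
# Same predictions from different coefficient vectors
print(np.allclose(m_full.predict(X_full), m_drop.predict(X_drop)))  # expect True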

random forest

Accuracy differs by only a tiny amount, most likely due to incidental randomness; and since drop_first=True shouldn't be used with random forests in the first place (a tree can test one dummy per split, so the dropped level becomes visible only through every other dummy of that feature being 0), there is no need to dig further into the code below.
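
One way to confirm the gap is just seed noise would be to average test accuracy over several random seeds for both encodings (a sketch, not run here since it trains ten forests):

In [ ]:
import numpy as np
for drop in (False, True):
    X_enc = pd.get_dummies(X, columns=categorical_cols, drop_first=drop)
    accs = []
    for seed in range(5):
        X_tr, X_te, y_tr, y_te = train_test_split(X_enc, y, test_size=0.2, random_state=seed)
        rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
        accs.append(rf.fit(X_tr, y_tr).score(X_te, y_te))
    print(f"drop_first={drop}: mean={np.mean(accs):.4f}, std={np.std(accs):.4f}")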

In [10]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=False)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.852072968490879

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      6842
           1       0.73      0.62      0.67      2203

    accuracy                           0.85      9045
   macro avg       0.81      0.77      0.79      9045
weighted avg       0.85      0.85      0.85      9045

In [11]:
# 5. One-hot encode categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.852294085129906

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      6842
           1       0.73      0.62      0.67      2203

    accuracy                           0.85      9045
   macro avg       0.81      0.77      0.79      9045
weighted avg       0.85      0.85      0.85      9045
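
confusion_matrix is imported at the top but never used; for completeness, the raw counts behind the report above (a sketch, assuming the previous cell has run):

In [ ]:
# Rows = true class, columns = predicted class (0 = <=50K, 1 = >50K)
print(confusion_matrix(y_test, y_pred))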
