In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
In [2]:
# Fetch version 2 of the 'adult' dataset from OpenML; as_frame=True asks for
# pandas objects instead of NumPy arrays (requires network access or a local cache).
data = fetch_openml(name='adult', version=2, as_frame=True)
# data.frame is the single DataFrame holding the feature columns together with
# the 'class' label column.
df = data.frame
In [3]:
# Preview the first five rows to check the column names and spot missing values (NaN).
df.head()
Out[3]:
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 25 | Private | 226802 | 11th | 7 | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0 | 40 | United-States | <=50K |
1 | 38 | Private | 89814 | HS-grad | 9 | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
2 | 28 | Local-gov | 336951 | Assoc-acdm | 12 | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
3 | 44 | Private | 160323 | Some-college | 10 | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 7688 | 0 | 40 | United-States | >50K |
4 | 18 | NaN | 103497 | Some-college | 10 | Never-married | NaN | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K |
In [4]:
# Count the rows per label to see how imbalanced the two income classes are.
df['class'].value_counts()
Out[4]:
class <=50K 37155 >50K 11687 Name: count, dtype: int64
In [5]:
# Discard every row that has at least one missing value.
df = df.dropna()
# Features: every column except the label.
X = df.drop(columns='class')
# Binary target: 1 for '>50K', 0 otherwise.
# A vectorized comparison is used instead of .apply(lambda ...): 'class' is
# presumably a categorical column (like the other nominal fields), and
# element-wise apply on a categorical can return a category-dtype Series
# rather than plain integers. The comparison always yields an integer Series.
y = (df['class'] == '>50K').astype(int)
# Separate categorical and continuous columns
categorical_cols = X.select_dtypes(include='category').columns
numerical_cols = X.select_dtypes(include='number').columns
In [6]:
# Show which feature columns were detected as categorical (these get one-hot encoded).
categorical_cols
Out[6]:
Index(['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'], dtype='object')
In [7]:
# Show which feature columns were detected as numeric (these are passed through unchanged).
numerical_cols
Out[7]:
Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')
linear regression¶
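是否设置 drop_first 对预测效果（accuracy/R^2）没有影响（两次测试集 R^2 均约为 0.3629），但各特征的系数（coef）差别很大：不丢弃首个类别时，每组 one-hot 列之和恒为 1，与截距项完全共线（虚拟变量陷阱），系数不唯一，而预测值不变。
GPT 建议在 linear regression / logistic regression 中使用 drop_first=True。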
In [8]:
# Expand each categorical column into indicator columns, keeping every level
# (get_dummies does not drop a reference level by default).
X_encoded = pd.get_dummies(X, columns=categorical_cols)
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)
# Fit a linear regression directly on the 0/1 target (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)
# Score the held-out predictions with R^2.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("(drop_first = False) R^2 Score on test set:", r2)
# Pair each fitted coefficient with its feature name and list the largest
# magnitudes first; the sort key avoids a temporary absolute-value column.
coef_df = pd.DataFrame({'Feature': X_encoded.columns, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values(by='Coefficient', key=lambda col: col.abs(), ascending=False)
print(coef_df)
(drop_first = False) R^2 Score on test set: 0.36291365326169656 Feature Coefficient 56 relationship_Wife 1.966319e-01 73 native-country_France 1.790582e-01 78 native-country_Holand-Netherlands -1.354470e-01 88 native-country_Laos -1.300259e-01 40 occupation_Exec-managerial 1.259097e-01 .. ... ... 4 capital-loss 9.223598e-05 45 occupation_Priv-house-serv 3.893058e-05 3 capital-gain 7.686164e-06 1 fnlwgt 8.849648e-08 8 workclass_Never-worked -5.449460e-14 [105 rows x 2 columns]
In [9]:
# Expand each categorical column into indicator columns, dropping the first
# level of every variable so it serves as the reference category.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)
# Fit a linear regression directly on the 0/1 target (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)
# Score the held-out predictions with R^2.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("(drop_first = True) R^2 Score on test set:", r2)
# Pair each fitted coefficient with its feature name and list the largest
# magnitudes first; the sort key avoids a temporary absolute-value column.
coef_df = pd.DataFrame({'Feature': X_encoded.columns, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values(by='Coefficient', key=lambda col: col.abs(), ascending=False)
print(coef_df)
(drop_first = True) R^2 Score on test set: 0.3629136532620578 Feature Coefficient 12 workclass_Without-pay -2.200538e-01 70 native-country_Holand-Netherlands -2.064746e-01 80 native-country_Laos -2.010535e-01 89 native-country_Scotland -1.927850e-01 47 relationship_Not-in-family -1.799546e-01 .. ... ... 21 education_Bachelors -4.688160e-04 4 capital-loss 9.223598e-05 3 capital-gain 7.686164e-06 1 fnlwgt 8.849648e-08 7 workclass_Never-worked -1.129652e-13 [97 rows x 2 columns]
random forest¶
两种编码下 accuracy 只有极小的差别（约 0.8521 对 0.8523），可能来自随机性的影响（例如特征列不同会改变随机森林的特征抽样）；但鉴于 drop_first=True
本身不应该用于 random forest，所以下面的代码也不需要继续深究。
In [10]:
# Expand each categorical column into indicator columns; every level keeps its own column.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=False)
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)
# Seeded random forest with otherwise default settings (fit() returns the estimator).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Predicted labels for the held-out rows.
y_pred = rf_model.predict(X_test)
# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.852072968490879 Classification Report: precision recall f1-score support 0 0.88 0.93 0.90 6842 1 0.73 0.62 0.67 2203 accuracy 0.85 9045 macro avg 0.81 0.77 0.79 9045 weighted avg 0.85 0.85 0.85 9045
In [11]:
# Expand each categorical column into indicator columns, dropping the first
# level of every variable.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)
# Seeded random forest with otherwise default settings (fit() returns the estimator).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# Predicted labels for the held-out rows.
y_pred = rf_model.predict(X_test)
# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.852294085129906 Classification Report: precision recall f1-score support 0 0.88 0.93 0.90 6842 1 0.73 0.62 0.67 2203 accuracy 0.85 9045 macro avg 0.81 0.77 0.79 9045 weighted avg 0.85 0.85 0.85 9045
In [ ]: