如果您能告诉我如何正确使用 SMOTENC，我将不胜感激。我的代码如下：
# Data
XX = pd.read_csv('Financial Distress.csv')
# Binary target: 0 when the 'Financial Distress' value is above -0.50,
# otherwise 1 (NaN compares False and therefore also maps to 1).
y = np.where(XX['Financial Distress'].values > -0.50, 0, 1)
Na = np.array(pd.read_csv('Na.csv', header=None).values)
# Keep only columns 3..126 as candidate features.
XX = XX.iloc[:, 3:127]

# Use get_dummies to convert the categorical feature into dummy columns;
# the new columns are appended after the remaining original ones.
dis_features = ['x121']
X = pd.get_dummies(XX, columns=dis_features)

# Divide data into train and test, carrying the row positions along so the
# split can be mapped back to the original rows.
indices = np.arange(y.shape[0])
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, indices, stratify=y, test_size=0.3, random_state=42
)

# Column-name groups consumed by the feature-processing pipeline below.
# Assumes 'x121' expands to exactly 37 dummy columns - TODO confirm.
num_indices = list(X)[:X.shape[1] - 37]
cat_indices = list(X)[X.shape[1] - 37:]

# Integer positions of the numeric / categorical columns of X.
num_positions = np.r_[0:94, 95, 97, 100:123]
cat_positions = np.r_[94, 96, 98, 99, 123:160]
num_indices1 = list(X.columns[num_positions])
cat_indices1 = list(X.columns[cat_positions])
print(len(num_indices1))
print(len(cat_indices1))

# NOTE(review): num_indices still contains columns 94, 96, 98 and 99, which
# cat_positions treats as categorical, so they are standard-scaled below.
# Confirm whether num_indices1 / cat_indices1 were meant to be used instead.
pipeline = Pipeline(steps=[
    ('feature_processing', FeatureUnion(transformer_list=[
        # Categorical features
        ('categorical', MultiColumn(cat_indices)),
        # Numeric features
        ('numeric', Pipeline(steps=[
            ('select', MultiColumn(num_indices)),
            ('scale', StandardScaler()),
        ])),
    ])),
    ('clf', rg),
])

# SMOTENC is given integer column positions rather than column names:
# positions are accepted by every imbalanced-learn release, whereas older
# releases only take indices or a boolean mask and reject name strings.
# NOTE(review): positions 123..159 are one-hot columns of a single feature;
# SMOTENC handles each of them as an independent categorical feature, so a
# synthetic row is not guaranteed to stay a valid one-hot vector. Consider
# resampling before get_dummies - confirm.
# NOTE(review): depending on the imbalanced-learn version the resampler may
# hand a NumPy array (not a DataFrame) to the next step; presumably
# MultiColumn selects by column name - verify it copes with that.
pipeline_with_resampling = make_pipeline(
    SMOTENC(categorical_features=cat_positions.tolist()),
    pipeline,
)

# Grid search to determine best params.
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn raises ValueError when random_state is set without it.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rg_cv = GridSearchCV(pipeline_with_resampling, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
因此，如上面的代码所示，我共有 5 个分类特征。其中索引 123 到 159（即切片 123:160）对应同一个具有 37 个可能取值的分类特征，该特征已通过 get_dummies 转换为 37 列。
明月笑刀无情
小唯快跑啊
相关分类