将 FeatureUnion 输出转换为字典,以便进行 DictVectorizer

我正在尝试从datacamp教程重新创建管道,并遇到了管道问题。我相信我遇到的问题是将FeatureUnion的输出转换为DictVectorizer的字典。当我运行下面的代码时,代码失败,因为BaseEstimator,TransformerMixin没有定义。任何关于我哪里出错的指导将不胜感激。


import pandas as pd

from sklearn_pandas import DataFrameMapper

from sklearn.impute import SimpleImputer

from sklearn_pandas import CategoricalImputer

from sklearn.pipeline import FeatureUnion, Pipeline

from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb

from sklearn.model_selection import cross_val_score







# Column labels for the chronic kidney disease CSV; handed to pd.read_csv via
# `names=`. The last entry, 'class', is the column later split off as the label.
kidney_feature_names = [
    'age', 'bp', 'sg', 'al', 'su',
    'rbc', 'pc', 'pcc', 'ba',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wc', 'rc',
    'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
    'class',
]


# Read the chronic kidney disease CSV, labelling the columns with
# kidney_feature_names and treating "?" entries as missing values.
kidney_data = pd.read_csv(
    "https://assets.datacamp.com/production/repositories/943/datasets/82c231cd41f92325cf33b78aaa360824e6b599b9/chronic_kidney_disease.csv",
    names=kidney_feature_names,
    index_col=False,
    na_values=["?"],
)

# Convert 'pcv', 'wc' and 'rc' to numeric dtype; errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
kidney_data = kidney_data.assign(
    **{
        name: pd.to_numeric(kidney_data[name], errors='coerce')
        for name in ('pcv', 'wc', 'rc')
    }
)
print(kidney_data.dtypes)

# Split data between features and labels: every column except the last goes
# to X, the last column becomes y.
X = kidney_data.iloc[:, :-1]
y = kidney_data.iloc[:, -1]

# Check number of nulls in each feature column.
nulls_per_column = X.isnull().sum()
print(nulls_per_column)



慕姐8265434
浏览 146回答 1
1回答

holdtom

简短的回答是,您需要从sklearn导入这些:

from sklearn.base import BaseEstimator, TransformerMixin

我还尝试复制它并遇到了一些其他问题,但我在这个答案中找到了解决方案:Sklearn_pandas在管道中返回TypeError

以下是我的完整代码:

# Import modules
import pandas as pd
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb

# Create list of column names for kidney data: kidney_cols
kidney_cols = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
               'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm',
               'cad', 'appet', 'pe', 'ane', 'label']

# Load dataset: df_kidney
df_kidney = pd.read_csv('chronic_kidney_disease.csv', names=kidney_cols,
                        na_values='?')

# Replace label values with 0 (ckd) and 1
df_kidney['label'].replace({'ckd':0, 'notckd':1}, inplace=True)

# Define X and y: X, y
X, y = df_kidney.iloc[:, :-1], df_kidney['label'].values

# Define new column order for X: col_order
col_order = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot',
             'hemo', 'pcv', 'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm',
             'cad', 'appet', 'pe', 'ane']

# Rearrange columns of X
X = X[col_order]

# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object

# Get a list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get a list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Create empty list to hold column imputers: transformers
transformers = []

# Create numeric imputers and add to list of transformers
transformers.extend([([num_col], [Imputer(strategy='median'),
                                  StandardScaler()]) for num_col
                     in non_categorical_columns])

# Create categorical imputers and add to list of transformers
transformers.extend([(cat_col, [CategoricalImputer()]) for cat_col in
                     categorical_columns])

# Use list of transformers to create a DataFrameMapper object
numeric_categorical_union = DataFrameMapper(transformers, input_df=True,
                                            df_out=True)

# Define Dictifier class to turn df into dictionary as part of pipeline
class Dictifier(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.to_dict('records')

# Create full pipeline
pipeline = Pipeline([('featureunion', numeric_categorical_union),
                     ('dictifier', Dictifier()),
                     ('vectorizer', DictVectorizer(sort=False)),
                     ('clf', xgb.XGBClassifier(max_depth=3))])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=3)
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Python