分析 Python For 循环中包含的数据帧

现在的情况:


我有一个函数,它按二元目标变量的类别“1”和“0”拆分数据,然后读取每个类别下的所有自变量。该函数还会针对类别“1”和“0”分别估计每个自变量的 KDE(核密度估计),然后计算这两条 KDE 曲线的相交面积:


import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from scipy.stats import gaussian_kde


def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print and return the KDE overlap area of each independent variable.

    Rows containing missing values are dropped first. For every column other
    than the target, one Gaussian KDE is fitted to the rows whose target
    equals 0 and another to the rows whose target equals 1. Both densities
    are evaluated on a shared 500-point grid, and the area under their
    pointwise minimum is integrated with the trapezoidal rule.

    Args:
        data: pandas DataFrame holding the target column and the independent
            variables.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the combined data range added on each side of
            the evaluation grid, because a KDE extends beyond the data.
        target_variable_name: Column name of the binary (0/1) response
            variable.

    Returns:
        A dict mapping each independent variable's column name to its
        intersection area, in column order. Each area is also printed, as
        the function did before it returned anything.
    """
    data = data.dropna()
    target = str(target_variable_name)

    # The target is removed here, so every remaining column is a feature;
    # none of them must be skipped.
    feature_names = list(data.drop(columns=[target]).columns)

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz

    is_class0 = data[target] == 0
    is_class1 = data[target] == 1

    areas = {}
    for column_name in feature_names:
        x0 = data.loc[is_class0, column_name]
        x1 = data.loc[is_class1, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)

        # The grid must span both samples: smallest minimum to largest
        # maximum. Taking the smaller maximum would cut off the upper tail.
        x_min = min(x0.min(), x1.min())
        x_max = max(x0.max(), x1.max())
        dx = margin * (x_max - x_min)
        x = np.linspace(x_min - dx, x_max + dx, 500)

        # The pointwise minimum of the two densities is their overlap.
        inters_x = np.minimum(kde0(x), kde1(x))
        area_inters_x = integrate(inters_x, x)

        print(area_inters_x)
        areas[column_name] = area_inters_x

    return areas

问题: 如果我有 n_class = 4 该函数将如下所示:



白板的微信
浏览 100回答 1
1回答

撒科打诨

# Consider building the lists of x's and KDEs with list comprehensions over
# the classes of the target variable. And instead of printing the result on
# every iteration, collect the results into a data frame:

def intersection_area_new(data, bandwidth, margin, target_variable_name):
    """Return the multi-class KDE overlap area of each independent variable.

    Rows containing missing values are dropped first. For every column other
    than the target, one Gaussian KDE is fitted per unique target class. All
    densities are evaluated on a shared 500-point grid, and the area under
    their pointwise minimum is integrated with the trapezoidal rule.

    Args:
        data: pandas DataFrame holding the target column and the independent
            variables.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the combined data range added on each side of
            the evaluation grid, because a KDE extends beyond the data.
        target_variable_name: Column name of the multi-class response
            variable.

    Returns:
        A pandas DataFrame with one row per independent variable and the
        columns ``target``, ``column`` and ``intersection``.
    """
    data = data.dropna()

    # Read the classes from the column the caller named, not from a
    # hard-coded 'target' column.
    classes = data[target_variable_name].unique()

    # Every column except the target is a feature, wherever the target sits.
    feature_names = [c for c in data.columns if c != target_variable_name]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz

    kde_dicts = []
    for column_name in feature_names:
        # Build the list of per-class samples and their KDEs.
        x_s = [
            data.loc[data[target_variable_name] == cls, column_name]
            for cls in classes
        ]
        kde_s = [gaussian_kde(x, bw_method=bandwidth) for x in x_s]

        # The grid must span all samples: smallest minimum to largest
        # maximum. Taking the smallest maximum would cut off the upper tail.
        x_min = min(x.min() for x in x_s)
        x_max = max(x.max() for x in x_s)
        dx = margin * (x_max - x_min)
        x_array = np.linspace(x_min - dx, x_max + dx, 500)

        # The pointwise minimum over all class densities is their overlap.
        inters_x = np.array([kde(x_array) for kde in kde_s]).min(axis=0)
        area_inters_x = integrate(inters_x, x_array)

        kde_dicts.append({'target': target_variable_name,
                          'column': column_name,
                          'intersection': area_inters_x})

    return pd.DataFrame(kde_dicts,
                        columns=['target', 'column', 'intersection'])


# Output (as reported in the original answer; exact values depend on the
# dataset and on the integration range):
#
#   output = intersection_area_new(sample_dataset, None, 0.5, "target")
#   print(output.head(10))
#   #    target column  intersection
#   # 0  target   var1      0.842256
#   # 1  target   var2      0.757190
#   # 2  target   var3      0.676021
#   # 3  target   var4      0.873074
#   # 4  target   var5      0.763626
#   # 5  target   var6      0.868560
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Python