如何计算两个 pandas DataFrame 中标量值之间的相关性

# Answer text (author's explanation, kept as comments so the script stays runnable):
#
# This is what I ended up doing, although it is still not as smooth as finding
# a built-in pandas feature or package would have been. Because I eventually
# want to do this with more than two tables, I put the tables (dataframes) in
# a dictionary. I then turn each table into a single-column table whose
# MultiIndex represents the original column names and index values; the field
# values are the original column values laid end to end. Those new tables are
# then combined in one full outer join on the MultiIndex. Any two of the
# original tables can now be correlated by correlating their respective
# columns in the final table.
import pandas as pd


def df_to_multidx_df(df: pd.DataFrame, cols_idx1_name: str = 'Previous Columns',
                     idx_idx2_name: str = 'Previous Index',
                     val_col_name: str = 'Values') -> pd.DataFrame:
    """Serialize a 2d dataframe into a one-column, MultiIndexed dataframe.

    The author replaced this helper with ``df_dict_to_multidx_df`` (which uses
    ``DataFrame.stack``, thanks to @jtorca) and had disabled it inside a string
    literal. It is kept as a real function because the demo code at the bottom
    of this script still calls it.

    Parameters:
        df: 2d dataframe with a single-level index and one or more
            single-level columns. All df values must be the same type.
        cols_idx1_name: 1st index title for the returned dataframe; that
            level holds df's column names.
        idx_idx2_name: 2nd index title for the returned dataframe; that
            level holds df's index values.
        val_col_name: name of the single value column of the result.

    Returns:
        A 2d dataframe with a MultiIndex built from df's column names and
        index values, and a single column whose values are all df columns
        strung end to end.
    """
    # Create MultiIndex from product of column names and index values.
    mult_idx = pd.MultiIndex.from_product(
        [df.columns, df.index], names=[cols_idx1_name, idx_idx2_name])

    # 1D list of table values, column by column, i.e. in the same order as
    # the MultiIndex built above.
    val_list = [val for col in df for val in df[col]]

    return pd.DataFrame(val_list, index=mult_idx, columns=[val_col_name])


def df_dict_to_multidx_df(df_dict: dict) -> pd.DataFrame:
    """Join a dictionary of 2d dataframes into one MultiIndexed dataframe.

    Each dataframe in df_dict is stacked into a single column named after its
    dictionary key. The stacked columns are then combined with a full outer
    join on the resulting two-level MultiIndex (original index values,
    original column names).

    NOTE: each input dataframe's index and columns must be named beforehand;
    those names become the MultiIndex level names that the join is made on.

    Parameters:
        df_dict: dictionary of 2d dataframes, each with single-level
            indices and columns. It is not modified.

    Returns:
        multidx_df: MultiIndex dataframe with one column per entry of
        df_dict, in dictionary order.

    Raises:
        ValueError: if df_dict is empty (there is nothing to join).
    """
    if not df_dict:
        raise ValueError('df_dict must contain at least one dataframe')

    multidx_df = None
    for key, df in df_dict.items():
        # stack() moves the column labels into a second index level, giving
        # one value per (index, column) pair.
        stacked = df.stack().to_frame(name=key)
        if multidx_df is None:
            # The first table is the left side of the first join.
            multidx_df = stacked
            continue
        # Full outer join on the MultiIndex level names.
        # Author's note: pd.concat([multidx_df, df], axis=1) was observed to
        # be about twice as fast as merge; merge is kept here.
        multidx_df = multidx_df.merge(right=stacked, how='outer',
                                      on=list(multidx_df.index.names))

    return multidx_df


if __name__ == '__main__':
    # Reading the CSV files and running the demo only when executed as a
    # script keeps the two functions above importable without the data files.
    gvtx_eiu_df = pd.read_csv('gvtx_eiu.csv', index_col=0,
                              skip_blank_lines=False)
    gvtx_eiu_df.columns.name = 'year'
    polpartix_eiu_df = pd.read_csv('polpartix_eiu.csv', index_col=0,
                                   skip_blank_lines=False)
    polpartix_eiu_df.columns.name = 'year'
    clean_elec_idea_df = pd.read_csv('clean_elec_idea.csv', index_col=0,
                                     skip_blank_lines=False)
    clean_elec_idea_df.columns.name = 'year'

    test_table_dict = {'gvtx_eiu': gvtx_eiu_df,
                       'polpartix_eiu': polpartix_eiu_df,
                       'clean_elec_idea': clean_elec_idea_df}

    ### Test Code

    print(gvtx_eiu_df)
    #               2006    2007   2008   2009   2010   2011   2012   2013   2014  \
    # country
    # Afghanistan    NaN  0.0395  0.079  0.079  0.079  0.079  0.079  0.079  0.114
    # Albania      0.507  0.5070  0.507  0.507  0.507  0.471  0.400  0.400  0.400
    # Algeria      0.221  0.2210  0.221  0.221  0.221  0.221  0.221  0.221  0.221
    # Angola       0.214  0.2680  0.321  0.321  0.321  0.321  0.321  0.321  0.321
    # Argentina    0.500  0.5000  0.500  0.535  0.571  0.571  0.571  0.571  0.571
    # ...            ...     ...    ...    ...    ...    ...    ...    ...    ...
    # Venezuela    0.364  0.3960  0.429  0.411  0.393  0.393  0.429  0.429  0.429
    # Vietnam      0.429  0.4290  0.429  0.429  0.429  0.429  0.393  0.393  0.393
    # Yemen        0.271  0.2610  0.250  0.214  0.179  0.036  0.143  0.143  0.143
    # Zambia       0.464  0.4640  0.464  0.500  0.536  0.500  0.536  0.536  0.536
    # Zimbabwe     0.079  0.0790  0.079  0.104  0.129  0.129  0.129  0.129  0.129
    #               2015   2016   2017   2018
    # country
    # Afghanistan  0.114  0.114  0.114  0.114
    # Albania      0.436  0.436  0.471  0.471
    # Algeria      0.221  0.221  0.221  0.221
    # Angola       0.321  0.321  0.286  0.286
    # Argentina    0.500  0.500  0.500  0.536
    # ...            ...    ...    ...    ...
    # Venezuela    0.393  0.250  0.286  0.179
    # Vietnam      0.393  0.321  0.321  0.321
    # Yemen        0.036    NaN    NaN    NaN
    # Zambia       0.536  0.536  0.500  0.464
    # Zimbabwe     0.200  0.200  0.200  0.200
    # [164 rows x 13 columns]

    test_serialized = df_to_multidx_df(df=gvtx_eiu_df, cols_idx1_name='Year',
                                       idx_idx2_name='Country',
                                       val_col_name='gvtx_eiu')
    print(test_serialized)
    #                       gvtx_eiu
    # Year Country
    # 2006 Afghanistan           NaN
    #      Albania             0.507
    #      Algeria             0.221
    #      Angola              0.214
    #      Argentina           0.500
    # ...                        ...
    # 2018 Venezuela           0.179
    #      Vietnam             0.321
    #      Yemen                 NaN
    #      Zambia              0.464
    #      Zimbabwe            0.200
    # [2132 rows x 1 columns]

    # The original demo called table_dict_to_multidx_df(test_table_dict,
    # 'Year', 'Country'), which is not defined in this script; the function
    # defined above takes only the dictionary.
    # NOTE(review): the sample output below was pasted by the author,
    # presumably from an earlier revision. With stack() the index levels are
    # (row index, column labels), so level order and names may differ.
    test_multidx_df = df_dict_to_multidx_df(test_table_dict)
    print(test_multidx_df)
    #                       gvtx_eiu       polpartix_eiu  clean_elec_idea
    # Year Country
    # 2006 Afghanistan           NaN               0.222            0.475
    #      Albania             0.507               0.444            0.541
    #      Algeria             0.221               0.222            0.399
    #      Angola              0.214               0.111              NaN
    #      Argentina           0.500               0.556            0.778
    # ...                        ...                 ...              ...
    # 2017 Somalia               NaN                 NaN            0.394
    #      South Sudan           NaN                 NaN              NaN
    # 2018 Georgia               NaN                 NaN            0.605
    #      Somalia               NaN                 NaN              NaN
    #      South Sudan           NaN                 NaN              NaN
    # [6976 rows x 3 columns]

    # NOTE(review): ProfileReport is never imported in this snippet. It
    # presumably comes from pandas-profiling / ydata-profiling; confirm and
    # import it before running this line.
    test_multidx_profile = ProfileReport(test_multidx_df,
                                         title='Test MultIdx Profile')

# Closing remarks from the author:
#
# The output is exactly what I wanted, but besides wishing for a one- or
# two-statement solution, I am not entirely happy with iterating over the
# input dictionary of dataframes. I tried making the input a dataframe of
# dataframes so that I could apply(lambda) and perhaps save some memory, but
# had no luck getting apply() to work properly, and it is time to move on.

如何计算两个 pandas DataFrame 中标量值之间的相关性

2回答