摇曳的蔷薇
选项1

如果另一列中的所有子列表长度相同,那么 numpy 在这里会是一个高效的选择:

    vals = np.array(df.B.values.tolist())
    a = np.repeat(df.A, vals.shape[1])
    pd.DataFrame(np.column_stack((a, vals.ravel())), columns=df.columns)

       A  B
    0  1  1
    1  1  2
    2  2  1
    3  2  2

选项2

如果子列表的长度各不相同,则需要多做一步(先计算每个子列表的长度):

    vals = df.B.values.tolist()
    rs = [len(r) for r in vals]
    a = np.repeat(df.A, rs)
    pd.DataFrame(np.column_stack((a, np.concatenate(vals))), columns=df.columns)

       A  B
    0  1  1
    1  1  2
    2  2  1
    3  2  2

选项3

我尝试把上面的做法推广成一个通用函数:展开(flatten)N 列,同时平铺(tile)M 列。稍后我会努力提高它的效率:

    df = pd.DataFrame({'A': [1,2,3], 'B': [[1,2], [1,2,3], [1]], 'C': [[1,2,3], [1,2], [1,2]], 'D': ['A', 'B', 'C']})

       A          B          C  D
    0  1     [1, 2]  [1, 2, 3]  A
    1  2  [1, 2, 3]     [1, 2]  B
    2  3        [1]     [1, 2]  C

    def unnest(df, tile, explode):
        vals = df[explode].sum(1)
        rs = [len(r) for r in vals]
        a = np.repeat(df[tile].values, rs, axis=0)
        b = np.concatenate(vals.values)
        d = np.column_stack((a, b))
        return pd.DataFrame(d, columns = tile + ['_'.join(explode)])

    unnest(df, ['A', 'D'], ['B', 'C'])

        A  D B_C
    0   1  A   1
    1   1  A   2
    2   1  A   1
    3   1  A   2
    4   1  A   3
    5   2  B   1
    6   2  B   2
    7   2  B   3
    8   2  B   1
    9   2  B   2
    10  3  C   1
    11  3  C   1
    12  3  C   2

函数

    def wen1(df):
        return df.set_index('A').B.apply(pd.Series).stack().reset_index(level=0).rename(columns={0: 'B'})

    def wen2(df):
        return pd.DataFrame({'A':df.A.repeat(df.B.str.len()),'B':np.concatenate(df.B.values)})

    def wen3(df):
        s = pd.DataFrame({'B': np.concatenate(df.B.values)}, index=df.index.repeat(df.B.str.len()))
        return s.join(df.drop('B', 1), how='left')

    def wen4(df):
        return pd.DataFrame([[x] + [z] for x, y in df.values for z in y],columns=df.columns)

    def chris1(df):
        vals = np.array(df.B.values.tolist())
        a = np.repeat(df.A, vals.shape[1])
        return pd.DataFrame(np.column_stack((a, vals.ravel())), columns=df.columns)

    def chris2(df):
        vals = df.B.values.tolist()
        rs = [len(r) for r in vals]
        a = np.repeat(df.A.values, rs)
        return pd.DataFrame(np.column_stack((a, np.concatenate(vals))), columns=df.columns)

计时

    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    from timeit import timeit

    res = pd.DataFrame(
        index=['wen1', 'wen2', 'wen3', 'wen4', 'chris1', 'chris2'],
        columns=[10, 50, 100, 500, 1000, 5000, 10000],
        dtype=float)

    for f in res.index:
        for c in res.columns:
            df = pd.DataFrame({'A': [1, 2], 'B': [[1, 2], [1, 2]]})
            df = pd.concat([df]*c)
            stmt = '{}(df)'.format(f)
            setp = 'from __main__ import df, {}'.format(f)
            res.at[f, c] = timeit(stmt, setp, number=50)

    ax = res.div(res.min()).T.plot(loglog=True)
    ax.set_xlabel("N")
    ax.set_ylabel("time (relative)")
人到中年有点甜
另一种方法是:对需要展开(unnest)的各列,逐行应用 meshgrid 技巧(即对每一行中各列的取值求笛卡尔积):

    import numpy as np
    import pandas as pd

    def unnest(frame, explode):
        def mesh(values):
            return np.array(np.meshgrid(*values)).T.reshape(-1, len(values))

        data = np.vstack(mesh(row) for row in frame[explode].values)
        return pd.DataFrame(data=data, columns=explode)

    df = pd.DataFrame({'A': [1, 2], 'B': [[1, 2], [1, 2]]})
    print(unnest(df, ['A', 'B']))  # base
    print()

    df = pd.DataFrame({'A': [1, 2], 'B': [[1, 2], [3, 4]], 'C': [[1, 2], [3, 4]]})
    print(unnest(df, ['A', 'B', 'C']))  # multiple columns
    print()

    df = pd.DataFrame({'A': [1, 2, 3], 'B': [[1, 2], [1, 2, 3], [1]], 'C': [[1, 2, 3], [1, 2], [1, 2]], 'D': ['A', 'B', 'C']})
    print(unnest(df, ['A', 'B']))  # uneven length lists
    print()
    print(unnest(df, ['D', 'B']))  # different types
    print()

输出

       A  B
    0  1  1
    1  1  2
    2  2  1
    3  2  2

       A  B  C
    0  1  1  1
    1  1  2  1
    2  1  1  2
    3  1  2  2
    4  2  3  3
    5  2  4  3
    6  2  3  4
    7  2  4  4

       A  B
    0  1  1
    1  1  2
    2  2  1
    3  2  2
    4  2  3
    5  3  1

       D  B
    0  A  1
    1  A  2
    2  B  1
    3  B  2
    4  B  3
    5  C  1