手记

Pandas 数据合并

1.简单实例

import pandas as pd
import numpy as np

df1 = pd.DataFrame({‘key’: [‘K1’, ‘K2’, ‘K3’, ‘K4’],
‘A’: [‘A1’, ‘A2’, ‘A3’, ‘A8’],
‘B’: [‘B1’, ‘B2’, ‘B3’, ‘B8’]})

df2 = pd.DataFrame({‘key’: [‘K3’, ‘K4’, ‘K5’, ‘K6’],
‘A’: [‘A3’, ‘A4’, ‘A5’, ‘A6’],
‘B’: [‘B3’, ‘B4’, ‘B5’, ‘B6’]})

print(“df1=\n{}\n”.format(df1))
print(“df2=\n{}\n”.format(df2))

merge_df = pd.merge(df1, df2)
merge_inner = pd.merge(df1, df2, how=‘inner’, on=[‘key’])
merge_left = pd.merge(df1, df2, how=‘left’)
merge_left_on_key = pd.merge(df1, df2, how=‘left’, on=[‘key’])
merge_right_on_key = pd.merge(df1, df2, how=‘right’, on=[‘key’])
merge_outer = pd.merge(df1, df2, how=‘outer’, on=[‘key’])

print(“merge_df=\n{}\n”.format(merge_df))
print(“merge_inner=\n{}\n”.format(merge_inner))
print(“merge_left=\n{}\n”.format(merge_left))
print(“merge_left_on_key=\n{}\n”.format(merge_left_on_key))
print(“merge_right_on_key=\n{}\n”.format(merge_right_on_key))
print(“merge_outer=\n{}\n”.format(merge_outer))

df3 = pd.DataFrame({‘key’: [‘K1’, ‘K2’, ‘K3’, ‘K4’],
‘A’: [‘A1’, ‘A2’, ‘A3’, ‘A8’],
‘B’: [‘B1’, ‘B2’, ‘B3’, ‘B8’]},
index=[0, 1, 2, 3])

df4 = pd.DataFrame({‘key’: [‘K3’, ‘K4’, ‘K5’, ‘K6’],
‘C’: [‘A3’, ‘A4’, ‘A5’, ‘A6’],
‘D’: [‘B3’, ‘B4’, ‘B5’, ‘B6’]},
index=[1, 2, 3, 4])

print(“df3=\n{}\n”.format(df3))
print(“df4=\n{}\n”.format(df4))

join_df = df3.join(df4, lsuffix=’_self’, rsuffix=’_other’)
join_left = df3.join(df4, how=‘left’, lsuffix=’_self’, rsuffix=’_other’)
join_right = df1.join(df4, how=‘outer’, lsuffix=’_self’, rsuffix=’_other’)

print(“join_df=\n{}\n”.format(join_df))
print(“join_left=\n{}\n”.format(join_left))
print(“join_right=\n{}\n”.format(join_right))

2.Append 与 Extend的区别
append() 方法用于在列表末尾添加新的对象。该方法无返回值,但是会修改原来的列表
append() 追加单个元素到List的尾部,只接受一个参数,参数可以是任何数据类型
list.append(object) 其实就是向列表中添加一个对象object
extend() 将一个列表中每个元素分别添加到另一个列表中,只接受一个参数
list.extend(sequence) 把一个序列seq的内容添加到列表中。

3.大批量数据合并算法———两个数据的每列特征排列顺序不同

import pandas as pd
df1 = pd.read_csv(‘E:/Satisfaction/Data/data1.csv’,header = None)
df2 = pd.read_csv(‘E:/Satisfaction/Data/data2.csv’,header = None)

df1.drop_duplicates(df1.columns[4],inplace=True)
df2.drop_duplicates(df2.columns[4],inplace=True)
df1.set_index(df1.columns[4], inplace = True)
df2.set_index(df2.columns[4], inplace = True)
#删除重复索引
#inplace = False 不替换原来数据
#inplace = True 替换原来数据

add_data = []
#df1中有的数据可能在df2中没有,产生KeyError
df1[‘is_null’] = [True if idx in df2.index else False for idx,row in df1.iterrows()]
df1 = df1[df1[‘is_null’]==True]
for index,row in df1.iterrows():
df2_row = df2.loc[index]
row_data = [df2_row[i] for i in range(8,20)]
add_data.append(row_data)
# row = row.append(df2_row) row为Series类型
# row = pd.concat((row,df2_row),axis = 1)
# d.loc[index] = row
df1.reset_index(inplace=True)
#reset_index:还原索引,重新变为默认的整型索引
out_val = pd.concat((df1,pd.DataFrame(add_data,columns=[i for i in range(100,112)])),axis = 1)
out_val.to_csv(‘E:/Satisfaction/Data/out_val.csv’)
print(‘done’)

0人推荐
随时随地看视频
慕课网APP