在第一个数据框给出的两个日期之间找到第二个数据框的最小值和最大值

我有这 2 个虚拟数据框


np.random.seed(12345)


df1=pd.DataFrame({'name'    :  ['A']*4+['B']*4,

                'start_date':   pd.to_datetime(['2000-03-15', '2000-06-12','2000-09-01', '2001-01-17','2000-03-19', '2000-06-14','2000-09-14', '2001-01-22']),

                'end_date':pd.to_datetime(['2000-06-12','2000-09-01', '2001-01-17','2001-03-19', '2000-06-14','2000-09-14', '2001-01-22','2001-02-01'])})


date=pd.date_range('2000-01-01','2002-01-01')

name=['A']*len(date)+['B']*len(date)

date=date.append(date)

import numpy as np

low=np.random.rand(len(date))

high=low+np.random.rand(len(date))

df2=pd.DataFrame({'name': name, 'date': date, 'low':low,'high':high})

对于 df1 中的每一行,我都给出了名称、开始日期和结束日期。


我想在 high 中找到最大值,在 low 中找到与名称相同并且在 df2 中的开始日期和结束日期之间的最小值


以下是我目前的解决方案。


df1=df1.set_index('name')

df2=df2.set_index(['name','date'])

df2=df2.sort_index()

df1['max']=-1

df1['min']=-1

for name in df1.index.unique():

    df=df2.loc[name]

    tmphigh=[]

    tmplow=[]

    for (_,start_date,end_date,_,_) in df1.loc[name].itertuples(name=None):

        newdf=df.iloc[df.index.searchsorted(start_date): df.index.searchsorted(end_date)]

        tmphigh.append(newdf.high.max())

        tmplow.append(newdf.low.min())

    df1.loc[[name],['max']]=tmphigh

    df1.loc[[name],['min']]=tmplow


然而,应用超过百万的行仍然需要相当长的时间。我想知道是否有更快的方法来做到这一点。


[编辑]:感谢 Pramote Kuacharoen,我能够调整他的一些代码并实现比我现有代码快 6 倍的速度。


分成循环的原因是我发现在 apply 函数中包含 df2[name] 的生成会导致计算时间显着增加。


因此我分开计算它可能有助于减少函数调用以提取 df2 中名称下的所有值。


如果有人能提出比我的方法更好的方法,我会很高兴。但这对我来说已经足够了。


以下是我目前的解决方案


from tqdm import tqdm

df1a=df1.groupby('name')

df2a=df2.groupby('name')

mergedf=df1

mergedf['maximum']=-1

mergedf['minimum']=-1

def get_min_max(row):

    dfx=df2x.iloc[df2x.index.searchsorted(row['start_date']): df2x.index.searchsorted(row['end_date'])]

    maximum = dfx['high'].max()

    minimum = dfx['low'].min() 

    return pd.Series({'maximum': maximum, 'minimum': minimum})

for name,df in tqdm(df1a):

    df2x=df2a.get_group(name)

    mergedf.loc[[name],['maximum','minimum']]=df.apply(get_min_max,axis=1)


qq_笑_17
浏览 138回答 1
1回答

慕雪6442864

import pandas as pddf1=pd.DataFrame({'name'&nbsp; &nbsp; :&nbsp; ['A']*4+['B']*4,&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 'start_date':&nbsp; &nbsp;pd.to_datetime(['2000-03-15', '2000-06-12','2000-09-01', '2001-01-17','2000-03-19', '2000-06-14','2000-09-14', '2001-01-22']),&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 'end_date':pd.to_datetime(['2000-06-12','2000-09-01', '2001-01-17','2001-03-19', '2000-06-14','2000-09-14', '2001-01-22','2001-02-01'])})date=pd.date_range('2000-01-01','2002-01-01')name=['A']*len(date)+['B']*len(date)date=date.append(date)import numpy as nplow=np.random.rand(len(date))high=low+np.random.rand(len(date))df2=pd.DataFrame({'name': name, 'date': date, 'low':low,'high':high})df2 = df2.set_index('date')def find_max(row):&nbsp; &nbsp; return df2[df2['name'] == row['name']].loc[row['start_date']:row['end_date'], 'high'].max()def find_min(row):&nbsp; &nbsp; return df2[df2['name'] == row['name']].loc[row['start_date']:row['end_date'], 'low'].min()df1['maximum'] = df1.apply(find_max, axis=1)df1['minimum'] = df1.apply(find_min, axis=1)尝试一次找到最小值和最大值。它可能会节省一些时间。def find_min_max(row):&nbsp; &nbsp; dfx = df2[df2['name'] == row['name']].loc[row['start_date']:row['end_date'], ['high', 'low']]&nbsp; &nbsp; maximum = dfx['high'].max()&nbsp; &nbsp; minimum = dfx['low'].min()&nbsp; &nbsp; return pd.Series({'maximum': maximum, 'minimum': minimum})df1.merge(df1.apply(find_min_max, axis=1), left_index=True, right_index=True)试试这个:多处理和共享内存。将其保存在 .py 文件中并使用命令行运行它。它应该快得多。我将 n_workers 设置为 4。您可以更改它。import numpy as npimport pandas as pdfrom multiprocessing.shared_memory import SharedMemoryfrom concurrent.futures import ProcessPoolExecutor, as_completeddef find_min_max(name, data_info):&nbsp; &nbsp; shm_name, shape, dtype = data_info[0]&nbsp; &nbsp; shm1 = SharedMemory(shm_name)&nbsp; &nbsp; np1 = np.recarray(shape=shape, dtype=dtype, buf=shm1.buf)&nbsp; &nbsp; shm_name, shape, dtype = data_info[1]&nbsp; &nbsp; shm2 = SharedMemory(shm_name)&nbsp; &nbsp; np2 = np.recarray(shape=shape, dtype=dtype, buf=shm2.buf)&nbsp; &nbsp; data1 = np1[np1['name'] == name]&nbsp; &nbsp; data2 = np2[np2['name'] == name]&nbsp; &nbsp; for rec in data1:&nbsp; &nbsp; &nbsp; &nbsp; idx1 = np.searchsorted(data2['date'], rec['start_date'])&nbsp; &nbsp; &nbsp; &nbsp; idx2 = np.searchsorted(data2['date'], rec['end_date'])&nbsp; &nbsp; &nbsp; &nbsp; data = data2[idx1:idx2]&nbsp; &nbsp; &nbsp; &nbsp; np1[rec['index']]['maximum'] = data['high'].max()&nbsp; &nbsp; &nbsp; &nbsp; np1[rec['index']]['minimum'] = data['low'].min()def main():&nbsp; &nbsp; np.random.seed(12345)&nbsp; &nbsp; df1 = pd.DataFrame({'name':&nbsp; ['A']*4+['B']*4,&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 'start_date':&nbsp; &nbsp;pd.to_datetime(['2000-03-15', '2000-06-12', '2000-09-01', '2001-01-17', '2000-03-19', '2000-06-14', '2000-09-14', '2001-01-22']),&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 'end_date': pd.to_datetime(['2000-06-12', '2000-09-01', '2001-01-17', '2001-03-19', '2000-06-14', '2000-09-14', '2001-01-22', '2001-02-01'])})&nbsp; &nbsp; date = pd.date_range('2000-01-01', '2002-01-01')&nbsp; &nbsp; name = ['A']*len(date)+['B']*len(date)&nbsp; &nbsp; date = date.append(date)&nbsp; &nbsp; low = np.random.rand(len(date))&nbsp; &nbsp; high = low+np.random.rand(len(date))&nbsp; &nbsp; df2 = pd.DataFrame({'name': name, 'date': date, 'low': low, 'high': high})&nbsp; &nbsp; df1 = df1.sort_values('name')&nbsp; &nbsp; df2 = df2.sort_values(['name', 'date'])&nbsp; &nbsp; df1['maximum'] = -1.0&nbsp; &nbsp; df1['minimum'] = -1.0&nbsp; &nbsp; np1 = df1.to_records(column_dtypes={&nbsp; &nbsp; &nbsp; &nbsp; 'name': '|S20', 'start_date': '<M8[ns]', 'end_date': '<M8[ns]'})&nbsp; &nbsp; np2 = df2.to_records(column_dtypes={&nbsp; &nbsp; &nbsp; &nbsp; 'name': '|S20', 'date': '<M8[ns]', 'low': '<f8', 'high': '<f8'})&nbsp; &nbsp; names = [str.encode(name) for name in df1['name'].unique()]&nbsp; &nbsp; del df1&nbsp; &nbsp; del df2&nbsp; &nbsp; shm1 = SharedMemory(name='d1', create=True, size=np1.nbytes)&nbsp; &nbsp; shm2 = SharedMemory(name='d2', create=True, size=np2.nbytes)&nbsp; &nbsp; shm1_np_array = np.recarray(&nbsp; &nbsp; &nbsp; &nbsp; shape=np1.shape, dtype=np1.dtype, buf=shm1.buf)&nbsp; &nbsp; np.copyto(shm1_np_array, np1)&nbsp; &nbsp; shm2_np_array = np.recarray(&nbsp; &nbsp; &nbsp; &nbsp; shape=np2.shape, dtype=np2.dtype, buf=shm2.buf)&nbsp; &nbsp; np.copyto(shm2_np_array, np2)&nbsp; &nbsp; data_info = [&nbsp; &nbsp; &nbsp; &nbsp; (shm1.name, np1.shape, np1.dtype),&nbsp; &nbsp; &nbsp; &nbsp; (shm2.name, np2.shape, np2.dtype)&nbsp; &nbsp; ]&nbsp; &nbsp; del np1&nbsp; &nbsp; del np2&nbsp; &nbsp; # Set number of workers&nbsp; &nbsp; n_workers = 4&nbsp; &nbsp; with ProcessPoolExecutor(n_workers) as exe:&nbsp; &nbsp; &nbsp; &nbsp; fs = [exe.submit(find_min_max, name, data_info)&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; for name in names]&nbsp; &nbsp; &nbsp; &nbsp; for _ in as_completed(fs):&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pass&nbsp; &nbsp; print(shm1_np_array)&nbsp; &nbsp; shm1.close()&nbsp; &nbsp; shm2.close()&nbsp; &nbsp; shm1.unlink()&nbsp; &nbsp; shm2.unlink()if __name__ == "__main__":&nbsp; &nbsp; main()
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Python