慕田峪7331174
你可以试试下面的代码。

from pyspark.sql.functions import *

# Create DataFrame df1
df1 = spark.createDataFrame([(4,8,1,1), (7,5,6,2), (7,3,5,3)], ["x", "y", "z", "cid"])

# Create DataFrame df2
df2 = spark.createDataFrame([(8,4,5,1), (1,2,9,2), (8,6,4,3), (4,5,4,4)], ["x", "y", "z", "cid"])

# Get the values from cid column from df1
col1 = df1.select(collect_set("cid")).collect()[0][0]

# Filter the dataframe df2 where cid values present in df2 but not in df1.
df3 = df2.filter(~df2["cid"].isin(col1))

# Union df1 and df3.
df4 = df1.union(df3)
df4.show()

输出:

+---+---+---+---+
|  x|  y|  z|cid|
+---+---+---+---+
|  4|  8|  1|  1|
|  7|  5|  6|  2|
|  7|  3|  5|  3|
|  4|  5|  4|  4|
+---+---+---+---+

希望这对你有帮助。