火花函数对UDF性能的影响?
# UDF vs Spark functionfrom faker import Factoryfrom pyspark.sql.functions import lit, concat fake = Factory.create()fake.seed(4321)# Each entry consists of last_name, first_name, ssn, job, and age (at least 1)from pyspark.sql import Rowdef fake_entry(): name = fake.name().split() return (name[1], name[0], fake.ssn(), fake.job(), abs(2016 - fake.date_time().year) + 1)# Create a helper function to call a function repeatedlydef repeat(times, func, *args, **kwargs): for _ in xrange(times): yield func(*args, **kwargs)data = list(repeat(500000, fake_entry))print len(data)data[0]dataDF = sqlContext.createDataFrame(data, ('last_name', 'first_name', 'ssn', 'occupation', 'age'))dataDF.cache()
concat_s = udf(lambda s: s+ 's')udfData = dataDF.select(concat_s(dataDF.first_name).alias('name'))udfData.count()
spfData = dataDF.select(concat(dataDF.first_name, lit('s')).alias('name'))spfData.count()
concat
慕村9548890