Demo1:用 Python 内置的 map 和 reduce 对 MapReduce 过程做一个最简单的模拟
from functools import reduce
words = ['a', 'bb', 'ccc', 'dddd']
# map step: compute the length of every word.
# The result is materialised as a list because a bare map object is a
# one-shot iterator: printing it via list() would leave nothing for the
# reduce step to consume.
word_count = list(map(len, words))
print("map: ", word_count)
# reduce step: add up the lengths produced by the map step itself rather
# than a hard-coded copy of them. The initial value 0 keeps the call valid
# when `words` is empty.
word_sum = reduce(lambda x, y: x + y, word_count, 0)
print("reduce: ", word_sum)
运行结果:
map:  [1, 2, 3, 4]
reduce:  10
Demo2:稍微详细点的:
map(mapper 脚本):从标准输入逐行读取文本,按空白字符切分成单词,并为每个单词输出一行"单词<TAB>1"
import sys
def read_input(file):
    """Lazily yield the words of each line of *file*.

    Args:
        file: Any iterable of text lines (for example ``sys.stdin``).

    Yields:
        A list of the whitespace-separated tokens of one line. A blank
        line produces an empty list.
    """
    for line in file:
        # split() without arguments splits on any run of whitespace and
        # drops the trailing newline.
        yield line.split()
def main():
    """Mapper entry point: print ``word<TAB>1`` for every word on stdin."""
    data = read_input(sys.stdin)
    for words in data:
        for word in words:
            # One tab-separated record per occurrence, with a count of 1.
            print("%s%s%d" % (word, '\t', 1))


if __name__ == '__main__':
    main()
reduce(reducer 脚本):读取 mapper 的输出,对相同单词的计数进行累加(要求输入已按单词排序,使相同的单词彼此相邻)
import sys
from operator import itemgetter
from itertools import groupby
def read_mapper_output(file, separator='\t'):
    """Lazily parse mapper output lines into ``[key, value]`` lists.

    Args:
        file: Any iterable of text lines (for example ``sys.stdin``).
        separator: String that separates the key from the value. Only the
            first occurrence is used as the split point.

    Yields:
        The result of splitting each non-blank line once on *separator*.
        A line that does not contain *separator* yields a one-element list.
    """
    for line in file:
        record = line.rstrip()
        # A blank line would yield [''] and break callers that unpack each
        # record into (key, value), so skip it.
        if not record:
            continue
        yield record.split(separator, 1)
def main():
    """Reducer entry point: sum the counts of each word read from stdin.

    Prints one ``word<TAB>total`` line per group of consecutive records
    that share the same word.
    """
    data = read_mapper_output(sys.stdin)
    # groupby only merges *consecutive* records with equal keys, so the
    # input must already be sorted by word for the totals to be complete.
    for current_word, group in groupby(data, itemgetter(0)):
        # The key inside each record is ignored here; using `_` avoids
        # shadowing the `current_word` that names the group.
        total_count = sum(int(count) for _, count in group)
        print("%s%s%d" % (current_word, '\t', total_count))


if __name__ == '__main__':
    main()
关于itemgetter 和groupby:
groupby 用于对序列进行分组
使用方式:
groupby(iterable[, keyfunc])
iterable 是一个可迭代对象,keyfunc 是分组函数。
keyfunc 用于对 iterable 的连续项进行分组;如果不指定 keyfunc,则默认把 iterable 中连续且相同的项分为一组。groupby 返回一个迭代器,其中每个元素是一个 (key, sub-iterator) 二元组。
groupby() 每次迭代都会返回一个键(key)和一个子迭代器:key 是分组函数 keyfunc 作用在该组元素上的结果,子迭代器则依次产生这一组内的元素。注意 groupby 只合并相邻的项,不相邻的相同键会被分到不同的组里,因此通常需要先按同一个 key 对数据排序。
Demo1:
from itertools import groupby
# With no key function, groupby groups runs of consecutive equal items.
# 'c' and 'd' therefore each appear as a key twice for this unsorted input.
for key, value_iter in groupby('aaaabbbccddsscccddd'):
    print(key, ':', list(value_iter))
Demo2:使用 len 函数作为分组函数
from itertools import groupby
# With len as the key function, consecutive strings of equal length are
# grouped together and the key is that length.
for key, value_iter in groupby(['aaaa', 'bbb', 'cc', 'dd', 'ss', 'ccc', 'ddd'], len):
    print(key, ':', list(value_iter))
Demo3:
对比groupby和defaultdict
from operator import itemgetter
from itertools import groupby
rows = [
    {'name': 'mark', 'age': 18, 'uid': '110'},
    {'name': 'miaomiao', 'age': 28, 'uid': '160'},
    {'name': 'xiaohei', 'age': 38, 'uid': '130'},
    {'name': 'miaomiao2', 'age': 28, 'uid': '150'},
]
# Show the rows in their original order.
print("排序前 \n")
for row in rows:
    print(row)
# groupby only merges consecutive items, so sort by the grouping key first.
rows.sort(key=itemgetter('age'))
print("排序后 \n")
for row in rows:
    print(row)
# Each iteration yields one age and an iterator over the rows with that age.
for age, items in groupby(rows, key=itemgetter('age')):
    print(age)
    print(list(items))
利用defaultdict生成的一键多值进行分类
from collections import defaultdict
rows = [
    {'name': 'mark', 'age': 18, 'uid': '110'},
    {'name': 'miaomiao', 'age': 28, 'uid': '160'},
    {'name': 'xiaohei', 'age': 38, 'uid': '130'},
    {'name': 'miaomiao2', 'age': 28, 'uid': '150'},
]
# Multi-valued mapping: age -> list of rows with that age. Unlike groupby,
# this needs no prior sorting because each row is appended under its key.
rows_by_age = defaultdict(list)
for row in rows:
    rows_by_age[row['age']].append(row)
for age in rows_by_age:
    print(age)  # iterate over the keys (the distinct ages)
for row in rows_by_age[28]:
    print(row)  # iterate over the values stored under one key