MapReduce倒排索引-原创手记-慕课网

倒排索引记录的是某个单词在哪些文件中出现了多少次；而最开始学习的WordCount统计的是某个文件中哪些单词出现了多少次，两者正好相反。这就像在搜索引擎上搜索一个单词，下面就会罗列出这个单词在哪些文件里出现过。

首先我们就要想好，什么是key,什么是value，map输出的结果是什么，reduce怎样处理才能够得到想要的结果。

在倒排索引中，我们需要得到单词-文件名-次数，因为这个次数是这个单词在这个文件中出现的次数，所以此时单词和文件应该合在一起作为key，次数作为value。但一次reduce过后还需要将结果中单词合并。因为我们最后要的结果是单词-文件名~次数，文件名~次数······所以需要两次reduce,就用到了combiner 。

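此时就变得清晰了：第一次map要输出以单词和文件名共同组成的键，用combiner计算出次数，combiner再输出单词和“文件名~次数”，最后用reduce进行合并。combiner就相当于一次reduce，但它运行在map端。需要注意的是，Hadoop并不保证combiner一定会被执行（可能执行零次、一次或多次），因此严格来说combiner的输入和输出格式应当保持一致，否则结果会依赖于combiner是否恰好执行了一次。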

package mr.inverseIndex;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * MapReduce job that builds an inverted index: for every word, the list of
 * input files it occurs in together with the number of occurrences per file.
 *
 * <p>Data flow:
 * <pre>
 *   map:     (offset, line)                 -> (word, "file~1")
 *   combine: (word, ["file~n", ...])        -> (word, "file~sum") per file
 *   reduce:  (word, ["file~n", ...])        -> (word, "fileA~x,fileB~y,...")
 * </pre>
 *
 * <p>The combiner consumes and produces exactly the same key/value format as
 * the mapper emits. This matters because Hadoop treats a combiner as an
 * optional optimisation that may be applied zero, one or several times; the
 * final result here is the same in all of those cases.
 */
public class ForInverseMR {

    /** Separates the file name from its occurrence count inside one posting. */
    private static final char COUNT_SEPARATOR = '~';

    /** Separates postings in the final output value. */
    private static final String POSTING_SEPARATOR = ",";

    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "F:\\forTestData\\inverseIndex\\data";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "F://out";

    /**
     * Sums the counts of all postings in {@code values}, grouped by file name.
     *
     * <p>Each value is expected to look like {@code file~count}. The LAST
     * separator is used as the split point so that file names which themselves
     * contain the separator character are still parsed correctly.
     *
     * @param values postings belonging to a single word
     * @return file name to total count, ordered by file name so the job output is deterministic
     * @throws IOException if a posting has no separator or a non-numeric count
     */
    private static Map<String, Long> sumPostings(Iterable<Text> values) throws IOException {
        Map<String, Long> totals = new TreeMap<String, Long>();
        for (Text value : values) {
            String posting = value.toString();
            int cut = posting.lastIndexOf(COUNT_SEPARATOR);
            if (cut < 0) {
                throw new IOException("Malformed posting (expected file~count): " + posting);
            }
            String file = posting.substring(0, cut);
            long count;
            try {
                count = Long.parseLong(posting.substring(cut + 1));
            } catch (NumberFormatException e) {
                throw new IOException("Malformed count in posting: " + posting, e);
            }
            Long previous = totals.get(file);
            totals.put(file, previous == null ? count : previous + count);
        }
        return totals;
    }

    /**
     * Emits one {@code (word, "file~1")} pair for every word occurrence in the line.
     */
    public static class ForMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final Text oKey = new Text();
        private final Text oValue = new Text();

        /**
         * @param key     offset of the line (unused)
         * @param value   one line of input text
         * @param context used to look up the current file name and to emit pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Name of the file the current split belongs to.
            String filename = ((FileSplit) context.getInputSplit()).getPath().getName();
            // The posting is identical for every word of this line, so build it once.
            oValue.set(filename + COUNT_SEPARATOR + "1");
            for (String word : value.toString().split("\\s+")) {
                // A line with leading whitespace (or a blank line) yields an empty
                // first token; it is not a word and must not be indexed.
                if (word.isEmpty()) {
                    continue;
                }
                oKey.set(word);
                context.write(oKey, oValue);
            }
        }
    }

    /**
     * Map-side pre-aggregation: collapses the postings of one word into a single
     * {@code file~sum} posting per file. Input and output formats are identical,
     * so running this step any number of times does not change the final result.
     */
    public static class ForCombiner extends Reducer<Text, Text, Text, Text> {
        private final Text oValue = new Text();

        /**
         * @param key     the word
         * @param values  postings of the form {@code file~count}
         * @param context used to emit one summed posting per file
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Map.Entry<String, Long> entry : sumPostings(values).entrySet()) {
                oValue.set(entry.getKey() + COUNT_SEPARATOR + entry.getValue());
                context.write(key, oValue);
            }
        }
    }

    /**
     * Produces the final index line for a word:
     * {@code word <TAB> fileA~countA,fileB~countB,...}.
     */
    public static class ForReducer extends Reducer<Text, Text, Text, Text> {
        private final Text oValue = new Text();

        /**
         * @param key     the word
         * @param values  postings of the form {@code file~count}; the same file may
         *                appear several times (one per map task / spill) and is summed here
         * @param context used to emit the single joined posting list
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (Map.Entry<String, Long> entry : sumPostings(values).entrySet()) {
                // Prepend the separator from the second posting on; avoids having
                // to strip a trailing separator afterwards.
                if (joined.length() > 0) {
                    joined.append(POSTING_SEPARATOR);
                }
                joined.append(entry.getKey()).append(COUNT_SEPARATOR).append(entry.getValue());
            }
            oValue.set(joined.toString());
            context.write(key, oValue);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path;
     *             the original hard-coded locations are used for any that are missing
     * @throws URISyntaxException never thrown by the current implementation; kept in
     *                            the signature for source compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String output = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Lets Hadoop locate the jar containing these classes when run on a cluster.
        job.setJarByClass(ForInverseMR.class);

        // Mapper, combiner and reducer classes.
        job.setMapperClass(ForInverseMR.ForMapper.class);
        job.setCombinerClass(ForInverseMR.ForCombiner.class);
        job.setReducerClass(ForInverseMR.ForReducer.class);

        // Key/value types of the map output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Key/value types of the final (reduce) output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input location.
        FileInputFormat.addInputPath(job, new Path(input));

        // Output location; an existing directory is removed first because the
        // job refuses to start when the output directory already exists.
        Path outputPath = new Path(output);
        // Resolve the file system from the path itself so the existence check and
        // the job always look at the same file system.
        FileSystem fs = outputPath.getFileSystem(job.getConfiguration());
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // A single reducer so the whole index ends up in one output file.
        job.setNumReduceTasks(1);

        // true: print progress to the user while waiting for the job to finish.
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            // Surface job failure to the calling shell / scheduler.
            System.exit(1);
        }
    }
}

完成。

原文出处