Lucene Advanced Search Engine Factory (Part 2)
Recap: the previous article covered how to use the MongoDB driver. This article takes a detailed look at Lucene's full-text search features.
1. Scheduled tasks that rebuild the index
package com.noseparte.lucene;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.noseparte.mongo.MongoDBConfig;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.bson.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.IOException;
import java.nio.file.FileSystems;
/**
* Copyright © 2018 noseparte © BeiJing BoLuo Network Technology Co. Ltd.
* @Author Noseparte
* @Compile 2018-12-24 -- 16:08
* @Version 1.0
* @Description Updates the Lucene index
*/
@Slf4j
@Component
public class LuceneIndexesFactory {
public static String INDEX_PATH = "";
public static String GEOGRAPHY_INDEX_PATH = "";
/** Pick the on-disk location of the Lucene index directories by OS */
static {
String osName = System.getProperty("os.name");
if (osName.toLowerCase().startsWith("win")) {
INDEX_PATH = "D:\\data\\lucene\\lucene-db";
GEOGRAPHY_INDEX_PATH = "D:\\data\\lucene\\lucene-geography-db";
} else {
INDEX_PATH = "/data/lucene/lucene-db";
GEOGRAPHY_INDEX_PATH = "/data/lucene/lucene-geography-db";
}
}
@Autowired
private MongoDBConfig mongoDBConfig;
public static IndexSearcher init(String path) throws IOException {
// 1. Open the on-disk index directory
Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(path));
// 2. Create an IndexReader over it
DirectoryReader directoryReader = DirectoryReader.open(directory);
// 3. Wrap the reader in an IndexSearcher
// Note: every call opens a fresh reader that is never closed; a production version
// should cache the searcher or close the reader when done.
return new IndexSearcher(directoryReader);
}
public static void main(String[] args) {
// Quick manual check of the os.name value inspected in the static block
System.out.println(System.getProperty("os.name"));
}
/**
* Raw documents ==> index: 1. fetch the documents; 2. build Document objects;
* 3. analyze (tokenize) them; 4. write the index.
*
* <p>Builds the local index files from the database. On creating indexes with
* IndexWriter and tuning indexing speed, see
* https://www.cnblogs.com/dacc123/p/8228298.html
*/
@Scheduled(cron = "0 0/5 * * * ? ")
public void updateIndex() {
IndexWriter indexWriter = null;
try {
Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_PATH));
// StandardAnalyzer tokenizes on whitespace and punctuation, handles numbers, letters,
// e-mail addresses, IP addresses and CJK text, and supports stop-word lists in place
// of StopAnalyzer's filtering.
// Analyzer analyzer = new StandardAnalyzer();
// IKAnalyzer is a third-party Chinese analyzer that extends Lucene's Analyzer; it offers
// dictionary-based full segmentation and forward/backward maximum-match segmentation.
// The true flag below enables smart (coarse-grained) mode.
Analyzer analyzer = new IKAnalyzer(true);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
indexWriterConfig.setRAMBufferSizeMB(16.0);
indexWriter = new IndexWriter(directory, indexWriterConfig);
long deleteCount = indexWriter.deleteAll(); // drop all previously indexed documents
log.info("Index cleared, {} documents deleted", deleteCount);
MongoDatabase database = mongoDBConfig.getDatabase("h5wx_login_server");
MongoCollection<Document> collection = database.getCollection("player_tree_node_info");
FindIterable<Document> documents = collection.find();
for (Document cursor : documents) {
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
document.add(new Field("id", cursor.getObjectId("_id").toString(), TextField.TYPE_STORED));
document.add(new Field("playerId", cursor.getString("playerId"), TextField.TYPE_STORED));
indexWriter.addDocument(document);
}
} catch (Exception e) {
log.error("Failed to build the index", e);
} finally {
try {
if (null != indexWriter) {
indexWriter.close();
}
} catch (Exception ex) {
log.error("索引流关闭失败,error,{}", ex.getMessage());
}
}
}
@Scheduled(cron = "0 0/5 * * * ? ")
public void updateGeographyIndex() {
IndexWriter indexWriter = null;
try {
Directory directory =
FSDirectory.open(FileSystems.getDefault().getPath(GEOGRAPHY_INDEX_PATH));
// StandardAnalyzer tokenizes on whitespace and punctuation, handles numbers, letters,
// e-mail addresses, IP addresses and CJK text, and supports stop-word lists in place
// of StopAnalyzer's filtering.
// Analyzer analyzer = new StandardAnalyzer();
// IKAnalyzer is a third-party Chinese analyzer that extends Lucene's Analyzer; it offers
// dictionary-based full segmentation and forward/backward maximum-match segmentation.
// The true flag below enables smart (coarse-grained) mode.
Analyzer analyzer = new IKAnalyzer(true);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
indexWriterConfig.setRAMBufferSizeMB(16.0);
indexWriter = new IndexWriter(directory, indexWriterConfig);
long deleteCount = indexWriter.deleteAll(); // drop all previously indexed documents
log.info("Index cleared, {} documents deleted", deleteCount);
MongoDatabase database = mongoDBConfig.getDatabase("depth-search");
MongoCollection<Document> collection = database.getCollection("mg_national_geography_repo");
FindIterable<Document> documents = collection.find();
for (Document cursor : documents) {
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
document.add(new Field("id", cursor.getObjectId("_id").toString(), TextField.TYPE_STORED));
document.add(new Field("scenery", cursor.getString("scenery"), TextField.TYPE_STORED));
document.add(new Field("geography", cursor.getString("geography"), TextField.TYPE_STORED));
document.add(new Field("title", cursor.getString("title"), TextField.TYPE_STORED));
document.add(new Field("author", cursor.getString("author"), TextField.TYPE_STORED));
indexWriter.addDocument(document);
}
} catch (Exception e) {
log.error("Failed to build the index", e);
} finally {
try {
if (null != indexWriter) {
indexWriter.close();
}
} catch (Exception ex) {
log.error("索引流关闭失败,error,{}", ex.getMessage());
}
}
}
}
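Both scheduled jobs fire every five minutes (the cron expression 0 0/5 * * * ? triggers at second zero of every fifth minute) and rebuild their index from scratch: deleteAll() wipes the previous segments before the MongoDB collection is re-read. This keeps the index trivially consistent with the source collection, at the cost of re-indexing everything each cycle.

To see what IKAnalyzer actually produces for Chinese text, here is a minimal, self-contained sketch that prints the tokens it emits; the field name "content" and the sample sentence are illustrative only:

package com.noseparte.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // true = smart (coarse-grained) segmentation, as used by the factory above
        Analyzer analyzer = new IKAnalyzer(true);
        // tokenStream only uses the field name to select per-field analysis rules
        try (TokenStream ts = analyzer.tokenStream("content", "中华人民共和国国家地理")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.print(term.toString() + " | ");
            }
            ts.end();
        }
    }
}

With smart mode on, IK prefers whole dictionary words over exhaustive sub-word splits; pass false to get the fine-grained segmentation instead.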
2. Searching the index by keyword
package com.noseparte.lucene;
import com.noseparte.base.bean.Response;
import com.noseparte.base.controller.BaseController;
import com.noseparte.redis.RedisConstant;
import com.noseparte.redis.RedisUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.configurationprocessor.json.JSONObject;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Copyright © 2018 noseparte © BeiJing BoLuo Network Technology Co. Ltd.
*
* @Author Noseparte
* @Compile 2018-12-25 -- 11:15
* @Version 1.0
* @Description Keyword search endpoints backed by the Lucene indexes
*/
@Slf4j
@RestController
@RequestMapping("api/lucene")
public class LuceneSearchController extends BaseController {
@Autowired
private RedisUtils redisUtils;
@PostMapping("/search")
public Response retrieval(@RequestParam("keyword") String keyword){
log.info("本次检索的关键词为:keyword,{} ========= ",keyword);
List<String> result = new ArrayList<>();
try{
Analyzer analyzer = new IKAnalyzer(true);
// A simple single-field query would be:
// Query query = new QueryParser("content", analyzer).parse(keyword);
String[] fields = {"playerId"};
// MUST = AND, MUST_NOT = NOT, SHOULD = OR
BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD};
// MultiFieldQueryParser parses against several fields at once and handles
// whitespace-separated terms such as "上海 中国"
Query multiFieldQuery = MultiFieldQueryParser.parse(keyword, fields, clauses, analyzer);
IndexSearcher indexSearcher = LuceneIndexesFactory.init(LuceneIndexesFactory.INDEX_PATH);
// 5. Run the search and collect the top 100 hits
TopDocs topDocs = indexSearcher.search(multiFieldQuery, 100);
log.info("共找到的匹配处: hitsCount,{}",topDocs.totalHits);
// 6. Pull the ScoreDoc array out of TopDocs
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
log.info("Documents returned: {}", scoreDocs.length);
// The scorer must target a field that exists in this index; the original passed
// "content", which is never indexed here, so "playerId" is used instead. Note the
// highlighter is prepared but not yet applied to the results below.
QueryScorer scorer = new QueryScorer(multiFieldQuery, "playerId");
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red\">", "</span>");
Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
for (ScoreDoc scoreDoc : scoreDocs) {
// 7. Resolve each ScoreDoc to its stored Document
Document document = indexSearcher.doc(scoreDoc.doc);
String id = document.get("id");
String playerId = document.get("playerId");
JSONObject jsonObject = new JSONObject();
jsonObject.put("id",id);
jsonObject.put("playerId",playerId);
result.add(jsonObject.toString());
}
return getResponse().success(result);
} catch (Exception e) {
log.error("Search failed", e);
return getResponse().failure(e.getMessage());
}
}
@PostMapping("/searchGeo")
public Response retrievalGeo(@RequestParam("keyword") String keyword){
log.info("本次检索的关键词为:keyword,{} ========= ",keyword);
List<String> result = new ArrayList<>();
try {
Analyzer analyzer = new IKAnalyzer(true);
// A simple single-field query would be:
// Query query = new QueryParser("content", analyzer).parse(keyword);
// Load the searchable field names from Redis. This assumes RedisUtils.lGet returns
// the requested list range as a java.util.List; the original sized a String[10] and
// filled only the first slot, which MultiFieldQueryParser.parse rejects because the
// fields and flags arrays must have the same length with no null entries.
List<Object> geoFields = redisUtils.lGet(RedisConstant.GEO_FIELDS, 0L, redisUtils.lGetListSize(RedisConstant.GEO_FIELDS));
String[] fields = geoFields.stream().map(Object::toString).toArray(String[]::new);
// MUST = AND, MUST_NOT = NOT, SHOULD = OR; one Occur flag per field
BooleanClause.Occur[] clauses = new BooleanClause.Occur[fields.length];
Arrays.fill(clauses, BooleanClause.Occur.SHOULD);
// MultiFieldQueryParser parses against several fields at once and handles
// whitespace-separated terms such as "上海 中国"
Query multiFieldQuery = MultiFieldQueryParser.parse(keyword, fields, clauses, analyzer);
IndexSearcher indexSearcher = LuceneIndexesFactory.init(LuceneIndexesFactory.GEOGRAPHY_INDEX_PATH);
// 5. Run the search and collect the top 100 hits
TopDocs topDocs = indexSearcher.search(multiFieldQuery, 100);
log.info("共找到的匹配处: hitsCount,{}",topDocs.totalHits);
// 6. Pull the ScoreDoc array out of TopDocs
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
log.info("Documents returned: {}", scoreDocs.length);
// As above, the scorer must target an indexed field; "content" does not exist in the
// geography index, so "title" is used here as an example. The highlighter is prepared
// but not yet applied to the results below.
QueryScorer scorer = new QueryScorer(multiFieldQuery, "title");
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red\">", "</span>");
Highlighter highlighter = new Highlighter(htmlFormatter, scorer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
for (ScoreDoc scoreDoc : scoreDocs) {
// 7. Resolve each ScoreDoc to its stored Document
Document document = indexSearcher.doc(scoreDoc.doc);
JSONObject jsonObject = new JSONObject();
jsonObject.put("id", document.get("id"));
jsonObject.put("scenery", document.get("scenery"));
jsonObject.put("geography", document.get("geography"));
jsonObject.put("link", document.get("link"));
jsonObject.put("title", document.get("title"));
jsonObject.put("author", document.get("author"));
jsonObject.put("authorUrl",document.get("authorUrl"));
jsonObject.put("publishedTime",document.get("publishedTime"));
jsonObject.put("createTime",document.get("createTime"));
result.add(jsonObject.toString());
}
return getResponse().success(result);
} catch (Exception e) {
log.error("Search failed", e);
return getResponse().failure(e.getMessage());
}
}
}
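To make the controller's query flow easier to experiment with, here is a minimal, self-contained sketch of the same index-then-search round trip against an in-memory index. It swaps in Lucene's built-in StandardAnalyzer and RAMDirectory (deprecated in Lucene 8+, fine for a demo) for IKAnalyzer and the on-disk directories; the field names and sample data are illustrative only:

package com.noseparte.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class MultiFieldSearchDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        Directory directory = new RAMDirectory(); // in-memory index, demo only

        // Index two documents in the same stored-field style as the factory above
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
            Document d1 = new Document();
            d1.add(new Field("title", "Lucene full-text search", TextField.TYPE_STORED));
            d1.add(new Field("author", "noseparte", TextField.TYPE_STORED));
            writer.addDocument(d1);

            Document d2 = new Document();
            d2.add(new Field("title", "MongoDB driver basics", TextField.TYPE_STORED));
            d2.add(new Field("author", "lucene fan", TextField.TYPE_STORED));
            writer.addDocument(d2);
        }

        // SHOULD on every field: a match in any one field is enough (OR semantics)
        String[] fields = {"title", "author"};
        BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};
        Query query = MultiFieldQueryParser.parse("lucene", fields, clauses, analyzer);

        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc sd : topDocs.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                System.out.println(doc.get("title") + " / " + doc.get("author"));
            }
        }
    }
}

Running it prints both documents, because the SHOULD clauses turn the two per-field queries into an OR; switching a clause to MUST would require the keyword to appear in that particular field.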