1.Lucene 简介
Apache Lucene是一个基于Java的全文搜索引擎,利用它可以轻易地为Java软件加入全文搜寻功能。Lucene最主要的工作是替文件的每一个字作索引,索引让搜寻的效率比传统的逐字比较大大提高。Lucene提供一组解读、过滤、分析文件,编排和使用索引的API。
2.创建索引(项目里要导入的几个jar包)
package com.java1234.lucene;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds a small demo index: one document per entry of the parallel
 * {@code ids} / {@code citys} / {@code descs} arrays.
 */
public class Indexer {

    // NOTE(review): the city names and descriptions look deliberately altered
    // (leading letters a/b/c/d) -- presumably so that prefix and range query
    // demos have something to match; confirm before "correcting" them.
    private Integer[] ids = {1, 2, 3};
    private String[] citys = {"aingdao", "banjing", "changhai"};
    private String[] descs = {
        "Qingdao is b beautiful city.",
        "Nanjing is c city of culture.",
        "Shanghai is d bustling city."
    };

    /** Directory the index is written to; assigned by {@link #index(String)}. */
    private Directory dir;

    /**
     * Creates an IndexWriter over {@link #dir} configured with a StandardAnalyzer.
     *
     * @return a new writer; the caller is responsible for closing it
     * @throws Exception if the writer cannot be created
     */
    private IndexWriter getWriter() throws Exception {
        Analyzer analyzer = new StandardAnalyzer(); // standard analyzer
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Writes one document per array entry into the index at {@code indexDir}.
     * Each document stores an int "id", a string "city" and a text "desc" field.
     *
     * @param indexDir file-system path of the index directory
     * @throws Exception if the directory or writer cannot be opened, or a document cannot be added
     */
    private void index(String indexDir) throws Exception {
        dir = FSDirectory.open(Paths.get(indexDir));
        // try-with-resources closes the writer even when addDocument throws;
        // the finally block then closes the directory opened above.
        try (IndexWriter writer = getWriter()) {
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new IntField("id", ids[i], Field.Store.YES));
                doc.add(new StringField("city", citys[i], Field.Store.YES));
                doc.add(new TextField("desc", descs[i], Field.Store.YES));
                writer.addDocument(doc); // add the document to the index
            }
        } finally {
            dir.close();
        }
    }
}
3.对内容进行查询,更新,删除操作
package com.java1234.lucene;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
// Query/search example
public class Searcher {
    /**
     * Parses {@code q} against the "desc" field with a StandardAnalyzer, runs the
     * search for the top 10 hits, prints the elapsed time and hit count, then
     * prints the stored "city" value of every returned document.
     *
     * @param indexDir file-system path of the index directory
     * @param q query string in QueryParser syntax
     * @throws Exception if the index cannot be opened, the query cannot be parsed, or the search fails
     */
    public static void search(String indexDir, String q) throws Exception {
        // try-with-resources releases the reader and directory even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher is = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer(); // standard analyzer
            QueryParser parser = new QueryParser("desc", analyzer);
            Query query = parser.parse(q);
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10);
            long end = System.currentTimeMillis();
            System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                System.out.println(doc.get("city"));
            }
        }
    }
}
// Delete, variant 1: delete and commit without forcing a merge
/**
 * Deletes the documents matching the term id:1, commits, and prints
 * maxDoc() and numDocs() so the two counts can be compared.
 *
 * @throws Exception if the writer cannot be opened or any index operation fails
 */
@Test
public void testDeleteBeforeMerge() throws Exception {
    // try-with-resources closes the writer even when a call below throws.
    try (IndexWriter writer = getWriter()) {
        System.out.println("删除前:"+writer.numDocs());
        // deleteDocuments(Term) deletes by term, but per the original note this
        // is not a physical removal yet: the documents are only set aside
        // ("recycle bin").
        // NOTE(review): the original note also says such documents can be
        // restored -- that depends on the Lucene version; verify.
        writer.deleteDocuments(new Term("id","1"));
        writer.commit();
        System.out.println("writer.maxDoc():"+writer.maxDoc()); // expected: max doc count unchanged
        System.out.println("writer.numDocs():"+writer.numDocs()); // expected: live doc count reduced
    }
}
// Delete, variant 2: delete and then force the deletions to be merged away
/**
 * Deletes the documents matching the term id:1, calls forceMergeDeletes(),
 * commits, and prints maxDoc() and numDocs().
 *
 * @throws Exception if the writer cannot be opened or any index operation fails
 */
@Test
public void testDeleteAfterMerge() throws Exception {
    // try-with-resources closes the writer even when a call below throws.
    try (IndexWriter writer = getWriter()) {
        System.out.println("删除前:"+writer.numDocs());
        writer.deleteDocuments(new Term("id","1"));
        // forceMergeDeletes() purges the set-aside ("recycle bin") documents
        // immediately. Per the original note it is not recommended because it is
        // resource-hungry; Lucene reclaims deleted documents on its own over time.
        writer.forceMergeDeletes(); // force the purge
        writer.commit();
        System.out.println("writer.maxDoc():"+writer.maxDoc()); // expected: max doc count reduced
        System.out.println("writer.numDocs():"+writer.numDocs()); // expected: live doc count reduced
    }
}
/**
 * Replaces the document(s) matching the term id:1 with a freshly built
 * document (id "1", city "qingdao", and an unstored "desc" text field).
 *
 * @throws Exception if the writer cannot be opened or the update fails
 */
@Test
public void testUpdate() throws Exception {
    // try-with-resources closes the writer even when updateDocument throws.
    try (IndexWriter writer = getWriter()) {
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        doc.add(new StringField("city","qingdao",Field.Store.YES));
        doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
        // An update is a delete followed by an add. Per the original note, when
        // no document matches the term the new document is simply appended.
        writer.updateDocument(new Term("id","1"), doc);
    }
}
/**
 * Searches for one exact term (the query text is not analyzed) in the
 * "contents" field and prints the stored "fullPath" of each of the top 10 hits.
 *
 * @throws Exception if the search or a document lookup fails
 */
@Test
public void testTermQuery() throws Exception {
    final String field = "contents";
    final String text = "particular";
    Query termQuery = new TermQuery(new Term(field, text));
    TopDocs topDocs = is.search(termQuery, 10);
    System.out.println("匹配 '"+text+"',总共查询到"+topDocs.totalHits+"个文档");
    ScoreDoc[] matches = topDocs.scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        Document hit = is.doc(matches[i].doc);
        System.out.println(hit.get("fullPath"));
    }
}
4.Lucene4种方式搜索
/**
 * Term range search: matches documents whose "desc" field has a term between
 * "b" and "c" (both bounds inclusive) and prints id, city and desc of the
 * top 10 hits.
 *
 * @throws Exception if the search or a document lookup fails
 */
@Test
public void testTermRangeQuery() throws Exception {
    // BytesRef(CharSequence) encodes the bounds as UTF-8, so they no longer
    // depend on the platform default charset as String.getBytes() does.
    TermRangeQuery query = new TermRangeQuery("desc", new BytesRef("b"), new BytesRef("c"), true, true);
    TopDocs hits = is.search(query, 10);
    for (ScoreDoc scoreDoc : hits.scoreDocs) {
        Document doc = is.doc(scoreDoc.doc);
        System.out.println(doc.get("id"));
        System.out.println(doc.get("city"));
        System.out.println(doc.get("desc"));
    }
}
/**
 * Numeric range search: matches documents whose int "id" lies in [1, 2]
 * and prints id, city and desc of the top 10 hits.
 *
 * @throws Exception if the search or a document lookup fails
 */
@Test
public void testNumericRangeQuery() throws Exception {
    final int lowest = 1;
    final int highest = 2;
    NumericRangeQuery<Integer> idRange =
            NumericRangeQuery.newIntRange("id", lowest, highest, true, true);
    ScoreDoc[] matches = is.search(idRange, 10).scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        Document hit = is.doc(matches[i].doc);
        System.out.println(hit.get("id"));
        System.out.println(hit.get("city"));
        System.out.println(hit.get("desc"));
    }
}
/**
 * Prefix search: matches documents whose "city" term starts with "a"
 * and prints id, city and desc of the top 10 hits.
 *
 * @throws Exception if the search or a document lookup fails
 */
@Test
public void testPrefixQuery() throws Exception {
    Term cityPrefix = new Term("city","a");
    PrefixQuery byPrefix = new PrefixQuery(cityPrefix);
    TopDocs topDocs = is.search(byPrefix, 10);
    ScoreDoc[] matches = topDocs.scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        Document hit = is.doc(matches[i].doc);
        System.out.println(hit.get("id"));
        System.out.println(hit.get("city"));
        System.out.println(hit.get("desc"));
    }
}
/**
 * Combined search: a document must match both the int "id" range [1, 2]
 * and the "city" prefix "a". Prints id, city and desc of the top 10 hits.
 *
 * @throws Exception if the search or a document lookup fails
 */
@Test
public void testBooleanQuery() throws Exception {
    NumericRangeQuery<Integer> idRange = NumericRangeQuery.newIntRange("id", 1, 2, true, true);
    PrefixQuery cityPrefix = new PrefixQuery(new Term("city","a"));
    // Both clauses are MUST, i.e. they are AND-ed together.
    BooleanQuery.Builder both = new BooleanQuery.Builder()
            .add(idRange, BooleanClause.Occur.MUST)
            .add(cityPrefix, BooleanClause.Occur.MUST);
    ScoreDoc[] matches = is.search(both.build(), 10).scoreDocs;
    for (int i = 0; i < matches.length; i++) {
        Document hit = is.doc(matches[i].doc);
        System.out.println(hit.get("id"));
        System.out.println(hit.get("city"));
        System.out.println(hit.get("desc"));
    }
}
// Searching several fields with several values at once
/**
 * Searches by resource type and keyword: the keyword is matched against the
 * "title" field and the type against the "type" field, and both must match.
 *
 * @param type resource type; per the original note its value is "news" or "product"
 * @param keyword the search keyword
 * @return the Hits for the query, or null when either argument is null or empty,
 *         or when an exception occurs (its stack trace is printed)
 */
public Hits executeSearch(String type,String keyword)
{
    boolean typeMissing = type == null || type.equals("");
    boolean keywordMissing = keyword == null || keyword.equals("");
    if (typeMissing || keywordMissing)
    {
        return null;
    }

    Hits found = null;
    try
    {
        // Field names and the value to look for in each, position by position.
        String[] fields = {"title","type"};
        String[] values = {keyword, type};
        // One Occur flag per field; MUST on both means the conditions are AND-ed.
        BooleanClause.Occur[] occurs = {BooleanClause.Occur.MUST, BooleanClause.Occur.MUST};
        ChineseAnalyzer analyzer = new ChineseAnalyzer();
        // MultiFieldQueryParser combines the per-field conditions into one query.
        Query combined = MultiFieldQueryParser.parse(values, fields, occurs, analyzer);
        // c:/index is the directory holding the index files.
        IndexSearcher searcher = new IndexSearcher("c:/index");
        found = searcher.search(combined);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    return found;
}
//这里需要注意的就是BooleanClause.Occur[]数组,它表示多个条件之间的关系,BooleanClause.Occur.MUST表示and,BooleanClause.Occur.MUST_NOT表示not,BooleanClause.Occur.SHOULD表示or.
5.Lucene中文分词器以及高亮显示
/**
 * Parses {@code q} against the "desc" field with a SmartChineseAnalyzer, runs
 * the search for the top 10 hits, prints the elapsed time and hit count, and
 * for every hit prints the stored "city" and "desc" values followed by the
 * best "desc" fragment with matched terms wrapped in bold red HTML markup.
 *
 * @param indexDir file-system path of the index directory
 * @param q query string in QueryParser syntax
 * @throws Exception if the index cannot be opened, the query cannot be parsed,
 *         or searching/highlighting fails
 */
public static void search(String indexDir,String q)throws Exception{
    // try-with-resources releases the analyzer, reader and directory even when
    // parsing, searching or highlighting throws.
    try (Directory dir = FSDirectory.open(Paths.get(indexDir));
         IndexReader reader = DirectoryReader.open(dir);
         SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {
        IndexSearcher is = new IndexSearcher(reader);
        QueryParser parser = new QueryParser("desc", analyzer);
        Query query = parser.parse(q);
        long start = System.currentTimeMillis();
        TopDocs hits = is.search(query, 10);
        long end = System.currentTimeMillis();
        System.out.println("匹配 "+q+" ,总共花费"+(end-start)+"毫秒"+"查询到"+hits.totalHits+"个记录");

        // Highlighter setup: score fragments by the query and wrap matches in markup.
        QueryScorer scorer = new QueryScorer(query);
        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
        highlighter.setTextFragmenter(fragmenter);

        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);
            String desc = doc.get("desc"); // fetched once, used for printing and highlighting
            System.out.println(doc.get("city"));
            System.out.println(desc);
            if (desc != null) {
                TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
                System.out.println(highlighter.getBestFragment(tokenStream, desc));
            }
        }
    }
}
7.实现分页的两种方式
lucene3.5之前分页提供的方式为再查询方式(每次查询全部记录,然后取其中部分记录,这种方式用的最多),lucene官方的解释:由于我们的速度足够快。但处理海量数据时,这种方式容易内存溢出。
lucene3.5以后提供一个searchAfter,这个是在特大数据量(亿级数据量)时采用,速度相对慢一点,像google搜索图片的时候,点击更多,然后再出来一批。这种方式就是把数据保存在缓存里面,然后再去取。
/**
 * Re-query pagination: runs the query for the top {@code pageIndex*pageSize}
 * hits and prints only the slice belonging to the requested page.
 * Note: every hit up to the end of the requested page is held in memory, so
 * deep pages over very large result sets can exhaust memory.
 * Nothing is printed when the page lies beyond the available hits or when
 * {@code pageIndex}/{@code pageSize} describe an empty or negative range.
 * Parse and I/O errors are printed, not propagated.
 *
 * @param query query string, parsed against the "desc" field
 * @param pageIndex 1-based page number
 * @param pageSize number of hits per page
 */
public void searchPage(String query,int pageIndex,int pageSize) {
    // try-with-resources closes the searcher even when parsing or searching throws.
    try (IndexSearcher searcher = getSearcher(FileIndexUtils.getDirectory())) {
        QueryParser parser = new QueryParser(Version.LUCENE_35,"desc",new StandardAnalyzer(Version.LUCENE_35));
        Query q = parser.parse(query);
        int start = (pageIndex-1)*pageSize;
        int end = pageIndex*pageSize;
        if (start < 0 || end <= start) {
            return; // no valid page to show
        }
        // Fetch just enough hits to cover the requested page (previously a fixed
        // 500, which made every page past the 500th hit unreachable).
        TopDocs tds = searcher.search(q, end);
        ScoreDoc[] sds = tds.scoreDocs;
        // Clamp to the hits actually returned so a short last page (or a page
        // past the end) does not index out of bounds.
        int last = Math.min(end, sds.length);
        for (int i = start; i < last; i++) {
            Document doc = searcher.doc(sds[i].doc);
            System.out.println(sds[i].doc+":"+doc.get("path")+"-->"+doc.get("filename"));
        }
    } catch (org.apache.lucene.queryParser.ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Returns the last ScoreDoc of the page preceding {@code pageIndex}, for use
 * as the "after" anchor of a searchAfter call.
 *
 * @param pageIndex 1-based page number being requested
 * @param pageSize number of hits per page
 * @param query the query being paged
 * @param searcher searcher to run the query with
 * @return null for the first page or when the query has no hits; otherwise the
 *         ScoreDoc at position {@code pageSize*(pageIndex-1)}, or the last
 *         available hit when the query has fewer hits than that
 * @throws IOException if the search fails
 */
private ScoreDoc getLastScoreDoc(int pageIndex,int pageSize,Query query,IndexSearcher searcher) throws IOException {
    if (pageIndex == 1) {
        return null; // the first page has no predecessor
    }
    int num = pageSize*(pageIndex-1); // number of hits on all previous pages
    ScoreDoc[] previous = searcher.search(query, num).scoreDocs;
    if (previous.length == 0) {
        return null; // no hits at all
    }
    // With fewer hits than num, anchor on the last real hit instead of
    // indexing past the end of the array.
    return previous[Math.min(num, previous.length) - 1];
}
/**
 * searchAfter pagination: looks up the last hit of the previous page and asks
 * the searcher for the next {@code pageSize} hits after it, printing each one.
 * Parse and I/O errors are printed, not propagated.
 *
 * @param query query string, parsed against the "desc" field
 * @param pageIndex 1-based page number
 * @param pageSize number of hits per page
 */
public void searchPageByAfter(String query,int pageIndex,int pageSize) {
    // try-with-resources closes the searcher even when parsing or searching throws.
    try (IndexSearcher searcher = getSearcher(FileIndexUtils.getDirectory())) {
        QueryParser parser = new QueryParser(Version.LUCENE_35,"desc",new StandardAnalyzer(Version.LUCENE_35));
        Query q = parser.parse(query);
        // Last hit of the previous page (null for the first page).
        ScoreDoc lastSd = getLastScoreDoc(pageIndex, pageSize, q, searcher);
        // Continue the search from that hit to get the requested page.
        TopDocs tds = searcher.searchAfter(lastSd,q, pageSize);
        for (ScoreDoc sd : tds.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(sd.doc+":"+doc.get("path")+"-->"+doc.get("filename"));
        }
    } catch (org.apache.lucene.queryParser.ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
热门评论
厉害了厉害了厉害了厉害了