共计 17594 个字符,预计需要花费 44 分钟才能阅读完成。
基础知识解析:
索引(Index):在 Lucene 中,索引是一个包含文档(Document)的数据结构,类似于 MySQL 中的表。Lucene 将文档中的字段进行索引,以便后续进行高效的搜索。每个索引包含多个文档,而每个文档可以包含多个字段。文档(Document):在 Lucene 中,文档是要进行索引和搜索的基本单位,类似于 MySQL 中的一行数据。文档可以包含多个字段,每个字段都可以包含一个或多个值。文档可以是一个包含文本数据的 Java 对象,例如一篇文章或一条记录。字段(Field):在 Lucene 中,字段是文档中的一个属性或值,类似于 MySQL 中的列。字段可以包含不同类型的值,例如文本、数字、日期等。每个字段都可以被索引和搜索。
在创建索引时需指定打开模式(OpenMode)的参数:
IndexWriterConfig.OpenMode.CREATE: 无论索引目录中是否已存在索引,都会创建一个新的索引;如果索引目录中已经存在索引,则删除现有的索引并创建一个新的索引。这个模式会完全清空现有的索引,然后从头开始构建新的索引。IndexWriterConfig.OpenMode.APPEND: 要求索引目录中已经存在索引(不存在则打开失败),在现有的索引上追加新的索引。这个模式会在现有索引的基础上增量地添加新的文档和更新已有的文档,而不会清空现有的索引。IndexWriterConfig.OpenMode.CREATE_OR_APPEND: 如果索引目录中不存在索引,则创建一个新的索引;如果索引目录中已经存在索引,则在现有的索引上追加。这个模式会根据索引目录中是否存在索引来决定是创建新的索引还是在现有的索引上追加
创建字段时需指定的类型:
StringField:用于存储不需要进行分词的文本数据,适用于关键字、标识符等需要精确匹配的情况。TextField:用于存储需要进行分词的文本数据,适用于文章内容、描述等需要进行全文搜索的情况。SortedDocValuesField:用于存储需要进行排序的文本数据,适用于日期、价格等需要进行范围查询或排序的情况。BinaryDocValuesField:用于存储二进制数据,例如图片、音频等。NumericDocValuesField:用于存储数值型数据,例如整数、浮点数等,适用于数值范围查询或排序的情况。IntPoint、FloatPoint、LongPoint、DoublePoint:分别用于存储整数、浮点数、长整数、双精度浮点数类型的数据,用于进行数值范围查询。StringField、TextField(带排序):这些字段类型在 StringField 和 TextField 的基础上添加了排序功能,适用于需要进行排序的文本数据。SortedNumericDocValuesField:用于存储多值数值型数据,例如多个数值型数据组成的数组或集合。SortedSetDocValuesField:用于存储多值文本数据,例如多个字符串组成的集合。LatLonPoint:用于存储地理位置信息,包括经度和纬度,用于进行地理位置范围查询。DatePoint:用于存储日期信息,包括年、月、日,用于进行日期范围查询。BinaryPoint:用于存储二进制数据,例如 IPv4 地址、UUID 等。StringField、TextField(带向量):这些字段类型在 StringField 和 TextField 的基础上添加了向量存储功能,用于进行文本向量检索。
以下为集成例子:
POM 文件导入相关依赖包:项目的 springboot 是 2.2.2 版本
<!-- lucene 核心库 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>7.6.0</version>
</dependency>
<!-- Lucene 的查询解析器 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>7.6.0</version>
</dependency>
<!-- lucene 的默认分词器库 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>7.6.0</version>
</dependency>
<!-- lucene 的高亮显示 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>7.6.0</version>
</dependency>
<!-- ik 分词器 -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
若出现以下问题,有可能是 jar 包版本不兼容问题,需自行重新实现分词器
org.springframework.web.util.NestedServletException: Handler dispatch failed; nested exception is java.lang.AbstractMethodError: org.apache.lucene.analysis.Analyzer.createComponents(Ljava/lang/String;)Lorg/apache/lucene/analysis/Analyzer$TokenStreamComponents;
如下:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
public class MyIKAnalyzer extends Analyzer {

    /** false = fine-grained segmentation (default); true = smart segmentation. */
    private boolean useSmart;

    /** Creates an analyzer using fine-grained (non-smart) IK segmentation. */
    public MyIKAnalyzer() {
        this(false);
    }

    /** Creates an analyzer with the given segmentation mode. */
    public MyIKAnalyzer(boolean useSmart) {
        this.useSmart = useSmart;
    }

    public boolean useSmart() {
        return this.useSmart;
    }

    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Lucene hook that builds the token-stream pipeline for a field.
     * Implementing the single-argument variant is what resolves the
     * AbstractMethodError mentioned above, which older IK builds trigger
     * by only implementing the removed two-argument signature.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MyIKTokenizer(useSmart());
        return new TokenStreamComponents(tokenizer);
    }
}
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
public class MyIKTokenizer extends Tokenizer {
private IKSegmenter _IKImplement;
private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = (OffsetAttribute)this.addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = (TypeAttribute)this.addAttribute(TypeAttribute.class);
private int endPosition;
//useSmart:设置是否应用智能分词。默认为 false,应用细粒度分词,这里如果更改为 TRUE,那么搜寻到的后果可能就少的很多
public MyIKTokenizer(boolean useSmart) {this._IKImplement = new IKSegmenter(this.input, useSmart);
}
public final boolean incrementToken() throws IOException {this.clearAttributes();
Lexeme nextLexeme = this._IKImplement.next();
if (nextLexeme != null) {this.termAtt.append(nextLexeme.getLexemeText());
this.termAtt.setLength(nextLexeme.getLength());
this.offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
this.endPosition = nextLexeme.getEndPosition();
this.typeAtt.setType(nextLexeme.getLexemeTypeString());
return true;
} else {return false;}
}
public void reset() throws IOException {super.reset();
this._IKImplement.reset(this.input);
}
public final void end() {int finalOffset = this.correctOffset(this.endPosition);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
}
实体类对象:
/**
* 政策库
*/
@Data
@TableName(value = "policy_info")
public class PolicyInfo extends AbstractSimpleEntity {@TableId(type = IdType.AUTO)
private Long id;
@ApiModelProperty("创建人")
private Long createdBy;
@ApiModelProperty("更新人")
private Long updatedBy;
@ApiModelProperty("政策名称")
private String name;
@ApiModelProperty("文号")
private String symbol;
@ApiModelProperty("政策标签")
private String label;
@ApiModelProperty("政策级别:DISTRICT= 区级,CITY= 市级,PROVINCIAL= 省级,NATIONAL= 国家级")
private PolicyLevelEnum level;
@ApiModelProperty("政策发文工夫")
private Date publicationTime;
@ApiModelProperty("实用园区")
private String applicablePark;
@ApiModelProperty("重点摘要")
private String keySummary;
@ApiModelProperty("详情 JSON 动静展现")
private String detailJson;
@ApiModelProperty("政策编码:zc+ 日期 +00001")
private String number;
@ApiModelProperty("公布工夫")
private Date releaseTime;
@ApiModelProperty("底图图片 url")
private String baseMapUrl;
@ApiModelProperty("公布状态: 已公布, 未公布")
private ReleaseStatusEnum releaseStatus;
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@JsonFormat(timezone = "GMT+8",pattern = "yyyy-MM-dd HH:mm:ss")
public Date createTime;
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@JsonFormat(timezone = "GMT+8",pattern = "yyyy-MM-dd HH:mm:ss")
public Date updateTime;
测试类相关操作
import cn.xxx.DateUtils;
import cn.xxx.AnnotationUtil;
import cn.xxx.configuration.MyIKAnalyzer;
import cn.xxx.AreaTypeEnum;
import cn.xxx.PolicyLevelEnum;
import cn.xxx.PolicyInfo;
import cn.xxx.PolicyInfoMapper;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
@Slf4j
@RunWith(SpringRunner.class)
@SpringBootTest
public class BlogLuceneManagerTest {

    /** Filesystem location of the Lucene index. */
    private static final String INDEX_DIR = "d:\\indexDir";

    /** Date pattern used for every date value stored in the index. */
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    @Autowired
    private PolicyInfoMapper policyInfoMapper;

    /** Analyzed fields targeted by the multi-field keyword query and by highlighting. */
    private String[] query = {"name", "label", "publicationTime", "applicablePark", "keySummary"};

    /**
     * Rebuilds the full-text index from every row of policy_info.
     * OpenMode.CREATE wipes any existing index first (full re-index);
     * OpenMode.APPEND would add to the existing index instead.
     *
     * @throws Exception on mapper or index I/O failure
     */
    @Test
    public void addIndex() throws Exception {
        List<PolicyInfo> policyInfos = policyInfoMapper.selectList(null);
        Collection<Document> docs = new ArrayList<>(policyInfos.size());
        for (PolicyInfo policyInfo : policyInfos) {
            docs.add(addDocument(policyInfo));
        }
        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_DIR));
        // IK analyzer; passing the analyzer to IndexWriterConfig is the fix for the
        // AbstractMethodError described earlier in the article.
        IndexWriterConfig conf = new IndexWriterConfig(new MyIKAnalyzer());
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        // try-with-resources releases the index write lock even if addDocuments fails
        // (the original only closed the writer on the happy path).
        try (IndexWriter indexWriter = new IndexWriter(directory, conf)) {
            indexWriter.addDocuments(docs);
            indexWriter.commit();
        }
    }

    /**
     * Maps a PolicyInfo entity to a Lucene Document.
     * StringField is indexed as a single token (exact match, not analyzed);
     * TextField is analyzed/segmented and therefore full-text searchable.
     *
     * @param policyInfo source entity; must have a non-null id
     * @return the populated document
     */
    public Document addDocument(PolicyInfo policyInfo) {
        Document document = new Document();
        // BUG FIX: in the original this add() was fused into a // comment, so no
        // document carried an "id" field and update()'s
        // updateDocument(new Term("id", ...)) could never match an existing doc.
        document.add(new StringField("id", policyInfo.getId() + "", Field.Store.YES));
        if (null != policyInfo.getCreatedBy()) {
            document.add(new StringField("createdBy", policyInfo.getCreatedBy() + "", Field.Store.YES));
        }
        if (null != policyInfo.getUpdatedBy()) {
            document.add(new StringField("updatedBy", policyInfo.getUpdatedBy() + "", Field.Store.YES));
        }
        addText(document, "createTime", formatOrNull(policyInfo.getCreateTime() == null
                ? null : DateUtils.formatDate(policyInfo.getCreateTime(), DATE_PATTERN)));
        addText(document, "updateTime", formatOrNull(policyInfo.getUpdateTime() == null
                ? null : DateUtils.formatDate(policyInfo.getUpdateTime(), DATE_PATTERN)));
        addText(document, "name", policyInfo.getName());
        addText(document, "symbol", policyInfo.getSymbol());
        addText(document, "label", policyInfo.getLabel());
        // Null-guard added: the original dereferenced getLevel()/getReleaseStatus()
        // unconditionally and would NPE on rows without those enums.
        if (null != policyInfo.getLevel()) {
            addKeyword(document, "level", policyInfo.getLevel().getValue());
        }
        String publicationTime = policyInfo.getPublicationTime() == null
                ? null : DateUtils.formatDate(policyInfo.getPublicationTime(), DATE_PATTERN);
        if (StringUtils.isNotEmpty(publicationTime)) {
            // analyzed copy for keyword search ...
            document.add(new TextField("publicationTime", publicationTime, Field.Store.YES));
            // ... and a doc-values copy for sorting
            document.add(new SortedDocValuesField("publicationTime_sort", new BytesRef(publicationTime)));
        }
        addText(document, "applicablePark", policyInfo.getApplicablePark());
        addText(document, "keySummary", policyInfo.getKeySummary());
        addText(document, "detailJson", policyInfo.getDetailJson());
        addText(document, "number", policyInfo.getNumber());
        addText(document, "releaseTime", policyInfo.getReleaseTime() == null
                ? null : DateUtils.formatDate(policyInfo.getReleaseTime(), DATE_PATTERN));
        addKeyword(document, "baseMapUrl", policyInfo.getBaseMapUrl());
        if (null != policyInfo.getReleaseStatus()) {
            addKeyword(document, "releaseStatus", policyInfo.getReleaseStatus().getValue());
        }
        return document;
    }

    /** Identity pass-through kept for readability of the call sites above. */
    private static String formatOrNull(String formatted) {
        return formatted;
    }

    /** Adds an analyzed TextField when the value is non-empty. */
    private static void addText(Document document, String field, String value) {
        if (StringUtils.isNotEmpty(value)) {
            document.add(new TextField(field, value, Field.Store.YES));
        }
    }

    /** Adds a non-analyzed StringField (exact-match keyword) when the value is non-empty. */
    private static void addKeyword(Document document, String field, String value) {
        if (StringUtils.isNotEmpty(value)) {
            document.add(new StringField(field, value, Field.Store.YES));
        }
    }

    /**
     * Single-field search: queries the analyzed "name" field with IK segmentation.
     *
     * @throws IOException    on index access failure
     * @throws ParseException if the query text cannot be parsed
     */
    @Test
    public void searchTextBytText() throws IOException, ParseException {
        String text = "对于苏州告诉";
        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_DIR));
        // try-with-resources closes the reader (the original leaked it)
        try (IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // parser args: default field to search, analyzer (must match index-time analyzer)
            QueryParser parser = new QueryParser("name", new MyIKAnalyzer());
            Query nameQuery = parser.parse(text);
            // top 10 hits only
            TopDocs topDocs = searcher.search(nameQuery, 10);
            System.out.println("本次搜寻共找到" + topDocs.totalHits + "条数据");
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // resolve the internal doc id back to the stored document
                Document doc = reader.document(scoreDoc.doc);
                System.out.println(doc);
            }
        }
    }

    /**
     * Multi-field, segmented, paged search with highlighting.
     * Filters by policy level (derived from the area type), optionally ANDs in a
     * multi-field keyword query, and sorts by publicationTime_sort descending.
     *
     * @throws IOException                  on index access failure
     * @throws ParseException               if the keyword cannot be parsed
     * @throws InvalidTokenOffsetsException if highlighting hits inconsistent offsets
     */
    @Test
    public void searchTextByKeyWordToPage() throws IOException, ParseException, InvalidTokenOffsetsException {
        String keyWord = null;
        AreaTypeEnum areaTypeEnum = AreaTypeEnum.LOCAL_POLICY;
        List<String> levels = new ArrayList<>();
        if (AreaTypeEnum.LOCAL_POLICY.equals(areaTypeEnum)) {
            levels.add(PolicyLevelEnum.DISTRICT.getValue());
            levels.add(PolicyLevelEnum.CITY.getValue());
            levels.add(PolicyLevelEnum.PROVINCIAL.getValue());
        } else {
            levels.add(PolicyLevelEnum.NATIONAL.getValue());
        }
        int page = 1;
        int pageSize = 10;
        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_DIR));
        try (IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            BooleanQuery.Builder finalQueryBuilder = new BooleanQuery.Builder();
            // filter clause: any of the allowed levels (SHOULD inside an OR-group,
            // the group itself ANDed into the final query)
            BooleanQuery.Builder levelBuilder = new BooleanQuery.Builder();
            for (String level : levels) {
                levelBuilder.add(new TermQuery(new Term("level", level)), BooleanClause.Occur.SHOULD);
            }
            finalQueryBuilder.add(levelBuilder.build(), BooleanClause.Occur.MUST);
            // keyword clause: segmented query across all analyzed fields
            if (StringUtils.isNotEmpty(keyWord)) {
                MultiFieldQueryParser parser = new MultiFieldQueryParser(query, new MyIKAnalyzer());
                // local renamed from "query" to avoid shadowing the field of the same name
                Query keyWordQuery = parser.parse(keyWord);
                finalQueryBuilder.add(keyWordQuery, BooleanClause.Occur.MUST);
            }
            BooleanQuery finalQuery = finalQueryBuilder.build();
            // SortField args: doc-values field name, type, reverse=true => descending
            Sort sort = new Sort(new SortField("publicationTime_sort", SortField.Type.STRING_VAL, true));
            TopDocs topDocs = searchByPage(page, pageSize, searcher, finalQuery, sort);
            // highlighting: wrap matched terms in a red span, fragments capped at 100 chars
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(finalQuery));
            Fragmenter fragmenter = new SimpleFragmenter(100);
            highlighter.setTextFragmenter(fragmenter);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            System.out.println("本次搜寻共找到" + scoreDocs.length + "条数据");
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document doc = reader.document(scoreDoc.doc);
                System.out.println("====================================================================================");
                System.out.println("关键字搜寻:" + keyWord);
                // print every entity field, highlighted when it is one of the searchable fields
                java.lang.reflect.Field[] allFields = AnnotationUtil.getAllFields(PolicyInfo.class);
                for (java.lang.reflect.Field allField : allFields) {
                    String fieldName = allField.getName();
                    String value = getDoc(highlighter, doc, fieldName);
                    System.out.println("[" + fieldName + "]:" + value);
                }
            }
        }
    }

    /**
     * Paging helper based on searchAfter: collects the last ScoreDoc of the
     * previous page, then asks for the next pageSize docs after it.
     * NOTE: the pre-search re-collects all preceding pages, so cost grows with
     * pageNum; acceptable for shallow paging.
     */
    private TopDocs searchByPage(Integer pageNum, Integer pageSize, IndexSearcher searcher, Query query, Sort sort) throws IOException {
        ScoreDoc before = null;
        if (pageNum > 1) {
            TopDocs docsBefore = searcher.search(query, (pageNum - 1) * pageSize, sort);
            ScoreDoc[] scoreDocs = docsBefore.scoreDocs;
            if (scoreDocs.length > 0) {
                before = scoreDocs[scoreDocs.length - 1];
            }
        }
        return searcher.searchAfter(before, query, pageSize, sort);
    }

    /**
     * Returns the stored value of fieldName, replaced by its highlighted fragment
     * when the field is one of the searchable fields and a fragment was produced.
     *
     * @param highlighter highlighter configured with the current query
     * @param doc         the stored document
     * @param fieldName   entity field name to look up
     * @return highlighted or raw stored value (may be null when not stored)
     * @throws IOException                  on analyzer I/O failure
     * @throws InvalidTokenOffsetsException on inconsistent token offsets
     */
    public String getDoc(Highlighter highlighter,
                         Document doc,
                         String fieldName
    ) throws IOException, InvalidTokenOffsetsException {
        String value = doc.get(fieldName);
        for (String key : query) {
            if (key.equals(fieldName) && StringUtils.isNotEmpty(value)) {
                String bestFragment = highlighter.getBestFragment(new MyIKAnalyzer(), fieldName, value);
                if (StringUtils.isNotEmpty(bestFragment)) {
                    value = bestFragment;
                }
            }
        }
        return value;
    }

    /**
     * Updates one row in the database and replaces its document in the index.
     * Relies on the "id" StringField written by addDocument (see BUG FIX there):
     * updateDocument deletes docs matching the term, then adds the new document.
     *
     * @throws IOException on index access failure
     */
    public void update() throws IOException {
        Long id = 1L;
        PolicyInfo policyInfo = policyInfoMapper.selectByPrimaryKey(id);
        policyInfo.setName("哈哈广州市黄埔区商务局 广州开发区商务局对于印发广州市黄埔区 广州开发区 广州高新区促成商贸企业高质量倒退搀扶措施实施细则的告诉");
        policyInfoMapper.updateById(policyInfo);
        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(INDEX_DIR));
        // default OpenMode (CREATE_OR_APPEND) is correct for an in-place update
        IndexWriterConfig conf = new IndexWriterConfig(new MyIKAnalyzer());
        try (IndexWriter writer = new IndexWriter(directory, conf)) {
            Document document = addDocument(policyInfo);
            writer.updateDocument(new Term("id", id + ""), document);
            writer.commit();
        }
    }
}
正文完