LuceneFactoryTest.java

import org.apache.commons.lang.math.RandomUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.dbsyncer.common.model.Paging;
import org.dbsyncer.storage.lucene.Shard;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
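// Extra imports used only by the hedged sketches added further down in this file
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import java.time.Instant;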
import java.util.List;
import java.util.Map;

public class LuceneFactoryTest {

    private Shard shard;

    @Before
    public void setUp() throws IOException {
        shard = new Shard("target/indexDir/");
    }

    @After
    public void tearDown() throws IOException {
        shard.close();
    }
    @Test
    public void testQuery() throws IOException {
        int size = 3;
        for (int i = size; i > 0; i--) {
            Document doc = new Document();
            doc.add(new StringField("id", String.valueOf(i), Field.Store.YES));
            doc.add(new StringField("name", "中文" + i, Field.Store.YES));
            doc.add(new TextField("content", "这是一串很长长长长长长长的文本", Field.Store.YES));
            // Indexed point field, used for range/set/exact queries
            int age = RandomUtils.nextInt(50);
            doc.add(new IntPoint("age", age));
            // Stored separately so the value can be retrieved
            doc.add(new StoredField("age", age));
            // DocValues field, required for sorting
            doc.add(new NumericDocValuesField("age", age));
            System.out.println(String.format("id=%s, age=%s", i, age));
            // 2020-05-23 12:00:00 (UTC+8), offset by i so the sort order is deterministic
            long createTime = 1590206400000L + i;
            doc.add(new LongPoint("createTime", createTime));
            doc.add(new StoredField("createTime", createTime));
            doc.add(new NumericDocValuesField("createTime", createTime));
            shard.insert(doc);
        }
        // Point queries come in three flavors (a sketch combining them follows this test):
        //   range: IntPoint.newRangeQuery("id", 1, 100)
        //   set:   IntPoint.newSetQuery("id", 2, 3)
        //   exact: IntPoint.newExactQuery("id", 3)
        BooleanQuery query = new BooleanQuery.Builder()
                .add(IntPoint.newRangeQuery("age", 1, 100), BooleanClause.Occur.MUST)
                .build();
        Paging paging = shard.query(query, new Sort(new SortField("createTime", SortField.Type.LONG, true)));
        paging.getData().forEach(m -> System.out.println(m));
        // Clear the index
        shard.deleteAll();
    }
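    /**
     * A minimal sketch of the set and exact point queries mentioned in the
     * comments of testQuery above, run against the same Shard helper. The
     * field name "age" and the sample values are illustrative assumptions.
     */
    @Test
    public void pointQuerySketch() throws IOException {
        // Matches documents whose age is exactly 18
        Query exact = IntPoint.newExactQuery("age", 18);
        shard.query(exact);
        // Matches documents whose age is any of 18, 25 or 30
        Query set = IntPoint.newSetQuery("age", 18, 25, 30);
        shard.query(set);
    }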
    @Test
    public void testCRUD() throws IOException {
        System.out.println("Before:");
        List<Map> maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Create
        Document doc = new Document();
        String id = "100";
        doc.add(new StringField("id", id, Field.Store.YES));
        doc.add(new TextField("content", "这是一款大规模数据处理软件,名字叫做Apache Spark", Field.Store.YES));
        shard.insert(doc);
        System.out.println("After insert:");
        maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Update: drop the old content field first, otherwise the document
        // ends up carrying two "content" values after the update
        doc.removeField("content");
        doc.add(new TextField("content", "这是一款大规模数据处理软件,名字叫做Apache Spark[已修改]", Field.Store.YES));
        shard.update(new Term("id", id), doc);
        System.out.println("After update:");
        maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Delete
        shard.delete(new Term("id", id));
        System.out.println("After delete:");
        maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Clear the index
        shard.deleteAll();
    }
    @Test
    public void fmtDate() {
        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        LocalDateTime localDateTime = LocalDateTime.parse("2020-05-23 12:00:00", formatter);
        long timeStamp = localDateTime.toInstant(ZoneOffset.ofHours(8)).toEpochMilli();
        System.out.println(timeStamp);
    }
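    /**
     * A minimal sketch of the reverse conversion, turning the epoch
     * milliseconds produced by fmtDate back into a LocalDateTime at the
     * same UTC+8 offset; the constant is the value fmtDate prints.
     */
    @Test
    public void fmtDateSketch() {
        LocalDateTime localDateTime = LocalDateTime.ofInstant(Instant.ofEpochMilli(1590206400000L), ZoneOffset.ofHours(8));
        // Expected output: 2020-05-23T12:00
        System.out.println(localDateTime);
    }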
    /**
     * Term search.
     * <p>
     * TermQuery is the simplest and most commonly used Query. It can be read as
     * "search by term": the most basic operation in a search engine is looking
     * up a single term in the index, and TermQuery is what does that job.
     * In Lucene the term is the basic unit of search; essentially it is a
     * name/value pair, where the "name" is a field name and the "value" is a
     * keyword that the field contains.
     *
     * @throws IOException
     */
    @Test
    public void termQueryTest() throws IOException {
        String searchField = "title";
        // A single-condition query
        TermQuery query = new TermQuery(new Term(searchField, "Spark"));
        // Run the query and print the matching records
        shard.query(query);
    }
    /**
     * Compound search.
     * <p>
     * BooleanQuery is another Query that comes up constantly in real projects.
     * It is a composite query: you add arbitrary Query objects to it and declare
     * the logical relationship between them. BooleanQuery itself is a container
     * of boolean clauses, with a dedicated API for adding clauses and marking
     * how they relate, as the code below shows.
     *
     * @throws IOException
     */
    @Test
    public void booleanQueryTest() throws IOException {
        String searchField1 = "title";
        String searchField2 = "content";
        Query query1 = new TermQuery(new Term(searchField1, "Spark"));
        Query query2 = new TermQuery(new Term(searchField2, "Apache"));
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        // BooleanClause expresses how a clause relates to the overall query:
        //   BooleanClause.Occur.MUST      - must match
        //   BooleanClause.Occur.MUST_NOT  - must not match
        //   BooleanClause.Occur.SHOULD    - may match
        // which gives six meaningful combinations (a sketch of combination 2
        // follows this test):
        //   1. MUST + MUST:         the intersection of the two clauses' results.
        //   2. MUST + MUST_NOT:     results must not contain anything matched by the MUST_NOT clause.
        //   3. SHOULD + MUST_NOT:   behaves the same as MUST + MUST_NOT.
        //   4. SHOULD + MUST:       result is the MUST clause's matches; SHOULD only influences scoring.
        //   5. SHOULD + SHOULD:     logical OR, the union of all clauses' matches.
        //   6. MUST_NOT + MUST_NOT: meaningless, matches nothing.
        builder.add(query1, BooleanClause.Occur.SHOULD);
        builder.add(query2, BooleanClause.Occur.SHOULD);
        BooleanQuery query = builder.build();
        // Run the query and print the matching records
        shard.query(query);
    }
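    /**
     * A minimal sketch of combination 2 above (MUST + MUST_NOT): documents must
     * contain "Spark" in the title and must not contain "Hadoop" in the content.
     * The field names and terms are illustrative assumptions.
     */
    @Test
    public void booleanMustNotSketch() throws IOException {
        BooleanQuery query = new BooleanQuery.Builder()
                .add(new TermQuery(new Term("title", "Spark")), BooleanClause.Occur.MUST)
                .add(new TermQuery(new Term("content", "Hadoop")), BooleanClause.Occur.MUST_NOT)
                .build();
        shard.query(query);
    }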
    /**
     * Prefix search.
     * <p>
     * PrefixQuery matches documents whose indexed term starts with the given
     * string, i.e. a SQL-style "xxx%" match.
     *
     * @throws IOException
     */
    @Test
    public void prefixQueryTest() throws IOException {
        String searchField = "title";
        Term term = new Term(searchField, "Spar");
        Query query = new PrefixQuery(term);
        // Run the query and print the matching records
        shard.query(query);
    }
    /**
     * Phrase search.
     * <p>
     * PhraseQuery searches by phrase. Say you query for the phrase "big car":
     * a document matches if the given field contains "big car" verbatim, but a
     * document containing only "big black car" does not. To allow that looser
     * match you set a slop, where slop is the maximum positional distance
     * allowed between the terms (a slop sketch follows this test).
     *
     * @throws IOException
     */
    @Test
    public void phraseQueryTest() throws IOException {
        String searchField = "content";
        String query1 = "apache";
        String query2 = "spark";
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        builder.add(new Term(searchField, query1));
        builder.add(new Term(searchField, query2));
        builder.setSlop(0);
        PhraseQuery phraseQuery = builder.build();
        // Run the query and print the matching records
        shard.query(phraseQuery);
    }
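    /**
     * A minimal sketch of the slop behaviour described above: with a slop of 1
     * the phrase "big car" also matches a document containing "big black car",
     * because one extra position between the terms is tolerated. The field name
     * and terms are illustrative assumptions.
     */
    @Test
    public void phraseSlopSketch() throws IOException {
        PhraseQuery query = new PhraseQuery.Builder()
                .add(new Term("content", "big"))
                .add(new Term("content", "car"))
                .setSlop(1)
                .build();
        shard.query(query);
    }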
    /**
     * Fuzzy search.
     * <p>
     * FuzzyQuery is a fuzzy match: it finds terms that are close to, but not
     * exactly equal to, the given term.
     *
     * @throws IOException
     */
    @Test
    public void fuzzyQueryTest() throws IOException {
        String searchField = "content";
        Term t = new Term(searchField, "大规模");
        Query query = new FuzzyQuery(t);
        // Run the query and print the matching records
        shard.query(query);
    }
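    /**
     * A minimal sketch of FuzzyQuery with an explicit edit distance: maxEdits
     * bounds how many single-character changes may separate an indexed term
     * from the query term (Lucene supports at most 2). The field name and term
     * are illustrative assumptions; with one edit, "Spark" would also match
     * e.g. "Sparc".
     */
    @Test
    public void fuzzyMaxEditsSketch() throws IOException {
        // Allow at most one edit between the query term and an indexed term
        Query query = new FuzzyQuery(new Term("content", "Spark"), 1);
        shard.query(query);
    }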
    /**
     * Wildcard search (heavy on I/O, not recommended).
     * <p>
     * Lucene also supports wildcard queries via WildcardQuery:
     * "?" stands for exactly one character, "*" for zero or more characters.
     *
     * @throws IOException
     */
    @Test
    public void wildcardQueryTest() throws IOException {
        String searchField = "content";
        Term term = new Term(searchField, "大*规模");
        Query query = new WildcardQuery(term);
        // Run the query and print the matching records
        shard.query(query);
    }
    /**
     * Analyzed query.
     *
     * @throws IOException
     * @throws ParseException
     */
    @Test
    public void queryParserTest() throws IOException, ParseException {
        final Analyzer analyzer = shard.getAnalyzer();
        String searchField = "content";
        // Bind the search field and the analyzer
        QueryParser parser = new QueryParser(searchField, analyzer);
        // For several fields at once (a sketch follows this test):
        //QueryParser queryParser = new MultiFieldQueryParser(new String[]{"title", "content"}, analyzer);
        // User input
        Query query = parser.parse("Spark");
        // Run the query and print the matching records
        shard.query(query);
    }
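    /**
     * A minimal sketch of the MultiFieldQueryParser hinted at in the commented
     * line above: the parsed query is matched against every listed field. The
     * field names are illustrative assumptions.
     */
    @Test
    public void multiFieldQueryParserSketch() throws IOException, ParseException {
        QueryParser parser = new MultiFieldQueryParser(new String[]{"title", "content"}, shard.getAnalyzer());
        Query query = parser.parse("Spark");
        shard.query(query);
    }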
    /**
     * Highlighting.
     *
     * @throws IOException
     */
    @Test
    public void highlighterTest() throws IOException, ParseException, InvalidTokenOffsetsException {
        final Analyzer analyzer = shard.getAnalyzer();
        final IndexSearcher searcher = shard.getSearcher();
        String searchField = "content";
        String text = "大规模";
        // Bind the search field and the analyzer
        QueryParser parser = new QueryParser(searchField, analyzer);
        // User input
        Query query = parser.parse(text);
        TopDocs topDocs = searcher.search(query, 100);
        // HTML tags wrapped around the highlighted keywords; requires lucene-highlighter-xxx.jar
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // Fetch the matching document
            Document document = searcher.doc(scoreDoc.doc);
            // Wrap the matched keywords with the highlight tags
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(document.get("content")));
            String content = highlighter.getBestFragment(tokenStream, document.get("content"));
            System.out.println(content);
        }
    }
    @Test
    public void testAnalyzerDoc() throws IOException {
        // SmartChineseAnalyzer is the smartcn tokenizer; it ships as a separate
        // Lucene module and must match the Lucene version in use
        Analyzer analyzer = new SmartChineseAnalyzer();
        String text = "Apache Spark 是专为大规模数据处理而设计的快速通用的计算引擎";
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(charTermAttribute.toString());
            }
            tokenStream.end();
        } finally {
            tokenStream.close();
            analyzer.close();
        }
    }
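    /**
     * A minimal sketch extending testAnalyzerDoc above: OffsetAttribute exposes
     * each token's character offsets in the original text, alongside the term
     * text from CharTermAttribute. The sample text is an illustrative assumption.
     */
    @Test
    public void analyzerOffsetSketch() throws IOException {
        try (Analyzer analyzer = new SmartChineseAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream("content", new StringReader("Apache Spark 计算引擎"))) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // Prints each token with its half-open character range, e.g. "apache [0,6)"
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            tokenStream.end();
        }
    }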
    private void check() throws IOException {
        final IndexSearcher searcher = shard.getSearcher();
        IndexReader reader = searcher.getIndexReader();
        // The reader gives quick access to the document counts
        System.out.println("Live documents: " + reader.numDocs());
        System.out.println("Total documents: " + reader.maxDoc());
        // "Deleted" is not quite accurate: these documents are only marked as
        // deleted, like a recycle bin, until a segment merge reclaims them
        System.out.println("Deleted documents: " + reader.numDeletedDocs());
    }
}
  311. }