LuceneFactoryTest.java

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.dbsyncer.common.model.Paging;
import org.dbsyncer.common.util.CollectionUtils;
import org.dbsyncer.common.util.JsonUtil;
import org.dbsyncer.common.util.RandomUtil;
import org.dbsyncer.connector.constant.ConnectorConstant;
import org.dbsyncer.storage.constant.ConfigConstant;
import org.dbsyncer.storage.enums.StorageDataStatusEnum;
import org.dbsyncer.storage.lucene.Shard;
import org.dbsyncer.storage.query.Option;
import org.dbsyncer.storage.util.ParamsUtil;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.time.Instant;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
public class LuceneFactoryTest {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    private Shard shard;

    @Before
    public void setUp() throws IOException {
        shard = new Shard("target/indexDir/");
    }

    @After
    public void tearDown() throws IOException {
        shard.deleteAll();
    }
    @Test
    public void testConcurrentUpdate() throws InterruptedException {
        // Simulate concurrent writes
        final int threadSize = 100;
        final ExecutorService pool = Executors.newFixedThreadPool(threadSize);
        final CyclicBarrier barrier = new CyclicBarrier(threadSize);
        final CountDownLatch latch = new CountDownLatch(threadSize);
        for (int i = 0; i < threadSize; i++) {
            final int k = i + 3;
            pool.submit(() -> {
                try {
                    // Release all threads at once
                    barrier.await();
                    System.out.println(String.format("%s:%s", Thread.currentThread().getName(), k));
                    Document data = ParamsUtil.convertData2Doc(createMap(k));
                    //IndexableField field = data.getField(ConfigConstant.CONFIG_MODEL_ID);
                    //shard.update(new Term(ConfigConstant.CONFIG_MODEL_ID, field.stringValue()), data);
                    shard.insert(data);
                } catch (InterruptedException | BrokenBarrierException e) {
                    logger.error(e.getMessage());
                } catch (Exception e) {
                    logger.error(e.getMessage());
                } finally {
                    latch.countDown();
                }
            });
        }
        try {
            latch.await();
            logger.info("try to shutdown");
            pool.shutdown();
            check();
            Sort sort = new Sort(new SortField(ConfigConstant.CONFIG_MODEL_UPDATE_TIME, SortField.Type.LONG, true),
                    new SortField(ConfigConstant.CONFIG_MODEL_CREATE_TIME, SortField.Type.LONG, true));
            Paging paging = shard.query(new Option(new MatchAllDocsQuery()), 1, 20, sort);
            if (!CollectionUtils.isEmpty(paging.getData())) {
                List<Map> data = (List<Map>) paging.getData();
                data.forEach(r -> System.out.println(r));
            }
        } catch (InterruptedException e) {
            logger.error(e.getMessage());
        } catch (IOException e) {
            logger.error(e.getMessage());
        }
        TimeUnit.SECONDS.sleep(3);
        logger.info("test end");
    }
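
    /**
     * A minimal sketch of the update-by-id variant hinted at by the commented-out
     * lines in testConcurrentUpdate above: every thread upserts the same document id,
     * so only one live document should remain afterwards. Assumes Shard.update(Term, Document)
     * replaces the document matching the term, as it is used in testCRUD below.
     */
    @Test
    public void testConcurrentUpdateById() throws Exception {
        final int threadSize = 10;
        final ExecutorService pool = Executors.newFixedThreadPool(threadSize);
        final CountDownLatch latch = new CountDownLatch(threadSize);
        for (int i = 0; i < threadSize; i++) {
            final int k = i;
            pool.submit(() -> {
                try {
                    Document data = ParamsUtil.convertData2Doc(createMap(k));
                    // createMap uses a fixed CONFIG_MODEL_ID, so all threads hit the same term
                    org.apache.lucene.index.IndexableField field = data.getField(ConfigConstant.CONFIG_MODEL_ID);
                    shard.update(new Term(ConfigConstant.CONFIG_MODEL_ID, field.stringValue()), data);
                } catch (Exception e) {
                    logger.error(e.getMessage());
                } finally {
                    latch.countDown();
                }
            });
        }
        latch.await();
        pool.shutdown();
        check();
    }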
    private Map<String, Object> createMap(int i) {
        Map<String, Object> params = new HashMap<>();
        params.put(ConfigConstant.CONFIG_MODEL_ID, "953402886828589057");
        params.put(ConfigConstant.DATA_SUCCESS, StorageDataStatusEnum.SUCCESS.getValue());
        params.put(ConfigConstant.DATA_EVENT, ConnectorConstant.OPERTION_UPDATE);
        params.put(ConfigConstant.DATA_ERROR, "");
        Map<String, Object> row = new HashMap<>();
        row.put("id", i);
        row.put("name", "中文");
        row.put("tel", "15800001234");
        row.put("update_time", System.currentTimeMillis());
        row.put("remark", "test" + i);
        params.put(ConfigConstant.CONFIG_MODEL_JSON, JsonUtil.objToJson(row));
        params.put(ConfigConstant.CONFIG_MODEL_CREATE_TIME, Instant.now().toEpochMilli());
        return params;
    }
    @Test
    public void testQuery() throws IOException {
        int size = 3;
        for (int i = size; i > 0; i--) {
            Document doc = new Document();
            doc.add(new StringField("id", String.valueOf(i), Field.Store.YES));
            doc.add(new StringField("name", "中文" + i, Field.Store.YES));
            doc.add(new TextField("content", "这是一串很长长长长长长长的文本", Field.Store.YES));
            int age = RandomUtil.nextInt(0, 50);
            // Indexed for point queries
            doc.add(new IntPoint("age", age));
            // Stored so the value can be read back
            doc.add(new StoredField("age", age));
            // Doc values for sorting
            doc.add(new NumericDocValuesField("age", age));
            System.out.println(String.format("id=%s,age=%s", String.valueOf(i), age));
            // 2020-05-23 12:00:00
            long createTime = 1590206400000L + i;
            doc.add(new LongPoint("createTime", createTime));
            doc.add(new StoredField("createTime", createTime));
            doc.add(new NumericDocValuesField("createTime", createTime));
            shard.insert(doc);
        }
        // Range query: IntPoint.newRangeQuery("id", 1, 100)
        // Set query:   IntPoint.newSetQuery("id", 2, 3)
        // Exact query: IntPoint.newExactQuery("id", 3)
        // (see the sketch after this method)
        BooleanQuery query = new BooleanQuery.Builder()
                .add(IntPoint.newRangeQuery("age", 1, 100), BooleanClause.Occur.MUST)
                .build();
        Paging paging = shard.query(query, new Sort(new SortField("createTime", SortField.Type.LONG, true)));
        paging.getData().forEach(m -> System.out.println(m));
        // Clear the index
        shard.deleteAll();
    }
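
    /**
     * A minimal sketch of the set and exact IntPoint queries listed in the comment
     * inside testQuery above, run against the same kind of "age" field. Uses the
     * shard.query(Query) overload already used elsewhere in this class.
     */
    @Test
    public void testQueryIntPoint() throws IOException {
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        doc.add(new IntPoint("age", 20));
        doc.add(new StoredField("age", 20));
        shard.insert(doc);
        // Matches documents whose age is any of the listed values
        shard.query(IntPoint.newSetQuery("age", 18, 20, 30));
        // Matches documents whose age is exactly 20
        shard.query(IntPoint.newExactQuery("age", 20));
        shard.deleteAll();
    }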
    @Test
    public void testCRUD() throws IOException {
        System.out.println("Before:");
        List<Map> maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Insert
        Document doc = new Document();
        String id = "100";
        doc.add(new StringField("id", id, Field.Store.YES));
        doc.add(new TextField("content", "这是一款大规模数据处理软件,名字叫做Apache Spark", Field.Store.YES));
        shard.insert(doc);
        System.out.println("After insert:");
        maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Update: build a fresh document; adding another field to the old one
        // would leave two content fields in the updated document
        Document updated = new Document();
        updated.add(new StringField("id", id, Field.Store.YES));
        updated.add(new TextField("content", "这是一款大规模数据处理软件,名字叫做Apache Spark[已修改]", Field.Store.YES));
        shard.update(new Term("id", id), updated);
        System.out.println("After update:");
        maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Delete
        shard.delete(new Term("id", id));
        System.out.println("After delete:");
        maps = shard.query(new MatchAllDocsQuery());
        maps.forEach(m -> System.out.println(m));
        check();

        // Clear the index
        shard.deleteAll();
    }
    /**
     * Term search
     * <p>
     * TermQuery is the simplest and most commonly used Query. It can be understood as "term search":
     * the most basic operation in a search engine is looking up a single term in the index, and
     * TermQuery does exactly that. In Lucene a term is the basic unit of search; essentially it is
     * a name/value pair, where the "name" is the field name and the "value" is a keyword contained
     * in that field.
     *
     * @throws IOException
     */
    @Test
    public void testQueryTerm() throws IOException {
        String searchField = "title";
        // A single-condition query
        TermQuery query = new TermQuery(new Term(searchField, "Spark"));
        // Run the query and print the matching records
        shard.query(query);
    }
    /**
     * Boolean (multi-condition) query
     * <p>
     * BooleanQuery is another Query frequently used in practice. It is a composite query: individual
     * Query objects are added to it together with the logical relationship between them. BooleanQuery
     * itself is a container of boolean clauses and provides a dedicated API for adding clauses and
     * marking how they relate to each other.
     *
     * @throws IOException
     */
    @Test
    public void testQueryBoolean() throws IOException {
        String searchField1 = "title";
        String searchField2 = "content";
        Query query1 = new TermQuery(new Term(searchField1, "Spark"));
        Query query2 = new TermQuery(new Term(searchField2, "Apache"));
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        // BooleanClause describes the relationship of a clause within a boolean query:
        //   BooleanClause.Occur.MUST     - the clause must match
        //   BooleanClause.Occur.MUST_NOT - the clause must not match
        //   BooleanClause.Occur.SHOULD   - the clause may match
        // This gives six meaningful combinations:
        //   1. MUST and MUST:         intersection of the two clauses.
        //   2. MUST and MUST_NOT:     results must not contain documents matched by the MUST_NOT clause.
        //   3. SHOULD and MUST_NOT:   behaves like MUST and MUST_NOT.
        //   4. SHOULD and MUST:       results are those of the MUST clause; SHOULD only influences scoring.
        //   5. SHOULD and SHOULD:     "or" relationship; the result is the union of all clauses.
        //   6. MUST_NOT and MUST_NOT: meaningless; matches nothing.
        builder.add(query1, BooleanClause.Occur.SHOULD);
        builder.add(query2, BooleanClause.Occur.SHOULD);
        BooleanQuery query = builder.build();
        // Run the query and print the matching records
        shard.query(query);
    }
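
    /**
     * A minimal sketch of combination 2 from the list above (MUST and MUST_NOT):
     * match documents whose content contains "apache" but not "hadoop".
     * The terms are illustrative; "hadoop" does not appear in the indexed test data.
     */
    @Test
    public void testQueryBooleanMustNot() throws IOException {
        BooleanQuery query = new BooleanQuery.Builder()
                .add(new TermQuery(new Term("content", "apache")), BooleanClause.Occur.MUST)
                .add(new TermQuery(new Term("content", "hadoop")), BooleanClause.Occur.MUST_NOT)
                .build();
        // Run the query and print the matching records
        shard.query(query);
    }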
    /**
     * Prefix matching
     * <p>
     * PrefixQuery matches documents whose indexed terms start with the given string,
     * i.e. terms of the form "xxx%".
     *
     * @throws IOException
     */
    @Test
    public void testQueryPrefix() throws IOException {
        String searchField = "title";
        Term term = new Term(searchField, "Spar");
        Query query = new PrefixQuery(term);
        // Run the query and print the matching records
        shard.query(query);
    }
    /**
     * Phrase search
     * <p>
     * PhraseQuery searches by phrase. For example, when querying the phrase "big car", a document
     * matches if the given field contains "big car" as a phrase. A document containing only
     * "big black car" would not match, unless a slop is set: slop is the maximum number of
     * positions the terms are allowed to be apart and still count as a phrase match.
     *
     * @throws IOException
     */
    @Test
    public void testQueryPhrase() throws IOException {
        String searchField = "content";
        String query1 = "apache";
        String query2 = "spark";
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        builder.add(new Term(searchField, query1));
        builder.add(new Term(searchField, query2));
        builder.setSlop(0);
        PhraseQuery phraseQuery = builder.build();
        // Run the query and print the matching records
        shard.query(phraseQuery);
    }
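
    /**
     * A minimal sketch of a non-zero slop, per the Javadoc above: with slop = 1,
     * "apache" and "spark" may be one extra position apart, so a text like
     * "apache open spark" would still match. The example terms are illustrative.
     */
    @Test
    public void testQueryPhraseWithSlop() throws IOException {
        PhraseQuery phraseQuery = new PhraseQuery.Builder()
                .add(new Term("content", "apache"))
                .add(new Term("content", "spark"))
                .setSlop(1)
                .build();
        shard.query(phraseQuery);
    }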
    /**
     * Fuzzy search
     * <p>
     * FuzzyQuery is a fuzzy query that matches terms similar to the query term.
     *
     * @throws IOException
     */
    @Test
    public void testQueryFuzzy() throws IOException {
        String searchField = "content";
        Term t = new Term(searchField, "大规模");
        Query query = new FuzzyQuery(t);
        // Run the query and print the matching records
        shard.query(query);
    }
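
    /**
     * A minimal sketch with an explicit edit distance: FuzzyQuery(term, maxEdits)
     * bounds how many single-character edits may separate an indexed term from
     * the query term (Lucene caps maxEdits at 2).
     */
    @Test
    public void testQueryFuzzyMaxEdits() throws IOException {
        Query query = new FuzzyQuery(new Term("content", "大规模"), 1);
        shard.query(query);
    }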
    /**
     * Wildcard search (heavy on IO; not recommended)
     * <p>
     * Lucene also offers wildcard queries via WildcardQuery. The wildcard "?" stands for exactly
     * one character, while "*" stands for zero or more characters.
     *
     * @throws IOException
     */
    @Test
    public void testQueryWildcard() throws IOException {
        String searchField = "content";
        Term term = new Term(searchField, "大*规模");
        Query query = new WildcardQuery(term);
        // Run the query and print the matching records
        shard.query(query);
    }
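
    /**
     * A minimal sketch of the single-character wildcard from the Javadoc above:
     * "?" matches exactly one character, so "大?模" would match the indexed term "大规模".
     */
    @Test
    public void testQueryWildcardSingleChar() throws IOException {
        Query query = new WildcardQuery(new Term("content", "大?模"));
        shard.query(query);
    }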
    /**
     * Analyzed query via QueryParser
     *
     * @throws IOException
     * @throws ParseException
     */
    @Test
    public void testQueryParser() throws IOException, ParseException {
        final Analyzer analyzer = shard.getAnalyzer();
        String searchField = "content";
        // Bind the search field and the analyzer
        QueryParser parser = new QueryParser(searchField, analyzer);
        //QueryParser queryParser = new MultiFieldQueryParser(new String[]{"title", "content"}, analyzer);
        // Parse the user input
        Query query = parser.parse("Spark");
        // Run the query and print the matching records
        shard.query(query);
    }
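
    /**
     * A minimal sketch of the multi-field variant shown commented out above.
     * MultiFieldQueryParser parses the input against several fields at once;
     * it is referenced fully qualified here since the class has no import for it.
     */
    @Test
    public void testMultiFieldQueryParser() throws IOException, ParseException {
        QueryParser parser = new org.apache.lucene.queryparser.classic.MultiFieldQueryParser(
                new String[]{"title", "content"}, shard.getAnalyzer());
        Query query = parser.parse("Spark");
        shard.query(query);
    }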
    /**
     * Highlighting
     *
     * @throws IOException
     */
    @Test
    public void testHighlighter() throws IOException, ParseException, InvalidTokenOffsetsException {
        final Analyzer analyzer = shard.getAnalyzer();
        final IndexSearcher searcher = shard.getSearcher();
        String searchField = "content";
        String text = "大规模";
        // Bind the search field and the analyzer
        QueryParser parser = new QueryParser(searchField, analyzer);
        // Parse the user input
        Query query = parser.parse(text);
        TopDocs topDocs = searcher.search(query, 100);
        // HTML tags used to highlight the keywords; requires lucene-highlighter-xxx.jar
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // Fetch the matching document
            Document document = searcher.doc(scoreDoc.doc);
            // Highlight the content field
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(document.get("content")));
            String content = highlighter.getBestFragment(tokenStream, document.get("content"));
            System.out.println(content);
        }
    }
    @Test
    public void testAnalyzerDoc() throws IOException {
        // SmartChineseAnalyzer comes from the lucene-analyzers-smartcn dependency,
        // which must match the Lucene version in use
        Analyzer analyzer = new SmartChineseAnalyzer();
        String text = "Apache Spark 是专为大规模数据处理而设计的快速通用的计算引擎";
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(charTermAttribute.toString());
            }
            tokenStream.end();
        } finally {
            tokenStream.close();
            analyzer.close();
        }
    }
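
    /**
     * A minimal sketch extending the tokenization loop above with token offsets via
     * OffsetAttribute (fully qualified here to avoid adding an import), using
     * try-with-resources since both Analyzer and TokenStream are Closeable.
     */
    @Test
    public void testAnalyzerOffsets() throws IOException {
        try (Analyzer analyzer = new SmartChineseAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream("content", new StringReader("大规模数据处理"))) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            org.apache.lucene.analysis.tokenattributes.OffsetAttribute offset =
                    tokenStream.addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // Print each token with its character offsets in the input
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            tokenStream.end();
        }
    }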
    private void check() throws IOException {
        final IndexSearcher searcher = shard.getSearcher();
        IndexReader reader = searcher.getIndexReader();
        // The reader exposes the document counts of the index:
        // live (non-deleted) documents
        System.out.println("Live documents: " + reader.numDocs());
        // total documents, including deleted ones not yet merged away
        System.out.println("Total documents: " + reader.maxDoc());
        // deleted documents; strictly speaking, documents pending removal (still in the "recycle bin")
        System.out.println("Deleted documents: " + reader.numDeletedDocs());
    }
}