最近在做一个百科的模块,搜索功能当然必不可少,同时也支持一下拼音的条件检索。
1. 准备工作 1.1. 导入依赖
<!-- IK Analyzer: Chinese word segmentation -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
<!-- pinyin4j: conversion between Chinese characters and pinyin -->
<dependency>
    <groupId>com.pinyin4j</groupId>
    <artifactId>pinyin4j</artifactId>
    <version>2.5.0</version>
</dependency>
1.2. 完善表结构
whole_py:全拼,各音节以 "," 分隔,如:“大家好”即为 “da,jia,hao”
short_py:首字母简拼,如:“大家好”即为 “djh”
2. 编写汉字转拼音工具类
import net.sourceforge.pinyin4j.PinyinHelper; import org.apache.commons.lang3.StringUtils; import java.util.ArrayList; import java.util.LinkedHashSet; import java.util.List; public class PinYinUtils { /** * 获取汉字首字母简拼 * * @param hanZi 汉字 * @return */ public static String getJaneSpell(String hanZi) { String result = null; if (StringUtils.isNotBlank(hanZi)) { char[] charArray = hanZi.toCharArray(); StringBuffer sb = new StringBuffer(); for (char ch : charArray) { // 逐个汉字进行转换,每个汉字返回值为一个String数组(因为有多音字) String[] stringArray = PinyinHelper.toHanyuPinyinStringArray(ch); if (null != stringArray) { sb.append(stringArray[0].charAt(0)); } } if (sb.length() > 0) { result = sb.toString(); } } return result; } /** * 获取汉字全拼 * * @param hanZi 汉字 * @return 转换失败 返回null */ public static List<String> getWholeSpell(String hanZi) { List<String> charList = new ArrayList<>(); hanZi = hanZi.trim().toLowerCase(); if (StringUtils.isNotBlank(hanZi)) { char[] charArray = hanZi.toCharArray(); StringBuilder eng = null; LinkedHashSet<String> pySet = new LinkedHashSet<>(); for (char ch : charArray) { String str = String.valueOf(ch); if (!str.matches("[a-zA-Z\\d]")) { if (eng != null) { charList.add(eng.toString()); eng = null; } String[] pyArray = PinyinHelper.toHanyuPinyinStringArray(ch); if (null != pyArray && pyArray.length > 0) { pySet.add(pyArray[0].replaceAll("\\d", "")); } } else { if (eng == null) { eng = new StringBuilder(); } eng.append(ch); } } charList.addAll(pySet); if (eng != null) { charList.add(eng.toString()); } } return charList; } }3. 此处省去新增操作
4. 接口搜索功能实现 4.1. 分词处理
private static String pyRegex = "[^aoeiuv]?h?[iuv]?(ai|ei|ao|ou|er|ang?|eng? |ong|a|o|e|i|u|ng|n)?"; private Set<String> textSplit(String keyword) { Set<String> set = new LinkedHashSet<>(); StringReader reader = new StringReader(keyword.trim()); IKSegmenter iks = new IKSegmenter(reader, true); try { Lexeme lexeme; while ((lexeme = iks.next()) != null) { set.add(lexeme.getLexemeText()); } } catch (IOException e) { e.printStackTrace(); } return set; } private Set<String> pySplit(String keyword) { int tag = 0; String s = keyword; Set<String> set = new LinkedHashSet<>(); for (int i = s.length(); i > 0; i = i - tag) { Matcher matcher = Pattern.compile(pyRegex).matcher(s); if (matcher.find()) { set.add(matcher.group()); tag = matcher.end() - matcher.start(); s = s.substring(tag); } } return set; } /** * <p>业务实现类搜索方法.</p> * * @param keyWord 搜索的关键词 */ @Override public List<DiseaseListVO> search(String keyWord) { if (StringUtils.isEmpty(keyWord)) { return null; } // 过滤出中英文、数字 Set<String> keywordSet = textSplit(keyWord.replaceAll("[^0-9a-zA-Z\\u4e00-\\u9fa5]", "")); StringBuilder pyBuilder = new StringBuilder(); StringBuilder hzBuilder = new StringBuilder(); keywordSet.forEach(k -> { if (Pattern.compile("^[a-zA-Z]+$").matcher(k).matches()) { pyBuilder.append(k); } else { hzBuilder.append(k).append("|"); } }); DiseaseQueryParam query = new DiseaseQueryParam(); query.setExactName(keyWord); // 多种检索条件为了使搜索范围更加广泛 if (pyBuilder.length() > 0) { String whole = pyBuilder.toString().toLowerCase(); query.setShortPyRegexp(whole); Set<String> pySet = pySplit(whole); if (CollectionUtils.isNotEmpty(pySet)) { whole = StringUtils.join(pySet, "|"); query.setWholePyLike(StringUtils.join(pySet, ",")); } query.setWholePyRegexp(whole); } if (hzBuilder.length() > 0) { String hz = hzBuilder.toString(); query.setLikeName(hz.substring(0, hz.length() - 1)); } else { query.setLikeName(keyWord); } return libMapper.listForSearch(query); }4.2. Mapper映射文件
<!--
  Each CASE ... WHEN term contributes 1 when its condition matches, so the computed
  column `i` is the number of satisfied conditions. Rows are kept only when i > 1 and
  are ordered by `i` descending, which puts the best matches first.
  The pinyin terms are added only when the corresponding parameters are non-empty.
-->
<select id="listForSearch" parameterType="parameterType" resultType="resultType">
    <bind name="exactNameLike" value="'%'+exactName+'%'"/>
    SELECT dl.id,dl.`name`,(
    (CASE WHEN (dl.`name` = #{exactName}) THEN 1 ELSE 0 END) +
    (CASE WHEN (dl.`name` LIKE #{exactNameLike}) THEN 1 ELSE 0 END) +
    (CASE WHEN (dl.`name` REGEXP #{likeName}) THEN 1 ELSE 0 END) +
    (CASE WHEN (dc.`name` REGEXP #{likeName}) THEN 1 ELSE 0 END)
    <if test="wholePyRegexp!=null and wholePyRegexp!=''">
        <bind name="pyLike" value="'%'+wholePyLike+'%'"/>
        + (CASE WHEN (dl.whole_py LIKE #{pyLike}) THEN 1 ELSE 0 END)
        + (CASE WHEN (dl.whole_py REGEXP #{wholePyRegexp}) THEN 1 ELSE 0 END)
    </if>
    <if test="shortPyRegexp!=null and shortPyRegexp!=''">
        + (CASE WHEN (dl.short_py REGEXP #{shortPyRegexp}) THEN 1 ELSE 0 END)
    </if>
    ) i
    FROM t_disease_lib dl
    LEFT JOIN t_disease_classify dc ON dl.classify_id=dc.id
    WHERE dl.`status`=1
    HAVING i>1
    ORDER BY i DESC
</select>
5. 测试
前端输入 candou,返回结果:
{ "code": "200", "message": "OK", "data": [ { "id": "1aefc5bdb8e444be8e47531eec1ce810", "name": "蚕豆" } ] }
输入 cand / cd / 蚕豆 / 蚕,也均能正确返回;再宽泛一点,输入 c / d,则会返回包含该字母的结果。字段文本更长的话同理,这里只做举例。
6. 总结
可满足简单搜索功能需求。缺点:只支持小数据量搜索,数据量大时 SQL 执行慢。