spring boot 统一处理敏感词

it2023-05-07  92

使用 Filter 对请求中的敏感词进行统一处理。

包含两种处理方式:

   a: 替换敏感词

   b: 包含敏感词禁止提交

1、初始化 词库信息

package com.common.sensitiveword;

import lombok.extern.slf4j.Slf4j;
import org.springframework.util.ResourceUtils;

import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Loads the sensitive-word dictionary and builds the lookup tree stored in
 * {@link #sensitiveWordMap}.
 *
 * <p>The tree is a nest of raw maps: every node maps a {@code Character} to its child node and
 * additionally holds the entry {@code "isEnd"} with value {@code "1"} (a word ends at this node)
 * or {@code "0"} (it does not). The maps are raw on purpose because keys are a mix of
 * {@code Character} and {@code String}.
 */
@Slf4j
public class InitSensitiveWord {

    /**
     * Default encoding of the sensitive-word files.
     */
    public static final String DEFAULT_ENCODING = "utf-8";

    /**
     * Key of the per-node marker that tells whether a word ends at that node.
     */
    private static final String IS_END = "isEnd";

    /**
     * Marker value: no word ends at this node.
     */
    private static final String END_FALSE = "0";

    /**
     * Marker value: a word ends at this node.
     */
    private static final String END_TRUE = "1";

    /**
     * Directory (relative to {@link #getFilePath()}) holding the sensitive-word files.
     */
    private static final String SENSITIVE_WORD_FILE_PATH = "sensitiveWord" + File.separator;

    /**
     * Regular expression matching the characters that are skipped while building the tree from
     * files (punctuation and whitespace).
     */
    private static final String IGNORE_SPECIAL_CHAR_REGEX = "[`~!@#$%^&*()+=|{}':;',\\\\[\\\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]|\\s*";

    /**
     * Compiled form of {@link #IGNORE_SPECIAL_CHAR_REGEX}, used by {@link #isIgnore(char)}.
     */
    Pattern pattern = Pattern.compile(IGNORE_SPECIAL_CHAR_REGEX);

    /**
     * Root of the dictionary tree. Replaced by {@link #initKeyword()} and
     * {@link #initSensitiveWords()}.
     */
    public static Map sensitiveWordMap;

    public InitSensitiveWord() {
        super();
    }

    /**
     * Builds the dictionary from the built-in word list.
     *
     * @return the root of the freshly built tree (the same object as {@link #sensitiveWordMap})
     */
    public Map initKeyword() {
        Set<String> keyWordSet = readSensitiveWordFile();
        addSensitiveWordToHashMap(keyWordSet);
        return sensitiveWordMap;
    }

    /**
     * Returns the built-in word list.
     *
     * @return the hard-coded sensitive words
     */
    private Set<String> readSensitiveWordFile() {
        Set<String> set = new HashSet<>();
        set.add("日本鬼子");
        set.add("傻逼");
        set.add("大傻子");
        return set;
    }

    /**
     * Builds the dictionary from every file found in the sensitive-word directory.
     *
     * <p>{@link #sensitiveWordMap} is reset to an empty map before anything else happens, so it is
     * non-null after this method was entered even when loading fails.
     *
     * @throws FileNotFoundException if the base path cannot be resolved
     * @throws RuntimeException      if the directory does not exist or cannot be listed
     */
    public void initSensitiveWords() throws FileNotFoundException {
        sensitiveWordMap = new ConcurrentHashMap();
        File dir = new File(getFilePath() + SENSITIVE_WORD_FILE_PATH);
        // isDirectory() already implies exists().
        if (!dir.isDirectory()) {
            throw new RuntimeException(String.format("敏感词文件目录不存在{%s}", dir));
        }
        // listFiles() returns null (not an empty array) when the directory cannot be read.
        File[] files = dir.listFiles();
        if (files == null) {
            throw new RuntimeException(String.format("敏感词文件目录无法读取{%s}", dir));
        }
        for (File file : files) {
            createDFATree(readSensitiveWordFileToSet(file));
            log.info(String.format("将敏感词文件加载到DFA树列表成功{%s}", file));
        }
        log.info(String.format("总共构建%s棵DFA敏感词树", sensitiveWordMap.size()));
    }

    /**
     * Reads one dictionary file, one word per line.
     *
     * <p>Lines are trimmed, blank lines are skipped and a leading UTF-8 byte-order mark is removed
     * so that it does not become part of the first word. Reading is best effort: a file that
     * cannot be read is logged and yields the words collected so far, so that one bad file does
     * not prevent the others from loading.
     *
     * @param file dictionary file
     * @return the words found in the file, never {@code null}
     */
    private Set<String> readSensitiveWordFileToSet(File file) {
        Set<String> words = new HashSet<>();
        if (file.exists()) {
            try (InputStream in = new FileInputStream(file);
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, DEFAULT_ENCODING))) {
                boolean firstLine = true;
                String line;
                while ((line = reader.readLine()) != null) {
                    if (firstLine) {
                        firstLine = false;
                        if (line.startsWith("\uFEFF")) {
                            line = line.substring(1);
                        }
                    }
                    String word = line.trim();
                    if (!word.isEmpty()) {
                        words.add(word);
                    }
                }
            } catch (Exception e) {
                log.error("读取敏感词文件失败 {}", file, e);
            }
        }
        log.info(String.format("从文件{%s}读取到{%s}个敏感词", file, words.size()));
        return words;
    }

    /**
     * Adds the given words to the tree rooted at {@link #sensitiveWordMap}, skipping the
     * characters matched by {@link #isIgnore(char)}.
     *
     * <p>Example of the resulting structure for the words 出售手刀 and 出售军刀:
     * <pre>
     * 出 = { isEnd=0,
     *   售 = { isEnd=0,
     *     手 = { isEnd=0, 刀 = { isEnd=1 } },
     *     军 = { isEnd=0, 刀 = { isEnd=1 } }
     *   }
     * }
     * </pre>
     *
     * @param sensitiveWords words to add
     */
    private void createDFATree(Set<String> sensitiveWords) {
        for (String word : sensitiveWords) {
            insertWord(sensitiveWordMap, word, true);
        }
    }

    /**
     * Reads the single dictionary file {@code sensitiveword/SensitiveWord.txt}.
     *
     * @return the lines of the file
     * @throws IOException      if reading fails
     * @throws RuntimeException if the file does not exist
     */
    private Set<String> readSensitiveWordFileFromFile() throws IOException {
        File file = new File(getFilePath() + "sensitiveword/SensitiveWord.txt");
        // Check before opening: opening a missing file would throw FileNotFoundException and the
        // intended error below would never be reached.
        if (!file.isFile()) {
            throw new RuntimeException("敏感词库文件不存在");
        }
        Set<String> set = new HashSet<>();
        try (InputStream in = new FileInputStream(file);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, DEFAULT_ENCODING))) {
            String txt;
            while ((txt = reader.readLine()) != null) {
                set.add(txt);
            }
        }
        return set;
    }

    /**
     * Replaces {@link #sensitiveWordMap} with a new tree built from the given words. Unlike
     * {@link #createDFATree(Set)} no character is skipped.
     *
     * @param keyWordSet words to load
     */
    private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
        // Pre-size the root to limit rehashing.
        sensitiveWordMap = new ConcurrentHashMap(keyWordSet.size());
        for (String word : keyWordSet) {
            insertWord(sensitiveWordMap, word, false);
        }
    }

    /**
     * Inserts one word into the tree below {@code root}.
     *
     * <p>The end marker is written after the loop, on the node of the last character that was
     * actually inserted. Writing it inside the loop for the last index would lose the marker
     * whenever the final character of the word is an ignored one.
     *
     * @param root             tree root
     * @param word             word to insert
     * @param skipIgnoredChars whether characters matched by {@link #isIgnore(char)} are skipped
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private void insertWord(Map root, String word, boolean skipIgnoredChars) {
        Map node = root;
        for (int i = 0; i < word.length(); i++) {
            char key = word.charAt(i);
            if (skipIgnoredChars && isIgnore(key)) {
                continue;
            }
            Object child = node.get(key);
            if (child == null) {
                Map created = new ConcurrentHashMap();
                created.put(IS_END, END_FALSE);
                node.put(key, created);
                node = created;
            } else {
                node = (Map) child;
            }
        }
        // node == root means the word was empty or consisted of ignored characters only.
        if (node != root) {
            node.put(IS_END, END_TRUE);
        }
    }

    /**
     * Tells whether a character is skipped while building the tree from files.
     *
     * @param specificChar character to test
     * @return {@code true} if the character matches {@link #IGNORE_SPECIAL_CHAR_REGEX}
     */
    private boolean isIgnore(char specificChar) {
        Matcher matcher = pattern.matcher(String.valueOf(specificChar));
        return matcher.matches();
    }

    /**
     * Resolves the base path the dictionary directory is relative to: the classpath root on
     * Windows, the working directory ({@code user.dir}) otherwise.
     *
     * @return base path, ending with a separator
     * @throws FileNotFoundException if the classpath root cannot be resolved
     */
    private String getFilePath() throws FileNotFoundException {
        String osName = System.getProperty("os.name");
        log.info("-------系统环境 {} --------", osName);
        String filePath = null;
        if (osName.startsWith("Windows")) {
            // windows
            filePath = ResourceUtils.getURL("classpath:").getPath();
        } else {
            // unix or linux
            filePath = System.getProperty("user.dir") + "/";
        }
        log.info("-------------文件路径 {} ----------", filePath);
        return filePath;
    }
}

2、敏感词 帮助类

package com.common.sensitiveword;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * Static helpers that search and mask sensitive words using the tree in
 * {@link InitSensitiveWord#sensitiveWordMap}.
 *
 * <p>Match type: {@link #MIN_MATCH_TYPE} stops at the shortest word starting at a position;
 * every other value (conventionally {@link #MAX_MATCH_TYPE}) keeps going and reports the longest
 * word. Words shorter than two characters are never reported.
 */
public class SensitivewordUtils {

    /**
     * Shortest-match rule.
     */
    public static final Integer MIN_MATCH_TYPE = 0;

    /**
     * Longest-match rule.
     */
    public static final Integer MAX_MATCH_TYPE = -1;

    // Declared before the static initializer below so that it is assigned when that block runs.
    private static final Logger log = LoggerFactory.getLogger(SensitivewordUtils.class);

    static {
        // Load the dictionary once, when this class is first used.
        try {
            new InitSensitiveWord().initSensitiveWords();
        } catch (FileNotFoundException e) {
            log.error("敏感词库初始化失败", e);
        }
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param txt       text to scan; {@code null} is treated as containing nothing
     * @param matchType {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return {@code true} if a sensitive word was found
     */
    public static boolean isContaintSensitiveWord(String txt, int matchType) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            // The first hit settles the answer; scanning the rest cannot change it.
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the distinct sensitive words occurring in the text.
     *
     * @param txt       text to scan; {@code null} yields an empty set
     * @param matchType {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return the words found, never {@code null}
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> sensitiveWordList = new HashSet<>();
        if (txt == null) {
            return sensitiveWordList;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                sensitiveWordList.add(txt.substring(i, i + length));
                // Continue right after the word that was just found.
                i += length;
            } else {
                i++;
            }
        }
        return sensitiveWordList;
    }

    /**
     * Masks every sensitive word in the text with one {@code *} per character.
     *
     * @param txt text to mask; {@code null} is returned unchanged
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt) {
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        String resultTxt = txt;
        for (String word : getSensitiveWord(txt, MAX_MATCH_TYPE)) {
            // Literal replacement: the word is plain text, not a regular expression, so
            // replaceAll would misbehave (or throw) on words containing regex metacharacters.
            resultTxt = resultTxt.replace(word, getReplaceChars("*", word.length()));
        }
        return resultTxt;
    }

    /**
     * Repeats the replacement string.
     *
     * @param replaceChar string to repeat
     * @param length      number of repetitions; values below 1 still yield one repetition
     * @return the repeated string
     */
    private static String getReplaceChars(String replaceChar, int length) {
        int count = Math.max(length, 1);
        StringBuilder resultReplace = new StringBuilder(replaceChar.length() * count);
        for (int i = 0; i < count; i++) {
            resultReplace.append(replaceChar);
        }
        return resultReplace.toString();
    }

    /**
     * Checks whether a sensitive word starts at {@code beginIndex}.
     *
     * <p>The tree is walked character by character. The reported length is the length at the
     * last node carrying the end marker, not the number of characters walked: when the text
     * continues into a longer dictionary entry without completing it, only the completed shorter
     * word counts.
     *
     * @param txt        text to scan
     * @param beginIndex index of the first character to examine
     * @param matchType  {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return the length of the word starting at {@code beginIndex}, or 0 if there is none, the
     * word is shorter than two characters, the text is {@code null} or no dictionary is loaded
     */
    @SuppressWarnings({"rawtypes"})
    public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        Map nowMap = InitSensitiveWord.sensitiveWordMap;
        if (txt == null || nowMap == null) {
            return 0;
        }
        // Characters walked so far along the tree.
        int walked = 0;
        // Value of 'walked' at the last node where a word ended.
        int matchedLength = 0;
        for (int i = beginIndex; i < txt.length(); i++) {
            nowMap = (Map) nowMap.get(txt.charAt(i));
            if (nowMap == null) {
                break;
            }
            walked++;
            if ("1".equals(nowMap.get("isEnd"))) {
                matchedLength = walked;
                // Shortest match is satisfied by the first word; longest match keeps looking.
                if (MIN_MATCH_TYPE.intValue() == matchType) {
                    break;
                }
            }
        }
        // Single-character hits are not treated as words.
        return matchedLength < 2 ? 0 : matchedLength;
    }

    public static void main(String[] args) {
        long beginTime = System.currentTimeMillis();
        System.out.println("敏感词的数量:" + InitSensitiveWord.sensitiveWordMap.size());
        String string = "腐败125公关兼职招聘4, 傻逼5,大 傻 逼123大傻子啊";
        System.out.println("待检测语句字数:" + string.length());
        // List the sensitive words contained in the sample text.
        Set set = SensitivewordUtils.getSensitiveWord(string, MAX_MATCH_TYPE);
        // Mask them.
        String str = SensitivewordUtils.replaceSensitiveWord(string);
        System.out.println("替换后的:" + str);
        long endTime = System.currentTimeMillis();
        System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set);
        System.out.println("总共消耗时间为:" + (endTime - beginTime));
    }
}

 

3、统一处理,使用filter处理

  包含敏感词,对敏感词进行替换

package com.opencloud.common.filter;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.common.sensitiveword.SensitivewordUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;

import javax.servlet.ReadListener;
import javax.servlet.ServletInputStream;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletRequestWrapper;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * Request wrapper that masks sensitive words in request parameters and in JSON request bodies.
 *
 * <p>The JSON body is read from the wrapped request once, filtered, and kept in memory, so
 * {@link #getInputStream()} and {@link #getReader()} can be called repeatedly. The body is
 * decoded and re-encoded as UTF-8. Because masking and re-serialization change the byte length,
 * the Content-Length reported by the wrapped request may no longer match the filtered body.
 *
 * @author kou
 */
public class SensitiveWordRequestWrapper extends HttpServletRequestWrapper {

    /** Copy of the parameter map of the wrapped request. */
    private Map<String, String[]> params = new HashMap<>();

    /** Filtered JSON body; {@code null} until the body was read for the first time. */
    private byte[] filteredBody;

    public SensitiveWordRequestWrapper(HttpServletRequest request) {
        super(request);
        // Keep our own copy of the parameters of the wrapped request.
        this.params.putAll(request.getParameterMap());
    }

    /**
     * Returns the request body, with sensitive words masked when the request is JSON.
     *
     * @return a stream over the filtered body, or the original stream for non-JSON requests
     * @throws IOException if the original body cannot be read
     */
    @Override
    public ServletInputStream getInputStream() throws IOException {
        if (!isJsonRequest()) {
            return super.getInputStream();
        }
        if (filteredBody == null) {
            filteredBody = filterBody(IOUtils.toString(super.getInputStream(), "utf-8"));
        }
        return new CachedBodyInputStream(filteredBody);
    }

    /**
     * Character view of {@link #getInputStream()}, so that reading the body through a reader
     * does not bypass the filter.
     */
    @Override
    public BufferedReader getReader() throws IOException {
        if (!isJsonRequest()) {
            return super.getReader();
        }
        return new BufferedReader(new InputStreamReader(getInputStream(), StandardCharsets.UTF_8));
    }

    @Override
    public String getParameter(String name) {
        String[] values = params.get(name);
        if (values == null || values.length == 0) {
            return null;
        }
        return maskParameter(values[0]);
    }

    @Override
    public String[] getParameterValues(String name) {
        String[] values = params.get(name);
        if (values == null || values.length == 0) {
            return null;
        }
        // Work on a copy: the stored array comes from the wrapped request's parameter map.
        String[] masked = values.clone();
        for (int i = 0; i < masked.length; i++) {
            masked[i] = maskParameter(masked[i]);
        }
        return masked;
    }

    /**
     * Tells whether the request declares a JSON body. Parameters after the media type
     * (for example {@code ;charset=UTF-8}) are ignored; a missing header means "not JSON".
     */
    private boolean isJsonRequest() {
        String contentType = getHeader(HttpHeaders.CONTENT_TYPE);
        if (contentType == null) {
            return false;
        }
        int separator = contentType.indexOf(';');
        String mediaType = separator >= 0 ? contentType.substring(0, separator) : contentType;
        return MediaType.APPLICATION_JSON_VALUE.equalsIgnoreCase(mediaType.trim());
    }

    /**
     * Parses the JSON text, masks every string value and serializes it again.
     *
     * @param json raw body text
     * @return UTF-8 bytes of the filtered body; empty for an empty body; the untouched text for
     * top-level values that are neither object, array nor string
     */
    private byte[] filterBody(String json) {
        if (StringUtils.isEmpty(json)) {
            return new byte[0];
        }
        Object parsed = JSON.parse(json);
        if (parsed instanceof JSONObject || parsed instanceof JSONArray || parsed instanceof String) {
            return JSON.toJSONString(maskValue(parsed)).getBytes(StandardCharsets.UTF_8);
        }
        return json.getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Masks the string values of a JSON object in place, descending into nested objects and
     * arrays.
     */
    private void dealSensitiveWord(Map<String, Object> map) {
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            entry.setValue(maskValue(entry.getValue()));
        }
    }

    /**
     * Masks one JSON value. Strings are replaced, objects and arrays are processed in place,
     * everything else is returned unchanged.
     */
    private Object maskValue(Object value) {
        if (value instanceof String) {
            return maskJsonString((String) value);
        }
        if (value instanceof JSONObject) {
            dealSensitiveWord((JSONObject) value);
        } else if (value instanceof JSONArray) {
            JSONArray array = (JSONArray) value;
            for (int i = 0; i < array.size(); i++) {
                array.set(i, maskValue(array.get(i)));
            }
        }
        return value;
    }

    /**
     * Masks a JSON string value. The value is trimmed before scanning; when nothing is found the
     * original, untrimmed value is kept.
     */
    private static String maskJsonString(String raw) {
        String trimmed = raw.trim();
        if (SensitivewordUtils.isContaintSensitiveWord(trimmed, SensitivewordUtils.MAX_MATCH_TYPE)) {
            return SensitivewordUtils.replaceSensitiveWord(trimmed);
        }
        return raw;
    }

    /** Masks a parameter value; {@code null} stays {@code null}. */
    private static String maskParameter(String value) {
        if (value != null
                && SensitivewordUtils.isContaintSensitiveWord(value, SensitivewordUtils.MAX_MATCH_TYPE)) {
            return SensitivewordUtils.replaceSensitiveWord(value);
        }
        return value;
    }

    /** Servlet input stream over an in-memory copy of the body. */
    private static final class CachedBodyInputStream extends ServletInputStream {

        private final ByteArrayInputStream delegate;

        CachedBodyInputStream(byte[] body) {
            this.delegate = new ByteArrayInputStream(body);
        }

        @Override
        public boolean isFinished() {
            return delegate.available() == 0;
        }

        @Override
        public boolean isReady() {
            // The data is already in memory, reading never blocks.
            return true;
        }

        @Override
        public void setReadListener(ReadListener readListener) {
            // Non-blocking reads are not supported; the listener is ignored.
        }

        @Override
        public int read() {
            return delegate.read();
        }

        @Override
        public int read(byte[] buffer, int offset, int length) {
            return delegate.read(buffer, offset, length);
        }
    }
}

 

包含敏感词禁止提交,并返回敏感词信息

package com.common.filter;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.common.sensitiveword.SensitivewordUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;

import javax.servlet.ReadListener;
import javax.servlet.ServletInputStream;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletRequestWrapper;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Request wrapper that reports the sensitive words contained in a request instead of masking
 * them.
 *
 * <p>{@link #validateSensitiveWord()} has to read a JSON body in order to scan it. The bytes are
 * kept, and {@link #getInputStream()} / {@link #getReader()} replay them, so code that receives
 * this wrapper can still read the body afterwards.
 *
 * @author kou
 */
@Slf4j
public class SensitiveWordReturnRequestWrapper extends HttpServletRequestWrapper {

    /** Copy of the parameter map of the wrapped request. */
    private Map<String, String[]> params = new HashMap<>();

    /** Raw JSON body; {@code null} until {@link #validateSensitiveWord()} has read it. */
    private byte[] cachedBody;

    public SensitiveWordReturnRequestWrapper(HttpServletRequest request) {
        super(request);
        // Keep our own copy of the parameters of the wrapped request.
        this.params.putAll(request.getParameterMap());
    }

    /**
     * Scans the request parameters and, for JSON requests, the body.
     *
     * <p>Parameter values are scanned as they are; JSON string values are lower-cased first.
     *
     * @return the sensitive words found, or {@code null} when there are none
     * @throws IOException if the body cannot be read
     */
    public Set<String> validateSensitiveWord() throws IOException {
        Set<String> sensitiveWords = new HashSet<>();
        for (String[] values : params.values()) {
            if (values == null) {
                continue;
            }
            for (String value : values) {
                if (value != null) {
                    collect(value, sensitiveWords);
                }
            }
        }
        if (isJsonRequest()) {
            if (cachedBody == null) {
                cachedBody = IOUtils.toByteArray(super.getInputStream());
            }
            String json = new String(cachedBody, StandardCharsets.UTF_8);
            if (StringUtils.isNotBlank(json)) {
                dealJsonDatas(json, sensitiveWords);
            }
        }
        return sensitiveWords.isEmpty() ? null : sensitiveWords;
    }

    /**
     * Returns the body. Once {@link #validateSensitiveWord()} has consumed the original stream,
     * a fresh stream over the cached bytes is returned on every call.
     */
    @Override
    public ServletInputStream getInputStream() throws IOException {
        if (cachedBody == null) {
            return super.getInputStream();
        }
        return new CachedBodyInputStream(cachedBody);
    }

    /**
     * Character view of the body. The cached body is decoded as UTF-8, the same encoding
     * {@link #validateSensitiveWord()} uses for scanning.
     */
    @Override
    public BufferedReader getReader() throws IOException {
        if (cachedBody == null) {
            return super.getReader();
        }
        return new BufferedReader(
                new InputStreamReader(new ByteArrayInputStream(cachedBody), StandardCharsets.UTF_8));
    }

    /**
     * Scans JSON text. Objects and arrays are walked recursively; any other top-level value is
     * scanned as plain text.
     *
     * @param json           raw JSON text
     * @param sensitiveWords receives the words found
     */
    public void dealJsonDatas(String json, Set<String> sensitiveWords) {
        Object datas = JSON.parse(json);
        if (datas instanceof Map || datas instanceof List) {
            scanValue(datas, sensitiveWords);
        } else {
            dealStringData(json, sensitiveWords);
        }
    }

    /**
     * Scans the values of a JSON object.
     *
     * @param map            JSON object
     * @param sensitiveWords receives the words found
     */
    private void dealSensitiveWord(Map<String, Object> map, Set<String> sensitiveWords) {
        scanValue(map, sensitiveWords);
    }

    /**
     * Scans the elements of a JSON array. Every element is handled according to its own type,
     * so mixed and nested arrays are covered.
     *
     * @param json           JSON array; {@code null} is ignored
     * @param sensitiveWords receives the words found
     */
    public void dealArrayDatas(JSONArray json, Set<String> sensitiveWords) {
        if (json != null) {
            scanValue(json, sensitiveWords);
        }
    }

    /**
     * Scans one string value after lower-casing it.
     *
     * @param content        text to scan; {@code null} and blank text are ignored
     * @param sensitiveWords receives the words found
     */
    public void dealStringData(String content, Set<String> sensitiveWords) {
        if (StringUtils.isBlank(content)) {
            return;
        }
        // Locale.ROOT keeps the result independent of the server's default locale.
        collect(content.toLowerCase(Locale.ROOT), sensitiveWords);
    }

    /**
     * Walks a parsed JSON value: strings are scanned, map values and list elements are visited
     * recursively, all other types are skipped. Map keys are not scanned.
     */
    private void scanValue(Object value, Set<String> sensitiveWords) {
        if (value instanceof String) {
            dealStringData((String) value, sensitiveWords);
        } else if (value instanceof Map) {
            for (Object nested : ((Map<?, ?>) value).values()) {
                scanValue(nested, sensitiveWords);
            }
        } else if (value instanceof List) {
            for (Object element : (List<?>) value) {
                scanValue(element, sensitiveWords);
            }
        }
    }

    /** Adds the sensitive words found in {@code text} to {@code sensitiveWords}. */
    @SuppressWarnings("unchecked")
    private static void collect(String text, Set<String> sensitiveWords) {
        Set<String> found = SensitivewordUtils.getSensitiveWord(text, SensitivewordUtils.MAX_MATCH_TYPE);
        if (found != null && !found.isEmpty()) {
            sensitiveWords.addAll(found);
        }
    }

    /**
     * Tells whether the request declares a JSON body. Parameters after the media type
     * (for example {@code ;charset=UTF-8}) are ignored; a missing header means "not JSON".
     */
    private boolean isJsonRequest() {
        String contentType = getHeader(HttpHeaders.CONTENT_TYPE);
        if (contentType == null) {
            return false;
        }
        int separator = contentType.indexOf(';');
        String mediaType = separator >= 0 ? contentType.substring(0, separator) : contentType;
        return MediaType.APPLICATION_JSON_VALUE.equalsIgnoreCase(mediaType.trim());
    }

    /** Servlet input stream over an in-memory copy of the body. */
    private static final class CachedBodyInputStream extends ServletInputStream {

        private final ByteArrayInputStream delegate;

        CachedBodyInputStream(byte[] body) {
            this.delegate = new ByteArrayInputStream(body);
        }

        @Override
        public boolean isFinished() {
            return delegate.available() == 0;
        }

        @Override
        public boolean isReady() {
            // The data is already in memory, reading never blocks.
            return true;
        }

        @Override
        public void setReadListener(ReadListener readListener) {
            // Non-blocking reads are not supported; the listener is ignored.
        }

        @Override
        public int read() {
            return delegate.read();
        }

        @Override
        public int read(byte[] buffer, int offset, int length) {
            return delegate.read(buffer, offset, length);
        }
    }
}

敏感词过滤器处理,针对不同的处理方式进行相应处理

package com.common.filter;

import com.alibaba.fastjson.JSON;
import com.common.model.ResultBody;
import com.common.utils.StringUtils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.HttpMethod;

import javax.servlet.*;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Set;

/**
 * Servlet filter that handles sensitive words in POST requests, either by masking them or by
 * rejecting the request.
 *
 * <p>NOTE(review): only POST is inspected; PUT and PATCH bodies pass through unfiltered. Confirm
 * that this is intended.
 *
 * @author kou
 */
@Slf4j
public class SensitiveWordFilter implements Filter {

    /**
     * Handling mode "1": mask sensitive words.
     */
    public static final String REPLACE_SENSITIVE_WORD = "1";

    /**
     * Handling mode "2": reject requests that contain sensitive words.
     */
    public static final String RETURN_CONTAIN_SENSITIVE_WORD = "2";

    /**
     * Configured handling mode; {@code null} or any unknown value disables the filter.
     */
    private String dealMethod;

    /**
     * @param dealMethod {@link #REPLACE_SENSITIVE_WORD}, {@link #RETURN_CONTAIN_SENSITIVE_WORD},
     *                   or {@code null} to disable the filter
     */
    public SensitiveWordFilter(String dealMethod) {
        log.info("处理敏感字符方式:{}", describe(dealMethod));
        this.dealMethod = dealMethod;
    }

    /**
     * Human-readable name of a handling mode for the startup log. The mode is compared with the
     * String constants (comparing it with the number 1 can never be true), and values the filter
     * does not act on are reported as "no handling".
     */
    private static String describe(String dealMethod) {
        if (StringUtils.isBlank(dealMethod)) {
            return "不做任何处理";
        }
        if (REPLACE_SENSITIVE_WORD.equals(dealMethod)) {
            return "替换敏感字符";
        }
        if (RETURN_CONTAIN_SENSITIVE_WORD.equals(dealMethod)) {
            return "包含敏感字符禁止提交";
        }
        return "不做任何处理";
    }

    /**
     * No initialization needed. Declared explicitly because Servlet API versions before 4.0 do
     * not provide a default implementation.
     */
    @Override
    public void init(FilterConfig filterConfig) throws ServletException {
        // nothing to do
    }

    /**
     * Applies the configured handling mode to POST requests and passes everything else through.
     *
     * @throws IOException      if the request body cannot be read or the response cannot be written
     * @throws ServletException if the rest of the chain fails
     */
    @Override
    public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse, FilterChain filterChain) throws IOException, ServletException {
        // Anything that is not plain HTTP is none of this filter's business.
        if (!(servletRequest instanceof HttpServletRequest) || !(servletResponse instanceof HttpServletResponse)) {
            filterChain.doFilter(servletRequest, servletResponse);
            return;
        }
        HttpServletRequest request = (HttpServletRequest) servletRequest;
        HttpServletResponse response = (HttpServletResponse) servletResponse;

        boolean isPost = HttpMethod.POST.name().equals(request.getMethod());
        if (!isPost || this.dealMethod == null) {
            filterChain.doFilter(request, response);
            return;
        }

        if (REPLACE_SENSITIVE_WORD.equals(this.dealMethod)) {
            // Mask mode: downstream code reads parameters and body through the wrapper.
            filterChain.doFilter(new SensitiveWordRequestWrapper(request), response);
            return;
        }

        if (RETURN_CONTAIN_SENSITIVE_WORD.equals(this.dealMethod)) {
            // Reject mode.
            SensitiveWordReturnRequestWrapper sensitiveWordReturn = new SensitiveWordReturnRequestWrapper(request);
            Set<String> set = sensitiveWordReturn.validateSensitiveWord();
            if (null != set && set.size() > 0) {
                log.info("包含敏感词数:{} 个,禁止提交", set.size());
                response.setCharacterEncoding("UTF-8");
                response.setContentType("application/json; charset=utf-8");
                PrintWriter out = response.getWriter();
                out.append(JSON.toJSONString(ResultBody.failed().msg("包含敏感词禁止提交").data(set)));
                return;
            }
            // Validation may have read the request body. Hand the wrapper (not the raw request)
            // down the chain so that whatever view of the request it provides is the one the
            // rest of the chain sees.
            filterChain.doFilter(sensitiveWordReturn, response);
            return;
        }

        // Unknown handling mode: pass through untouched.
        filterChain.doFilter(request, response);
    }

    /**
     * Nothing to release. Declared explicitly because Servlet API versions before 4.0 do not
     * provide a default implementation.
     */
    @Override
    public void destroy() {
        // nothing to do
    }
}

在config里设置bean

/**
 * Default configuration class; registers the sensitive-word filter.
 *
 * @author kou
 */
@Slf4j
@Configuration
public class AutoConfiguration {

    /**
     * Sensitive-word handling mode read from {@code sensitiveWord.dealMethod}:
     * 1 masks sensitive words, 2 rejects requests that contain them, {@code null}
     * (property not set) disables handling.
     */
    @Value("${sensitiveWord.dealMethod:#{null}}")
    private String sensitiveWordDealMethod;

    /**
     * Creates the registration for {@link SensitiveWordFilter}.
     *
     * @return the filter registration bean
     */
    @Bean
    public FilterRegistrationBean sensitiveWordFilter() {
        SensitiveWordFilter filter = new SensitiveWordFilter(sensitiveWordDealMethod);
        FilterRegistrationBean registration = new FilterRegistrationBean(filter);
        log.info("SensitiveWordFilter [{}]", registration);
        return registration;
    }
}

配置文件设置

通过 sensitiveWord.dealMethod 配置处理方式;不设置时默认不处理敏感词。

sensitiveWord:
  # 敏感词处理方式,1:替换敏感字符, 2:包含敏感字符禁止提交,null不处理
  dealMethod: 2

 

最新回复(0)