12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061 |
- package com.ch.jedge.jbot.dict;
- import com.changhong.qlib.util.StringUtils;
- import com.changhong.qlib.util.file.FileUtils;
- import java.io.File;
- import java.util.List;
- import java.util.Map;
- /**
- * 词库工具
- * 扫描词库目录下的通用词库文件,并动态添加到Jieba分词词库中
- *
- */
- public class JEDictUtil {
- public static void loadStaticCommonDicts(final Map<String, JedgeWordDict> dictMap, String dictPath) {
- FileUtils.listDirFiles(dictPath, file -> {
- JedgeWordDict dict = loadStaticCommonDict(dictPath, file);
- if(dict!=null) {
- synchronized (dictMap) {
- dictMap.put(dict.getName(), dict);
- }
- }
- return null;
- });
- }
- public static JedgeWordDict loadStaticCommonDict(String dictPath, File file) {
- if(file==null || !file.getName().endsWith(".dic") || !file.exists())
- return null;
- List<String> lines = StringUtils.readStringLinesFromFile(file);
- JedgeWordDict dict = new JedgeWordDict(file.getAbsolutePath());
- String[] defaultWordDef = "100 nz".split(" ");
- for(String l : lines) {
- //添加词汇
- if(StringUtils.isNotValidStr(l) || l.charAt(0) == '#')
- continue;
- if(l.startsWith("$")) {
- defaultWordDef = l.substring(1).trim().split(" ");
- continue;
- }
- String[] disc = l.split(" ");
- int idx = 0;
- String[] realWord = new String[3];
- for(String d : disc) {
- if(d.isEmpty()) continue;
- realWord[idx++] = d;
- if(idx==3) break;
- }
- String word = realWord[0],
- freq = realWord[1]==null?defaultWordDef[0]:realWord[1],
- type = realWord[2]==null?defaultWordDef[1]:realWord[2];
- if(StringUtils.isNotValidStr(word)) continue;
- dict.appendWordItem(new JedgeWordItem(word, type, freq, false));
- }
- if(!dict.isEmpty()) return dict;
- return null;
- }
- }
|