JEDictUtil.java 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. package com.ch.jedge.jbot.dict;
  2. import com.changhong.qlib.util.StringUtils;
  3. import com.changhong.qlib.util.file.FileUtils;
  4. import java.io.File;
  5. import java.util.List;
  6. import java.util.Map;
  7. /**
  8. * 词库工具
  9. * 扫描词库目录下的通用词库文件,并动态添加到Jieba分词词库中
  10. *
  11. */
  12. public class JEDictUtil {
  13. public static void loadStaticCommonDicts(final Map<String, JedgeWordDict> dictMap, String dictPath) {
  14. FileUtils.listDirFiles(dictPath, file -> {
  15. JedgeWordDict dict = loadStaticCommonDict(dictPath, file);
  16. if(dict!=null) {
  17. synchronized (dictMap) {
  18. dictMap.put(dict.getName(), dict);
  19. }
  20. }
  21. return null;
  22. });
  23. }
  24. public static JedgeWordDict loadStaticCommonDict(String dictPath, File file) {
  25. if(file==null || !file.getName().endsWith(".dic") || !file.exists())
  26. return null;
  27. List<String> lines = StringUtils.readStringLinesFromFile(file);
  28. JedgeWordDict dict = new JedgeWordDict(file.getAbsolutePath());
  29. String[] defaultWordDef = "100 nz".split(" ");
  30. for(String l : lines) {
  31. //添加词汇
  32. if(StringUtils.isNotValidStr(l) || l.charAt(0) == '#')
  33. continue;
  34. if(l.startsWith("$")) {
  35. defaultWordDef = l.substring(1).trim().split(" ");
  36. continue;
  37. }
  38. String[] disc = l.split(" ");
  39. int idx = 0;
  40. String[] realWord = new String[3];
  41. for(String d : disc) {
  42. if(d.isEmpty()) continue;
  43. realWord[idx++] = d;
  44. if(idx==3) break;
  45. }
  46. String word = realWord[0],
  47. freq = realWord[1]==null?defaultWordDef[0]:realWord[1],
  48. type = realWord[2]==null?defaultWordDef[1]:realWord[2];
  49. if(StringUtils.isNotValidStr(word)) continue;
  50. dict.appendWordItem(new JedgeWordItem(word, type, freq, false));
  51. }
  52. if(!dict.isEmpty()) return dict;
  53. return null;
  54. }
  55. }