/*
 * Java: parsing sgim_core.bin, the core lexicon file of the Sogou input method.
 * (Blog post published 2015-09-06, category: Java.)
 *
 * sgim_core.bin is the core lexicon of the Sogou input method. The class below
 * parses that file; after running it, the parsed phrases are printed.
 */
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;

/**
 * Dumps the word list stored in a Sogou input method {@code sgim_core.bin} file.
 *
 * <p>What the code relies on: a little-endian 32-bit word count at offset
 * {@code 0xC}, and a word table that begins at the first occurrence of the byte
 * sequence {@code 02 00 4A 55}. Each table entry is a 16-bit little-endian byte
 * length followed by that many bytes of UTF-16LE text (the signature itself is
 * the first entry: length 2, one UTF-16LE code unit).
 */
public class Sougou {

    /** Path used when no command-line argument is given. */
    private static final String DEFAULT_BIN_FILE =
            "/Users/YI/Documents/workspace/TestWordLib/src/db/sgim_core.bin";

    /** Byte sequence marking the first entry of the word table. */
    private static final byte[] SEARCH_KEY = {0x02, 0x00, 0x4A, 0x55};

    /** Offset of the 32-bit word count. */
    private static final int WORD_COUNT_OFFSET = 0xC;

    /**
     * Reads the lexicon file and prints every word longer than one character,
     * followed by summary statistics.
     *
     * @param args optional; {@code args[0]} overrides the default file path
     * @throws IOException if the file cannot be read, is too large to load, or is
     *                     too short to contain the word count
     */
    public static void main(final String[] args) throws IOException {
        final String binFile = (args != null && args.length > 0) ? args[0] : DEFAULT_BIN_FILE;

        final ByteBuffer bb = readFully(binFile);
        bb.order(ByteOrder.LITTLE_ENDIAN);
        if (bb.limit() < WORD_COUNT_OFFSET + Integer.BYTES) {
            throw new IOException("File too short to contain a word count: " + binFile);
        }

        final int words = bb.getInt(WORD_COUNT_OFFSET);
        System.out.println("读入文件: " + binFile + ",单词:" + words);

        final int startPos = indexOf(bb, SEARCH_KEY);
        if (startPos == -1) {
            System.err.println("文件版本已更新!");
            return;
        }

        System.out.println("单词起始位置:0x" + Integer.toHexString(startPos));
        bb.position(startPos);
        final int counter = printWords(bb, words);

        final int endPos = bb.position();
        final int diff = endPos - startPos;
        System.out.println("读出单词'" + binFile + "':" + counter);
        System.out.println("单词结尾位置:0x" + Integer.toHexString(endPos));
        System.out.println("单词词典长度:0x" + Integer.toHexString(diff));
    }

    /** Loads the whole file into a heap buffer whose limit is the number of bytes actually read. */
    private static ByteBuffer readFully(final String path) throws IOException {
        try (RandomAccessFile file = new RandomAccessFile(path, "r");
                FileChannel channel = file.getChannel()) {
            final long size = channel.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File too large to load into memory: " + path);
            }
            final ByteBuffer data = ByteBuffer.allocate((int) size);
            // A single read() may return fewer bytes than requested, so keep reading.
            while (data.hasRemaining() && channel.read(data) != -1) {
                // loop until the buffer is full or end-of-file is reached
            }
            data.flip(); // limit = bytes read, position = 0
            return data;
        }
    }

    /**
     * Returns the index of the first occurrence of {@code key} in {@code data}, or -1.
     * Uses absolute reads, so the buffer position is left untouched. Every start
     * offset is tried, so a partial match never hides a later full match.
     */
    private static int indexOf(final ByteBuffer data, final byte[] key) {
        final int lastStart = data.limit() - key.length;
        for (int start = 0; start <= lastStart; start++) {
            int matched = 0;
            while (matched < key.length && data.get(start + matched) == key[matched]) {
                matched++;
            }
            if (matched == key.length) {
                return start;
            }
        }
        return -1;
    }

    /**
     * Reads up to {@code maxWords} length-prefixed entries from the current position,
     * printing those longer than one character. Stops early, leaving the position
     * at the start of the incomplete entry, if the data ends mid-entry.
     *
     * @return the number of entries read
     */
    private static int printWords(final ByteBuffer bb, final int maxWords) {
        final byte[] scratch = new byte[0xFFFF];
        int counter = 0;
        while (counter < maxWords && bb.remaining() >= Short.BYTES) {
            // Assumption: the length prefix is unsigned; a negative length is meaningless.
            final int length = bb.getShort() & 0xFFFF;
            if (length > bb.remaining()) {
                bb.position(bb.position() - Short.BYTES); // truncated entry: un-read the prefix
                break;
            }
            bb.get(scratch, 0, length);
            counter++;
            final String word = new String(scratch, 0, length, StandardCharsets.UTF_16LE);
            if (word.length() > 1) {
                System.out.println(word);
            }
        }
        return counter;
    }
}