副标题#e#
开始之前,先看一下从人人网中发现的90后用户爱用的词
是不是很好玩,哈哈。写这篇文章就是让你简单的自动的从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)
项目结构
当然,text.dat和common.dic这两个文件你可以随意替换,注意text.dat中的数据一定要够份量,否则没啥效果
原理么,看下Matrix67大牛的文章你就懂了
互联网时代的社会语言学:基于SNS的文本数据挖掘
训练数据下载
下边开始上代码
common
这个里边包含以下几个类,主要是定义数据结构
CountMap.java
定义一个计数Map来进行数据操作和持久化
package grid.common; import java.io.Serializable; import java.util.HashMap; public class CountMap<T> extends HashMap<T,Integer> implements Serializable { private static final long serialVersionUID = 6097963798841161750L; public void increase(T t) {//添加元素 Integer count = get(t); if (null == count) { put(t,1); } else { put(t,++count); } } public int count() { //计数 int count = 0; for (T t : keySet()) { count += get(t); } return count; } public int get(char c) { Integer count = super.get(c); return null == count ? 0 : count; } }
Node.java
定义语法树的节点
package grid.common; import java.util.ArrayList; import java.util.Collections; import java.util.List; public class Node<T> { protected List<Node<T>> children; protected Node<T> parent; protected T value; Node(T value) { this.value = value; } public Node<T> add(T value) { if (null == children) { children = new ArrayList<Node<T>>(); } Node<T> child = new Node<T>(value); child.setParent(this); children.add(child); return child; } public T getValue() { return value; } public Node<T> getParent() { return parent; } public void setParent(Node<T> parent) { this.parent = parent; } private void recurseChildren(List<Node<T>> list,Node<T> parent) { if (null == parent.children) { list.add(parent); } else { for (Node<T> node : parent.children) { recurseChildren(list,node); } } } public List<Node<T>> getLeaves() { List<Node<T>> list = new ArrayList<Node<T>>(); recurseChildren(list,this); return list; } public List<T> getBranchPath() { List<T> list = new ArrayList<T>(); Node<T> node = this; do { list.add(node.getValue()); node = node.parent; } while (null != node && !(node instanceof Tree<?>)); Collections.reverse(list); return list; } private void append(StringBuilder builder,int deep,Node<T> node) { for (int i = 0; i < deep; i++) { builder.append(" "); } builder.append("|--"); builder.append(node.getValue()); builder.append("\n"); if (null != node.children) { for (Node<T> child : node.children) { append(builder,deep + 1,child); } } } public String dump() { StringBuilder builder = new StringBuilder(); append(builder,0,this); return builder.toString(); } public String toString() { return value.toString(); } }
TextDatReader.java
读取训练数据
package grid.common; import java.io.File; import java.io.FileReader; import java.io.IOException; public class TextDatReader { public static String read(String path) throws IOException { File file = new File(path); FileReader reader = new FileReader(file); char buffer[] = new char[(int) file.length()]; reader.read(buffer); return new String(buffer); } }
TextUtils.java
用来做文本处理,如判断是否为空、匹配字符等
package grid.common; public class TextUtils { public static boolean isCnLetter(char c) { return c >= 0x4E00 && c <= 0x9FCB; } public static boolean isNumeric(char c) { return c >= '0' && c <= '9'; } public static boolean isEnLetter(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } public static boolean match(String src,int off,String dest) { int len = dest.length(); int srcLen = src.length(); for (int i = 0; i < len; i++) { if (srcLen <= off + i) { return false; } if (dest.charAt(i) != src.charAt(off + i)) { return false; } } return true; } public static boolean isBlank(String str) { return null == str || str.isEmpty() || str.trim().isEmpty(); } }
Tree.java
语法树
package grid.common; public class Tree<T> extends Node<T> { public Tree(T value) { super(value); } }
dic
里边包含CnDictionary类
CnDictionary.java
词典处理
package grid.text.dic; import grid.common.CountMap; import grid.common.TextDatReader; import grid.common.TextUtils; import java.io.IOException; import java.util.HashSet; import java.util.Set; public class CnDictionary { private final String COMMON_WORD_DIC_PATH = "common.dic"; /** * This text data is for character statistic. Change to your own if you * like. */ private final String COMMON_LETTER_RESOURCE_PATH = "text.dat"; private Set<String> dictionary = new HashSet<String>(); private CountMap<Character> letterCountMap = new CountMap<Character>(); private int totalLetterCount; private static CnDictionary instance; //单例模式 public static CnDictionary Instance() { if (null == instance) { try { instance = new CnDictionary(); } catch (IOException e) { e.printStackTrace(); } } return instance; } private CnDictionary() throws IOException { initWordDic(); initLetterCountMap(); } private void initLetterCountMap() throws IOException { String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);//读取语料数据 text.dat final int len = letterResource.length(); char c; for (int i = 0; i < len; i++) { c = letterResource.charAt(i); if (TextUtils.isCnLetter(c)) { letterCountMap.increase(c); } } totalLetterCount = letterCountMap.count(); } private void initWordDic() throws IOException { String bytes = TextDatReader.read(COMMON_WORD_DIC_PATH);//读取词典commondic final int len = bytes.length(); String s = ""; char c; for (int i = 0; i < len; i++) { c = bytes.charAt(i); if ('\n' == c || '\r' == c || 0 == c) { if (!TextUtils.isBlank(s)) { dictionary.add(s.trim()); } s = ""; } else { s += c; } if (0 == c) { break; } } } public boolean contains(String word) { return dictionary.contains(word); } public double rate(char c) { return (double) letterCountMap.get(c) / totalLetterCount; } public int size() { return dictionary.size(); } }
evolution
#p#副标题#e##p#分页标题#e#
EntropyJudger.java
计算熵值
package grid.text.evolution; import grid.common.CountMap; import grid.common.TextUtils; import grid.text.index.Pos; import grid.text.index.TextIndexer; public class EntropyJudger { private TextIndexer indexer; /** * A word least appeared count */ private static int LEAST_COUNT_THRESHOLD = 5; //阈值 /** * Threshold for solid rate calculated by word appeared count and every * single letter. * * The smaller this values is,more new words you will get,but with less * accuracy. The greater this value is,less new words you will get,but * with high accuracy. */ private static double SOLID_RATE_THRESHOLD = 0.018; /** * Threshold for entropy value calculated by candidate word prefix character * count and suffix character count * * The smaller this values is,but * with high accuracy. */ private static double ENTROPY_THRESHOL = 1.92; public EntropyJudger(TextIndexer indexer) { this.indexer = indexer; } public boolean judge(String candidate) { double solidRate = getSolidRate(candidate); if (solidRate < SOLID_RATE_THRESHOLD) { return false; } double entropy = getEntropy(candidate); if (entropy < ENTROPY_THRESHOL) { return false; } return true; } private double getEntropy(String candidate) { Pos pos = new Pos(candidate); CountMap<Character> frontCountMap = new CountMap<Character>(); CountMap<Character> backCountMap = new CountMap<Character>(); final int candidateLen = candidate.length(); int off = 0; char c; double rate,frontEntropy = 0,backEntropy = 0; while (indexer.find(pos).isFound()) { off = pos.getPos(); c = indexer.charAt(off - 1); if (TextUtils.isCnLetter(c)) { frontCountMap.increase(c); } c = indexer.charAt(off + candidateLen); if (TextUtils.isCnLetter(c)) { backCountMap.increase(c); } } for (char key : frontCountMap.keySet()) { rate = (double) frontCountMap.get(key) / frontCountMap.count(); frontEntropy -= rate * Math.log(rate); } for (char key : backCountMap.keySet()) { rate = (double) backCountMap.get(key) / backCountMap.count(); backEntropy -= rate * Math.log(rate); } return frontEntropy > backEntropy ? backEntropy : frontEntropy; } /** * @param candidate * @return */ public double getSolidRate(String candidate) { final int candidateLen = candidate.length(); if (candidateLen < 2) { return 1; } final int count = indexer.count(candidate); double rate = 1; if (count < LEAST_COUNT_THRESHOLD) { return 0; } for (int i = 0; i < candidateLen; i++) { rate *= (double) count / indexer.count("" + candidate.charAt(i)); } return Math.pow(rate,1D / candidateLen) * Math.sqrt(candidateLen); } public void setIndexer(TextIndexer indexer) { this.indexer = indexer; } }
NewWordDiscover.java
抽词程序
package grid.text.evolution; import grid.common.TextUtils; import grid.text.dic.CnDictionary; import grid.text.index.CnPreviewTextIndexer; import grid.text.index.TextIndexer; import grid.text.selector.CnTextSelector; import grid.text.selector.TextSelector; import java.util.HashSet; import java.util.Set; public class NewWordDiscover { private CnDictionary dictionary; /** * Minimum word length */ private final static int MIN_CANDIDATE_LEN = 2; /** * Maximum word length */ private final static int MAX_CANDIDATE_LEN = 6; private static Set<Character> structuralLetterSet = new HashSet<Character>(); private static char[] structuralLetters = { '我','你','您','他','她','谁','哪','那','这','的','了','着','也','是','有','不','在','与','呢','啊','呀','吧','嗯','哦','哈','呐' }; static { for (char c : structuralLetters) { structuralLetterSet.add(c); } } public NewWordDiscover() { dictionary = CnDictionary.Instance(); } /** * New word discover is based on statistic and entropy,better to sure * document size is in 100kb level,or you may get a unsatisfied result. * * @param document * @return */ public Set<String> discover(String document) { Set<String> set = new HashSet<String>(); TextIndexer indexer = new CnPreviewTextIndexer(document); TextSelector selector = new CnTextSelector(document,MIN_CANDIDATE_LEN,MAX_CANDIDATE_LEN); EntropyJudger judger = new EntropyJudger(indexer); String candidate; while (!selector.end()) { candidate = selector.next(); if (TextUtils.isBlank(candidate)) { continue; } if (structuralLetterSet.contains(candidate.charAt(0)) || structuralLetterSet.contains(candidate.charAt(candidate .length() - 1))) { continue; } // Replace IF clause with "set.contains(candidate)" if you want to // find new word without any dictionary if (dictionary.contains(candidate) || set.contains(candidate)) { selector.select(); } else if (judger.judge(candidate)) { set.add(candidate); } } return set; } }
index
这几个类用于给词创建索引,方便从词典中找出
CnPreviewTextIndexer.java
package grid.text.index; import grid.common.TextUtils; import java.util.HashMap; import java.util.Map; import java.util.Vector; public class CnPreviewTextIndexer implements TextIndexer { private final static int CN_LETTER_COUNT = 5021; private String document; private Map<Character,Vector<Integer>> posMap; public CnPreviewTextIndexer(String document) { this.document = document; init(); } private void init() { final int len = document.length(); final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT + 1); char c; Vector<Integer> posVector; posMap = new HashMap<Character,Vector<Integer>>(CN_LETTER_COUNT); for (int i = 0; i < len; i++) { c = document.charAt(i); if (!TextUtils.isCnLetter(c)) { continue; } posVector = posMap.get(c); if (null == posVector) { posVector = new Vector<Integer>(supposedMinCount); posMap.put(c,posVector); } posVector.add(i); } } @Override public int count(String text) { if (TextUtils.isBlank(text)) { return 0; } Vector<Integer> vector = posMap.get(text.charAt(0)); if (null == vector) { return 0; } if (1 == text.length()) { return vector.size(); } final int size = vector.size(); int count = 0; for (int i = 0; i < size; i++) { if (TextUtils.match(document,vector.get(i),text)) { count++; } } return count; } @Override public Pos find(Pos pos) { String text = pos.getTarget(); pos.setFound(false); if (TextUtils.isBlank(text)) { return pos; } Vector<Integer> vector = posMap.get(text.charAt(0)); if (null == vector) { return pos; } final int arraySize = vector.size(); final int arrayIndex = pos.arrayIndex + 1; for (int i = arrayIndex; i < arraySize; i++) { if (TextUtils.match(document,text)) { pos.setFound(true); pos.setPos(vector.get(i)); pos.arrayIndex = i; break; } } return pos; } @Override public int len() { return document.length(); } @Override public String sub(int off,int len) { if (off < 0 || off + len >= document.length()) { return ""; } return document.substring(off,off + len); } @Override public char charAt(int index) { if (index < 0 || index >= document.length()) { return 0; } return document.charAt(index); } }
Pos.java
package grid.text.index; public class Pos { private String target; /** * Pos for current matched full target text */ private int pos = -1; /** * Index in position array for current matched full target text */ int arrayIndex = -1; private boolean found = false; public Pos(String target) { this.target = target; } public String getTarget() { return target; } public int getPos() { return pos; } public boolean isFound() { return found; } void setPos(int pos) { this.pos = pos; } void setFound(boolean found) { this.found = found; } }
SimpleTextIndexer.java
package grid.text.index; public class SimpleTextIndexer implements TextIndexer { private String document; public SimpleTextIndexer(String document) { this.document = document; } @Override public int count(String text) { int off = 0; int count = 0; final int len = text.length(); while ((off = document.indexOf(text,off)) > -1) { count++; off += len; } return count; } @Override public Pos find(Pos pos) { final String text = pos.getTarget(); final int len = text.length(); int off = pos.getPos() + len; if (pos.getPos() < 0) off = 0; pos.setFound(false); if ((off = document.indexOf(text,off)) > -1) { pos.setFound(true); pos.setPos(off); } return pos; } @Override public int len() { return document.length(); } @Override public String sub(int off,int len) { return document.substring(off,off + len); } @Override public char charAt(int index) { if (index < 0 || index >= document.length()) { return 0; } return document.charAt(index); } }
TextIndexer.java
package grid.text.index; public interface TextIndexer { /** * @param text * @return count for specific text */ public int count(String text); /** * @param pos * @return next position for current pos */ public Pos find(Pos pos); /** * @return original document length */ public int len(); /** * @param off * @param len * @return the sub string start from <b>off</b> and with a length with * <b>len</b> */ public String sub(int off,int len); /** * @param index * @return return the character in the specified index */ public char charAt(int index); }
participle
#p#副标题#e##p#分页标题#e#
分词处理,具体看实现
Chunk.java
package grid.text.participle; import grid.text.dic.CnDictionary; import java.util.List; public class Chunk implements Comparable<Chunk> { private List<String> list; private int len = 0; private double avg = 0; private double variance = 0; public Chunk(List<String> list) { this.list = list; init(); } private void init() { for (String s : list) { len += s.length(); } avg = (double) len / list.size(); for (String s : list) { variance += Math.pow(avg - s.length(),2); } variance = Math.sqrt(variance); } public int getLen() { return len; } public double getAvg() { return avg; } public double getVariance() { return variance; } public String getHead() { if (null == list || list.isEmpty()) { return ""; } return list.get(0); } private int compareDouble(double d1,double d2) { if (d1 - d2 < -0.0000001D) { return 1; } else if (d1 - d2 > 0.0000001D) { return -1; } return 0; } @Override public int compareTo(Chunk o) { if (len != o.len) { return o.len - len; } int d = compareDouble(avg,o.avg); if (0 != d) { return d; } d = compareDouble(variance,o.variance); if (0 != d) { return d; } CnDictionary dictionary = CnDictionary.Instance(); double rateSrc = 0,rateDest = 0; for (String s : list) { if (1 == s.length()) { rateSrc += dictionary.rate(s.charAt(0)); } } for (String s : o.list) { if (1 == s.length()) { rateDest += dictionary.rate(s.charAt(0)); } } return compareDouble(rateSrc,rateDest); } public String toString() { return list.toString(); } }
ChunkStream.java
package grid.text.participle; import grid.common.Node; import grid.common.TextUtils; import grid.common.Tree; import grid.text.dic.CnDictionary; import java.util.ArrayList; import java.util.Collections; import java.util.List; public class ChunkStream { /** * Define the max supposed word length * * You could shorten the value if you don't need too long participle result */ private static final int MAX_WORD_LEN = 7; /** * Define the predict level while execute participle. * * Negligible accuracy will be promoted if you increase this value */ private static final int PREDICT_LEVEL = 3; private static CnDictionary dictionary = CnDictionary.Instance(); public String next(String text,int off) { Tree<String> root = new Tree<String>("ROOT"); recurse(root,off,text,0); List<Node<String>> list = root.getLeaves(); List<Chunk> chunkList = new ArrayList<Chunk>(); for (Node<String> node : list) { chunkList.add(new Chunk(node.getBranchPath())); } Collections.sort(chunkList); return chunkList.get(0).getHead(); } private void recurse(Node<String> node,String text,int predictDeep) { int len = MAX_WORD_LEN + off > text.length() ? text.length() - off : MAX_WORD_LEN; while (predictDeep < PREDICT_LEVEL) { if (len < 1) { return; } String s = text.substring(off,off + len); if (len < 2) { if (!TextUtils.isCnLetter(text.charAt(off))) { break; } recurse(node.add(s),off + 1,predictDeep + 1); } else if (dictionary.contains(s)) { recurse(node.add(s),off + s.length(),predictDeep + 1); } len--; } } }
MechanicalParticiple.java
package grid.text.participle; import grid.common.TextUtils; import java.util.Vector; public class MechanicalParticiple { public Vector<String> partition(String document) { Vector<String> vector = new Vector<String>(); final int docLen = document.length(); int off = 0; char c; String seg = ""; ChunkStream stream = new ChunkStream(); while (off < docLen) { c = document.charAt(off); if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) { seg += c; off++; } else if (TextUtils.isCnLetter(c)) { if (!TextUtils.isBlank(seg)) { vector.add(seg); seg = ""; } String word = stream.next(document,off); if (!TextUtils.isBlank(word)) { vector.add(word); off += word.length(); } } else { if (!TextUtils.isBlank(seg)) { vector.add(seg); seg = ""; } /** * TODO: Uncomment the "ELSE IF" clause if you would like to * reserve punctuations */ // else if (!TextUtils.isBlank("" + c)) { vector.add("" + c); } off++; } } if (!TextUtils.isBlank(seg)) { vector.add(seg); } return vector; } }
selector
#p#副标题#e##p#分页标题#e#
文本选择器,筛选出可能为新词的词汇
CnTextSelector.java
package grid.text.selector; import grid.common.TextUtils; public class CnTextSelector extends CommonTextSelector { public CnTextSelector(String document,int minSelectLen,int maxSelectLen) { super(document,minSelectLen,maxSelectLen); } protected void adjustCurLen() { while (pos < docLen && !TextUtils.isCnLetter(document.charAt(pos))) { pos++; } for (int i = 0; i < maxSelectLen && pos + i < docLen; i++) { if (!TextUtils.isCnLetter(document.charAt(pos + i))) { curLen = i; if (curLen < minSelectLen) { pos++; adjustCurLen(); } return; } } curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen; } }
CommonTextSelector.java
package grid.text.selector; public class CommonTextSelector implements TextSelector { protected String document; protected int pos = 0; protected int maxSelectLen = 5; protected int minSelectLen = 2; protected int curLen; protected final int docLen; public CommonTextSelector(String document,int maxSelectLen) { this.document = document; this.minSelectLen = minSelectLen; this.maxSelectLen = maxSelectLen; docLen = document.length(); adjustCurLen(); } public void select() { pos += ++curLen; adjustCurLen(); } protected void adjustCurLen() { curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen; } public String next() { if (curLen < minSelectLen) { pos++; adjustCurLen(); } if (pos + curLen <= docLen && curLen >= minSelectLen) { return document.substring(pos,pos + curLen--); } else { curLen--; // return document.substring(pos,docLen); return ""; } } public boolean end() { return curLen < minSelectLen && curLen + pos >= docLen - 1; } @Override public int getCurPos() { return pos; } }
TextSelector.java
package grid.text.selector; public interface TextSelector { public boolean end(); public void select(); public String next(); public int getCurPos(); }
测试代码
NewWordDiscoverTest.java
package grid.test; import grid.common.TextDatReader; import grid.text.evolution.NewWordDiscover; import java.io.IOException; import java.util.Set; public class NewWordDiscoverTest { private final static String path = "text.dat"; public static void main(String args[]) throws IOException { // Replace your document here String document = TextDatReader.read(path); NewWordDiscover discover = new NewWordDiscover(); long start = System.currentTimeMillis(); Set<String> words = discover.discover(document); System.out.println("Speed: " + (double) document.length() / (System.currentTimeMillis() - start) * 1000); System.out.println("New words size: " + words.size()); System.out.println("New word is: "+"\n"); for (String str : words) { System.out.println(str+"\n"); } } }
抽词测试,结果如下
#p#副标题#e##p#分页标题#e#
ParticipleTest.java
package grid.test; import grid.text.participle.MechanicalParticiple; import java.util.Vector; public class ParticipleTest { private static String document = "我是中国人"; public static void main(String args[]) { MechanicalParticiple participle = new MechanicalParticiple(); Vector<String> vec = participle.partition(document); System.out.println(vec); } }
分词测试,结果如下
怎么样,很酷吧,你还可以试着用《天龙八部》数据集玩下,看看主角是不是乔帮主。如果发现了什么新鲜词,请告诉博主,咱也不落后哈!