java热词搜索_java使用Nagao算法实现新词发现、热门词的挖掘

更新时间:2023-08-01 22:20:15 阅读: 评论:0

java热词搜索_java使⽤Nagao算法实现新词发现、热门词的挖
采⽤Nagao算法统计各个⼦字符串的频次,然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息(内部凝聚度)。
名词解释:
词频:该字符串在⽂档中出现的次数。出现次数越多越重要。
左右邻个数:⽂档中该字符串的左边和右边出现的不同的字的个数。左右邻越多,说明字符串成词概率越⾼。
左右熵:⽂档中该字符串的左边和右边出现的不同的字的数量分布的熵。类似上⾯的指标,有⼀定区别。
交互信息:每次将某字符串分成两部分,左半部分字符串和右半部分字符串,计算其同时出现的概率除于其各⾃独⽴出现的概率,最后取所有的划分⾥⾯概率最⼩值。这个值越⼤,说明字符串内部凝聚度越⾼,越可能成词。
算法具体流程:
1.  将输⼊⽂件逐⾏读⼊,按照⾮汉字([^\u4E00-\u9FA5]+)以及停词“的很了么呢是嘛个都也⽐还这于不与才上⽤就好在和对挺去后没说”,
分成⼀个个字符串,代码如下:
String[] phras = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
停⽤词可以修改。
2.  获取所有切分后的字符串的左⼦串和右⼦串,分别加⼊左、右PTable
3.  对PTable排序,并计算LTable。LTable记录的是,排序后的PTable中,下⼀个⼦串同上⼀个⼦串具有相同字符的数量
4.  遍历PTable和LTable,即可得到所有⼦字符串的词频、左右邻
5.  根据所有⼦字符串的词频、左右邻结果,输出字符串的词频、左右邻个数、左右熵、交互信息
1.  NagaoAlgorithm.java
package com.algo.word;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class NagaoAlgorithm {
private int N;
private List leftPTable;
private int[] leftLTable;
private List rightPTable;
private int[] rightLTable;
酒的品牌排行榜
private double wordNumber;
private Map wordTFNeighbor;
private final static String stopwords = "的很了么呢是嘛个都也⽐还这于不与才上⽤就好在和对挺去后没说"; private NagaoAlgorithm(){
//default N = 5
N = 5;
leftPTable = new ArrayList();
rightPTable = new ArrayList();
wordTFNeighbor = new HashMap();
}
//rever phra
private String rever(String phra) {
StringBuilder reverPhra = new StringBuilder();
for (int i = phra.length() - 1; i >= 0; i--)
reverPhra.append(phra.charAt(i));
考驾照色盲测试图String();
}
//co-prefix length of s1 and s2
private int coPrefixLength(String s1, String s2){
int coPrefixLength = 0;
for(int i = 0; i < Math.min(s1.length(), s2.length()); i++){
if(s1.charAt(i) == s2.charAt(i)) coPrefixLength++;
el break;
}
return coPrefixLength;
}
//add substring of line to pTable
private void addToPTable(String line){
//split line according to concutive none Chine character
String[] phras = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
for(String phra : phras){
for(int i = 0; i < phra.length(); i++)
rightPTable.add(phra.substring(i));
String reverPhra = rever(phra);
for(int i = 0; i < reverPhra.length(); i++)
leftPTable.add(reverPhra.substring(i));
wordNumber += phra.length();
}
}
//count lTable
private void countLTable(){
Collections.sort(rightPTable);
rightLTable = new int[rightPTable.size()];
for(int i = 1; i < rightPTable.size(); i++)
rightLTable[i] = (i-1), (i));
Collections.sort(leftPTable);
leftLTable = new int[leftPTable.size()];
for(int i = 1; i < leftPTable.size(); i++)
leftLTable[i] = (i-1), (i));
System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable"); }
//according to pTable and lTable, count statistical result: TF, neighbor distribution
private void countTFNeighbor(){
//get TF and right neighbor
for(int pIndex = 0; pIndex < rightPTable.size(); pIndex++){
String phra = (pIndex);
for(int length = 1 + rightLTable[pIndex]; length <= N && length <= phra.length(); length++){
String word = phra.substring(0, length);
TFNeighbor tfNeighbor = new TFNeighbor();
tfNeighbor.incrementTF();
if(phra.length() > length)
tfNeighbor.addToRightNeighbor(phra.charAt(length));
白痴的英文for(int lIndex = pIndex+1; lIndex < rightLTable.length; lIndex++){
if(rightLTable[lIndex] >= length){
议论文阅读tfNeighbor.incrementTF();
String coPhra = (lIndex);
if(coPhra.length() > length)
tfNeighbor.addToRightNeighbor(coPhra.charAt(length));
}
el break;
}
wordTFNeighbor.put(word, tfNeighbor);
}
}
//get left neighbor
for(int pIndex = 0; pIndex < leftPTable.size(); pIndex++){
String phra = (pIndex);
天堂与地狱之间for(int length = 1 + leftLTable[pIndex]; length <= N && length <= phra.length(); length++){ String word = rever(phra.substring(0, length));
TFNeighbor tfNeighbor = (word);
if(phra.length() > length)
tfNeighbor.addToLeftNeighbor(phra.charAt(length));
for(int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++){
if(leftLTable[lIndex] >= length){
String coPhra = (lIndex);
if(coPhra.length() > length)
tfNeighbor.addToLeftNeighbor(coPhra.charAt(length));
}
el break;
}
}
}
排气管放炮
System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");
}
//according to wordTFNeighbor, count MI of word
private double countMI(String word){
if(word.length() <= 1) return 0;ppt动画大师
double coProbability = (word).getTF()/wordNumber;
List mi = new ArrayList(word.length());
for(int pos = 1; pos < word.length(); pos++){
String leftPart = word.substring(0, pos);
String rightPart = word.substring(pos);
double leftProbability = (leftPart).getTF()/wordNumber; double rightProbability =
<(rightPart).getTF()/wordNumber; mi.add(coProbability/(leftProbability*rightProbability));
}
return Collections.min(mi);
}
//save TF, (left and right) neighbor number, neighbor entropy, mutual information private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){ try {
//read stop words file
Set stopWords = new HashSet();
BufferedReader br = new BufferedReader(new FileReader(stopList));
String line;
while((line = br.readLine()) != null){
if(line.length() > 1)
stopWords.add(line);
}
br.clo();
//output words TF, neighbor info, MI
BufferedWriter bw = new BufferedWriter(new FileWriter(out));
for(Map.Entry entry : Set()){
if( Key().length() <= 1 || Key()) ) continue; TFNeighbor tfNeighbor = Value();
int tf, leftNeighborNumber, rightNeighborNumber;
double mi;
tf = TF();
西河柳leftNeighborNumber = LeftNeighborNumber();

本文发布于:2023-08-01 22:20:15,感谢您对本站的认可!

本文链接:https://www.wtabcd.cn/fanwen/fan/82/1126144.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:字符串   词频   部分   出现   算法
相关文章
留言与评论(共有 0 条评论)
   
验证码:
推荐文章
排行榜
Copyright ©2019-2022 Comsenz Inc.Powered by © 专利检索| 网站地图