Java中文情感分类(or 文本分类):Alink中文情感分析、HanLP中文情感分析、SparkML中文情感分析
⽂章⽬录
一、项目目录
u.alink:Alink 中⽂情感分析
u.bayes:在 代码基础上,略作修改后的贝叶斯情感分类,效果不太好,不予介绍
u.hanlp:HanLP 中⽂情感分析
u.sparkml:SparkML 中⽂情感分析,待定
u.zoom.data:中⽂情感分析(⽂本分类)使⽤的数据集
中⽂情感挖掘语料-ChnSentiCorp(谭松波)
搜狗⽂本分类语料库迷你版
微博评论情感数据集:weibo_senti_100k.csv
model:保存的情感分析模型
⼆、Alink 中⽂情感分析:微博评论情感分析
package u.alink;

// NOTE(review): several import paths below were mangled by text extraction
// ("import s.ClassifierConstant;", "org.pes.Row"). org.apache.flink.types.Row
// is the Row type Alink operators use; ClassifierConstant's real package
// could not be recovered from this chunk — confirm against the project.
import s.ClassifierConstant;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.batch.source.CsvSourceBatchOp;
import com.alibaba.alink.operator.batch.source.TextSourceBatchOp;
import com.alibaba.alink.pipeline.LocalPredictor;
import com.alibaba.alink.pipeline.Pipeline;
import com.alibaba.alink.pipeline.PipelineModel;
import com.alibaba.alink.pipeline.classification.LogisticRegression;
import com.alibaba.alink.pipeline.classification.NaiveBayesTextClassifier;
import com.alibaba.alink.pipeline.dataproc.Imputer;
import com.alibaba.alink.pipeline.nlp.DocCountVectorizer;
import com.alibaba.alink.pipeline.nlp.Segment;
import com.alibaba.alink.pipeline.nlp.StopWordsRemover;
import org.apache.flink.types.Row;

import java.io.File;
import java.util.List;

/**
 * Weibo comment sentiment classifier built on Alink batch pipelines.
 *
 * <p>Pipeline: impute missing review text → segment (分词) → remove stop
 * words → TF document-count vectorize → classify (Naive Bayes or Logistic
 * Regression). The fitted {@link PipelineModel} is cached in a static field
 * and persisted to disk so later runs can skip training.
 *
 * @author 32098
 */
public class CommentClassifier {
    /** Cached fitted model; lazily initialized by the init* methods. */
    private static PipelineModel pipelineModel;

    /**
     * Load the saved Naive Bayes pipeline model, or train and save one
     * from the Weibo comment dataset when no saved model exists.
     */
    public static void initNaiveBayesModel() {
        // BUG FIX: the original loaded from WEIBO_LR_MODEL_PATH but saved to
        // WEIBO_NB_MODEL_PATH, so a saved NB model was never found again.
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_NB_MODEL_PATH);
        if (pipelineModel == null) {
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // 缺失值填充: fill missing review text with the literal "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText")
                            .setStrategy("value").setFillValue("null"),
                    // 分词操作 (word segmentation)
                    new Segment().setSelectedCol("featureText"),
                    // 去除停用词 (stop-word removal)
                    new StopWordsRemover().setSelectedCol("featureText"),
                    // TF, Term Frequency: 词频, the feature-vector type
                    new DocCountVectorizer().setFeatureType("TF")
                            .setSelectedCol("featureText").setOutputCol("featureVector"),
                    new NaiveBayesTextClassifier().setVectorCol("featureVector")
                            .setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            pipelineModel.save(ClassifierConstant.WEIBO_NB_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; it is actually
                // written out when the batch job is executed.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    /**
     * Load the saved Logistic Regression pipeline model, or train and save
     * one from the Weibo comment dataset when no saved model exists.
     */
    public static void initLogisticRegressionModel() {
        pipelineModel = PipelineModel.load(ClassifierConstant.WEIBO_LR_MODEL_PATH);
        if (pipelineModel == null) {
            System.out.println("开始构建模型...");
            BatchOperator<?> sourceBatchOp = getCommentSourceOp();
            Pipeline pipeline = new Pipeline(
                    // 缺失值填充: fill missing review text with the literal "null"
                    new Imputer().setSelectedCols("review").setOutputCols("featureText")
                            .setStrategy("value").setFillValue("null"),
                    // 分词操作 (word segmentation)
                    new Segment().setSelectedCol("featureText"),
                    // 去除停用词 (stop-word removal)
                    new StopWordsRemover().setSelectedCol("featureText"),
                    // TF, Term Frequency: 词频, the feature-vector type
                    new DocCountVectorizer().setFeatureType("TF")
                            .setSelectedCol("featureText").setOutputCol("featureVector"),
                    new LogisticRegression().setVectorCol("featureVector")
                            .setLabelCol("label").setPredictionCol("pred")
            );
            pipelineModel = pipeline.fit(sourceBatchOp);
            // BUG FIX: the original saved the LR model to WEIBO_NB_MODEL_PATH,
            // clobbering the Naive Bayes model and never matching the load path.
            pipelineModel.save(ClassifierConstant.WEIBO_LR_MODEL_PATH);
            try {
                // save() only attaches the model to a sink; it is actually
                // written out when the batch job is executed.
                BatchOperator.execute();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        System.out.println("模型构建成功!");
    }

    /**
     * Build the CSV source for the Weibo comment dataset.
     * Schema: {@code label int, review string}; the header line is skipped.
     */
    private static BatchOperator<?> getCommentSourceOp() {
        return new CsvSourceBatchOp()
                .setFilePath(ClassifierConstant.DATASET_WEIBO_PATH)
                .setSchemaStr("label int, review string")
                .setIgnoreFirstLine(true);
    }

    /**
     * Classify a single piece of text.
     *
     * @param text the raw comment text
     * @return the predicted label as a string (field index 3 of the
     *         prediction row), or {@code null} when prediction fails
     */
    public static String getClassification(String text) {
        if (pipelineModel == null) {
            initNaiveBayesModel();
        }
        try {
            LocalPredictor localPredictor = pipelineModel.collectLocalPredictor("review string");
            Row row = Row.of(text);
            return String.valueOf(localPredictor.map(row).getField(3));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void main(String[] args) throws Exception {
        System.out.println(getClassification("你真好"));
        System.out.println(getClassification("哇哦今年的春夏季衣服不错诶"));

        // NOTE(review): the exact data file names were lost in extraction —
        // confirm the negative/positive sample file names against the project.
        TextSourceBatchOp negSourceOp = new TextSourceBatchOp()
                .setFilePath((System.getProperty("user.dir")
                        + "/src/main/java/cn/edu/neu/zoom/data/neg.txt").replace("/", File.separator))
                .setTextCol("review");
        TextSourceBatchOp posSourceOp = new TextSourceBatchOp()
                .setFilePath((System.getProperty("user.dir")
                        + "/src/main/java/cn/edu/neu/zoom/data/pos.txt").replace("/", File.separator))
                .setTextCol("review");

        List<Row> negRows = negSourceOp.collect();
        List<Row> posRows = posSourceOp.collect();

        int acc = 0;
        for (Row negRow : negRows) {
            // expected to be 0 (negative)
            String text = getClassification((String) negRow.getField(0));
            System.out.println(text);
            if ("0".equals(text)) {
                acc += 1;
            }
        }
        for (Row posRow : posRows) {
            // expected to be 1 (positive)
            String text = getClassification((String) posRow.getField(0));
            System.out.println(text);
            // BUG FIX: the original compared against "0" here, crediting
            // misclassified positive samples and inflating the accuracy.
            if ("1".equals(text)) {
                acc += 1;
            }
        }
        System.out.println("Acc: " + (double) acc / (negRows.size() + posRows.size()));
    }
}
这个分类感觉有点慢
三、HanLP 中⽂情感分析
通过 HanLP 的 NaiveBayesClassifier 与 HanLPTokenizer 实现的微博评论情感分析、酒店评论情感分析、文本分类:

package u.hanlp;
// NOTE(review): import paths restored from extraction damage; HanLP text
// classification classes live under com.hankcs.hanlp.classification.*.
// ClassifierConstant's real package could not be recovered — confirm.
import s.ClassifierConstant;
import com.hankcs.hanlp.classification.classifiers.AbstractClassifier;
import com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier;
import com.hankcs.hanlp.classification.corpus.FileDataSet;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.models.NaiveBayesModel;
import com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer;

import java.io.*;
import java.util.Map;
/**
* @author 32098
*/
public class HanLpClassifier {
private static AbstractClassifier classifier = null;
/**
*
* @param dataPath 数据路径
* @param modelPath 模型路径
*/
public static void initClassifier(String dataPath, String modelPath){
AbstractModel model =loadModel(modelPath);
if(model==null){
System.out.println("No model find, begin train model!");
IDataSet dataSet = null;
try{
System.out.println(dataPath);
File f =new File(dataPath);
if(f.isFile()){
BufferedReader reader =new BufferedReader(new FileReader(dataPath));
String str;
dataSet =new FileDataSet().tTokenizer(new HanLPTokenizer());
System.out.println("Prepare datat!");
// ignore first line
str = adLine();
blow是什么意思while((adLine())!=null){
dataSet.add(str.substring(0,1), str.substring(2));
}
}el{
dataSet =new FileDataSet().tTokenizer(new HanLPTokenizer()).load(dataPath,"UTF-8");
}
System.out.println("Datat prepared!");
}catch(IOException e){
e.printStackTrace();
}
classifier =new NaiveBayesClassifier();
model = Model();
英语 作文
saveModel(modelPath, model);
}el{
System.out.println("NaiveBayesModel init succeeded!");
classifier =new NaiveBayesClassifier((NaiveBayesModel) model);
}
}
private static void saveModel(String modelPath, AbstractModel model){
try(ObjectOutputStream oos =new ObjectOutputStream(new FileOutputStream(modelPath))){
oos.writeObject(model);
System.out.println("Save NaiveBayesModel Succeeded!");
}catch(Exception e){
}