改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx
《改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx》由会员分享,可在线阅读,更多相关《改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx(64页珍藏版)》请在冰点文库上搜索。
importcode.*;
publicclasssegment{
publicstaticHashMap<
String,Integer>
wordInt=newHashMap<
();
publicstaticintwordCnt[][]=newint[20000][11];
publicstaticString[]cntWord=newString[20000];
publicstaticint[]wordValid=newint[20000];
publicstaticintA[][]=newint[20000][11];
publicstaticintB[][]=newint[20000][11];
publicstaticintC[][]=newint[20000][11];
publicstaticintD[][]=newint[20000][11];
publicstaticintcateWordSum[]=newint[20000];
publicstaticdoublechi[][]=newdouble[20000][11];
publicstaticdoublechiAvg[]=newdouble[20000];
publicstaticintnum=0;
wordDic=newHashMap<
publicstaticHashSet<
String>
name=newHashSet<
place=newHashSet<
organization=newHashSet<
time=newHashSet<
stopWord=newHashSet<
publicstaticFileTDIR=newFile("
D:
/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料"
);
publicstaticString[]CATEGORIES={"
雅安地震"
"
新疆巴楚恐怖事件"
禽流感H7N9疫情"
台湾渔船菲律宾射杀"
波士顿爆炸案"
黄浦江死猪"
发改委副主任落马"
四川泸县瓦斯爆炸"
南方暴雨"
内蒙古通辽地震"
};
/**
 * Writes the per-category counts of the valid words to the file named by the
 * literal fragment "wordChi1.txt".
 *
 * Visible behaviour: an existing output file is deleted and a new one created;
 * a BufferedWriter is opened over it with the charset literal shown below; for
 * every index i below num whose wordValid[i] is not 0, cntWord[i] is written,
 * followed by wordCnt[i][j] for j from 0 to 9. An IOException is reported with
 * printStackTrace(), and a completion message is printed to System.out.
 *
 * NOTE(review): this text looks like a whitespace-stripped extract in which
 * several lines (closing braces, statement terminators, parts of the string
 * literals) are missing, so the exact separators written between values cannot
 * be read from here - confirm against the original source.
 */
publicstaticvoidoutFile()
{
Fileout=newFile("
wordChi1.txt"
if(out.exists()){
out.delete();
}
try{
if(out.createNewFile())
OutputStreamWriteroutputW=newOutputStreamWriter(newFileOutputStream(out),"
GBk"
BufferedWriteroutput=newBufferedWriter(outputW);
for(inti=0;
i<
num;
i++)
if(wordValid[i]==0)continue;
output.write(cntWord[i]+"
\r\n"
for(intj=0;
j<
10;
j++)
output.write(String.valueOf(wordCnt[i][j]));
output.write("
//output.write("
output.close();
}catch(IOExceptione){
//TODOAuto-generatedcatchblock
e.printStackTrace();
System.out.println("
输出文件wordChi1.txt完毕!
"
/**
 * Output routine whose visible fragments name the file "wordCnt.txt", use a
 * bound of 11 and print a completion message for wordCnt.txt.
 *
 * NOTE(review): only these fragments of the body are present in this extract;
 * presumably the rest mirrors outFile with all 11 columns of wordCnt written -
 * confirm against the original source.
 */
publicstaticvoidoutputFile()
wordCnt.txt"
"
11;
输出文件wordCnt.txt完毕!
/**
 * Loads a stop-word list: every line read from the given file is added to a
 * fresh HashSet, which is returned.
 *
 * When the path does not exist or is not a regular file, nothing is read and
 * the set is returned as created. An IOException is reported on System.err.
 *
 * NOTE(review): the declaration's modifiers/return type, the reader's charset
 * literal and several closing braces are missing from this extract.
 *
 * @param filename path of the stop-word file
 * @return the set tempStopWord holding the lines that were read
 */
CreateStopWord(Stringfilename)
HashSet<
tempStopWord=newHashSet<
Filefile=newFile(filename);
if(file.exists()&
&
file.isFile()){
try{
InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"
BufferedReaderinput=newBufferedReader(read);
Stringtext;
while((text=input.readLine())!
=null)//from the third line to the end of the file is the body text (NOTE(review): no earlier readLine call is visible in this method - the remark may have been copied from a sibling reader)
tempStopWord.add(text);
input.close();
}
catch(IOExceptionioException){
System.err.println("
FileError!
returntempStopWord;
/**
 * Returns the title of a news file. Per the inline remarks, the first line of
 * the file is the time and the second line is the title; the line last read is
 * copied into title, which is returned.
 *
 * NOTE(review): the code that opens the file and the second readLine call are
 * not present in this extract - presumably they match CreateStopWord; confirm.
 *
 * @param filename path of the news file
 * @return the value of title
 */
publicstaticStringReturnTitle(Stringfilename)
Stringtitle="
;
text=input.readLine();
//the first line is the time
//the second line is the title
title=text;
returntitle;
/**
 * Returns text accumulated from a news file: each piece of text read is
 * appended to content, which is returned.
 *
 * NOTE(review): the file-opening code, any header-line skipping and the read
 * loop are not present in this extract - confirm against the original source.
 *
 * @param filename path of the news file
 * @return the accumulated content string
 */
publicstaticStringReturnContent(Stringfilename)
Stringcontent="
content+=text;
returncontent;
/**
 * Reader used by main to obtain the text of one news file.
 *
 * NOTE(review): only the assignment of the current line to content is visible
 * in this extract; judging by the name, the result presumably starts with the
 * title line and then appends the body - confirm against the original source.
 *
 * @param filename path of the news file
 */
publicstaticStringReturnTitleandContent(Stringfilename)
content=text;
/**
 * Reader variant that appends one further line to the text already read; the
 * inline remark identifies that line as the first paragraph of the body.
 *
 * NOTE(review): the rest of the body is not present in this extract. Also note
 * that "text += input.readLine()" appends the characters "null" when the file
 * has no further line - confirm whether callers rely on that.
 *
 * @param filename path of the news file
 */
publicstaticStringReturnTitleandFirstParagraph(Stringfilename)
text+=input.readLine();
//the third line is the first paragraph of the body
privatestaticString[]spitByWhiteChar(Stringpara){
if(para!
=null)
returnpara.split("
[\\s ]"
//returnpara.split("
else
returnnull;
/**
*@paramargs
*/
publicstaticvoidmain(String[]args){
//TODOAuto-generatedmethodstub
ICTCLAS30testICTCLAS30=newICTCLAS30();
Stringargu="
."
if(testICTCLAS30.ICTCLAS_Init(argu.getBytes("
GB2312"
))==false){
InitFail!
return;
}catch(UnsupportedEncodingExceptione1){
e1.printStackTrace();
/*
 * 设置词性标注集
 * ID    代表词性集
 * 1     计算所一级标注集
 * 0     计算所二级标注集
 * 2     北大二级标注集
 * 3     北大一级标注集
 */
testICTCLAS30.ICTCLAS_SetPOSmap
(1);
for(inti=0;
i<
CATEGORIES.length;
i++)
{
//System.out.println(CATEGORIES[i]);
FileclassDir=newFile(TDIR,CATEGORIES[i]);
if(!
classDir.isDirectory()){
不能找到目录="
+classDir);
//遍历
for(Filefile:
classDir.listFiles())
Stringfilepath="
/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料/"
+CATEGORIES[i]+"
/"
+file.getName();
//System.out.println(filepath);
Stringnews=ReturnTitleandContent(filepath);
//Stringnews=ReturnTitleandFirstParagraph(filepath);
//Stringnews=ReturnTitle(filepath);
news=news.trim();
//System.out.println(news);
//ȥ��
newChineseCheck().CheckChinese(news))
continue;
news=newChangeCode().Change(news);
//分词
if(news==null)
bytenativeBytes[];
nativeBytes=testICTCLAS30.ICTCLAS_ParagraphProcess(
news.getBytes("
GBK"
),1);
StringspitedNews=newString(nativeBytes,0,
nativeBytes.length,"
//FileWriterwriter=newFileWriter(String.valueOf(i),false);
String[]words=spitByWhiteChar(spitedNews);
spitedNews="
for(Stringword:
words){
intpos=word.indexOf("
intpos1=word.indexOf("
/n"
intpos2=word.indexOf("
/v"
intpos3=word.indexOf("
/t"
//intpos2,pos3,pos4,pos5;
//intpos2=word.indexOf("
word.equals("
)&
((pos1!
=-1)||(pos2!
=-1)||(pos3!
=-1))){
word=word.substring(0,pos);
if(wordDic.containsKey(word)){
wordDic.put(word,wordDic.get(word)+1);
}else{
wordDic.put(word,1);
//System.out.println(word);
//System.out.println(wordDic.size());
}catch(UnsupportedEncodingExceptione){
System.out.print("
初步统计词(名词+动词+时间词)之后的词个数:
System.out.println(wordDic.size());
//去停用词
stopWord=CreateStopWord("
ChineseStopWord.txt"
//Iterator<
iterator=stopWord.iterator();
//while(iterator.hasNext()){
//System.out.println(iterator.next());
//}
//intnum=0;
Iteratoriter=wordDic.entrySet().iterator();
while(iter.hasNext()){
Map.Entryentry=(Map.Entry)iter.next();
Objectkey=entry.getKey();
Objectval=entry.getValue();
if(stopWord.contains(key))
iter.remove();
wordCnt[num][10]=Integer.valueOf(val.toString());
//System.out.println(wordCnt[num][10]);
wordInt.put(key.toString(),num);
//词和词的编号映射,wordCnt保存的是0-9的类的词个数以及这个词的总个数
cntWord[num]=key.toString();
++num;
去除停用词剩余词个数:
+num);
//计算wordCnt[0-9]再次遍历
intflag[][]=newint[20000][10];
for(intw=0;
w