改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx

资源描述

改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx

《改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx》由会员分享，可在线阅读，更多相关《改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx（64页珍藏版）》请在冰点文库上搜索。

改进SinglePass聚类算法的实现包括分词tfidf计算卡方Word文件下载.docx

importcode.*;

publicclasssegment{

publicstaticHashMap<

String,Integer>

wordInt=newHashMap<

（）;

publicstaticintwordCnt[][]=newint[20000][11];

publicstaticString[]cntWord=newString[20000];

publicstaticint[]wordValid=newint[20000];

publicstaticintA[][]=newint[20000][11];

publicstaticintB[][]=newint[20000][11];

publicstaticintC[][]=newint[20000][11];

publicstaticintD[][]=newint[20000][11];

publicstaticintcateWordSum[]=newint[20000];

publicstaticdoublechi[][]=newdouble[20000][11];

publicstaticdoublechiAvg[]=newdouble[20000];

publicstaticintnum=0;

wordDic=newHashMap<

publicstaticHashSet<

String>

name=newHashSet<

place=newHashSet<

organization=newHashSet<

time=newHashSet<

stopWord=newHashSet<

publicstaticFileTDIR=newFile（"

/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料"

）;

publicstaticString[]CATEGORIES={"

雅安地震"

新疆巴楚恐怖事件"

禽流感H7N9疫情"

台湾渔船菲律宾射杀"

波士顿爆炸案"

黄浦江死猪"

发改委副主任落马"

四川泸县瓦斯爆炸"

南方暴雨"

内蒙古通辽地震"

};

publicstaticvoidoutFile（）

{

Fileout=newFile（"

wordChi1.txt"

if（out.exists（））{

out.delete（）;

}

try{

if（out.createNewFile（））

OutputStreamWriteroutputW=newOutputStreamWriter（newFileOutputStream（out）,"

GBk"

BufferedWriteroutput=newBufferedWriter（outputW）;

for（inti=0;

num;

i++）

if（wordValid[i]==0）continue;

output.write（cntWord[i]+"

\r\n"

for（intj=0;

10;

j++）

output.write（String.valueOf（wordCnt[i][j]））;

output.write（"

//output.write（"

output.close（）;

}catch（IOExceptione）{

//TODOAuto-generatedcatchblock

e.printStackTrace（）;

System.out.println（"

输出文件wordChi1.txt完毕！

publicstaticvoidoutputFile（）

wordCnt.txt"

11;

输出文件wordCnt.txt完毕！

CreateStopWord（Stringfilename）

HashSet<

tempStopWord=newHashSet<

Filefile=newFile（filename）;

if（file.exists（）&

file.isFile（））{

try{

InputStreamReaderread=newInputStreamReader（newFileInputStream（file）,"

BufferedReaderinput=newBufferedReader（read）;

Stringtext;

while（（text=input.readLine（））!

=null）//从第三行到文件末尾是正文

tempStopWord.add（text）;

input.close（）;

}

catch（IOExceptionioException）{

System.err.println（"

FileError!

returntempStopWord;

publicstaticStringReturnTitle（Stringfilename）

Stringtitle="

;

text=input.readLine（）;

//第一行是时间

//第二行是标题

title=text;

returntitle;

publicstaticStringReturnContent（Stringfilename）

Stringcontent="

content+=text;

returncontent;

publicstaticStringReturnTitleandContent（Stringfilename）

content=text;

publicstaticStringReturnTitleandFirstParagraph（Stringfilename）

text+=input.readLine（）;

//第三行正文的第一段

privatestaticString[]spitByWhiteChar（Stringpara）{

if（para!

=null）

returnpara.split（"

[\\s　]"

//returnpara.split（"

else

returnnull;

/**

*@paramargs

publicstaticvoidmain（String[]args）{

//TODOAuto-generatedmethodstub

ICTCLAS30testICTCLAS30=newICTCLAS30（）;

Stringargu="

if（testICTCLAS30.ICTCLAS_Init（argu.getBytes（"

GB2312"

））==false）{

InitFail!

return;

}catch（UnsupportedEncodingExceptione1）{

e1.printStackTrace（）;

*设置词性标注集

ID 代表词性集 1 计算所一级标注集 0 计算所二级标注集 2 北大二级标注集 3 北大一级标注集

testICTCLAS30.ICTCLAS_SetPOSmap

（1）;

for（inti=0;

CATEGORIES.length;

i++）

{

//System.out.println（CATEGORIES[i]）;

FileclassDir=newFile（TDIR,CATEGORIES[i]）;

if（!

classDir.isDirectory（））{

不能找到目录="

+classDir）;

//遍历

for（Filefile:

classDir.listFiles（））

Stringfilepath="

/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料/"

+CATEGORIES[i]+"

+file.getName（）;

//System.out.println（filepath）;

Stringnews=ReturnTitleandContent（filepath）;

//Stringnews=ReturnTitleandFirstParagraph（filepath）;

//Stringnews=ReturnTitle（filepath）;

news=news.trim（）;

//System.out.println（news）;

//ȥ��

newChineseCheck（）.CheckChinese（news））

continue;

news=newChangeCode（）.Change（news）;

//分词

if（news==null）

bytenativeBytes[];

nativeBytes=testICTCLAS30.ICTCLAS_ParagraphProcess（

news.getBytes（"

GBK"

）,1）;

StringspitedNews=newString（nativeBytes,0,

nativeBytes.length,"

//FileWriterwriter=newFileWriter（String.valueOf（i）,false）;

String[]words=spitByWhiteChar（spitedNews）;

spitedNews="

for（Stringword:

words）{

intpos=word.indexOf（"

intpos1=word.indexOf（"

/n"

intpos2=word.indexOf（"

/v"

intpos3=word.indexOf（"

/t"

//intpos2,pos3,pos4,pos5;

//intpos2=word.indexOf（"

word.equals（"

）&

（（pos1!

=-1）||（pos2!

=-1）||（pos3!

=-1）））{

word=word.substring（0,pos）;

if（wordDic.containsKey（word））{

wordDic.put（word,wordDic.get（word）+1）;

}else{

wordDic.put（word,1）;

//System.out.println（word）;

//System.out.println（wordDic.size（））;

}catch（UnsupportedEncodingExceptione）{

System.out.print（"

初步统计词（名词+动词+时间词）之后的词个数：

System.out.println（wordDic.size（））;

//去停用词

stopWord=CreateStopWord（"

ChineseStopWord.txt"

//Iterator<

iterator=stopWord.iterator（）;

//while（iterator.hasNext（））{

//System.out.println（iterator.next（））;

//}

//intnum=0;

Iteratoriter=wordDic.entrySet（）.iterator（）;

while（iter.hasNext（））{

Map.Entryentry=（Map.Entry）iter.next（）;

Objectkey=entry.getKey（）;

Objectval=entry.getValue（）;

if（stopWord.contains（key））

iter.remove（）;

wordCnt[num][10]=Integer.valueOf（val.toString（））;

//System.out.println（wordCnt[num][10]）;

wordInt.put（key.toString（）,num）;

//词和词的编号映射，wordCnt保存的是0-9的类的词个数以及这个词的总个数

cntWord[num]=key.toString（）;

++num;

去除停用词剩余词个数：

+num）;

//计算wordCnt[0-9]再次遍历

intflag[][]=newint[20000][10];

for（intw=0;

展开阅读全文