改进SinglePass聚类算法的实现包括分词tfidf计算卡方.docx
《改进SinglePass聚类算法的实现包括分词tfidf计算卡方.docx》由会员分享,可在线阅读,更多相关《改进SinglePass聚类算法的实现包括分词tfidf计算卡方.docx(64页珍藏版)》请在冰点文库上搜索。
改进SinglePass聚类算法的实现包括分词tfidf计算卡方
改进Single-Pass聚类算法的实现(包括分词、tfidf计算、卡方检验特征选择)
//热点话题发现的预处理,生成单词表,tfidf向量的值
packagetest;
importjava.io.BufferedReader;
importjava.io.BufferedWriter;
importjava.io.File;
importjava.io.FileInputStream;
importjava.io.FileOutputStream;
importjava.io.FileReader;
importjava.io.IOException;
importjava.io.InputStreamReader;
importjava.io.OutputStream;
importjava.io.OutputStreamWriter;
importjava.io.UnsupportedEncodingException;
importjava.util.ArrayList;
importjava.util.Arrays;
importjava.util.HashMap;
importjava.util.HashSet;
importjava.util.Iterator;
importjava.util.Map;
importcom.NewsList.*;
importcom.aliasi.util.Files;
importICTCLAS.I3S.AC.*;
importjava.io.FileWriter;
importcode.*;
publicclasssegment{
publicstaticHashMapwordInt=newHashMap();
publicstaticintwordCnt[][]=newint[20000][11];
publicstaticString[]cntWord=newString[20000];
publicstaticint[]wordValid=newint[20000];
publicstaticintA[][]=newint[20000][11];
publicstaticintB[][]=newint[20000][11];
publicstaticintC[][]=newint[20000][11];
publicstaticintD[][]=newint[20000][11];
publicstaticintcateWordSum[]=newint[20000];
publicstaticdoublechi[][]=newdouble[20000][11];
publicstaticdoublechiAvg[]=newdouble[20000];
publicstaticintnum=0;
publicstaticHashMapwordDic=newHashMap();
publicstaticHashSetname=newHashSet();
publicstaticHashSetplace=newHashSet();
publicstaticHashSetorganization=newHashSet();
publicstaticHashSettime=newHashSet();
publicstaticHashSetstopWord=newHashSet();
publicstaticFileTDIR=newFile("D:
/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料");
publicstaticString[]CATEGORIES={"雅安地震","新疆巴楚恐怖事件","禽流感H7N9疫情","台湾渔船菲律宾射杀","波士顿爆炸案","黄浦江死猪","发改委副主任落马","四川泸县瓦斯爆炸","南方暴雨","内蒙古通辽地震"};
publicstaticvoidoutFile()
{
Fileout=newFile("wordChi1.txt");
if(out.exists()){
out.delete();
}
try{
if(out.createNewFile())
OutputStreamWriteroutputW=newOutputStreamWriter(newFileOutputStream(out),"GBk");
BufferedWriteroutput=newBufferedWriter(outputW);
for(inti=0;i{if(wordValid[i]==0)continue;output.write(cntWord[i]+"\r\n");for(intj=0;j<10;j++){output.write(String.valueOf(wordCnt[i][j]));output.write("\r\n");}//output.write("\r\n");}output.close();}}catch(IOExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}System.out.println("输出文件wordChi1.txt完毕!");}publicstaticvoidoutputFile(){Fileout=newFile("wordCnt.txt");if(out.exists()){out.delete();}try{if(out.createNewFile()){OutputStreamWriteroutputW=newOutputStreamWriter(newFileOutputStream(out),"GBk");BufferedWriteroutput=newBufferedWriter(outputW);for(inti=0;i{output.write(cntWord[i]+"");for(intj=0;j<11;j++){output.write(String.valueOf(wordCnt[i][j]));output.write("");}output.write("\r\n");}output.close();}}catch(IOExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}System.out.println("输出文件wordCnt.txt完毕!");}publicstaticHashSetCreateStopWord(Stringfilename){HashSettempStopWord=newHashSet();Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;while((text=input.readLine())!=null)//从第三行到文件末尾是正文{tempStopWord.add(text);}input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returntempStopWord;}publicstaticStringReturnTitle(Stringfilename){Stringtitle="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间text=input.readLine();//第二行是标题title=text;input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returntitle;}publicstaticStringReturnContent(Stringfilename){Stringcontent="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间te
xt=input.readLine();//第二行是标题while((text=input.readLine())!=null)//从第三行到文件末尾是正文{content+=text;}input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returncontent;}publicstaticStringReturnTitleandContent(Stringfilename){Stringcontent="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间text=input.readLine();//第二行是标题content=text;while((text=input.readLine())!=null)//从第三行到文件末尾是正文{content+=text;}input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returncontent;}publicstaticStringReturnTitleandFirstParagraph(Stringfilename){Stringcontent="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间text=input.readLine();//第二行是标题text+=input.readLine();//第三行正文的第一段content=text;input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returncontent;}privatestaticString[]spitByWhiteChar(Stringpara){if(para!=null)returnpara.split("[\\s 
]");//returnpara.split("");elsereturnnull;}/***@paramargs*/publicstaticvoidmain(String[]args){//TODOAuto-generatedmethodstubICTCLAS30testICTCLAS30=newICTCLAS30();Stringargu=".";try{if(testICTCLAS30.ICTCLAS_Init(argu.getBytes("GB2312"))==false){System.out.println("InitFail!");return;}}catch(UnsupportedEncodingExceptione1){//TODOAuto-generatedcatchblocke1.printStackTrace();}/**���ô��Ա�ע��ID�����Լ�1������һ����ע��0�����������ע��2���������ע��3����һ����ע��*/testICTCLAS30.ICTCLAS_SetPOSmap(1);for(inti=0;i{//System.out.println(CATEGORIES[i]);FileclassDir=newFile(TDIR,CATEGORIES[i]);if(!classDir.isDirectory()){System.out.println("不能找到目录="+classDir);}//遍历for(Filefile:classDir.listFiles()){Stringfilepath="D:/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料/"+CATEGORIES[i]+"/"+file.getName();//System.out.println(filepath);Stringnews=ReturnTitleandContent(filepath);//Stringnews=ReturnTitleandFirstParagraph(filepath);//Stringnews=ReturnTitle(filepath);news=news.trim();//System.out.println(news);//ȥ��if(!newChineseCheck().CheckChinese(news))continue;news=newChangeCode().Change(news);//�ִ�if(news==null)continue;bytenativeBytes[];try{nativeBytes=testICTCLAS30.ICTCLAS_ParagraphProcess(news.getBytes("GBK"),1);StringspitedNews=newString(nativeBytes,0,nativeBytes.length,"GBK");//FileWriterwriter=newFileWriter(String.valueOf(i),false);String[]words=spitByWhiteChar(spitedNews);spitedNews="";for(Stringword:words){intpos=word.indexOf("/");intpos1=word.indexOf("/n");intpos2=word.indexOf("/v");intpos3=word.indexOf("/t");//intpos2,pos3,pos4,pos5;//intpos2=word.indexOf("/");if(!word.equals("")&&((pos1!=-1)||(pos2!=-1)||(pos3!=-1))){word=word.substring(0,pos);if(wordDic.containsKey(word)){wordDic.put(word,wordDic.get(word)+1);}else{wordDic.put(word,1);//System.out.println(word);}}}//System.out.println(wordDic.size());}catch(UnsupportedEncodingExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}catch(IOExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}}}System.out.print("初
步统计词(名词+动词+时间词)之后的词个数:");System.out.println(wordDic.size());//去停用词stopWord=CreateStopWord("ChineseStopWord.txt");//Iteratoriterator=stopWord.iterator();//while(iterator.hasNext()){//System.out.println(iterator.next());//}//intnum=0;Iteratoriter=wordDic.entrySet().iterator();while(iter.hasNext()){Map.Entryentry=(Map.Entry)iter.next();Objectkey=entry.getKey();Objectval=entry.getValue();if(stopWord.contains(key)){iter.remove();continue;}else{wordCnt[num][10]=Integer.valueOf(val.toString());//System.out.println(wordCnt[num][10]);wordInt.put(key.toString(),num);//词和词的编号映射,wordCnt保存的是0-9的类的词个数以及这个词的总个数cntWord[num]=key.toString();++num;}}System.out.println("去除停用词剩余词个数:"+num);//计算wordCnt[0-9]再次遍历for(inti=0;i{FileclassDir=newFile(TDIR,CATEGORIES[i]);if(!classDir.isDirectory()){System.out.println("不能找到目录="+classDir);}//遍历for(Filefile:classDir.listFiles()){intflag[][]=newint[20000][10];for(intw=0;w
if(wordValid[i]==0)continue;
output.write(cntWord[i]+"\r\n");
for(intj=0;j<10;j++)
output.write(String.valueOf(wordCnt[i][j]));
output.write("\r\n");
//output.write("\r\n");
output.close();
}catch(IOExceptione){
//TODOAuto-generatedcatchblock
e.printStackTrace();
System.out.println("输出文件wordChi1.txt完毕!
");
publicstaticvoidoutputFile()
Fileout=newFile("wordCnt.txt");
for(inti=0;i{output.write(cntWord[i]+"");for(intj=0;j<11;j++){output.write(String.valueOf(wordCnt[i][j]));output.write("");}output.write("\r\n");}output.close();}}catch(IOExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}System.out.println("输出文件wordCnt.txt完毕!");}publicstaticHashSetCreateStopWord(Stringfilename){HashSettempStopWord=newHashSet();Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;while((text=input.readLine())!=null)//从第三行到文件末尾是正文{tempStopWord.add(text);}input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returntempStopWord;}publicstaticStringReturnTitle(Stringfilename){Stringtitle="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间text=input.readLine();//第二行是标题title=text;input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returntitle;}publicstaticStringReturnContent(Stringfilename){Stringcontent="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间text=input.readLine();//第二行是标题while((text=input.readLine())!=null)//从第三行到文件末尾是正文{content+=text;}input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returncontent;}publicstaticStringReturnTitleandContent(Stringfilename){Stringcontent="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间text=input.readLine();//第二行是标题content=text;while((text=input.readLine())!=nu
ll)//从第三行到文件末尾是正文{content+=text;}input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returncontent;}publicstaticStringReturnTitleandFirstParagraph(Stringfilename){Stringcontent="";Filefile=newFile(filename);if(file.exists()&&file.isFile()){try{InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");BufferedReaderinput=newBufferedReader(read);Stringtext;text=input.readLine();//第一行是时间text=input.readLine();//第二行是标题text+=input.readLine();//第三行正文的第一段content=text;input.close();}catch(IOExceptionioException){System.err.println("FileError!");}}returncontent;}privatestaticString[]spitByWhiteChar(Stringpara){if(para!=null)returnpara.split("[\\s ]");//returnpara.split("");elsereturnnull;}/***@paramargs*/publicstaticvoidmain(String[]args){//TODOAuto-generatedmethodstubICTCLAS30testICTCLAS30=newICTCLAS30();Stringargu=".";try{if(testICTCLAS30.ICTCLAS_Init(argu.getBytes("GB2312"))==false){System.out.println("InitFail!");return;}}catch(UnsupportedEncodingExceptione1){//TODOAuto-generatedcatchblocke1.printStackTrace();}/**���ô��Ա�ע��ID�����Լ�1������һ����ע��0�����������ע��2���������ע��3����һ����ע��*/testICTCLAS30.ICTCLAS_SetPOSmap(1);for(inti=0;i{//System.out.println(CATEGORIES[i]);FileclassDir=newFile(TDIR,CATEGORIES[i]);if(!classDir.isDirectory()){System.out.println("不能找到目录="+classDir);}//遍历for(Filefile:classDir.listFiles()){Stringfilepath="D:/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料/"+CATEGORIES[i]+"/"+file.getName();//System.out.println(filepath);Stringnews=ReturnTitleandContent(filepath);//Stringnews=ReturnTitleandFirstParagraph(filepath);//Stringnews=ReturnTitle(filepath);news=news.trim();//System.out.println(news);//ȥ��if(!newChineseCheck().CheckChinese(news))continue;news=newChangeCode().Change(news);//�ִ�if(news==null)continue;bytenativeBytes[];try{nativeBytes=testICTCLAS30.ICTCLAS_ParagraphProcess(news.getBytes("GBK"),1);StringspitedNews=newString(nativeBytes,0,nativeBytes.length,"GBK");//FileWriterwriter=newFileWriter(
String.valueOf(i),false);String[]words=spitByWhiteChar(spitedNews);spitedNews="";for(Stringword:words){intpos=word.indexOf("/");intpos1=word.indexOf("/n");intpos2=word.indexOf("/v");intpos3=word.indexOf("/t");//intpos2,pos3,pos4,pos5;//intpos2=word.indexOf("/");if(!word.equals("")&&((pos1!=-1)||(pos2!=-1)||(pos3!=-1))){word=word.substring(0,pos);if(wordDic.containsKey(word)){wordDic.put(word,wordDic.get(word)+1);}else{wordDic.put(word,1);//System.out.println(word);}}}//System.out.println(wordDic.size());}catch(UnsupportedEncodingExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}catch(IOExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}}}System.out.print("初步统计词(名词+动词+时间词)之后的词个数:");System.out.println(wordDic.size());//去停用词stopWord=CreateStopWord("ChineseStopWord.txt");//Iteratoriterator=stopWord.iterator();//while(iterator.hasNext()){//System.out.println(iterator.next());//}//intnum=0;Iteratoriter=wordDic.entrySet().iterator();while(iter.hasNext()){Map.Entryentry=(Map.Entry)iter.next();Objectkey=entry.getKey();Objectval=entry.getValue();if(stopWord.contains(key)){iter.remove();continue;}else{wordCnt[num][10]=Integer.valueOf(val.toString());//System.out.println(wordCnt[num][10]);wordInt.put(key.toString(),num);//词和词的编号映射,wordCnt保存的是0-9的类的词个数以及这个词的总个数cntWord[num]=key.toString();++num;}}System.out.println("去除停用词剩余词个数:"+num);//计算wordCnt[0-9]再次遍历for(inti=0;i{FileclassDir=newFile(TDIR,CATEGORIES[i]);if(!classDir.isDirectory()){System.out.println("不能找到目录="+classDir);}//遍历for(Filefile:classDir.listFiles()){intflag[][]=newint[20000][10];for(intw=0;w
output.write(cntWord[i]+"");
for(intj=0;j<11;j++)
output.write("");
System.out.println("输出文件wordCnt.txt完毕!
publicstaticHashSetCreateStopWord(Stringfilename)
HashSettempStopWord=newHashSet();
Filefile=newFile(filename);
if(file.exists()&&file.isFile()){
InputStreamReaderread=newInputStreamReader(newFileInputStream(file),"GBk");
BufferedReaderinput=newBufferedReader(read);
Stringtext;
while((text=input.readLine())!
=null)//逐行读取停用词文件,每一行都加入停用词集合
tempStopWord.add(text);
input.close();
catch(IOExceptionioException){
System.err.println("FileError!
returntempStopWord;
publicstaticStringReturnTitle(Stringfilename)
Stringtitle="";
text=input.readLine();//第一行是时间
text=input.readLine();//第二行是标题
title=text;
returntitle;
publicstaticStringReturnContent(Stringfilename)
Stringcontent="";
content+=text;
returncontent;
publicstaticStringReturnTitleandContent(Stringfilename)
content=text;
publicstaticStringReturnTitleandFirstParagraph(Stringfilename)
text+=input.readLine();//第三行正文的第一段
privatestaticString[]spitByWhiteChar(Stringpara){
if(para!
=null)
returnpara.split("[\\s ]");
//returnpara.split("");
else
returnnull;
/**
*@paramargs
*/
publicstaticvoidmain(String[]args){
//TODOAuto-generatedmethodstub
ICTCLAS30testICTCLAS30=newICTCLAS30();
Stringargu=".";
if(testICTCLAS30.ICTCLAS_Init(argu.getBytes("GB2312"))==false){
System.out.println("InitFail!
return;
}catch(UnsupportedEncodingExceptione1){
e1.printStackTrace();
/*
*设置词性标注集(ID:代表词性集):1=计算所一级标注集,0=计算所二级标注集,2=北大二级标注集,3=北大一级标注集
*/
testICTCLAS30.ICTCLAS_SetPOSmap
(1);
for(inti=0;i{//System.out.println(CATEGORIES[i]);FileclassDir=newFile(TDIR,CATEGORIES[i]);if(!classDir.isDirectory()){System.out.println("不能找到目录="+classDir);}//遍历for(Filefile:classDir.listFiles()){Stringfilepath="D:/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料/"+CATEGORIES[i]+"/"+file.getName();//System.out.println(filepath);Stringnews=ReturnTitleandContent(filepath);//Stringnews=ReturnTitleandFirstParagraph(filepath);//Stringnews=ReturnTitle(filepath);news=news.trim();//System.out.println(news);//ȥ��if(!newChineseCheck().CheckChinese(news))continue;news=newChangeCode().Change(news);//�ִ�if(news==null)continue;bytenativeBytes[];try{nativeBytes=testICTCLAS30.ICTCLAS_ParagraphProcess(news.getBytes("GBK"),1);StringspitedNews=newString(nativeBytes,0,nativeBytes.length,"GBK");//FileWriterwriter=newFileWriter(String.valueOf(i),false);String[]words=spitByWhiteChar(spitedNews);spitedNews="";for(Stringword:words){intpos=word.indexOf("/");intpos1=word.indexOf("/n");intpos2=word.indexOf("/v");intpos3=word.indexOf("/t");//intpos2,pos3,pos4,pos5;//intpos2=word.indexOf("/");if(!word.equals("")&&((pos1!=-1)||(pos2!=-1)||(pos3!=-1))){word=word.substring(0,pos);if(wordDic.containsKey(word)){wordDic.put(word,wordDic.get(word)+1);}else{wordDic.put(word,1);//System.out.println(word);}}}//System.out.println(wordDic.size());}catch(UnsupportedEncodingExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}catch(IOExceptione){//TODOAuto-generatedcatchblocke.printStackTrace();}}}System.out.print("初步统计词(名词+动词+时间词)之后的词个数:");System.out.println(wordDic.size());//去停用词stopWord=CreateStopWord("ChineseStopWord.txt");//Iteratoriterator=stopWord.iterator();//while(iterator.hasNext()){//System.out.println(iterator.next());//}//intnum=0;Iteratoriter=wordDic.entrySet().iterator();while(iter.hasNext()){Map.Entryentry=(Map.Entry)iter.next();Objectkey=entry.getKey();Objectval=entry.getValue();if(stopWord.contains(key)){iter.remove();continue;}else{wordCnt[num][10]=Integer.valueOf(val.toString())
;//System.out.println(wordCnt[num][10]);wordInt.put(key.toString(),num);//词和词的编号映射,wordCnt保存的是0-9的类的词个数以及这个词的总个数cntWord[num]=key.toString();++num;}}System.out.println("去除停用词剩余词个数:"+num);//计算wordCnt[0-9]再次遍历for(inti=0;i{FileclassDir=newFile(TDIR,CATEGORIES[i]);if(!classDir.isDirectory()){System.out.println("不能找到目录="+classDir);}//遍历for(Filefile:classDir.listFiles()){intflag[][]=newint[20000][10];for(intw=0;w
//System.out.println(CATEGORIES[i]);
FileclassDir=newFile(TDIR,CATEGORIES[i]);
if(!
classDir.isDirectory()){
System.out.println("不能找到目录="+classDir);
//遍历
for(Filefile:
classDir.listFiles())
Stringfilepath="D:
/Users/admin/eclipseworkspace/CreateCorpus/突发事件语料/"+CATEGORIES[i]+"/"+file.getName();
//System.out.println(filepath);
Stringnews=ReturnTitleandContent(filepath);
//Stringnews=ReturnTitleandFirstParagraph(filepath);
//Stringnews=ReturnTitle(filepath);
news=news.trim();
//System.out.println(news);
//ȥ��
newChineseCheck().CheckChinese(news))
continue;
news=newChangeCode().Change(news);
//分词
if(news==null)
bytenativeBytes[];
nativeBytes=testICTCLAS30.ICTCLAS_ParagraphProcess(
news.getBytes("GBK"),1);
StringspitedNews=newString(nativeBytes,0,
nativeBytes.length,"GBK");
//FileWriterwriter=newFileWriter(String.valueOf(i),false);
String[]words=spitByWhiteChar(spitedNews);
spitedNews="";
for(Stringword:
words){
intpos=word.indexOf("/");
intpos1=word.indexOf("/n");
intpos2=word.indexOf("/v");
intpos3=word.indexOf("/t");
//intpos2,pos3,pos4,pos5;
//intpos2=word.indexOf("/");
word.equals("")&&((pos1!
=-1)||(pos2!
=-1)||(pos3!
=-1))){
word=word.substring(0,pos);
if(wordDic.containsKey(word)){
wordDic.put(word,wordDic.get(word)+1);
}else{
wordDic.put(word,1);
//System.out.println(word);
//System.out.println(wordDic.size());
}catch(UnsupportedEncodingExceptione){
System.out.print("初步统计词(名词+动词+时间词)之后的词个数:
System.out.println(wordDic.size());
//去停用词
stopWord=CreateStopWord("ChineseStopWord.txt");
//Iteratoriterator=stopWord.iterator();
//while(iterator.hasNext()){
//System.out.println(iterator.next());
//}
//intnum=0;
Iteratoriter=wordDic.entrySet().iterator();
while(iter.hasNext()){
Map.Entryentry=(Map.Entry)iter.next();
Objectkey=entry.getKey();
Objectval=entry.getValue();
if(stopWord.contains(key))
iter.remove();
wordCnt[num][10]=Integer.valueOf(val.toString());
//System.out.println(wordCnt[num][10]);
wordInt.put(key.toString(),num);//词和词的编号映射,wordCnt保存的是0-9的类的词个数以及这个词的总个数
cntWord[num]=key.toString();
++num;
System.out.println("去除停用词剩余词个数:
"+num);
//计算wordCnt[0-9]再次遍历
for(inti=0;i{FileclassDir=newFile(TDIR,CATEGORIES[i]);if(!classDir.isDirectory()){System.out.println("不能找到目录="+classDir);}//遍历for(Filefile:classDir.listFiles()){intflag[][]=newint[20000][10];for(intw=0;w
intflag[][]=newint[20000][10];
for(intw=0;w
copyright@ 2008-2023 冰点文库 网站版权所有
经营许可证编号:鄂ICP备19020893号-2