Data Mining: A Java Implementation of a newsgroup18828 Document Classifier Based on the Naive Bayes and KNN Algorithms (Part 1)

The complete project for this classifier can be downloaded via the link in the original post; it runs under Eclipse and the running procedure is described in detail. Readers studying data mining are welcome to give it a try and to contact me with any questions.
The previous post described the preprocessing of the newsgroup18828 document set and the Java implementation of the naive Bayes algorithm. Below we look at how to implement a KNN-based text classifier for newsgroup18828.
1. Description of the KNN Algorithm

The KNN algorithm proceeds in five steps (a code sketch follows the list):
STEP ONE: Represent the texts as vectors, with components given by the TF*IDF values of the feature words.

STEP TWO: When a new text arrives, build its vector over the same feature words.

STEP THREE: Select from the training set the K texts most similar to the new text, where similarity is measured by the cosine of the angle between the two vectors:

    sim(d_i, d_j) = Σ_k (w_ik · w_jk) / ( sqrt(Σ_k w_ik^2) · sqrt(Σ_k w_jk^2) )

where w_ik is the weight of the k-th feature word in document d_i. There is no good general method for determining K; the usual practice is to fix an initial value and then adjust it according to experimental results. In this project K is set to 20.

STEP FOUR: Among the K neighbors of the new text, compute the weight of each class in turn; the weight of a class is the sum of the similarities between the test text and the training neighbors that belong to that class.

STEP FIVE: Compare the class weights and assign the text to the class with the largest weight.
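
To make STEP THREE through STEP FIVE concrete, here is a minimal, self-contained Java sketch of the KNN decision rule over sparse TF*IDF vectors. It is an illustration only, not the project's code: the class name KnnSketch, the map-based sparse representation, and the method names are my assumptions.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Illustrative sketch of the KNN decision rule (not the project code).
 *  A document vector is a sparse map from feature word to TF*IDF weight. */
public class KnnSketch {

    /** STEP THREE's similarity measure: cosine of the angle between two sparse vectors. */
    static double cosine(Map<String, Double> a, Map<String, Double> b) {
        double dot = 0.0, normA = 0.0, normB = 0.0;
        for (Map.Entry<String, Double> e : a.entrySet()) {
            Double w = b.get(e.getKey());
            if (w != null) dot += e.getValue() * w; // only shared words contribute
            normA += e.getValue() * e.getValue();
        }
        for (double w : b.values()) normB += w * w;
        return (normA == 0 || normB == 0) ? 0.0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    /** Classify one test document against the training set using K neighbors. */
    static String classify(Map<String, Double> testDoc, List<Map<String, Double>> trainDocs,
                           List<String> trainLabels, int k) {
        int n = trainDocs.size();
        double[] sim = new double[n];
        Integer[] order = new Integer[n];
        for (int i = 0; i < n; i++) {
            sim[i] = cosine(testDoc, trainDocs.get(i));
            order[i] = i;
        }
        // STEP THREE: sort so the K most similar training documents come first.
        java.util.Arrays.sort(order, (x, y) -> Double.compare(sim[y], sim[x]));
        // STEP FOUR: class weight = sum of similarities of its members among the K neighbors.
        Map<String, Double> classWeight = new HashMap<>();
        for (int r = 0; r < Math.min(k, n); r++) {
            classWeight.merge(trainLabels.get(order[r]), sim[order[r]], Double::sum);
        }
        // STEP FIVE: return the class with the largest accumulated weight.
        String best = null;
        double bestWeight = -1.0;
        for (Map.Entry<String, Double> e : classWeight.entrySet()) {
            if (e.getValue() > bestWeight) { bestWeight = e.getValue(); best = e.getKey(); }
        }
        return best;
    }
}

With this project's choice of K = 20, a call would look like classify(testVec, trainVecs, trainLabels, 20).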
2. Document TF-IDF Computation and Vector Representation

Implementing the KNN algorithm first requires a vector representation of the documents. We compute TF*IDF for every feature word; each document's vector then consists of the TF*IDF values of all feature words, one dimension per feature word. TF and IDF, the term frequency and the inverse document frequency of a feature word, are computed as follows:

    TF(t, d) = (occurrences of word t in document d) / (total number of words in d)
    IDF(t) = log( N / n_t )

where N is the total number of documents and n_t is the number of documents that contain the word t.
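
As a quick sanity check of the two formulas, here is a tiny worked example. The word counts are made up for illustration; 18,828 is the corpus size, and the natural logarithm is assumed for IDF:

/** Toy TF*IDF computation with made-up counts (illustration only). */
public class TfIdfExample {
    public static void main(String[] args) {
        double tf = 5.0 / 100.0;               // the word occurs 5 times in a 100-word document
        double idf = Math.log(18828.0 / 10.0); // the word appears in 10 of the 18,828 documents
        System.out.println("TF = " + tf + ", IDF = " + idf + ", TF*IDF = " + tf * idf);
        // prints approximately: TF = 0.05, IDF = 7.54, TF*IDF = 0.377
    }
}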
The document vector computation class, ComputeWordsVector.java, is shown below.
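For orientation before the listing, this is roughly how the four methods of the class chain together. The driver class, the call order, and the directory path below are my assumptions for illustration, not code from the project:

package com.pku.yangliu; // assumed: same package, since some methods are package-private

/** Hypothetical driver showing an assumed call order for ComputeWordsVector. */
public class VectorBuildDriver {
    public static void main(String[] args) throws java.io.IOException {
        String dir = "F:/DataMiningSample/processedSample"; // assumed path to the preprocessed corpus
        ComputeWordsVector cwv = new ComputeWordsVector();
        // 1. Build the attribute dictionary from raw word counts.
        java.util.Map<String, Double> rawCounts = new java.util.TreeMap<String, Double>();
        java.util.SortedMap<String, Double> wordMap = cwv.countWords(dir, rawCounts);
        // 2. Dump the dictionary to a file for inspection.
        cwv.printWordMap(wordMap);
        // 3. Compute each dictionary word's IDF over the corpus.
        java.util.SortedMap<String, Double> idfMap = cwv.computeIDF(dir, wordMap);
        // 4. Write TF*IDF vectors for one train/test split (80% train, split index 0).
        cwv.computeTFMultiIDF(dir, 0.8, 0, idfMap, wordMap);
    }
}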
package com.pku.yangliu;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.SortedMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Iterator;

/** Computes the attribute vector of each document, i.e. vectorizes all documents.
 * @author yangliu
 * @qq 772330184
 * @mail yang.liu@
 *
 */
public class ComputeWordsVector {

    /** Computes the TF attribute vector of every document. A plain two-level
     * directory traversal is sufficient here; no recursion is needed.
     * @param strDir absolute path of the preprocessed newsgroup directory
     * @param trainSamplePercent fraction of each category used as training samples
     * @param indexOfSample index of the train/test split (determines where the test samples start)
     * @param iDFPerWordMap per-word IDF map
     * @param wordMap attribute dictionary map
     * @throws IOException
     */
    public void computeTFMultiIDF(String strDir, double trainSamplePercent, int indexOfSample, Map<String, Double> iDFPerWordMap, Map<String, Double> wordMap) throws IOException {
        File fileDir = new File(strDir);
        String word;
        SortedMap<String, Double> TFPerDocMap = new TreeMap<String, Double>();
        // Two writers are used, one for the test samples and one for the training samples.
        String trainFileDir = "F:/DataMiningSample/docVector/wordTFIDFMapTrainSample" + indexOfSample;
        String testFileDir = "F:/DataMiningSample/docVector/wordTFIDFMapTestSample" + indexOfSample;
        FileWriter tsTrainWriter = new FileWriter(new File(trainFileDir));
        FileWriter tsTestWriter = new FileWriter(new File(testFileDir));
        FileWriter tsWriter = tsTrainWriter;
        File[] sampleDir = fileDir.listFiles();
        for (int i = 0; i < sampleDir.length; i++) {
            String cateShortName = sampleDir[i].getName();
            System.out.println("compute: " + cateShortName);
            File[] sample = sampleDir[i].listFiles();
            double testBeginIndex = indexOfSample * (sample.length * (1 - trainSamplePercent)); // first file index of the test samples
            double testEndIndex = (indexOfSample + 1) * (sample.length * (1 - trainSamplePercent)); // last file index of the test samples
            System.out.println("dirName_total length: " + sampleDir[i].getCanonicalPath() + "_" + sample.length);
            System.out.println(trainSamplePercent + " length: " + sample.length * trainSamplePercent
                    + " testBeginIndex: " + testBeginIndex + " testEndIndex: " + testEndIndex);
            for (int j = 0; j < sample.length; j++) {
                TFPerDocMap.clear();
                FileReader samReader = new FileReader(sample[j]);
                BufferedReader samBR = new BufferedReader(samReader);
                String fileShortName = sample[j].getName();
                Double wordSumPerDoc = 0.0; // total word count of the current document
                while ((word = samBR.readLine()) != null) {
                    if (!word.isEmpty() && wordMap.containsKey(word)) { // only words in the attribute dictionary count; removed words are ignored
                        wordSumPerDoc++;
                        if (TFPerDocMap.containsKey(word)) {
                            Double count = TFPerDocMap.get(word);
                            TFPerDocMap.put(word, count + 1);
                        } else {
                            TFPerDocMap.put(word, 1.0);
                        }
                    }
                }
                samBR.close();
                // Walk the TF map of the current document, divide by the document's total
                // word count to get the term frequency, multiply by the word's IDF to get
                // the final feature weight, and write the result out.
                // Note that test samples and training samples go to different files.
                if (j >= testBeginIndex && j <= testEndIndex) {
                    tsWriter = tsTestWriter;
                } else {
                    tsWriter = tsTrainWriter;
                }
                Double wordWeight;
                Set<Map.Entry<String, Double>> tempTF = TFPerDocMap.entrySet();
                for (Iterator<Map.Entry<String, Double>> mt = tempTF.iterator(); mt.hasNext();) {
                    Map.Entry<String, Double> me = mt.next();
                    //wordWeight = (me.getValue() / wordSumPerDoc) * iDFPerWordMap.get(me.getKey());
                    // IDF is temporarily set to 1 here; an improved IDF computation is
                    // described in my blog post on kmeans clustering.
                    wordWeight = (me.getValue() / wordSumPerDoc) * 1.0;
                    TFPerDocMap.put(me.getKey(), wordWeight);
                }
                tsWriter.append(cateShortName + " ");
                String keyWord = fileShortName.substring(0, 5);
                tsWriter.append(keyWord + " ");
                Set<Map.Entry<String, Double>> tempTF2 = TFPerDocMap.entrySet();
                for (Iterator<Map.Entry<String, Double>> mt = tempTF2.iterator(); mt.hasNext();) {
                    Map.Entry<String, Double> ne = mt.next();
                    tsWriter.append(ne.getKey() + " " + ne.getValue() + " ");
                }
                tsWriter.append("\n");
                tsWriter.flush();
            }
        }
        tsTrainWriter.close();
        tsTestWriter.close();
        tsWriter.close();
    }
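    // Output format written above, one line per document (as inferred from the
    // appends in computeTFMultiIDF):
    //   <categoryName> <first-5-chars-of-filename> <word1> <weight1> <word2> <weight2> ...
    // The train/test vector files wordTFIDFMapTrainSample*/wordTFIDFMapTestSample*
    // are what a later classification stage would read the sparse vectors back from.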

    /** Counts the total number of occurrences of every word and returns the words
     * whose count exceeds the threshold as the final attribute dictionary.
     * @param strDir absolute path of the preprocessed newsgroup directory
     * @param wordMap running word-count map
     * @throws IOException
     */
    public SortedMap<String, Double> countWords(String strDir, Map<String, Double> wordMap) throws IOException {
        File sampleFile = new File(strDir);
        File[] sample = sampleFile.listFiles();
        String word;
        for (int i = 0; i < sample.length; i++) {
            if (!sample[i].isDirectory()) {
                if (sample[i].getName().contains("stemed")) {
                    FileReader samReader = new FileReader(sample[i]);
                    BufferedReader samBR = new BufferedReader(samReader);
                    while ((word = samBR.readLine()) != null) {
                        if (!word.isEmpty() && wordMap.containsKey(word)) {
                            double count = wordMap.get(word) + 1;
                            wordMap.put(word, count);
                        } else {
                            wordMap.put(word, 1.0);
                        }
                    }
                    samBR.close();
                }
            } else {
                countWords(sample[i].getCanonicalPath(), wordMap);
            }
        }
        // Keep only words above the count threshold. As published the check is
        // >= 1, which keeps every word; raise it (e.g. to > 3) to prune the
        // dictionary as the javadoc intends.
        SortedMap<String, Double> newWordMap = new TreeMap<String, Double>();
        Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> it = allWords.iterator(); it.hasNext();) {
            Map.Entry<String, Double> me = it.next();
            if (me.getValue() >= 1) {
                newWordMap.put(me.getKey(), me.getValue());
            }
        }
        return newWordMap;
    }

    /** Prints the attribute dictionary to a file.
     * @param wordMap attribute dictionary
     * @throws IOException
     */
    void printWordMap(Map<String, Double> wordMap) throws IOException {
        System.out.println("printWordMap");
        int countLine = 0;
        File outPutFile = new File("F:/DataMiningSample/docVector/allDicWordCountMap.txt");
        FileWriter outPutFileWriter = new FileWriter(outPutFile);
        Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> it = allWords.iterator(); it.hasNext();) {
            Map.Entry<String, Double> me = it.next();
            outPutFileWriter.write(me.getKey() + " " + me.getValue() + "\n");
            countLine++;
        }
        outPutFileWriter.close();
        System.out.println("WordMap size: " + countLine);
    }

    /** Computes IDF, i.e. in how many documents each word of the attribute
     * dictionary appears.
     * @param string absolute path of the preprocessed newsgroup directory
     * @param wordMap attribute dictionary
     * @return per-word IDF map
     * @throws IOException
     */
    SortedMap<String, Double> computeIDF(String string, Map<String, Double> wordMap) throws IOException {
        File fileDir = new File(string);
        String word;
        SortedMap<String, Double> IDFPerWordMap = new TreeMap<String, Double>();
        Set<Map.Entry<String, Double>> wordMapSet = wordMap.entrySet();
        for (Iterator<Map.Entry<String, Double>> pt = wordMapSet.iterator(); pt.hasNext();) {
            Map.Entry<String, Double> pe = pt.next();
            Double coutDoc = 0.0;
            String dicWord = pe.getKey();
            File[] sampleDir = fileDir.listFiles();
            for (int i = 0; i < sampleDir.length; i++) {
                File[] sample = sampleDir[i].listFiles();
                for (int j = 0; j < sample.length; j++) {
                    FileReader samReader = new FileReader(sample[j]);