实验报告聚类分析报告.docx
《实验报告聚类分析报告.docx》由会员分享,可在线阅读,更多相关《实验报告聚类分析报告.docx(26页珍藏版)》请在冰点文库上搜索。
实验报告聚类分析报告
实验报告聚类分析
实验原理:
K均值聚类、中心点聚类、系统聚类和EM算法聚类分析技术。
实验题目:
用鸢尾花的数据集,进行聚类挖掘分析。
实验要求:
探索鸢尾花数据的基本特征,利用不同的聚类挖掘方法,获得基本结论并简明解释。
实验题目--分析报告:
data(iris)
>rm(list=ls())
>gc()
         used (Mb) gc trigger (Mb) max used (Mb)
Ncells 431730 23.1     929718 49.7   607591 32.5
Vcells 787605  6.1    8388608 64.0  1592403 12.2
>data(iris)
>data<-iris
>head(data)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
#K-means聚类分析
>newiris<-iris
>newiris$Species<-NULL
>(kc<-kmeans(newiris,3))
K-means clustering with 3 clusters of sizes 62, 50, 38
Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1     5.901613    2.748387     4.393548    1.433871
2     5.006000    3.428000     1.462000    0.246000
3     6.850000    3.073684     5.742105    2.071053
Clustering vector:
  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [41] 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
 [81] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1
[121] 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3 3 3 1 3 3 3 1 3 3 1
Within cluster sum of squares by cluster:
[1] 39.82097 15.15100 23.87947
 (between_SS / total_SS =  88.4 %)
Available components:
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"
>table(iris$Species,kc$cluster)
              1  2  3
  setosa      0 50  0
  versicolor 48  0  2
  virginica  14  0 36
>plot(newiris[c("Sepal.Length","Sepal.Width")],col=kc$cluster)
>points(kc$centers[,c("Sepal.Length","Sepal.Width")],col=1:3,pch=8,cex=2)
#K-Medoids进行聚类分析
>install.packages("cluster")
>library(cluster)
>iris.pam<-pam(iris,3)
>table(iris$Species,iris.pam$clustering)
              1  2  3
  setosa     50  0  0
  versicolor  0  3 47
  virginica   0 49  1
>layout(matrix(c(1,2),1,2))
>plot(iris.pam)
>layout(matrix(1))
#hc:系统聚类(层次聚类)
>iris.hc<-hclust(dist(iris[,1:4]))
>plot(iris.hc,hang=-1)
>plclust(iris.hc,labels=FALSE,hang=-1)
>re<-rect.hclust(iris.hc,k=3)
>iris.id<-cutree(iris.hc,3)
#利用剪枝函数cutree()的参数k=3将系谱图剪为3类(结果存入iris.id),下面按类列出各类中样本所属的品种
>sapply(unique(iris.id),
+function(g)iris$Species[iris.id==g])
[[1]]
 [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[12] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[23] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[34] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[45] setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica
[[2]]
 [1] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
 [8] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[15] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[22] versicolor versicolor virginica  virginica  virginica  virginica  virginica
[29] virginica  virginica  virginica  virginica  virginica  virginica  virginica
[36] virginica  virginica  virginica  virginica  virginica  virginica  virginica
[43] virginica  virginica  virginica  virginica  virginica  virginica  virginica
[50] virginica  virginica  virginica  virginica  virginica  virginica  virginica
[57] virginica  virginica  virginica  virginica  virginica  virginica  virginica
[64] virginica  virginica  virginica  virginica  virginica  virginica  virginica
[71] virginica  virginica
Levels: setosa versicolor virginica
[[3]]
 [1] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
 [8] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[15] versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[22] versicolor versicolor versicolor versicolor versicolor versicolor virginica
Levels: setosa versicolor virginica
>plot(iris.hc)
>rect.hclust(iris.hc,k=4,border="lightgrey")#用浅灰色矩形框出4分类聚类结果
>rect.hclust(iris.hc,k=3,border="darkgrey")#用深灰色矩形框出3分类聚类结果
>rect.hclust(iris.hc,k=7,which=c(2,6),border="darkgrey")
#DBSCAN:基于密度的聚类
>install.packages("fpc")
>library(fpc)
>ds1=dbscan(iris[,1:4],eps=1,MinPts=5)#半径参数为1,密度阈值为5
>ds1
dbscan Pts=150 MinPts=5 eps=1
        1   2
border  0   1
seed   50  99
total  50 100
>ds2=dbscan(iris[,1:4],eps=4,MinPts=5)
>ds3=dbscan(iris[,1:4],eps=4,MinPts=2)
>ds4=dbscan(iris[,1:4],eps=8,MinPts=2)
>par(mfcol=c(2,2))
>plot(ds1,iris[,1:4],main="1: MinPts=5 eps=1")
>plot(ds3,iris[,1:4],main="3: MinPts=2 eps=4")
>plot(ds2,iris[,1:4],main="2: MinPts=5 eps=4")
>plot(ds4,iris[,1:4],main="4: MinPts=2 eps=8")
>d=dist(iris[,1:4])#计算数据集的距离矩阵d
>max(d);min(d)#计算数据集样本的距离的最值
[1]7.085196
[1]0
>install.packages("ggplot2")
>library(ggplot2)
>interval=cut_interval(d,30)
>table(interval)
interval
    [0,0.236] (0.236,0.472] (0.472,0.709] (0.709,0.945]  (0.945,1.18]   (1.18,1.42]
           88           585           876           891           831           688
  (1.42,1.65]   (1.65,1.89]   (1.89,2.13]   (2.13,2.36]    (2.36,2.6]    (2.6,2.83]
          543           369           379           339           335           406
  (2.83,3.07]   (3.07,3.31]   (3.31,3.54]   (3.54,3.78]   (3.78,4.01]   (4.01,4.25]
          458           459           465           480           468           505
  (4.25,4.49]   (4.49,4.72]   (4.72,4.96]    (4.96,5.2]    (5.2,5.43]   (5.43,5.67]
          349           385           321           291           187           138
   (5.67,5.9]    (5.9,6.14]   (6.14,6.38]   (6.38,6.61]   (6.61,6.85]   (6.85,7.09]
           97            92            78            50            18             4
>which.max(table(interval))
(0.709,0.945]
            4
>for(i in 3:5)
+{for(j in 1:10)
+{ds=dbscan(iris[,1:4],eps=i,MinPts=j)
+print(ds)
+}
+}
dbscanPts=150MinPts=1eps=3
1
seed150
total150
dbscanPts=150MinPts=2eps=3
1
seed150
total150
dbscanPts=150MinPts=3eps=3
1
seed150
total150
dbscanPts=150MinPts=4eps=3
1
seed150
total150
dbscanPts=150MinPts=5eps=3
1
seed150
total150
dbscanPts=150MinPts=6eps=3
1
seed150
total150
dbscanPts=150MinPts=7eps=3
1
seed150
total150
dbscanPts=150MinPts=8eps=3
1
seed150
total150
dbscanPts=150MinPts=9eps=3
1
seed150
total150
dbscanPts=150MinPts=10eps=3
1
seed150
total150
dbscanPts=150MinPts=1eps=4
1
seed150
total150
dbscanPts=150MinPts=2eps=4
1
seed150
total150
dbscanPts=150MinPts=3eps=4
1
seed150
total150
dbscanPts=150MinPts=4eps=4
1
seed150
total150
dbscanPts=150MinPts=5eps=4
1
seed150
total150
dbscanPts=150MinPts=6eps=4
1
seed150
total150
dbscanPts=150MinPts=7eps=4
1
seed150
total150
dbscanPts=150MinPts=8eps=4
1
seed150
total150
dbscanPts=150MinPts=9eps=4
1
seed150
total150
dbscanPts=150MinPts=10eps=4
1
seed150
total150
dbscanPts=150MinPts=1eps=5
1
seed150
total150
dbscanPts=150MinPts=2eps=5
1
seed150
total150
dbscanPts=150MinPts=3eps=5
1
seed150
total150
dbscanPts=150MinPts=4eps=5
1
seed150
total150
dbscanPts=150MinPts=5eps=5
1
seed150
total150
dbscanPts=150MinPts=6eps=5
1
seed150
total150
dbscanPts=150MinPts=7eps=5
1
seed150
total150
dbscanPts=150MinPts=8eps=5
1
seed150
total150
dbscanPts=150MinPts=9eps=5
1
seed150
total150
dbscanPts=150MinPts=10eps=5
1
seed150
total150
#以上为30次dbscan的聚类结果(eps取3~5、MinPts取1~10时,150个样本均被聚为1类)
>ds5=dbscan(iris[,1:4],eps=3,MinPts=2)
>ds6=dbscan(iris[,1:4],eps=4,MinPts=5)
>ds7=dbscan(iris[,1:4],eps=5,MinPts=9)
>par(mfcol=c(1,3))
>plot(ds5,iris[,1:4],main="1: MinPts=2 eps=3")
>plot(ds6,iris[,1:4],main="3: MinPts=5 eps=4")
>plot(ds7,iris[,1:4],main="2: MinPts=9 eps=5")
#EM期望最大化聚类
>install.packages("mclust")
>library(mclust)
>fit_EM=Mclust(iris[,1:4])
fitting...
|===========================================================================|100%
>summary(fit_EM)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VEV (ellipsoidal, equal shape) model with 2 components:
 log.likelihood   n df       BIC       ICL
       -215.726 150 26 -561.7285 -561.7289
Clustering table:
  1   2
 50 100
>summary(fit_EM,parameters=TRUE)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VEV (ellipsoidal, equal shape) model with 2 components:
 log.likelihood   n df       BIC       ICL
       -215.726 150 26 -561.7285 -561.7289
Clustering table:
  1   2
 50 100
Mixing probabilities:
        1         2
0.3333319 0.6666681
Means:
                  [,1]     [,2]
Sepal.Length 5.0060022 6.261996
Sepal.Width  3.4280049 2.871999
Petal.Length 1.4620007 4.905992
Petal.Width  0.2459998 1.675997
Variances:
[,,1]
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length   0.15065114  0.13080115   0.02084463  0.01309107
Sepal.Width    0.13080115  0.17604529   0.01603245  0.01221458
Petal.Length   0.02084463  0.01603245   0.02808260  0.00601568
Petal.Width    0.01309107  0.01221458   0.00601568  0.01042365
[,,2]
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length    0.4000438  0.10865444    0.3994018  0.14368256
Sepal.Width     0.1086544  0.10928077    0.1238904  0.07284384
Petal.Length    0.3994018  0.12389040    0.6109024  0.25738990
Petal.Width     0.1436826  0.07284384    0.2573899  0.16808182
>plot(fit_EM)#对EM聚类结果作图
Model-based clustering plots:
1: BIC
2: classification
3: uncertainty
4: density
Selection:
(下面显示选项)
#选1
#选2
#选3
#选4
Selection: 0
>iris_BIC=mclustBIC(iris[,1:4])
fitting...
|===========================================================================|100%
>iris_BICsum=summary(iris_BIC,data=iris[,1:4])
>iris_BICsum#获取数据集iris在各模型和类别数下的BIC值
Best BIC values:
             VEV,2        VEV,3      VVV,2
BIC      -561.7285 -562.5522369 -574.01783
BIC diff    0.0000   -0.8237748  -12.28937
Classification table for model (VEV,2):
  1   2
 50 100
>iris_BIC
Bayesian Information Criterion (BIC):
         EII        VII        EEI        VEI        EVI        VVI       EEE
1-1804.0854-1804.0854-1522.1202-1522.1202-1522.1202-1522.1202-829.97