实验报告聚类分析报告.docx

资源描述

实验报告聚类分析报告.docx

《实验报告聚类分析报告.docx》由会员分享，可在线阅读，更多相关《实验报告聚类分析报告.docx（26页珍藏版）》请在冰点文库上搜索。

实验报告聚类分析报告.docx

实验报告聚类分析报告

实验报告聚类分析

实验原理：

K均值聚类、中心点聚类、系统聚类和EM算法聚类分析技术。

实验题目：

用鸢尾花的数据集，进行聚类挖掘分析。

实验要求：

探索鸢尾花数据的基本特征，利用不同的聚类挖掘方法，获得基本结论并简明解释。

实验题目--分析报告：

data（iris）

>rm（list=ls（））

>gc（）

         used (Mb) gc trigger (Mb) max used (Mb)
Ncells 431730 23.1     929718 49.7   607591 32.5
Vcells 787605  6.1    8388608 64.0  1592403 12.2

>data（iris）

>data<-iris

>head（data）

  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

# K-means聚类分析

>newiris<-iris

>newiris$Species<-NULL

>（kc<-kmeans（newiris,3））

K-means clustering with 3 clusters of sizes 62, 50, 38

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1     5.901613    2.748387     4.393548    1.433871
2     5.006000    3.428000     1.462000    0.246000
3     6.850000    3.073684     5.742105    2.071053

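Clustering vector:
  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [41] 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
 [81] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1
[121] 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3 3 3 1 3 3 3 1 3 3 1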

Within cluster sum of squares by cluster:
[1] 39.82097 15.15100 23.87947
 (between_SS / total_SS =  88.4 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"

>table（iris$Species,kc$cluster）

              1  2  3
  setosa      0 50  0
  versicolor 48  0  2
  virginica  14  0 36

>plot（newiris[c（"Sepal.Length","Sepal.Width"）],col=kc$cluster）

> points(kc$centers[, c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex = 2)

# K-Medoids进行聚类分析

>install.packages（"cluster"）

>library（cluster）

>iris.pam<-pam（iris,3）

>table（iris$Species,iris.pam$clustering）

              1  2  3
  setosa     50  0  0
  versicolor  0  3 47
  virginica   0 49  1

>layout（matrix（c（1,2）,1,2））

>plot（iris.pam）

> layout(matrix(1))

#hc

> iris.hc <- hclust(dist(iris[, 1:4]))

>plot（iris.hc,hang=-1）

>plclust（iris.hc,labels=FALSE,hang=-1）

>re<-rect.hclust（iris.hc,k=3）

>iris.id<-cutree（iris.hc,3）

# 利用剪枝函数cutree()的参数k=3将系谱树剪为3类（也可改用参数h按指定高度剪枝）

>sapply（unique（iris.id）,

+function（g）iris$Species[iris.id==g]）

[[1]]

[1]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa

[12]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa

[23]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa

[34]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa

[45]setosasetosasetosasetosasetosasetosa

Levels:

setosaversicolorvirginica

[[2]]

[1]versicolorversicolorversicolorversicolorversicolorversicolorversicolor

[8]versicolorversicolorversicolorversicolorversicolorversicolorversicolor

[15]versicolorversicolorversicolorversicolorversicolorversicolorversicolor

[22]versicolorversicolorvirginicavirginicavirginicavirginicavirginica

[29]virginicavirginicavirginicavirginicavirginicavirginicavirginica

[36]virginicavirginicavirginicavirginicavirginicavirginicavirginica

[43]virginicavirginicavirginicavirginicavirginicavirginicavirginica

[50]virginicavirginicavirginicavirginicavirginicavirginicavirginica

[57]virginicavirginicavirginicavirginicavirginicavirginicavirginica

[64]virginicavirginicavirginicavirginicavirginicavirginicavirginica

[71]virginicavirginica

Levels:

setosaversicolorvirginica

[[3]]

[1]versicolorversicolorversicolorversicolorversicolorversicolorversicolor

[8]versicolorversicolorversicolorversicolorversicolorversicolorversicolor

[15]versicolorversicolorversicolorversicolorversicolorversicolorversicolor

[22]versicolorversicolorversicolorversicolorversicolorversicolorvirginica

Levels:

setosaversicolorvirginica

>plot（iris.hc）

>rect.hclust（iris.hc,k=4,border="lightgrey"）#用浅灰色矩形框出4分类聚类结果

> rect.hclust(iris.hc, k = 3, border = "darkgrey")  # 用深灰色矩形框出3分类聚类结果

>rect.hclust（iris.hc,k=7,which=c（2,6）,border="darkgrey"）

#DBSCAN#基于密度的聚类

>install.packages（"fpc"）

>library（fpc）

> ds1 = dbscan(iris[, 1:4], eps = 1, MinPts = 5)  # 半径参数为1，密度阈值为5

>ds1

dbscan Pts=150 MinPts=5 eps=1
border  0   1
seed   50  99
total  50 100

> ds2 = dbscan(iris[, 1:4], eps = 4, MinPts = 5)

> ds3 = dbscan(iris[, 1:4], eps = 4, MinPts = 2)

> ds4 = dbscan(iris[, 1:4], eps = 8, MinPts = 2)

>par（mfcol=c（2,2））

> plot(ds1, iris[, 1:4], main = "1: MinPts=5 eps=1")

> plot(ds3, iris[, 1:4], main = "3: MinPts=2 eps=4")

> plot(ds2, iris[, 1:4], main = "2: MinPts=5 eps=4")

> plot(ds4, iris[, 1:4], main = "4: MinPts=2 eps=8")

> d = dist(iris[, 1:4])  # 计算数据集的距离矩阵d

>max（d）;min（d）#计算数据集样本的距离的最值

[1]7.085196

[1]0

>install.packages（"ggplot2"）

>library（ggplot2）

>interval=cut_interval（d,30）

>table（interval）

interval
    [0,0.236] (0.236,0.472] (0.472,0.709] (0.709,0.945]  (0.945,1.18]   (1.18,1.42]
           88           585           876           891           831           688
  (1.42,1.65]   (1.65,1.89]   (1.89,2.13]   (2.13,2.36]    (2.36,2.6]    (2.6,2.83]
          543           369           379           339           335           406
  (2.83,3.07]   (3.07,3.31]   (3.31,3.54]   (3.54,3.78]   (3.78,4.01]   (4.01,4.25]
          458           459           465           480           468           505
  (4.25,4.49]   (4.49,4.72]   (4.72,4.96]    (4.96,5.2]    (5.2,5.43]   (5.43,5.67]
          349           385           321           291           187           138
   (5.67,5.9]    (5.9,6.14]   (6.14,6.38]   (6.38,6.61]   (6.61,6.85]   (6.85,7.09]
           97            92            78            50            18             4

>which.max（table（interval））

（0.709,0.945]

> for (i in 3:5)
+ { for (j in 1:10)
+ { ds = dbscan(iris[, 1:4], eps = i, MinPts = j)
+ print(ds)
+ }}

dbscanPts=150MinPts=1eps=3

seed150

total150

dbscanPts=150MinPts=2eps=3

seed150

total150

dbscanPts=150MinPts=3eps=3

seed150

total150

dbscanPts=150MinPts=4eps=3

seed150

total150

dbscanPts=150MinPts=5eps=3

seed150

total150

dbscanPts=150MinPts=6eps=3

seed150

total150

dbscanPts=150MinPts=7eps=3

seed150

total150

dbscanPts=150MinPts=8eps=3

seed150

total150

dbscanPts=150MinPts=9eps=3

seed150

total150

dbscanPts=150MinPts=10eps=3

seed150

total150

dbscanPts=150MinPts=1eps=4

seed150

total150

dbscanPts=150MinPts=2eps=4

seed150

total150

dbscanPts=150MinPts=3eps=4

seed150

total150

dbscanPts=150MinPts=4eps=4

seed150

total150

dbscanPts=150MinPts=5eps=4

seed150

total150

dbscanPts=150MinPts=6eps=4

seed150

total150

dbscanPts=150MinPts=7eps=4

seed150

total150

dbscanPts=150MinPts=8eps=4

seed150

total150

dbscanPts=150MinPts=9eps=4

seed150

total150

dbscanPts=150MinPts=10eps=4

seed150

total150

dbscanPts=150MinPts=1eps=5

seed150

total150

dbscanPts=150MinPts=2eps=5

seed150

total150

dbscanPts=150MinPts=3eps=5

seed150

total150

dbscanPts=150MinPts=4eps=5

seed150

total150

dbscanPts=150MinPts=5eps=5

seed150

total150

dbscanPts=150MinPts=6eps=5

seed150

total150

dbscanPts=150MinPts=7eps=5

seed150

total150

dbscanPts=150MinPts=8eps=5

seed150

total150

dbscanPts=150MinPts=9eps=5

seed150

total150

dbscanPts=150MinPts=10eps=5

seed150

total150

#30次dbscan的聚类结果

> ds5 = dbscan(iris[, 1:4], eps = 3, MinPts = 2)

> ds6 = dbscan(iris[, 1:4], eps = 4, MinPts = 5)

> ds7 = dbscan(iris[, 1:4], eps = 5, MinPts = 9)

>par（mfcol=c（1,3））

> plot(ds5, iris[, 1:4], main = "1: MinPts=2 eps=3")

> plot(ds6, iris[, 1:4], main = "3: MinPts=5 eps=4")

> plot(ds7, iris[, 1:4], main = "2: MinPts=9 eps=5")

#EM期望最大化聚类

>install.packages（"mclust"）

>library（mclust）

> fit_EM = Mclust(iris[, 1:4])

fitting...

|===========================================================================|100%

>summary（fit_EM）

----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------

Mclust VEV (ellipsoidal, equal shape) model with 2 components:

 log.likelihood   n df       BIC       ICL
       -215.726 150 26 -561.7285 -561.7289

Clustering table:
 50 100

>summary（fit_EM,parameters=TRUE）

----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------

Mclust VEV (ellipsoidal, equal shape) model with 2 components:

 log.likelihood   n df       BIC       ICL
       -215.726 150 26 -561.7285 -561.7289

Clustering table:
 50 100

Mixing probabilities:
0.3333319 0.6666681

Means:
                  [,1]     [,2]
Sepal.Length 5.0060022 6.261996
Sepal.Width  3.4280049 2.871999
Petal.Length 1.4620007 4.905992
Petal.Width  0.2459998 1.675997

Variances:
[,,1]
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length   0.15065114  0.13080115   0.02084463  0.01309107
Sepal.Width    0.13080115  0.17604529   0.01603245  0.01221458
Petal.Length   0.02084463  0.01603245   0.02808260  0.00601568
Petal.Width    0.01309107  0.01221458   0.00601568  0.01042365
[,,2]
             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length    0.4000438  0.10865444    0.3994018  0.14368256
Sepal.Width     0.1086544  0.10928077    0.1238904  0.07284384
Petal.Length    0.3994018  0.12389040    0.6109024  0.25738990
Petal.Width     0.1436826  0.07284384    0.2573899  0.16808182

>plot（fit_EM）#对EM聚类结果作图

Model-basedclusteringplots:

BIC

classification

uncertainty

density

Selection:

（下面显示选项）

#选1

#选2

#选3

#选4

Selection:

> iris_BIC = mclustBIC(iris[, 1:4])

fitting...

|===========================================================================|100%

> iris_BICsum = summary(iris_BIC, data = iris[, 1:4])

> iris_BICsum  # 获取数据集iris在各模型和类别数下的BIC值

Best BIC values:
             VEV,2        VEV,3      VVV,2
BIC      -561.7285 -562.5522369 -574.01783
BIC diff    0.0000   -0.8237748  -12.28937

Classification table for model (VEV,2):

 50 100

>iris_BIC

Bayesian Information Criterion (BIC):
         EII        VII        EEI        VEI        EVI        VVI     EEE
1 -1804.0854 -1804.0854 -1522.1202 -1522.1202 -1522.1202 -1522.1202 -829.97

展开阅读全文