Apriori算法实验报告.docx

资源描述

Apriori算法实验报告.docx

《Apriori算法实验报告.docx》由会员分享，可在线阅读，更多相关《Apriori算法实验报告.docx（21页珍藏版）》请在冰点文库上搜索。

Apriori算法实验报告.docx

Apriori算法实验报告

题目

Apriori算法实现

学生姓名

学生学号

专业班级

指导教师

2014-12-27

实验一Apriori算法实现

一、实验目的

1.加强对Apriori算法的理解；

2.锻炼分析问题、解决问题并动手实践的能力。

二、实验要求

使用一种你熟悉的程序设计语言，如C++或Java，实现Apriori算法，至少在两种不同的数据集上比较算法的性能。

三、实验环境

Win7旗舰版+VisualStudio2010

语言：

C++

四、算法描述

1、Apriori算法说明

在Apriori算法中，寻找频繁项集的基本思想是：

A.简单统计所有含一个元素项目集出现的频率，找出不小于最小支持度的项目集,即频繁项集；

B.从第二步开始，循环处理直到再没有最大项目集生成。

循环过程是:

第k步中,根据第k-1步生成的频繁（k-1）项集产生侯选k项集。

根据候选k项集，算出候选k项集支持度，并与最小支持度比较,找到频繁k项集。

下文中遇到的以下符号,分别代表相应的内容

k-itemset　k项集

Lk　　　　频繁k项集

Ck　　　　侯选k项集

2、Apriori算法描述

数据结构说明

doubleminsup;//设置最小支持度

mapitems_count;//统计各个项集的数目

vector>datavec;//原始数据项集

vector>candidatevec;//候选项集

vector>frequentvec;//频繁项集

ofstreamoutFile;

intround=1;//生成项集轮次

longtrancount=0;//原始事务总数

//判断某个项目在某一个事务中是否存在，存在则值为1，反之为0

vector>bitmap;

Apriori算法的第一步是简单统计所有含一个元素的项集出现的频率，来决定频繁1项集。

在第k步,分两个阶段：

1，用函数genCanItemsetK，通过第（k-1）步中生成的频繁（k-1）项集来生成侯选k项集；2.计算侯选k项集的支持度，并找出频繁k项集。

Apriori算法描述如下

getOriData（）;//获取原始数据集，并统计事务个数

genCanItemset1（）;//产生输出候选1项集

genFreItemset1（）;//产生频繁项集

if（!

frequentvec.empty（））//根据频繁1项集，执行程序

{

genCanItemsetK（）;//生成并输出候选k项集

genFreItemsetK（）;//计算并输出频繁k项集

}while（!

frequentvec.empty（））;//频繁项集不为空,则循环继续

}

其中，产生候选k项集函数genCanItemsetK中涉及两个重要函数，项集合并函数mergeItem和剪枝函数cutNotCanItemsetK。

3、函数方法说明

//获取原始数据集，并统计事务个数

voidgetOriData（）;

//合并生成新的候选项集

vectormergeItem（vectorvect1,vectorvect2,intround）;

//判断项集item是否已经存在候选项集集合items中,存在则返回1

intisExist（vectoritem,vector>items）;

//产生并输出候选1项集

voidgenCanItemset1（）;

//产生并输出频繁1项集

voidgenFreItemset1（）;

//产生并输出候选k-项集（k>=2）

voidgenCanItemsetK（）;

//产生并输出频繁k-项集（k>=2）

voidgenFreItemsetK（）;

//剪枝：

剪去合并后项集中含有非频繁项集中的项

voidcutNotCanItemsetK（vector&item）;

五、实验截图

1.程序运行界面

2.输出文件截图1

3.输出文件截图1

六、实验总结

做完这个实验，有如下收获：

1.同一数据集，最小支持度越小，那么产生的频繁项集维数越高，程序运行时间越长；

2.更加深刻理解了：

频繁子集的任何子集一定是频繁的，子集频繁父亲一定频繁；

3.Apriori也存在缺点：

第一在每一步产生侯选项目集时循环产生的组合过多，没有排除不应该参与组合的元素；第二,每次计算项集的支持度时，开销会随着数据的增多而成几何级增长。

七、附

1.程序源码main.cpp

#include

usingnamespacestd;

doubleminsup;//设置最小支持度

mapitems_count;//统计各个项集的数目

vector>datavec;//原始数据项集

vector>candidatevec;//候选项集

vector>frequentvec;//频繁项集

ofstreamoutFile;

intround=1;//生成项集轮次

longtrancount=0;//原始事务总数

//判断某个项目在某一个事务中是否存在，存在则值为1，反之为0

vector>bitmap;

//获取原始数据集，并统计事务个数

voidgetOriData（）;

//合并生成新的候选项集

vectormergeItem（vectorvect1,vectorvect2,intround）;

//判断项集item是否已经存在候选项集集合items中,存在则返回1

intisExist（vectoritem,vector>items）;

//产生并输出候选1项集

voidgenCanItemset1（）;

//产生并输出频繁1项集

voidgenFreItemset1（）;

//产生并输出候选k-项集（k>=2）

voidgenCanItemsetK（）;

//产生并输出频繁k-项集（k>=2）

voidgenFreItemsetK（）;

//剪枝：

剪去合并后项集中含有非频繁项集中的项

voidcutNotCanItemsetK（vector&item）;

intmain（）

{

getOriData（）;//获取原始数据集，并统计事务个数

cout<<"请输入结果文件名：

";//pause

stringfName;

cin>>fName;

cout<<"请输入最小支持度：

cin>>minsup;

outFile.open（fName,ios:

trunc）;

outFile<<"最小支持度为minsup="<

genCanItemset1（）;

genFreItemset1（）;

if（!

frequentvec.empty（））//判断频繁1项集是否为空，为空则退出

{

genCanItemsetK（）;

genFreItemsetK（）;

}while（!

frequentvec.empty（））;//频繁项集不为空,则循环继续

}

outFile.close（）;

cout<<"\n结果已保存到"<

\n";

system（"pause"）;

return0;

}

//获取原始数据集，并统计事务个数

voidgetOriData（）

{

intflag;

cout<<"数据集文件：

\n1.dataA.txt\n2.dataB.txt\n请输入（1选择dataA，其他选择2）\n";

cin>>flag;

stringfilename;

if（flag==1）

filename="dataA.txt";//打开数据文件

else

filename="dataB.txt";

ifstreamfile（filename）;

if（!

file）//检查文件是否打开成功

{

cout<<"Failtoopendatafile!

system（"pause"）;

exit（0）;

}

else

{

stringtemp;

vectoritem;//项集的临时vector

cout<<"原始数据集：

intbegin,end;

while（getline（file,temp））//一行一行读入数据

{

trancount++;

begin=0;

temp.erase（0,temp.find_first_not_of（"\r\t\n"））;//去除字符串首部的空格

temp.erase（temp.find_last_not_of（"\r\t\n"）+1）;//去除字符串尾部的空格

while（（end=temp.find（'',begin））!

=string:

npos）//每一个事务中的项是以空格为分隔符的

{

item.push_back（temp.substr（begin,end-begin））;//将每一个项插入item中

begin=end+1;

}

item.push_back（temp.substr（begin））;//一个事务中的最后一项

datavec.push_back（item）;//将一个事务中的所有项当成一个整体插入另一个大的vector中

item.clear（）;//清空item

cout<

}

file.close（）;

}

//产生并输出候选1项集

voidgenCanItemset1（）

{

mapitem_map;

for（intix=0;ix!

=datavec.size（）;++ix）

{

for（intiy=0;iy!

=datavec[ix].size（）;++iy）

{

items_count[datavec[ix].at（iy）]++;//该项集的计数加1

item_map[datavec[ix].at（iy）]=true;//表示该项目在该事务中存在，值为1，否则默认为0

}

bitmap.push_back（item_map）;

item_map.clear（）;//这里一定要清空一下

}

map:

const_iteratormap_it=items_count.begin（）;

outFile<<"候选1项集:

while（map_it!

=items_count.end（））//输出候选1项集

{

outFile<<"{"<first<<"}"<

map_it++;

}

//产生并输出频繁1项集

voidgenFreItemset1（）

{

map:

const_iteratormap_it=items_count.begin（）;

outFile<<"频繁1项集:

vectoritem;//项集的临时vector

while（map_it!

=items_count.end（））//频繁1项集

{

if（（（float）map_it->second/（float）trancount）>minsup||fabs（（（float）map_it->second/（float）trancount）-minsup）<1.0e-7）//支持度大于0.2

{

outFile.setf（ios:

fixed）;

outFile<<"{"<first<<"}"<<"支持度:

（2）<<（float）map_it->second/（float）trancount<

item.push_back（map_it->first）;

frequentvec.push_back（item）;//插入频繁1项集的vector中

item.clear（）;

}

map_it++;

}

//产生并输出候选k-项集（k>=2）

voidgenCanItemsetK（）

{

//生成下一轮的候选项集

vectoritem;//项集的临时vector

intst=frequentvec.size（）;

candidatevec.clear（）;//清除上一轮的候选项集

for（intst1=0;st1

{

for（intst2=st1+1;st2

{

item=mergeItem（frequentvec[st1],frequentvec[st2],round）;//调用函数合并生成下一轮的候选项集

if（!

item.empty（）&&!

isExist（item,candidatevec））//若经过判断处理后返回的vector不为空且还不存在该项集，则作为候选项集加入候选vector中

{

cutNotCanItemsetK（item）;

}

round++;

outFile<<"候选"<

for（intix=0;ix!

=candidatevec.size（）;++ix）//输出候选项集

{

outFile<<"{";

for（intiy=0;iy!

=candidatevec[ix].size（）;++iy）

{

outFile<

}

outFile<<"}"<

}

if（candidatevec.empty（））//候选项集为空

{

outFile<<"候选"<

}

//产生并输出频繁k-项集（k>=2）

voidgenFreItemsetK（）

{

intflag;//标记某个项集在某条事务中是否出现，出现为1，不出现为0，如：

{I1I2}

intcount;//统计某个想集在整个交易的事务集中出现的次数

stringtempstr;//临时string，用于串接各个项成一个字符串：

如:

I1I2I3串接为"I1I2I3"

intmark;//为避免执行多余的字符串串接工作

frequentvec.clear（）;//清除上一轮的频繁项集

for（intsx=0;sx!

=candidatevec.size（）;++sx）//构造下一轮的频繁项集

{

mark=1;

count=0;

for（intsy=0;sy!

=bitmap.size（）;++sy）

{

flag=1;//初始化为1，表出现

for（intsz=0;sz!

=candidatevec[sx].size（）;++sz）

{

if（bitmap[sy][candidatevec[sx].at（sz）]==false）//存在某一个子项不存在，则没出现项集

{

flag=0;

}

if（mark==1）//只串接一次，如I1I2否则为10个I1I2的串接

{

tempstr+=candidatevec[sx].at（sz）;//串接字符串

}

if（flag）//flag仍然为1，表示该项集在该条事务中出现了，计数加1

{

count++;

}

mark++;

}

if（（（float）count/（float）trancount）>minsup||fabs（（（float）count/（float）trancount）-minsup）<1.0e-7）//支持度大于0.2

{

frequentvec.push_back（candidatevec[sx]）;//插入频繁项集

}

items_count[tempstr]=count;//对应该项集的计数值

/////////假设此时生成的tempstr为I1I2I3,为便于后面的求置信度的计算，这里需要产生I2I1I3,I1I3I2等组合，并

//在items_count中给它们赋予和I1I2I3相同的值

sort（candidatevec[sx].begin（）,candidatevec[sx].end（））;//排序

stringtempstr2;

while（next_permutation（candidatevec[sx].begin（）,candidatevec[sx].end（）））//取下一排列组合

{

for（inttempst=0;tempst!

=candidatevec[sx].size（）;tempst++）//拼接出该字符串组合

{

tempstr2+=candidatevec[sx][tempst];

}

items_count[tempstr2]=count;//对应该项集的计数值

tempstr2.erase（）;

}

tempstr.erase（）;

}

if（!

frequentvec.empty（））//频繁项集不为空

{

outFile<<"频繁"<

for（intsx=0;sx!

=frequentvec.size（）;++sx）//输出频繁项集

{

outFile.setf（ios:

fixed）;

outFile<<"{";

for（intsz=0;sz!

=frequentvec[sx].size（）;++sz）

{

outFile<

tempstr+=frequentvec[sx].at（sz）;//串接字符串

}

outFile<<"}";

outFile<<"支持度:

（2）<<（float）items_count[tempstr]/（float）trancount<

tempstr.erase（）;

}

else

{

outFile<<"没有"<

}

//两个项集合并（要求只有一项不同）成一个新的项集（做为候选集）

vectormergeItem（vectorvect1,vectorvect2,intround）

{

intcount=0;//统计两个vector中相同的项的数目

vectorvect;

maptempMap;//辅助判断两个vector中重复的项

for（unsignedintst=0;st

{

tempMap[vect1[st]]++;

vect.push_back（vect1[st]）;

}

for（unsignedintst=0;st

{

tempMap[vect2[st]]++;

if（tempMap[vect2[st]]==2）//表示这两项相同

{

count++;

}

else

{

vect.push_back（vect2[st]）;

}

if（（count+1）!

=round）//要求两个项目集只有一个项目不相同，其他都相同,如:

I1I2I4和I1I2I3

{

vect.clear（）;

}

returnvect;

}

//剪枝：

剪去合并后项集中含有非频繁项集中的项

voidcutNotCanItemsetK（vector&item）

{

////////实现剪枝//////////////////////////

stringtempstr;

vectortempvec;

boolfound=false;//是否包含有非频繁的子集,为1表示含有，有的话进行剪枝，如假设I1I4为非频繁项集，则I1I2I4要剪枝掉

stringteststr;

inttestint;

tempvec=item;

sort（tempvec.begin（）,tempvec.end（））;

while（next_permutation（tempvec.begin（）,tempvec.end（）））//遍历所有的组合I1I2I4,要变成I1I4I2或其他如I2I1I4才能判断它包含I1I4这个非频繁项集

{

for（inttempst=0;tempst!

=tempvec.size（）;tempst++）//拼接出该字符串组合

{

tempstr+=tempvec[tempst];

}

for（map:

const_iteratortempit=items_count.begin（）;tempit!

=items_count.end（）;tempit++

展开阅读全文