Apriori algorithm是关联规则里一项基本算法。是由Rakesh Agrawal和Ramakrishnan Srikant两位博士在1994年提出的关联规则挖掘算法。关联规则的目的就是在一个数据集中找出项与项之间的关系,也被称为购物篮分析 (Market Basket Analysis),因为“购物篮分析”很贴切地表达了该算法适用情景中的一个典型子集。
算法关键在于生成密集项。生成第K个密集项的方法是:
- 由K-1个密集项 x 数据集中所有的Item类型 生成一个每项大小比K-1密集项大1的新集合
- 对于新集合的每个项目,去掉其中项目子集不包含在K-1密集项中的项目,即剪枝(我的方法是得出每个项目(假设为['A','B','C'])的所有长度-1的子项集合,即([['A','B'],['A','C'],['B','C']]),然后判断其中的每一项是否出现在了K-1密集项中,如果有未出现的则删除)
- 计算新集合中剩下的每个项目的支持度,即同时拥有该集合项目的记录条数占总数据条数的比例
- 取前K条项目,组合成为新的密集项
代码好丑= =
class Apriori:
    """Apriori frequent-itemset miner over a list-of-transactions dataset."""

    def __init__(self, dataset, LK=4, min_sup=0.25):
        # dataset: list of transactions, each a list of items.
        # LK: keep at most LK itemsets per frequency level.
        # NOTE(review): min_sup is stored but never read by the visible
        # methods — pruning is done via the top-LK cut instead; confirm
        # before relying on it.
        self.dataset = dataset
        self.LK = LK
        self.min_sup = min_sup
        self.I_set = []  # singleton candidate itemsets, filled by get_items()
# 获得所有Item类型
def get_items(self, dataset):
result = {}
for i in dataset:
for item in i:
if item not in result.keys():
result.setdefault(item, 1)
else:
result[item] += 1
out = []
for key in result.keys():
out.append([key, result[key]])
out.sort(key=lambda x: -x[1])
self.set_iset(out[0:self.LK])
return out[0:self.LK]
# 装入类变量
def set_iset(self, items):
for i in items:
self.I_set.append([i[0]])
def get_support(self, dataset, iset):
    """Return the fraction of transactions in `dataset` containing every item of `iset`.

    Fixes vs. original: no longer shadows the builtin `sum`; the membership
    set is built once instead of per transaction; an empty dataset returns
    0.0 instead of raising ZeroDivisionError.
    """
    if not dataset:
        return 0.0
    wanted = set(iset)  # hoisted loop invariant
    hits = 0
    for data in dataset:
        # issubset(data) == "every wanted item occurs in this transaction"
        if wanted.issubset(data):
            hits += 1
    return float(hits) / len(dataset)
# 获取K+1的频繁集
def get_LK_1(self, LK_set, dataset):
result = []
for now_set in LK_set:
for i in self.I_set:
if i[0] not in now_set:
now_i = copy.deepcopy(now_set)
now_i.append(i[0])
sum = 0
for data in dataset:
flag = True
for k in now_i:
if k not in data:
flag = False
break
if flag is True:
sum += 1
# 在支持度》0时加入集合
if sum >= 0:
now_i.sort()
unit = [now_i, sum]
flag = True
for res in result:
if unit == res:
flag = False
break
if flag is True:
result.append(unit)
# 按支持度排序
result.sort(key=lambda x: -x[1])
out = []
for i in result[0:self.LK]:
out.append(i[0])
# 取最多LK个
return result[0:self.LK], out
def find_bigist_K(self, dataset):
if len(self.I_set) is 0:
print('Get I First')
return None
iset = self.I_set
last_result = None
last_iset = None
result = None
count = 1
# 直到不能找到更大的频繁集
while len(iset) > 0:
last_result = result
last_iset = iset
result, tmp_iset = self.get_LK_1(iset, test_data)
iset = []
for tmp in tmp_iset:
# 验证所有子集是否都是频繁集
if self.check_k(last_iset, tmp) is True:
iset.append(tmp)
print('Calc {c}:'.format(c=count) + str(iset))
count += 1
return last_iset, last_result
# Placeholder: intended to enumerate all subsets of a maximal frequent
# itemset; never implemented in the original post.
def find_all_child(self, big_k):
    return None
# Enumerate all (K-1)-item subsets of an itemset.
def get_k_1_list(self, k_list):
    """Return every (K-1)-item subset of `k_list`, each subset sorted.

    Fix vs. original: the original filtered indices with `j is not i`, an
    identity comparison between ints that is only reliable inside CPython's
    small-integer cache (itemsets longer than ~256 items would misbehave).
    Slicing the skipped position out removes the comparison entirely.
    """
    result = []
    for skip in range(len(k_list)):
        subset = sorted(k_list[:skip] + k_list[skip + 1:])
        result.append(subset)
    return result
# 判断是否在K-1频繁集内
def check_k(self, iset, k_list):
k_tmp = self.get_k_1_list(k_list)
for k in k_tmp:
if k not in iset:
return False
return True
获取推荐的方法是获取最终密集项中的所有可能子集,产生推荐关系,如
对于[‘A’,’B’,’C’] 可以生成:
['A','B'] 推荐 ['C'],则它的置信度为 Supp(['A','B','C']) / Supp(['A','B'])
其他类推
当置信度高于阈值时输出或返回,作为推荐结果
# 获取推荐
def get_recommend(self, dataset, iset, min_dist=0.0):
result = []
while len(iset[0]) >= 2:
iset_t = copy.deepcopy(iset)
iset = []
for iset_item in iset_t:
for item in self.get_k_1_list(iset_item):
if item not in iset:
iset.append(item)
diff = list(set(iset_item).difference(item))
dist = self.get_support(dataset, iset_item) / self.get_support(dataset, item)
if dist > min_dist:
result.append([item, diff, dist])
return result
测试代码和数据如下:
# Demo: mine the toy basket dataset and print rules with confidence above 0.5.
test_data = [
    ['A', 'D', 'C'],
    ['B', 'C', 'E'],
    ['A', 'B', 'C', 'E'],
    ['B', 'E'],
]

a = Apriori(test_data)
print(a.get_items(test_data))
result, out = a.find_bigist_K(test_data)
for rule in a.get_recommend(test_data, result, min_dist=0.5):
    print('For {0} to recommend : {1} have Confidence :{2}'.format(rule[0], rule[1], rule[2]))
当阈值为0.5时输出:
[['E', 3], ['C', 3], ['B', 3], ['A', 2]]
Calc 1:[['B', 'E'], ['C', 'E'], ['B', 'C'], ['A', 'C']]
Calc 2:[['B', 'C', 'E']]
Calc 3:[]
For ['C', 'E'] to recommend : ['B'] have Confidence :1.0
For ['B', 'E'] to recommend : ['C'] have Confidence :0.6666666666666666
For ['B', 'C'] to recommend : ['E'] have Confidence :1.0
For ['E'] to recommend : ['C'] have Confidence :0.6666666666666666
For ['C'] to recommend : ['E'] have Confidence :0.6666666666666666
For ['E'] to recommend : ['B'] have Confidence :1.0
For ['B'] to recommend : ['E'] have Confidence :1.0
For ['C'] to recommend : ['B'] have Confidence :0.6666666666666666
For ['B'] to recommend : ['C'] have Confidence :0.6666666666666666