实在没找到当条件子树的数据类似[[‘A’],[‘A’],[‘B’],[‘A’,’B’],[‘A’,’B’]]的时候该怎么建….有的是直接建成只有单条路径的树了,但是按照FP树构造逻辑应该在root上就有两条分支= = 迷
这次注释挺多
import copy
test_data = [
['A', 'B'],
['B', 'C', 'D', 'E'],
['A', 'C', 'D', 'F'],
['B', 'A', 'C', 'D'],
['B', 'A', 'C', 'F']
]
class FpNode:
def __init__(self):
# Count数量
self.count = 0
# 下一个同Id节点
self.next_node = None
# Id分类
self.id_name = None
# 子节点
self.child = []
# 父节点
self.root = None
class FpGrowth:
def __init__(self, dataset, min_supp=1):
self.dataset = dataset
# 最小绝对支持度
self.min_supp = min_supp
self.dic = {}
# 存储最大同ID节点位置
self.nodedic = {}
self.L = []
# 生成一级频繁项集
def get_first_group(self, dataset):
dic = {}
for data in dataset:
for i in data:
if i not in dic.keys():
dic.setdefault(i, 1)
else:
dic[i] += 1
all_item = []
for key in dic.keys():
all_item.append([key, dic[key]])
all_item.sort(key=lambda x: (-x[1], x))
result = []
for item in all_item:
if item[1] >= self.min_supp:
result.append(item[0])
else:
del dic[item[0]]
# L 频繁一项集
self.dic = dic
self.L = result
return result, dic
def clean_dataset(self, dataset):
result = []
for data in dataset:
tmp = []
# 仅添加在频繁集中的
for i in data:
if i in self.dic.keys():
tmp.append(i)
# 使用item值为第二排序条件保证在出现次数相同的情况下不会随机顺序
tmp.sort(key=lambda x: (-self.dic[x], x))
result.append(tmp)
self.dataset = result
return result
def build_tree(self, root=None, dataset=None):
if dataset is None:
dataset = self.dataset
if root is None:
root = FpNode()
for data in dataset:
d_root = root
for i in data:
# 查找是否包含了该值得child
flag = False
for child in d_root.child:
if child.id_name is i:
# 包含的状况
child.count += 1
flag = True
d_root = child
break
if not flag:
node = FpNode()
node.count = 1
node.id_name = i
node.root = d_root
d_root.child.append(node)
d_root = node
# 维护最大ID节点位置
if i in self.nodedic.keys():
dic_node = self.nodedic[i]
while dic_node.next_node is not None:
dic_node = dic_node.next_node
dic_node.next_node = node
else:
self.nodedic.setdefault(i, node)
return root
def is_tree_has_one_line(self, root):
while len(root.child) > 0:
if len(root.child) > 1:
return False
root = root.child[0]
return True
# 寻找一个集合的所有K-1项子集
def get_k_1_list(self, k_list):
result = []
for i in range(len(k_list)):
list_t = [k_list[j] for j in range(len(k_list)) if j is not i and k_list[j] is not None]
list_t.sort()
result.append(list_t)
result_k = result
while len(result_k[len(result_k) - 1]) > 1:
k_tmp = copy.deepcopy(result_k)
result_k = []
for k in k_tmp:
for i in range(len(k)):
list_t = [k[j] for j in range(len(k)) if j is not i]
list_t.sort()
result_k.append(list_t)
for t in result_k:
t.sort()
if t not in result:
result.append(t)
return result
def fp_growth(self, tree):
result = []
l_set = copy.deepcopy(self.L)
if tree is None or len(tree.child) is 0:
return []
if self.is_tree_has_one_line(tree):
t_root = tree
t_result = []
while t_root is not None:
t_result.append(t_root.id_name)
if len(t_root.child) > 0:
t_root = t_root.child[0]
else:
break
result = self.get_k_1_list(t_result)
return result
for i in range(len(l_set)):
item_curr = l_set[-i - 1]
i_result = []
d_node = self.nodedic[item_curr]
while d_node is not None:
item_tmp = []
d_root = d_node
while d_root.root is not None:
if d_root.id_name is not item_curr:
item_tmp.append(d_root.id_name)
d_root = d_root.root
for it in range(d_node.count):
i_result.append(item_tmp)
# result.append(item_tmp)
d_node = d_node.next_node
i_result.sort(key=lambda x:-len(x))
# 递归挖掘
fp_if = FpGrowth(i_result, self.min_supp)
fp_if.get_first_group(i_result)
fp_if.clean_dataset(i_result)
fp_root = fp_if.build_tree()
for res in fp_if.fp_growth(fp_root):
if res not in result:
result.append(res)
return result
f = FpGrowth(test_data, 2)
print(f.get_first_group(test_data))
print(f.clean_dataset(test_data))
root = f.build_tree()
print(f.fp_growth(root))