C4.5 builds on ID3 by using the gain ratio instead of raw information gain, which reduces the bias toward attributes with many distinct values:
gain_ratio(X, A) = gain(X, A) / split_info(X, A)
where split_info(X, A) is the split information measure: the entropy of X's distribution over the values of attribute A (implemented below as split_in_formation).
# Split information: the entropy of the (non-class) attribute's own value distribution
def split_in_formation(self, dataset, index):
    return DecisionTreeID3().entropy(dataset, index)

# Gain ratio = information gain / split information
def gain_ratio(self, dataset, index):
    split_in = self.split_in_formation(dataset, index)
    gain = DecisionTreeID3().gain(dataset, index)
    try:
        result = gain / split_in
    except ZeroDivisionError:
        # split information is 0 when every sample shares one attribute value
        result = 1
    return result
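As a standalone sanity check of the formula (the toy dataset and entropy helper below are illustrative, independent of the classes in this post):

import math
from collections import Counter

def entropy(values):
    # H = -sum(p * log2(p)) over the value distribution
    counts = Counter(values)
    return -sum(c / len(values) * math.log2(c / len(values)) for c in counts.values())

# Toy rows of (outlook, play)
rows = [('sunny', 'no'), ('sunny', 'no'), ('rain', 'yes'),
        ('rain', 'yes'), ('overcast', 'yes'), ('overcast', 'yes')]
gain = entropy([r[1] for r in rows])
for v in {r[0] for r in rows}:
    subset = [r[1] for r in rows if r[0] == v]
    gain -= len(subset) / len(rows) * entropy(subset)
split_info = entropy([r[0] for r in rows])  # entropy of the attribute itself
print(gain / split_info)  # ~0.579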
This implementation of C4.5 handles continuous data by cutting each attribute's value range into m equal-width bins, where m is supplied by the user (the split parameter below):
def perpre_dataset(self, dataset, split=5):
    # Bin each numeric column into `split` equal-width intervals.
    # The last column is the class label, so leave it alone.
    for i in range(len(dataset[0]) - 1):
        if isinstance(dataset[0][i], (str, bool)):
            continue
        min_t = min(item[i] for item in dataset)
        max_t = max(item[i] for item in dataset)
        self.series_split.setdefault(i, [])
        for k in range(split - 1):
            # split - 1 cut points dividing [min_t, max_t] into `split` equal parts
            self.series_split[i].append(min_t + (k + 1) * (max_t - min_t) / float(split))
        self.series_split[i].sort()
        for item in dataset:
            item[i] = self.get_series_type(item[i], i)
    return dataset
def get_series_type(self, data, index):
    # Map a raw value to the index of the interval it falls into
    splits = self.series_split[index]
    if data <= splits[0]:
        return 0
    for i in range(1, len(splits)):
        if splits[i - 1] < data <= splits[i]:
            return i
    return len(splits)
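For instance, with split=5 the range [0, 10] yields cut points 2, 4, 6, 8 and bin indices 0 through 4. A standalone sketch of the same binning (bisect_left reproduces get_series_type's boundary rule):

import bisect

def equal_width_cuts(values, split=5):
    lo, hi = min(values), max(values)
    # split - 1 interior cut points give `split` equal-width bins
    return [lo + (k + 1) * (hi - lo) / split for k in range(split - 1)]

values = [0, 1, 3, 5, 7, 10]
cuts = equal_width_cuts(values)  # [2.0, 4.0, 6.0, 8.0]
print([bisect.bisect_left(cuts, v) for v in values])  # [0, 0, 1, 2, 3, 4]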
The class stores the cut points for continuous attributes and the tree root:
class DecisionTreeC45:
    def __init__(self, dataset=None):
        self.dataset = dataset
        self.root = None
        self.attr_num = 0
        self.series_split = {}  # per-column cut points for continuous attributes
Tree construction follows the same pattern as ID3, and so does classification:
def build_tree(self, dataset, used_index=None):
    # A fresh list per call avoids the shared-mutable-default pitfall
    if used_index is None:
        used_index = []
    type_num = len(dataset[0]) - 1
    self.attr_num = type_num
    # All attributes used: return a leaf labeled with the majority class
    if len(used_index) == type_num:
        leaf_node = TreeNode()
        leaf_node.is_leaf = True
        leaf_node.decision = DecisionTreeID3.find_most_pop_ans(dataset)
        return leaf_node
    # All samples share one class: return a leaf with that class
    if len(set(item[-1] for item in dataset)) == 1:
        leaf_node = TreeNode()
        leaf_node.is_leaf = True
        leaf_node.decision = dataset[0][-1]
        return leaf_node
    # Pick the unused attribute with the highest gain ratio
    max_gain = -1
    max_index = -1
    for i in range(type_num):
        if i not in used_index:
            gain_tmp = self.gain_ratio(dataset, i)
            if gain_tmp > max_gain:
                max_gain = gain_tmp
                max_index = i
    node = TreeNode()
    node.index = max_index
    type_set = set(item[max_index] for item in dataset)
    for typex in type_set:
        # Pass a new list so sibling branches don't share used attributes
        node.child.setdefault(typex, self.build_tree(
            [item for item in dataset if item[max_index] == typex],
            used_index + [max_index]))
    self.root = node
    return node
def check(self, dataset):
    # Bin the sample's continuous attributes with the training cut points
    for index in self.series_split.keys():
        dataset[index] = self.get_series_type(dataset[index], index)
    if len(dataset) != self.attr_num:
        print('Err data !')
        return None
    root = self.root
    if root is None:
        return None
    while not root.is_leaf:
        root = root.find_child(dataset[root.index])
        if root is None:
            print('Unindex Attr !')
            return None
    return root.decision
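To see the whole pipeline together, here is a minimal usage sketch. It assumes the TreeNode and DecisionTreeID3 definitions from the ID3 section above; the toy weather data is made up for illustration:

# Columns: temperature (continuous), windy, play (class label)
data = [[30, 'no', 'no'], [28, 'no', 'no'], [21, 'no', 'yes'],
        [18, 'yes', 'yes'], [15, 'yes', 'no'], [22, 'no', 'yes']]
tree = DecisionTreeC45()
data = tree.perpre_dataset(data, split=3)  # bin the temperature column
tree.build_tree(data)
print(tree.check([22, 'no']))              # expected: 'yes' on this toy data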
Random forests are one way to reduce decision-tree overfitting. Each tree is trained on a random subset of the samples using only a random subset of the attributes; at classification time every tree votes, and the class with the most votes wins:
import copy
import random
from collections import Counter

class ForestNode:
    def __init__(self, data_num=5, tree_num=3, attr_num=3):
        self.trees = []
        self.data_num = data_num  # samples drawn for each tree
        self.tree_num = tree_num  # number of decision trees
        self.attr_num = attr_num  # attributes each tree may use

    # Draw `num` distinct random integers from [st, ed]
    def get_random_list(self, st, ed, num):
        return random.sample(range(st, ed + 1), num)
def random_build_tree(self, dataset, split=5, decision_tree_type=DecisionTreeC45):
    for i in range(self.tree_num):
        t = decision_tree_type()
        # Random subset of the samples for this tree
        data_random = self.get_random_list(0, len(dataset) - 1, self.data_num)
        data_part = copy.deepcopy([dataset[index] for index in data_random])
        t.perpre_dataset(data_part, split)
        # Pre-mark attribute columns as used so that exactly attr_num of the
        # len - 1 attributes remain (the last column is the class label)
        attr_count = len(data_part[0]) - 1
        attr_list = self.get_random_list(0, attr_count - 1, attr_count - self.attr_num)
        t.build_tree(data_part, attr_list)
        self.trees.append(t)
def random_check(self, dataset):
    result = []
    old_dataset = copy.deepcopy(dataset)
    for tree in self.trees:
        # check() bins the sample in place, so give each tree a fresh copy
        dataset = copy.deepcopy(old_dataset)
        r = tree.check(dataset)
        if r is not None:
            result.append(r)
    # Majority vote: the most common class among the trees' answers
    result_count = Counter(result)
    return result_count.most_common(1)
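A usage sketch for the forest, under the same assumptions as the single-tree example above (the toy data and parameter values are illustrative):

data = [[30, 'no', 'no'], [28, 'no', 'no'], [21, 'no', 'yes'],
        [18, 'yes', 'yes'], [15, 'yes', 'no'], [22, 'no', 'yes']]
forest = ForestNode(data_num=4, tree_num=5, attr_num=1)
forest.random_build_tree(data, split=3)
# Each of the 5 trees votes; trees that hit an unseen value abstain
print(forest.random_check([22, 'no']))  # e.g. [('yes', 4)]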