C4.5 builds on ID3 by replacing information gain with the gain ratio, which reduces the bias toward attributes with many distinct values.

gain_ratio(X, A) = gain(X, A) / split_info(X, A)

where split_info is the split information measure, i.e. the entropy of X taken over the values of attribute A rather than over the class (implemented below as split_in_formation).
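
Spelled out using the standard C4.5 definition, where X_v is the subset of X on which attribute A takes value v:

split_info(X, A) = -Σ_v (|X_v| / |X|) * log2(|X_v| / |X|)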

    # Split information measure: the entropy of the (non-class) attribute itself
    def split_in_formation(self, dataset, index):
        return DecisionTreeID3().entropy(dataset, index)

    # Gain ratio: information gain normalized by the split information
    def gain_ratio(self, dataset, index):
        split_in = self.split_in_formation(dataset, index)
        gain = DecisionTreeID3().gain(dataset, index)
        try:
            result = gain / split_in
        except ZeroDivisionError:
            # A single-valued attribute has zero split information
            result = 1
        return result
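
As a self-contained sanity check, independent of the DecisionTreeID3 class, the same quantities can be computed directly with collections.Counter. This is only an illustrative sketch; the helper names and toy dataset are mine, not part of the implementation above:

import math
from collections import Counter

def entropy(rows, col):
    # Shannon entropy of column col (use col=-1 for the class column)
    counts = Counter(row[col] for row in rows)
    total = len(rows)
    return -sum(c / total * math.log2(c / total) for c in counts.values())

def gain_ratio(rows, attr):
    # Conditional entropy of the class given each value of attr
    cond = 0.0
    for v in set(row[attr] for row in rows):
        sub = [r for r in rows if r[attr] == v]
        cond += len(sub) / len(rows) * entropy(sub, -1)
    gain = entropy(rows, -1) - cond
    split_info = entropy(rows, attr)  # entropy of the attribute itself
    return gain / split_info if split_info else 1

data = [['sunny', 'no'], ['sunny', 'no'], ['rain', 'yes'], ['rain', 'yes']]
print(gain_ratio(data, 0))  # 1.0: the attribute perfectly separates the classes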

Continuous data is handled here by splitting each attribute's value range into m equal-width intervals, where m is supplied by the user.

    def perpre_dataset(self, dataset, split=5):
        for i in range(len(dataset[0])):
            # Skip discrete (string / bool) attributes
            if isinstance(dataset[0][i], (str, bool)):
                continue
            min_t = min([item[i] for item in dataset])
            max_t = max([item[i] for item in dataset])
            self.series_split.setdefault(i, [])
            # split - 1 cut points give `split` equal-width intervals
            for k in range(split - 1):
                self.series_split[i].append(min_t + (k + 1) * (max_t - min_t) / float(split))

            self.series_split[i].sort()
            # Replace each continuous value with its interval index
            for item in dataset:
                item[i] = self.get_series_type(item[i], i)

        return dataset

    def get_series_type(self, data, index):
        # Map a continuous value to the index of the interval it falls into
        if data <= self.series_split[index][0]:
            return 0
        for i in range(1, len(self.series_split[index])):
            if self.series_split[index][i] >= data > self.series_split[index][i - 1]:
                return i
        # Beyond the last cut point: the last interval
        return len(self.series_split[index])

The class stores the continuous-attribute cut points (series_split) and the tree root:

class DecisionTreeC45:
    def __init__(self, dataset=None):
        self.dataset = dataset
        self.root = None
        self.attr_num = 0
        self.series_split = {}  # per-attribute cut points for continuous values

Tree construction closely follows the ID3 version, and so does prediction:

    def build_tree(self, dataset, used_index=None):
        # Avoid a mutable default argument: a shared list would leak state
        # between independent build_tree calls
        if used_index is None:
            used_index = []
        type_num = len(dataset[0]) - 1
        self.attr_num = type_num
        # Every attribute already used: majority-vote leaf
        if len(used_index) == type_num:
            leaf_node = TreeNode()
            leaf_node.is_leaf = True
            leaf_node.decision = DecisionTreeID3.find_most_pop_ans(dataset)
            return leaf_node
        # All samples share one class: pure leaf
        if len(set([item[-1] for item in dataset])) == 1:
            leaf_node = TreeNode()
            leaf_node.is_leaf = True
            leaf_node.decision = dataset[0][-1]
            return leaf_node
        max_gain = 0
        max_index = -1
        for i in range(type_num):
            if i not in used_index:
                gain_tmp = self.gain_ratio(dataset, i)
                if gain_tmp > max_gain:
                    max_gain = gain_tmp
                    max_index = i

        # No remaining attribute yields a positive gain ratio: majority-vote leaf
        if max_index == -1:
            leaf_node = TreeNode()
            leaf_node.is_leaf = True
            leaf_node.decision = DecisionTreeID3.find_most_pop_ans(dataset)
            return leaf_node

        node = TreeNode()
        node.index = max_index
        type_set = set([item[max_index] for item in dataset])
        for typex in type_set:
            # Pass a fresh list so sibling branches do not see each other's choices
            node.child.setdefault(
                typex,
                self.build_tree([item for item in dataset if item[max_index] == typex],
                                used_index + [max_index]))

        self.root = node
        return node

    def check(self, dataset):
        # Map continuous values to their interval indices first
        for index in self.series_split.keys():
            dataset[index] = self.get_series_type(dataset[index], index)

        if len(dataset) != self.attr_num:
            print('Err data !')
            return None

        root = self.root
        if root is None:
            return None
        while not root.is_leaf:
            root = root.find_child(dataset[root.index])
            if root is None:
                print('Unindex Attr !')
                return None

        return root.decision
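
A minimal end-to-end run (made-up weather data; assumes TreeNode and DecisionTreeID3 from the earlier ID3 section are available):

dataset = [
    ['sunny', 'hot', 'no'],
    ['sunny', 'cool', 'yes'],
    ['rain', 'hot', 'yes'],
    ['rain', 'cool', 'yes'],
]
tree = DecisionTreeC45(dataset)
tree.build_tree(tree.perpre_dataset(dataset))
print(tree.check(['sunny', 'hot']))  # expected: 'no'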


A random forest is one way to reduce decision-tree overfitting.

The idea: train several trees, each on a random subset of the samples using only a random subset of the attributes. At prediction time, every tree votes and the most common class wins.

class ForestNode:
    def __init__(self, data_num=5, tree_num=3, attr_num=3):
        self.trees = []
        self.data_num = data_num  # number of samples drawn for each tree
        self.tree_num = tree_num  # number of decision trees
        self.attr_num = attr_num  # number of attributes each tree may use

    # Generate num distinct random integers in [st, ed]
    # (random.sample(range(st, ed + 1), num) would be an equivalent one-liner)
    def get_random_list(self, st, ed, num):
        result = []
        while len(result) != num:
            t = random.randint(st, ed)
            if t not in result:
                result.append(t)

        return result

    def random_build_tree(self, dataset, split=5, decision_tree_type=DecisionTreeC45):
        for i in range(self.tree_num):
            t = decision_tree_type()
            # Draw a random subset of rows for this tree
            data_random = self.get_random_list(0, len(dataset) - 1, self.data_num)
            data_part = copy.deepcopy([dataset[index] for index in range(len(dataset)) if index in data_random])
            t.perpre_dataset(data_part, split)
            # Mark all but attr_num attributes as already used; the label column
            # (last index) is excluded from the draw so it never blocks a real attribute
            attr_list = self.get_random_list(0, len(data_part[0]) - 2,
                                             len(data_part[0]) - 1 - self.attr_num)
            t.build_tree(data_part, attr_list)
            self.trees.append(t)

    def random_check(self, dataset):
        result = []
        # check() rewrites continuous values in place, so give each tree a fresh copy
        old_dataset = copy.deepcopy(dataset)
        for tree in self.trees:
            dataset = copy.deepcopy(old_dataset)
            r = tree.check(dataset)
            if r is not None:
                result.append(r)

        # Majority vote over the per-tree answers, e.g. [('yes', 2)]
        result_count = Counter(result)
        return result_count.most_common(1)
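
Putting it together (a sketch with made-up weather data; assumes random, copy, and Counter are imported as the classes above require, and note that results vary with the random draws):

random.seed(0)  # make the random subsets reproducible for this demo

forest = ForestNode(data_num=5, tree_num=3, attr_num=2)
dataset = [
    ['sunny', 'hot', 'high', 'no'],
    ['sunny', 'hot', 'normal', 'yes'],
    ['rain', 'mild', 'high', 'yes'],
    ['rain', 'cool', 'normal', 'yes'],
    ['sunny', 'cool', 'normal', 'yes'],
    ['rain', 'mild', 'normal', 'no'],
]
forest.random_build_tree(dataset)
print(forest.random_check(['sunny', 'hot', 'high']))  # e.g. [('no', 2)]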