oneR算法,简单解释就是在一个Unit的所有属性中选择一个误差最小或者是最稳定的属性进行评判

同样,对于这个属性的每个属性值,在其对应的结果中选取占比最大的结果,作为该属性值的预测结果(语文不好……)

 

算法思想:

对于每个不连续(离散)的属性,统计每个属性值对应的各个结果的个数,最终返回每个属性值对应的结果以及总误差

对于每个连续属性,将其取值范围分为 type + 1 类,并统计落在每个 (n-1, n] 区间中的属性值,同样返回每个属性区间对应的结果以及总误差

最后从所有属性中选出误差最小的属性作为评判依据

 

代码:

# Sample training data: one tuple per observation.
# Columns, in order: outlook (str), temperature (float), humidity (float),
# windy (bool), play (str).
# The last element of each tuple ('yes' / 'no') is the class label; every
# element before it is an attribute.
train = [
    ('sunny', 85.0, 85.0, False, 'no'),
    ('rainy', 75.0, 80.0, False, 'yes'),
    ('sunny', 75.0, 70.0, True, 'yes'),
    ('overcast', 72.0, 90.0, True, 'yes'),
    ('overcast', 81.0, 75.0, False, 'yes'),
    ('rainy', 71.0, 91.0, True, 'no'),
    ('sunny', 80.0, 90.0, True, 'no'),
    ('overcast', 83.0, 86.0, False, 'yes'),
    ('rainy', 70.0, 96.0, False, 'yes'),
    ('rainy', 68.0, 80.0, False, 'yes'),
    ('rainy', 65.0, 70.0, True, 'no'),
    ('overcast', 64.0, 65.0, True, 'yes'),
    ('sunny', 72.0, 95.0, False, 'no'),
    ('sunny', 69.0, 70.0, False, 'yes'),
]


# Evaluate a discrete (non-continuous) attribute
def oneR_calc_discrete(dataset, index):
    """Derive the 1R rule for one discrete attribute.

    For every distinct value found in column ``index`` the label that occurs
    most often among the matching rows becomes the prediction for that value.

    Args:
        dataset: Sequence of rows. It is iterated more than once, and the
            last element of each row is taken as the class label.
        index: Position of the attribute column inside each row.

    Returns:
        A ``(type_score, error_rate)`` tuple. ``type_score`` is a list of
        ``(value, predicted_label, value_error_rate)`` tuples, one per
        distinct attribute value in first-seen order. ``error_rate`` is the
        fraction of all rows whose label differs from the prediction made
        for their attribute value. An empty dataset yields ``([], 0.0)``.
    """
    if not dataset:
        # Nothing to count; avoid dividing by a zero row total below.
        return [], 0.0

    # dict.fromkeys keeps first-seen order, so both the order of the returned
    # rules and the tie-breaking between equally frequent labels are
    # reproducible (iterating a set of strings is not).
    labels = list(dict.fromkeys(row[-1] for row in dataset))
    values = dict.fromkeys(row[index] for row in dataset)
    counts = {value: dict.fromkeys(labels, 0) for value in values}

    # A single pass is enough: each row contributes exactly one count.
    for row in dataset:
        counts[row[index]][row[-1]] += 1

    type_score = []
    err_total = 0
    row_total = 0
    for value, label_counts in counts.items():
        best_label, err_rate, err_num, seen = find_max(label_counts)
        type_score.append((value, best_label, err_rate))
        err_total += err_num
        row_total += seen

    return type_score, err_total / row_total


# Evaluate a continuous attribute
def oneR_calc_series(dataset, index, type=5):
    """Derive the 1R rule for one continuous attribute by bucketing it.

    ``type`` thresholds are placed at ``min + i * step`` for
    ``i in range(type)``, where ``step = (max - min) / type``. A row is
    counted in the bucket of the smallest threshold that is greater than or
    equal to its value; rows above the largest threshold are counted in the
    extra bucket keyed by the string ``'inf'``. This gives up to ``type + 1``
    buckets (thresholds collapse into one when all values are equal).

    Args:
        dataset: Non-empty sequence of rows. It is iterated more than once,
            and the last element of each row is taken as the class label.
        index: Position of the attribute column inside each row.
        type: Number of thresholds to create; must be at least 1.

    Returns:
        A ``(type_score, error_rate)`` tuple. ``type_score`` is a list of
        ``(threshold, predicted_label, bucket_error_rate)`` tuples in
        ascending threshold order followed by the ``'inf'`` bucket.
        ``error_rate`` is the fraction of all rows whose label differs from
        the prediction made for their bucket.

    Raises:
        ValueError: If ``dataset`` is empty or ``type`` is smaller than 1.
    """
    if not dataset:
        raise ValueError('dataset must not be empty')
    if type < 1:
        raise ValueError('type must be at least 1')

    column = [row[index] for row in dataset]
    low = min(column)
    high = max(column)
    # First-seen label order keeps tie-breaking between equally frequent
    # labels reproducible (iterating a set of strings is not).
    labels = list(dict.fromkeys(row[-1] for row in dataset))

    step = (high - low) / type

    node = {}
    for i in range(type):
        # setdefault: when step is 0 every threshold is the same key.
        node.setdefault(low + i * step, dict.fromkeys(labels, 0))

    # Collect the numeric thresholds before the string key is added so that
    # sorting never has to compare numbers with 'inf'.
    thresholds = sorted(node)
    node['inf'] = dict.fromkeys(labels, 0)

    for row in dataset:
        for threshold in thresholds:
            if row[index] <= threshold:
                node[threshold][row[-1]] += 1
                break
        else:
            # Larger than every threshold.
            node['inf'][row[-1]] += 1

    type_score = []
    err_total = 0
    row_total = 0
    for bucket, label_counts in node.items():
        best_label, err_rate, err_num, seen = find_max(label_counts)
        type_score.append((bucket, best_label, err_rate))
        err_total += err_num
        row_total += seen

    return type_score, err_total / row_total


def find_max(dict):
    """Pick the key with the largest count and report how wrong it is.

    Args:
        dict: Mapping from key to a numeric count.

    Returns:
        A ``(key, error_rate, error_count, total)`` tuple where ``key`` is
        the first key holding the largest count, ``total`` is the sum of all
        counts, ``error_count`` is ``total`` minus the largest count and
        ``error_rate`` is ``error_count`` expressed as a fraction of
        ``total``. When the counts add up to zero the last three fields are
        all ``0``; ``key`` is ``None`` only if no count exceeded ``-1``
        (for example, an empty mapping).
    """
    best_key, best_count = None, -1
    total = 0
    for key, count in dict.items():
        total += count
        # Strict comparison: on a tie the earlier key wins.
        if count > best_count:
            best_key, best_count = key, count

    if total == 0:
        return best_key, 0, 0, 0

    return best_key, 1 - (best_count / total), total - best_count, total


# Build the model
def build_model(train_data, type):
    """Evaluate every attribute and keep the one with the lowest error rate.

    The first row decides how each column is treated: ``str`` and ``bool``
    values are handled by ``oneR_calc_discrete``, everything else by
    ``oneR_calc_series``. The last column is the class label and is not
    evaluated.

    Args:
        train_data: Non-empty sequence of rows; the last element of each row
            is the class label.
        type: Number of thresholds passed on to ``oneR_calc_series`` for
            continuous attributes.

    Returns:
        A two-element list ``[attribute_index, rule]`` where ``rule`` is the
        ``(type_score, error_rate)`` tuple of the selected attribute. On a
        tie the attribute with the lower index is kept. ``rule`` is ``None``
        when no attribute has an error rate below 1. The choice is also
        printed to standard output.
    """
    sample = train_data[0]

    # Walk the attribute columns by position. Comparing values against the
    # label to find the end would stop early whenever an attribute value
    # happened to equal the label.
    model = []
    for index in range(len(sample) - 1):
        if isinstance(sample[index], (str, bool)):
            model.append(oneR_calc_discrete(train_data, index))
        else:
            model.append(oneR_calc_series(train_data, index, type))

    min_key = None
    min_value = 1
    min_index = 0
    for index, candidate in enumerate(model):
        error_rate = candidate[-1]
        if error_rate < min_value:
            min_value = error_rate
            min_key = candidate
            min_index = index

    print('Use type {c}: '.format(c=min_index) + str(min_key))
    return [min_index, min_key]


def check(data, model):
    """Predict the label of one row with a model shaped like build_model's.

    Args:
        data: Row of attribute values; only ``data[model[0]]`` is read.
        model: ``[attribute_index, (type_score, error_rate)]`` where
            ``type_score`` is a list of ``(key, label, error_rate)`` tuples.
            For a continuous attribute the keys are numeric thresholds plus
            the string ``'inf'`` for values above every threshold.

    Returns:
        The label attached to the matching rule. For a ``str`` or ``bool``
        value this is the rule whose key equals the value. Otherwise it is
        the rule with the smallest threshold that is greater than or equal
        to the value, falling back to the ``'inf'`` rule. ``None`` is
        returned when no rule matches.
    """
    index = model[0]
    rules = model[1][0]
    value = data[index]

    if isinstance(value, (str, bool)):
        for rule in rules:
            if value == rule[0]:
                return rule[1]
        return None

    # Keys are compared with == / != rather than ``is``: identity of equal
    # strings or floats is an implementation detail and must not decide
    # which rule is selected.
    bounded = sorted(
        (rule for rule in rules if rule[0] != 'inf'),
        key=lambda rule: rule[0],
    )
    for rule in bounded:
        if value <= rule[0]:
            return rule[1]

    # Larger than every threshold: use the catch-all bucket if there is one.
    for rule in rules:
        if rule[0] == 'inf':
            return rule[1]
    return None

# e = build_model(train, 7)
# print(check(('sunny', 85.0, 85.0, False),e))