oneR算法,简单解释就是在一个Unit的所有属性中选择一个误差最小或者是最稳定的属性进行评判
同样,对于该属性的每个属性值,选取其对应结果中占比最大的那个结果作为该属性值的预测结果(语文不好……)
算法思想:
对于每个离散(不连续)的属性,统计每个属性值对应的各个结果的个数,最终返回每个属性值对应的结果以及总误差
对于每个连续属性,将取值范围分为 type + 1 类,并统计落在每个 (n-1, n] 区间中的属性值,同样返回每个属性区间对应的结果以及总误差
最后从所有属性中选出误差最小的属性作为评判依据
代码:
# Training rows, one tuple per observation.
# Columns: outlook, temperature, humidity, windy, play
# The last column (play) is the outcome the other columns are scored against.
train = [
    ('sunny', 85.0, 85.0, False, 'no'),
    ('rainy', 75.0, 80.0, False, 'yes'),
    ('sunny', 75.0, 70.0, True, 'yes'),
    ('overcast', 72.0, 90.0, True, 'yes'),
    ('overcast', 81.0, 75.0, False, 'yes'),
    ('rainy', 71.0, 91.0, True, 'no'),
    ('sunny', 80.0, 90.0, True, 'no'),
    ('overcast', 83.0, 86.0, False, 'yes'),
    ('rainy', 70.0, 96.0, False, 'yes'),
    ('rainy', 68.0, 80.0, False, 'yes'),
    ('rainy', 65.0, 70.0, True, 'no'),
    ('overcast', 64.0, 65.0, True, 'yes'),
    ('sunny', 72.0, 95.0, False, 'no'),
    ('sunny', 69.0, 70.0, False, 'yes'),
]
# Score a discrete (non-continuous) attribute
def oneR_calc_discrete(dataset, index):
    """Build the OneR rule for the discrete attribute in column ``index``.

    For every distinct value of the attribute, the outcomes (last element of
    each row) are counted and the most frequent outcome becomes the
    prediction for that value.

    Args:
        dataset: sequence of rows; the last element of each row is the outcome.
        index: position of the attribute inside each row.

    Returns:
        A tuple ``(type_score, error_rate)``. ``type_score`` is a list of
        ``(attribute_value, predicted_outcome, error_rate_for_value)`` tuples
        in order of first appearance in ``dataset``; ``error_rate`` is the
        number of rows the rule gets wrong divided by the number of rows.
        An empty ``dataset`` yields ``([], 0.0)``.
    """
    if not dataset:
        # Nothing to count; avoid dividing by zero below.
        return [], 0.0

    # First-appearance order (instead of set order) keeps the result and the
    # tie-breaking between equally frequent outcomes reproducible across runs.
    labels = list(dict.fromkeys(row[-1] for row in dataset))

    counts = {}
    for row in dataset:
        per_value = counts.setdefault(row[index], {label: 0 for label in labels})
        per_value[row[-1]] += 1

    type_score = []
    errors = 0
    total = 0
    for value, label_counts in counts.items():
        label, err_rate, err_num, seen = find_max(label_counts)
        type_score.append((value, label, err_rate))
        errors += err_num
        total += seen
    return type_score, errors / total
# Score a continuous (numeric) attribute
def oneR_calc_series(dataset, index, type=5):
    """Build the OneR rule for the continuous attribute in column ``index``.

    The observed range ``[min, max]`` is cut with ``type`` thresholds
    ``min + i * step`` for ``i`` in ``range(type)``, where
    ``step = (max - min) / type``. Each row is counted in the bucket of the
    smallest threshold that is greater than or equal to its value; rows above
    every threshold are counted in the extra ``'inf'`` bucket. This gives up
    to ``type + 1`` buckets (fewer when thresholds coincide).

    Args:
        dataset: sequence of rows; the last element of each row is the outcome.
        index: position of the attribute inside each row.
        type: number of thresholds; expected to be a positive integer
            (0 raises ZeroDivisionError when the step is computed).

    Returns:
        A tuple ``(type_score, error_rate)``. ``type_score`` is a list of
        ``(threshold_or_'inf', predicted_outcome, error_rate_for_bucket)``
        tuples, thresholds first in creation order and ``'inf'`` last. Buckets
        that received no rows are still listed, with whatever ``find_max``
        reports for an all-zero count table. ``error_rate`` is the number of
        misclassified rows divided by the number of rows. An empty
        ``dataset`` yields ``([], 0.0)``.
    """
    if not dataset:
        # min()/max() and the final division are undefined without rows.
        return [], 0.0

    values = [row[index] for row in dataset]
    low = min(values)
    step = (max(values) - low) / type
    # First-appearance order keeps tie-breaking reproducible across runs.
    labels = list(dict.fromkeys(row[-1] for row in dataset))

    buckets = {}
    for i in range(type):
        # setdefault collapses duplicate thresholds (e.g. when step == 0).
        buckets.setdefault(low + i * step, {label: 0 for label in labels})
    thresholds = sorted(buckets)
    buckets.setdefault('inf', {label: 0 for label in labels})

    for row in dataset:
        for bound in thresholds:
            if row[index] <= bound:
                buckets[bound][row[-1]] += 1
                break
        else:
            # Larger than every threshold.
            buckets['inf'][row[-1]] += 1

    type_score = []
    errors = 0
    total = 0
    for bound, label_counts in buckets.items():
        label, err_rate, err_num, seen = find_max(label_counts)
        type_score.append((bound, label, err_rate))
        errors += err_num
        total += seen
    return type_score, errors / total
def find_max(dict):
    """Find the most frequent key in a ``{key: count}`` table.

    Args:
        dict: mapping from outcome to its count. (The parameter name shadows
            the builtin but is kept so keyword callers keep working.)

    Returns:
        A tuple ``(key, error_rate, error_count, total)`` where ``key`` is the
        first key holding the largest count, ``total`` is the sum of all
        counts, ``error_count`` is ``total`` minus the largest count and
        ``error_rate`` is ``error_count`` expressed as a fraction of
        ``total``. When the counts sum to zero the result is
        ``(key, 0, 0, 0)``, with ``key`` being ``None`` for an empty table.
    """
    best_key = None
    best_count = -1  # below any real count, so the first key always wins
    total = 0
    for key, count in dict.items():
        total += count
        if count > best_count:
            best_key, best_count = key, count
    if total == 0:
        # No observations: report no errors instead of dividing by zero.
        return best_key, 0, 0, 0
    return best_key, 1 - (best_count / total), total - best_count, total
# Build the model
def build_model(train_data, type):
    """Pick the single attribute whose OneR rule has the lowest error rate.

    The first row decides how each column is treated: ``str`` and ``bool``
    values are scored with ``oneR_calc_discrete``, everything else with
    ``oneR_calc_series``. The last column is the outcome and is never scored.

    Args:
        train_data: non-empty sequence of rows; the last element of each row
            is the outcome.
        type: number of thresholds passed to ``oneR_calc_series`` for
            continuous columns.

    Returns:
        ``[attribute_index, rule]`` where ``rule`` is the
        ``(type_score, error_rate)`` tuple of the winning attribute (the first
        one on ties). ``rule`` is ``None`` when the rows have no attribute
        columns. The choice is also printed.
    """
    simple = train_data[0]
    model = []
    # Walk the columns by position; comparing values against the label (as a
    # stop condition) would end early whenever an attribute equals the label.
    for index in range(len(simple) - 1):
        if isinstance(simple[index], (str, bool)):
            model.append(oneR_calc_discrete(train_data, index))
        else:
            model.append(oneR_calc_series(train_data, index, type))

    min_index = 0
    min_key = None
    for index, candidate in enumerate(model):
        # Strict comparison keeps the earliest attribute when rates are equal.
        if min_key is None or candidate[-1] < min_key[-1]:
            min_index = index
            min_key = candidate
    print('Use type {c}: '.format(c=min_index) + str(min_key))
    return [min_index, min_key]
def check(data, model):
    """Predict the outcome of one row with a model from ``build_model``.

    Args:
        data: row of attribute values; only ``data[model[0]]`` is read.
        model: ``[attribute_index, (type_score, error_rate)]`` where
            ``type_score`` is a list of tuples whose first element is an
            attribute value, a numeric threshold or ``'inf'``, and whose
            second element is the predicted outcome.

    Returns:
        For ``str``/``bool`` values, the outcome stored for the equal
        attribute value, or ``None`` when the value is not in the model.
        For other values, the outcome of the smallest threshold that is
        greater than or equal to the value; if the value exceeds every
        threshold, the outcome of the ``'inf'`` entry (``None`` when the
        model has no such entry).
    """
    index = model[0]
    entries = model[1][0]
    value = data[index]

    if isinstance(value, (str, bool)):
        for entry in entries:
            if value == entry[0]:
                return entry[1]
        return None

    # Compare keys by equality: identity checks (``is``) only work by accident
    # for strings and floats that happen to be the very same object.
    bounded = sorted(
        (entry for entry in entries if entry[0] != 'inf'),
        key=lambda entry: entry[0],
    )
    for entry in bounded:
        if value <= entry[0]:
            return entry[1]
    for entry in entries:
        if entry[0] == 'inf':
            return entry[1]
    return None
# e = build_model(train, 7)
# print(check(('sunny', 85.0, 85.0, False),e))