Modified some of the code to fit the dataset from the previous chapter.
K-means clustering doesn't perform well here; my guess is there isn't enough data and the dimensionality is too low.
PIL hasn't been updated for Python 3 in ages and isn't on pip, so none of the drawing parts are implemented here (a rough Pillow-based sketch is appended at the end).
from math import sqrt
import random
critics = {
'用户A': {
'集体智慧编程': 2.5,
'机器学习': 3.5,
'数据之美': 3.0,
'数据挖掘导论': 3.5,
'深入浅出数据挖掘': 2.5,
'数据挖掘实战': 3.0
},
'用户B': {
'集体智慧编程': 3.0,
'机器学习': 3.5,
'数据之美': 1.5,
'数据挖掘导论': 5.0,
'深入浅出数据挖掘': 3.5,
'数据挖掘实战': 3.0
},
'用户C': {
'集体智慧编程': 2.5,
'机器学习': 3.0,
'数据之美': 0.0,
'数据挖掘导论': 3.5,
'深入浅出数据挖掘': 4.0,
'数据挖掘实战': 0.0
},
'用户D': {
'集体智慧编程': 3.5,
'机器学习': 0.0,
'数据之美': 3.0,
'数据挖掘导论': 4.0,
'深入浅出数据挖掘': 4.0,
'数据挖掘实战': 2.5
},
'用户E': {
'集体智慧编程': 3.0,
'机器学习': 4.0,
'数据之美': 2.0,
'数据挖掘导论': 3.0,
'深入浅出数据挖掘': 3.0,
'数据挖掘实战': 2.5
},
'用户F': {
'集体智慧编程': 3.0,
'机器学习': 4.0,
'数据之美': 0.0,
'数据挖掘导论': 5.0,
'深入浅出数据挖掘': 3.0,
'数据挖掘实战': 2.0
},
'用户G': {
'集体智慧编程': 0.0,
'机器学习': 0.0,
'数据之美': 0.0,
'数据挖掘导论': 1.0,
'深入浅出数据挖掘': 1.0,
'数据挖掘实战': 0.0
},
}
# Pearson correlation-based score, returned as a distance (1 - r): smaller means more similar
def sim_pearson(person1, person2):
    # Items that both people have rated
    si = {}
for item in person1:
if item in person2:
si[item] = 1
n = len(si)
    # No items in common: treat the pair as maximally dissimilar (this is a distance, not a similarity)
    if n == 0:
        return 1.0
    # Sum of the ratings over the shared items
    sum1 = sum(person1[it] for it in si)
    sum2 = sum(person2[it] for it in si)
    # Sum of the squared ratings
    sum1Sq = sum(pow(person1[it], 2) for it in si)
    sum2Sq = sum(pow(person2[it], 2) for it in si)
    # Sum of the products
    pSum = sum(person1[it] * person2[it] for it in si)
    # Pearson score: covariance over the product of the standard deviations
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    # A zero denominator means one side's ratings are constant, so r is undefined;
    # treat that as dissimilar rather than as distance 0
    if den == 0:
        return 1.0
r = num / den
return 1.0 - r
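A quick sanity check I added myself (not from the book): the returned value is 1 - r, so 0 means identical tastes and values near 2 mean opposite tastes.
# Distance between 用户A and 用户B over their shared ratings
# print(sim_pearson(critics['用户A'], critics['用户B']))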
class bicluster:
def __init__(self, vec, left=None, right=None, distance=0.0, id=None, name=None):
self.left = left
self.right = right
self.distance = distance
self.vec = vec
self.name = name
self.id = id
def hcluster(prefs, distance=sim_pearson):
distances = {}
currentclustid = -1
    # One cluster per row to start with; the book indexes a list here, but prefs is a dict keyed by name
    clust = [bicluster(prefs[item], id=int(i), name=item) for i, item in enumerate(prefs)]
while len(clust) > 1:
lowestpair = (0, 1)
closest = distance(clust[0].vec, clust[1].vec)
        # Loop through every pair looking for the smallest distance
for i in range(len(clust)):
for j in range(i + 1, len(clust)):
if (clust[i].id, clust[j].id) not in distances:
distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
d = distances[(clust[i].id, clust[j].id)]
if d < closest:
                    closest = d
lowestpair = (i, j)
        # Average the two clusters' vectors to get the merged cluster's vector
        mergevec = {}
for item in clust[lowestpair[0]].vec:
if item in clust[lowestpair[1]].vec:
mergevec.setdefault(item, 0)
mergevec[item] = (clust[lowestpair[0]].vec[item] + clust[lowestpair[1]].vec[item]) / 2.0
        # Cluster ids that weren't in the original set are negative
        newcluster = bicluster(mergevec, left=clust[lowestpair[0]], right=clust[lowestpair[1]],
                               distance=closest, id=currentclustid)
        currentclustid -= 1
        # Drop the merged pair (higher index first so the lower one stays valid) and add the new cluster
        del clust[lowestpair[1]]
del clust[lowestpair[0]]
clust.append(newcluster)
return clust[0]
def kcluster(rows, distance=sim_pearson, k=3):
    items = list(rows[list(rows.keys())[0]].keys())
    # Min and max rating of each item (dimension) across all users
    ranges = {item: (min(rows[name][item] for name in rows),
                     max(rows[name][item] for name in rows)) for item in items}
    # Start with k randomly placed centroids inside those ranges
    clusters = [{item: random.random() * (ranges[item][1] - ranges[item][0]) + ranges[item][0]
                 for item in items} for j in range(k)]
lastmatches = None
    # Run at most 50 rounds of assign-then-update
    for t in range(50):
print('Iteration ' + str(t))
bestmatches = [[] for i in range(k)]
for j in rows:
row = rows[j]
            bestmatch = 0
            # Find which centroid is closest to this row
            for i in range(k):
                d = distance(clusters[i], row)
                if d < distance(clusters[bestmatch], row):
                    bestmatch = i
bestmatches[bestmatch].append(j)
        # Stop once the assignments stop changing
        if bestmatches == lastmatches:
break
lastmatches = bestmatches
        # Move each centroid to the mean of the rows assigned to it;
        # a centroid with no assigned rows keeps its previous position
        for i in range(k):
            if len(bestmatches[i]) == 0:
                continue
            avgs = {itemName: 0.0 for itemName in rows[list(rows.keys())[0]]}
            for rowid in bestmatches[i]:
                for m in rows[rowid]:
                    avgs[m] += rows[rowid][m]
            for j in avgs:
                avgs[j] /= len(bestmatches[i])
            clusters[i] = avgs
return bestmatches
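K-means starts from random centroids, so every run can give different groups; to get repeatable runs while checking whether the clustering is really wrong, one option (my own note, not from the book) is to seed the RNG first:
# random.seed(42)
# groups = kcluster(critics, k=2)  # k=2 just as an example, since there are only 7 users
# print(groups)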
# Print the cluster tree as indented text
def printclust(clust, labels=None, n=0):
for i in range(n):
print(' ', end='')
    # A negative id means a merged cluster: print a branch marker
    if clust.id < 0:
        print('-')
else:
if labels is None:
print(clust.name)
else:
print(labels[clust.id])
if clust.left is not None:
printclust(clust.left, labels, n=n + 1)
if clust.right is not None:
printclust(clust.right, labels, n=n + 1)
# printclust(hcluster(critics))
# print(kcluster(critics))
I still have a nagging feeling the K-means part is written wrong... hopefully that's just my imagination.
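About the drawing part mentioned at the top: the maintained fork Pillow does install from pip (pip install pillow) and keeps the old PIL import names, so the dendrogram image could be sketched along the lines of the book's drawdendrogram, roughly as below. This is an untested sketch under that assumption, not part of the code above.
from PIL import Image, ImageDraw

def getheight(clust):
    # A leaf takes up one row; a branch takes the sum of its children's rows
    if clust.left is None and clust.right is None:
        return 1
    return getheight(clust.left) + getheight(clust.right)

def getdepth(clust):
    # Total merge distance along the deepest path below this node
    if clust.left is None and clust.right is None:
        return 0
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance

def drawnode(draw, clust, x, y, scaling, labels=None):
    if clust.id < 0:
        h1 = getheight(clust.left) * 20
        h2 = getheight(clust.right) * 20
        top = y - (h1 + h2) / 2
        bottom = y + (h1 + h2) / 2
        ll = clust.distance * scaling
        # Vertical line joining the two children
        draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0))
        # Horizontal line out to each child
        draw.line((x, top + h1 / 2, x + ll, top + h1 / 2), fill=(255, 0, 0))
        draw.line((x, bottom - h2 / 2, x + ll, bottom - h2 / 2), fill=(255, 0, 0))
        drawnode(draw, clust.left, x + ll, top + h1 / 2, scaling, labels)
        drawnode(draw, clust.right, x + ll, bottom - h2 / 2, scaling, labels)
    else:
        text = clust.name if labels is None else labels[clust.id]
        # Note: Pillow's default bitmap font can't render Chinese labels;
        # pass an ImageFont.truetype(...) font to draw.text if you need them
        draw.text((x + 5, y - 7), text, (0, 0, 0))

def drawdendrogram(clust, jpeg='clusters.jpg', labels=None):
    h = getheight(clust) * 20
    w = 1200
    depth = getdepth(clust)
    scaling = float(w - 150) / depth if depth > 0 else 1.0
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, h / 2, 10, h / 2), fill=(255, 0, 0))
    drawnode(draw, clust, 10, h / 2, scaling, labels)
    img.save(jpeg, 'JPEG')

# drawdendrogram(hcluster(critics))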