Modified some of the code to fit the dataset from the previous chapter.
K-means clustering doesn't perform well here; my guess is there isn't enough data and the dimensionality is too low.
PIL hasn't been updated for Python 3 in ages and isn't on pip, so none of the drawing parts are implemented here (a rough Pillow-based sketch is appended at the end).
from math import sqrt
import random
critics = {
'用户A': {
'集体智慧编程': 2.5,
'机器学习': 3.5,
'数据之美': 3.0,
'数据挖掘导论': 3.5,
'深入浅出数据挖掘': 2.5,
'数据挖掘实战': 3.0
},
'用户B': {
'集体智慧编程': 3.0,
'机器学习': 3.5,
'数据之美': 1.5,
'数据挖掘导论': 5.0,
'深入浅出数据挖掘': 3.5,
'数据挖掘实战': 3.0
},
'用户C': {
'集体智慧编程': 2.5,
'机器学习': 3.0,
'数据之美': 0.0,
'数据挖掘导论': 3.5,
'深入浅出数据挖掘': 4.0,
'数据挖掘实战': 0.0
},
'用户D': {
'集体智慧编程': 3.5,
'机器学习': 0.0,
'数据之美': 3.0,
'数据挖掘导论': 4.0,
'深入浅出数据挖掘': 4.0,
'数据挖掘实战': 2.5
},
'用户E': {
'集体智慧编程': 3.0,
'机器学习': 4.0,
'数据之美': 2.0,
'数据挖掘导论': 3.0,
'深入浅出数据挖掘': 3.0,
'数据挖掘实战': 2.5
},
'用户F': {
'集体智慧编程': 3.0,
'机器学习': 4.0,
'数据之美': 0.0,
'数据挖掘导论': 5.0,
'深入浅出数据挖掘': 3.0,
'数据挖掘实战': 2.0
},
'用户G': {
'集体智慧编程': 0.0,
'机器学习': 0.0,
'数据之美': 0.0,
'数据挖掘导论': 1.0,
'深入浅出数据挖掘': 1.0,
'数据挖掘实战': 0.0
},
}
# Pearson correlation-based score, returned as a distance (1 - r): smaller means more similar
def sim_pearson(person1, person2):
    # Items that both people have rated
    si = {}
for item in person1:
if item in person2:
si[item] = 1
n = len(si)
    # No items in common: treat the pair as maximally dissimilar (this is a distance, not a similarity)
    if n == 0:
        return 1.0
    # Sum of the ratings over the shared items
    sum1 = sum(person1[it] for it in si)
    sum2 = sum(person2[it] for it in si)
    # Sum of the squared ratings
    sum1Sq = sum(pow(person1[it], 2) for it in si)
    sum2Sq = sum(pow(person2[it], 2) for it in si)
    # Sum of the products
    pSum = sum(person1[it] * person2[it] for it in si)
    # Pearson score: covariance over the product of the standard deviations
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    # A zero denominator means one side's ratings are constant, so r is undefined;
    # treat that as dissimilar rather than as distance 0
    if den == 0:
        return 1.0
r = num / den
return 1.0 - r
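A quick sanity check I added myself (not from the book): the returned value is 1 - r, so 0 means identical tastes and values near 2 mean opposite tastes.
# Distance between 用户A and 用户B over their shared ratings
# print(sim_pearson(critics['用户A'], critics['用户B']))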
class bicluster:
def __init__(self, vec, left=None, right=None, distance=0.0, id=None, name=None):
self.left = left
self.right = right
self.distance = distance
self.vec = vec
self.name = name
self.id = id
def hcluster(prefs, distance=sim_pearson):
distances = {}
currentclustid = -1
    # One cluster per row to start with; the book indexes a list here, but prefs is a dict keyed by name
    clust = [bicluster(prefs[item], id=int(i), name=item) for i, item in enumerate(prefs)]
while len(clust) > 1:
lowestpair = (0, 1)
closest = distance(clust[0].vec, clust[1].vec)
        # Loop through every pair looking for the smallest distance
for i in range(len(clust)):
for j in range(i + 1, len(clust)):
if (clust[i].id, clust[j].id) not in distances:
distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
d = distances[(clust[i].id, clust[j].id)]
if d < closest:
                    closest = d
lowestpair = (i, j)
        # Average the two clusters' vectors to get the merged cluster's vector
        mergevec = {}
for item in clust[lowestpair[0]].vec:
if item in clust[lowestpair[1]].vec:
mergevec.setdefault(item, 0)
mergevec[item] = (clust[lowestpair[0]].vec[item] + clust[lowestpair[1]].vec[item]) / 2.0
        # Cluster ids that weren't in the original set are negative
        newcluster = bicluster(mergevec, left=clust[lowestpair[0]], right=clust[lowestpair[1]],
                               distance=closest, id=currentclustid)
        currentclustid -= 1
        # Drop the merged pair (higher index first so the lower one stays valid) and add the new cluster
        del clust[lowestpair[1]]
del clust[lowestpair[0]]
clust.append(newcluster)
return clust[0]
def kcluster(rows, distance=sim_pearson, k=3):
    items = list(rows[list(rows.keys())[0]].keys())
    # Min and max rating of each item (dimension) across all users
    ranges = {item: (min(rows[name][item] for name in rows),
                     max(rows[name][item] for name in rows)) for item in items}
    # Start with k randomly placed centroids inside those ranges
    clusters = [{item: random.random() * (ranges[item][1] - ranges[item][0]) + ranges[item][0]
                 for item in items} for j in range(k)]
lastmatches = None
    # Run at most 50 rounds of assign-then-update
    for t in range(50):
print('Iteration ' + str(t))
bestmatches = [[] for i in range(k)]
for j in rows:
row = rows[j]
            bestmatch = 0
            # Find which centroid is closest to this row
            for i in range(k):
                d = distance(clusters[i], row)
                if d < distance(clusters[bestmatch], row):
                    bestmatch = i
bestmatches[bestmatch].append(j)
        # Stop once the assignments stop changing
        if bestmatches == lastmatches:
break
lastmatches = bestmatches
        # Move each centroid to the mean of the rows assigned to it;
        # a centroid with no assigned rows keeps its previous position
        for i in range(k):
            if len(bestmatches[i]) == 0:
                continue
            avgs = {itemName: 0.0 for itemName in rows[list(rows.keys())[0]]}
            for rowid in bestmatches[i]:
                for m in rows[rowid]:
                    avgs[m] += rows[rowid][m]
            for j in avgs:
                avgs[j] /= len(bestmatches[i])
            clusters[i] = avgs
return bestmatches
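K-means starts from random centroids, so every run can give different groups; to get repeatable runs while checking whether the clustering is really wrong, one option (my own note, not from the book) is to seed the RNG first:
# random.seed(42)
# groups = kcluster(critics, k=2)  # k=2 just as an example, since there are only 7 users
# print(groups)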
# Print the cluster tree as indented text
def printclust(clust, labels=None, n=0):
for i in range(n):
print(' ', end='')
    # A negative id means a merged cluster: print a branch marker
    if clust.id < 0:
        print('-')
else:
if labels is None:
print(clust.name)
else:
print(labels[clust.id])
if clust.left is not None:
printclust(clust.left, labels, n=n + 1)
if clust.right is not None:
printclust(clust.right, labels, n=n + 1)
# printclust(hcluster(critics))
# print(kcluster(critics))
I still have a nagging feeling the K-means part is written wrong... hopefully that's just my imagination.
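About the drawing part mentioned at the top: the maintained fork Pillow does install from pip (pip install pillow) and keeps the old PIL import names, so the dendrogram image could be sketched along the lines of the book's drawdendrogram, roughly as below. This is an untested sketch under that assumption, not part of the code above.
from PIL import Image, ImageDraw

def getheight(clust):
    # A leaf takes up one row; a branch takes the sum of its children's rows
    if clust.left is None and clust.right is None:
        return 1
    return getheight(clust.left) + getheight(clust.right)

def getdepth(clust):
    # Total merge distance along the deepest path below this node
    if clust.left is None and clust.right is None:
        return 0
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance

def drawnode(draw, clust, x, y, scaling, labels=None):
    if clust.id < 0:
        h1 = getheight(clust.left) * 20
        h2 = getheight(clust.right) * 20
        top = y - (h1 + h2) / 2
        bottom = y + (h1 + h2) / 2
        ll = clust.distance * scaling
        # Vertical line joining the two children
        draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0))
        # Horizontal line out to each child
        draw.line((x, top + h1 / 2, x + ll, top + h1 / 2), fill=(255, 0, 0))
        draw.line((x, bottom - h2 / 2, x + ll, bottom - h2 / 2), fill=(255, 0, 0))
        drawnode(draw, clust.left, x + ll, top + h1 / 2, scaling, labels)
        drawnode(draw, clust.right, x + ll, bottom - h2 / 2, scaling, labels)
    else:
        text = clust.name if labels is None else labels[clust.id]
        # Note: Pillow's default bitmap font can't render Chinese labels;
        # pass an ImageFont.truetype(...) font to draw.text if you need them
        draw.text((x + 5, y - 7), text, (0, 0, 0))

def drawdendrogram(clust, jpeg='clusters.jpg', labels=None):
    h = getheight(clust) * 20
    w = 1200
    depth = getdepth(clust)
    scaling = float(w - 150) / depth if depth > 0 else 1.0
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, h / 2, 10, h / 2), fill=(255, 0, 0))
    drawnode(draw, clust, 10, h / 2, scaling, labels)
    img.save(jpeg, 'JPEG')

# drawdendrogram(hcluster(critics))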