最近搞事情的有点多….都不能好好学习了
省略了原文中的爬虫部分,直接使用了对应的数据集
所以把PR算法的生成函数写在了searcher里
数据集在此:searchindex-db。原文的地址早就废了,幸好有我大CSDN
代码..先记着
from sqlite3 import dbapi2 as sqlite
# dbname = 'searchengine.py'
class searcher:
    """Ranked full-text search over a crawler-built SQLite index.

    Expected schema (built by the crawler the original article omits):
      wordlist(word)                 -- rowid is the word id
      urllist(url)                   -- rowid is the url id
      wordlocation(urlid, wordid, location)
      link(fromid, toid)
      linkwords(wordid, linkid)      -- linkid references link.rowid
      pagerank(urlid, score)         -- built by calculatepagerank()
    """

    # Connection placeholder; the real connection is opened in __init__.
    con = None

    def __init__(self, dbname):
        """Open the SQLite index database at *dbname*."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    def getmatchrows(self, q):
        """Find pages containing every known word of query string *q*.

        Returns (rows, wordids): each row is
        (urlid, loc_word0, loc_word1, ...) and wordids lists the row ids
        of the recognised query words.  Unknown words are skipped; if no
        word is recognised the result is ([], []).
        """
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []
        clauseargs = []
        tablenumber = 0
        # Split the query on single spaces, as the original does.
        for word in q.split(' '):
            # Parameterized lookup -- the original concatenated the raw
            # word into the SQL text (injection + quoting crashes).
            wordrow = self.con.execute(
                'select rowid from wordlist where word = ?', (word,)).fetchone()
            if wordrow is not None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (
                        tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                # Join clause only contains table aliases; the word id
                # itself is bound as a parameter.
                clauselist += 'w%d.wordid=?' % tablenumber
                clauseargs.append(wordid)
                tablenumber += 1
        # Guard: with no recognised word the original built the
        # malformed query 'select ... from  where ' and crashed.
        if tablenumber == 0:
            return [], []
        fullquery = 'select %s from %s where %s' % (
            fieldlist, tablelist, clauselist)
        cur = self.con.execute(fullquery, clauseargs)
        rows = [row for row in cur]
        return rows, wordids

    def getscoredlist(self, rows, wordids):
        """Combine the scoring metrics into one urlid -> score dict.

        The original assigned `weights` four times in a row while trying
        out metrics; only the last assignment ever took effect, so the
        dead variants are dropped.  Requires the pagerank table
        (see calculatepagerank()).
        """
        totalscores = dict([(row[0], 0) for row in rows])
        weights = [(1.0, self.frequencyscore(rows)),
                   (1.0, self.locationscore(rows)),
                   (1.0, self.pagerankscore(rows))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def geturlname(self, id):
        """Return the URL text stored for url id *id*."""
        return self.con.execute(
            'select url from urllist where rowid = ?', (id,)).fetchone()[0]

    def query(self, q):
        """Run query *q* and print the ten best "score url" lines."""
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedscores[0:10]:
            print(str(score) + " " + str(self.geturlname(urlid)))

    def normalizescores(self, scores, smallisBetter=0):
        """Scale a urlid -> value dict to 0..1, with 1 always meaning best."""
        vsmall = 0.000001  # avoid division by zero
        if not scores:
            # Guard: min()/max() on an empty dict raises ValueError.
            return {}
        if smallisBetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l))
                         for (u, l) in scores.items()])
        else:
            maxscore = max(scores.values())
            if maxscore == 0:
                maxscore = vsmall
            return dict([(u, float(c) / maxscore)
                         for (u, c) in scores.items()])

    # Word-frequency metric
    def frequencyscore(self, rows):
        """Score pages by how often the query words occur on them."""
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizescores(counts)

    # Document-location metric
    def locationscore(self, rows):
        """Score pages by how early the query words appear (lower = better)."""
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalizescores(locations, smallisBetter=1)

    # Word-distance metric
    def distancescore(self, rows):
        """Score pages by how close together the query words appear."""
        # Guard: the original crashed on rows[0] when rows was empty.
        # With a single query word distance is meaningless, so every
        # page scores 1.0.
        if not rows or len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])
        mindistance = dict([(row[0], 10000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normalizescores(mindistance, smallisBetter=1)

    # Inbound-link count metric
    def inboundlinkscore(self, rows):
        """Score pages by the raw number of links pointing at them."""
        uniqueurls = set([row[0] for row in rows])
        inboundcount = dict(
            [(u, self.con.execute(
                'select count(*) from link where toid = ?',
                (u,)).fetchone()[0])
             for u in uniqueurls])
        return self.normalizescores(inboundcount)

    # PageRank:
    #   PR(A) = 0.15 + 0.85 * sum(PR(B) / links(B))
    # over every page B linking to A; 0.85 is the damping factor (the
    # probability the user keeps clicking links).  Every page starts at
    # 1.0 and the values converge over repeated iterations.
    def calculatepagerank(self, iterations=20):
        """(Re)build the pagerank table with *iterations* full sweeps."""
        self.con.execute('DROP TABLE IF EXISTS pagerank')
        self.con.execute('CREATE TABLE pagerank(urlid PRIMARY KEY ,score)')
        # Every page starts with a rank of 1.0.
        self.con.execute('INSERT INTO pagerank SELECT rowid, 1.0 FROM urllist')
        self.dbcommit()
        for i in range(iterations):
            print('Iteration ' + str(i))
            for (urlid,) in self.con.execute('SELECT rowid FROM urllist'):
                pr = 0.15
                # Loop over every page linking to this one.
                for (linker,) in self.con.execute(
                        'select distinct fromid from link where toid = ?',
                        (urlid,)):
                    linkingpr = self.con.execute(
                        'select score from pagerank where urlid = ?',
                        (linker,)).fetchone()[0]
                    linkingcount = self.con.execute(
                        'select count(*) from link where fromid = ?',
                        (linker,)).fetchone()[0]
                    pr += 0.85 * (linkingpr / linkingcount)
                self.con.execute(
                    'update pagerank set score = ? where urlid = ?',
                    (pr, urlid))
            self.dbcommit()

    # Normalized PageRank lookup
    def pagerankscore(self, rows):
        """Fetch the precomputed PageRank of each result page, scaled 0..1."""
        pageranks = dict(
            [(row[0], self.con.execute(
                'select score from pagerank where urlid = ?',
                (row[0],)).fetchone()[0])
             for row in rows])
        # Reuse the shared normalizer: the original duplicated it inline
        # (in a misspelled variable) without the division-by-zero guard.
        return self.normalizescores(pageranks)

    # Link-text metric
    def linktextscore(self, rows, wordids):
        """Score pages by the PageRank of pages linking to them with query words."""
        linkscores = dict([(row[0], 0) for row in rows])
        for wordid in wordids:
            # BUG FIX: the original selected link.formid, a column that
            # does not exist -- the link table's column is fromid.
            cur = self.con.execute(
                'select link.fromid, link.toid from linkwords, link'
                ' where wordid = ? and linkwords.linkid = link.rowid',
                (wordid,))
            for (fromid, toid) in cur:
                # Guard: links may point at pages outside the result set;
                # the original raised KeyError on them.
                if toid in linkscores:
                    pr = self.con.execute(
                        'select score from pagerank where urlid = ?',
                        (fromid,)).fetchone()[0]
                    linkscores[toid] += pr
        # normalizescores also guards the all-zero case the original
        # turned into a ZeroDivisionError.
        return self.normalizescores(linkscores)