最近搞事情的有点多….都不能好好学习了
省略了原文中的爬虫部分,直接使用了对应的数据集
所以把PR算法的生成函数写在了searcher里
数据集在此:searchindex-db。原文的地址早就废了,幸好有我大CSDN
代码..先记着
from sqlite3 import dbapi2 as sqlite
# dbname = 'searchengine.py'
class searcher:
    """Ranked full-text search over a crawler-built SQLite index.

    Expected schema (built by the crawler the original article omits):
      wordlist(word)                 -- rowid is the word id
      urllist(url)                   -- rowid is the url id
      wordlocation(urlid, wordid, location)
      link(fromid, toid)
      linkwords(wordid, linkid)      -- linkid references link.rowid
      pagerank(urlid, score)         -- built by calculatepagerank()
    """

    # Connection placeholder; the real connection is opened in __init__.
    con = None

    def __init__(self, dbname):
        """Open the SQLite index database at *dbname*."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    def getmatchrows(self, q):
        """Find pages containing every known word of query string *q*.

        Returns (rows, wordids): each row is
        (urlid, loc_word0, loc_word1, ...) and wordids lists the row ids
        of the recognised query words.  Unknown words are skipped; if no
        word is recognised the result is ([], []).
        """
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []
        clauseargs = []
        tablenumber = 0
        # Split the query on single spaces, as the original does.
        for word in q.split(' '):
            # Parameterized lookup -- the original concatenated the raw
            # word into the SQL text (injection + quoting crashes).
            wordrow = self.con.execute(
                'select rowid from wordlist where word = ?', (word,)).fetchone()
            if wordrow is not None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (
                        tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                # Join clause only contains table aliases; the word id
                # itself is bound as a parameter.
                clauselist += 'w%d.wordid=?' % tablenumber
                clauseargs.append(wordid)
                tablenumber += 1
        # Guard: with no recognised word the original built the
        # malformed query 'select ... from  where ' and crashed.
        if tablenumber == 0:
            return [], []
        fullquery = 'select %s from %s where %s' % (
            fieldlist, tablelist, clauselist)
        cur = self.con.execute(fullquery, clauseargs)
        rows = [row for row in cur]
        return rows, wordids

    def getscoredlist(self, rows, wordids):
        """Combine the scoring metrics into one urlid -> score dict.

        The original assigned `weights` four times in a row while trying
        out metrics; only the last assignment ever took effect, so the
        dead variants are dropped.  Requires the pagerank table
        (see calculatepagerank()).
        """
        totalscores = dict([(row[0], 0) for row in rows])
        weights = [(1.0, self.frequencyscore(rows)),
                   (1.0, self.locationscore(rows)),
                   (1.0, self.pagerankscore(rows))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def geturlname(self, id):
        """Return the URL text stored for url id *id*."""
        return self.con.execute(
            'select url from urllist where rowid = ?', (id,)).fetchone()[0]

    def query(self, q):
        """Run query *q* and print the ten best "score url" lines."""
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedscores[0:10]:
            print(str(score) + " " + str(self.geturlname(urlid)))

    def normalizescores(self, scores, smallisBetter=0):
        """Scale a urlid -> value dict to 0..1, with 1 always meaning best."""
        vsmall = 0.000001  # avoid division by zero
        if not scores:
            # Guard: min()/max() on an empty dict raises ValueError.
            return {}
        if smallisBetter:
            minscore = min(scores.values())
            return dict([(u, float(minscore) / max(vsmall, l))
                         for (u, l) in scores.items()])
        else:
            maxscore = max(scores.values())
            if maxscore == 0:
                maxscore = vsmall
            return dict([(u, float(c) / maxscore)
                         for (u, c) in scores.items()])

    # Word-frequency metric
    def frequencyscore(self, rows):
        """Score pages by how often the query words occur on them."""
        counts = dict([(row[0], 0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizescores(counts)

    # Document-location metric
    def locationscore(self, rows):
        """Score pages by how early the query words appear (lower = better)."""
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.normalizescores(locations, smallisBetter=1)

    # Word-distance metric
    def distancescore(self, rows):
        """Score pages by how close together the query words appear."""
        # Guard: the original crashed on rows[0] when rows was empty.
        # With a single query word distance is meaningless, so every
        # page scores 1.0.
        if not rows or len(rows[0]) <= 2:
            return dict([(row[0], 1.0) for row in rows])
        mindistance = dict([(row[0], 10000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        return self.normalizescores(mindistance, smallisBetter=1)

    # Inbound-link count metric
    def inboundlinkscore(self, rows):
        """Score pages by the raw number of links pointing at them."""
        uniqueurls = set([row[0] for row in rows])
        inboundcount = dict(
            [(u, self.con.execute(
                'select count(*) from link where toid = ?',
                (u,)).fetchone()[0])
             for u in uniqueurls])
        return self.normalizescores(inboundcount)

    # PageRank:
    #   PR(A) = 0.15 + 0.85 * sum(PR(B) / links(B))
    # over every page B linking to A; 0.85 is the damping factor (the
    # probability the user keeps clicking links).  Every page starts at
    # 1.0 and the values converge over repeated iterations.
    def calculatepagerank(self, iterations=20):
        """(Re)build the pagerank table with *iterations* full sweeps."""
        self.con.execute('DROP TABLE IF EXISTS pagerank')
        self.con.execute('CREATE TABLE pagerank(urlid PRIMARY KEY ,score)')
        # Every page starts with a rank of 1.0.
        self.con.execute('INSERT INTO pagerank SELECT rowid, 1.0 FROM urllist')
        self.dbcommit()
        for i in range(iterations):
            print('Iteration ' + str(i))
            for (urlid,) in self.con.execute('SELECT rowid FROM urllist'):
                pr = 0.15
                # Loop over every page linking to this one.
                for (linker,) in self.con.execute(
                        'select distinct fromid from link where toid = ?',
                        (urlid,)):
                    linkingpr = self.con.execute(
                        'select score from pagerank where urlid = ?',
                        (linker,)).fetchone()[0]
                    linkingcount = self.con.execute(
                        'select count(*) from link where fromid = ?',
                        (linker,)).fetchone()[0]
                    pr += 0.85 * (linkingpr / linkingcount)
                self.con.execute(
                    'update pagerank set score = ? where urlid = ?',
                    (pr, urlid))
            self.dbcommit()

    # Normalized PageRank lookup
    def pagerankscore(self, rows):
        """Fetch the precomputed PageRank of each result page, scaled 0..1."""
        pageranks = dict(
            [(row[0], self.con.execute(
                'select score from pagerank where urlid = ?',
                (row[0],)).fetchone()[0])
             for row in rows])
        # Reuse the shared normalizer: the original duplicated it inline
        # (in a misspelled variable) without the division-by-zero guard.
        return self.normalizescores(pageranks)

    # Link-text metric
    def linktextscore(self, rows, wordids):
        """Score pages by the PageRank of pages linking to them with query words."""
        linkscores = dict([(row[0], 0) for row in rows])
        for wordid in wordids:
            # BUG FIX: the original selected link.formid, a column that
            # does not exist -- the link table's column is fromid.
            cur = self.con.execute(
                'select link.fromid, link.toid from linkwords, link'
                ' where wordid = ? and linkwords.linkid = link.rowid',
                (wordid,))
            for (fromid, toid) in cur:
                # Guard: links may point at pages outside the result set;
                # the original raised KeyError on them.
                if toid in linkscores:
                    pr = self.con.execute(
                        'select score from pagerank where urlid = ?',
                        (fromid,)).fetchone()[0]
                    linkscores[toid] += pr
        # normalizescores also guards the all-zero case the original
        # turned into a ZeroDivisionError.
        return self.normalizescores(linkscores)