This covers only information extraction from Weibo posts; the crawler itself is not included.
Chinese word segmentation is handled by jieba. Install it with:

```
pip3 install jieba
```
This was an analysis module for a Weibo prediction dataset, though it feels like it could be put to other uses as well.
```python
import re
from sqlite3 import dbapi2 as sqlite

import jieba

# Keep only letters, digits, and CJK characters
fh = re.compile(r'[^a-zA-Z0-9\u4E00-\u9FA5]')
topic_re = re.compile(r'#.*?#')            # hashtags: #topic#
emoji_re = re.compile(r'\[.*?\]')          # emoticons: [smile]
url_re = re.compile(r'http://t\.cn/[a-zA-Z0-9]{7}')  # t.cn short links
share_re = re.compile(r'\(分享自 @.*?\)')   # "(shared from @app)" suffix
at_re = re.compile(r'@.*? ')               # @mentions ended by a space
```
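To illustrate what each pattern grabs, here is a quick run against a made-up post (the sample text and its expected matches are mine):

```python
sample = u'#天气#今天很开心[哈哈] http://t.cn/abc1234 (分享自 @示例App) @朋友 '
print(topic_re.findall(sample))  # ['#天气#']
print(emoji_re.findall(sample))  # ['[哈哈]']
print(url_re.findall(sample))    # ['http://t.cn/abc1234']
print(share_re.findall(sample))  # ['(分享自 @示例App)']
# Strip the share text first, or the @ inside it would also match
print(at_re.findall(share_re.sub('', sample)))  # ['@朋友 ']
```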
```python
def get_word(doc):
    # Segment with jieba, strip punctuation, and keep only words
    # longer than one character
    words = []
    for word in jieba.cut(doc):
        cleaned = fh.sub('', word)
        if len(cleaned) > 1:
            words.append(cleaned)
    return words

# Find all matches of a compiled pattern (default: hashtags)
def find_re(doc, pattern=topic_re):
    return pattern.findall(doc)

# Delete every matched fragment from the text
def replace_re(doc, items):
    for item in items:
        doc = doc.replace(item, '')
    return doc
```
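And the helpers in action (jieba's segmentation can differ across versions and dictionaries, so the last output is indicative):

```python
text = u'今天天气不错[太阳] 我们出去转转'
emojis = find_re(text, pattern=emoji_re)  # ['[太阳]']
text = replace_re(text, emojis)           # '今天天气不错 我们出去转转'
print(get_word(text))  # e.g. ['今天', '天气', '不错', '我们', '出去', '转转']
```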
```python
class db:
    def __init__(self, filename):
        self.con = sqlite.connect(filename)

    def __del__(self):
        self.con.close()

    def get_count(self):
        return int(self.con.execute('SELECT COUNT(*) FROM weibo_train').fetchone()[0])

    # Insert a word on first sight, otherwise bump its counters
    def incword(self, word, f, c, l, mode='word'):
        table = mode + '_count'  # word_count, topic_count, emoji_count, ...
        res = self.con.execute(
            'SELECT * FROM {table} WHERE wordstr=?'.format(table=table),
            (word,)).fetchone()
        if res is None:
            self.insert_word(word, f, c, l, table)
        else:
            self.update_word(word, f, c, l, table)

    def update_word(self, word, f, c, l, table):
        # Placeholders keep quotes inside word from breaking the SQL;
        # table names cannot be parameterized, so the name is interpolated
        self.con.execute(
            'UPDATE {table} SET wordcount = wordcount + 1, '
            'wordfoward = wordfoward + ?, wordcomment = wordcomment + ?, '
            'wordlike = wordlike + ? WHERE wordstr=?'.format(table=table),
            (f, c, l, word))
        self.con.commit()

    def insert_word(self, word, f, c, l, table):
        self.con.execute(
            'INSERT INTO {table}(wordstr,wordcount,wordfoward,wordcomment,wordlike) '
            'VALUES(?,1,?,?,?)'.format(table=table),
            (word, f, c, l))
        self.con.commit()

    def get_message(self, st, num):
        res = self.con.execute(
            'SELECT content,forward_count,comment_count,like_count '
            'FROM weibo_train LIMIT ?,?', (st, num))
        return [(line[0], int(line[1]), int(line[2]), int(line[3]))
                for line in res.fetchall()]
```
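The schema itself never appears in this module; reading it off the queries above, something like the following would match (the wordfoward spelling is kept exactly as the code expects; this is my reconstruction, not the original DDL):

```python
def init_tables(con):
    # One counter table per category passed to incword()
    for mode in ('word', 'topic', 'emoji', 'url', 'share', 'at'):
        con.execute(
            'CREATE TABLE IF NOT EXISTS {table}('
            'wordstr TEXT PRIMARY KEY, wordcount INTEGER, wordfoward INTEGER, '
            'wordcomment INTEGER, wordlike INTEGER)'.format(table=mode + '_count'))
    # The training posts read back by get_message()/get_count()
    con.execute(
        'CREATE TABLE IF NOT EXISTS weibo_train('
        'content TEXT, forward_count INTEGER, '
        'comment_count INTEGER, like_count INTEGER)')
    con.commit()
```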
The code above defines the basic regular expressions and database operations.
The parameters f, c, l are reserved for a post's forward, comment, and like counts.
For each Weibo post, it is used like this:
```python
content = message[0]
# Collect the hashtags
topic_list = find_re(content)
for topic in topic_list:
    data.incword(topic, message[1], message[2], message[3], 'topic')
# Collect the emoticons, then strip them from the text
emoji_list = find_re(content, pattern=emoji_re)
content = replace_re(content, emoji_list)
for emoji in emoji_list:
    data.incword(emoji, message[1], message[2], message[3], 'emoji')
# Collect the URLs, then strip them
url_list = find_re(content, pattern=url_re)
content = replace_re(content, url_list)
for url in url_list:
    data.incword(url, message[1], message[2], message[3], 'url')
# Collect the "shared from" fragments, then strip them
share_list = find_re(content, pattern=share_re)
content = replace_re(content, share_list)
for share in share_list:
    data.incword(share, message[1], message[2], message[3], 'share')
# Collect the @mentions
at_list = find_re(content, pattern=at_re)
for at in at_list:
    data.incword(at, message[1], message[2], message[3], 'at')
# Segment whatever is left into words
word_list = get_word(content)
for word in word_list:
    data.incword(word, message[1], message[2], message[3])
```
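Wrapping those steps in a function and driving them over the whole training table could look like this (a sketch: the database filename, batch size, and the process_message wrapper are placeholders of mine):

```python
data = db('weibo.db')  # filename is a placeholder
total = data.get_count()
batch = 200            # arbitrary batch size
for st in range(0, total, batch):
    for message in data.get_message(st, batch):
        # process_message() is assumed to contain the per-post steps above
        process_message(data, message)
```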
This splits the post into its separate kinds of information and stores each in the database.
With a crawler that targets a specific user's posts, the data gathered this way should let you analyze that user's:

- interests (keywords; see the query sketch after this list)
- commonly used apps (share sources)
- closeness to friends (frequency of @mentions, etc.)
- posting habits (emoticons and emotion-laden keywords, heh)
- and so on
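For the first item, for example, a query along these lines would surface the hashtags a user touches most (a sketch against the reconstructed schema above):

```python
rows = data.con.execute(
    'SELECT wordstr, wordcount FROM topic_count '
    'ORDER BY wordcount DESC LIMIT 10')
for wordstr, wordcount in rows:
    print(wordstr, wordcount)
```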
Noting this down in case it turns out useful, heh.