I happened to notice that the recommendations in the right-hand column of Pixiv pages seemed to suit my taste, so I decided to write a crawler to grab them in bulk.
My first idea was to pull the page HTML and read it directly, but it turns out the recommendations aren't there when the page first loads; they get filled in afterwards by script.
Tracing back through the JS, I eventually found something that looks like this:
relatedList: function(id, count, options) {
    return pixiv.api.get('/rpc/index.php', {
        mode: 'get_recommend_illusts',
        ids: id,
        num: count || 100
    }, options);
}
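So the recommendation list comes from a plain GET to /rpc/index.php with mode, ids and num parameters. A minimal hand test might look like the sketch below; the cookie value is a placeholder, and I'm assuming a logged-in session is required since the crawler further down sends one:

import urllib.request

# Placeholder cookie: copy the Cookie header from a logged-in browser session
req = urllib.request.Request(
    'http://www.pixiv.net/rpc/index.php?mode=get_recommend_illusts&ids=42676660&num=3',
    headers={'User-Agent': 'Mozilla/5.0', 'Cookie': '<your pixiv cookie>'})
print(urllib.request.urlopen(req).read().decode('utf8'))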
Hm... so let's just use that endpoint. The JSON it returns looks like this:
{
    "error": false,
    "message": "",
    "body": {
        "total": 5,
        "illusts": [
            {
                "illust_id": 42850203,
                "illust_title": "COMIC1新刊プリキュア本サンプル",
                "illust_user_id": "705370",
                "illust_type": "1",
                "illust_page_count": "6",
                "url": {
                    "s": "http://i4.pixiv.net/c/150x150/img-master/img/2014/04/12/20/30/39/42850203_p0_master1200.jpg",
                    "240mw": "http://i4.pixiv.net/c/240x480/img-master/img/2014/04/12/20/30/39/42850203_p0_master1200.jpg",
                    "m": "http://i4.pixiv.net/c/600x600/img-master/img/2014/04/12/20/30/39/42850203_p0_master1200.jpg",
                    "big": "http://i4.pixiv.net/img-original/img/2014/04/12/20/30/39/42850203_p0.jpg",
                    "128x128": "http://i4.pixiv.net/c/128x128/img-master/img/2014/04/12/20/30/39/42850203_p0_square1200.jpg",
                    "ugoira600x600": "",
                    "540x540": "http://i4.pixiv.net/c/540x540_70/img-master/img/2014/04/12/20/30/39/42850203_p0_square1200.jpg"
                },
                "is_bookmarked": false,
                "is_rated": false,
                "is_commented": false
            }
        ]
    }
}
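Everything the crawler needs is in body.illusts: illust_id to recurse on, and url.big for the original-size image. Extracting the pairs from a parsed response is a one-liner (resp here stands for the JSON above after json.loads), and it is exactly what call_rpc further down does:

# resp = json.loads(...) on the response body shown above
pairs = [(item['illust_id'], item['url']['big'])
         for item in resp['body']['illusts']]
# -> [(42850203, 'http://i4.pixiv.net/img-original/img/2014/04/12/20/30/39/42850203_p0.jpg')]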
The script uses Python's native urllib request machinery. The key configuration options:
import json
import os
import random
import threading
import urllib.request
from time import sleep

# You will need a decent proxy list (path to the file)
PROXY_IP_FILE = ''
COOKIE_FILE = 'cookie.txt'
# Maximum exploration depth of the recommendation graph
MAX_DEPTH = 2
# Number of recommendations to pull per node
MAX_NUM_EVDEP = 3
SAVE_PATH = 'photo\\'
# Retry count for image downloads
RETRY_TIMES = 5
# Browser-like User-Agent; any common UA string will do
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
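Both input files are plain text. COOKIE_FILE holds the raw Cookie header string copied from a logged-in browser session, and PROXY_IP_FILE is read one proxy per line; the host:port form below is my assumption (it is whatever urllib's ProxyHandler accepts):

127.0.0.1:8080
10.0.0.2:3128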
The Proxy_ip class reads the proxy file and picks a proxy at random:
class Proxy_ip:
    def __init__(self, filename):
        self.proxy_list = self.get_proxy_list(filename)

    def get_proxy_list(self, filename):
        # One proxy per line; strip the trailing newline
        proxies = []
        with open(filename, 'r') as of:
            for line in of:
                proxies.append(line.replace('\n', ''))
        return proxies

    def get_random_proxy(self):
        return random.choice(self.proxy_list)
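On its own it behaves like this (assuming the file format sketched above):

proxy = Proxy_ip(PROXY_IP_FILE)
print(proxy.get_random_proxy())   # a random 'host:port' entry from the file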
The CrawlerTool class provides the HTTP API access and the image-file download methods:
class CrawlerTool:
    COOKIE = None

    def __init__(self, filename):
        self.proxy = Proxy_ip(filename)

    def set_cookie(self, file):
        # Read the raw Cookie header value; strip newlines or the header is malformed
        cookie = ''
        for line in open(file, 'r'):
            cookie += line.replace('\n', '')
        self.COOKIE = cookie

    def _get_proxy_opener(self):
        # Build an opener that routes through a randomly chosen proxy
        proxy_ip = self.proxy.get_random_proxy()
        proxy_handler = urllib.request.ProxyHandler({'http': proxy_ip})
        opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
        opener.addheaders = [('User-Agent', user_agent)]
        if self.COOKIE is not None:
            # Append instead of assigning, or the User-Agent header gets dropped
            opener.addheaders.append(('Cookie', self.COOKIE))
        return opener

    def download_jpg(self, path, filename, url, ids='42676660'):
        re_try_count = 0
        # Retry until the file exists on disk or attempts run out
        while not os.path.exists(path + filename) and re_try_count <= RETRY_TIMES:
            re_try_count += 1
            if url.find('.php') >= 0:
                # Not a direct image URL; skip it
                break
            try:
                opener = self._get_proxy_opener()
                # Pixiv rejects image requests whose Referer doesn't match the illust page
                opener.addheaders.append(('Referer',
                    'http://www.pixiv.net/member_illust.php?mode=medium&illust_id=' + ids))
                f = opener.open(url)
                jpg = f.read()
                # A readable text body means we got an error page, not an image
                jpg_str = jpg.decode('utf8', 'ignore')
                if jpg_str.find('<?xml version="1.0"') >= 0 or jpg_str.find('Unauthorized') >= 0 \
                        or jpg_str.find('403 Forbidden') >= 0:
                    print('Failed')
                    continue
            except Exception as err:
                print('Download Failed @' + url + ' Cause: ' + str(err))
                continue
            print('Download ' + url + ' OK, Saving')
            try:
                with open(path + filename, 'wb') as out:
                    out.write(jpg)
            except Exception:
                print('Save Failed @' + url)
                continue
            print(path + filename + ' Saved !!')

    def get_api_json(self, url):
        try:
            opener = self._get_proxy_opener()
            f = opener.open(url)
            data = f.read().decode('utf8', 'ignore')
            return json.loads(data)
        except Exception as err:
            print('Err @get_api_json : ' + str(err))
            return None
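CrawlerTool can also be driven by hand. A quick sanity check might look like this; the illust id and fields are taken from the sample response above, and SAVE_PATH must already exist:

tool = CrawlerTool(PROXY_IP_FILE)
tool.set_cookie(COOKIE_FILE)
# Fetch recommendations for one illust, then download the first original image
data = tool.get_api_json('http://www.pixiv.net/rpc/index.php'
                         '?mode=get_recommend_illusts&ids=42676660&num=3')
first = data['body']['illusts'][0]
tool.download_jpg(SAVE_PATH, 'test.jpg', first['url']['big'],
                  ids=str(first['illust_id']))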
The CrawlerThread class schedules multiple worker threads to run the API calls and the image download tasks:
class CrawlerThread:
    def __init__(self):
        self.lock = threading.Lock()
        self.c = CrawlerTool(PROXY_IP_FILE)
        if COOKIE_FILE != '':
            self.c.set_cookie(COOKIE_FILE)
        # Pending work and visited lists, shared across worker threads
        self.rpc_queue_list = []
        self.image_queue_list = []
        self.rpc_finish_flag = False
        self.finish_rpc_list = []
        self.finish_image_list = []

    def call_rpc(self, ids, mode='get_recommend_illusts', num=MAX_NUM_EVDEP,
                 url='http://www.pixiv.net/rpc/index.php'):
        url_full = url + '?mode={mode}&ids={ids}&num={num}'.format(mode=mode, ids=ids, num=num)
        result = None
        # get_api_json returns None on network errors; retry until we get a response
        while result is None:
            result = self.c.get_api_json(url_full)
        if result['error']:
            print('Err @call_rpc')
            return None
        return [(item['illust_id'], item['url']['big']) for item in result['body']['illusts']]

    def dispatch_rpc_thread(self, name):
        while True:
            while len(self.rpc_queue_list) == 0:
                sleep(3)
            self.lock.acquire()
            ids = self.rpc_queue_list[0]
            del self.rpc_queue_list[0]
            self.finish_rpc_list.append(ids[0])
            self.lock.release()
            # Each worker exits once it pops a node beyond the depth limit
            if ids[1] > MAX_DEPTH:
                return
            dep = ids[1] + 1
            print('Rpc Thread ' + str(name) + ' : ' + str(ids[0]) + '--' + str(ids[1]))
            result = self.call_rpc(ids[0])
            if result is None:
                continue
            for i in result:
                if i[0] not in self.finish_rpc_list:
                    self.rpc_queue_list.append((i[0], dep))
                if i[1] not in self.finish_image_list:
                    self.image_queue_list.append(i[1])

    def do_rpc_dispatch(self, ids, dep=0, threads=3):
        task_threads = []
        self.rpc_queue_list.append((ids, dep))
        for i in range(threads):
            t = threading.Thread(target=self.dispatch_rpc_thread, args=(i,))
            task_threads.append(t)
        for task in task_threads:
            task.start()
        for task in task_threads:
            task.join()
        print('Rpc Complete !')
        # Tell the image workers that no more URLs are coming
        self.rpc_finish_flag = True

    def dispatch_image_thread(self, name):
        while True:
            while len(self.image_queue_list) == 0:
                if self.rpc_finish_flag is False:
                    sleep(3)
                else:
                    # Queue drained and the RPC phase is over: nothing left to do
                    return
            self.lock.acquire()
            url = self.image_queue_list[0]
            del self.image_queue_list[0]
            self.finish_image_list.append(url)
            self.lock.release()
            print('Image Thread ' + str(name) + ' : ' + url)
            filename = url.split('/')[-1]
            self.c.download_jpg(SAVE_PATH, filename, url)

    def do_image_dispatch(self, threads=5):
        task_threads = []
        if not os.path.exists(SAVE_PATH):
            os.makedirs(SAVE_PATH)
        for i in range(threads):
            t = threading.Thread(target=self.dispatch_image_thread, args=(i,))
            task_threads.append(t)
        for task in task_threads:
            task.start()
        for task in task_threads:
            task.join()
        print('Image Complete !')

    def do_dispatch(self, ids, dep=0, rpc_threads=3, image_threads=5):
        image_thread = threading.Thread(target=self.do_image_dispatch, args=(image_threads,))
        rpc_thread = threading.Thread(target=self.do_rpc_dispatch, args=(ids, dep, rpc_threads))
        rpc_thread.start()
        image_thread.start()
        rpc_thread.join()
        image_thread.join()
        print('All OK !')
(None of this feels particularly elegant.)
Invocation (and off we go):
c = CrawlerThread()
c.do_dispatch('42676660')
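do_dispatch also accepts the thread counts, so the walk can be scaled up without touching the classes:

# wider walk: 5 RPC workers, 10 download workers
c.do_dispatch('42676660', dep=0, rpc_threads=5, image_threads=10)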