I happened to notice that the recommendations in the sidebar of Pixiv pages seemed right up my alley, so I figured I'd write a crawler to grab them in bulk.

My first thought was to pull the page HTML and read the recommendations straight out of it, but it turns out they aren't there on the initial load; they get filled in afterwards.

So I went digging back through the JS and eventually found something that looks like this:

relatedList: function(id, count, options) {
  return pixiv.api.get('/rpc/index.php', {
    mode: 'get_recommend_illusts',
    ids: id,
    num: count || 100
  }, options);
}

Hmm... so let's just use that endpoint. A sample response looks like this:

{
  "error": false,
  "message": "",
  "body": {
    "total": 5,
    "illusts": [
      {
        "illust_id": 42850203,
        "illust_title": "COMIC1新刊プリキュア本サンプル",
        "illust_user_id": "705370",
        "illust_type": "1",
        "illust_page_count": "6",
        "url": {
          "s": "http://i4.pixiv.net/c/150x150/img-master/img/2014/04/12/20/30/39/42850203_p0_master1200.jpg",
          "240mw": "http://i4.pixiv.net/c/240x480/img-master/img/2014/04/12/20/30/39/42850203_p0_master1200.jpg",
          "m": "http://i4.pixiv.net/c/600x600/img-master/img/2014/04/12/20/30/39/42850203_p0_master1200.jpg",
          "big": "http://i4.pixiv.net/img-original/img/2014/04/12/20/30/39/42850203_p0.jpg",
          "128x128": "http://i4.pixiv.net/c/128x128/img-master/img/2014/04/12/20/30/39/42850203_p0_square1200.jpg",
          "ugoira600x600": "",
          "540x540": "http://i4.pixiv.net/c/540x540_70/img-master/img/2014/04/12/20/30/39/42850203_p0_square1200.jpg"
        },
        "is_bookmarked": false,
        "is_rated": false,
        "is_commented": false
      }]
  }
}
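Just to poke at it first, here is a minimal sketch of calling the endpoint directly with urllib. The cookie value is a placeholder you'd copy from a logged-in browser session, and whether Pixiv needs any other headers here is an assumption on my part:

import json
import urllib.request

# Placeholder values, for illustration only
rpc_url = ('http://www.pixiv.net/rpc/index.php'
           '?mode=get_recommend_illusts&ids=42676660&num=5')
req = urllib.request.Request(rpc_url, headers={
    'User-Agent': 'Mozilla/5.0',
    'Cookie': 'PHPSESSID=<your session cookie>',
})
with urllib.request.urlopen(req) as resp:
    data = json.loads(resp.read().decode('utf8', 'ignore'))

# The same two fields the crawler below cares about: illust_id and the original-size URL
if not data['error']:
    for illust in data['body']['illusts']:
        print(illust['illust_id'], illust['url']['big'])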

The crawler uses the built-in urllib.request. The key configuration options (plus the imports the snippets below rely on) are:

import json
import os
import random
import threading
import urllib.request
from time import sleep

# User-Agent header value (defined elsewhere in the original script; any browser UA works)
user_agent = 'Mozilla/5.0'
# You need a decent proxy (or several)
PROXY_IP_FILE = ''
COOKIE_FILE = 'cookie.txt'
# Maximum exploration depth
MAX_DEPTH = 2
# Recommendations fetched per level
MAX_NUM_EVDEP = 3
SAVE_PATH = 'photo\\'
# Retries per image download
RETRY_TIMES = 5
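For reference: Proxy_ip below reads PROXY_IP_FILE one line at a time, and set_cookie dumps the whole of COOKIE_FILE into the Cookie header, so the two files would look roughly like this (file names and values here are made up):

# proxies.txt -- one proxy address per line, e.g. "host:port"
127.0.0.1:8080
10.0.0.1:3128

# cookie.txt -- the raw Cookie header value from a logged-in session, on one line
PHPSESSID=xxxxxxxx; device_token=yyyyyyyy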

The Proxy_ip class reads the proxy file and picks a proxy at random:

class Proxy_ip:
    def __init__(self, filename):
        self.proxy_list = self.get_proxy_list(filename)

    def get_proxy_list(self, filename):
        # One proxy address per line
        proxies = []
        with open(filename, 'r') as f:
            for line in f:
                proxies.append(line.strip())
        return proxies

    def get_random_proxy(self):
        return random.choice(self.proxy_list)

The CrawlerTool class holds the HTTP API access and the image download logic:

class CrawlerTool:
    def __init__(self, filename):
        self.proxy = Proxy_ip(filename)

    COOKIE = None

    def set_cookie(self, file):
        # Read the raw Cookie header value from a file
        with open(file, 'r') as f:
            self.COOKIE = f.read().strip()

    def _get_proxy_openner(self):
        proxy_ip = self.proxy.get_random_proxy()
        # print('Using:' + str(proxy_ip))
        proxy_handler = urllib.request.ProxyHandler({'http': proxy_ip})
        opener = urllib.request.build_opener(urllib.request.HTTPHandler, proxy_handler)
        opener.addheaders = [('User-Agent', user_agent)]
        if self.COOKIE is not None:
            # Append rather than overwrite, so the User-Agent header is kept
            opener.addheaders.append(('Cookie', self.COOKIE))
        return opener

    def download_jpg(self, path, filename, url, ids='42676660'):
        re_try_count = 0
        while not os.path.exists(path + filename) and re_try_count <= RETRY_TIMES:
            re_try_count += 1
            # Skip links that point at a PHP page instead of an image
            if url.find('.php') >= 0:
                break
            try:
                opener = self._get_proxy_openner()
                # Pixiv refuses image requests without a Referer from the illust page
                opener.addheaders.append(
                    ('Referer', 'http://www.pixiv.net/member_illust.php?mode=medium&illust_id=' + ids))
                f = opener.open(url)
                jpg = f.read()
                jpg_str = jpg.decode("utf8", 'ignore')
                # An error page came back instead of image data
                if jpg_str.find('<?xml version="1.0"') >= 0 or jpg_str.find('Unauthorized') >= 0 \
                        or jpg_str.find('403 Forbidden') >= 0:
                    print("Failed")
                    continue
            except Exception as err:
                print("Download Failed @" + url + ' Cause: ' + str(err))
                continue
            print("Download " + url + " OK, Saving")
            try:
                with open(path + filename, 'wb') as out:
                    out.write(jpg)
            except Exception:
                print("Save Failed @" + url)
                continue
        if os.path.exists(path + filename):
            print(path + filename + " Saved !!")

    def get_api_json(self, url):
        try:
            opener = self._get_proxy_openner()
            f = opener.open(url)
            # f = urllib.request.urlopen(url)
            # print(f.getcode())
            data = f.read().decode("utf8", 'ignore')
            json_data = json.loads(data)
            return json_data
        except Exception as err:
            print("Err @get_api_json : " + str(err))
            return None
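As a quick smoke test of the two pieces above (purely illustrative; it assumes PROXY_IP_FILE actually points at a proxy list and that the cookie is valid):

os.makedirs(SAVE_PATH, exist_ok=True)
tool = CrawlerTool(PROXY_IP_FILE)
tool.set_cookie(COOKIE_FILE)
rec = tool.get_api_json('http://www.pixiv.net/rpc/index.php'
                        '?mode=get_recommend_illusts&ids=42676660&num=3')
if rec and not rec['error']:
    first = rec['body']['illusts'][0]
    tool.download_jpg(SAVE_PATH, 'test.jpg', first['url']['big'],
                      ids=str(first['illust_id']))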

The CrawlerThread class dispatches worker threads that run the API calls and the image downloads:

class CrawlerThread:
    def __init__(self):
        self.lock = threading.Lock()
        self.c = CrawlerTool(PROXY_IP_FILE)
        if COOKIE_FILE != '':
            self.c.set_cookie(COOKIE_FILE)

    rpc_queue_list = []
    image_queue_list = []

    rpc_finish_flag = False
    finish_rpc_list = []
    finish_image_list = []

    def call_rpc(self, ids, mode='get_recommend_illusts', num=MAX_NUM_EVDEP, url='http://www.pixiv.net/rpc/index.php'):
        url_full = url + '?mode={mode}&ids={ids}&num={num}'.format(mode=mode, ids=ids, num=num)
        result = None
        while result is None:
            result = self.c.get_api_json(url_full)
        if result['error']:
            print('Err @call_rpc')
            return None
        return [(item['illust_id'], item['url']['big']) for item in result['body']['illusts']]

    def dispatch_rpc_thread(self, name):
        while True:
            while len(self.rpc_queue_list) == 0:
                sleep(3)
            self.lock.acquire()
            ids = self.rpc_queue_list[0]
            del self.rpc_queue_list[0]
            self.finish_rpc_list.append(ids[0])
            self.lock.release()
            if ids[1] > MAX_DEPTH:
                return
            dep = ids[1] + 1
            print('Rpc Thread ' + str(name) + " : " + str(ids[0]) + "--" + str(ids[1]))
            result = self.call_rpc(ids[0])
            # call_rpc returns None when the API reports an error; skip that id
            if result is None:
                continue
            for i in result:
                if i[0] not in self.finish_rpc_list:
                    self.rpc_queue_list.append((i[0], dep))
                if i[1] not in self.finish_image_list:
                    self.image_queue_list.append(i[1])

    def do_rpc_dispatch(self, ids, dep=0, threads=3):
        task_threads = []
        self.rpc_queue_list.append((ids, dep))
        for i in range(threads):
            t = threading.Thread(target=self.dispatch_rpc_thread, args=(i,))
            task_threads.append(t)
        for task in task_threads:
            task.start()
        for task in task_threads:
            task.join()
        print("Rpc Complete !")
        self.rpc_finish_flag = True
        # print(self.image_queue_list)

    def dispatch_image_thread(self, name):
        while True:
            while len(self.image_queue_list) == 0:
                if self.rpc_finish_flag is False:
                    sleep(3)
                else:
                    return
            self.lock.acquire()
            url = self.image_queue_list[0]
            del self.image_queue_list[0]
            self.finish_image_list.append(url)
            self.lock.release()
            print('Image Thread ' + str(name) + " : " + url)
            filename = url.split('/')[-1]
            self.c.download_jpg(SAVE_PATH, filename, url)

    def do_image_dispatch(self, threads=5):
        task_threads = []
        if not os.path.exists(SAVE_PATH):
            os.makedirs(SAVE_PATH)
        for i in range(threads):
            t = threading.Thread(target=self.dispatch_image_thread, args=(i,))
            task_threads.append(t)
        for task in task_threads:
            task.start()
        for task in task_threads:
            task.join()
        print("Image Complete !")

    def do_dispatch(self, ids, dep=0, rpc_threads=3, image_threads=5):
        image_thread = threading.Thread(target=self.do_image_dispatch, args=(image_threads,))
        rpc_thread = threading.Thread(target=self.do_rpc_dispatch, args=(ids, dep, rpc_threads))
        rpc_thread.start()
        image_thread.start()
        rpc_thread.join()
        image_thread.join()
        print('All OK !')

(None of this feels particularly elegant.)

How to kick it off (and away we go):

c = CrawlerThread()
c.do_dispatch('42676660')
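do_dispatch also takes the thread counts directly, so scaling up is just a matter of passing them in (the illust id here is a placeholder; remember to raise MAX_DEPTH / MAX_NUM_EVDEP in the config too):

c = CrawlerThread()
# 4 rpc workers, 8 download workers, starting from depth 0
c.do_dispatch('12345678', dep=0, rpc_threads=4, image_threads=8)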