1. Scrapy

pip install Scrapy

2. Selenium

pip install selenium

3. chromedriver

First install the Chrome browser itself and check its version number (google-chrome --version will print it). On a headless Linux machine you can install it from the command line:
rpm -ivh google-chrome-stable_current_x86_64.rpm

Then go to https://chromedriver.storage.googleapis.com/index.html, download the chromedriver build that matches your Chrome version, and put the binary under /usr/bin.
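
Before wiring anything into Scrapy, it is worth a quick sanity check that Selenium can actually drive the headless Chrome through that driver. This is just a minimal standalone sketch, using the same Selenium 3 style of constructor arguments as the spider below:

# sanity check: can Selenium start headless Chrome via /usr/bin/chromedriver?
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')

browser = webdriver.Chrome(chrome_options=chrome_options,
                           executable_path='/usr/bin/chromedriver')
try:
    browser.get('https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8')
    print(browser.title)  # should print the title of the JD search page
finally:
    browser.quit()

If this prints a page title without errors, the Chrome/chromedriver pairing is working.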

After that, the spider code below should run without issues.

import scrapy
import time

from selenium import webdriver


class SearchSpider(scrapy.spiders.Spider):
    name = 'search'
    # "keyword=%E6%89%8B%E6%9C%BA" is the URL-encoded search term 手机 (mobile phone);
    # JD numbers its full result pages with odd page values (1, 3, 5, ...)
    search_page_url_pattern = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&page={page}&enc=utf-8"
    start_urls = ['https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8']

    def __init__(self):
        # start a headless Chrome through the chromedriver placed in /usr/bin
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        self.browser = webdriver.Chrome(chrome_options=chrome_options,
                                        executable_path='/usr/bin/chromedriver')
        super(SearchSpider, self).__init__()

    def closed(self, reason):
        self.browser.close()  # remember to shut the browser down when the spider closes

    def parse(self, response):
        # read the total page count from the pager, then request every result page
        total_page = response.css('span.p-skip em b::text').extract_first()
        if total_page:
            for i in range(int(total_page)):
                next_page_url = self.search_page_url_pattern.format(page=2 * i + 1)
                yield scrapy.Request(next_page_url, callback=self.parse_page)
                time.sleep(1)  # crude throttle; DOWNLOAD_DELAY in settings.py is the idiomatic option

    def parse_page(self, response):
        # each product sits in a div.p-name block whose anchor carries the title and link
        phone_info_list = response.css('div.p-name')
        for item in phone_info_list:
            phone_name = item.css('a::attr(title)').extract_first()
            phone_href = item.css('a::attr(href)').extract_first()
            yield dict(name=phone_name, href=phone_href)
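
Note that, as written, the spider opens a headless browser but the requests still go through Scrapy's default downloader. One common way to actually render pages with Selenium is a downloader middleware that fetches each request with spider.browser and hands the rendered HTML back to Scrapy. The sketch below is only one possible wiring; the middleware class name and priority are my own choices, and the module path assumes the project package is called comic_spider as in the original project:

# middlewares.py - minimal sketch of routing requests through the spider's browser
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        # let headless Chrome load and render the page, then return the
        # rendered HTML as a normal response so the downloader is skipped
        spider.browser.get(request.url)
        return HtmlResponse(url=request.url,
                            body=spider.browser.page_source,
                            encoding='utf-8',
                            request=request)

Enable it in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'comic_spider.middlewares.SeleniumMiddleware': 543,
}

After that the spider can be run as usual, for example with scrapy crawl search -o phones.json to dump the scraped items to a JSON file.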
