1. Scrapy
pip install Scrapy
2. Selenium
pip install selenium
3. chromedriver
First you need to download and install the Chrome browser and check its version number. On a headless Linux server, the official RPM can be installed from the command line:
rpm -ivh google-chrome-stable_current_x86_64.rpm
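You can check which Chrome version was installed with:
google-chrome --version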
Then download the chromedriver build that matches your Chrome version from https://chromedriver.storage.googleapis.com/index.html and place it in /usr/bin.
After that, the spider code below should run without problems.
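To confirm that Chrome and chromedriver can talk to each other, a quick headless check like the following should print a page title (a minimal sketch using the same Selenium 3 style calls as the spider below; the test URL is an arbitrary choice):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
# chromedriver was placed in /usr/bin in the previous step
driver = webdriver.Chrome(chrome_options=options, executable_path='/usr/bin/chromedriver')
driver.get('https://www.jd.com')  # any reachable page will do
print(driver.title)
driver.quit()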
import time

import scrapy
from selenium import webdriver

from comic_spider.items import TopicItem  # item class from the project (not used in this minimal example)


class SearchSpider(scrapy.spiders.Spider):
    name = 'search'
    search_page_url_pattern = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&page={page}&enc=utf-8"
    start_urls = ['https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8']

    def __init__(self):
        # Headless Chrome; --no-sandbox is needed when running as root on a server
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        self.browser = webdriver.Chrome(chrome_options=chrome_options,
                                        executable_path='/usr/bin/chromedriver')
        super(SearchSpider, self).__init__()

    def closed(self, reason):
        # Called when the spider finishes -- remember to shut the browser down
        self.browser.quit()

    def parse(self, response):
        # Read the total number of result pages from the pager
        total_page = response.css('span.p-skip em b::text').extract_first()
        if total_page:
            for i in range(int(total_page)):
                # JD search uses odd page numbers: 1, 3, 5, ...
                next_page_url = self.search_page_url_pattern.format(page=2 * i + 1)
                yield scrapy.Request(next_page_url, callback=self.parse_page)
                time.sleep(1)  # throttle requests

    def parse_page(self, response):
        phone_info_list = response.css('div.p-name a')
        for item in phone_info_list:
            phone_name = item.css('a::attr(title)').extract_first()
            phone_href = item.css('a::attr(href)').extract_first()
            yield dict(name=phone_name, href=phone_href)
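The spider above only creates self.browser; in a typical Scrapy + Selenium setup the browser is actually driven from a downloader middleware, which the project would provide and enable in settings.py under DOWNLOADER_MIDDLEWARES. A minimal sketch of such a middleware (the class name and module placement are assumptions, not part of the original post):

from scrapy.http import HtmlResponse

class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        # Render the page in headless Chrome and hand the HTML back to Scrapy
        spider.browser.get(request.url)
        return HtmlResponse(url=request.url,
                            body=spider.browser.page_source,
                            encoding='utf-8',
                            request=request)

Assuming a standard project layout (the import of comic_spider.items suggests the project is called comic_spider), the spider can then be run from the project root and its results written out with:

scrapy crawl search -o phones.json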