python+selenium爬取1688多个商品数据

moxiang 墨香 2020-09-03 11:13:30

python+selenium爬取1688多个商品数据

需求：查询某商品后页面的信息，不涉及深入采集

就像这种的：
在这里插入图片描述

为什么不用 requests、urllib 进行爬取，而是用 selenium 呢？

有试过使用 requests，可以进行爬取，但是时间长了，就会出现 error，反正阿里就不给你返回页面了，用了免费代理ip更恐怖，没有一个成功的，什么超级主机主动拒绝什么，一个看着比一个giaoligiaogiao。
之所以用selenium，因为他是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中，就像真正的用户在操作一样。详细的也不多逼逼。直接上代码！

import random

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
import csv
import re

#搜索函数
def seacher(key):
    """Search 1688 for *key* and return the number of result pages.

    Types the keyword into the home-page search box, submits it with ENTER,
    then reads the total page count from the pagination label.

    Args:
        key: Keyword to type into the search box.

    Returns:
        The first integer found in the pagination label text.

    Raises:
        ValueError: If the pagination label contains no digits.
    """
    # Look the box up once and reuse the element for both actions.
    search_box = driver.find_element_by_id("home-header-searchbox")
    search_box.clear()
    search_box.send_keys(key, Keys.ENTER)
    driver.maximize_window()
    # Fixed pause before the pagination label is read.
    time.sleep(1)
    label = driver.find_element_by_xpath(
        '//div[@class="rootComponent"]//span[@class="fui-paging-total"]'
    ).text
    # Raw string: '\d' in a normal string literal is an invalid escape.
    match = re.search(r'\d+', label)
    if match is None:
        raise ValueError(
            'no page count found in pagination label: {!r}'.format(label)
        )
    page = match.group()
    print(page)
    return int(page)

#解析没有广告的商品
def _field_text(card, xpath):
    """Return the text of the first node under *card* matching *xpath*, or ''."""
    try:
        return card.find_element_by_xpath(xpath).text
    except selenium.common.exceptions.NoSuchElementException:
        # A card without this element gets a blank column instead of
        # aborting (and re-running) the whole page.
        return ''


def _card_row(card):
    """Build one CSV row (list of seven strings) from a product card element."""
    desc = './/div[@class="desc-container"]'
    shop = _field_text(card, desc + '//a')  # product name
    price = _field_text(card, desc + '//div[@class="price"]')  # price
    if price:
        price += '元'
    youhui = _field_text(card, desc + '//div[@class="offer-tag-container"]')  # promotions
    # Text of the whole price container; presumably includes the sales
    # amount ("chengjiaoe") -- verify against the live page.
    chengjiaoe = _field_text(card, desc + '//div[@class="price-container"]')
    companyname = _field_text(card, desc + '//div[@class="company-name"]')  # company name
    shangjialeibie = _field_text(
        card, desc + '//div[@class="company-tag-container"]/a'
    )  # seller category
    mingcheng = _field_text(card, desc + '//div[@class="common-company-tag"]')
    return [shop, price, youhui, chengjiaoe, companyname, shangjialeibie, mingcheng]


def _append_rows(rows):
    """Append *rows* to the output CSV; does nothing when *rows* is empty."""
    if not rows:
        return
    with open('./1688-data/0619.csv', 'a', newline='', encoding='utf-8') as file:
        csv.writer(file, delimiter=',').writerows(rows)


def _scrape_cards(card_xpath, implicit_wait, settle_delay, stale_message,
                  max_attempts=3):
    """Scrape every card matching *card_xpath* on the current page into the CSV.

    Args:
        card_xpath: XPath selecting the card elements to scrape.
        implicit_wait: Seconds passed to ``driver.implicitly_wait``.
        settle_delay: Seconds to sleep after scrolling to the page bottom
            (0 disables the pause).
        stale_message: Text printed when a card element goes stale.
        max_attempts: How many times to try the page when a
            ``TimeoutException`` is raised before giving up.
    """
    for _ in range(max_attempts):
        # Rows are buffered per attempt so a retry never writes duplicates.
        rows = []
        try:
            driver.implicitly_wait(implicit_wait)
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            if settle_delay:
                time.sleep(settle_delay)
            for card in driver.find_elements_by_xpath(card_xpath):
                rows.append(_card_row(card))
        except selenium.common.exceptions.TimeoutException:
            print('parse_page: TimeoutException')
            continue
        except selenium.common.exceptions.StaleElementReferenceException:
            print(stale_message)
            # Keep the rows read before the element went stale, then
            # refresh and stop, without retrying this page.
            _append_rows(rows)
            driver.refresh()
            return
        _append_rows(rows)
        return
    print('parse_page: giving up after {} attempts'.format(max_attempts))


# Parse product cards that are not ads.
def get_pridect():
    """Scrape the non-ad product cards on the current page into the CSV.

    The XPath matches ``class="card-container"`` exactly, so cards whose
    class attribute is ``"card-container ad-item"`` are not selected here.
    """
    _scrape_cards(
        '//div[@class="sm-offer"]//div[@class="card-container"]',
        implicit_wait=1,
        settle_delay=1,
        stale_message='parse_page: StaleElementReferenceException',
    )


# Parse product cards that are ads.
def get_pridected():
    """Scrape the ad product cards (``card-container ad-item``) into the CSV."""
    _scrape_cards(
        '//div[@class="sm-offer"]//div[@class="card-container ad-item"]',
        implicit_wait=2,
        settle_delay=0,
        stale_message='parse_page: 刷新页面',
    )

def main():
    """Scrape all result pages for the module-level ``keyword``.

    Runs the search, scrapes the first page (non-ad and ad cards), then
    clicks the "next" pagination link once per remaining page and scrapes
    each of those pages.
    """
    print('正在爬去第一页ing')
    page = seacher(keyword)
    get_pridect()
    get_pridected()
    # range() ends the loop even when the reported page count is below 1;
    # the former `while page_num != page` could never reach it in that case.
    for page_num in range(1, page):
        print('*' * 100)
        print('正在爬去第{}页'.format(page_num + 1))
        print('*' * 100)
        next_link = driver.find_element_by_xpath(
            '//div[@class="common-pagination"]//a[@class="fui-next"]'
        )
        # Click through JavaScript instead of WebElement.click().
        driver.execute_script("arguments[0].click();", next_link)
        driver.implicitly_wait(2)
        driver.maximize_window()
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        # NOTE(review): only non-ad cards are scraped on pages after the
        # first; get_pridected() runs for page 1 only -- confirm intended.
        get_pridect()

if __name__ == '__main__':
    # keyword = input('请输入需要查询的商品名称： ')
    # One search keyword per line. The file is only read, so open it
    # read-only; blank lines are skipped so they are not used as keywords.
    with open('./1688-data/classes.txt', 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]

    for keyword in keywords:
        print('正在爬取' + keyword)
        # Optional: disable image loading.
        # option = webdriver.ChromeOptions()
        # prefs = {
        #     'profile.default_content_setting_values': {
        #         'images': 2
        #     }
        # }
        # option.add_experimental_option('prefs', prefs)

        # Optional: proxy IP.
        # PROXY = "171.35.221.103:9999"
        # option.add_argument('--proxy-server=' + PROXY)
        # driver = webdriver.Chrome(chrome_options=option)

        # `driver` and `keyword` stay module-level globals because
        # seacher(), get_pridect(), get_pridected() and main() read them.
        driver = webdriver.Chrome()
        try:
            driver.get('https://www.1688.com/')
            main()
        finally:
            # Close this keyword's browser so instances do not pile up.
            driver.quit()

别问为什么我用了两个解析网页的函数，别问，问就是我菜。

商品带广告的跟不带广告的虽说区别不大，但是我有想过一些解决办法，可惜了我菜。（有会的大佬，可以指点小弟一二。）
把这个读取的文件路径改成自己的。一个商品为一行，回车分割。
在这里插入图片描述
保存路径的在这里，记得改成自己的。

~~项目报错别怪我，要怪就怪我们都是菜鸡~~
我有想过无头模式，跟无图片加载（需要换下xpath路径)，可惜关于代理ip的问题，我放弃挣扎了！
关于代理ip 他不支持动态切换，只支持一次性的，啧啧，鸡肋呀！
或者利用DesiredCapabilities(代理设置)参数值，重新打开一个sessionId，加上代理重新访问一次url，就相当于重启浏览器再去查询一次。

from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

# Start a PhantomJS session that sends its traffic through a manual HTTP proxy.
browser = webdriver.PhantomJS()
try:
    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = '122.4.46.181:9999'
    # Work on a copy: DesiredCapabilities.PHANTOMJS is a shared class-level
    # dict, and add_to_capabilities() writes the proxy settings into it.
    capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
    proxy.add_to_capabilities(capabilities)
    # Open a new session that carries the proxy settings.
    browser.start_session(capabilities)
    browser.get('https://www.1688.com/')
    print('1: ', browser.session_id)
    print('2: ', browser.page_source)
    print('3: ', browser.get_cookies())
finally:
    # Shut the browser process down even if the request fails.
    browser.quit()

其实我还想了一种方式，就是设置一个函数，当出现登录页面或者验证码时，提醒更换IP，自己再去控制台手动输入，个人感觉没啥意义，还是算了，不如去想想怎么动态实现。

这代码商品少可以用，多的话还是建议scrapy+selenium,点点关注，后续更新！

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。
本文链接：https://blog.csdn.net/t8116189520/article/details/94733864

版权声明

本文仅代表作者观点，不代表本站立场。
本文系作者授权发表，未经许可，不得转载。
本文地址：/moxiang/1218.html

上一篇 : python 根据关键词爬取1688商品数据（完整案例）

下一篇 : 阿里巴巴全球旺铺和全球E站一样吗？有什么区别

留言与评论（共有 0 条评论）

1000元配置

2000元配置

3000元配置

4000元配置

5000元配置

6000元配置

7000元配置

8000元配置

9000元配置

万元配置

i3配置

i5配置

i7配置

APU配置

R5配置

R7配置

游戏攻略

软件教程

Win10教程

Win7教程

Mac教程

Linux教程

U盘教程

Server教程

python+selenium爬取1688多个商品数据