Page Scrolling

Many pages load their content dynamically, so when driving the browser with Selenium, some of the data never appears unless the page is scrolled towards the bottom. To handle this, we have Selenium execute a small piece of JavaScript that scrolls the page.

import time

def scroll(browser):
    # This JS snippet returns the total scroll height of the page
    js = "return document.body.scrollHeight"
    # Current position of the scrollbar, starting at 0
    height = 0
    # Total scroll height of the page
    new_height = browser.execute_script(js)
    while height < new_height:
        # Step the scrollbar down towards the bottom of the page, 300px at a time
        for k in range(height, new_height, 300):
            browser.execute_script('window.scrollTo(0, {})'.format(k))
            time.sleep(0.2)
        height = new_height
        new_height = browser.execute_script(js)
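As a rough usage sketch (assuming chromedriver is installed; the URL is only a placeholder), the function is called once the page has been opened:

from selenium import webdriver

browser = webdriver.Chrome()               # assumes chromedriver is available
browser.get("https://example.com/list")    # placeholder URL for a dynamically loaded page
scroll(browser)                            # keep scrolling until the page height stops growing
html = browser.page_source                 # now contains the lazily loaded content
browser.quit()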

JS

Selenium can also execute other JavaScript. For example, the snippet below removes an element that overlays and blocks the page:

# Remove the overlay element that blocks the page
js = 'var box=document.getElementById("scroll");box.remove();'
browser.execute_script(js)
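execute_script can also return values from the page and accept WebElement arguments. A couple of common patterns, as a sketch (the element id used here is hypothetical):

from selenium.webdriver.common.by import By

# Read a value back from the page
title = browser.execute_script('return document.title;')

# Pass an element into the script and scroll it into view
element = browser.find_element(By.ID, 'load-more')   # hypothetical id
browser.execute_script('arguments[0].scrollIntoView(true);', element)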

Full Code

import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import requests

def create_web_driver(url):
    option = webdriver.ChromeOptions()
    # option.add_argument('--headless')
    # Launch the browser
    browser = webdriver.Chrome(options=option)
    # Anti-detection: hide the webdriver flag
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    browser.get(url)
    browser.maximize_window()
    return browser

def login(browser, user_name, password):
    wait = WebDriverWait(browser, 10)
    login_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#user-profile > div.nav-set > div.nav-login > div')))
    login_button.click()
    # Fill in the account name
    wait.until(EC.element_to_be_clickable((By.NAME, 'log'))).send_keys(user_name)
    # Fill in the password
    browser.find_element(By.NAME, "pwd").send_keys(password)
    # Click the login button
    browser.find_element(By.NAME, 'wp-submit').click()
    time.sleep(5)

def scroll(browser):
    # This JS snippet returns the total scroll height of the page
    js = "return document.body.scrollHeight"
    # Current position of the scrollbar, starting at 0
    height = 0
    # Total scroll height of the page
    new_height = browser.execute_script(js)
    while height < new_height:
        # Step the scrollbar down towards the bottom of the page, 300px at a time
        for k in range(height, new_height, 300):
            browser.execute_script('window.scrollTo(0, {})'.format(k))
            time.sleep(0.2)
        height = new_height
        new_height = browser.execute_script(js)

def spider(browser, model_name):
    wait = WebDriverWait(browser, 10)
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#navigation-top > span'))).click()
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#so'))).send_keys(model_name)
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#searchsubmit-so'))).click()
    # Image counter, used to number the downloaded files
    j = 0

    # Crawl every page of the search results
    while True:
        # Number of items handled on the current results page
        i = 0
        # Crawl every item on the current results page
        while True:
            # Scroll down step by step so all items are loaded
            scroll(browser)
            # Open the detail page of the next item
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, f'#main > article:nth-child({i+1})'))).click()
            i = i + 1
            # Number of sub-pages visited for this item
            x = 0
            # Crawl every sub-page of the detail page
            while True:
                x = x + 1
                # Get the page source from Selenium
                web_data = browser.page_source
                # Parse it with BeautifulSoup
                soup = BeautifulSoup(web_data, 'lxml')
                # Locate the container that holds the images
                li = soup.find("div", class_="single-content")
                # Extract the download links and save the images
                try:
                    for img in li.select("img[src]"):
                        # Count the images
                        j = j + 1
                        download_url = img["src"]
                        pic_data = requests.get(download_url).content
                        with open(f'./download/{model_name}/{j}.jpg', mode='wb') as f:
                            f.write(pic_data)
                    browser.find_element(By.LINK_TEXT, f'{x+1}').click()
                except:
                    break
            # Go back to the results page
            for m in range(x):
                browser.back()
            # Finished one results page (24 items per page)
            if i == 24:
                break
        scroll(browser)
        # Remove the overlay element that blocks the "next page" link
        js = 'var box=document.getElementById("scroll");box.remove();'
        browser.execute_script(js)
        # Click the "next page" link of the results pagination
        wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@class='next page-numbers']"))).click()
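The script above only defines the functions. A minimal entry point to wire them together might look like this (the URL, credentials, and keyword are placeholders; the download folder has to exist before spider() writes into it):

import os

if __name__ == '__main__':
    url = 'https://example.com'                        # placeholder: target site
    model_name = 'example-keyword'                     # placeholder: search keyword
    os.makedirs(f'./download/{model_name}', exist_ok=True)  # spider() saves images here
    browser = create_web_driver(url)
    login(browser, 'your-username', 'your-password')   # placeholder credentials
    spider(browser, model_name)
    browser.quit()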