Selenium's drawback is low efficiency, but it has the advantage that whatever is visible in the browser can be scraped. BeautifulSoup, on the other hand, is built for fast extraction of data from HTML and is very efficient, but it struggles with dynamic pages.

So my approach is Selenium + BeautifulSoup, to scrape dynamic pages efficiently:

  1. Drive the page with Selenium; after the required interactions, grab the rendered source via browser.page_source
  2. Hand that source to BeautifulSoup for fast parsing (see the minimal sketch after this list)

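In other words, Selenium is responsible only for rendering and interaction; all parsing happens offline in BeautifulSoup. A minimal sketch of the handoff (example.com and the title lookup are placeholders, not part of the project below):

from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Chrome()
browser.get("https://example.com")   # placeholder URL; any JavaScript-rendered page works the same way
# ... click, scroll, or wait here until the content you need has rendered ...

html = browser.page_source           # step 1: grab the fully rendered HTML from Selenium
browser.quit()

soup = BeautifulSoup(html, 'lxml')   # step 2: parse it at BeautifulSoup speed, offline
print(soup.title.text)
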
The Chinese university rankings project below serves as a complete example:

create database collegesdb charset utf8;
use collegesdb;
create table t(
    ranking int,
    name varchar(20),
    abroad_rate float(10,1),
    employment_rate float(10,1),
    numberOfGraduate int,
    numberOfUndergraduate int
) charset=utf8;
Scraping
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pymysql
from bs4 import BeautifulSoup
import re

# Connect to the database
mydb = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    db="collegesdb"
)
mycursor = mydb.cursor()

def create_web_driver(url):
    # Scrape without opening a visible browser window
    options = webdriver.ChromeOptions()
    # Headless mode
    options.add_argument('--headless')
    # Skip loading images to speed up page loads
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,
        }
    }
    options.add_experimental_option('prefs', prefs)
    browser = webdriver.Chrome(options=options)  # launch the browser

    # Anti-bot evasion: hide the webdriver flag before any page script runs
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    browser.get(url)
    return browser

def reptilian(browser):
    pages = 20
    items = 30
    row = []
    # Explicit wait with a 5-second timeout
    wait = WebDriverWait(browser, 5)

    # Find the label matching regEx_name on the detail page and append the
    # text of its sibling element to row; append the sentinel "-1" if missing
    def append_item(typeOfElement, regEx_name):
        try:
            first_link = soup.find(typeOfElement, string=re.compile(regEx_name))
            value = first_link.find_next_sibling(typeOfElement)
            row.append(value.text)
        except AttributeError:
            row.append("-1")

    for j in range(pages):
        for i in range(items):
            # Open the i-th university's detail page once its link is clickable
            item = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f'#content-box > div.rk-table-box > table > tbody > tr:nth-child({i+1}) > td.align-left > div > div.univname > div:nth-child(1) > div > div > a')))
            item.click()
            # Wait until the ranking element is present, i.e. the detail page has rendered
            ranking = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#bcur_latest > div.bcur-latest > div > div.icon-container > span')))
            # Grab the rendered source of the detail page
            web_data = browser.page_source
            # Parse it with BeautifulSoup
            soup = BeautifulSoup(web_data, 'lxml')

            # Ranking
            row.append(ranking.text)
            # Name
            name = soup.select("#univ_name > span.name-cn")
            row.append(name[0].text)
            # Rate of going abroad for further study (出国深造率)
            append_item("span", "^出国深造率")
            # Number of graduate students (研究生数)
            append_item("span", "^研究生数")
            # Number of undergraduate students (本科生数)
            append_item("span", "^本科生数")
            # Employment rate (就业率)
            append_item("span", "^就业率")
            # Insert the row into the database
            sql = "INSERT INTO t (ranking, name, abroad_rate, numberOfGraduate, numberOfUndergraduate, employment_rate) VALUES (%s, %s, %s, %s, %s, %s)"
            print(row)
            mycursor.execute(sql, row)
            mydb.commit()
            # Reset the row buffer
            row.clear()
            # Go back to the ranking list
            browser.back()
        # Click through to the next page of the list
        element4 = browser.find_element(By.CSS_SELECTOR, '#content-box > ul > li.ant-pagination-item.ant-pagination-item-' + str(j+2) + ' > a')
        element4.click()
        # Scroll back to the top of the page
        js_top = "var q=document.documentElement.scrollTop=0"
        browser.execute_script(js_top)
    browser.close()

if __name__ == "__main__":
    browser = create_web_driver("https://www.shanghairanking.cn/rankings/bcur/2021")
    reptilian(browser)
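One detail worth calling out: in headless mode navigator.webdriver is true by default, and some sites refuse to serve content when they see it. The execute_cdp_cmd patch above hides that flag before any page script runs. A quick way to verify it took effect (a standalone sketch reusing create_web_driver from the script above):

browser = create_web_driver("https://www.shanghairanking.cn/rankings/bcur/2021")
print(browser.execute_script("return navigator.webdriver"))  # None once patched, True otherwise
browser.quit()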
Data Cleansing
from sqlalchemy import create_engine
import pandas as pd

db_connection_str = 'mysql+pymysql://root:123456@localhost/collegesdb'
db_connection = create_engine(db_connection_str)

df = pd.read_sql('SELECT * FROM t', con=db_connection)

# Drop every row whose value in the given field is the -1 sentinel
def drop_row(field):
    for x in df.index:
        if df.loc[x, field] == -1:
            df.drop(x, inplace=True)

if __name__ == "__main__":
    drop_row("ranking")
    drop_row("abroad_rate")
    drop_row("employment_rate")
    drop_row("numberOfGraduate")
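One caveat: because the drop_row calls sit under the if __name__ == "__main__": guard, they do not run when another module imports df, so the analysis script below actually receives the uncleaned frame. Moving the cleansing to module level fixes that, and pandas can also express it as a single boolean mask, which is faster and avoids dropping rows while iterating over the index. An equivalent sketch:

# Run at module level so importers get the cleaned frame;
# keep only rows with no -1 sentinel in any checked column
checked = ["ranking", "abroad_rate", "employment_rate", "numberOfGraduate"]
df = df[(df[checked] != -1).all(axis=1)]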
Data Analysis
import matplotlib.pyplot as plt
from data_cleaning2 import df
import matplotlib.ticker as ticker

# Enable Chinese characters in the plots
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False

def scatter(x_name_in_df, y_name_in_df, xlabel, ylabel, color):
    plt.scatter(x=df[x_name_in_df], y=df[y_name_in_df], color=color)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # One x-axis tick every 50 ranks
    plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(50))
    plt.show()

if __name__ == "__main__":
    scatter("ranking", 'numberOfUndergraduate', 'University rankings', 'Number of undergraduate students', 'blue')
    scatter("ranking", 'numberOfGraduate', 'University rankings', 'Number of graduate students', 'black')
    scatter("ranking", 'employment_rate', 'University rankings', 'Employment rate', 'green')
    scatter("ranking", 'abroad_rate', 'University rankings', 'Rate of going abroad', 'red')
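plt.show() blocks until each window is closed, which gets tedious when generating all four plots unattended; writing the figures to disk is the usual alternative. A standalone sketch (the data and file name are illustrative only):

import matplotlib.pyplot as plt

plt.scatter([1, 2, 3], [300, 200, 100], color='blue')  # dummy data for illustration
plt.savefig("ranking_scatter.png", dpi=150)            # write to disk instead of opening a window
plt.clf()                                              # clear the figure before the next plot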