本次我采用BeautifulSoup爬取网页数据,并将结果写入csv文件。相比于selenium,BeautifulSoup无需启动浏览器,因此更加快速。

有几个值得注意的问题

  • 编码

  • find()和find_all()

    1. find()返回第一个匹配的元素(找不到时返回None)
    2. find_all()返回所有匹配元素组成的列表(找不到时返回空列表)

    table=soup.find("tbody").find_all("tr")这里先用find()找到第一个tbody, 再用find_all()返回tbody中所有的tr元素列表。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from bs4 import BeautifulSoup
import requests
import re

import pandas as pd

# Scrape the ARWU 2021 ranking table and write it to test.csv.
# Per-column accumulators; each receives one entry per parsed table row.
rankList = []
nameList = []
regionList = []
scoreList = []
# BeautifulSoup applies a compiled pattern with search(), so this matches any
# CSS class that contains "ranking" (used below to locate the rank <div>).
regEx = re.compile("ranking.*")

# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops early on an HTTP error
# instead of trying to parse an error page.
url = 'https://www.shanghairanking.cn/rankings/arwu/2021'
res = requests.get(url, timeout=30)
res.raise_for_status()
# Decode with the encoding detected from the body; otherwise the Chinese
# text in res.text comes out garbled.
res.encoding = res.apparent_encoding

# Parse the page, take the first <tbody>, then every <tr> inside it.
soup = BeautifulSoup(res.text, 'lxml')
tbody = soup.find("tbody")
if tbody is None:
    raise RuntimeError("no <tbody> found in " + url)
table = tbody.find_all("tr")

for item in table:
    rank = item.find("div", class_=regEx)
    name = item.find("div", class_="link-container")
    # Select the 3rd and 4th cells relative to this row, rather than through
    # an absolute "#content-box > ... > tr:nth-child(i)" path and a counter.
    region = item.select_one(":scope > td:nth-child(3)")
    score = item.select_one(":scope > td:nth-child(4)")
    # Skip rows that lack any expected cell so the four lists stay aligned.
    if any(cell is None for cell in (rank, name, region, score)):
        continue
    rankList.append(rank.text.strip())
    nameList.append(name.text.strip())
    regionList.append(region.text.strip())
    scoreList.append(score.text.strip())

Data = pd.DataFrame({
    "排名": rankList,
    "名字": nameList,
    "国家": regionList,
    "总分": scoreList,
})
# UTF-8 with BOM, so the Chinese text is not garbled when the CSV is opened.
Data.to_csv("test.csv", encoding='utf_8_sig')

# First import of the second listing of this script (it was fused onto the
# to_csv line above).
from bs4 import BeautifulSoup
import requests
import re

import pandas as pd

# Scrape the ARWU 2021 ranking table and write it to test.csv.
# Per-column accumulators; each receives one entry per parsed table row.
rankList = []
nameList = []
regionList = []
scoreList = []

# BeautifulSoup applies a compiled pattern with search(), so this matches any
# CSS class that contains "ranking" (used below to locate the rank <div>).
regEx = re.compile("ranking.*")
url = 'https://www.shanghairanking.cn/rankings/arwu/2021'
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early on an HTTP error response.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Decode with the encoding detected from the body; otherwise the Chinese
# text in res.text comes out garbled.
res.encoding = res.apparent_encoding

soup = BeautifulSoup(res.text, 'lxml')
# First <tbody> in the page, then every <tr> inside it.
tbody = soup.find("tbody")
if tbody is None:
    raise RuntimeError("no <tbody> found in " + url)
table = tbody.find_all("tr")

for item in table:
    rank = item.find("div", class_=regEx)
    name = item.find("div", class_="link-container")
    # Select the 3rd and 4th cells relative to this row, rather than through
    # an absolute "#content-box > ... > tr:nth-child(i)" path and a counter.
    region = item.select_one(":scope > td:nth-child(3)")
    score = item.select_one(":scope > td:nth-child(4)")
    # Skip rows that lack any expected cell so the four lists stay aligned.
    if any(cell is None for cell in (rank, name, region, score)):
        continue
    rankList.append(rank.text.strip())
    nameList.append(name.text.strip())
    regionList.append(region.text.strip())
    scoreList.append(score.text.strip())

Data = pd.DataFrame({
    "排名": rankList,
    "名字": nameList,
    "国家": regionList,
    "总分": scoreList,
})
# UTF-8 with BOM, so the Chinese text is not garbled when the CSV is opened.
Data.to_csv("test.csv", encoding='utf_8_sig')