BeautifulSoup practice

本次我采用BeautifulSoup爬取网页数据，并将结果写入csv文件。相比于Selenium，BeautifulSoup不需要启动浏览器，因此更加快速。

有几个值得注意的问题

编码
find()和find_all()
1. find()返回第一个匹配的元素（没有匹配时返回None）
2. find_all()返回所有匹配元素的列表（没有匹配时返回空列表）
table=soup.find("tbody").find_all("tr")：这里先用find()找到第一个tbody，再用find_all()返回该tbody中所有tr元素的列表。

from bs4 import BeautifulSoup
import requests
import re

# Scrape the ranking table from the page below and save it to test.csv.
# Each list collects one column of the table, one entry per <tr> row.
rankList = []
nameList = []
regionList = []
scoreList = []
# The rank cell is found through a <div> whose class matches this pattern.
regEx = re.compile("ranking.*")

# Fetch the page with requests.
url = 'https://www.shanghairanking.cn/rankings/arwu/2021'
# A timeout keeps the script from hanging forever on a stalled connection.
res = requests.get(url, timeout=10)
# Fail on an HTTP error status instead of parsing an error page.
res.raise_for_status()
# res.text is decoded with res.encoding; switch to the encoding detected from
# the body, which the original author found necessary to avoid garbled Chinese.
res.encoding = res.apparent_encoding
# Parse the page into a BeautifulSoup object.
soup = BeautifulSoup(res.text, 'lxml')
# find() returns the first <tbody>, or None when the page has none.
tbody = soup.find("tbody")
if tbody is None:
    raise RuntimeError("no <tbody> element found in " + url)
# find_all() returns the list of <tr> rows inside that <tbody>.
table = tbody.find_all("tr")
for item in table:
    rank = item.find("div", class_=regEx)
    rankList.append(rank.text.strip())

    name = item.find("div", class_="link-container")
    nameList.append(name.text.strip())

    # The search is already scoped to this row, so a row-relative selector is
    # enough; no page-absolute path or manual row counter is needed.
    # NOTE(review): the 4th cell is stored under the "总分" column below --
    # confirm that this matches the column order of the live page.
    region = item.select("td:nth-child(3)")
    regionList.append(region[0].text.strip())
    score = item.select("td:nth-child(4)")
    scoreList.append(score[0].text.strip())

import pandas as pd

# Build the frame in one step; dict order fixes the column order.
Data = pd.DataFrame({
    "排名": rankList,
    "名字": nameList,
    "国家": regionList,
    "总分": scoreList,
})
# utf_8_sig is the encoding the original author found necessary to keep the
# Chinese text readable when the CSV is opened.
Data.to_csv("test.csv", encoding='utf_8_sig')
from bs4 import BeautifulSoup
import requests
import re

# Scrape the ranking table from the page below and save it to test.csv.
# Each list collects one column of the table, one entry per <tr> row.
rankList = []
nameList = []
regionList = []
scoreList = []

# The rank cell is found through a <div> whose class matches this pattern.
regEx = re.compile("ranking.*")
url = 'https://www.shanghairanking.cn/rankings/arwu/2021'
# A timeout keeps the script from hanging forever on a stalled connection.
res = requests.get(url, timeout=10)
# Fail on an HTTP error status instead of parsing an error page.
res.raise_for_status()

# res.text is decoded with res.encoding; switch to the encoding detected from
# the body, which the original author found necessary to avoid garbled Chinese.
res.encoding = res.apparent_encoding

soup = BeautifulSoup(res.text, 'lxml')
# find() returns the first <tbody>, or None when the page has none.
tbody = soup.find("tbody")
if tbody is None:
    raise RuntimeError("no <tbody> element found in " + url)
# find_all() returns the list of <tr> rows inside that <tbody>.
table = tbody.find_all("tr")
for item in table:
    rank = item.find("div", class_=regEx)
    rankList.append(rank.text.strip())

    name = item.find("div", class_="link-container")
    nameList.append(name.text.strip())

    # The search is already scoped to this row, so a row-relative selector is
    # enough; no page-absolute path or manual row counter is needed.
    # NOTE(review): the 4th cell is stored under the "总分" column below --
    # confirm that this matches the column order of the live page.
    region = item.select("td:nth-child(3)")
    regionList.append(region[0].text.strip())
    score = item.select("td:nth-child(4)")
    scoreList.append(score[0].text.strip())

import pandas as pd

# Build the frame in one step; dict order fixes the column order.
Data = pd.DataFrame({
    "排名": rankList,
    "名字": nameList,
    "国家": regionList,
    "总分": scoreList,
})
# utf_8_sig is the encoding the original author found necessary to keep the
# Chinese text readable when the CSV is opened.
Data.to_csv("test.csv", encoding='utf_8_sig')