BeautifulSoup practice Created 2021-12-23 | Updated 2022-05-22
| Post Views:
本次我采用 BeautifulSoup 爬取网页数据,并将结果写入 CSV 文件。相比于 Selenium,BeautifulSoup 更加快速。
有几个值得注意的问题:
"""Scrape the ARWU 2021 ranking table and save it to ``test.csv``.

The script downloads the ranking page, reads every ``<tr>`` of the first
``<tbody>`` it finds, extracts rank, name, region and total score from each
row, and writes the result to a CSV file with pandas.
"""
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

URL = 'https://www.shanghairanking.cn/rankings/arwu/2021'
OUTPUT_PATH = "test.csv"
# CSV header: rank, name, country/region, total score.
COLUMNS = ["排名", "名字", "国家", "总分"]
# Matches the <div> whose CSS class contains "ranking" (holds the rank text).
RANK_CLASS = re.compile("ranking.*")
# Seconds to wait for the server before giving up; requests has no default
# timeout and would otherwise block indefinitely.
REQUEST_TIMEOUT = 30


def fetch_html(url=URL, timeout=REQUEST_TIMEOUT):
    """Download *url* and return its body as text.

    Args:
        url: Page to download.
        timeout: Seconds passed to ``requests.get`` as the timeout.

    Returns:
        The response body decoded with the encoding requests detects from
        the content (``apparent_encoding``).

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text


def parse_rows(html):
    """Extract ``(rank, name, region, score)`` tuples from the ranking HTML.

    Args:
        html: Markup of the ranking page.

    Returns:
        A list with one 4-tuple of stripped strings per ``<tr>`` in the
        first ``<tbody>`` of the document.

    Raises:
        ValueError: If the document contains no ``<tbody>``.
        AttributeError, IndexError: If a row lacks the expected rank/name
            ``<div>`` or has fewer than four ``<td>`` cells.
    """
    soup = BeautifulSoup(html, 'lxml')
    tbody = soup.find("tbody")
    if tbody is None:
        raise ValueError("no <tbody> found in the page; layout may have changed")

    rows = []
    for row in tbody.find_all("tr"):
        rank = row.find("div", class_=RANK_CLASS)
        name = row.find("div", class_="link-container")
        # The row is already in hand, so index its own cells instead of
        # re-selecting it from the document root with a running counter.
        # Assumes the row's direct children are all <td> (3rd = region,
        # 4th = score) -- confirm if the page layout changes.
        cells = row.find_all("td", recursive=False)
        rows.append((
            rank.text.strip(),
            name.text.strip(),
            cells[2].text.strip(),
            cells[3].text.strip(),
        ))
    return rows


def main():
    """Fetch the ranking page and write the parsed table to ``test.csv``."""
    data = pd.DataFrame(parse_rows(fetch_html()), columns=COLUMNS)
    # 'utf_8_sig' writes UTF-8 with a BOM; the index column is kept, as in
    # the original script's output.
    data.to_csv(OUTPUT_PATH, encoding='utf_8_sig')


if __name__ == "__main__":
    main()