1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
| import re import pandas as pd from selenium import webdriver from bs4 import BeautifulSoup from selenium.webdriver.common.by import By
def create_web_driver(url):
    """Start a Chrome session configured for scraping and open *url*.

    Image loading is disabled through a Chrome preference, and a script is
    registered to run on every new document so that ``navigator.webdriver``
    reads as ``undefined``.

    Args:
        url: Address to load once the browser has been configured.

    Returns:
        The ``webdriver.Chrome`` instance after ``get(url)`` has been called.
        The caller owns the session and should call ``quit()`` when done.

    Raises:
        Whatever Selenium raises while configuring the session or loading
        the page. The browser is shut down before the error is re-raised.
    """
    options = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_setting_values': {
            # 2 is Chrome's "block" value for a content setting.
            'images': 2,
        }
    }
    options.add_experimental_option('prefs', prefs)
    browser = webdriver.Chrome(options=options)
    try:
        browser.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {
                'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
            },
        )
        browser.get(url)
    except BaseException:
        # The caller never receives the handle on failure, so the Chrome
        # process would otherwise be orphaned.
        browser.quit()
        raise
    return browser
def reptilian(browser, pages=34, csv_path="world_colleges_dataset.csv"):
    """Scrape the ranking table page by page and save it as a CSV file.

    For each page, the current ``browser.page_source`` is parsed and every
    ``<tr>`` of the first ``<tbody>`` contributes one record: ranking, name,
    country/region (third cell) and score (fifth cell). The accumulated
    records are rewritten to *csv_path* after every page, so a failure on a
    later page still leaves the rows collected so far on disk. The pagination
    link for the following page is then clicked.

    Args:
        browser: Selenium WebDriver already showing the first ranking page.
        pages: Number of consecutive pages to scrape, starting from the page
            currently displayed. If it is zero or negative nothing is scraped
            and no file is written.
        csv_path: Destination of the CSV file (written as UTF-8 with BOM).

    Raises:
        AttributeError: If the page has no ``<tbody>`` or a row lacks the
            expected ranking/name ``<div>``.
        IndexError: If a row lacks the expected third or fifth cell.
        selenium.common.exceptions.NoSuchElementException: If the pagination
            link for the next page cannot be found.
    """
    rank_class = re.compile("ranking.*")
    # The cell selectors are anchored at the document root, so each row has
    # to be addressed by its 1-based position inside the table body.
    row_selector = "#content-box > div.rk-table-box > table > tbody > tr:nth-child({row})"

    ranks = []
    names = []
    regions = []
    scores = []

    for page_index in range(pages):
        # NOTE(review): there is no explicit wait after the pagination click
        # below; this assumes the table is already re-rendered by the time
        # page_source is read - confirm against the live site.
        soup = BeautifulSoup(browser.page_source, 'lxml')
        rows = soup.find("tbody").find_all("tr")

        for row_number, row in enumerate(rows, start=1):
            rank = row.find("div", class_=rank_class)
            ranks.append(rank.text.strip())

            name = row.find("div", class_="link-container")
            names.append(name.text.strip())

            row_path = row_selector.format(row=row_number)
            region = row.select(f"{row_path} > td:nth-child(3)")
            regions.append(region[0].text.strip())
            score = row.select(f"{row_path} > td:nth-child(5)")
            scores.append(score[0].text.strip())

        # Checkpoint: rewrite the whole file with everything gathered so far.
        data = pd.DataFrame(columns=["ranking", "name", "country", "score"])
        data["ranking"] = ranks
        data["name"] = names
        data["country"] = regions
        data["score"] = scores
        data.to_csv(csv_path, encoding='utf_8_sig')

        # After the final page there is no further page to open; looking up
        # a link for page `pages + 1` would raise NoSuchElementException.
        if page_index == pages - 1:
            break

        # Pagination items are numbered from 1, so the page after the
        # current 0-based index is `page_index + 2`.
        next_page_link = browser.find_element(
            By.CSS_SELECTOR,
            f'#content-box > ul > li.ant-pagination-item.ant-pagination-item-{page_index + 2} > a',
        )
        next_page_link.click()
if __name__ == '__main__':
    # Script entry point: open the ARWU 2021 ranking and scrape it to CSV.
    browser = create_web_driver('https://www.shanghairanking.cn/rankings/arwu/2021')
    try:
        reptilian(browser)
    finally:
        # Always end the Chrome session, even when scraping fails part-way,
        # so no browser/driver process is left running.
        browser.quit()
|