Chinese University Data Analysis Created 2022-01-06 | Updated 2022-05-22
Selenium is inefficient, but it has the advantage that anything visible in the browser can be scraped. BeautifulSoup extracts data from HTML quickly, but on its own it cannot handle dynamically rendered pages.
So I combine Selenium and BeautifulSoup to scrape dynamic pages efficiently:
first drive the page with Selenium until the target content is rendered, then grab the HTML with browser.page_source;
pass that source to BeautifulSoup for fast parsing.
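In skeleton form the handoff looks like this (a minimal sketch; the URL and the tag being extracted are placeholders):

```python
from selenium import webdriver
from bs4 import BeautifulSoup

# Let Selenium render the dynamic page...
browser = webdriver.Chrome()
browser.get("https://example.com/dynamic-page")  # placeholder URL

# ...then hand the rendered HTML to BeautifulSoup for fast parsing
soup = BeautifulSoup(browser.page_source, 'lxml')
for span in soup.find_all('span'):  # placeholder extraction
    print(span.text)
browser.quit()
```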
The Chinese universities project below serves as a full worked example. First, create the database and table:
```sql
create database collegesdb charset utf8;
use collegesdb;

create table t(
    ranking int,
    name VARCHAR(20),
    abroad_rate float(10,1),
    employment_rate float(10,1),
    numberOfGraduate int,
    numberOfUndergraduate int
)charset=utf8;
```
Scraping

```python
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pymysql
from bs4 import BeautifulSoup
import re

mydb = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    db="collegesdb"
)
mycursor = mydb.cursor()


def create_web_driver(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # Block image loading to speed up page rendering
    prefs = {'profile.default_content_setting_values': {'images': 2}}
    options.add_experimental_option('prefs', prefs)
    browser = webdriver.Chrome(options=options)
    # Mask the webdriver flag so the site does not detect automation
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    browser.get(url)
    return browser


def reptilian(browser):
    # Scrape 20 result pages with 30 universities each
    pages = 20
    items = 30
    row = []
    wait = WebDriverWait(browser, 5)

    def append_item(typeOfElement, regEx_name):
        # Each statistic is a label element followed by a sibling value element
        try:
            label = soup.find(typeOfElement, string=re.compile(regEx_name))
            value = label.find_next_sibling(typeOfElement)
            row.append(value.text)
        except AttributeError:
            row.append("-1")  # sentinel for missing data, removed during cleansing

    for j in range(pages):
        for i in range(items):
            # Open the detail page of the (i+1)-th university in the ranking table
            item = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,
                f'#content-box > div.rk-table-box > table > tbody > tr:nth-child({i+1}) > td.align-left > div > div.univname > div:nth-child(1) > div > div > a')))
            item.click()
            ranking = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
                '#bcur_latest > div.bcur-latest > div > div.icon-container > span')))
            # Hand the rendered HTML over to BeautifulSoup for fast parsing
            web_data = browser.page_source
            soup = BeautifulSoup(web_data, 'lxml')
            row.append(ranking.text)
            name = soup.select("#univ_name > span.name-cn")
            row.append(name[0].text)
            append_item("span", "^出国深造率")  # rate of going abroad
            append_item("span", "^研究生数")    # number of graduate students
            append_item("span", "^本科生数")    # number of undergraduate students
            append_item("span", "^就业率")      # employment rate
            sql = ("INSERT INTO t (ranking, name, abroad_rate, numberOfGraduate, "
                   "numberOfUndergraduate, employment_rate) VALUES (%s, %s, %s, %s, %s, %s)")
            print(row)
            mycursor.execute(sql, row)
            mydb.commit()
            row.clear()
            browser.back()
        try:
            # Click the button for the next page (buttons are labelled from 1, so page j+2)
            next_page = browser.find_element(By.CSS_SELECTOR,
                '#content-box > ul > li.ant-pagination-item.ant-pagination-item-' + str(j + 2) + ' > a')
            next_page.click()
            # Scroll back to the top of the new page
            browser.execute_script("document.documentElement.scrollTop = 0")
        except NoSuchElementException:
            break  # no further pages
    browser.close()


if __name__ == "__main__":
    browser = create_web_driver("https://www.shanghairanking.cn/rankings/bcur/2021")
    reptilian(browser)
```
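The append_item helper assumes each statistic on the detail page is rendered as a label span followed by a sibling value span. A standalone sketch of that pattern, on a hypothetical HTML fragment:

```python
from bs4 import BeautifulSoup
import re

# Hypothetical fragment mimicking the detail-page layout
html = '<div><span>就业率</span><span>95.3%</span></div>'
soup = BeautifulSoup(html, 'lxml')

label = soup.find('span', string=re.compile('^就业率'))  # match the label text
value = label.find_next_sibling('span')                  # the value sits next to it
print(value.text)  # 95.3%
```

If the label is absent, find returns None and find_next_sibling raises AttributeError, which is why the helper falls back to the -1 sentinel.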
Data Cleansing

```python
# data_cleaning2.py -- the analysis step imports the cleaned df from this module
from sqlalchemy import create_engine
import pandas as pd

db_connection_str = 'mysql+pymysql://root:123456@localhost/collegesdb'
db_connection = create_engine(db_connection_str)
df = pd.read_sql('SELECT * FROM t', con=db_connection)


def drop_row(field):
    # Drop rows where the scraper stored the -1 sentinel for missing data
    for x in df.index:
        if df.loc[x, field] == -1:
            df.drop(x, inplace=True)


# Clean at import time so `from data_cleaning2 import df` yields the cleaned frame
drop_row("ranking")
drop_row("abroad_rate")
drop_row("employment_rate")
drop_row("numberOfGraduate")
```
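Dropping rows one index at a time works, but pandas can filter all four sentinel columns in a single vectorized step; a sketch of the equivalent operation:

```python
# Equivalent vectorized cleaning: keep only rows with no -1 sentinel
fields = ["ranking", "abroad_rate", "employment_rate", "numberOfGraduate"]
df = df[(df[fields] != -1).all(axis=1)]
```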
Data Analysis

```python
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from data_cleaning2 import df  # the cleaned DataFrame from the previous step

plt.rcParams['font.sans-serif'] = [u'SimHei']  # a font that can render Chinese characters
plt.rcParams['axes.unicode_minus'] = False


def scatter(x_name_in_df, y_name_in_df, xlabel, ylabel, color):
    plt.scatter(x=df[x_name_in_df], y=df[y_name_in_df], color=color)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Place an x-axis tick every 50 ranking places
    plt.gca().xaxis.set_major_locator(ticker.MultipleLocator(50))
    plt.show()


if __name__ == "__main__":
    scatter("ranking", 'numberOfUndergraduate', 'University rankings', 'Number of undergraduate students', 'blue')
    scatter("ranking", 'numberOfGraduate', 'University rankings', 'Number of graduate students', 'black')
    scatter("ranking", 'employment_rate', 'University rankings', 'Employment rate', 'green')
    scatter("ranking", 'abroad_rate', 'University rankings', 'Rate of going abroad', 'red')
```
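To quantify what the scatter plots suggest, the Pearson correlation between ranking and each metric can be computed directly from the cleaned frame (a sketch, assuming the columns were stored as numeric types, which the table schema guarantees):

```python
# Correlation of ranking against each scraped metric
for col in ["numberOfUndergraduate", "numberOfGraduate", "employment_rate", "abroad_rate"]:
    print(col, df["ranking"].corr(df[col]))
```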