爬取图片 Created 2022-01-21 | Updated 2022-05-22
Post Views: (count not preserved in this export)
页面滚动 由于很多页面都是动态加载的,在用 selenium 模拟浏览器时,如果不滚动到页面下方,那么有的页面数据就无法加载,所以需要让 selenium 执行 JS 代码,对页面进行滚动
def scroll(browser):
    """Scroll the page to the bottom in small steps so lazy-loaded content renders.

    Repeatedly scrolls down in 300px increments with a short pause between
    steps, then re-measures the document height; stops once the height no
    longer grows (i.e. nothing else was lazily loaded).

    :param browser: a selenium WebDriver instance (any object exposing
        ``execute_script``).
    """
    # Plain "return" of the document height. The original snippet used
    # "return action=document.body.scrollHeight", which also created a stray
    # global variable `action` inside the page — same value, needless leak.
    js = "return document.body.scrollHeight"
    height = 0
    new_height = browser.execute_script(js)
    while height < new_height:
        # Scroll gradually rather than jumping straight to the bottom,
        # so intersection-observer style lazy loaders actually fire.
        for k in range(height, new_height, 300):
            browser.execute_script('window.scrollTo(0, {})'.format(k))
            time.sleep(0.2)
        height = new_height
        # Lazy loading may have grown the page; measure again and keep going.
        new_height = browser.execute_script(js)
JS selenium还能执行一些javaScript的操作
# Selenium can also execute arbitrary JavaScript. Here we delete the DOM
# element with id "scroll" (presumably a floating widget — TODO confirm it is
# the overlay that intercepts clicks on the "next page" link).
# NOTE: `browser` must be a live WebDriver created elsewhere.
js='var box=document.getElementById("scroll");box.remove();'
browser.execute_script(js)
import os
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


def create_web_driver(url):
    """Open Chrome at `url` with the `navigator.webdriver` flag hidden.

    :param url: page to open first.
    :return: a maximized, ready-to-use WebDriver.
    """
    option = webdriver.ChromeOptions()
    browser = webdriver.Chrome(options=option)
    # Injected before every page load: hides the automation flag so naive
    # bot detection does not block us.
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    browser.get(url)
    browser.maximize_window()
    return browser


def login(browser, user_name, password):
    """Log into the site through its login form.

    Clicks the login entry point, fills the credential fields (WordPress-style
    `log` / `pwd` / `wp-submit` names) and submits.
    """
    wait = WebDriverWait(browser, 10)
    login_button = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#user-profile > div.nav-set > div.nav-login > div')))
    login_button.click()
    wait.until(EC.element_to_be_clickable((By.NAME, 'log'))).send_keys(user_name)
    browser.find_element(By.NAME, "pwd").send_keys(password)
    browser.find_element(By.NAME, 'wp-submit').click()
    # Give the post-login redirect time to finish before the caller proceeds.
    time.sleep(5)


def scroll(browser):
    """Scroll to the bottom in 300px steps so lazily loaded images render."""
    # Plain "return" of the document height (the original
    # "return action=..." also leaked a global `action` into the page).
    js = "return document.body.scrollHeight"
    height = 0
    new_height = browser.execute_script(js)
    while height < new_height:
        for k in range(height, new_height, 300):
            browser.execute_script('window.scrollTo(0, {})'.format(k))
            time.sleep(0.2)
        height = new_height
        # Lazy loading may have grown the page; measure again.
        new_height = browser.execute_script(js)


def spider(browser, model_name):
    """Search `model_name` on the site and download every image of every result.

    Walks the paginated search results (24 articles per page). For each
    article it opens every sub-page, saving each ``<img src>`` found inside
    ``div.single-content`` to ``./download/<model_name>/<j>.jpg`` with a
    global running counter ``j``.

    :param browser: logged-in WebDriver positioned on the site's home page.
    :param model_name: search keyword; also used as the download folder name.
    """
    # Create the target directory up front — otherwise the first open() for
    # writing would raise FileNotFoundError.
    os.makedirs(f'./download/{model_name}', exist_ok=True)
    wait = WebDriverWait(browser, 10)
    # Open the search box and submit the query.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#navigation-top > span'))).click()
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#so'))).send_keys(model_name)
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#searchsubmit-so'))).click()
    j = 0  # global image counter, doubles as the saved file name
    while True:  # one iteration per search-result page
        i = 0
        while True:  # one iteration per article on the current page
            scroll(browser)
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, f'#main > article:nth-child({i + 1})'))).click()
            i = i + 1
            x = 0
            while True:  # one iteration per sub-page of the article
                x = x + 1
                web_data = browser.page_source
                soup = BeautifulSoup(web_data, 'lxml')
                li = soup.find("div", class_="single-content")
                try:
                    for img in li.select("img[src]"):
                        j = j + 1
                        download_url = img["src"]
                        # Timeout so one stalled download cannot hang the crawl.
                        pic_data = requests.get(download_url, timeout=30).content
                        with open(f'./download/{model_name}/{j}.jpg', mode='wb') as f:
                            f.write(pic_data)
                    # Selenium 4 removed find_element_by_link_text; use the
                    # By.LINK_TEXT locator like the rest of this file.
                    browser.find_element(By.LINK_TEXT, f'{x + 1}').click()
                except (NoSuchElementException, AttributeError, requests.RequestException):
                    # No "next sub-page" link (or no content div, or a failed
                    # download): this article is done. The original bare
                    # `except:` is narrowed to the errors that can occur here.
                    break
            # Walk back through the sub-pages to the result list.
            for m in range(x):
                browser.back()
            if i == 24:
                # 24 articles per result page — go to the next page.
                break
        scroll(browser)
        # Remove the floating element that would intercept the click on the
        # pagination link.
        js = 'var box=document.getElementById("scroll");box.remove();'
        browser.execute_script(js)
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//a[@class='next page-numbers']"))).click()