1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
import csv
import sys
import time

import parsel
import requests
def spider(max_pages=None):
    """Crawl Lianjia Shenzhen second-hand listings and append each one to the CSV.

    Walks the paginated index at ``https://sz.lianjia.com/ershoufang/pg{N}/``
    starting from page 1, follows every listing link found on each index
    page, extracts ten fields from the detail page, prints them separated by
    ``|`` and passes the row to ``write_csv``.

    The crawl ends when an index page yields no listing link that has not
    already been visited, or once ``max_pages`` index pages have been read.

    Args:
        max_pages: Optional upper bound on the number of index pages to
            visit. ``None`` (the default) means no bound.

    Raises:
        requests.RequestException: If an index page cannot be fetched.
            Failures on individual detail pages are reported on stderr and
            that listing is skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    }
    # CSS selectors applied to each detail page, in CSV column order:
    # title, price, price_per_meter, Type, area, direction, furnished, lift,
    # floors, year.
    field_selectors = (
        '.title h1::text',
        'span.total::text',
        '.unitPrice span::text',
        '.room div.mainInfo::text',
        '.base > div.content > ul > li:nth-child(3)::text',
        '.base > div.content > ul > li:nth-child(7)::text',
        '.base > div.content > ul > li:nth-child(9) > span::text',
        '.base > div.content > ul > li:nth-child(12) > span::text',
        '.base > div.content > ul > li:nth-child(2)::text',
        '.subInfo.noHidden::text',
    )
    timeout = 10  # seconds; without one a stalled connection blocks forever
    seen_links = set()
    page = 1

    while max_pages is None or page <= max_pages:
        time.sleep(1)  # pause between index pages
        url = f'https://sz.lianjia.com/ershoufang/pg{page}/'

        # Send the request, then pull the listing links out of the HTML.
        response = requests.get(url=url, headers=headers, timeout=timeout)
        selector = parsel.Selector(response.text)
        links = selector.css(
            'div.leftContent li div.info.clear div.title a::attr(href)'
        ).getall()

        # dict.fromkeys drops duplicates within the page while keeping order;
        # seen_links drops listings already handled on earlier pages.
        new_links = [
            link for link in dict.fromkeys(links) if link not in seen_links
        ]
        if not new_links:
            # Nothing new on this page: treat it as the end of the results
            # instead of looping forever.
            print(f'no new listings on page {page}; stopping', file=sys.stderr)
            break
        seen_links.update(new_links)

        for link in new_links:
            try:
                html_data = requests.get(
                    url=link, headers=headers, timeout=timeout
                ).text
            except requests.RequestException as exc:
                print(f'skipping {link}: {exc}', file=sys.stderr)
                continue

            detail = parsel.Selector(html_data)
            # A selector that matches nothing contributes None for that field.
            data = [detail.css(css).get() for css in field_selectors]
            try:
                print(*data, sep='|')
                write_csv(data)
            except Exception as exc:
                # Best effort per listing: one row that cannot be printed or
                # written must not abort the crawl, but it is reported
                # rather than silently dropped. KeyboardInterrupt is not
                # caught here, so Ctrl+C still stops the crawl.
                print(f'could not record {link}: {exc!r}', file=sys.stderr)

        page += 1
def write_csv(data_row, path="house1.csv"):
    """Append one row to a CSV file.

    Args:
        data_row: Iterable of cell values making up the new row.
        path: File to append to. Defaults to ``house1.csv`` in the current
            working directory; the file is created if it does not exist.
    """
    # newline='' leaves line-ending handling to the csv module. The encoding
    # is pinned to UTF-8 so non-ASCII listing text is written the same way on
    # every platform instead of depending on the locale's default codec.
    with open(path, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(data_row)
if __name__ == '__main__':
    # Script entry point: make sure the CSV has its header row, then crawl.
    # Column names, in the same order as the rows spider() writes.
    header = (
        "title",
        "price",
        "price_per_meter",
        "Type",
        "area",
        "direction",
        "furnished",
        "lift",
        "floors",
        "year",
    )
    path = "house1.csv"
    with open(path, 'a', newline='', encoding='utf-8') as f:
        # A stream opened for appending is positioned at the end of the file,
        # so position 0 means the file is new or empty. Writing the header
        # only then keeps re-runs from inserting header rows between data.
        if f.tell() == 0:
            csv.writer(f).writerow(header)
    # The header file is closed (and flushed) before the crawl appends rows.
    spider()
|