# Crawl rental-housing listing data through the home-page interface of a rental website.
import codecs
import csv
import re
import time

import requests
from bs4 import BeautifulSoup

# District slugs substituted into the listing URL; one crawl is run per entry.
DISTRICTS = [
    'jingan', 'xuhui', 'huangpu', 'changning', 'putuo', 'pudong',
    'baoshan', 'hongkou', 'yangpu', 'minhang', 'jinshan', 'jiading',
    'chongming', 'fengxian', 'songjiang', 'qingpu',
]

BASE_URL = 'https://sh.lianjia.com'
# First placeholder is the district slug, second is the 1-based page number.
# NOTE(review): the trailing "rco11" is presumably a sort/filter flag of the
# site; it is reproduced unchanged from the original URLs.
PAGE_URL_TEMPLATE = BASE_URL + '/zufang/{}/pg{}rco11/'
OUTPUT_PATH = r'..\document\sh.csv'
# Browser-like User-Agent sent with every request.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.9 Safari/537.36'}
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Pause between page requests, in seconds (0 keeps the original pacing).
REQUEST_DELAY = 0


def build_page_url(district, page):
    """Return the listing URL for *page* (1-based) of *district*."""
    return PAGE_URL_TEMPLATE.format(district, page)


def fetch_soup(url):
    """Download *url* and return it parsed with the 'html.parser' backend.

    Raises requests.HTTPError on a 4xx/5xx answer so that an error page is
    not mistaken for a page without listings.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def total_pages(soup):
    """Return the page count read from the pagination element of *soup*.

    Falls back to 1 when the element or its 'data-totalpage' attribute is
    missing or not an integer, so a district with a single page (or an
    unexpected layout) does not abort the whole run.
    """
    pager = soup.find('div', attrs={'class': 'content__pg'})
    if pager is None:
        return 1
    try:
        return int(pager.attrs['data-totalpage'])
    except (KeyError, ValueError):
        return 1


def collect_page_urls(districts):
    """Return every listing-page URL for *districts*, in crawl order.

    The first page of each district is fetched once here only to learn how
    many pages that district has.
    """
    urls = []
    for district in districts:
        first_page = build_page_url(district, 1)
        urls.append(first_page)
        pages = total_pages(fetch_soup(first_page))
        urls.extend(build_page_url(district, page) for page in range(2, pages + 1))
    return urls


def split_description(mixed):
    """Split a '/'-separated description into its four reported fields.

    Returns ``(address, area, door_orientation, style)``: the first, second,
    third and last segment, each stripped.  Segments that are absent yield
    an empty string instead of raising IndexError.
    """
    parts = [part.strip() for part in mixed.split('/')]
    address = parts[0]
    area = parts[1] if len(parts) > 1 else ''
    door_orientation = parts[2] if len(parts) > 2 else ''
    style = parts[-1]
    return address, area, door_orientation, style


def extract_group(title):
    """Return the first whitespace-separated token of *title* minus its
    first three characters, or '' for a blank title.

    NOTE(review): the three dropped characters are presumably a rental-type
    prefix in front of the community name -- confirm against live data.
    """
    tokens = title.split()
    return tokens[0][3:] if tokens else ''


def extract_region(address):
    """Return the part of *address* before the first '-'."""
    return address.split('-')[0]


def _node_text(node, tag, css_class):
    """Return the text of the first matching descendant, or '' if absent."""
    found = node.find(tag, {'class': css_class})
    return found.get_text() if found is not None else ''


def parse_listing(info):
    """Build one CSV row (a 10-tuple) from a listing item element.

    Column order: house_url, title, group, price, area, address,
    door_orientation, style, tag, region.
    """
    link = info.a
    href = link.get('href', '') if link is not None else ''
    house_url = BASE_URL + href

    title_node = info.find('p', {'class': 'content__list--item--title'})
    title_link = title_node.find('a') if title_node is not None else None
    title = title_link.get_text().strip() if title_link is not None else ''

    # Price and tag text are written unstripped, exactly as extracted.
    price = _node_text(info, 'span', 'content__list--item-price')
    tag = _node_text(info, 'p', 'content__list--item--bottom oneline')
    mixed = _node_text(info, 'p', 'content__list--item--des')
    address, area, door_orientation, style = split_description(mixed)

    return (house_url, title, extract_group(title), price, area, address,
            door_orientation, style, tag, extract_region(address))


def scrape_page(url, writer):
    """Write one row per listing found at *url*; return the listing count."""
    container = fetch_soup(url).find('div', {'class': 'content__list'})
    if container is None:
        # No listing container on this page: nothing to write.
        return 0
    listings = container.find_all('div', {'class': 'content__list--item'})
    for listing in listings:
        writer.writerow(parse_listing(listing))
    return len(listings)


def main():
    """Crawl every district and write all listings to OUTPUT_PATH."""
    print("****处理开始****")
    # 'utf-8-sig' makes Python emit the BOM itself.  Writing the BOM through
    # a separate open() and then reopening the path in a truncating mode
    # discarded it again.  The with-block also closes the file on errors.
    with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        urls = collect_page_urls(DISTRICTS)
        print(urls)

        for num, url in enumerate(urls, 1):
            print("正在处理第{}页数据...".format(num))
            count = scrape_page(url, writer)
            time.sleep(REQUEST_DELAY)
            print("第{}页数据处理完毕,共{}条数据。".format(num, count))

    print("****全部完成****")


if __name__ == "__main__":
    main()
0 commit comments