Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 4a927af

Browse files
committed
Create sechouse.py
1 parent 50f7878 commit 4a927af

File tree

1 file changed

+152
-0
lines changed

1 file changed

+152
-0
lines changed

jiguang/fang/sechouse.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import random
2+
import requests
3+
from bs4 import BeautifulSoup
4+
import re
5+
import math
6+
from lxml import etree
7+
8+
# Pool of desktop browser User-Agent strings; one is picked at random per
# request so the crawler's traffic looks less uniform.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# Module-level caches, filled in by get_districts() / get_areas():
# district slug -> Chinese district name
chinese_city_district_dict = {}
# area (board) slug -> Chinese area name
chinese_area_dict = {}


def create_headers():
    """Return HTTP headers for a ke.com request.

    Uses a randomly chosen User-Agent from USER_AGENTS and a fixed
    Referer pointing at the site root.
    """
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "http://www.ke.com",
    }
30+
31+
class SecHouse(object):
    """A single second-hand house listing scraped from ke.com."""

    def __init__(self, district, area, name, price, desc, pic):
        """Store the listing fields as-is (all expected to be strings)."""
        self.district = district
        self.area = area
        self.price = price
        self.name = name
        self.desc = desc
        self.pic = pic

    def text(self):
        """Serialize the listing as one comma-separated line."""
        fields = (
            self.district,
            self.area,
            self.name,
            self.price,
            self.desc,
            self.pic,
        )
        return ",".join(fields)
47+
48+
def get_districts():
    """Fetch the list of Beijing district slugs from ke.com.

    Scrapes https://bj.ke.com/xiaoqu/ and records each English slug ->
    Chinese display name pair in the module-level
    chinese_city_district_dict.

    Returns:
        list: district slugs (last path segment of each district link).
    """
    url = 'https://bj.ke.com/xiaoqu/'
    headers = create_headers()
    response = requests.get(url, timeout=10, headers=headers)
    html = response.content
    root = etree.HTML(html)
    # BUG FIX: the original expression began with '///', which is not valid
    # XPath and makes root.xpath() raise XPathEvalError; '//' matches the
    # analogous query used in get_areas().
    elements = root.xpath('//div[3]/div[1]/dl[2]/dd/div/div/a')
    en_names = list()
    ch_names = list()
    for element in elements:
        link = element.attrib['href']
        # The slug is the last non-empty path segment, e.g. ".../dongcheng/".
        en_names.append(link.split('/')[-2])
        ch_names.append(element.text)

    # Record slug -> Chinese name so callers can display localized labels.
    for index, name in enumerate(en_names):
        chinese_city_district_dict[name] = ch_names[index]
    return en_names
66+
67+
def get_areas(district):
    """Fetch the area (board) slugs belonging to one district from ke.com.

    Also records each slug -> Chinese name pair in the module-level
    chinese_area_dict.

    Args:
        district: district slug as returned by get_districts().

    Returns:
        list: area slugs, excluding the district slug itself. On any
        request/parsing error an empty (or partial) list is returned —
        the original implicitly returned None on error, which crashed
        the caller's ``for area in arealist`` loop.
    """
    page = "http://bj.ke.com/xiaoqu/{0}".format(district)
    areas = list()
    try:
        headers = create_headers()
        response = requests.get(page, timeout=10, headers=headers)
        html = response.content
        root = etree.HTML(html)
        links = root.xpath('//div[3]/div[1]/dl[2]/dd/div/div[2]/a')

        # Process each <a> tag pointing at an area page.
        for link in links:
            relative_link = link.attrib['href']
            # Drop the trailing "/".
            relative_link = relative_link[:-1]
            # Take the last path segment as the slug.
            area = relative_link.split("/")[-1]
            # Skip the district's own slug to avoid a duplicate entry.
            if area != district:
                chinese_area = link.text
                chinese_area_dict[area] = chinese_area
                areas.append(area)
    except Exception as e:
        # Best-effort: log and fall through so callers always get a list.
        print(e)
    return areas
93+
94+
with open("sechouse.txt", "w", encoding='utf-8') as f:
    # Crawl every district/area on bj.ke.com and dump all second-hand house
    # listings, one comma-separated line each, into sechouse.txt.
    total_page = 1
    sec_house_list = list()
    districts = get_districts()
    for district in districts:
        arealist = get_areas(district)
        for area in arealist:
            # Chinese display names for the current district / area slugs.
            chinese_district = chinese_city_district_dict.get(district, "")
            chinese_area = chinese_area_dict.get(area, "")
            page = 'http://bj.ke.com/ershoufang/{0}/'.format(area)
            print(page)
            headers = create_headers()
            response = requests.get(page, timeout=10, headers=headers)
            html = response.content
            soup = BeautifulSoup(html, "lxml")

            # Determine how many result pages this area has.
            # BUG FIX: reset to 1 for every area — previously a parse
            # failure silently reused the previous area's page count.
            total_page = 1
            try:
                page_box = soup.find_all('div', class_='page-box')[0]
                # Raw string: '\d' in a plain literal is an invalid escape
                # (SyntaxWarning on modern Python).
                matches = re.search(r'.*data-total-count="(\d+)".*', str(page_box))
                total_page = int(math.ceil(int(matches.group(1)) / 10))
            except Exception as e:
                print(e)

            print(total_page)
            # Walk every result page from the first to the last.
            headers = create_headers()
            for i in range(1, total_page + 1):
                page = 'http://bj.ke.com/ershoufang/{0}/pg{1}'.format(area, i)
                print(page)
                response = requests.get(page, timeout=10, headers=headers)
                html = response.content
                soup = BeautifulSoup(html, "lxml")

                # Each listing sits in an <li class="clear"> panel.
                house_elements = soup.find_all('li', class_="clear")
                for house_elem in house_elements:
                    price = house_elem.find('div', class_="totalPrice")
                    name = house_elem.find('div', class_='title')
                    desc = house_elem.find('div', class_="houseInfo")
                    pic = house_elem.find('a', class_="img").find('img', class_="lj-lazy")

                    # Clean up the extracted fields.
                    price = price.text.strip()
                    name = name.text.replace("\n", "")
                    desc = desc.text.replace("\n", "").strip()
                    # NOTE(review): assumes 'data-original' is always present
                    # on the lazy-loaded <img>; .get() returning None would
                    # crash on .strip() — confirm against live markup.
                    pic = pic.get('data-original').strip()

                    # Keep as an object for the final write-out.
                    sec_house = SecHouse(chinese_district, chinese_area, name, price, desc, pic)
                    print(sec_house.text())
                    sec_house_list.append(sec_house)

    for sec_house in sec_house_list:
        f.write(sec_house.text() + "\n")
152+

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /