Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 11cff06

Browse files
committed
Create newhouse.py
1 parent fbe0f5b commit 11cff06

File tree

1 file changed

+97
-0
lines changed

1 file changed

+97
-0
lines changed

‎jiguang/fang/newhouse.py‎

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import random
2+
import requests
3+
from bs4 import BeautifulSoup
4+
import re
5+
import math
6+
7+
# Pool of browser User-Agent strings; create_headers() picks one at random
# for each set of request headers it builds.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
21+
22+
def create_headers():
    """Build the HTTP headers used for a scraping request.

    Returns:
        dict: a ``User-Agent`` chosen at random from ``USER_AGENTS`` and a
        fixed ``Referer`` of ``http://www.ke.com``.
    """
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "http://www.ke.com",
    }
27+
28+
class NewHouse:
    """One scraped listing: a name, a price string and a total-price string."""

    def __init__(self, xiaoqu, price, total):
        # All three values are kept exactly as passed in; text() expects
        # them to be strings.
        self.xiaoqu = xiaoqu
        self.price = price
        self.total = total

    def text(self):
        """Return the listing as one comma-separated line (no newline)."""
        fields = (self.xiaoqu, self.price, self.total)
        return ",".join(fields)
38+
39+
# Scrape the new-development ("loupan") listing pages under
# http://bj.fang.ke.com/loupan/ and write one "name,price,total" line per
# listing to newhouse.txt.
with open("newhouse.txt", "w", encoding='utf-8') as f:
    # Start collecting the listing data. total_page stays at 1 when the
    # page count cannot be read from the first page.
    total_page = 1
    loupan_list = list()
    page = 'http://bj.fang.ke.com/loupan/'
    print(page)
    headers = create_headers()
    response = requests.get(page, timeout=10, headers=headers)
    html = response.content
    soup = BeautifulSoup(html, "lxml")

    # Work out the total number of pages.
    try:
        page_box = soup.find_all('div', class_='page-box')[0]
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        matches = re.search(r'.*data-total-count="(\d+)".*', str(page_box))
        # The listing count is divided by 10, i.e. the code assumes ten
        # listings per page.
        total_page = int(math.ceil(int(matches.group(1)) / 10))
    except (IndexError, AttributeError, ValueError) as e:
        # IndexError: no page-box div was found.
        # AttributeError: the regex did not match, so ``matches`` is None.
        print(e)

    print(total_page)
    # Walk every page, from the first to the last.
    headers = create_headers()
    for i in range(1, total_page + 1):
        page = 'http://bj.fang.ke.com/loupan/pg{0}'.format(i)
        print(page)
        response = requests.get(page, timeout=10, headers=headers)
        html = response.content
        soup = BeautifulSoup(html, "lxml")

        # Each listing lives in its own <li class="resblock-list"> panel.
        house_elements = soup.find_all('li', class_="resblock-list")
        for house_elem in house_elements:
            price = house_elem.find('span', class_="number")
            desc = house_elem.find('span', class_="desc")
            total = house_elem.find('div', class_="second")
            loupan = house_elem.find('a', class_='name')

            # find() returns None when the tag is absent. A listing without
            # a name link cannot be recorded, so skip it instead of letting
            # an AttributeError abort the whole scrape.
            if loupan is None:
                print("skipping listing without a name link")
                continue

            # Clean up the extracted values; '0' marks a missing field.
            if price is not None and desc is not None:
                price = price.text.strip() + desc.text.strip()
            else:
                price = '0'

            loupan = loupan.text.replace("\n", "")

            if total is not None:
                total = total.text.strip().replace(u'总价', '')
                total = total.replace(u'/套起', '')
            else:
                total = '0'

            # Keep the listing as an object.
            loupan = NewHouse(loupan, price, total)
            print(loupan.text())
            loupan_list.append(loupan)

    for loupan in loupan_list:
        f.write(loupan.text() + "\n")
97+

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /