Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit abaf0e1

Browse files
2022年10月16日_17:56
1 parent 0b9dd8c commit abaf0e1

File tree

45 files changed

+59079
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+59079
-0
lines changed

‎.ipynb_checkpoints/20221011_Python_function-checkpoint.ipynb

Lines changed: 843 additions & 0 deletions
Large diffs are not rendered by default.

‎.ipynb_checkpoints/20221011_function-checkpoint.ipynb

Lines changed: 834 additions & 0 deletions
Large diffs are not rendered by default.

‎.ipynb_checkpoints/20221012_Python_Crawling_with_selenium,_BeautifulSoup-checkpoint.ipynb

Lines changed: 12388 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
{
2+
"cells": [
3+
# Crawling library imports.
# Standard library
import codecs  # file I/O for saving crawled text
import time    # to pause execution while pages load

# Third-party
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# `By` is required for XPATH lookups since the 2022-07 selenium update
from selenium.webdriver.common.by import By
26+
{
27+
"cell_type": "markdown",
28+
"id": "15c52a2a",
29+
"metadata": {},
30+
"source": [
31+
"## 순회 크롤러\n",
32+
"\n",
33+
"- 같은 양식의 페이지를 순회하면서 자료를 수집해오는 크롤러\n",
34+
"\n",
35+
"- 원 페이지 크롤러 제작 후 > 완성된 크롤러를 반복문에 넣어서 만든다\n",
36+
"\n",
37+
"> 반복을 어디부터 돌릴지에 대한 파악이 제일 중요!\n",
38+
"\n",
39+
"<br>\n",
40+
"\n",
41+
"- 순서\n",
42+
"\n",
43+
"1. approach N page\n",
44+
"\n",
45+
"2. source code crawling\n",
46+
"\n",
47+
"3. parsing\n",
48+
"\n",
49+
"4. data extraction\n",
50+
"\n",
51+
"5. saving in txt file\n",
52+
"\n",
53+
"6. move to number 1.\n",
54+
"\n",
55+
"> 다음페이지 버튼 XPATH 클릭으로 페이지 넘기기"
56+
]
57+
},
58+
# Traversal crawler for the Kyobo Book online bestseller list.
#
# List-style page: even when the URL does not change between pages, open
# F12 > Network tab, click the next list page, and inspect the request
# change under the Headers / Payload tabs.

# Selenium >= 4.6 resolves the chromedriver binary automatically via
# Selenium Manager; the positional 'chromedriver' path argument used here
# before is deprecated/removed in Selenium 4.x.
chrome_driver = webdriver.Chrome()

# Approach the first page of the bestseller list.
chrome_driver.get("https://product.kyobobook.co.kr/bestseller/online?period=001")

# First title seen on each page -- the loop stop condition: when a page's
# first title repeats, the pagination has wrapped around.
check_name_list = list()

rank_list = list()
title_list = list()
price_list = list()
author_list = list()

time.sleep(6)  # let the first page finish rendering

while True:

    # Scroll to the bottom so ads cannot cover the pagination controls.
    chrome_driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Crawl the current page source and parse it.
    source = chrome_driver.page_source
    html_parsed_source = BeautifulSoup(source, "html.parser")

    ####### Title
    span_prod_name = html_parsed_source.find_all("span", class_="prod_name")

    # Stop condition: the page's first title was already collected.
    if span_prod_name[0].text in check_name_list:
        # quit() ends the whole driver session; close() would only close
        # the window and leak the underlying chromedriver process.
        chrome_driver.quit()
        break
    check_name_list.append(span_prod_name[0].text)

    for title in span_prod_name:
        title_list.append(title.text)

    ######## Rank
    div_prod_rank = html_parsed_source.find_all("div", class_="prod_rank")

    for rank in div_prod_rank:
        rank_list.append(rank.text)

    ######## Price -- skip the placeholder "0" entries
    span_val = html_parsed_source.find_all("span", class_="val")

    for price in span_val:
        if price.text != "0":
            price_list.append(price.text)

    ######### Author -- keep only the name before the " ·" separator
    span_prod_author = html_parsed_source.find_all("span", class_="prod_author")

    for author in span_prod_author:
        author_list.append(author.text.split(" ·")[0])

    # Advance by clicking the "next page" button located via XPATH.
    chrome_driver.find_element(By.XPATH, '//*[@id="tabRoot"]/div[4]/div[2]/button[2]').click()

    time.sleep(6)  # wait for the next page to render

# Sanity check: every extracted column must have the same item count.
book_list = [title_list, rank_list, price_list, author_list]

for book in book_list:
    print(len(book))
152+
{
153+
"cell_type": "markdown",
154+
"id": "989f6974",
155+
"metadata": {},
156+
"source": [
157+
"> kyobo_traversal.txt\n",
158+
"\n",
159+
"<br>\n",
160+
"\n",
161+
"> kyobo_traversal.csv"
162+
]
163+
}
164+
],
165+
"metadata": {
166+
"kernelspec": {
167+
"display_name": "Python 3 (ipykernel)",
168+
"language": "python",
169+
"name": "python3"
170+
},
171+
"language_info": {
172+
"codemirror_mode": {
173+
"name": "ipython",
174+
"version": 3
175+
},
176+
"file_extension": ".py",
177+
"mimetype": "text/x-python",
178+
"name": "python",
179+
"nbconvert_exporter": "python",
180+
"pygments_lexer": "ipython3",
181+
"version": "3.9.12"
182+
}
183+
},
184+
"nbformat": 4,
185+
"nbformat_minor": 5
186+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /