Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b525c63

Browse files
抓取OSChina
抓取ImportNew 抓取CNBlog 抓取InfoQ
1 parent 5296460 commit b525c63

File tree

3 files changed

+221
-2
lines changed

3 files changed

+221
-2
lines changed

‎src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt‎

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,31 @@ class KnowledgeCrawController {
4040
return "DONE"
4141
}
4242

43+
/**
 * Starts the ImportNew crawl on a freshly created thread and answers immediately.
 *
 * @return the literal string "DONE", returned as soon as the thread has been started;
 *         it does not indicate that the crawl itself has finished
 */
@GetMapping("/knowledge/doCrawImportNewKnowledge")
fun doCrawImportNewKnowledge(): String {
    val crawlTask = Runnable { CrawKnowledgeService.doCrawImportNewKnowledge() }
    Thread(crawlTask).start()
    return "DONE"
}
51+
52+
/**
 * Starts the CNBlog crawl on a freshly created thread and answers immediately.
 *
 * @return the literal string "DONE", returned as soon as the thread has been started;
 *         it does not indicate that the crawl itself has finished
 */
@GetMapping("/knowledge/doCrawCNBlogKnowledge")
fun doCrawCNBlogKnowledge(): String {
    val crawlTask = Runnable { CrawKnowledgeService.doCrawCNBlogKnowledge() }
    Thread(crawlTask).start()
    return "DONE"
}
60+
61+
/**
 * Starts the InfoQ crawl on a freshly created thread and answers immediately.
 *
 * @return the literal string "DONE", returned as soon as the thread has been started;
 *         it does not indicate that the crawl itself has finished
 */
@GetMapping("/knowledge/doCrawInfoQKnowledge")
fun doCrawInfoQKnowledge(): String {
    val crawlTask = Runnable { CrawKnowledgeService.doCrawInfoQKnowledge() }
    Thread(crawlTask).start()
    return "DONE"
}
69+
4370
}

‎src/main/kotlin/com/light/saber/service/CrawKnowledgeService.kt‎

Lines changed: 191 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,197 @@ class CrawKnowledgeService {
6262
}
6363
}
6464

65+
/**
 * Launches one crawl coroutine on [CommonPool] for each ImportNew listing page in 1..135.
 *
 * Returns once the coroutines have been launched; it does not wait for them to finish.
 * A failure while crawling a page is caught inside that page's coroutine and printed,
 * so it does not affect the other pages.
 */
fun doCrawImportNewKnowledge() {
    for (page in 1..135) {
        launch(CommonPool) {
            // The handler has to live inside the coroutine: a try/catch wrapped around
            // launch() returns before the crawl runs and never sees its exceptions.
            try {
                crawImportNew(page)
            } catch (e: Exception) {
                println("crawImportNew failed for page $page: $e")
            }
        }
    }
}
76+
77+
/**
 * Launches one crawl coroutine on [CommonPool] for each ITEye listing page in 1..10000.
 *
 * Returns once the coroutines have been launched; it does not wait for them to finish.
 * A failure while crawling a page is caught inside that page's coroutine and printed,
 * so it does not affect the other pages.
 */
fun doCrawITEyeKnowledge() {
    for (page in 1..10000) {
        launch(CommonPool) {
            // The handler has to live inside the coroutine: a try/catch wrapped around
            // launch() returns before the crawl runs and never sees its exceptions.
            try {
                crawITEye(page)
            } catch (e: Exception) {
                println("crawITEye failed for page $page: $e")
            }
        }
    }
}
88+
89+
/**
 * Launches one crawl coroutine on [CommonPool] for each CNBlog listing page in 1..200.
 *
 * Returns once the coroutines have been launched; it does not wait for them to finish.
 * A failure while crawling a page is caught inside that page's coroutine and printed,
 * so it does not affect the other pages.
 */
fun doCrawCNBlogKnowledge() {
    for (page in 1..200) {
        launch(CommonPool) {
            // The handler has to live inside the coroutine: a try/catch wrapped around
            // launch() returns before the crawl runs and never sees its exceptions.
            try {
                crawCNBlog(page)
            } catch (e: Exception) {
                println("crawCNBlog failed for page $page: $e")
            }
        }
    }
}
100+
101+
/**
 * Launches one crawl coroutine on [CommonPool] for each InfoQ listing page index in 0..40.
 *
 * Returns once the coroutines have been launched; it does not wait for them to finish.
 * A failure while crawling a page is caught inside that page's coroutine and printed,
 * so it does not affect the other pages.
 */
fun doCrawInfoQKnowledge() {
    for (page in 0..40) {
        launch(CommonPool) {
            // The handler has to live inside the coroutine: a try/catch wrapped around
            // launch() returns before the crawl runs and never sees its exceptions.
            try {
                crawInfoQ(page)
            } catch (e: Exception) {
                println("crawInfoQ failed for page $page: $e")
            }
        }
    }
}
112+
113+
/**
 * Crawls one InfoQ (cn) Java article listing page and saves each article whose URL
 * is not stored yet.
 *
 * @param page listing page index; the listing URL is built with an offset of `page * 12`
 */
private fun crawInfoQ(page: Int) {
    val pageUrl = "http://www.infoq.com/cn/java/articles/${page * 12}"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    // Parse with the listing URL as base URI so relative links can be resolved below.
    val document = Jsoup.parse(listHtml, pageUrl)
    // Sample of the anchor inside each entry:
    // <a href="/cn/articles/Reactive-Systems-Akka-Actors-DomainDrivenDesign" title="...">...</a>
    document.getElementsByClass("news_type2 full_screen").forEach { entry ->
        // Everything for one entry runs inside the try so that a malformed entry
        // (missing children) skips only that entry instead of aborting the page.
        try {
            val link = entry.child(1).child(0)
            // The href is site-relative (see the sample above); an absolute URL is
            // needed both to fetch the article and to store a usable link.
            val url = link.absUrl("href")
            val title = link.html()
            // absUrl yields an empty string when the link cannot be made absolute.
            if (url.isNotEmpty() && KnowledgeDao.countByUrl(url) == 0) {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val content = 获取InfoQ文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            }
        } catch (e: Exception) {
            // Best effort: report the failure and continue with the next entry.
            println("crawInfoQ: failed to process an entry on $pageUrl: $e")
        }
    }
}
141+
142+
/**
 * Extracts the inner HTML of the first element matched by the class lookup
 * "text_info text_info_article" in an InfoQ article document.
 *
 * @param infoQ文章Document parsed article page, or null
 * @return the element's inner HTML, or null when the document is null
 */
private fun 获取InfoQ文章内容(infoQ文章Document: Document?): String? {
    if (infoQ文章Document == null) return null
    val matches = infoQ文章Document.getElementsByClass("text_info text_info_article")
    // NOTE(review): get(0) is used, so an empty match presumably throws instead of
    // yielding null — callers in this file wrap the call in a try/catch.
    return matches?.get(0)?.html()
}
145+
146+
/**
 * Crawls one CNBlog listing page and saves each linked article whose URL is not stored yet.
 *
 * @param page listing page number, appended to the listing URL as `#p<page>`
 */
private fun crawCNBlog(page: Int) {
    // NOTE(review): the page number sits in the URL fragment, which is normally not sent
    // to the server; unless CrawlerWebClient evaluates it client-side, every page may
    // return the same listing — confirm.
    val pageUrl = "https://www.cnblogs.com/#p$page"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listHtml)
    // Sample of a matched anchor:
    // <a class="titlelnk" href="https://www.cnblogs.com/qzrzq1/p/9069509.html" target="_blank">...</a>
    document.getElementsByClass("titlelnk").forEach { link ->
        val url = link.attr("href")
        val title = link.html()
        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val content = 获取CNBlog文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best effort: report the failure and continue with the next article.
                println("crawCNBlog: failed to crawl $url: $e")
            }
        }
    }
}
174+
175+
/**
 * Extracts the inner HTML of the element with id "cnblogs_post_body" from a CNBlog
 * article document.
 *
 * @param cnBlog文章Document parsed article page, or null
 * @return the element's inner HTML, or null when the document or the element is missing
 */
private fun 获取CNBlog文章内容(cnBlog文章Document: Document?): String? {
    if (cnBlog文章Document == null) return null
    val postBody = cnBlog文章Document.getElementById("cnblogs_post_body")
    return postBody?.html()
}
178+
179+
/**
 * Crawls one ITEye "language" category listing page and saves each linked article
 * whose URL is not stored yet.
 *
 * @param page listing page number, passed as the `page` query parameter
 */
private fun crawITEye(page: Int) {
    val pageUrl = "http://www.iteye.com/blogs/category/language?page=$page"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listHtml)

    // Sample of the anchor inside each "content" element:
    // <a href="http://fhuan123.iteye.com/blog/2423594" title="..." target="_blank">...</a>
    document.getElementsByClass("content").forEach { entry ->
        // Everything for one entry runs inside the try so that a "content" element
        // without the expected children skips only that entry, not the whole page.
        try {
            val link = entry.child(0).child(0)
            val url = link.attr("href")
            val title = link.html()
            if (KnowledgeDao.countByUrl(url) == 0) {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val content = 获取ITEye文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            }
        } catch (e: Exception) {
            // Best effort: report the failure and continue with the next entry.
            println("crawITEye: failed to process an entry on $pageUrl: $e")
        }
    }
}
208+
209+
/**
 * Extracts the inner HTML of the element with id "blog_content" from an ITEye
 * article document.
 *
 * @param itEye文章Document parsed article page, or null
 * @return the element's inner HTML, or null when the document or the element is missing
 */
private fun 获取ITEye文章内容(itEye文章Document: Document?): String? {
    if (itEye文章Document == null) return null
    val blogContent = itEye文章Document.getElementById("blog_content")
    return blogContent?.html()
}
212+
213+
/**
 * Crawls one ImportNew "all posts" listing page and saves each linked article whose
 * URL is not stored yet. The title is read from the article page, not from the listing.
 *
 * @param page listing page number, appended to the listing URL path
 */
private fun crawImportNew(page: Int) {
    val pageUrl = "http://www.importnew.com/all-posts/page/$page"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listHtml)
    // Sample of a matched anchor:
    // <a class="meta-title" target="_blank" href="http://www.importnew.com/28577.html" title="...">...</a>
    document.getElementsByClass("meta-title").forEach { link ->
        val url = link.attr("href")
        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val title = 获取ImportNew文章标题(articleDocument)
                val content = 获取ImportNew文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best effort: report the failure and continue with the next article.
                println("crawImportNew: failed to crawl $url: $e")
            }
        }
    }
}
241+
242+
/**
 * Extracts the inner HTML of the first element with class "entry" from an ImportNew
 * article document.
 *
 * @param importNew文章Document parsed article page, or null
 * @return the element's inner HTML, or null when the document is null
 */
private fun 获取ImportNew文章内容(importNew文章Document: Document?): String? {
    if (importNew文章Document == null) return null
    val entries = importNew文章Document.getElementsByClass("entry")
    // NOTE(review): get(0) is used, so an empty match presumably throws instead of
    // yielding null — the caller in this file wraps the call in a try/catch.
    return entries?.get(0)?.html()
}
246+
247+
/**
 * Extracts the article title from an ImportNew article document: the inner HTML of
 * the first child of the first element with class "entry-header".
 *
 * Expected markup, per the sample the author recorded:
 * `<div class="entry-header"><h1>title text</h1></div>`
 *
 * @param importNew文章Document parsed article page, or null
 * @return the title element's inner HTML, or null when the document is null
 */
private fun 获取ImportNew文章标题(importNew文章Document: Document?): String? {
    if (importNew文章Document == null) return null
    val headers = importNew文章Document.getElementsByClass("entry-header")
    // NOTE(review): get(0) / child(0) are used, so a missing header or heading
    // presumably throws instead of yielding null — the caller catches exceptions.
    val heading = headers?.get(0)?.child(0)
    return heading?.html()
}
255+
65256
private fun crawOSChina(page: Int) {
66257
val pageUrl = "https://www.oschina.net/action/ajax/get_more_recommend_blog?classification=0&p=$page"
67258
val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)
@@ -101,8 +292,6 @@ class CrawKnowledgeService {
101292
} catch (e: Exception) {
102293

103294
}
104-
105-
106295
}
107296
}
108297
}

‎src/main/resources/templates/common/head.ftl‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@
4949
<dd><a href="/knowledge/doCrawJianShu" target="_blank">抓取简书</a></dd>
5050
<dd><a href="/knowledge/doCrawSegmentFaultKnowledge" target="_blank">抓取SegmentFault</a></dd>
5151
<dd><a href="/knowledge/doCrawOSChinaKnowledge" target="_blank">抓取OSChina</a></dd>
52+
<dd><a href="/knowledge/doCrawImportNewKnowledge" target="_blank">抓取ImportNew</a></dd>
53+
<dd><a href="/knowledge/doCrawCNBlogKnowledge" target="_blank">抓取CNBlog</a></dd>
54+
<dd><a href="/knowledge/doCrawInfoQKnowledge" target="_blank">抓取InfoQ</a></dd>
5255
<dd><a href="">超链接</a></dd>
5356
</dl>
5457
</li>

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /