Commit b525c63

committed

抓取OSChina

抓取ImportNew, 抓取CNBlog, 抓取InfoQ

1 parent 5296460, commit b525c63 (Copy full SHA for b525c63)

File tree

3 files changed

+221

-2

lines changed

src/main
- kotlin/com/light/saber
  - controller
    - KnowledgeCrawController.kt
  - service
    - CrawKnowledgeService.kt
- resources/templates/common
  - head.ftl

3 files changed

+221

-2

lines changed

`‎src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt‎`

Lines changed: 27 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -40,4 +40,31 @@ class KnowledgeCrawController {`
`40`	`40`	`return "DONE"`
`41`	`41`	`}`
`42`	`42`
	`43`	`+ @GetMapping("/knowledge/doCrawImportNewKnowledge")`
	`44`	`+ fun doCrawImportNewKnowledge(): String {`
	`45`	`+ Thread {`
	`46`	`+ CrawKnowledgeService.doCrawImportNewKnowledge()`
	`47`	`+ }.start()`
	`48`	`+`
	`49`	`+ return "DONE"`
	`50`	`+ }`
	`51`	`+`
	`52`	`+ @GetMapping("/knowledge/doCrawCNBlogKnowledge")`
	`53`	`+ fun doCrawCNBlogKnowledge(): String {`
	`54`	`+ Thread {`
	`55`	`+ CrawKnowledgeService.doCrawCNBlogKnowledge()`
	`56`	`+ }.start()`
	`57`	`+`
	`58`	`+ return "DONE"`
	`59`	`+ }`
	`60`	`+`
	`61`	`+ @GetMapping("/knowledge/doCrawInfoQKnowledge")`
	`62`	`+ fun doCrawInfoQKnowledge(): String {`
	`63`	`+ Thread {`
	`64`	`+ CrawKnowledgeService.doCrawInfoQKnowledge()`
	`65`	`+ }.start()`
	`66`	`+`
	`67`	`+ return "DONE"`
	`68`	`+ }`
	`69`	`+`
`43`	`70`	`}`

`‎src/main/kotlin/com/light/saber/service/CrawKnowledgeService.kt‎`

Lines changed: 191 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,197 @@ class CrawKnowledgeService {`
`62`	`62`	`}`
`63`	`63`	`}`
`64`	`64`
	`65`	`+ fun doCrawImportNewKnowledge() {`
	`66`	`+ for (page in 1..135) {`
	`67`	`+ try {`
	`68`	`+ launch(CommonPool) {`
	`69`	`+ crawImportNew(page)`
	`70`	`+ }`
	`71`	`+ } catch (e: Exception) {`
	`72`	`+`
	`73`	`+ }`
	`74`	`+ }`
	`75`	`+ }`
	`76`	`+`
	`77`	`+ fun doCrawITEyeKnowledge() {`
	`78`	`+ for (page in 1..10000) {`
	`79`	`+ try {`
	`80`	`+ launch(CommonPool) {`
	`81`	`+ crawITEye(page)`
	`82`	`+ }`
	`83`	`+ } catch (e: Exception) {`
	`84`	`+`
	`85`	`+ }`
	`86`	`+ }`
	`87`	`+ }`
	`88`	`+`
	`89`	`+ fun doCrawCNBlogKnowledge() {`
	`90`	`+ for (page in 1..200) {`
	`91`	`+ try {`
	`92`	`+ launch(CommonPool) {`
	`93`	`+ crawCNBlog(page)`
	`94`	`+ }`
	`95`	`+ } catch (e: Exception) {`
	`96`	`+`
	`97`	`+ }`
	`98`	`+ }`
	`99`	`+ }`
	`100`	`+`
	`101`	`+ fun doCrawInfoQKnowledge() {`
	`102`	`+ for (page in 0..40) {`
	`103`	`+ try {`
	`104`	`+ launch(CommonPool) {`
	`105`	`+ crawInfoQ(page)`
	`106`	`+ }`
	`107`	`+ } catch (e: Exception) {`
	`108`	`+`
	`109`	`+ }`
	`110`	`+ }`
	`111`	`+ }`
	`112`	`+`
	`113`	`+ private fun crawInfoQ(page: Int) {`
	`114`	`+ val pageUrl = "http://www.infoq.com/cn/java/articles/${page * 12}"`
	`115`	`+ val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)`
	`116`	`+ val document = Jsoup.parse(文章列表HTML)`
	`117`	`+ // document.getElementsByClassName("news_type2 full_screen")[0].children[1].children[0]`
	`118`	`+ //<a href="/cn/articles/Reactive-Systems-Akka-Actors-DomainDrivenDesign" title="使用Akka的Actor模型和领域驱动设计构建反应式系统">...</a>`
	`119`	`+ document.getElementsByClass("news_type2 full_screen").forEach {`
	`120`	`+ val url = it.child(1).child(0).attr("href")`
	`121`	`+ val title = it.child(1).child(0).html()`
	`122`	`+ if (KnowledgeDao.countByUrl(url) == 0) {`
	`123`	`+ try {`
	`124`	`+ val InfoQ文章HTML = CrawlerWebClient.getPageHtmlText(url)`
	`125`	`+ val InfoQ文章Document = Jsoup.parse(InfoQ文章HTML)`
	`126`	`+ val content = 获取InfoQ文章内容(InfoQ文章Document)`
	`127`	`+ println(title)`
	`128`	`+ println(url)`
	`129`	`+ doSaveKnowledge(`
	`130`	`+ url = url,`
	`131`	`+ title = title,`
	`132`	`+ content = content`
	`133`	`+ )`
	`134`	`+ } catch (e: Exception) {`
	`135`	`+`
	`136`	`+ }`
	`137`	`+ }`
	`138`	`+ }`
	`139`	`+`
	`140`	`+ }`
	`141`	`+`
	`142`	`+ private fun 获取InfoQ文章内容(infoQ文章Document: Document?): String? {`
	`143`	`+ return infoQ文章Document?.getElementsByClass("text_info text_info_article")?.get(0)?.html()`
	`144`	`+ }`
	`145`	`+`
	`146`	`+ private fun crawCNBlog(page: Int) {`
	`147`	`+ val pageUrl = "https://www.cnblogs.com/#p$page"`
	`148`	`+ val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)`
	`149`	`+ val document = Jsoup.parse(文章列表HTML)`
	`150`	`+ // document.getElementsByClassName("titlelnk")[0]`
	`151`	`+ //<a class="titlelnk" href="https://www.cnblogs.com/qzrzq1/p/9069509.html" target="_blank">基于Orangpi Zero和Linux ALSA实现WIFI无线音箱(一)</a>`
	`152`	`+ document.getElementsByClass("titlelnk").forEach {`
	`153`	`+ val url = it.attr("href")`
	`154`	`+ val title = it.html()`
	`155`	`+ if (KnowledgeDao.countByUrl(url) == 0) {`
	`156`	`+ try {`
	`157`	`+ val CNBlog文章HTML = CrawlerWebClient.getPageHtmlText(url)`
	`158`	`+ val CNBlog文章Document = Jsoup.parse(CNBlog文章HTML)`
	`159`	`+ val content = 获取CNBlog文章内容(CNBlog文章Document)`
	`160`	`+ println(title)`
	`161`	`+ println(url)`
	`162`	`+ doSaveKnowledge(`
	`163`	`+ url = url,`
	`164`	`+ title = title,`
	`165`	`+ content = content`
	`166`	`+ )`
	`167`	`+ } catch (e: Exception) {`
	`168`	`+`
	`169`	`+ }`
	`170`	`+ }`
	`171`	`+ }`
	`172`	`+`
	`173`	`+ }`
	`174`	`+`
	`175`	`+ private fun 获取CNBlog文章内容(cnBlog文章Document: Document?): String? {`
	`176`	`+ return cnBlog文章Document?.getElementById("cnblogs_post_body")?.html()`
	`177`	`+ }`
	`178`	`+`
	`179`	`+ private fun crawITEye(page: Int) {`
	`180`	`+ val pageUrl = "http://www.iteye.com/blogs/category/language?page=$page"`
	`181`	`+ val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)`
	`182`	`+ val document = Jsoup.parse(文章列表HTML)`
	`183`	`+`
	`184`	`+ // document.getElementsByClassName("content")[0].children[0].children[0]`
	`185`	`+ //<a href="http://fhuan123.iteye.com/blog/2423594" title="C#Make自动化构建-简介" target="_blank">C#Make自动化构建-简介</a>`
	`186`	`+ document.getElementsByClass("content").forEach {`
	`187`	`+ val url = it.child(0).child(0).attr("href")`
	`188`	`+ val title = it.child(0).child(0).html()`
	`189`	`+ if (KnowledgeDao.countByUrl(url) == 0) {`
	`190`	`+ try {`
	`191`	`+ val ITEye文章HTML = CrawlerWebClient.getPageHtmlText(url)`
	`192`	`+ val ITEye文章Document = Jsoup.parse(ITEye文章HTML)`
	`193`	`+ val content = 获取ITEye文章内容(ITEye文章Document)`
	`194`	`+ println(title)`
	`195`	`+ println(url)`
	`196`	`+ doSaveKnowledge(`
	`197`	`+ url = url,`
	`198`	`+ title = title,`
	`199`	`+ content = content`
	`200`	`+ )`
	`201`	`+ } catch (e: Exception) {`
	`202`	`+`
	`203`	`+ }`
	`204`	`+ }`
	`205`	`+ }`
	`206`	`+`
	`207`	`+ }`
	`208`	`+`
	`209`	`+ private fun 获取ITEye文章内容(itEye文章Document: Document?): String? {`
	`210`	`+ return itEye文章Document?.getElementById("blog_content")?.html()`
	`211`	`+ }`
	`212`	`+`
	`213`	`+ private fun crawImportNew(page: Int) {`
	`214`	`+ val pageUrl = "http://www.importnew.com/all-posts/page/$page"`
	`215`	`+ val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)`
	`216`	`+ val document = Jsoup.parse(文章列表HTML)`
	`217`	`+ // document.getElementsByClassName("meta-title")[0]`
	`218`	`+ //<a class="meta-title" target="_blank" href="http://www.importnew.com/28577.html" title="使用 Java 注解自动化处理对应关系实现注释代码化 ">使用 Java 注解自动化处理对应关系实现注释代码化</a>`
	`219`	`+ document.getElementsByClass("meta-title").forEach {`
	`220`	`+ val url = it.attr("href")`
	`221`	`+ if (KnowledgeDao.countByUrl(url) == 0) {`
	`222`	`+ try {`
	`223`	`+ val ImportNew文章HTML = CrawlerWebClient.getPageHtmlText(url)`
	`224`	`+ val ImportNew文章Document = Jsoup.parse(ImportNew文章HTML)`
	`225`	`+ val title = 获取ImportNew文章标题(ImportNew文章Document)`
	`226`	`+ val content = 获取ImportNew文章内容(ImportNew文章Document)`
	`227`	`+ println(title)`
	`228`	`+ println(url)`
	`229`	`+ doSaveKnowledge(`
	`230`	`+ url = url,`
	`231`	`+ title = title,`
	`232`	`+ content = content`
	`233`	`+ )`
	`234`	`+ } catch (e: Exception) {`
	`235`	`+`
	`236`	`+ }`
	`237`	`+ }`
	`238`	`+ }`
	`239`	`+`
	`240`	`+ }`
	`241`	`+`
	`242`	`+ private fun 获取ImportNew文章内容(importNew文章Document: Document?): String? {`
	`243`	`+ // document.getElementsByClassName("entry")`
	`244`	`+ return importNew文章Document?.getElementsByClass("entry")?.get(0)?.html()`
	`245`	`+ }`
	`246`	`+`
	`247`	`+ private fun 获取ImportNew文章标题(importNew文章Document: Document?): String? {`
	`248`	`+// document.getElementsByClassName("entry-header")[0]`
	`249`	`+// <div class="entry-header"><h1>使用 Java 注解自动化处理对应关系实现注释代码化</h1></div>`
	`250`	`+// document.getElementsByClassName("entry-header")[0].children[0].innerHTML`
	`251`	`+// "使用 Java 注解自动化处理对应关系实现注释代码化"`
	`252`	`+ return importNew文章Document?.getElementsByClass("entry-header")?.get(0)?.child(0)?.html()`
	`253`	`+`
	`254`	`+ }`
	`255`	`+`
`65`	`256`	`private fun crawOSChina(page: Int) {`
`66`	`257`	`val pageUrl = "https://www.oschina.net/action/ajax/get_more_recommend_blog?classification=0&p=$page"`
`67`	`258`	`val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)`
`@@ -101,8 +292,6 @@ class CrawKnowledgeService {`
`101`	`292`	`} catch (e: Exception) {`
`102`	`293`
`103`	`294`	`}`
`104`		`-`
`105`		`-`
`106`	`295`	`}`
`107`	`296`	`}`
`108`	`297`	`}`

`‎src/main/resources/templates/common/head.ftl‎`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,9 @@`
`49`	`49`	`<dd><a href="/knowledge/doCrawJianShu" target="_blank">抓取简书</a></dd>`
`50`	`50`	`<dd><a href="/knowledge/doCrawSegmentFaultKnowledge" target="_blank">抓取SegmentFault</a></dd>`
`51`	`51`	`<dd><a href="/knowledge/doCrawOSChinaKnowledge" target="_blank">抓取OSChina</a></dd>`
	`52`	`+ <dd><a href="/knowledge/doCrawImportNewKnowledge" target="_blank">抓取ImportNew</a></dd>`
	`53`	`+ <dd><a href="/knowledge/doCrawCNBlogKnowledge" target="_blank">抓取CNBlog</a></dd>`
	`54`	`+ <dd><a href="/knowledge/doCrawInfoQKnowledge" target="_blank">抓取InfoQ</a></dd>`
`52`	`55`	`<dd><a href="">超链接</a></dd>`
`53`	`56`	`</dl>`
`54`	`57`	`</li>`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit b525c63

File tree

3 files changed

3 files changed

`‎src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt‎`

`‎src/main/kotlin/com/light/saber/service/CrawKnowledgeService.kt‎`

`‎src/main/resources/templates/common/head.ftl‎`

0 commit comments