@@ -62,6 +62,197 @@ class CrawKnowledgeService {
6262 }
6363 }
6464
/**
 * Launches one coroutine on [CommonPool] for each ImportNew listing page (1..135).
 *
 * The function returns once the coroutines have been started; it does not wait
 * for any of them to finish.
 */
fun doCrawImportNewKnowledge() {
    for (page in 1..135) {
        launch(CommonPool) {
            // The handler must live inside the coroutine: a try/catch wrapped around
            // launch() only covers starting the coroutine, not the work it runs later.
            try {
                crawImportNew(page)
            } catch (e: Exception) {
                println("crawImportNew failed for page $page: $e")
            }
        }
    }
}
/**
 * Launches one coroutine on [CommonPool] for each ITEye listing page (1..10000).
 *
 * The function returns once the coroutines have been started; it does not wait
 * for any of them to finish.
 */
fun doCrawITEyeKnowledge() {
    for (page in 1..10000) {
        launch(CommonPool) {
            // The handler must live inside the coroutine: a try/catch wrapped around
            // launch() only covers starting the coroutine, not the work it runs later.
            try {
                crawITEye(page)
            } catch (e: Exception) {
                println("crawITEye failed for page $page: $e")
            }
        }
    }
}
/**
 * Launches one coroutine on [CommonPool] for each CNBlogs listing page (1..200).
 *
 * The function returns once the coroutines have been started; it does not wait
 * for any of them to finish.
 */
fun doCrawCNBlogKnowledge() {
    for (page in 1..200) {
        launch(CommonPool) {
            // The handler must live inside the coroutine: a try/catch wrapped around
            // launch() only covers starting the coroutine, not the work it runs later.
            try {
                crawCNBlog(page)
            } catch (e: Exception) {
                println("crawCNBlog failed for page $page: $e")
            }
        }
    }
}
/**
 * Launches one coroutine on [CommonPool] for each InfoQ listing page index (0..40).
 *
 * The function returns once the coroutines have been started; it does not wait
 * for any of them to finish.
 */
fun doCrawInfoQKnowledge() {
    for (page in 0..40) {
        launch(CommonPool) {
            // The handler must live inside the coroutine: a try/catch wrapped around
            // launch() only covers starting the coroutine, not the work it runs later.
            try {
                crawInfoQ(page)
            } catch (e: Exception) {
                println("crawInfoQ failed for page $page: $e")
            }
        }
    }
}
/**
 * Crawls one InfoQ Java article listing page and saves every article whose URL
 * is not yet known to [KnowledgeDao].
 *
 * Fetching or parsing the listing itself is not guarded here, so a failure at
 * that stage propagates to the caller. Failures on a single article are reported
 * and the remaining articles are still processed.
 *
 * @param page listing page index; it is multiplied by 12 to form the offset in the URL
 */
private fun crawInfoQ(page: Int) {
    val pageUrl = "http://www.infoq.com/cn/java/articles/${page * 12}"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listingHtml)
    // Expected shape of each matched element (from the browser console):
    //   document.getElementsByClassName("news_type2 full_screen")[0].children[1].children[0]
    //   <a href="/cn/articles/Reactive-Systems-Akka-Actors-DomainDrivenDesign" title="...">...</a>
    document.getElementsByClass("news_type2 full_screen").forEach { item ->
        val link = item.child(1).child(0)
        // NOTE(review): the sample above shows a site-relative href; confirm that
        // CrawlerWebClient can fetch it as-is or resolve it against the site root.
        val url = link.attr("href")
        val title = link.html()
        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val content = 获取InfoQ文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best effort: report the failure and continue with the next article.
                println("Failed to crawl InfoQ article $url: $e")
            }
        }
    }
}
/**
 * Extracts the article body HTML from a parsed InfoQ article page.
 *
 * @param infoQ文章Document parsed article page, may be null
 * @return inner HTML of the first element matched by class
 *   "text_info text_info_article", or null when the document is null
 */
private fun 获取InfoQ文章内容(infoQ文章Document: Document?): String? {
    // get(0) is a plain list lookup: presumably it throws when nothing matched;
    // the caller in this file wraps the call in a try/catch.
    return infoQ文章Document?.getElementsByClass("text_info text_info_article")?.get(0)?.html()
}
/**
 * Crawls one CNBlogs front-page listing and saves every article whose URL is
 * not yet known to [KnowledgeDao].
 *
 * Fetching or parsing the listing itself is not guarded here, so a failure at
 * that stage propagates to the caller. Failures on a single article are reported
 * and the remaining articles are still processed.
 *
 * @param page listing page number, placed in the URL fragment as `#p<page>`
 */
private fun crawCNBlog(page: Int) {
    // NOTE(review): the page number sits in the URL fragment, which plain HTTP
    // clients do not send to the server; confirm CrawlerWebClient evaluates it
    // (e.g. by running page scripts), otherwise every page returns the same list.
    val pageUrl = "https://www.cnblogs.com/#p$page"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listingHtml)
    // Expected shape of each matched element (from the browser console):
    //   document.getElementsByClassName("titlelnk")[0]
    //   <a class="titlelnk" href="https://www.cnblogs.com/qzrzq1/p/9069509.html" target="_blank">...</a>
    document.getElementsByClass("titlelnk").forEach { link ->
        val url = link.attr("href")
        val title = link.html()
        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val content = 获取CNBlog文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best effort: report the failure and continue with the next article.
                println("Failed to crawl CNBlogs article $url: $e")
            }
        }
    }
}
/**
 * Extracts the article body HTML from a parsed CNBlogs article page.
 *
 * @param cnBlog文章Document parsed article page, may be null
 * @return inner HTML of the element with id "cnblogs_post_body", or null when
 *   the document is null or has no such element
 */
private fun 获取CNBlog文章内容(cnBlog文章Document: Document?): String? {
    return cnBlog文章Document?.getElementById("cnblogs_post_body")?.html()
}
/**
 * Crawls one ITEye "language" category listing page and saves every article
 * whose URL is not yet known to [KnowledgeDao].
 *
 * Fetching or parsing the listing itself is not guarded here, so a failure at
 * that stage propagates to the caller. Failures on a single article are reported
 * and the remaining articles are still processed.
 *
 * @param page listing page number, passed as the `page` query parameter
 */
private fun crawITEye(page: Int) {
    val pageUrl = "http://www.iteye.com/blogs/category/language?page=$page"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listingHtml)

    // Expected shape of each matched element (from the browser console):
    //   document.getElementsByClassName("content")[0].children[0].children[0]
    //   <a href="http://fhuan123.iteye.com/blog/2423594" title="..." target="_blank">...</a>
    document.getElementsByClass("content").forEach { item ->
        val link = item.child(0).child(0)
        val url = link.attr("href")
        val title = link.html()
        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val content = 获取ITEye文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best effort: report the failure and continue with the next article.
                println("Failed to crawl ITEye article $url: $e")
            }
        }
    }
}
/**
 * Extracts the article body HTML from a parsed ITEye article page.
 *
 * @param itEye文章Document parsed article page, may be null
 * @return inner HTML of the element with id "blog_content", or null when the
 *   document is null or has no such element
 */
private fun 获取ITEye文章内容(itEye文章Document: Document?): String? {
    return itEye文章Document?.getElementById("blog_content")?.html()
}
/**
 * Crawls one ImportNew "all posts" listing page and saves every article whose
 * URL is not yet known to [KnowledgeDao].
 *
 * Unlike the other crawlers in this class, the title is read from the article
 * page rather than from the listing link.
 *
 * Fetching or parsing the listing itself is not guarded here, so a failure at
 * that stage propagates to the caller. Failures on a single article are reported
 * and the remaining articles are still processed.
 *
 * @param page listing page number, appended to the listing URL path
 */
private fun crawImportNew(page: Int) {
    val pageUrl = "http://www.importnew.com/all-posts/page/$page"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listingHtml)
    // Expected shape of each matched element (from the browser console):
    //   document.getElementsByClassName("meta-title")[0]
    //   <a class="meta-title" target="_blank" href="http://www.importnew.com/28577.html" title="...">...</a>
    document.getElementsByClass("meta-title").forEach { link ->
        val url = link.attr("href")
        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val articleHtml = CrawlerWebClient.getPageHtmlText(url)
                val articleDocument = Jsoup.parse(articleHtml)
                val title = 获取ImportNew文章标题(articleDocument)
                val content = 获取ImportNew文章内容(articleDocument)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best effort: report the failure and continue with the next article.
                println("Failed to crawl ImportNew article $url: $e")
            }
        }
    }
}
/**
 * Extracts the article body HTML from a parsed ImportNew article page.
 *
 * @param importNew文章Document parsed article page, may be null
 * @return inner HTML of the first element with class "entry", or null when the
 *   document is null
 */
private fun 获取ImportNew文章内容(importNew文章Document: Document?): String? {
    // Browser-console equivalent: document.getElementsByClassName("entry")
    // get(0) is a plain list lookup: presumably it throws when nothing matched;
    // the caller in this file wraps the call in a try/catch.
    return importNew文章Document?.getElementsByClass("entry")?.get(0)?.html()
}
/**
 * Extracts the article title from a parsed ImportNew article page.
 *
 * @param importNew文章Document parsed article page, may be null
 * @return inner HTML of the first child of the first "entry-header" element,
 *   or null when the document is null
 */
private fun 获取ImportNew文章标题(importNew文章Document: Document?): String? {
    // Expected markup (from the browser console):
    //   document.getElementsByClassName("entry-header")[0]
    //   <div class="entry-header"><h1>...title text...</h1></div>
    //   document.getElementsByClassName("entry-header")[0].children[0].innerHTML
    //   "...title text..."
    // get(0) and child(0) are index lookups: presumably they throw when the
    // header or its child is missing; the caller in this file catches that.
    return importNew文章Document?.getElementsByClass("entry-header")?.get(0)?.child(0)?.html()
}
255+ 65256 private  fun  crawOSChina (page :  Int ) {
66257 val  pageUrl =  " https://www.oschina.net/action/ajax/get_more_recommend_blog?classification=0&p=$page " 
67258 val  文章列表HTML  =  CrawlerWebClient .getPageHtmlText(pageUrl)
@@ -101,8 +292,6 @@ class CrawKnowledgeService {
101292 } catch  (e:  Exception ) {
102293
103294 }
104- 105- 106295 }
107296 }
108297 }
0 commit comments