"""Scrape dev.to search results for a category and print article details.

Prompts the user for a search category and a number of articles, drives a
Chrome browser via Selenium to load the dev.to search page, then visits each
matching post and prints its title, author, and paragraph text.
"""

import requests  # NOTE(review): unused in this script — kept in case other code relies on it
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # NOTE(review): unused — kept intentionally
import time

# Get input for category and number of articles.
category = input("Enter category: ")
number_articles = int(input("Enter number of articles: "))

url = 'https://dev.to/search?q={}'.format(category)

# Initiate the webdriver. The parameter is the path of the chromedriver
# executable. Use a raw string so backslashes in the Windows path are never
# interpreted as escape sequences (e.g. '\t' or '\n' would corrupt the path).
driver = webdriver.Chrome(r'C:\Webdrivers\chromedriver')
driver.get(url)

# Crude wait to ensure the search results page has loaded.
# TODO(review): prefer selenium's WebDriverWait over a fixed sleep.
time.sleep(5)
html = driver.page_source

# Parse the search results page and collect the article cards.
soup = BeautifulSoup(html, "html.parser")
results_div = soup.find('div', {'id': 'substories'})
articles = results_div.find_all('article')

# Visit each post and print its details, stopping after number_articles.
count = 0
for article in articles:
    # Relative URL of the post, taken from the card's hidden navigation link.
    article_href = article.find('a', class_='crayons-story__hidden-navigation-link')['href']

    post_url = "https://dev.to{}".format(article_href)
    driver.get(post_url)
    time.sleep(5)  # crude wait for the post page to load

    post_html = driver.page_source
    soup = BeautifulSoup(post_html, "html.parser")
    article_div = soup.find('div', {'class': 'article-wrapper'})
    article_element = article_div.find('article', {'id': 'article-show-container'})

    # Title of the post.
    header_tag = article_element.find('header', class_='crayons-article__header')
    title_div = header_tag.find('div', class_='crayons-article__header__meta')
    title_content = title_div.find('h1')

    # Author of the post.
    author_tag = title_div.find('div', class_='crayons-article__subheader')
    author_name = author_tag.find('a', class_='crayons-link')

    # Post body: join paragraph texts with newlines. str.join avoids the
    # quadratic string += loop and the stray trailing space of the original.
    article_content_div = article_element.find('div', class_='crayons-article__main')
    article_content_body = article_content_div.find('div', class_='crayons-article__body')
    p_tags = article_content_body.find_all('p')
    article_text = '\n'.join(p_tag.text.strip() for p_tag in p_tags)

    print("Title : " + title_content.text.strip())
    print("Author : " + author_name.text.strip())
    print("Body : " + article_text)

    count += 1
    if count == number_articles:
        break

# quit() shuts down the whole browser session AND the chromedriver process;
# close() only closes the current window and would leak the driver process.
driver.quit()