
Commit 3395c36

scraper code file added
1 parent bd9f3c2 commit 3395c36

File tree

1 file changed: 86 additions, 0 deletions


Hashnode-Scraper/scraper.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from fpdf import FPDF

# Get input for category and number of articles
category = input("Enter category: ")
number_articles = int(input("Enter number of articles: "))
driver_path = input('Enter chrome driver path: ')

url = 'https://hashnode.com/search?q={}'.format(category)

# Initiate the webdriver (Selenium 3 style: the chromedriver path is passed positionally)
driver = webdriver.Chrome(driver_path)
driver.get(url)

# Wait so the search results page has time to load
time.sleep(5)
html = driver.page_source

# Parse the rendered page with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
results_div = soup.find('div', {'class': 'pb-20'})
# Each direct child div of the results container is one search-result card
blogs = results_div.find('div').find_all('div', recursive=False)

# Scrape articles from Hashnode
count = 0
for blog in blogs:

    # If the div is not a blog card (no external link), skip it
    link_tag = blog.find('a')
    if link_tag is None or link_tag['href'].startswith('/'):
        continue

    # If the div is a blog card, scrape the individual post
    blog_link = blog.find('a', class_='items-start')['href']
    post_url = blog_link
    driver.get(post_url)
    time.sleep(5)

    post_html = driver.page_source
    soup = BeautifulSoup(post_html, "html.parser")
    title = soup.find('h1', itemprop='headline name').text
    author = soup.find('span', itemprop='name').text

    # Post content body and the paragraph/heading tags inside it
    blog_content_body = soup.find('div', itemprop='text')
    content_tags = blog_content_body.find_all(['p', 'h2', 'h3', 'h4'])

    # FPDF's core fonts are latin-1 only, so replace unsupported characters
    title_string = title.strip().encode('latin-1', 'replace').decode('latin-1')
    author_string = "By - {}".format(author.strip()).encode(
        'latin-1', 'replace').decode('latin-1')

    # Create the PDF and add a page
    pdf = FPDF()
    pdf.add_page()
    # Set style and size of font
    pdf.set_font("Arial", size=12)

    # Blog title cell
    pdf.cell(200, 5, txt=title_string, ln=1, align='C')
    # Blog author cell
    pdf.cell(200, 10, txt=author_string, ln=2, align='C')

    for tag in content_tags:
        article_part = tag.text.strip().encode(
            'latin-1', 'replace').decode('latin-1')
        # Add this part of the article to the PDF
        pdf.multi_cell(0, 5, txt=article_part, align='L')

    # Trim the title to at most 30 characters
    title = title if len(title) < 30 else title[:30]

    # Save the PDF, using the sanitized title as the file name
    pdf_title = ''.join(e for e in title if e.isalnum())
    pdf.output("{}.pdf".format(pdf_title))

    count += 1
    if count == number_articles:
        break

driver.close()  # Close the webdriver
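A minimal example run, assuming a local chromedriver install; the category, article count, and path shown are illustrative and not part of the commit:

    Enter category: python
    Enter number of articles: 2
    Enter chrome driver path: /usr/local/bin/chromedriver

Each scraped post is then written to the working directory as a PDF named after the sanitized, truncated title.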
