
Commit ab83470

Merge pull request avinashkranjan#791 from XZANATOL/Youtube_Trending_Feed_Scrapper
Added Youtube Trending Feed Scrapper
2 parents 34645f1 + 11aef5d commit ab83470

File tree

3 files changed: +262 −0 lines changed
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
# Youtube Trending Feed Scrapper

A pair of scripts that scrape and read the first 10 trending videos on YouTube from any of its available categories. Whether it's what's happening right ``Now``, in ``Gaming``, in ``Music``, or in ``Movies``, you will get it on your local machine.

# Installation

* Install the following Python libraries:

> ``pip3 install selenium pymongo mongoengine pandas``

(Note: the scripts use the Selenium 3 ``find_elements_by_*`` API, so a Selenium 3 release is required.)

* Place ChromeDriver in the same directory as the scripts. You can download it from [here](https://sites.google.com/a/chromium.org/chromedriver/downloads). <br>
(Note: Download the version that matches your Chrome browser.)

* Install MongoDB Community Server on your machine. You can refer to the installation guide [here](https://docs.mongodb.com/manual/administration/install-community/).
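
As a quick sanity check (a minimal sketch, assuming MongoDB is running locally on its default port), you can verify the server is reachable before scraping:

```python
# Minimal connectivity check for a local MongoDB instance
from pymongo import MongoClient

client = MongoClient("127.0.0.1", serverSelectionTimeoutMS=3000)
print(client.server_info()["version"])  # Raises ServerSelectionTimeoutError if the server is unreachable
```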

# Usage

The scripts allow you to save the scraped content using 2 methods:

1) A MongoDB database called ``Youtube``, with records saved in a collection called ``trending``.
2) A CSV file called ``Youtube.csv``.

You can save using either or both; it's up to you. The same goes for ``scrap_reader.py``, which can read from either MongoDB or the CSV file.

* To save to / read from MongoDB, pass the ``-m`` argument.
* To save to / read from the CSV file, pass the ``-c`` argument.
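
For example (``scrapper.py`` here is a placeholder for the scraper script's actual file name, which may differ):

> ``python3 scrapper.py -m -c``

> ``python3 scrap_reader.py -m``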

# Output

Whichever argument you use to save the data, each record will contain these video attributes:

1) Video Section
2) Video Title
3) Video Link
4) Video Channel
5) Video Views
6) Video Date
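
For instance, the header row of ``Youtube.csv`` follows the column order the scraper exports:

> ``section,title,channel,link,views,date``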

# Authors

Written by [XZANATOL](https://www.github.com/XZANATOL).

The project was built as a contribution during [GSSOC'21](https://gssoc.girlscript.tech/).
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Youtube Trending Feed Reader
# Written by XZANATOL
from optparse import OptionParser
from pymongo import MongoClient
import pandas as pd
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
-h, --help Shows this help message and exit
-c, --csv Reads data from "Youtube.csv" file
-m, --mongo Reads data from MongoDB
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv", help="Reads data from the \"Youtube.csv\" file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo", help="Reads data from MongoDB.")


def read_mongo():
    """Return all saved records from the MongoDB collection."""
    # Connect to the service
    client = MongoClient("127.0.0.1")
    # Create a collection object
    db = client.Youtube.trending
    return db.find()  # Return all documents


def read_csv():
    """Return all saved records from the CSV file."""
    # Read the database
    df = pd.read_csv("Youtube.csv")
    data = []
    for index, row in df.iterrows():
        data.append(row)  # Append each row to the list
    return data  # Return all records


def display(data):
    i = 0
    for card in data:
        # Every 10 cards marks the start of a new section
        if i % 10 == 0:
            c = input("Show Section? [y/n] > ")
            if c.lower() == "y":
                print("***********************************")
                print(f"""{card["section"]} section""")
                print("***********************************")
            else:
                sys.exit()  # The user has had enough of reading
        i += 1  # Increment
        print("Title:", card["title"])
        print("Link:", card["link"])
        print("Channel:", card["channel"])
        print("Views:", card["views"])
        print("Time:", card["date"])
        print("==============================================")


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: exactly one source must be selected
    if not (bool(csv) ^ bool(mongo)):  # XNOR gate
        print(usage)
        sys.exit()

    if mongo:
        data = read_mongo()
    else:
        data = read_csv()
    display(data)
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
# Youtube Trending Feed Scrapper
# Written by XZANATOL
from optparse import OptionParser
from selenium import webdriver
import pandas as pd
import mongoengine
import pymongo
import time
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
-h, --help Shows this help message and exit.
-c, --csv Saves extracted contents to a CSV file.
-m, --mongo Saves extracted contents to a MongoDB.
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv", help="Saves extracted contents to a CSV file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo", help="Saves extracted contents to a MongoDB.")

# Define the DataFrame up front to avoid check errors
df = pd.DataFrame()

# MongoDB Collection (Table) Template
class Trending(mongoengine.Document):
    section = mongoengine.StringField(required=True)
    title = mongoengine.StringField(required=True)
    channel = mongoengine.StringField(required=True)
    link = mongoengine.StringField(required=True)
    views = mongoengine.StringField(required=True)
    date = mongoengine.StringField(required=True)

    meta = {"indexes": ["section"]}


def load_driver():
    """Load Chrome webdriver."""
    # Expects ChromeDriver in the same directory as the script
    driver = webdriver.Chrome("chromedriver.exe")
    return driver


def page_scrap(driver):
    """Scrape the YouTube trending feed."""
    # Pages to be scraped: Now, Music, Gaming, Movies
    pages = ["https://www.youtube.com/feed/trending",
             "https://www.youtube.com/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D"]
    sections = ["Now", "Music", "Gaming", "Movies"]

    for num in range(4):
        driver.get(pages[num])
        time.sleep(3)  # Make sure the whole page is loaded
        # Extract the first 10 entries
        cards = driver.find_elements_by_tag_name("ytd-video-renderer")[:10]
        links = driver.find_elements_by_id("video-title")[:10]
        meta_data = driver.find_elements_by_tag_name("ytd-video-meta-block")[:10]
        for i in range(len(links)):  # Iterate over however many entries were found (up to 10)
            # Split meta data that will be saved
            meta_splitted = meta_data[i].text.split("\n")
            # Sometimes this character is extracted for unknown reasons
            try:
                meta_splitted.remove("•")
            except ValueError:
                pass
            section = sections[num]  # Which section was this scraped from?
            link = links[i].get_attribute("href")  # Video link
            title = links[i].text  # Video title
            channel = meta_splitted[0]  # Channel name
            views = meta_splitted[1]  # Video views
            date = meta_splitted[2]  # Release date

            # Validating the arguments here beats writing a separate scraping routine for each output format
            if mongo:
                save_to_db(section, title, channel, link, views, date)
            if csv:
                append_to_df(section, title, channel, link, views, date)

        print(f"[+] Finished scraping '{sections[num]}' section!")

    # Final step for CSV output
    if csv:
        save_to_csv()


def save_to_db(section, title, channel, link, views, date):
    """Saves a record to the database."""
    # Create object
    record = Trending(
        section=section,
        title=title,
        channel=channel,
        link=link,
        views=views,
        date=date)
    # Save record
    record.save()


def append_to_df(section, title, channel, link, views, date):
    """Appends a record to the dataframe."""
    global df
    df = df.append({"section": section,
                    "title": title,
                    "channel": channel,
                    "link": link,
                    "views": views,
                    "date": date}, ignore_index=True)


def save_to_csv():
    """Exports the dataframe to a CSV file."""
    global df
    df.to_csv("Youtube.csv", index=False, columns=["section", "title",
                                                   "channel", "link",
                                                   "views", "date"])
# Function end (eye-friendly comment to separate the function's end line)


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: at least one output must be selected
    if not (bool(csv) or bool(mongo)):
        print(usage)
        sys.exit()

    if mongo:
        mongoengine.connect("Youtube")

    driver = load_driver()  # Load driver
    page_scrap(driver)  # Start scraping
    print("[+] Done!")
    # End session
    driver.quit()
    sys.exit()
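
After a run with ``-m``, the saved records can be inspected directly with ``pymongo`` (a minimal sketch, assuming MongoDB on localhost and a previously populated ``Youtube`` database):

```python
# Peek at a few stored records from one section (assumes a prior run with -m)
from pymongo import MongoClient

client = MongoClient("127.0.0.1")
for doc in client.Youtube.trending.find({"section": "Gaming"}).limit(3):
    print(doc["title"], "-", doc["views"])
```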

0 commit comments
