# YouTube Trending Feed Scraper
# Written by XZANATOL
from optparse import OptionParser
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import mongoengine
import time
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
  -h, --help   Show this help message and exit.
  -c, --csv    Save extracted contents to a CSV file.
  -m, --mongo  Save extracted contents to a MongoDB collection.
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv",
                  help="Saves extracted contents to a CSV file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo",
                  help="Saves extracted contents to a MongoDB.")
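# Example invocation (the script filename here is illustrative):
#   python youtube_trending.py --csv --mongo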

# Module-level DataFrame, defined up front so append_to_df() can extend it
df = pd.DataFrame()

# MongoDB Collection (Table) Template
class Trending(mongoengine.Document):
    section = mongoengine.StringField(required=True)
    title = mongoengine.StringField(required=True)
    channel = mongoengine.StringField(required=True)
    link = mongoengine.StringField(required=True)
    views = mongoengine.StringField(required=True)
    date = mongoengine.StringField(required=True)

    meta = {"indexes": ["section"]}
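
# Note: mongoengine derives the collection name from the class name
# ("trending") and ensures the "section" index exists when documents
# are first saved, which speeds up per-section lookups.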


def load_driver():
    """Load a Chrome webdriver."""
    # Assumes Selenium 4.6+: Selenium Manager resolves a matching
    # chromedriver automatically, so no driver path is passed here.
    driver = webdriver.Chrome()
    return driver


def page_scrap(driver):
    """Scrape the YouTube trending feed."""
    # Pages to be scraped: Now, Music, Gaming, Movies
    pages = ["https://www.youtube.com/feed/trending",
             "https://www.youtube.com/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D"]
    sections = ["Now", "Music", "Gaming", "Movies"]
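    # The "bp" query parameters are opaque tokens that select the Music,
    # Gaming, and Movies tabs; YouTube may change them over time.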

    for num in range(4):
        driver.get(pages[num])
        time.sleep(3)  # Give the page time to finish loading
        # Extract the first 10 entries; zip() below keeps links and
        # metadata blocks in step even if fewer than 10 are rendered
        links = driver.find_elements(By.ID, "video-title")[:10]
        meta_data = driver.find_elements(By.TAG_NAME, "ytd-video-meta-block")[:10]
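        # Each metadata block's text splits into channel, views, and date,
        # e.g. (observed; may vary with YouTube's markup or locale):
        #   "ChannelName\n1.2M views\n3 days ago"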
        for link_el, meta in zip(links, meta_data):
            # Split the metadata that will be saved
            meta_splitted = meta.text.split("\n")
            # A stray "•" separator sometimes appears in the block
            try:
                meta_splitted.remove("•")
            except ValueError:
                pass
            section = sections[num]               # Section this entry came from
            link = link_el.get_attribute("href")  # Video link
            title = link_el.text                  # Video title
            channel = meta_splitted[0]            # Channel name
            views = meta_splitted[1]              # Video views
            date = meta_splitted[2]               # Release date

            # Validating the flags here beats writing a separate scraping
            # algorithm for each output format
            if mongo:
                save_to_db(section, title, channel, link, views, date)
            if csv:
                append_to_df(section, title, channel, link, views, date)

        print(f"[+] Finished scraping the '{sections[num]}' section!")

    # Last step for CSV output: dump the accumulated DataFrame
    if csv:
        save_to_csv()


def save_to_db(section, title, channel, link, views, date):
    """Save a record to the database."""
    # Create the document
    record = Trending(
        section=section,
        title=title,
        channel=channel,
        link=link,
        views=views,
        date=date)
    # Save the record
    record.save()
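
# Note: save_to_db() assumes an active connection; the __main__ block below
# calls mongoengine.connect("Youtube") (a local MongoDB by default) first.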


def append_to_df(section, title, channel, link, views, date):
    """Append a record to the DataFrame."""
    global df
    # DataFrame.append() was removed in pandas 2.0; concatenate a
    # one-row frame instead
    df = pd.concat([df, pd.DataFrame([{"section": section,
                                       "title": title,
                                       "channel": channel,
                                       "link": link,
                                       "views": views,
                                       "date": date}])],
                   ignore_index=True)


def save_to_csv():
    """Export the DataFrame to a CSV file."""
    df.to_csv("Youtube.csv", index=False, columns=["section", "title",
                                                   "channel", "link",
                                                   "views", "date"])
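
# Note: Youtube.csv is written to the current working directory and is
# overwritten on each run.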


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: at least one output format must be chosen
    if not (csv or mongo):
        print(usage)
        sys.exit()

    if mongo:
        mongoengine.connect("Youtube")

    driver = load_driver()  # Load the driver
    page_scrap(driver)      # Start scraping
    print("[+] Done!")
    # End the session
    driver.quit()
    sys.exit()