# YouTube Trending Feed Scraper
# Written by XZANATOL
from optparse import OptionParser
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import mongoengine
import time
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
  -h, --help   Show this help message and exit.
  -c, --csv    Save extracted contents to a CSV file.
  -m, --mongo  Save extracted contents to a MongoDB collection.
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv",
                  help="Saves extracted contents to a CSV file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo",
                  help="Saves extracted contents to a MongoDB.")
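# Example invocation (the script filename here is illustrative):
#   python youtube_trending.py --csv --mongo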

# Module-level DataFrame, defined up front so append_to_df() can extend it
df = pd.DataFrame()

# MongoDB Collection (Table) Template
class Trending(mongoengine.Document):
    section = mongoengine.StringField(required=True)
    title = mongoengine.StringField(required=True)
    channel = mongoengine.StringField(required=True)
    link = mongoengine.StringField(required=True)
    views = mongoengine.StringField(required=True)
    date = mongoengine.StringField(required=True)

    meta = {"indexes": ["section"]}
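
# Note: mongoengine derives the collection name from the class name
# ("trending") and ensures the "section" index exists when documents
# are first saved, which speeds up per-section lookups.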


def load_driver():
    """Load a Chrome webdriver."""
    # Assumes Selenium 4.6+: Selenium Manager resolves a matching
    # chromedriver automatically, so no driver path is passed here.
    driver = webdriver.Chrome()
    return driver


def page_scrap(driver):
    """Scrape the YouTube trending feed."""
    # Pages to be scraped: Now, Music, Gaming, Movies
    pages = ["https://www.youtube.com/feed/trending",
             "https://www.youtube.com/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D"]
    sections = ["Now", "Music", "Gaming", "Movies"]
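    # The "bp" query parameters are opaque tokens that select the Music,
    # Gaming, and Movies tabs; YouTube may change them over time.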

    for num in range(4):
        driver.get(pages[num])
        time.sleep(3)  # Give the page time to finish loading
        # Extract the first 10 entries; zip() below keeps links and
        # metadata blocks in step even if fewer than 10 are rendered
        links = driver.find_elements(By.ID, "video-title")[:10]
        meta_data = driver.find_elements(By.TAG_NAME, "ytd-video-meta-block")[:10]
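        # Each metadata block's text splits into channel, views, and date,
        # e.g. (observed; may vary with YouTube's markup or locale):
        #   "ChannelName\n1.2M views\n3 days ago"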
        for link_el, meta in zip(links, meta_data):
            # Split the metadata that will be saved
            meta_splitted = meta.text.split("\n")
            # A stray "•" separator sometimes appears in the block
            try:
                meta_splitted.remove("•")
            except ValueError:
                pass
            section = sections[num]               # Section this entry came from
            link = link_el.get_attribute("href")  # Video link
            title = link_el.text                  # Video title
            channel = meta_splitted[0]            # Channel name
            views = meta_splitted[1]              # Video views
            date = meta_splitted[2]               # Release date

            # Validating the flags here beats writing a separate scraping
            # algorithm for each output format
            if mongo:
                save_to_db(section, title, channel, link, views, date)
            if csv:
                append_to_df(section, title, channel, link, views, date)

        print(f"[+] Finished scraping the '{sections[num]}' section!")

    # Last step for CSV output: dump the accumulated DataFrame
    if csv:
        save_to_csv()


def save_to_db(section, title, channel, link, views, date):
    """Save a record to the database."""
    # Create the document
    record = Trending(
        section=section,
        title=title,
        channel=channel,
        link=link,
        views=views,
        date=date)
    # Save the record
    record.save()
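
# Note: save_to_db() assumes an active connection; the __main__ block below
# calls mongoengine.connect("Youtube") (a local MongoDB by default) first.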


def append_to_df(section, title, channel, link, views, date):
    """Append a record to the DataFrame."""
    global df
    # DataFrame.append() was removed in pandas 2.0; concatenate a
    # one-row frame instead
    df = pd.concat([df, pd.DataFrame([{"section": section,
                                       "title": title,
                                       "channel": channel,
                                       "link": link,
                                       "views": views,
                                       "date": date}])],
                   ignore_index=True)


def save_to_csv():
    """Export the DataFrame to a CSV file."""
    df.to_csv("Youtube.csv", index=False, columns=["section", "title",
                                                   "channel", "link",
                                                   "views", "date"])
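
# Note: Youtube.csv is written to the current working directory and is
# overwritten on each run.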


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: at least one output format must be chosen
    if not (csv or mongo):
        print(usage)
        sys.exit()

    if mongo:
        mongoengine.connect("Youtube")

    driver = load_driver()  # Load the driver
    page_scrap(driver)      # Start scraping
    print("[+] Done!")
    # End the session
    driver.quit()
    sys.exit()