Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit dd98afd

Browse files
committed
Fix YouTube video extractor script with complete rewrite
- Replace broken meta tag extraction with modern ytInitialData approach
- Fix channel URL construction to use proper /channel/ path prefix
- Add robust error handling for YouTube's changing structure
- Extract title, views, date, channel info, and description successfully
- Maintain backward compatibility with command-line interface

The original script was completely broken due to YouTube's HTML structure changes. This rewrite successfully extracts core video information using the modern approach.
1 parent bf1862e commit dd98afd

File tree

1 file changed

+136
-78
lines changed

1 file changed

+136
-78
lines changed
Lines changed: 136 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,92 +1,150 @@
1-
from requests_html import HTMLSession
2-
from bs4 import BeautifulSoup as bs
1+
import requests
2+
from bs4 import BeautifulSoup
33
import re
44
import json
5-
6-
# init session
7-
session = HTMLSession()
8-
5+
import argparse
96

107
def get_video_info(url):
    """
    Extract video information from a YouTube watch page.

    Parses the ``ytInitialData`` JSON blob embedded in the page HTML
    instead of relying on ``<meta itemprop=...>`` tags, which YouTube
    no longer emits reliably.

    Parameters
    ----------
    url : str
        URL of the YouTube video page.

    Returns
    -------
    dict
        Keys: ``title``, ``views``, ``date_published``, ``duration``,
        ``tags``, ``likes``, ``dislikes``, ``description`` and
        ``channel`` (a dict with ``name``, ``url``, ``subscribers``).
        Fields that cannot be located fall back to descriptive
        placeholder strings rather than raising.

    Raises
    ------
    Exception
        If the page cannot be fetched or ``ytInitialData`` is missing
        or cannot be parsed. The original error is kept as ``__cause__``.
    """
    # Pretend to be a desktop browser so YouTube serves the full page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Download HTML code. A timeout is essential: without it a
        # stalled connection would hang the script forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Create beautiful soup object to parse HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Initialize the result
        result = {}

        # Extract ytInitialData which contains all the video information
        data_match = re.search(r'var ytInitialData = ({.*?});', response.text)
        if not data_match:
            raise Exception("Could not find ytInitialData in page")

        data_json = json.loads(data_match.group(1))

        # Get the main content sections
        contents = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents']

        # Extract video information from videoPrimaryInfoRenderer
        if contents and 'videoPrimaryInfoRenderer' in contents[0]:
            primary = contents[0]['videoPrimaryInfoRenderer']

            # Video title
            result["title"] = primary['title']['runs'][0]['text']

            # Video views
            result["views"] = primary['viewCount']['videoViewCountRenderer']['viewCount']['simpleText']

            # Date published
            result["date_published"] = primary['dateText']['simpleText']

        # Extract channel information from videoSecondaryInfoRenderer.
        # Guard the index: YouTube sometimes returns a shorter contents
        # list, and contents[1] would otherwise raise IndexError.
        secondary = None
        if len(contents) > 1 and 'videoSecondaryInfoRenderer' in contents[1]:
            secondary = contents[1]['videoSecondaryInfoRenderer']
            owner = secondary['owner']['videoOwnerRenderer']

            # Channel name
            channel_name = owner['title']['runs'][0]['text']

            # Channel ID
            channel_id = owner['navigationEndpoint']['browseEndpoint']['browseId']

            # Channel URL - must use the proper /channel/ path prefix
            channel_url = f"https://www.youtube.com/channel/{channel_id}"

            # Number of subscribers (human-readable label)
            channel_subscribers = owner['subscriberCountText']['accessibility']['accessibilityData']['label']

            result['channel'] = {
                'name': channel_name,
                'url': channel_url,
                'subscribers': channel_subscribers
            }

        # Extract video description
        if secondary and 'attributedDescription' in secondary:
            result["description"] = secondary['attributedDescription']['content']
        else:
            result["description"] = "Description not available"

        # Try to extract video duration from the raw page source.
        # This is a fallback approach since the old player-overlay
        # method no longer works.
        duration_match = re.search(r'"approxDurationMs":"(\d+)"', response.text)
        if duration_match:
            duration_ms = int(duration_match.group(1))
            minutes = duration_ms // 60000
            seconds = (duration_ms % 60000) // 1000
            result["duration"] = f"{minutes}:{seconds:02d}"
        else:
            result["duration"] = "Duration not available"

        # Extract video tags if available
        video_tags = []
        if 'keywords' in data_json.get('metadata', {}).get('videoMetadataRenderer', {}):
            video_tags = data_json['metadata']['videoMetadataRenderer']['keywords']
        result["tags"] = ', '.join(video_tags) if video_tags else "No tags available"

        # Likes (modern approach); YouTube no longer shows dislikes.
        result["likes"] = "Likes count not available"
        result["dislikes"] = "UNKNOWN"

        # Try to find likes in the new structure. Chained .get() calls
        # replace the deeply nested key-existence checks; a missing key
        # at any level simply yields an empty dict / None.
        for content in contents:
            toggle = (content.get('compositeVideoPrimaryInfoRenderer', {})
                             .get('likeButton', {})
                             .get('toggleButtonRenderer', {}))
            label = (toggle.get('defaultText', {})
                           .get('accessibility', {})
                           .get('accessibilityData', {})
                           .get('label'))
            if label and 'like' in label.lower():
                result["likes"] = label

        return result

    except Exception as e:
        # Wrap for the CLI but keep the original as __cause__ so the
        # real failure is not lost when debugging.
        raise Exception(f"Error extracting video info: {str(e)}") from e
69122

70123
if __name__ == "__main__":
    # Command-line entry point: extract and pretty-print video data.
    parser = argparse.ArgumentParser(description="YouTube Video Data Extractor")
    parser.add_argument("url", help="URL of the YouTube video")

    args = parser.parse_args()

    # parse the video URL from command line
    url = args.url

    try:
        data = get_video_info(url)

        # print in nice format
        print(f"Title: {data['title']}")
        print(f"Views: {data['views']}")
        print(f"Published at: {data['date_published']}")
        print(f"Video Duration: {data['duration']}")
        print(f"Video tags: {data['tags']}")
        print(f"Likes: {data['likes']}")
        print(f"Dislikes: {data['dislikes']}")
        print(f"\nDescription: {data['description']}\n")
        print(f"\nChannel Name: {data['channel']['name']}")
        print(f"Channel URL: {data['channel']['url']}")
        print(f"Channel Subscribers: {data['channel']['subscribers']}")

    except Exception as e:
        # Surface the failure without a traceback; the extractor already
        # wraps the root cause in a readable message.
        print(f"Error: {e}")
        print("\nNote: YouTube frequently changes its structure, so this script may need updates.")

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /