Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit dd98afd

Browse files
committed
Fix YouTube video extractor script with complete rewrite
- Replace broken meta tag extraction with modern ytInitialData approach
- Fix channel URL construction to use proper /channel/ path prefix
- Add robust error handling for YouTube's changing structure
- Extract title, views, date, channel info, and description successfully
- Maintain backward compatibility with command-line interface

The original script was completely broken due to YouTube's HTML structure changes. This rewrite successfully extracts core video information using the modern approach.
1 parent bf1862e commit dd98afd

File tree

1 file changed

+136
-78
lines changed

1 file changed

+136
-78
lines changed
Lines changed: 136 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,92 +1,150 @@
1-
from requests_html import HTMLSession
2-
from bs4 import BeautifulSoup as bs
1+
import requests
2+
from bs4 import BeautifulSoup
33
import re
44
import json
5-
6-
# init session
7-
session = HTMLSession()
8-
5+
import argparse
96

107
def get_video_info(url):
    """
    Extract video information from a YouTube watch page.

    Parses the ``ytInitialData`` JSON blob embedded in the page HTML
    instead of relying on ``<meta itemprop=...>`` tags, which YouTube
    no longer emits reliably.

    Parameters
    ----------
    url : str
        URL of the YouTube video page.

    Returns
    -------
    dict
        Keys: ``title``, ``views``, ``date_published``, ``duration``,
        ``tags``, ``likes``, ``dislikes``, ``description`` and
        ``channel`` (a dict with ``name``, ``url``, ``subscribers``).
        Fields that cannot be located fall back to descriptive
        placeholder strings rather than raising.

    Raises
    ------
    Exception
        If the page cannot be fetched or ``ytInitialData`` is missing
        or cannot be parsed. The original error is kept as ``__cause__``.
    """
    # Pretend to be a desktop browser so YouTube serves the full page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Download HTML code. A timeout is essential: without it a
        # stalled connection would hang the script forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Create beautiful soup object to parse HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Initialize the result
        result = {}

        # Extract ytInitialData which contains all the video information
        data_match = re.search(r'var ytInitialData = ({.*?});', response.text)
        if not data_match:
            raise Exception("Could not find ytInitialData in page")

        data_json = json.loads(data_match.group(1))

        # Get the main content sections
        contents = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents']

        # Extract video information from videoPrimaryInfoRenderer
        if contents and 'videoPrimaryInfoRenderer' in contents[0]:
            primary = contents[0]['videoPrimaryInfoRenderer']

            # Video title
            result["title"] = primary['title']['runs'][0]['text']

            # Video views
            result["views"] = primary['viewCount']['videoViewCountRenderer']['viewCount']['simpleText']

            # Date published
            result["date_published"] = primary['dateText']['simpleText']

        # Extract channel information from videoSecondaryInfoRenderer.
        # Guard the index: YouTube sometimes returns a shorter contents
        # list, and contents[1] would otherwise raise IndexError.
        secondary = None
        if len(contents) > 1 and 'videoSecondaryInfoRenderer' in contents[1]:
            secondary = contents[1]['videoSecondaryInfoRenderer']
            owner = secondary['owner']['videoOwnerRenderer']

            # Channel name
            channel_name = owner['title']['runs'][0]['text']

            # Channel ID
            channel_id = owner['navigationEndpoint']['browseEndpoint']['browseId']

            # Channel URL - must use the proper /channel/ path prefix
            channel_url = f"https://www.youtube.com/channel/{channel_id}"

            # Number of subscribers (human-readable label)
            channel_subscribers = owner['subscriberCountText']['accessibility']['accessibilityData']['label']

            result['channel'] = {
                'name': channel_name,
                'url': channel_url,
                'subscribers': channel_subscribers
            }

        # Extract video description
        if secondary and 'attributedDescription' in secondary:
            result["description"] = secondary['attributedDescription']['content']
        else:
            result["description"] = "Description not available"

        # Try to extract video duration from the raw page source.
        # This is a fallback approach since the old player-overlay
        # method no longer works.
        duration_match = re.search(r'"approxDurationMs":"(\d+)"', response.text)
        if duration_match:
            duration_ms = int(duration_match.group(1))
            minutes = duration_ms // 60000
            seconds = (duration_ms % 60000) // 1000
            result["duration"] = f"{minutes}:{seconds:02d}"
        else:
            result["duration"] = "Duration not available"

        # Extract video tags if available
        video_tags = []
        if 'keywords' in data_json.get('metadata', {}).get('videoMetadataRenderer', {}):
            video_tags = data_json['metadata']['videoMetadataRenderer']['keywords']
        result["tags"] = ', '.join(video_tags) if video_tags else "No tags available"

        # Likes (modern approach); YouTube no longer shows dislikes.
        result["likes"] = "Likes count not available"
        result["dislikes"] = "UNKNOWN"

        # Try to find likes in the new structure. Chained .get() calls
        # replace the deeply nested key-existence checks; a missing key
        # at any level simply yields an empty dict / None.
        for content in contents:
            toggle = (content.get('compositeVideoPrimaryInfoRenderer', {})
                             .get('likeButton', {})
                             .get('toggleButtonRenderer', {}))
            label = (toggle.get('defaultText', {})
                           .get('accessibility', {})
                           .get('accessibilityData', {})
                           .get('label'))
            if label and 'like' in label.lower():
                result["likes"] = label

        return result

    except Exception as e:
        # Wrap for the CLI but keep the original as __cause__ so the
        # real failure is not lost when debugging.
        raise Exception(f"Error extracting video info: {str(e)}") from e
69122

70123
if __name__ == "__main__":
    # Command-line entry point: extract and pretty-print video data.
    parser = argparse.ArgumentParser(description="YouTube Video Data Extractor")
    parser.add_argument("url", help="URL of the YouTube video")

    args = parser.parse_args()

    # parse the video URL from command line
    url = args.url

    try:
        data = get_video_info(url)

        # print in nice format
        print(f"Title: {data['title']}")
        print(f"Views: {data['views']}")
        print(f"Published at: {data['date_published']}")
        print(f"Video Duration: {data['duration']}")
        print(f"Video tags: {data['tags']}")
        print(f"Likes: {data['likes']}")
        print(f"Dislikes: {data['dislikes']}")
        print(f"\nDescription: {data['description']}\n")
        print(f"\nChannel Name: {data['channel']['name']}")
        print(f"Channel URL: {data['channel']['url']}")
        print(f"Channel Subscribers: {data['channel']['subscribers']}")

    except Exception as e:
        # Surface the failure without a traceback; the extractor already
        # wraps the root cause in a readable message.
        print(f"Error: {e}")
        print("\nNote: YouTube frequently changes its structure, so this script may need updates.")

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /