1
- from requests_html import HTMLSession
2
- from bs4 import BeautifulSoup as bs
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
3
import re
4
4
import json
5
-
6
- # init session
7
- session = HTMLSession ()
8
-
5
+ import argparse
9
6
10
7
def _dig(node, *path, default=None):
    """Follow `path` (dict keys / list indexes) through `node`; return `default` on any miss."""
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return default
    return node


def _text(node, default=None):
    """Return the display text of a text node: its `simpleText`, else all `runs` joined."""
    if not isinstance(node, dict):
        return default
    simple = node.get("simpleText")
    if isinstance(simple, str):
        return simple
    runs = node.get("runs")
    if isinstance(runs, list):
        # A text can be split over several runs; keep every piece, not just the first.
        joined = "".join(
            run["text"]
            for run in runs
            if isinstance(run, dict) and isinstance(run.get("text"), str)
        )
        if joined:
            return joined
    return default


def _find_renderer(contents, key):
    """Return the value stored under `key` in the first item of `contents` that has it, or None."""
    for item in contents:
        if isinstance(item, dict) and isinstance(item.get(key), dict):
            return item[key]
    return None


def _extract_initial_data(html):
    """Parse the `var ytInitialData = {...}` object embedded in `html` and return it."""
    marker = re.search(r"var ytInitialData\s*=\s*(?=\{)", html)
    if not marker:
        raise Exception("Could not find ytInitialData in page")
    # Let the JSON decoder find the end of the object. A non-greedy regex up to
    # the first "};" stops too early whenever that sequence occurs inside a
    # string value (for example in the description).
    data, _ = json.JSONDecoder().raw_decode(html[marker.end():])
    return data


def _format_duration(duration_ms):
    """Format milliseconds as `M:SS`, or `H:MM:SS` once the duration reaches an hour."""
    hours, remainder = divmod(duration_ms // 1000, 3600)
    minutes, seconds = divmod(remainder, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{seconds:02d}"
    return f"{minutes}:{seconds:02d}"


def get_video_info(url, *, html=None, timeout=30):
    """Extract video and channel details from a YouTube watch page.

    The details are read from the `ytInitialData` JSON object embedded in the
    page source. Fields that cannot be located are filled with a
    "... not available" placeholder, so every key listed below is always
    present in the result.

    Args:
        url: URL of the video page. It is downloaded only when `html` is None.
        html: Optional page source that was already fetched; when given, no
            network request is made.
        timeout: Seconds to wait for the server before the download fails.

    Returns:
        A dict with the string values `title`, `views`, `date_published`,
        `description`, `duration`, `tags`, `likes`, `dislikes`, and a
        `channel` dict holding `name`, `url` and `subscribers`.

    Raises:
        Exception: If the page cannot be downloaded or `ytInitialData` cannot
            be found or parsed. The underlying error is chained as
            `__cause__`.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        if html is None:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            html = response.text

        data_json = _extract_initial_data(html)

        contents = _dig(
            data_json,
            'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents',
        )
        if not isinstance(contents, list):
            raise Exception("Could not find watch page contents in ytInitialData")

        # Look the renderers up by key instead of assuming fixed positions.
        primary = _find_renderer(contents, 'videoPrimaryInfoRenderer') or {}
        secondary = _find_renderer(contents, 'videoSecondaryInfoRenderer') or {}
        owner = _dig(secondary, 'owner', 'videoOwnerRenderer', default={})

        result = {}

        # Video title, views and publication date
        result["title"] = _text(_dig(primary, 'title'), "Title not available")
        result["views"] = _text(
            _dig(primary, 'viewCount', 'videoViewCountRenderer', 'viewCount'),
            "Views not available",
        )
        result["date_published"] = _text(_dig(primary, 'dateText'), "Date not available")

        # Channel details
        channel_id = _dig(owner, 'navigationEndpoint', 'browseEndpoint', 'browseId')
        subscriber_node = _dig(owner, 'subscriberCountText')
        result['channel'] = {
            'name': _text(_dig(owner, 'title'), "Channel name not available"),
            'url': (
                f"https://www.youtube.com/channel/{channel_id}"
                if channel_id else "Channel URL not available"
            ),
            # Prefer the accessibility label; fall back to the visible text.
            'subscribers': (
                _dig(subscriber_node, 'accessibility', 'accessibilityData', 'label')
                or _text(subscriber_node, "Subscriber count not available")
            ),
        }

        # Video description
        result["description"] = _dig(
            secondary, 'attributedDescription', 'content',
            default="Description not available",
        )

        # Video duration, taken from the first approxDurationMs value in the page
        duration_match = re.search(r'"approxDurationMs":"(\d+)"', html)
        if duration_match:
            result["duration"] = _format_duration(int(duration_match.group(1)))
        else:
            result["duration"] = "Duration not available"

        # Video tags, if the page carries any
        keywords = _dig(data_json, 'metadata', 'videoMetadataRenderer', 'keywords')
        if isinstance(keywords, list) and keywords:
            result["tags"] = ', '.join(str(tag) for tag in keywords)
        else:
            result["tags"] = "No tags available"

        # Likes: use the like button's accessibility label when it is present
        result["likes"] = "Likes count not available"
        result["dislikes"] = "UNKNOWN"  # YouTube no longer shows dislikes
        for item in contents:
            label = _dig(
                item,
                'compositeVideoPrimaryInfoRenderer', 'likeButton', 'toggleButtonRenderer',
                'defaultText', 'accessibility', 'accessibilityData', 'label',
            )
            if isinstance(label, str) and 'like' in label.lower():
                result["likes"] = label

        return result

    except Exception as e:
        # Keep the original error reachable through __cause__ for debugging.
        raise Exception(f"Error extracting video info: {str(e)}") from e
69
122
70
123
if __name__ == "__main__":
    # Command-line entry point: fetch the details of one video and print a report.
    import sys

    parser = argparse.ArgumentParser(description="YouTube Video Data Extractor")
    parser.add_argument("url", help="URL of the YouTube video")
    args = parser.parse_args()

    # Guard only the extraction call. Failures are reported on stderr with a
    # non-zero exit status so that shells and scripts can detect them.
    try:
        data = get_video_info(args.url)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        print(
            "\nNote: YouTube frequently changes its structure, so this script may need updates.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Read fields with a fallback so that one missing key cannot abort the report.
    missing = "N/A"
    channel = data.get('channel') or {}

    # print in nice format
    print(f"Title: {data.get('title', missing)}")
    print(f"Views: {data.get('views', missing)}")
    print(f"Published at: {data.get('date_published', missing)}")
    print(f"Video Duration: {data.get('duration', missing)}")
    print(f"Video tags: {data.get('tags', missing)}")
    print(f"Likes: {data.get('likes', missing)}")
    print(f"Dislikes: {data.get('dislikes', missing)}")
    print(f"\nDescription: {data.get('description', missing)}\n")
    print(f"\nChannel Name: {channel.get('name', missing)}")
    print(f"Channel URL: {channel.get('url', missing)}")
    print(f"Channel Subscribers: {channel.get('subscribers', missing)}")
0 commit comments