Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a1ca465

Browse files
committed
add youtube transcript summarizer tutorial
1 parent 83a5d14 commit a1ca465

File tree

4 files changed

+319
-0
lines changed

4 files changed

+319
-0
lines changed

‎README.md‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
200200
- [How to Extract Google Trends Data in Python](https://www.thepythoncode.com/article/extract-google-trends-data-in-python). ([code](web-scraping/extract-google-trends-data))
201201
- [How to Make a YouTube Video Downloader in Python](https://www.thepythoncode.com/article/make-a-youtube-video-downloader-in-python). ([code](web-scraping/youtube-video-downloader))
202202
- [How to Build a YouTube Audio Downloader in Python](https://www.thepythoncode.com/article/build-a-youtube-mp3-downloader-tkinter-python). ([code](web-scraping/youtube-mp3-downloader))
203+
- [YouTube Video Transcription Summarization with Python](https://thepythoncode.com/article/youtube-video-transcription-and-summarization-with-python). ([code](web-scraping/youtube-transcript-summarizer/))
203204

204205
- ### [Python Standard Library](https://www.thepythoncode.com/topic/python-standard-library)
205206
- [How to Transfer Files in the Network using Sockets in Python](https://www.thepythoncode.com/article/send-receive-files-using-sockets-python). ([code](general/transfer-files/))
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# [YouTube Video Transcription Summarization with Python](https://thepythoncode.com/article/youtube-video-transcription-and-summarization-with-python)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
nltk
2+
pytube
3+
youtube_transcript_api
4+
colorama
5+
openai
Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,312 @@
1+
import os
2+
import re
3+
import nltk
4+
import pytube
5+
import youtube_transcript_api
6+
from youtube_transcript_api import YouTubeTranscriptApi
7+
from nltk.corpus import stopwords
8+
from nltk.tokenize import sent_tokenize, word_tokenize
9+
from nltk.probability import FreqDist
10+
from heapq import nlargest
11+
from urllib.parse import urlparse, parse_qs
12+
import textwrap
13+
from colorama import Fore, Back, Style, init
14+
from openai import OpenAI
15+
16+
# Initialize colorama for cross-platform colored terminal output.
# autoreset=True restores the default style after every print() call.
init(autoreset=True)

# Download necessary NLTK data (tokenizer models and the stopword list).
# quiet=True suppresses the downloader's progress output.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Initialize the OpenAI-compatible client pointed at OpenRouter.
# The key is read from the OPENROUTER_API_KEY environment variable so that a
# real secret never has to be committed; the "<api_key>" placeholder is kept
# as the fallback and can still be edited in place.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY", "<api_key>"),
)
29+
30+
def extract_video_id(youtube_url):
    """Extract the video ID from a YouTube URL.

    Recognized forms (the ``www.`` prefix and the scheme are optional):

    * ``youtu.be/<id>``
    * ``youtube.com/watch?v=<id>`` (also ``m.`` and ``music.`` hosts)
    * ``youtube.com/embed/<id>``, ``/v/<id>``, ``/shorts/<id>``, ``/live/<id>``
    * ``youtube-nocookie.com/embed/<id>``

    Args:
        youtube_url: The URL string to parse.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: If the URL is not a recognized YouTube URL or carries no
            video ID.
    """
    url = youtube_url.strip()
    # urlparse() only populates the host when a scheme (or leading "//") is
    # present, so add one for inputs such as "youtube.com/watch?v=...".
    if '://' not in url and not url.startswith('//'):
        url = 'https://' + url
    parsed_url = urlparse(url)

    host = (parsed_url.hostname or '').lower()
    if host.startswith('www.'):
        host = host[len('www.'):]
    segments = [segment for segment in parsed_url.path.split('/') if segment]

    video_id = None
    if host == 'youtu.be':
        if segments:
            video_id = segments[0]
    elif host in ('youtube.com', 'm.youtube.com', 'music.youtube.com',
                  'youtube-nocookie.com'):
        if segments and segments[0] == 'watch':
            # .get() avoids leaking a KeyError when the "v" parameter is absent.
            video_id = parse_qs(parsed_url.query).get('v', [None])[0]
        elif len(segments) >= 2 and segments[0] in ('embed', 'v', 'shorts', 'live'):
            video_id = segments[1]

    if not video_id:
        raise ValueError(f"Could not extract video ID from URL: {youtube_url}")
    return video_id
47+
48+
def get_transcript(video_id):
    """Get the transcript of a YouTube video as one space-joined string.

    Works with both generations of ``youtube_transcript_api``: newer releases
    expose an instance method ``fetch()``, older ones only the static
    ``get_transcript()``.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The transcript text. On any failure a string starting with
        ``"Error retrieving transcript:"`` is returned instead of raising;
        the summarizer functions in this file detect that ``"Error"`` prefix.
    """
    try:
        if hasattr(YouTubeTranscriptApi, 'fetch'):
            # Newer API: fetch() returns a transcript object; to_raw_data()
            # converts it to the list-of-dicts shape the old API returned.
            entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        else:
            entries = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(entry['text'] for entry in entries)
    except Exception as e:
        return f"Error retrieving transcript: {str(e)}."
55+
56+
def summarize_text_nltk(text, num_sentences=5):
    """Build an extractive summary by ranking sentences on word frequency.

    Args:
        text: The text to summarize. Empty text, or text starting with
            "Error" or "Transcript not available", is returned unchanged.
        num_sentences: How many sentences the summary should keep.

    Returns:
        The selected sentences joined by spaces in their original order, or
        ``text`` itself when it has no more than ``num_sentences`` sentences.
    """
    # Pass empty input and upstream error messages straight through.
    if not text or text.startswith(("Error", "Transcript not available")):
        return text

    sentence_list = sent_tokenize(text)

    # Nothing to condense when the text is already short enough.
    if len(sentence_list) <= num_sentences:
        return text

    # Frequency table over alphanumeric, non-stopword tokens of the whole text.
    ignored = set(stopwords.words('english'))
    word_freq = FreqDist(
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in ignored
    )

    # A sentence's score is the summed frequency of its counted tokens;
    # sentences without any counted token get no entry at all.
    scores = {}
    for position, sentence in enumerate(sentence_list):
        hits = [
            word_freq[token]
            for token in word_tokenize(sentence.lower())
            if token in word_freq
        ]
        if hits:
            scores[position] = sum(hits)

    # Keep the best-scoring sentences, then restore document order.
    chosen = sorted(nlargest(num_sentences, scores, key=scores.get))
    return ' '.join(sentence_list[position] for position in chosen)
93+
94+
def summarize_text_ai(text, video_title, num_sentences=5):
    """Summarize text with a Mistral model through the module-level client.

    Args:
        text: The transcript to summarize. Empty text, or text starting with
            "Error" or "Transcript not available", is returned unchanged.
        video_title: Title of the video, included in the prompt as context.
        num_sentences: Approximate summary length requested from the model.

    Returns:
        The model's summary. If the request fails or the model returns no
        text, a string starting with ``"Error generating AI summary:"`` is
        returned instead of raising.
    """
    if not text or text.startswith(("Error", "Transcript not available")):
        return text

    # Truncate text if it's too long (models often have token limits)
    max_chars = 15000  # Adjust based on model's context window
    truncated_text = text[:max_chars]

    prompt = f"""Please provide a concise summary of the following YouTube video transcript.
Title: {video_title}

Transcript:
{truncated_text}

Create a clear, informative summary that captures the main points and key insights from the video.
Your summary should be approximately {num_sentences} sentences long.
"""

    try:
        completion = client.chat.completions.create(
            model="mistralai/mistral-small-3.1-24b-instruct:free",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )
        choices = getattr(completion, "choices", None)
        summary = choices[0].message.content if choices else None
    except Exception as e:
        return f"Error generating AI summary: {str(e)}"

    # A response can come back without any text; report that as an error
    # string so callers never receive None and crash on .startswith()/.split().
    if not summary or not summary.strip():
        return "Error generating AI summary: the model returned an empty response"
    return summary
131+
132+
def summarize_youtube_video(youtube_url, num_sentences=5):
    """Main function to summarize a YouTube video's transcription.

    Args:
        youtube_url: URL of the video to summarize.
        num_sentences: Target number of sentences for both summaries.

    Returns:
        On success, a dict with the keys ``video_title``, ``video_id``,
        ``ai_summary``, ``nltk_summary``, ``full_transcript_length``,
        ``nltk_summary_length`` and ``ai_summary_length`` (lengths are word
        counts; the AI length is 0 when the AI summary is an error message).
        On failure, a dict with the single key ``error``.
    """
    try:
        video_id = extract_video_id(youtube_url)
        transcript = get_transcript(video_id)

        # get_transcript() reports failures as an "Error..." string. Surface
        # that as an error result instead of summarizing the message, and
        # reject an empty transcript, which has nothing to summarize.
        if not transcript or transcript.startswith("Error"):
            return {"error": transcript or "Transcript is empty."}

        # Get video title for context; the title is optional, so any failure
        # here falls back to a placeholder rather than aborting.
        try:
            video_title = pytube.YouTube(youtube_url).title or "Unknown Title"
        except Exception:
            video_title = "Unknown Title"

        # Generate both summaries
        print(Fore.YELLOW + f"Generating AI summary with {num_sentences} sentences...")
        ai_summary = summarize_text_ai(transcript, video_title, num_sentences)

        print(Fore.YELLOW + f"Generating NLTK summary with {num_sentences} sentences...")
        nltk_summary = summarize_text_nltk(transcript, num_sentences)

        return {
            "video_title": video_title,
            "video_id": video_id,
            "ai_summary": ai_summary,
            "nltk_summary": nltk_summary,
            "full_transcript_length": len(transcript.split()),
            "nltk_summary_length": len(nltk_summary.split()),
            "ai_summary_length": len(ai_summary.split()) if not ai_summary.startswith("Error") else 0
        }
    except Exception as e:
        return {"error": str(e)}
165+
166+
def format_time(seconds):
    """Convert seconds to a readable time format.

    Args:
        seconds: Duration in seconds. Fractional values are truncated to
            whole seconds; negative values are formatted with a leading "-".

    Returns:
        A string such as ``"1h 2m 3s"``, ``"2m 3s"`` or ``"3s"``; leading
        zero-valued units are omitted.
    """
    total = int(seconds)
    # divmod() on a negative number floors toward -infinity and would give
    # misleading units, so format the magnitude and re-attach the sign.
    sign = "-" if total < 0 else ""
    hours, remainder = divmod(abs(total), 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{sign}{hours}h {minutes}m {secs}s"
    if minutes > 0:
        return f"{sign}{minutes}m {secs}s"
    return f"{sign}{secs}s"
177+
178+
def format_number(number):
    """Return ``number`` rendered with comma thousands separators."""
    return f"{number:,}"
181+
182+
def print_boxed_text(text, width=80, title=None, color=Fore.WHITE):
    """Print text in a box with an optional title in the top border.

    Every printed row, borders included, is exactly ``width`` characters wide.

    Args:
        text: The text to wrap and print inside the box.
        width: Total width of the box in characters.
        title: Optional text centered in the top border; truncated if it
            does not fit between the corners.
        color: Color prefix applied to every printed row.
    """
    inner_width = width - 2      # space between the two vertical borders
    content_width = width - 4    # inner width minus one space of margin per side

    wrapper = textwrap.TextWrapper(width=content_width)
    lines = wrapper.fill(text).split('\n')

    # Print top border with optional title
    if title:
        title = title[:inner_width]
        # Pad against the full inner width so this border lines up with the
        # bottom border.
        title_space = inner_width - len(title)
        left_padding = title_space // 2
        right_padding = title_space - left_padding
        print(color + '┌' + '─' * left_padding + title + '─' * right_padding + '┐')
    else:
        print(color + '┌' + '─' * inner_width + '┐')

    # Print content: border + margin + padded text + margin + border == width
    for line in lines:
        print(color + '│ ' + line.ljust(content_width) + ' │')

    # Print bottom border
    print(color + '└' + '─' * inner_width + '┘')
204+
205+
def print_summary_result(result, width=80):
    """Print the summary result in a formatted layout.

    Args:
        result: The dict returned by ``summarize_youtube_video``. If it
            contains an ``error`` key, only a red error box is printed.
        width: Width in characters used for rules, centering and wrapping.
    """
    if "error" in result:
        print_boxed_text(f"Error: {result['error']}", width=width, title="ERROR", color=Fore.RED)
        return

    terminal_width = width
    total_words = result['full_transcript_length']

    # Print header with video information
    print("\n" + Fore.CYAN + "=" * terminal_width)
    print(Fore.CYAN + Style.BRIGHT + result['video_title'].center(terminal_width))
    print(Fore.CYAN + "=" * terminal_width + "\n")

    # Video metadata section
    print(Fore.YELLOW + Style.BRIGHT + "VIDEO INFORMATION".center(terminal_width))
    print(Fore.YELLOW + "─" * terminal_width)

    # Two-column layout for metadata: the ID is padded to the column width so
    # the URL starts in the second column.
    col_width = terminal_width // 2 - 2
    print(f"{Fore.GREEN}Video ID: {Fore.WHITE}{result['video_id']:<{col_width}}"
          f"{Fore.GREEN}URL: {Fore.WHITE}https://youtu.be/{result['video_id']}")

    print(Fore.YELLOW + "─" * terminal_width + "\n")

    # Shared wrapper for both summary bodies
    wrapper = textwrap.TextWrapper(width=terminal_width-4,
                                   initial_indent=' ',
                                   subsequent_indent=' ')

    # AI Summary section
    ai_words = result['ai_summary_length']
    ai_summary_title = (f" AI SUMMARY ({ai_words} words, condensed "
                        f"{_condensed_label(ai_words, total_words)} from {total_words} words) ")

    print(Fore.GREEN + Style.BRIGHT + ai_summary_title.center(terminal_width))
    print(Fore.GREEN + "─" * terminal_width)

    # Split AI summary into paragraphs and print each
    for paragraph in result['ai_summary'].split('\n'):
        if paragraph.strip():  # Skip empty paragraphs
            print(wrapper.fill(paragraph))
            print()  # Empty line between paragraphs

    print(Fore.GREEN + "─" * terminal_width + "\n")

    # NLTK Summary section
    nltk_words = result['nltk_summary_length']
    nltk_summary_title = (f" NLTK SUMMARY ({nltk_words} words, condensed "
                          f"{_condensed_label(nltk_words, total_words)} from {total_words} words) ")

    print(Fore.MAGENTA + Style.BRIGHT + nltk_summary_title.center(terminal_width))
    print(Fore.MAGENTA + "─" * terminal_width)

    # Group the NLTK summary's sentences into short paragraphs and print each
    for paragraph in _group_sentences(result['nltk_summary']):
        print(wrapper.fill(paragraph))
        print()  # Empty line between paragraphs

    print(Fore.MAGENTA + "─" * terminal_width + "\n")


def _condensed_label(summary_words, total_words):
    """Return the size reduction as e.g. ``"87%"``, or ``"N/A"`` if undefined."""
    # Guard the division: an empty transcript or summary has no meaningful ratio.
    if summary_words <= 0 or total_words <= 0:
        return "N/A"
    return f"{round((1 - summary_words / total_words) * 100)}%"


def _group_sentences(text, max_chars=150):
    """Split text into sentences and pack them into paragraphs of about max_chars."""
    # Split after ., ! or ? followed by whitespace, keeping the punctuation,
    # so a sentence ending in "?" or "!" does not get a "." appended to it.
    sentences = [part.strip() for part in re.split(r'(?<=[.!?])\s+', text) if part.strip()]

    paragraphs = []
    current = ""
    for sentence in sentences:
        if not current:
            current = sentence
        elif len(current) + len(sentence) + 1 <= max_chars:
            current += " " + sentence
        else:
            paragraphs.append(current)
            current = sentence

    if current:
        paragraphs.append(current)
    return paragraphs
288+
289+
290+
if __name__ == "__main__":
    # Get terminal width, clamped to the 80-120 range. get_terminal_size()
    # fails when output is not attached to a terminal (e.g. piped), so fall
    # back to 80 columns in that case.
    try:
        terminal_width = max(80, min(os.get_terminal_size().columns, 120))
    except (OSError, ValueError):
        terminal_width = 80  # Default if can't determine

    # Print welcome banner
    print(Fore.CYAN + Style.BRIGHT + "\n" + "=" * terminal_width)
    print(Fore.CYAN + Style.BRIGHT + "YOUTUBE VIDEO SUMMARIZER".center(terminal_width))
    print(Fore.CYAN + Style.BRIGHT + "=" * terminal_width + "\n")

    youtube_url = input(Fore.GREEN + "Enter YouTube video URL: " + Fore.WHITE).strip()

    num_sentences_input = input(Fore.GREEN + "Enter number of sentences for summaries (default 5): " + Fore.WHITE)

    # Fall back to the default on blank, non-numeric or non-positive input
    # instead of crashing with a traceback.
    num_sentences = 5
    if num_sentences_input.strip():
        try:
            num_sentences = int(num_sentences_input)
        except ValueError:
            print(Fore.RED + "Invalid number entered; using the default of 5.")
        else:
            if num_sentences < 1:
                print(Fore.RED + "Number of sentences must be at least 1; using the default of 5.")
                num_sentences = 5

    print(Fore.YELLOW + "\nFetching and analyzing video transcript... Please wait...\n")

    result = summarize_youtube_video(youtube_url, num_sentences)
    print_summary_result(result, width=terminal_width)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /