Skip to content

Instantly share code, notes, and snippets.

@disler
Created January 6, 2025 01:16
Show Gist options
  • Save disler/884092163873301fe30d714837c767a4 to your computer and use it in GitHub Desktop.
Save disler/884092163873301fe30d714837c767a4 to your computer and use it in GitHub Desktop.
YT Transcript Download
import html
import os
import re
import subprocess
import tempfile
# Inline WebVTT markup such as <c>, </c>, <v Speaker> or <00:00:01.000>.
_VTT_TAG_RE = re.compile(r'<[^>]*>')


def _vtt_to_text(content: str) -> str:
    """Return the caption text of a WebVTT document as one space-joined string."""
    pieces = []
    in_cue = False
    for raw in content.splitlines():
        if not raw:
            # A truly empty line ends the current cue (or header/NOTE block).
            in_cue = False
            continue
        if '-->' in raw:
            # Timing line: everything until the next empty line is cue text.
            in_cue = True
            continue
        if not in_cue:
            # Header lines, NOTE/STYLE blocks and cue identifiers.
            continue
        text = html.unescape(_VTT_TAG_RE.sub('', raw)).strip()
        # Auto-generated captions repeat the previous line in each new cue;
        # collapse consecutive duplicates so the transcript reads once.
        if text and (not pieces or pieces[-1] != text):
            pieces.append(text)
    return ' '.join(pieces)


def download_yt_script(url: str) -> str:
    """
    Download the English subtitles of a YouTube video and return them as text.

    Runs ``yt-dlp`` to fetch manual or auto-generated English subtitles in
    WebVTT format into a temporary directory, then strips headers, timing
    lines, inline markup and consecutive duplicate lines.

    Credit:
    Original Code: https://github.com/davidgasquez/dotfiles/blob/bb9df4a369dbaef95ca0c35642de491c7dd41269/shell/zshrc#L75
    Simonw Blog: https://simonwillison.net/2024/Dec/19/

    Args:
        url: URL of the video to fetch subtitles for.

    Returns:
        The transcript as a single string with caption lines joined by spaces.

    Raises:
        RuntimeError: If ``yt-dlp`` is missing or fails, if no subtitle file
            is produced, or if the subtitle file contains no caption text.
    """
    try:
        # A private directory avoids picking up (or deleting) stale
        # temp_subtitle*.vtt files in the working directory and guarantees
        # cleanup even when reading fails.
        with tempfile.TemporaryDirectory() as work_dir:
            # '%' is special in yt-dlp output templates, so escape it in the path.
            output_template = os.path.join(
                work_dir.replace('%', '%%'), 'temp_subtitle'
            )
            subtitle_cmd = [
                'yt-dlp',
                '--quiet',
                '--skip-download',
                '--write-sub',
                '--sub-langs', 'en',
                '--write-auto-sub',
                '--convert-subs', 'vtt',
                '--output', output_template,
                '--',  # stop option parsing so a URL starting with '-' is not an option
                url,
            ]
            subprocess.run(subtitle_cmd, check=True)

            subtitle_files = sorted(
                name for name in os.listdir(work_dir)
                if name.startswith('temp_subtitle') and name.endswith('.vtt')
            )
            if not subtitle_files:
                raise RuntimeError("No subtitle file found")

            subtitle_path = os.path.join(work_dir, subtitle_files[0])
            with open(subtitle_path, 'r', encoding='utf-8') as f:
                content = f.read()
    except (subprocess.CalledProcessError, OSError) as e:
        raise RuntimeError(f"Failed to download YouTube script: {str(e)}") from e

    processed_content = _vtt_to_text(content)
    if not processed_content:
        raise RuntimeError("No transcript content found")
    return processed_content
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment