Created
January 6, 2025 01:16
-
-
Save disler/884092163873301fe30d714837c767a4 to your computer and use it in GitHub Desktop.
YT Transcript Download
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import subprocess
import tempfile
# Matches WebVTT inline markup such as <c>, </c> and <00:00:01.000> word timings.
_VTT_TAG_RE = re.compile(r'<[^>]*>')

# Prefixes of the header lines found at the top of a .vtt file.
_VTT_HEADER_PREFIXES = ('WEBVTT', 'Kind:', 'Language:')


def _vtt_to_text(content: str) -> str:
    """Reduce the text of a WebVTT file to a single line of plain transcript."""
    raw_lines = content.splitlines()
    spoken = []
    for index, raw in enumerate(raw_lines):
        # Cue timing lines look like "00:00:01.000 --> 00:00:03.000 ...".
        if '-->' in raw:
            continue
        text = _VTT_TAG_RE.sub('', raw).strip()
        if not text or text.startswith(_VTT_HEADER_PREFIXES):
            continue
        # A bare number directly above a timing line is a cue identifier,
        # not speech; any other line containing digits is kept.
        if (
            text.isdigit()
            and index + 1 < len(raw_lines)
            and '-->' in raw_lines[index + 1]
        ):
            continue
        # Rolling auto-captions repeat the previous line in the next cue;
        # collapse those consecutive repeats.
        if spoken and spoken[-1] == text:
            continue
        spoken.append(text)
    return ' '.join(spoken)


def download_yt_script(url: str) -> str:
    """
    Download and extract script from YouTube video

    Runs ``yt-dlp`` to fetch the English subtitles (manual or auto-generated)
    as WebVTT into a private temporary directory, then strips headers, cue
    timings, inline tags and consecutive repeated lines.

    Args:
        url: URL of the video, passed to ``yt-dlp`` unchanged.

    Returns:
        The transcript as one space-joined string.

    Raises:
        RuntimeError: if ``yt-dlp`` fails or cannot be started, if no ``.vtt``
            file is produced, or if the file contains no transcript text.

    Credit:
    Original Code: https://github.com/davidgasquez/dotfiles/blob/bb9df4a369dbaef95ca0c35642de491c7dd41269/shell/zshrc#L75
    Simonw Blog: https://simonwillison.net/2024/Dec/19/
    """
    try:
        # A private directory avoids picking up stale files, avoids clashes
        # between concurrent calls, and is removed even when an error occurs.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # '%' introduces a field in yt-dlp output templates, so escape any
            # literal '%' that happens to be in the temporary path.
            output_template = os.path.join(tmp_dir, 'temp_subtitle').replace('%', '%%')
            subtitle_cmd = [
                'yt-dlp',
                '--quiet',
                '--skip-download',
                '--write-sub',
                '--sub-langs', 'en',
                '--write-auto-sub',
                '--convert-subs', 'vtt',
                '--output', output_template,
                url,
            ]
            subprocess.run(subtitle_cmd, check=True)

            # Sort so the choice is deterministic if several files appear.
            subtitle_files = sorted(
                name for name in os.listdir(tmp_dir)
                if name.startswith('temp_subtitle') and name.endswith('.vtt')
            )
            if not subtitle_files:
                raise RuntimeError("No subtitle file found")

            subtitle_path = os.path.join(tmp_dir, subtitle_files[0])
            with open(subtitle_path, 'r', encoding='utf-8') as f:
                content = f.read()
    except (subprocess.CalledProcessError, IOError) as e:
        raise RuntimeError(f"Failed to download YouTube script: {str(e)}") from e

    processed_content = _vtt_to_text(content)
    if not processed_content:
        raise RuntimeError("No transcript content found")
    return processed_content
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment