Skip to content

youtube_transcript

YoutubeTranscript

Source code in src/gemini-cli/gemini/youtube_transcript.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
class YoutubeTranscript:
    """Stateless helpers that turn a YouTube URL into a video ID, title and transcript."""

    # Upper bound (seconds) for the title request so a stalled connection
    # cannot block the caller forever; requests has no default timeout.
    _REQUEST_TIMEOUT_SECONDS = 10

    @staticmethod
    def get_video_id(youtube_url):
        """
        Extract the video ID from a YouTube URL.
        Args:
            youtube_url (str): The YouTube URL.
        Returns:
            str: The extracted 11-character video ID, or None if the input
                is not a string or no ID is found.
        """
        # re.search raises TypeError on non-string input (e.g. None); the
        # documented contract is "None if not found", so honour it here.
        if not isinstance(youtube_url, str):
            return None
        pattern = (r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?['
                   r'?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})')
        match = re.search(pattern, youtube_url)
        return match.group(1) if match else None

    @staticmethod
    def _title_from_page(page_html):
        """
        Pull the video title out of a watch-page HTML document.
        Args:
            page_html (str): Raw HTML text of the page.
        Returns:
            str: The text of the first <title> element with HTML entities
                decoded and a trailing " - YouTube" removed, or "Unknown"
                if the page has no single-line <title> element.
        """
        # Imported locally so this method does not rely on a module-level import.
        import html

        matches = re.findall(r'<title>(.*?)</title>', page_html)
        if not matches:
            return "Unknown"
        # Decode entities such as &amp; / &#39; so the title reads as displayed.
        title = html.unescape(matches[0])
        suffix = " - YouTube"
        # Only strip the suffix at the end; a title may legitimately contain
        # the same text in the middle.
        if title.endswith(suffix):
            title = title[:-len(suffix)]
        return title

    @staticmethod
    def get_video_title(video_id):
        """
        Get the title of the YouTube video.
        Args:
            video_id (str): The YouTube video ID.
        Returns:
            str: The title of the video or "Unknown" if not found or if the
                request fails or times out.
        """
        url = f"https://www.youtube.com/watch?v={video_id}"
        try:
            response = requests.get(url, timeout=YoutubeTranscript._REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            return YoutubeTranscript._title_from_page(response.text)
        except requests.RequestException as e:
            console.log(f"Error fetching video title: {e}", style='bold red')
            return "Unknown"

    @staticmethod
    def download_transcript(video_id):
        """
        Download the transcript and return as a string.
        Args:
            video_id (str): The YouTube video ID.
        Returns:
            str: The transcript text or an empty string if an error occurs.
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            # Only the auto-generated English transcript is requested.
            transcript = transcript_list.find_generated_transcript(['en'])

            formatter = TextFormatter()
            transcript_text = formatter.format_transcript(transcript.fetch())

            # Remove timecodes and speaker names: bracketed d:d:d stamps and
            # angle-bracketed word tokens.
            transcript_text = re.sub(r'\[\d+:\d+:\d+\]', '', transcript_text)
            transcript_text = re.sub(r'<\w+>', '', transcript_text)
            return transcript_text
        except Exception as e:
            # Best-effort boundary: any failure is logged and reported as "".
            console.log(f"Error downloading transcript: {e}", style='bold red')
            return ""

    @staticmethod
    def get_transcript(youtube_url):
        """
        Resolve a YouTube URL to its transcript text.
        Args:
            youtube_url (str): The YouTube URL.
        Returns:
            str: The transcript text, or None if the URL yields no video ID
                or the transcript could not be downloaded (the reason is
                logged to the console).
        """
        video_id = YoutubeTranscript.get_video_id(youtube_url)
        if not video_id:
            console.log("Invalid YouTube URL.", style='bold red')
            return None

        transcript_text = YoutubeTranscript.download_transcript(video_id)
        if not transcript_text:
            console.log("Unable to download transcript.", style='bold red')
            return None
        return transcript_text
download_transcript(video_id) staticmethod

Download the transcript and return it as a string. Args: video_id (str) – the YouTube video ID. Returns: str – the transcript text, or an empty string if an error occurs.

Source code in src/gemini-cli/gemini/youtube_transcript.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
@staticmethod
def download_transcript(video_id):
    """
    Fetch a video's auto-generated English transcript as plain text.
    Args:
        video_id (str): The YouTube video ID.
    Returns:
        str: The cleaned transcript text, or an empty string if any step
            fails (the error is logged to the console).
    """
    try:
        generated = YouTubeTranscriptApi.list_transcripts(video_id).find_generated_transcript(['en'])
        cleaned = TextFormatter().format_transcript(generated.fetch())

        # Remove timecodes and speaker names, in that order: bracketed
        # d:d:d stamps first, then angle-bracketed word tokens.
        for noise_pattern in (r'\[\d+:\d+:\d+\]', r'<\w+>'):
            cleaned = re.sub(noise_pattern, '', cleaned)
        return cleaned
    except Exception as e:
        # Best-effort: every failure is reported on the console and mapped to "".
        console.log(f"Error downloading transcript: {e}", style='bold red')
        return ""
get_video_id(youtube_url) staticmethod

Extract the video ID from a YouTube URL. Args: youtube_url (str) – the YouTube URL. Returns: str – the extracted video ID, or None if not found.

Source code in src/gemini-cli/gemini/youtube_transcript.py
12
13
14
15
16
17
18
19
20
21
22
23
24
@staticmethod
def get_video_id(youtube_url):
    """
    Extract the video ID from a YouTube URL.
    Args:
        youtube_url (str): The YouTube URL.
    Returns:
        str: The extracted 11-character video ID, or None if the input is
            not a string or no ID is found.
    """
    # re.search raises TypeError on non-string input (e.g. None); the
    # documented contract is "None if not found", so return None instead.
    if not isinstance(youtube_url, str):
        return None
    pattern = (r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?['
               r'?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})')
    match = re.search(pattern, youtube_url)
    return match.group(1) if match else None
get_video_title(video_id) staticmethod

Get the title of the YouTube video. Args: video_id (str) – the YouTube video ID. Returns: str – the title of the video, or "Unknown" if not found.

Source code in src/gemini-cli/gemini/youtube_transcript.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
@staticmethod
def get_video_title(video_id):
    """
    Get the title of the YouTube video.
    Args:
        video_id (str): The YouTube video ID.
    Returns:
        str: The title of the video with HTML entities decoded and the
            trailing " - YouTube" removed, or "Unknown" if no title is
            found or the request fails or times out.
    """
    # Imported locally so this function does not rely on a module-level import.
    import html

    url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the caller indefinitely. A timeout raises
        # requests.Timeout, which is a RequestException and is handled below.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        matches = re.findall(r'<title>(.*?)</title>', response.text)
    except requests.RequestException as e:
        console.log(f"Error fetching video title: {e}", style='bold red')
        return "Unknown"

    if not matches:
        return "Unknown"
    # Decode entities such as &amp; / &#39; so the title reads as displayed.
    title = html.unescape(matches[0])
    suffix = " - YouTube"
    # Strip the suffix only at the end; the same text may legitimately
    # appear inside a title.
    if title.endswith(suffix):
        title = title[:-len(suffix)]
    return title