|
// Based on https://github.com/Kakulukian/youtube-transcript
//
// We refactored it to make it more maintainable.
-
- import fetchPonyfill from 'fetch-ponyfill';
- import {
- InvalidVideoIdError,
- CannotRetrieveVideoPageError,
- FetchTranscriptRequestFailureError,
- InnerTubeApiKeyMissingError,
- InvalidTranscriptActionsError,
- InvalidTranscriptResponseContextError,
- } from './errors';
-
/**
 * Matches a YouTube URL and captures the video ID in group 1.
 *
 * The ID is whatever follows one of the recognised markers (`youtu.be/`, `v/`,
 * `vi/`, `u/<char>/`, `embed/`, `watch?v=`, `?v=`, `?vi=`, `&v=`, `&vi=`) up to
 * the next `#`, `&` or `?`. Note that the capture group is allowed to be empty,
 * so callers must check group 1 before using it.
 */
const RE_YOUTUBE = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/)|(?:(?:watch)?\?v(?:i)?=|&v(?:i)?=))([^#&?]*).*/im;
-
/** Optional locale settings for a transcript request. */
export interface TranscriptConfig {
  /** Sent as the client `gl` value of the request; `'PH'` is used when omitted. */
  country?: string;
  /** Sent as the client `hl` value of the request; `'en'` is used when omitted. */
  language?: string;
}
/** One transcript entry as returned by `fetchTranscriptItems`. */
export interface TranscriptResponse {
  /** Integer parsed from the cue's `startOffsetMs` field. */
  offset: number;
  /** Integer parsed from the cue's `durationMs` field. */
  duration: number;
  /** The cue's `simpleText` value. */
  text: string;
}
-
// `fetch` implementation supplied by fetch-ponyfill; used for every HTTP request in this module.
const f = fetchPonyfill().fetch;
-
/**
 * Resolves a YouTube video ID from either a bare ID or a video URL.
 *
 * @param videoId - An 11-character video ID, or a URL that `RE_YOUTUBE` can match.
 * @returns The input itself when it is exactly 11 characters long, otherwise
 *   the ID captured from the URL.
 * @throws InvalidVideoIdError when the input is not 11 characters long and no
 *   non-empty ID can be captured from it.
 */
export const retrieveVideoId = (videoId: string): string => {
  // Any 11-character input is taken to be a bare ID and is returned untouched.
  if (videoId.length === 11) {
    return videoId;
  }
  const capturedId = RE_YOUTUBE.exec(videoId)?.[1];
  // The capture group may match the empty string (e.g. "...watch?v="); an
  // empty ID is as unusable as no match at all, so both cases are rejected.
  if (!capturedId) {
    throw new InvalidVideoIdError('Impossible to retrieve Youtube video ID.');
  }
  return capturedId;
};
-
/**
 * Downloads the watch page of a video from `https://www.youtube.com/watch`.
 *
 * @param videoId - A video ID or URL; it is normalised with `retrieveVideoId`.
 * @returns The response body as text.
 * @throws CannotRetrieveVideoPageError when the response status is not OK.
 *   Errors thrown by `retrieveVideoId` propagate unchanged.
 */
export const getVideoPage = async (videoId: string): Promise<string> => {
  const watchUrl = new URL('https://www.youtube.com/watch');
  watchUrl.searchParams.set('v', retrieveVideoId(videoId));

  const response = await f(watchUrl.toString());
  if (response.ok) {
    return response.text();
  }
  throw new CannotRetrieveVideoPageError('Unable to get video page.');
};
-
/**
 * Produces the value sent as `clientScreenNonce` in the transcript request.
 *
 * The decimal string form of `Math.random()` (minus its final character, as in
 * the original implementation) is base64-encoded with the web-safe alphabet
 * (`-` and `_`) and `.` as the padding symbol.
 *
 * @returns A string whose length is a multiple of four, made up of
 *   `A-Z`, `a-z`, `0-9`, `-`, `_` and trailing `.` padding.
 */
const generateNonce = (): string => {
  // 64 encoding symbols followed by the padding symbol at index 64.
  const symbols = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.';
  const PAD_INDEX = 64;

  const seed = Math.random().toString();
  const bytes: number[] = [];
  // The seed only contains ASCII characters, so each char code fits in a byte.
  for (let i = 0; i < seed.length - 1; i++) {
    bytes.push(seed.charCodeAt(i));
  }

  let nonce = '';
  for (let i = 0; i < bytes.length; i += 3) {
    const hasSecond = i + 1 < bytes.length;
    const hasThird = i + 2 < bytes.length;
    const first = bytes[i] ?? 0;
    const second = bytes[i + 1] ?? 0;
    const third = bytes[i + 2] ?? 0;

    // Split three 8-bit values into four 6-bit indices.
    const index1 = first >> 2;
    const index2 = ((first & 3) << 4) | (second >> 4);
    let index3 = ((second & 15) << 2) | (third >> 6);
    let index4 = third & 63;

    // Pad the trailing group when fewer than three bytes remain.
    if (!hasThird) {
      index4 = PAD_INDEX;
      if (!hasSecond) {
        index3 = PAD_INDEX;
      }
    }

    nonce += symbols.charAt(index1)
      + symbols.charAt(index2)
      + symbols.charAt(index3)
      + symbols.charAt(index4);
  }
  return nonce;
};
-
/**
 * Reads the value that follows `"INNERTUBE_API_KEY":"` in the page source.
 *
 * @param videoPageBody - Raw text of the video page.
 * @returns The text between the marker and the next double quote, or
 *   `undefined` when the marker is absent, so that the caller can report the
 *   missing key itself instead of hitting a `TypeError` here.
 */
const extractInnterTubeApiKeyFromPage = (videoPageBody: string): string | undefined => videoPageBody
  .split('"INNERTUBE_API_KEY":"')[1]
  ?.split('"')[0];
-
/**
 * Reads the value that follows `"serializedShareEntity":"` in the page source.
 *
 * @param page - Raw text of the video page.
 * @returns The text between the first marker and the next double quote, or
 *   `undefined` when the marker is absent.
 */
const extractSerializedShareEntityFromPage = (page: string) => {
  const marker = '"serializedShareEntity":"';
  const markerAt = page.indexOf(marker);
  if (markerAt === -1) {
    return undefined;
  }
  const tail = page.slice(markerAt + marker.length);
  const closingQuoteAt = tail.indexOf('"');
  return closingQuoteAt === -1 ? tail : tail.slice(0, closingQuoteAt);
};
-
/**
 * Reads the value that follows `"VISITOR_DATA":"` in the page source.
 *
 * @param page - Raw text of the video page.
 * @returns The text between the first marker and the next double quote, or
 *   `undefined` when the marker is absent.
 */
const extractVisitorDataFromPage = (page: string) => {
  const marker = '"VISITOR_DATA":"';
  const markerAt = page.indexOf(marker);
  if (markerAt === -1) {
    return undefined;
  }
  const tail = page.slice(markerAt + marker.length);
  const closingQuoteAt = tail.indexOf('"');
  return closingQuoteAt === -1 ? tail : tail.slice(0, closingQuoteAt);
};
-
/**
 * Reads the value that follows `"sessionId":"` in the page source.
 *
 * @param page - Raw text of the video page.
 * @returns The text between the first marker and the next double quote, or
 *   `undefined` when the marker is absent.
 */
const extractSessionIdFromPage = (page: string) => {
  const marker = '"sessionId":"';
  const markerAt = page.indexOf(marker);
  if (markerAt === -1) {
    return undefined;
  }
  const tail = page.slice(markerAt + marker.length);
  const closingQuoteAt = tail.indexOf('"');
  return closingQuoteAt === -1 ? tail : tail.slice(0, closingQuoteAt);
};
-
/**
 * Reads the value that follows `"clickTrackingParams":"` in the page source.
 *
 * @param page - Raw text of the video page; a nullish value is tolerated at
 *   runtime and yields `undefined`.
 * @returns The text between the first marker and the next double quote, or
 *   `undefined` when the marker is absent.
 */
const extractClickTrackingParamsFromPage = (page: string) => {
  if (page == null) {
    return undefined;
  }
  const marker = '"clickTrackingParams":"';
  const markerAt = page.indexOf(marker);
  if (markerAt === -1) {
    return undefined;
  }
  const tail = page.slice(markerAt + marker.length);
  const closingQuoteAt = tail.indexOf('"');
  return closingQuoteAt === -1 ? tail : tail.slice(0, closingQuoteAt);
};
-
/**
 * Values scraped from a video page that are needed to build the transcript
 * request. Every field is optional because its marker may be missing from
 * the page.
 */
interface VideoPageData {
  /** Value found after `"INNERTUBE_API_KEY":"`; sent as the `key` query parameter. */
  innerTubeApiKey?: string;
  /** Value found after `"serializedShareEntity":"`; sent as the request `params`. */
  serializedShareEntity?: string;
  /** Value found after `"VISITOR_DATA":"`. */
  visitorData?: string;
  /** Value found after `"sessionId":"`. */
  sessionId?: string;
  /** Value found after `"clickTrackingParams":"`. */
  clickTrackingParams?: string;
}
-
/**
 * Collects everything the transcript request needs from a video page.
 *
 * @param page - Raw text of the video page.
 * @returns An object whose fields are the results of the individual
 *   `extract…FromPage` helpers, run in declaration order on `page`.
 */
export const extractDataFromPage = (page: string): VideoPageData => {
  const innerTubeApiKey = extractInnterTubeApiKeyFromPage(page);
  const serializedShareEntity = extractSerializedShareEntityFromPage(page);
  const visitorData = extractVisitorDataFromPage(page);
  const sessionId = extractSessionIdFromPage(page);
  const clickTrackingParams = extractClickTrackingParamsFromPage(page);

  return {
    innerTubeApiKey,
    serializedShareEntity,
    visitorData,
    sessionId,
    clickTrackingParams,
  };
};
-
/**
 * Builds the JSON payload posted to the `get_transcript` endpoint.
 *
 * @param p - Values scraped from the video page; `innerTubeApiKey` is not used here.
 * @param config - Optional language/country override; defaults are `'en'` and `'PH'`.
 * @returns A plain object with a `context` section (client description,
 *   request/session info, a fresh nonce, click tracking) and `params` set to
 *   the page's serialized share entity.
 */
const generateGetTranscriptRequestBody = (
  p: Partial<VideoPageData>,
  config?: TranscriptConfig,
) => {
  // Fixed description of the client; only locale and visitor data vary per call.
  const client = {
    hl: config?.language ?? 'en',
    gl: config?.country ?? 'PH',
    visitorData: p.visitorData,
    userAgent:
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)',
    clientName: 'WEB',
    clientVersion: '2.20200925.01.00',
    osName: 'Macintosh',
    osVersion: '10_15_4',
    browserName: 'Chrome',
    // NOTE(review): the "0f" looks like a typo, but the value is sent verbatim - confirm before changing.
    browserVersion: '85.0f.4183.83',
    screenWidthPoints: 1440,
    screenHeightPoints: 770,
    screenPixelDensity: 2,
    utcOffsetMinutes: 120,
    userInterfaceTheme: 'USER_INTERFACE_THEME_LIGHT',
    connectionType: 'CONN_CELLULAR_3G',
  };

  const request = {
    sessionId: p.sessionId,
    internalExperimentFlags: [],
    consistencyTokenJars: [],
  };

  const context = {
    client,
    request,
    user: {},
    clientScreenNonce: generateNonce(),
    clickTracking: {
      clickTrackingParams: p.clickTrackingParams,
    },
  };

  return {
    context,
    params: p.serializedShareEntity,
  };
};
-
/**
 * Requests the transcript of a video and flattens it into simple entries.
 *
 * @param pageData - Values scraped from the video page; `innerTubeApiKey` is required.
 * @param config - Optional language/country override forwarded to the request body.
 * @returns One entry per cue group, built from the first cue of each group.
 * @throws InnerTubeApiKeyMissingError when `pageData` carries no API key.
 * @throws FetchTranscriptRequestFailureError when the HTTP response is not OK.
 * @throws InvalidTranscriptResponseContextError when the body has no `responseContext`.
 * @throws InvalidTranscriptActionsError when the body has no `actions`, or when
 *   the first action does not contain a list of cue groups.
 */
export const fetchTranscriptItems = async (
  pageData: VideoPageData,
  config?: TranscriptConfig,
): Promise<TranscriptResponse[]> => {
  // Minimal view of a cue group: only the first cue of each group is read.
  // assumes durationMs/startOffsetMs arrive as numeric strings - TODO confirm
  interface TranscriptCueGroup {
    transcriptCueGroupRenderer: {
      cues: [
        {
          transcriptCueRenderer: {
            cue: { simpleText: string };
            durationMs: string;
            startOffsetMs: string;
          };
        },
      ];
    };
  }

  const { innerTubeApiKey } = pageData;
  if (!innerTubeApiKey) {
    throw new InnerTubeApiKeyMissingError('InnerTube API key not found on video page.');
  }

  const getTranscriptUrl = new URL('/youtubei/v1/get_transcript', 'https://www.youtube.com');
  getTranscriptUrl.search = new URLSearchParams({ key: innerTubeApiKey }).toString();

  const transcriptResponse = await f(getTranscriptUrl.toString(), {
    method: 'POST',
    headers: {
      Accept: 'application/json',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(generateGetTranscriptRequestBody(pageData, config)),
  });

  if (!transcriptResponse.ok) {
    throw new FetchTranscriptRequestFailureError(`Fetching transcript failed with status ${transcriptResponse.status}.`);
  }

  const transcriptBody = await transcriptResponse.json();
  if (!transcriptBody.responseContext) {
    throw new InvalidTranscriptResponseContextError('No responseContext found on get transcript response.');
  }

  if (!transcriptBody.actions) {
    throw new InvalidTranscriptActionsError('No actions found on get transcript response.');
  }

  // Walk the nested payload defensively: an empty `actions` array or a
  // differently shaped action must surface as a typed error, not a TypeError.
  const cueGroups: unknown = transcriptBody.actions[0]
    ?.updateEngagementPanelAction
    ?.content
    ?.transcriptRenderer
    ?.body
    ?.transcriptBodyRenderer
    ?.cueGroups;
  if (!Array.isArray(cueGroups)) {
    throw new InvalidTranscriptActionsError('No transcript cue groups found on get transcript response.');
  }

  return (cueGroups as TranscriptCueGroup[]).map((group) => {
    const { transcriptCueRenderer: cue } = group.transcriptCueGroupRenderer.cues[0];
    return {
      text: cue.cue.simpleText,
      // Explicit radix so the values are always read as decimal.
      duration: parseInt(cue.durationMs, 10),
      offset: parseInt(cue.startOffsetMs, 10),
    };
  });
};
|