Get transcript summaries of Web videos.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 

233 lignes
6.6 KiB

  1. // based on https://github.com/Kakulukian/youtube-transcript
  2. //
  3. // we refactored it to make it more maintainable
  4. import fetchPonyfill from 'fetch-ponyfill';
  5. import {
  6. InvalidVideoIdError,
  7. CannotRetrieveVideoPageError,
  8. FetchTranscriptRequestFailureError,
  9. InnerTubeApiKeyMissingError,
  10. InvalidTranscriptActionsError,
  11. InvalidTranscriptResponseContextError,
  12. } from './errors';
  /**
   * Matches the common YouTube URL shapes and captures the video ID in group 1.
   *
   * Recognised markers: `youtu.be/`, `v/`, `vi/`, `u/<char>/`, `embed/`,
   * `?v=` / `?vi=` (optionally preceded by `watch`) and `&v=` / `&vi=`.
   * The captured ID is everything after the marker up to the next `#`, `&` or `?`,
   * and may therefore be empty.
   */
  const RE_YOUTUBE =
    /^.*(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/|(?:watch)?\?vi?=|&vi?=)([^#&?]*).*/im;
  /** Optional localisation settings forwarded with the transcript request. */
  export interface TranscriptConfig {
    /** Sent as the client `hl` field; the request builder falls back to `'en'`. */
    language?: string;
    /** Sent as the client `gl` field; the request builder falls back to `'PH'`. */
    country?: string;
  }
  /** A single transcript entry produced by `fetchTranscriptItems`. */
  export interface TranscriptResponse {
    /** The cue's `simpleText`. */
    text: string;
    /** Integer parsed from the cue's `durationMs` (presumably milliseconds). */
    duration: number;
    /** Integer parsed from the cue's `startOffsetMs` (presumably milliseconds). */
    offset: number;
  }
  // `fetch` implementation supplied by fetch-ponyfill; `f` is the short name
  // used for every network call in this module.
  const f = fetchPonyfill().fetch;
  /**
   * Normalises a user-supplied value into a bare YouTube video ID.
   *
   * An input of exactly 11 characters is assumed to already be an ID and is
   * returned unchanged. Anything else is treated as a URL and matched against
   * `RE_YOUTUBE`.
   *
   * @param videoId - A video ID or a YouTube URL containing one.
   * @returns The extracted video ID.
   * @throws {InvalidVideoIdError} When the input does not match a known URL
   *   shape, or matches one but carries no ID (e.g. `https://youtu.be/`).
   */
  export const retrieveVideoId = (videoId: string): string => {
    if (videoId.length === 11) {
      return videoId;
    }

    const extractedId = videoId.match(RE_YOUTUBE)?.[1];
    // The capture group may match the empty string, so a successful match alone
    // does not guarantee a usable ID; reject empty/missing captures as well.
    if (!extractedId) {
      throw new InvalidVideoIdError('Impossible to retrieve Youtube video ID.');
    }
    return extractedId;
  };
  /**
   * Downloads the YouTube watch page for a video and returns its body as text.
   *
   * @param videoId - A video ID or URL; resolved through `retrieveVideoId`.
   * @returns The response body of `https://www.youtube.com/watch?v=<id>`.
   * @throws {CannotRetrieveVideoPageError} When the response is not `ok`.
   */
  export const getVideoPage = async (videoId: string): Promise<string> => {
    const id = retrieveVideoId(videoId);

    const watchUrl = new URL('/watch', 'https://www.youtube.com');
    watchUrl.search = new URLSearchParams({ v: id }).toString();

    const response = await f(watchUrl.toString());
    if (response.ok) {
      return response.text();
    }
    throw new CannotRetrieveVideoPageError('Unable to get video page.');
  };
  /**
   * Generates the value sent as `clientScreenNonce`.
   *
   * The decimal string of a fresh `Math.random()` (without its final character)
   * is encoded as web-safe base64 (`-` and `_` instead of `+` and `/`) using
   * `.` as the padding character.
   *
   * @returns The encoded nonce; its length is always a multiple of four.
   */
  const generateNonce = (): string => {
    // Web-safe base64 alphabet (indices 0-63) followed by the padding character.
    const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.';
    const padIndex = 64;

    const seed = Math.random().toString();

    // NOTE(review): the loop bound skips the last character of the seed; this is
    // kept as-is because the intent cannot be determined from this file.
    const bytes: number[] = [];
    for (let i = 0; i < seed.length - 1; i++) {
      // Read the code unit from the seed itself. Indexing a one-character string
      // at position i would yield NaN for every i > 0.
      bytes.push(seed.charCodeAt(i));
    }

    let nonce = '';
    for (let i = 0; i < bytes.length; i += 3) {
      const hasSecond = i + 1 < bytes.length;
      const hasThird = i + 2 < bytes.length;
      const first = bytes[i] ?? 0;
      const second = bytes[i + 1] ?? 0;
      const third = bytes[i + 2] ?? 0;

      // Split 3 bytes (24 bits) into four 6-bit alphabet indices.
      const idx0 = first >> 2;
      const idx1 = ((first & 3) << 4) | (second >> 4);
      // A missing second byte pads the third symbol; a missing third byte pads
      // the fourth symbol.
      const idx2 = hasSecond ? ((second & 15) << 2) | (third >> 6) : padIndex;
      const idx3 = hasThird ? third & 63 : padIndex;

      nonce += alphabet.charAt(idx0)
        + alphabet.charAt(idx1)
        + alphabet.charAt(idx2)
        + alphabet.charAt(idx3);
    }
    return nonce;
  };
  /**
   * Reads the value that follows `"INNERTUBE_API_KEY":"` in a watch page.
   *
   * @param videoPageBody - Raw text of the video page.
   * @returns The text up to the next `"`, or `undefined` when the marker is
   *   absent, so that the caller can report a missing key instead of this
   *   function failing with a `TypeError`.
   */
  const extractInnterTubeApiKeyFromPage = (videoPageBody: string): string | undefined => videoPageBody
    .split('"INNERTUBE_API_KEY":"')[1]
    ?.split('"')[0];
  /**
   * Returns the text between `"serializedShareEntity":"` and the next `"`,
   * or `undefined` when the page does not contain that marker.
   */
  const extractSerializedShareEntityFromPage = (page: string) => {
    const [, afterMarker] = page.split('"serializedShareEntity":"');
    return afterMarker?.split('"')[0];
  };
  /**
   * Returns the text between `"VISITOR_DATA":"` and the next `"`,
   * or `undefined` when the page does not contain that marker.
   */
  const extractVisitorDataFromPage = (page: string) => {
    const segments = page.split('"VISITOR_DATA":"');
    const tail = segments[1];
    return tail?.split('"')[0];
  };
  /**
   * Returns the text between `"sessionId":"` and the next `"`,
   * or `undefined` when the page does not contain that marker.
   */
  const extractSessionIdFromPage = (page: string) => {
    const rest = page.split('"sessionId":"')[1];
    return rest?.split('"')[0];
  };
  /**
   * Returns the text between `"clickTrackingParams":"` and the next `"`.
   * Yields `undefined` when the marker is missing, and also when `page` itself
   * is nullish at runtime.
   */
  const extractClickTrackingParamsFromPage = (page: string) => {
    const tail = page?.split('"clickTrackingParams":"')[1];
    return tail?.split('"')[0];
  };
  /** Values scraped from a video page that the transcript request needs. */
  interface VideoPageData {
    /** Value following `"INNERTUBE_API_KEY":"`; sent as the `key` query parameter. */
    innerTubeApiKey?: string;
    /** Value following `"serializedShareEntity":"`; sent as the request `params`. */
    serializedShareEntity?: string;
    /** Value following `"VISITOR_DATA":"`. */
    visitorData?: string;
    /** Value following `"sessionId":"`. */
    sessionId?: string;
    /** Value following `"clickTrackingParams":"`. */
    clickTrackingParams?: string;
  }
  /**
   * Collects every value the transcript request needs from a video page.
   *
   * @param page - Raw text of the video page.
   * @returns The five scraped fields, each produced by its dedicated extractor.
   */
  export const extractDataFromPage = (page: string): VideoPageData => {
    const innerTubeApiKey = extractInnterTubeApiKeyFromPage(page);
    const serializedShareEntity = extractSerializedShareEntityFromPage(page);
    const visitorData = extractVisitorDataFromPage(page);
    const sessionId = extractSessionIdFromPage(page);
    const clickTrackingParams = extractClickTrackingParamsFromPage(page);

    return {
      innerTubeApiKey,
      serializedShareEntity,
      visitorData,
      sessionId,
      clickTrackingParams,
    };
  };
  /**
   * Builds the JSON payload for the `get_transcript` request.
   *
   * @param p - Scraped page values; `visitorData`, `sessionId`,
   *   `clickTrackingParams` and `serializedShareEntity` are used.
   * @param config - Optional language/country; defaults are `'en'` and `'PH'`.
   * @returns A plain object with a `context` section and `params`.
   */
  const generateGetTranscriptRequestBody = (
    p: Partial<VideoPageData>,
    config?: TranscriptConfig,
  ) => {
    // Hard-coded description of the client, apart from locale and visitor data.
    const client = {
      hl: config?.language ?? 'en',
      gl: config?.country ?? 'PH',
      visitorData: p.visitorData,
      userAgent:
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)',
      clientName: 'WEB',
      clientVersion: '2.20200925.01.00',
      osName: 'Macintosh',
      osVersion: '10_15_4',
      browserName: 'Chrome',
      // NOTE(review): '85.0f' differs from the '85.0' in userAgent above and
      // looks like a typo; left untouched here, confirm before changing.
      browserVersion: '85.0f.4183.83',
      screenWidthPoints: 1440,
      screenHeightPoints: 770,
      screenPixelDensity: 2,
      utcOffsetMinutes: 120,
      userInterfaceTheme: 'USER_INTERFACE_THEME_LIGHT',
      connectionType: 'CONN_CELLULAR_3G',
    };

    const request = {
      sessionId: p.sessionId,
      internalExperimentFlags: [],
      consistencyTokenJars: [],
    };

    return {
      context: {
        client,
        request,
        user: {},
        clientScreenNonce: generateNonce(),
        clickTracking: {
          clickTrackingParams: p.clickTrackingParams,
        },
      },
      params: p.serializedShareEntity,
    };
  };
  /**
   * Requests the transcript of a video and flattens it into simple entries.
   *
   * @param pageData - Values scraped from the video page; `innerTubeApiKey` is
   *   required, the rest is forwarded in the request body.
   * @param config - Optional language/country for the request.
   * @returns One entry per cue group, built from the group's first cue.
   * @throws {InnerTubeApiKeyMissingError} When `pageData` has no API key.
   * @throws {FetchTranscriptRequestFailureError} When the response is not `ok`.
   * @throws {InvalidTranscriptResponseContextError} When the response body has
   *   no `responseContext`.
   * @throws {InvalidTranscriptActionsError} When the response body has no
   *   `actions`, or the cue groups cannot be found inside the first action.
   */
  export const fetchTranscriptItems = async (
    pageData: VideoPageData,
    config?: TranscriptConfig,
  ): Promise<TranscriptResponse[]> => {
    const { innerTubeApiKey } = pageData;
    if (!innerTubeApiKey) {
      throw new InnerTubeApiKeyMissingError('InnerTube API key not found on video page.');
    }

    const getTranscriptUrl = new URL('/youtubei/v1/get_transcript', 'https://www.youtube.com');
    getTranscriptUrl.search = new URLSearchParams({ key: innerTubeApiKey }).toString();

    const transcriptResponse = await f(getTranscriptUrl.toString(), {
      method: 'POST',
      headers: {
        Accept: 'application/json',
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(generateGetTranscriptRequestBody(pageData, config)),
    });
    if (!transcriptResponse.ok) {
      throw new FetchTranscriptRequestFailureError(`Fetching transcript failed with status ${transcriptResponse.status}.`);
    }

    const transcriptBody = await transcriptResponse.json();
    if (!transcriptBody?.responseContext) {
      throw new InvalidTranscriptResponseContextError('No responseContext found on get transcript response.');
    }
    if (!transcriptBody.actions) {
      throw new InvalidTranscriptActionsError('No actions found on get transcript response.');
    }

    // Walk the nested renderer path defensively: an unexpected response shape
    // should surface as a typed error rather than a raw TypeError.
    const cueGroups = transcriptBody.actions[0]
      ?.updateEngagementPanelAction
      ?.content
      ?.transcriptRenderer
      ?.body
      ?.transcriptBodyRenderer
      ?.cueGroups;
    if (!Array.isArray(cueGroups)) {
      throw new InvalidTranscriptActionsError('No transcript cue groups found on get transcript response.');
    }

    return cueGroups.map((group): TranscriptResponse => {
      // Only the first cue of each group is used.
      const cue = group.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer;
      return {
        text: cue.cue.simpleText,
        // Explicit radix so the values are always read as decimal.
        duration: parseInt(cue.durationMs, 10),
        offset: parseInt(cue.startOffsetMs, 10),
      };
    });
  };