@@ -1 +1,5 @@ | |||||
# OpenAI API key. | |||||
OPENAI_API_KEY= | OPENAI_API_KEY= | ||||
# OpenAI organization ID. | |||||
OPENAI_ORGANIZATION_ID= |
@@ -0,0 +1,14 @@ | |||||
# webvideo-transcript-summary-core | |||||
This is the core SDK for summarizing the transcripts of web videos. | |||||
## Setup | |||||
1. Install dependencies. | |||||
2. Copy `.env.example` to `.env` and fill in the correct values. | |||||
3. Refer to `prompt-template.hbs` and create the following prompt files: | |||||
* `prompts/normalize-transcript-text.hbs` is used to add proper punctuation to the transcript text. | |||||
* `prompts/summarize-transcript.hbs` is used to perform the actual summarization. | |||||
4. Run `npm run build` to build the project. | |||||
Run `npm link` to use it in your own project. | |||||
@@ -49,5 +49,22 @@ | |||||
"dependencies": { | "dependencies": { | ||||
"fetch-ponyfill": "^7.1.0", | "fetch-ponyfill": "^7.1.0", | ||||
"handlebars": "^4.7.7" | "handlebars": "^4.7.7" | ||||
}, | |||||
"types": "./dist/types/index.d.ts", | |||||
"main": "./dist/cjs/production/index.js", | |||||
"module": "./dist/esm/production/index.js", | |||||
"exports": {
  ".": {
    "types": "./dist/types/index.d.ts",
    "development": {
      "require": "./dist/cjs/development/index.js",
      "import": "./dist/esm/development/index.js"
    },
    "require": "./dist/cjs/production/index.js",
    "import": "./dist/esm/production/index.js"
  }
},
"typesVersions": { | |||||
"*": {} | |||||
} | } | ||||
} | } |
@@ -1,18 +1,19 @@ | |||||
type ProcessEvent = { type: string, phase: string, command?: string }; | |||||
/**
 * Payload handed to listeners of the `'process'` event.
 */
export type ProcessEvent = { | |||||
// Name of the pipeline step reporting progress, e.g. 'extract-data' or 'fetch-transcript'.
processType: string, | |||||
// Stage within that step, e.g. 'start' or 'success'.
phase: string, | |||||
// NOTE(review): not set by any emitter visible in this change — confirm intended use.
command?: string, | |||||
// Optional output of the step, already serialized to a string (JSON text or plain text in the emitters seen here).
content?: string, | |||||
// MIME type describing `content`, e.g. 'application/json' or 'text/plain'.
contentType?: string, | |||||
}; | |||||
// Listener signature for 'process' events; receives the progress payload.
type ProcessEventCallback = (event: ProcessEvent) => void; | type ProcessEventCallback = (event: ProcessEvent) => void; | ||||
// Listener signature for 'error' events; receives the Error describing the failure.
type ErrorEventCallback = (event: Error) => void; | type ErrorEventCallback = (event: Error) => void; | ||||
// Payload of a 'success' event: a content value plus a string naming its content type.
type SuccessEvent = { contentType: string, content: unknown }; | |||||
// Listener signature for 'success' events.
type SuccessEventCallback = (event: SuccessEvent) => void; | |||||
/**
 * Event emitter contract for a summarizer: `process()` starts the work and
 * the typed `on(...)` overloads describe the payload each event delivers.
 */
export interface SummarizerEventEmitter extends NodeJS.EventEmitter { | export interface SummarizerEventEmitter extends NodeJS.EventEmitter { | ||||
// Starts the summarization run; results are reported through events, not a return value.
process(): void; | process(): void; | ||||
// Progress notifications carrying a ProcessEvent.
on(eventType: 'process', callback: ProcessEventCallback): this; | on(eventType: 'process', callback: ProcessEventCallback): this; | ||||
// Failure notification carrying the Error.
on(eventType: 'error', callback: ErrorEventCallback): this; | on(eventType: 'error', callback: ErrorEventCallback): this; | ||||
// Final-result notification carrying a SuccessEvent.
on(eventType: 'success', callback: SuccessEventCallback): this; | |||||
// Signals that the run has finished; the listener receives no arguments.
on(eventType: 'end', callback: () => void): this; | on(eventType: 'end', callback: () => void): this; | ||||
} | } | ||||
@@ -37,3 +37,5 @@ export const createSummarizer = (params: CreateSummarizerParams): SummarizerEven | |||||
throw new TypeError(`Invalid video type: "${videoType}". Valid values are: ${JSON.stringify(Object.values(VideoType))}`); | throw new TypeError(`Invalid video type: "${videoType}". Valid values are: ${JSON.stringify(Object.values(VideoType))}`); | ||||
}; | }; | ||||
export * from './common'; |
@@ -4,7 +4,7 @@ import { | |||||
retrieveVideoId, | retrieveVideoId, | ||||
getVideoPage, | getVideoPage, | ||||
extractDataFromPage, | extractDataFromPage, | ||||
fetchTranscriptItems, TranscriptResponse, | |||||
fetchTranscriptItems, | |||||
} from './transcript'; | } from './transcript'; | ||||
import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer'; | import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer'; | ||||
@@ -23,10 +23,9 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||||
...config | ...config | ||||
} = this.params; | } = this.params; | ||||
const identifier = retrieveVideoId(url); | const identifier = retrieveVideoId(url); | ||||
let transcripts: TranscriptResponse[] = []; | |||||
this.emit('process', { | this.emit('process', { | ||||
type: 'extract-data', | |||||
processType: 'extract-data', | |||||
phase: 'download-page', | phase: 'download-page', | ||||
}); | }); | ||||
@@ -34,26 +33,26 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||||
.then((videoPageBody) => { | .then((videoPageBody) => { | ||||
const pageData = extractDataFromPage(videoPageBody); | const pageData = extractDataFromPage(videoPageBody); | ||||
this.emit('process', { | this.emit('process', { | ||||
type: 'extract-data', | |||||
processType: 'extract-data', | |||||
phase: 'success', | phase: 'success', | ||||
}); | }); | ||||
this.emit('process', { | this.emit('process', { | ||||
type: 'fetch-transcript', | |||||
processType: 'fetch-transcript', | |||||
phase: 'start', | phase: 'start', | ||||
}); | }); | ||||
return fetchTranscriptItems(pageData, config); | return fetchTranscriptItems(pageData, config); | ||||
}) | }) | ||||
.then((transcript) => { | .then((transcript) => { | ||||
this.emit('process', { | this.emit('process', { | ||||
type: 'fetch-transcript', | |||||
processType: 'fetch-transcript', | |||||
phase: 'success', | phase: 'success', | ||||
content: JSON.stringify(transcript), | |||||
contentType: 'application/json', | |||||
}); | }); | ||||
transcripts = transcript; | |||||
this.emit('process', {
  // Must match the processType of the corresponding 'success' event
  // ('normalize-transcript') so listeners can pair start and success.
  processType: 'normalize-transcript',
  phase: 'start',
});
@@ -65,12 +64,14 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||||
}) | }) | ||||
.then((normalizedCaption) => { | .then((normalizedCaption) => { | ||||
this.emit('process', { | this.emit('process', { | ||||
type: 'normalize-caption', | |||||
processType: 'normalize-transcript', | |||||
phase: 'success', | phase: 'success', | ||||
content: normalizedCaption, | |||||
contentType: 'text/plain', | |||||
}); | }); | ||||
this.emit('process', { | this.emit('process', { | ||||
type: 'summarize-caption', | |||||
processType: 'summarize-transcript', | |||||
phase: 'start', | phase: 'start', | ||||
}); | }); | ||||
@@ -78,14 +79,12 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||||
}) | }) | ||||
.then((summary) => { | .then((summary) => { | ||||
this.emit('process', {
  processType: 'summarize-transcript',
  phase: 'success',
  // ProcessEvent exposes the step output as `content` (there is no `data`
  // field), and listeners read `event.content` to obtain the summary.
  content: summary,
  contentType: 'text/plain',
});
this.emit('success', { | |||||
contentType: 'application/json', | |||||
content: JSON.stringify({ transcripts, summary }), | |||||
}); | |||||
this.emit('end'); | this.emit('end'); | ||||
}) | }) | ||||
.catch((error) => { | .catch((error) => { | ||||
@@ -123,6 +123,20 @@ interface VideoPageData { | |||||
clickTrackingParams?: string; | clickTrackingParams?: string; | ||||
} | } | ||||
/**
 * Shape of one raw transcript entry as consumed by `fetchTranscriptItems`,
 * which reads only the first element of `cues` for each entry.
 */
export interface Cue { | |||||
transcriptCueGroupRenderer: { | |||||
cues: { | |||||
transcriptCueRenderer: { | |||||
cue: { | |||||
// Caption text of the cue.
simpleText: string; | |||||
}, | |||||
// Numeric string; parsed with parseInt(…, 10) by the consumer. Presumably milliseconds, per the field name — confirm.
durationMs: string; | |||||
// Numeric string; parsed with parseInt(…, 10) by the consumer. Presumably milliseconds, per the field name — confirm.
startOffsetMs: string; | |||||
} | |||||
}[], | |||||
}, | |||||
} | |||||
export const extractDataFromPage = (page: string): VideoPageData => ({ | export const extractDataFromPage = (page: string): VideoPageData => ({ | ||||
innerTubeApiKey: extractInnterTubeApiKeyFromPage(page), | innerTubeApiKey: extractInnterTubeApiKeyFromPage(page), | ||||
serializedShareEntity: extractSerializedShareEntityFromPage(page), | serializedShareEntity: extractSerializedShareEntityFromPage(page), | ||||
@@ -217,16 +231,18 @@ export const fetchTranscriptItems = async (pageData: VideoPageData, config?: Tra | |||||
.body | .body | ||||
.transcriptBodyRenderer; | .transcriptBodyRenderer; | ||||
return transcripts.map((cue) => ({ | |||||
return transcripts.map((cue: Cue) => ({ | |||||
text: cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | text: cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | ||||
.cue.simpleText, | .cue.simpleText, | ||||
duration: parseInt( | duration: parseInt( | ||||
cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | ||||
.durationMs, | .durationMs, | ||||
10, | |||||
), | ), | ||||
offset: parseInt( | offset: parseInt( | ||||
cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | ||||
.startOffsetMs, | .startOffsetMs, | ||||
10, | |||||
), | ), | ||||
})) as TranscriptResponse[]; | })) as TranscriptResponse[]; | ||||
}; | }; |
@@ -1,7 +1,16 @@ | |||||
import { config } from 'dotenv'; | |||||
import { writeFileSync } from 'fs'; | |||||
import { beforeAll, describe, it } from 'vitest'; | import { beforeAll, describe, it } from 'vitest'; | ||||
import { config } from 'dotenv'; | |||||
import { createSummarizer, VideoType } from '../src'; | import { createSummarizer, VideoType } from '../src'; | ||||
import { writeFileSync } from 'fs'; | |||||
/**
 * Writes the text of every transcript item to `filename`, one item per line.
 *
 * The `fetch-transcript` success event carries `JSON.stringify(transcript)`,
 * i.e. a bare JSON array of items. The previous `{ transcripts, summary }`
 * envelope is still accepted so older payloads keep working.
 *
 * @param filename Path of the file to (over)write.
 * @param content JSON text holding either an array of `{ text }` items or an
 *   object with a `transcripts` array of such items.
 * @throws SyntaxError when `content` is not valid JSON.
 * @throws TypeError when no transcript array can be found in `content`.
 */
const writeTranscript = (filename: string, content: string) => {
  const parsed: unknown = JSON.parse(content);
  // Accept both the bare array and the legacy envelope object.
  const items: unknown = Array.isArray(parsed)
    ? parsed
    : (parsed as { transcripts?: unknown } | null)?.transcripts;
  if (!Array.isArray(items)) {
    throw new TypeError(
      'Transcript content must be a JSON array of items or an object with a "transcripts" array.',
    );
  }
  const transcriptText = items.map((item: { text: string }) => item.text).join('\n');
  writeFileSync(filename, transcriptText);
};
describe('blah', () => { | describe('blah', () => { | ||||
beforeAll(() => { | beforeAll(() => { | ||||
@@ -15,14 +24,22 @@ describe('blah', () => { | |||||
openaiApiKey: process.env.OPENAI_API_KEY as string, | openaiApiKey: process.env.OPENAI_API_KEY as string, | ||||
}); | }); | ||||
summarizer.on('success', (data) => { | |||||
const transcripts = JSON.parse(data.content as string) as { | |||||
transcripts: { text: string }[], | |||||
summary: string, | |||||
}; | |||||
const transcriptText = transcripts.transcripts.map((t) => t.text).join('\n'); | |||||
writeFileSync('transcript.txt', transcriptText); | |||||
writeFileSync('summary.txt', transcripts.summary); | |||||
summarizer.on('process', (data) => { | |||||
if (data.phase === 'success') { | |||||
switch (data.processType) { | |||||
case 'fetch-transcript': | |||||
writeTranscript('transcript.txt', data.content as string); | |||||
break; | |||||
case 'normalize-transcript': | |||||
writeFileSync('normalized.txt', data.content as string); | |||||
break; | |||||
case 'summarize-transcript': | |||||
writeFileSync('summary.txt', data.content as string); | |||||
break; | |||||
default: | |||||
break; | |||||
} | |||||
} | |||||
}); | }); | ||||
summarizer.on('error', (err) => { | summarizer.on('error', (err) => { | ||||