@@ -1 +1,5 @@ | |||
# OpenAI API key. | |||
OPENAI_API_KEY= | |||
# OpenAI organization ID. | |||
OPENAI_ORGANIZATION_ID= |
@@ -0,0 +1,14 @@ | |||
# webvideo-transcript-summary-core | |||
This is the core SDK for summarizing transcripts of web videos.
## Setup | |||
1. Install dependencies. | |||
2. Copy `.env.example` to `.env` and fill in the correct values. | |||
3. Using `prompt-template.hbs` as a reference, create the following prompt files:
* `prompts/normalize-transcript-text.hbs` is used to add proper punctuation to the transcript text.
* `prompts/summarize-transcript.hbs` is used to perform the actual summarization.
4. Run `npm run build` to build the project. | |||
Run `npm link` to use it in your own project.
@@ -49,5 +49,22 @@ | |||
"dependencies": { | |||
"fetch-ponyfill": "^7.1.0", | |||
"handlebars": "^4.7.7" | |||
}, | |||
"types": "./dist/types/index.d.ts", | |||
"main": "./dist/cjs/production/index.js", | |||
"module": "./dist/esm/production/index.js", | |||
"exports": {
  ".": {
    "types": "./dist/types/index.d.ts",
    "development": {
      "require": "./dist/cjs/development/index.js",
      "import": "./dist/esm/development/index.js"
    },
    "require": "./dist/cjs/production/index.js",
    "import": "./dist/esm/production/index.js"
  }
},
"typesVersions": { | |||
"*": {} | |||
} | |||
} |
@@ -1,18 +1,19 @@ | |||
/**
 * Payload delivered to listeners of the `process` event.
 *
 * One event is emitted per stage transition, so a listener can follow the
 * pipeline by watching the (`processType`, `phase`) pair.
 */
export type ProcessEvent = {
  /** Stage the event belongs to, e.g. `'extract-data'` or `'fetch-transcript'`. */
  processType: string,
  /** Position within the stage, e.g. `'start'` or `'success'`. */
  phase: string,
  /** Optional command name. NOTE(review): no emitter visible here sets it — confirm usage. */
  command?: string,
  /** Stage output serialized as a string; only present on events that carry a result. */
  content?: string,
  /** MIME type describing `content`, e.g. `'application/json'` or `'text/plain'`. */
  contentType?: string,
};
/** A listener that receives exactly one event payload. */
type Listener<TPayload> = (event: TPayload) => void;

/** Payload of the `success` event: a result plus the MIME type describing it. */
interface SuccessEvent {
  contentType: string;
  content: unknown;
}

type ProcessEventCallback = Listener<ProcessEvent>;
type ErrorEventCallback = Listener<Error>;
type SuccessEventCallback = Listener<SuccessEvent>;

/**
 * Event emitter contract shared by the summarizer implementations.
 *
 * Typed `on` overloads are provided for the four events the summarizers use:
 * `process`, `error`, `success` and `end`.
 */
export interface SummarizerEventEmitter extends NodeJS.EventEmitter {
  /**
   * Entry point of the summarizer.
   * NOTE(review): presumably starts the pipeline and reports through the
   * events below — confirm against the implementing classes.
   */
  process(): void;

  /** Subscribes to stage progress notifications. */
  on(eventType: 'process', callback: ProcessEventCallback): this;
  /** Subscribes to failures; the listener receives the `Error`. */
  on(eventType: 'error', callback: ErrorEventCallback): this;
  /** Subscribes to the result notification. */
  on(eventType: 'success', callback: SuccessEventCallback): this;
  /** Subscribes to the `end` notification; the listener takes no arguments. */
  on(eventType: 'end', callback: () => void): this;
}
@@ -37,3 +37,5 @@ export const createSummarizer = (params: CreateSummarizerParams): SummarizerEven | |||
throw new TypeError(`Invalid video type: "${videoType}". Valid values are: ${JSON.stringify(Object.values(VideoType))}`); | |||
}; | |||
export * from './common'; |
@@ -4,7 +4,7 @@ import { | |||
retrieveVideoId, | |||
getVideoPage, | |||
extractDataFromPage, | |||
fetchTranscriptItems, TranscriptResponse, | |||
fetchTranscriptItems, | |||
} from './transcript'; | |||
import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer'; | |||
@@ -23,10 +23,9 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||
...config | |||
} = this.params; | |||
const identifier = retrieveVideoId(url); | |||
let transcripts: TranscriptResponse[] = []; | |||
this.emit('process', { | |||
type: 'extract-data', | |||
processType: 'extract-data', | |||
phase: 'download-page', | |||
}); | |||
@@ -34,26 +33,26 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||
.then((videoPageBody) => { | |||
const pageData = extractDataFromPage(videoPageBody); | |||
this.emit('process', { | |||
type: 'extract-data', | |||
processType: 'extract-data', | |||
phase: 'success', | |||
}); | |||
this.emit('process', { | |||
type: 'fetch-transcript', | |||
processType: 'fetch-transcript', | |||
phase: 'start', | |||
}); | |||
return fetchTranscriptItems(pageData, config); | |||
}) | |||
.then((transcript) => { | |||
this.emit('process', { | |||
type: 'fetch-transcript', | |||
processType: 'fetch-transcript', | |||
phase: 'success', | |||
content: JSON.stringify(transcript), | |||
contentType: 'application/json', | |||
}); | |||
transcripts = transcript; | |||
// Announce the normalization stage. `processType` must match the value used by
// the corresponding success event ('normalize-transcript'); otherwise a
// listener keyed on `processType` cannot pair the stage's start and success.
this.emit('process', {
  type: 'normalize-caption',
  processType: 'normalize-transcript',
  phase: 'start',
});
@@ -65,12 +64,14 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||
}) | |||
.then((normalizedCaption) => { | |||
this.emit('process', { | |||
type: 'normalize-caption', | |||
processType: 'normalize-transcript', | |||
phase: 'success', | |||
content: normalizedCaption, | |||
contentType: 'text/plain', | |||
}); | |||
this.emit('process', { | |||
type: 'summarize-caption', | |||
processType: 'summarize-transcript', | |||
phase: 'start', | |||
}); | |||
@@ -78,14 +79,12 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa | |||
}) | |||
.then((summary) => {
  // Publish the finished summary. The text travels in `content`, the payload
  // field declared on `ProcessEvent` and the one listeners read; it was
  // previously sent under `data`, so listeners received `undefined`.
  this.emit('process', {
    type: 'summarize-caption',
    processType: 'summarize-transcript',
    phase: 'success',
    content: summary,
    contentType: 'text/plain',
  });
  // Final result: the raw transcript items together with the summary.
  this.emit('success', {
    contentType: 'application/json',
    content: JSON.stringify({ transcripts, summary }),
  });
  // Signal that no further events will follow from this successful run.
  this.emit('end');
})
.catch((error) => { | |||
@@ -123,6 +123,20 @@ interface VideoPageData { | |||
clickTrackingParams?: string; | |||
} | |||
/**
 * One entry of the transcript body returned by the transcript endpoint.
 *
 * Each entry wraps a list of cue renderers; the numeric fields arrive as
 * strings and are converted by the caller.
 */
export interface Cue {
  transcriptCueGroupRenderer: {
    cues: Array<{
      transcriptCueRenderer: {
        /** The caption text of this cue. */
        cue: { simpleText: string };
        /** Cue duration in milliseconds, encoded as a decimal string. */
        durationMs: string;
        /** Cue start offset in milliseconds, encoded as a decimal string. */
        startOffsetMs: string;
      };
    }>;
  };
}
export const extractDataFromPage = (page: string): VideoPageData => ({ | |||
innerTubeApiKey: extractInnterTubeApiKeyFromPage(page), | |||
serializedShareEntity: extractSerializedShareEntityFromPage(page), | |||
@@ -217,16 +231,18 @@ export const fetchTranscriptItems = async (pageData: VideoPageData, config?: Tra | |||
.body | |||
.transcriptBodyRenderer; | |||
return transcripts.map((cue) => ({ | |||
return transcripts.map((cue: Cue) => ({ | |||
text: cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | |||
.cue.simpleText, | |||
duration: parseInt( | |||
cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | |||
.durationMs, | |||
10, | |||
), | |||
offset: parseInt( | |||
cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer | |||
.startOffsetMs, | |||
10, | |||
), | |||
})) as TranscriptResponse[]; | |||
}; |
@@ -1,7 +1,16 @@ | |||
import { config } from 'dotenv'; | |||
import { writeFileSync } from 'fs'; | |||
import { beforeAll, describe, it } from 'vitest'; | |||
import { config } from 'dotenv'; | |||
import { createSummarizer, VideoType } from '../src'; | |||
import { writeFileSync } from 'fs'; | |||
/**
 * Writes the text of every transcript cue to `filename`, one cue per line.
 *
 * `content` is a JSON string in one of two shapes:
 * - a bare array of cues (`[{ text }, ...]`), which is what the
 *   `fetch-transcript` success event carries, or
 * - the `{ transcripts: [{ text }, ...], summary }` envelope carried by the
 *   `success` event.
 *
 * @param filename Path of the file to (over)write.
 * @param content JSON-encoded transcript in one of the shapes above.
 * @throws SyntaxError when `content` is not valid JSON.
 * @throws TypeError when the parsed JSON matches neither shape.
 */
const writeTranscript = (filename: string, content: string) => {
  const parsed: unknown = JSON.parse(content);
  // Accept the bare array first; otherwise look for the envelope's `transcripts`.
  const cues: unknown = Array.isArray(parsed)
    ? parsed
    : (parsed as { transcripts?: unknown } | null)?.transcripts;
  if (!Array.isArray(cues)) {
    throw new TypeError(
      'Transcript content must be a JSON array of cues or an object with a "transcripts" array.',
    );
  }
  const transcriptText = cues.map((cue: { text: string }) => cue.text).join('\n');
  writeFileSync(filename, transcriptText);
};
describe('blah', () => { | |||
beforeAll(() => { | |||
@@ -15,14 +24,22 @@ describe('blah', () => { | |||
openaiApiKey: process.env.OPENAI_API_KEY as string, | |||
}); | |||
summarizer.on('success', (data) => { | |||
const transcripts = JSON.parse(data.content as string) as { | |||
transcripts: { text: string }[], | |||
summary: string, | |||
}; | |||
const transcriptText = transcripts.transcripts.map((t) => t.text).join('\n'); | |||
writeFileSync('transcript.txt', transcriptText); | |||
writeFileSync('summary.txt', transcripts.summary); | |||
summarizer.on('process', (data) => { | |||
if (data.phase === 'success') { | |||
switch (data.processType) { | |||
case 'fetch-transcript': | |||
writeTranscript('transcript.txt', data.content as string); | |||
break; | |||
case 'normalize-transcript': | |||
writeFileSync('normalized.txt', data.content as string); | |||
break; | |||
case 'summarize-transcript': | |||
writeFileSync('summary.txt', data.content as string); | |||
break; | |||
default: | |||
break; | |||
} | |||
} | |||
}); | |||
summarizer.on('error', (err) => { | |||