From 9252b037ca0eeb98f364ec2e7830f38276a71456 Mon Sep 17 00:00:00 2001 From: TheoryOfNekomata Date: Tue, 11 Apr 2023 19:51:51 +0800 Subject: [PATCH] Minor refactor Make event names consistent and improve the API. --- .env.example | 4 +++ README.md | 14 ++++++++++ package.json | 17 ++++++++++++ src/common.ts | 13 +++++----- src/index.ts | 2 ++ src/video-types/youtube/index.ts | 31 +++++++++++----------- src/video-types/youtube/transcript.ts | 18 ++++++++++++- test/index.test.ts | 37 +++++++++++++++++++-------- 8 files changed, 103 insertions(+), 33 deletions(-) create mode 100644 README.md diff --git a/.env.example b/.env.example index e570b8b..775a8f0 100644 --- a/.env.example +++ b/.env.example @@ -1 +1,5 @@ +# OpenAI API key. OPENAI_API_KEY= + +# OpenAI organization ID. +OPENAI_ORGANIZATION_ID= diff --git a/README.md b/README.md new file mode 100644 index 0000000..e9e95d6 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# webvideo-transcript-summary-core + +This is the core SDK for summarizing transcripts for Web videos. + +## Setup + +1. Install dependencies. +2. Copy `.env.example` to `.env` and fill in the correct values. +3. Refer to `prompt-template.hbs` and create lists of prompts: + * `prompts/normalize-transcript-text.hbs` is needed to add proper punctuation to the transcript text. + * `prompts/summarize-transcript.hbs` is needed to perform the actual summarization. +4. Run `npm run build` to build the project. + +Use `npm link` to use it in your own project.
diff --git a/package.json b/package.json index 2a16f88..9daae67 100644 --- a/package.json +++ b/package.json @@ -49,5 +49,22 @@ "dependencies": { "fetch-ponyfill": "^7.1.0", "handlebars": "^4.7.7" + }, + "types": "./dist/types/index.d.ts", + "main": "./dist/cjs/production/index.js", + "module": "./dist/esm/production/index.js", + "exports": { + ".": { + "types": "./dist/types/index.d.ts", + "development": { + "require": "./dist/cjs/development/index.js", + "import": "./dist/esm/development/index.js" + }, + "require": "./dist/cjs/production/index.js", + "import": "./dist/esm/production/index.js" + } + }, + "typesVersions": { + "*": {} } } diff --git a/src/common.ts b/src/common.ts index 9a2539b..c7ea11d 100644 --- a/src/common.ts +++ b/src/common.ts @@ -1,18 +1,19 @@ -type ProcessEvent = { type: string, phase: string, command?: string }; +export type ProcessEvent = { + processType: string, + phase: string, + command?: string, + content?: string, + contentType?: string, +}; type ProcessEventCallback = (event: ProcessEvent) => void; type ErrorEventCallback = (event: Error) => void; -type SuccessEvent = { contentType: string, content: unknown }; - -type SuccessEventCallback = (event: SuccessEvent) => void; - export interface SummarizerEventEmitter extends NodeJS.EventEmitter { process(): void; on(eventType: 'process', callback: ProcessEventCallback): this; on(eventType: 'error', callback: ErrorEventCallback): this; - on(eventType: 'success', callback: SuccessEventCallback): this; on(eventType: 'end', callback: () => void): this; } diff --git a/src/index.ts b/src/index.ts index df4420d..6f2cd62 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,3 +37,5 @@ export const createSummarizer = (params: CreateSummarizerParams): SummarizerEven throw new TypeError(`Invalid video type: "${videoType}". 
Valid values are: ${JSON.stringify(Object.values(VideoType))}`); }; + +export * from './common'; diff --git a/src/video-types/youtube/index.ts b/src/video-types/youtube/index.ts index 0e9b9ae..454eaf3 100644 --- a/src/video-types/youtube/index.ts +++ b/src/video-types/youtube/index.ts @@ -4,7 +4,7 @@ import { retrieveVideoId, getVideoPage, extractDataFromPage, - fetchTranscriptItems, TranscriptResponse, + fetchTranscriptItems, } from './transcript'; import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer'; @@ -23,10 +23,9 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa ...config } = this.params; const identifier = retrieveVideoId(url); - let transcripts: TranscriptResponse[] = []; this.emit('process', { - type: 'extract-data', + processType: 'extract-data', phase: 'download-page', }); @@ -34,26 +33,26 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa .then((videoPageBody) => { const pageData = extractDataFromPage(videoPageBody); this.emit('process', { - type: 'extract-data', + processType: 'extract-data', phase: 'success', }); this.emit('process', { - type: 'fetch-transcript', + processType: 'fetch-transcript', phase: 'start', }); return fetchTranscriptItems(pageData, config); }) .then((transcript) => { this.emit('process', { - type: 'fetch-transcript', + processType: 'fetch-transcript', phase: 'success', + content: JSON.stringify(transcript), + contentType: 'application/json', }); - transcripts = transcript; - this.emit('process', { - type: 'normalize-caption', + processType: 'normalize-transcript', phase: 'start', }); @@ -65,12 +64,14 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa }) .then((normalizedCaption) => { this.emit('process', { - type: 'normalize-caption', + processType: 'normalize-transcript', phase: 'success', + content: normalizedCaption, + contentType: 'text/plain', }); this.emit('process', { - type: 'summarize-caption', +
processType: 'summarize-transcript', phase: 'start', }); @@ -78,14 +79,12 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa }) .then((summary) => { this.emit('process', { - type: 'summarize-caption', + processType: 'summarize-transcript', phase: 'success', + content: summary, + contentType: 'text/plain', }); - this.emit('success', { - contentType: 'application/json', - content: JSON.stringify({ transcripts, summary }), - }); this.emit('end'); }) .catch((error) => { diff --git a/src/video-types/youtube/transcript.ts b/src/video-types/youtube/transcript.ts index f3cf6e4..0cd53bd 100644 --- a/src/video-types/youtube/transcript.ts +++ b/src/video-types/youtube/transcript.ts @@ -123,6 +123,20 @@ interface VideoPageData { clickTrackingParams?: string; } +export interface Cue { + transcriptCueGroupRenderer: { + cues: { + transcriptCueRenderer: { + cue: { + simpleText: string; + }, + durationMs: string; + startOffsetMs: string; + } + }[], + }, +} + export const extractDataFromPage = (page: string): VideoPageData => ({ innerTubeApiKey: extractInnterTubeApiKeyFromPage(page), serializedShareEntity: extractSerializedShareEntityFromPage(page), @@ -217,16 +231,18 @@ export const fetchTranscriptItems = async (pageData: VideoPageData, config?: Tra .body .transcriptBodyRenderer; - return transcripts.map((cue) => ({ + return transcripts.map((cue: Cue) => ({ text: cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer .cue.simpleText, duration: parseInt( cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer .durationMs, + 10, ), offset: parseInt( cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer .startOffsetMs, + 10, ), })) as TranscriptResponse[]; }; diff --git a/test/index.test.ts b/test/index.test.ts index 7ddb2bf..bbd524c 100644 --- a/test/index.test.ts +++ b/test/index.test.ts @@ -1,7 +1,16 @@ -import { config } from 'dotenv'; +import { writeFileSync } from 'fs'; import { beforeAll, describe, it } from 'vitest'; +import {
config } from 'dotenv'; import { createSummarizer, VideoType } from '../src'; -import { writeFileSync } from 'fs'; + +const writeTranscript = (filename: string, content: string) => { + /* `content` is the JSON-serialized array of transcript items from the 'fetch-transcript' success event. */ + const transcripts = JSON.parse(content) as { + text: string, + }[]; + const transcriptText = transcripts.map((t) => t.text).join('\n'); + writeFileSync(filename, transcriptText); +}; describe('blah', () => { beforeAll(() => { @@ -15,14 +24,22 @@ describe('blah', () => { openaiApiKey: process.env.OPENAI_API_KEY as string, }); - summarizer.on('success', (data) => { - const transcripts = JSON.parse(data.content as string) as { - transcripts: { text: string }[], - summary: string, - }; - const transcriptText = transcripts.transcripts.map((t) => t.text).join('\n'); - writeFileSync('transcript.txt', transcriptText); - writeFileSync('summary.txt', transcripts.summary); + summarizer.on('process', (data) => { + if (data.phase === 'success') { + switch (data.processType) { + case 'fetch-transcript': + writeTranscript('transcript.txt', data.content as string); + break; + case 'normalize-transcript': + writeFileSync('normalized.txt', data.content as string); + break; + case 'summarize-transcript': + writeFileSync('summary.txt', data.content as string); + break; + default: + break; + } + } }); summarizer.on('error', (err) => {