Browse Source

Minor refactor

Make event names consistent and improve the API.
master
TheoryOfNekomata 1 year ago
parent
commit
9252b037ca
8 changed files with 103 additions and 33 deletions
  1. +4
    -0
      .env.example
  2. +14
    -0
      README.md
  3. +17
    -0
      package.json
  4. +7
    -6
      src/common.ts
  5. +2
    -0
      src/index.ts
  6. +15
    -16
      src/video-types/youtube/index.ts
  7. +17
    -1
      src/video-types/youtube/transcript.ts
  8. +27
    -10
      test/index.test.ts

+ 4
- 0
.env.example View File

@@ -1 +1,5 @@
# OpenAI API key.
OPENAI_API_KEY= OPENAI_API_KEY=

# OpenAI organization ID.
OPENAI_ORGANIZATION_ID=

+ 14
- 0
README.md View File

@@ -0,0 +1,14 @@
# webvideo-transcript-summary-core

This is the core SDK for summarizing transcripts for Web videos.

## Setup

1. Install dependencies.
2. Copy `.env.example` to `.env` and fill in the correct values.
3. Refer to `prompt-template.hbs` and create lists of prompts:
* `prompts/normalize-transcript-text.hbs` is needed for adding proper punctuation to the transcript text.
* `prompts/summarize-transcript.hbs` is needed to perform the actual summarization.
4. Run `npm run build` to build the project.

Run `npm link` to use it in your own project.

+ 17
- 0
package.json View File

@@ -49,5 +49,22 @@
"dependencies": { "dependencies": {
"fetch-ponyfill": "^7.1.0", "fetch-ponyfill": "^7.1.0",
"handlebars": "^4.7.7" "handlebars": "^4.7.7"
},
"types": "./dist/types/index.d.ts",
"main": "./dist/cjs/production/index.js",
"module": "./dist/esm/production/index.js",
"exports": {
".": {
"development": {
"require": "./dist/cjs/development/index.js",
"import": "./dist/esm/development/index.js"
},
"require": "./dist/cjs/production/index.js",
"import": "./dist/esm/production/index.js",
"types": "./dist/types/index.d.ts"
}
},
"typesVersions": {
"*": {}
} }
} }

+ 7
- 6
src/common.ts View File

@@ -1,18 +1,19 @@
type ProcessEvent = { type: string, phase: string, command?: string };
/**
 * Payload handed to listeners of the `process` event.
 */
export type ProcessEvent = {
  /** Step being reported, e.g. `'extract-data'` or `'fetch-transcript'`. */
  processType: string;
  /** Stage within the step, e.g. `'start'` or `'success'`. */
  phase: string;
  /** NOTE(review): no emitter in view sets this field; meaning to be confirmed. */
  command?: string;
  /** Result carried by the event, as a string. */
  content?: string;
  /** Media type describing `content`, e.g. `'application/json'` or `'text/plain'`. */
  contentType?: string;
};


type ProcessEventCallback = (event: ProcessEvent) => void; type ProcessEventCallback = (event: ProcessEvent) => void;


type ErrorEventCallback = (event: Error) => void; type ErrorEventCallback = (event: Error) => void;


type SuccessEvent = { contentType: string, content: unknown };

type SuccessEventCallback = (event: SuccessEvent) => void;

export interface SummarizerEventEmitter extends NodeJS.EventEmitter { export interface SummarizerEventEmitter extends NodeJS.EventEmitter {
process(): void; process(): void;
on(eventType: 'process', callback: ProcessEventCallback): this; on(eventType: 'process', callback: ProcessEventCallback): this;
on(eventType: 'error', callback: ErrorEventCallback): this; on(eventType: 'error', callback: ErrorEventCallback): this;
on(eventType: 'success', callback: SuccessEventCallback): this;
on(eventType: 'end', callback: () => void): this; on(eventType: 'end', callback: () => void): this;
} }




+ 2
- 0
src/index.ts View File

@@ -37,3 +37,5 @@ export const createSummarizer = (params: CreateSummarizerParams): SummarizerEven


throw new TypeError(`Invalid video type: "${videoType}". Valid values are: ${JSON.stringify(Object.values(VideoType))}`); throw new TypeError(`Invalid video type: "${videoType}". Valid values are: ${JSON.stringify(Object.values(VideoType))}`);
}; };

export * from './common';

+ 15
- 16
src/video-types/youtube/index.ts View File

@@ -4,7 +4,7 @@ import {
retrieveVideoId, retrieveVideoId,
getVideoPage, getVideoPage,
extractDataFromPage, extractDataFromPage,
fetchTranscriptItems, TranscriptResponse,
fetchTranscriptItems,
} from './transcript'; } from './transcript';
import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer'; import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer';


@@ -23,10 +23,9 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa
...config ...config
} = this.params; } = this.params;
const identifier = retrieveVideoId(url); const identifier = retrieveVideoId(url);
let transcripts: TranscriptResponse[] = [];


this.emit('process', { this.emit('process', {
type: 'extract-data',
processType: 'extract-data',
phase: 'download-page', phase: 'download-page',
}); });


@@ -34,26 +33,26 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa
.then((videoPageBody) => { .then((videoPageBody) => {
const pageData = extractDataFromPage(videoPageBody); const pageData = extractDataFromPage(videoPageBody);
this.emit('process', { this.emit('process', {
type: 'extract-data',
processType: 'extract-data',
phase: 'success', phase: 'success',
}); });


this.emit('process', { this.emit('process', {
type: 'fetch-transcript',
processType: 'fetch-transcript',
phase: 'start', phase: 'start',
}); });
return fetchTranscriptItems(pageData, config); return fetchTranscriptItems(pageData, config);
}) })
.then((transcript) => { .then((transcript) => {
this.emit('process', { this.emit('process', {
type: 'fetch-transcript',
processType: 'fetch-transcript',
phase: 'success', phase: 'success',
content: JSON.stringify(transcript),
contentType: 'application/json',
}); });


transcripts = transcript;

this.emit('process', { this.emit('process', {
type: 'normalize-caption',
processType: 'normalize-caption',
phase: 'start', phase: 'start',
}); });


@@ -65,12 +64,14 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa
}) })
.then((normalizedCaption) => { .then((normalizedCaption) => {
this.emit('process', { this.emit('process', {
type: 'normalize-caption',
processType: 'normalize-transcript',
phase: 'success', phase: 'success',
content: normalizedCaption,
contentType: 'text/plain',
}); });


this.emit('process', { this.emit('process', {
type: 'summarize-caption',
processType: 'summarize-transcript',
phase: 'start', phase: 'start',
}); });


@@ -78,14 +79,12 @@ export class YouTubeSummarizerEventEmitter extends EventEmitter implements Summa
}) })
.then((summary) => { .then((summary) => {
this.emit('process', { this.emit('process', {
type: 'summarize-caption',
processType: 'summarize-transcript',
phase: 'success', phase: 'success',
data: summary,
contentType: 'text/plain',
}); });


this.emit('success', {
contentType: 'application/json',
content: JSON.stringify({ transcripts, summary }),
});
this.emit('end'); this.emit('end');
}) })
.catch((error) => { .catch((error) => {


+ 17
- 1
src/video-types/youtube/transcript.ts View File

@@ -123,6 +123,20 @@ interface VideoPageData {
clickTrackingParams?: string; clickTrackingParams?: string;
} }


/**
 * One entry of the transcript list that `fetchTranscriptItems` maps over.
 *
 * `durationMs` and `startOffsetMs` are strings here; the consumer converts
 * them with `parseInt(value, 10)`.
 */
export interface Cue {
  transcriptCueGroupRenderer: {
    cues: Array<{
      transcriptCueRenderer: {
        /** Caption text for this cue. */
        cue: { simpleText: string };
        durationMs: string;
        startOffsetMs: string;
      };
    }>;
  };
}

export const extractDataFromPage = (page: string): VideoPageData => ({ export const extractDataFromPage = (page: string): VideoPageData => ({
innerTubeApiKey: extractInnterTubeApiKeyFromPage(page), innerTubeApiKey: extractInnterTubeApiKeyFromPage(page),
serializedShareEntity: extractSerializedShareEntityFromPage(page), serializedShareEntity: extractSerializedShareEntityFromPage(page),
@@ -217,16 +231,18 @@ export const fetchTranscriptItems = async (pageData: VideoPageData, config?: Tra
.body .body
.transcriptBodyRenderer; .transcriptBodyRenderer;


return transcripts.map((cue) => ({
return transcripts.map((cue: Cue) => ({
text: cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer text: cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer
.cue.simpleText, .cue.simpleText,
duration: parseInt( duration: parseInt(
cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer
.durationMs, .durationMs,
10,
), ),
offset: parseInt( offset: parseInt(
cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer cue.transcriptCueGroupRenderer.cues[0].transcriptCueRenderer
.startOffsetMs, .startOffsetMs,
10,
), ),
})) as TranscriptResponse[]; })) as TranscriptResponse[];
}; };

+ 27
- 10
test/index.test.ts View File

@@ -1,7 +1,16 @@
import { config } from 'dotenv';
import { writeFileSync } from 'fs';
import { beforeAll, describe, it } from 'vitest'; import { beforeAll, describe, it } from 'vitest';
import { config } from 'dotenv';
import { createSummarizer, VideoType } from '../src'; import { createSummarizer, VideoType } from '../src';
import { writeFileSync } from 'fs';

/**
 * Writes the text of every transcript item to `filename`, one item per line.
 *
 * `content` is a JSON string in one of two shapes:
 * - a bare array of items (`[{ text: ... }, ...]`), which is what the
 *   `fetch-transcript` success event carries, or
 * - an object wrapping that array under `transcripts`
 *   (`{ transcripts: [...], summary: ... }`).
 *
 * @param filename Path of the file to (over)write.
 * @param content JSON-serialized transcript data.
 * @throws SyntaxError when `content` is not valid JSON.
 * @throws TypeError when the parsed value holds no transcript array.
 */
const writeTranscript = (filename: string, content: string) => {
  const parsed: unknown = JSON.parse(content);

  // Accept the bare array as well as the wrapped form; the event payload
  // is the array itself, so reading `.transcripts` unconditionally fails.
  const items: unknown = Array.isArray(parsed)
    ? parsed
    : (parsed as { transcripts?: unknown } | null)?.transcripts;

  if (!Array.isArray(items)) {
    throw new TypeError(
      'Expected a transcript array or an object with a "transcripts" array.',
    );
  }

  const transcriptText = items
    .map((item: { text: string }) => item.text)
    .join('\n');
  writeFileSync(filename, transcriptText);
};


describe('blah', () => { describe('blah', () => {
beforeAll(() => { beforeAll(() => {
@@ -15,14 +24,22 @@ describe('blah', () => {
openaiApiKey: process.env.OPENAI_API_KEY as string, openaiApiKey: process.env.OPENAI_API_KEY as string,
}); });


summarizer.on('success', (data) => {
const transcripts = JSON.parse(data.content as string) as {
transcripts: { text: string }[],
summary: string,
};
const transcriptText = transcripts.transcripts.map((t) => t.text).join('\n');
writeFileSync('transcript.txt', transcriptText);
writeFileSync('summary.txt', transcripts.summary);
summarizer.on('process', (data) => {
if (data.phase === 'success') {
switch (data.processType) {
case 'fetch-transcript':
writeTranscript('transcript.txt', data.content as string);
break;
case 'normalize-transcript':
writeFileSync('normalized.txt', data.content as string);
break;
case 'summarize-transcript':
writeFileSync('summary.txt', data.content as string);
break;
default:
break;
}
}
}); });


summarizer.on('error', (err) => { summarizer.on('error', (err) => {


Loading…
Cancel
Save