Browse Source

Initial commit

Add files from pridepack.
master
TheoryOfNekomata 1 year ago
commit
d82031ab4a
17 changed files with 4397 additions and 0 deletions
  1. +1
    -0
      .env.example
  2. +9
    -0
      .eslintrc
  3. +112
    -0
      .gitignore
  4. +7
    -0
      LICENSE
  5. +53
    -0
      package.json
  6. +3
    -0
      pridepack.json
  7. +15
    -0
      prompt-template.hbs
  8. +25
    -0
      src/common.ts
  9. +39
    -0
      src/index.ts
  10. +84
    -0
      src/summarizer.ts
  11. +13
    -0
      src/video-types/youtube/errors.ts
  12. +96
    -0
      src/video-types/youtube/index.ts
  13. +232
    -0
      src/video-types/youtube/transcript.ts
  14. +39
    -0
      test/index.test.ts
  15. +21
    -0
      tsconfig.eslint.json
  16. +21
    -0
      tsconfig.json
  17. +3627
    -0
      yarn.lock

+ 1
- 0
.env.example View File

@@ -0,0 +1 @@
OPENAI_API_KEY=

+ 9
- 0
.eslintrc View File

@@ -0,0 +1,9 @@
{
"root": true,
"extends": [
"lxsmnsyc/typescript"
],
"parserOptions": {
"project": "./tsconfig.eslint.json"
}
}

+ 112
- 0
.gitignore View File

@@ -0,0 +1,112 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.production
.env.development

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and *not* Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

.npmrc
.idea/
transcript.txt
summary.txt
results/
prompts/

+ 7
- 0
LICENSE View File

@@ -0,0 +1,7 @@
MIT License Copyright (c) 2023 TheoryOfNekomata <allan.crisostomo@outlook.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 53
- 0
package.json View File

@@ -0,0 +1,53 @@
{
"name": "@modal-sh/webvideo-transcript-summary-core",
"version": "0.0.0",
"files": [
"dist",
"src"
],
"engines": {
"node": ">=12"
},
"license": "MIT",
"keywords": [
"pridepack"
],
"devDependencies": {
"@types/node": "^18.14.1",
"eslint": "^8.35.0",
"eslint-config-lxsmnsyc": "^0.5.0",
"pridepack": "2.4.4",
"tslib": "^2.5.0",
"typescript": "^4.9.5",
"vitest": "^0.28.1"
},
"scripts": {
"prepublishOnly": "pridepack clean && pridepack build",
"build": "pridepack build",
"type-check": "pridepack check",
"lint": "pridepack lint",
"clean": "pridepack clean",
"watch": "pridepack watch",
"start": "pridepack start",
"dev": "pridepack dev",
"test": "vitest"
},
"private": false,
"description": "Get transcript summaries of Web videos.",
"repository": {
"url": "https://code.modal.sh/modal-soft/webvideo-transcript-summary-core",
"type": "git"
},
"homepage": "https://code.modal.sh/modal-soft/webvideo-transcript-summary-core",
"bugs": {
"url": "https://code.modal.sh/modal-soft/webvideo-transcript-summary-core/issues"
},
"author": "TheoryOfNekomata <allan.crisostomo@outlook.com>",
"publishConfig": {
"access": "public"
},
"dependencies": {
"fetch-ponyfill": "^7.1.0",
"handlebars": "^4.7.7"
}
}

+ 3
- 0
pridepack.json View File

@@ -0,0 +1,3 @@
{
"target": "es2018"
}

+ 15
- 0
prompt-template.hbs View File

@@ -0,0 +1,15 @@
Prompt text 1

---

Prompt text 2

{{transcript}}

---

Prompt text 3

{{transcript}}

Next line of text

+ 25
- 0
src/common.ts View File

@@ -0,0 +1,25 @@
/** Payload delivered to `'process'` listeners; `command` is optional. */
type ProcessEvent = { type: string, phase: string, command?: string };

/** Listener signature for `'process'` events. */
type ProcessEventCallback = (event: ProcessEvent) => void;

/** Listener signature for `'error'` events; receives the emitted error. */
type ErrorEventCallback = (event: Error) => void;

/** Payload delivered to `'success'` listeners: a content type plus the content itself. */
type SuccessEvent = { contentType: string, content: unknown };

/** Listener signature for `'success'` events. */
type SuccessEventCallback = (event: SuccessEvent) => void;

/**
 * Contract for summarizers: an event emitter with a `process()` entry point
 * and typed `on()` overloads for the events it reports.
 */
export interface SummarizerEventEmitter extends NodeJS.EventEmitter {
// Entry point that kicks off the work; outcomes are reported through the events below.
process(): void;
// Progress notifications.
on(eventType: 'process', callback: ProcessEventCallback): this;
// Failure notification carrying the error.
on(eventType: 'error', callback: ErrorEventCallback): this;
// Result notification carrying the content and its content type.
on(eventType: 'success', callback: SuccessEventCallback): this;
// Completion notification; the callback receives no arguments.
on(eventType: 'end', callback: () => void): this;
}

/** Parameters common to every summarizer. */
export interface CreateBaseSummarizerParams {
// Location of the video; the YouTube summarizer passes it to `retrieveVideoId`.
url: string;
// Optional language; the YouTube summarizer forwards it in the transcript request config.
language?: string;
// Optional country; the YouTube summarizer forwards it in the transcript request config.
country?: string;
// Sent as the bearer token on OpenAI API calls.
openaiApiKey: string;
// When set, sent as the `OpenAI-Organization` header on OpenAI API calls.
openaiOrganizationId?: string;
}

+ 39
- 0
src/index.ts View File

@@ -0,0 +1,39 @@
import { SummarizerEventEmitter } from './common';
import {
CreateYouTubeSummarizerParams,
YouTubeSummarizerEventEmitter,
} from './video-types/youtube';

/** Video sources that `createSummarizer` knows how to dispatch on. */
export enum VideoType {
YOUTUBE = 'youtube',
}

/** Parameters accepted by `createSummarizer`: the YouTube parameters plus the video type selector. */
export interface CreateSummarizerParams extends CreateYouTubeSummarizerParams {
// Selects which summarizer implementation is created.
type: VideoType;
}

/**
 * Creates the summarizer that corresponds to `params.type`.
 *
 * Only the known summarizer fields are forwarded to the implementation; any
 * other properties present on `params` are left behind.
 *
 * @param params - Video type plus the summarizer parameters.
 * @returns An event emitter whose `process()` method starts the run.
 * @throws TypeError when `params.type` is not a value of `VideoType`.
 */
export const createSummarizer = (params: CreateSummarizerParams): SummarizerEventEmitter => {
  const {
    type: videoType,
    url,
    openaiOrganizationId,
    openaiApiKey,
    language,
    country,
  } = params;

  // Compared as a plain string so values arriving from untyped callers still
  // reach the descriptive error below.
  if ((videoType as string) === VideoType.YOUTUBE) {
    return new YouTubeSummarizerEventEmitter({
      url,
      openaiOrganizationId,
      openaiApiKey,
      language,
      country,
    });
  }

  const validTypes = JSON.stringify(Object.values(VideoType));
  throw new TypeError(`Invalid video type: "${videoType}". Valid values are: ${validTypes}`);
};

+ 84
- 0
src/summarizer.ts View File

@@ -0,0 +1,84 @@
import fetchPonyfill from 'fetch-ponyfill';
import Handlebars from 'handlebars';
import { resolve } from 'path';
import { readFile } from 'fs/promises';

/**
 * Sends a single prompt to the OpenAI chat completions endpoint and returns
 * the text of the first choice.
 *
 * One entry of `prompts` is chosen at random for each call.
 *
 * @param prompts - Candidate prompts; must contain at least one entry.
 * @param apiKey - OpenAI API key, sent as a bearer token.
 * @param organizationId - Optional value for the `OpenAI-Organization` header.
 * @returns The message content of the first returned choice.
 * @throws Error when no prompt is available, when the API responds with a
 *   non-OK status (the response body is included in the message), or when the
 *   response carries no message content.
 */
const makeAiCall = async (prompts: string[], apiKey: string, organizationId?: string): Promise<string> => {
  // Fail with a clear message instead of a TypeError from `undefined.trim()`.
  if (prompts.length === 0) {
    throw new Error('No prompts available for the OpenAI API call.');
  }

  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
    Accept: 'application/json',
    Authorization: `Bearer ${apiKey}`,
  };

  if (organizationId) {
    headers['OpenAI-Organization'] = organizationId;
  }

  const prompt = prompts[Math.floor(Math.random() * prompts.length)].trim();

  const { fetch } = fetchPonyfill();
  const response = await fetch(
    new URL('/v1/chat/completions', 'https://api.openai.com'),
    {
      method: 'POST',
      headers,
      body: JSON.stringify({
        // model: 'gpt-4',
        model: 'gpt-3.5-turbo',
        temperature: 0.6,
        messages: [
          {
            role: 'user',
            content: prompt,
          },
        ],
      }),
    },
  );

  if (!response.ok) {
    // Surface the response body through the error rather than printing it, so
    // callers (and their 'error' listeners) get the full diagnostic.
    const responseText = await response.text();
    throw new Error(`OpenAI API call failed with status ${response.status}: ${responseText}`);
  }

  const { choices } = await response.json();

  // should we use all the response choices?
  const content: unknown = choices?.[0]?.message?.content;
  if (typeof content !== 'string') {
    throw new Error('OpenAI API response did not contain any message content.');
  }
  return content;
};

/**
 * Loads a Handlebars prompt file and renders each of its `---`-separated
 * sections with `params`.
 *
 * The template is split into sections BEFORE rendering. Splitting the rendered
 * text instead would let a substituted value that happens to contain `---`
 * (for example a transcript) break one prompt into several bogus ones.
 * As a consequence, a Handlebars block must not span a `---` separator.
 *
 * @param filename - Template path, resolved relative to this module's directory.
 * @param params - Values made available to the template.
 * @returns The rendered, trimmed prompts in file order.
 */
const compilePrompts = async (filename: string, params: Record<string, unknown>): Promise<string[]> => {
  const rawPromptText = await readFile(resolve(__dirname, filename), 'utf-8');
  return rawPromptText
    .split('---')
    .map((section) => {
      const fill = Handlebars.compile(section, { noEscape: true });
      return fill(params).trim();
    });
};

/**
 * Asks the AI model to normalize raw transcript text, using the prompts in
 * `../prompts/normalize-transcript-text.hbs`.
 *
 * @param rawTranscriptText - Transcript text exposed to the template as `transcript`.
 * @param apiKey - OpenAI API key.
 * @param organizationId - Optional OpenAI organization ID.
 * @returns The model's reply text.
 */
export const normalizeTranscriptText = async (
  rawTranscriptText: string,
  apiKey: string,
  organizationId?: string,
) => makeAiCall(
  await compilePrompts('../prompts/normalize-transcript-text.hbs', { transcript: rawTranscriptText }),
  apiKey,
  organizationId,
);

/**
 * Asks the AI model to summarize a transcript, using the prompts in
 * `../prompts/summarize-transcript.hbs`.
 *
 * @param transcript - Transcript text exposed to the template as `transcript`.
 * @param apiKey - OpenAI API key.
 * @param organizationId - Optional OpenAI organization ID.
 * @returns The model's reply text.
 */
export const summarizeTranscript = async (
  transcript: string,
  apiKey: string,
  organizationId?: string,
) => makeAiCall(
  await compilePrompts('../prompts/summarize-transcript.hbs', { transcript }),
  apiKey,
  organizationId,
);

+ 13
- 0
src/video-types/youtube/errors.ts View File

@@ -0,0 +1,13 @@
/**
 * Base class for every failure raised while fetching a YouTube transcript.
 *
 * Sets `name` to the concrete class name so that logged or serialized errors
 * identify the failure instead of all reading `Error`.
 */
export class FetchTranscriptError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args);
    // `new.target` is the class actually being instantiated, so subclasses
    // get their own name without declaring a constructor.
    this.name = new.target.name;
  }
}

/** Raised by `retrieveVideoId` when no video ID can be extracted from the input. */
export class InvalidVideoIdError extends FetchTranscriptError {}

/** Raised by `getVideoPage` when the video page request does not succeed. */
export class CannotRetrieveVideoPageError extends FetchTranscriptError {}

/** Raised by `fetchTranscriptItems` when the page data carries no InnerTube API key. */
export class InnerTubeApiKeyMissingError extends FetchTranscriptError {}

/** Raised by `fetchTranscriptItems` when the transcript request returns a non-OK status. */
export class FetchTranscriptRequestFailureError extends FetchTranscriptError {}

/** Raised by `fetchTranscriptItems` when the response has no `responseContext`. */
export class InvalidTranscriptResponseContextError extends FetchTranscriptError {}

/** Raised by `fetchTranscriptItems` when the response has no `actions`. */
export class InvalidTranscriptActionsError extends FetchTranscriptError {}

+ 96
- 0
src/video-types/youtube/index.ts View File

@@ -0,0 +1,96 @@
import { EventEmitter } from 'events';
import { CreateBaseSummarizerParams, SummarizerEventEmitter } from '../../common';
import {
retrieveVideoId,
getVideoPage,
extractDataFromPage,
fetchTranscriptItems, TranscriptResponse,
} from './transcript';
import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer';

/** Parameters for the YouTube summarizer; adds nothing to the base parameters. */
export interface CreateYouTubeSummarizerParams extends CreateBaseSummarizerParams {}

/**
 * Summarizer for YouTube videos.
 *
 * `process()` downloads the video page, fetches the transcript, normalizes it
 * and summarizes it, reporting progress through `'process'` events. The result
 * arrives as a `'success'` event whose content is a JSON string of
 * `{ transcripts, summary }`. Any failure, including an invalid video URL,
 * is reported as an `'error'` event. `'end'` follows either outcome.
 */
export class YouTubeSummarizerEventEmitter extends EventEmitter implements SummarizerEventEmitter {
  constructor(private readonly params: CreateYouTubeSummarizerParams) {
    super();
  }

  /** Starts the run; never throws synchronously, outcomes are delivered as events. */
  process(): void {
    this.run()
      .catch((error) => {
        this.emit('error', error);
      })
      .then(() => {
        this.emit('end');
      });
  }

  /** Executes the pipeline steps in order, emitting a `'process'` event around each one. */
  private async run(): Promise<void> {
    const {
      url,
      openaiApiKey,
      openaiOrganizationId,
      ...config
    } = this.params;

    // Resolved inside the async flow so an invalid URL becomes an 'error'
    // event rather than an exception thrown out of process().
    const identifier = retrieveVideoId(url);

    this.emit('process', {
      type: 'extract-data',
      phase: 'download-page',
    });
    const videoPageBody = await getVideoPage(identifier);
    const pageData = extractDataFromPage(videoPageBody);
    this.emit('process', {
      type: 'extract-data',
      phase: 'success',
    });

    this.emit('process', {
      type: 'fetch-transcript',
      phase: 'start',
    });
    const transcripts: TranscriptResponse[] = await fetchTranscriptItems(pageData, config);
    this.emit('process', {
      type: 'fetch-transcript',
      phase: 'success',
    });

    this.emit('process', {
      type: 'normalize-caption',
      phase: 'start',
    });
    const normalizedCaption = await normalizeTranscriptText(
      transcripts.map((item) => item.text).join(' '),
      openaiApiKey,
      openaiOrganizationId,
    );
    this.emit('process', {
      type: 'normalize-caption',
      phase: 'success',
    });

    this.emit('process', {
      type: 'summarize-caption',
      phase: 'start',
    });
    const summary = await summarizeTranscript(normalizedCaption, openaiApiKey, openaiOrganizationId);
    this.emit('process', {
      type: 'summarize-caption',
      phase: 'success',
    });

    this.emit('success', {
      contentType: 'application/json',
      content: JSON.stringify({ transcripts, summary }),
    });
  }
}

+ 232
- 0
src/video-types/youtube/transcript.ts View File

@@ -0,0 +1,232 @@
// Based on https://github.com/Kakulukian/youtube-transcript
//
// Refactored to make it more maintainable.

import fetchPonyfill from 'fetch-ponyfill';
import {
InvalidVideoIdError,
CannotRetrieveVideoPageError,
FetchTranscriptRequestFailureError,
InnerTubeApiKeyMissingError,
InvalidTranscriptActionsError,
InvalidTranscriptResponseContextError,
} from './errors';

// Matches YouTube URL forms (`youtu.be/`, `v/`, `vi/`, `u/<x>/`, `embed/`,
// `watch?v=`, `?vi=`, `&v=`, `&vi=`) and captures the ID that follows in group 1.
// The capture stops at `#`, `&` or `?` and may be empty.
const RE_YOUTUBE = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*/im;

/** Optional locale settings for the transcript request. */
export interface TranscriptConfig {
// Sent as the client `hl` field; the request builder uses 'en' when omitted.
language?: string;
// Sent as the client `gl` field; the request builder uses 'PH' when omitted.
country?: string;
}
/** One transcript entry produced by `fetchTranscriptItems`. */
export interface TranscriptResponse {
// Taken from the cue's `simpleText`.
text: string;
// Parsed from the cue's `durationMs`.
duration: number;
// Parsed from the cue's `startOffsetMs`.
offset: number;
}

// `fetch` implementation obtained from fetch-ponyfill, shared by this module's requests.
const { fetch: f } = fetchPonyfill();

/**
 * Resolves a YouTube video ID from either a bare ID or a video URL.
 *
 * @param videoId - A bare 11-character video ID, or a URL in one of the forms
 *   recognised by `RE_YOUTUBE`.
 * @returns The video ID.
 * @throws InvalidVideoIdError when no non-empty ID can be extracted.
 */
export const retrieveVideoId = (videoId: string): string => {
  // Length alone is not enough: a short URL such as `youtu.be/ab` is also
  // 11 characters long. Only accept the input as-is when it is made entirely
  // of ID characters (letters, digits, `_`, `-`).
  if (/^[\w-]{11}$/.test(videoId)) {
    return videoId;
  }

  // The capture group can match the empty string (e.g. `https://youtu.be/`),
  // which must be treated as "no ID found".
  const extractedId = videoId.match(RE_YOUTUBE)?.[1];
  if (extractedId) {
    return extractedId;
  }

  throw new InvalidVideoIdError('Impossible to retrieve Youtube video ID.');
};

/**
 * Downloads the YouTube watch page for a video.
 *
 * @param videoId - Video ID or URL; it is passed through `retrieveVideoId`.
 * @returns The response body of the watch page as text.
 * @throws CannotRetrieveVideoPageError when the response status is not OK.
 */
export const getVideoPage = async (videoId: string): Promise<string> => {
  const query = new URLSearchParams({ v: retrieveVideoId(videoId) });
  const watchUrl = new URL('/watch', 'https://www.youtube.com');
  watchUrl.search = query.toString();

  const response = await f(watchUrl.toString());
  if (response.ok) {
    return response.text();
  }
  throw new CannotRetrieveVideoPageError('Unable to get video page.');
};

/**
 * Generates a random nonce: the decimal string of `Math.random()` encoded
 * with a web-safe Base64 alphabet (`-`, `_`, and `.` as the padding character).
 *
 * Fixes over the previous implementation:
 * - every character is converted with `charCodeAt(0)`; using the loop index
 *   produced NaN for all but the first character, making the "nonce" constant;
 * - all characters of the random string are encoded (the last one was dropped);
 * - the alphabet is the standard one (it was missing `N` and repeated `j`);
 * - the third output character is padded when the group has a single byte
 *   (that branch was unreachable).
 *
 * @returns A string whose length is a multiple of 4, made of `[A-Za-z0-9\-_.]`.
 */
const generateNonce = (): string => {
  const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.';
  // Index of the padding character (`.`) in the alphabet above.
  const padIndex = 64;

  // The random string only contains ASCII characters, so one code unit is one byte.
  const bytes = Array.from(Math.random().toString(), (char) => char.charCodeAt(0));

  let nonce = '';
  for (let i = 0; i < bytes.length; i += 3) {
    const hasSecond = i + 1 < bytes.length;
    const hasThird = i + 2 < bytes.length;
    const first = bytes[i];
    const second = hasSecond ? bytes[i + 1] : 0;
    const third = hasThird ? bytes[i + 2] : 0;

    // Three input bytes become four 6-bit indices.
    nonce += alphabet[first >> 2];
    nonce += alphabet[((first & 3) << 4) | (second >> 4)];
    nonce += alphabet[hasSecond ? ((second & 15) << 2) | (third >> 6) : padIndex];
    nonce += alphabet[hasThird ? third & 63 : padIndex];
  }
  return nonce;
};

/**
 * Reads the value of `"INNERTUBE_API_KEY"` from the video page source.
 *
 * Returns `undefined` when the key is absent, like the other page extractors,
 * instead of throwing a TypeError; this lets `fetchTranscriptItems` report the
 * missing key with `InnerTubeApiKeyMissingError`.
 *
 * @param videoPageBody - Raw text of the video page.
 * @returns The API key, or `undefined` when the page does not contain one.
 */
const extractInnterTubeApiKeyFromPage = (videoPageBody: string): string | undefined => videoPageBody
  .split('"INNERTUBE_API_KEY":"')[1]
  ?.split('"')[0];

/**
 * Reads the value of `"serializedShareEntity"` from the video page source.
 *
 * @param page - Raw text of the video page.
 * @returns The text between the marker and the next double quote, or
 *   `undefined` when the marker is absent.
 */
const extractSerializedShareEntityFromPage = (page: string) => {
  const [, afterMarker] = page.split('"serializedShareEntity":"');
  if (afterMarker == null) {
    return undefined;
  }
  const [value] = afterMarker.split('"');
  return value;
};

/**
 * Reads the value of `"VISITOR_DATA"` from the video page source.
 *
 * @param page - Raw text of the video page.
 * @returns The text between the first marker and the next double quote (or
 *   the end of the page), or `undefined` when the marker is absent.
 */
const extractVisitorDataFromPage = (page: string) => {
  const marker = '"VISITOR_DATA":"';
  const markerStart = page.indexOf(marker);
  if (markerStart === -1) {
    return undefined;
  }
  const valueStart = markerStart + marker.length;
  const valueEnd = page.indexOf('"', valueStart);
  return valueEnd === -1 ? page.slice(valueStart) : page.slice(valueStart, valueEnd);
};

/**
 * Reads the value of `"sessionId"` from the video page source.
 *
 * @param page - Raw text of the video page.
 * @returns The text between the first marker and the next double quote (or
 *   the end of the page), or `undefined` when the marker is absent.
 */
const extractSessionIdFromPage = (page: string) => {
  const found = /"sessionId":"([^"]*)/.exec(page);
  return found === null ? undefined : found[1];
};

/**
 * Reads the value of the first `"clickTrackingParams"` entry from the video page source.
 *
 * Tolerates a missing (`null`/`undefined`) page at runtime and returns
 * `undefined` in that case.
 *
 * @param page - Raw text of the video page.
 * @returns The text between the marker and the next double quote, or
 *   `undefined` when the page or the marker is absent.
 */
const extractClickTrackingParamsFromPage = (page: string) => {
  const afterMarker = page?.split('"clickTrackingParams":"')[1];
  return afterMarker?.split('"')[0];
};

/**
 * Values scraped from the video page source by `extractDataFromPage`.
 * Every field is optional because a marker may be absent from the page.
 */
interface VideoPageData {
// Value of `"INNERTUBE_API_KEY"`; sent as the `key` query parameter of the transcript request.
innerTubeApiKey?: string;
// Value of `"serializedShareEntity"`; sent as `params` in the transcript request body.
serializedShareEntity?: string;
// Value of `"VISITOR_DATA"`; sent as `context.client.visitorData`.
visitorData?: string;
// Value of `"sessionId"`; sent as `context.request.sessionId`.
sessionId?: string;
// Value of `"clickTrackingParams"`; sent as `context.clickTracking.clickTrackingParams`.
clickTrackingParams?: string;
}

/**
 * Collects every value the transcript request needs from the video page source.
 *
 * @param page - Raw text of the video page.
 * @returns The scraped values, one per extractor.
 */
export const extractDataFromPage = (page: string): VideoPageData => {
  const innerTubeApiKey = extractInnterTubeApiKeyFromPage(page);
  const serializedShareEntity = extractSerializedShareEntityFromPage(page);
  const visitorData = extractVisitorDataFromPage(page);
  const sessionId = extractSessionIdFromPage(page);
  const clickTrackingParams = extractClickTrackingParamsFromPage(page);

  return {
    innerTubeApiKey,
    serializedShareEntity,
    visitorData,
    sessionId,
    clickTrackingParams,
  };
};

/**
 * Builds the JSON body for the `get_transcript` request.
 *
 * @param p - Values scraped from the video page.
 * @param config - Optional locale; `language` defaults to 'en' and `country` to 'PH'.
 * @returns The request body object (not yet serialized).
 */
const generateGetTranscriptRequestBody = (
  p: Partial<VideoPageData>,
  config?: TranscriptConfig,
) => {
  // Description of the client the request claims to come from.
  const client = {
    hl: config?.language ?? 'en',
    gl: config?.country ?? 'PH',
    visitorData: p.visitorData,
    userAgent:
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)',
    clientName: 'WEB',
    clientVersion: '2.20200925.01.00',
    osName: 'Macintosh',
    osVersion: '10_15_4',
    browserName: 'Chrome',
    // NOTE(review): '85.0f' differs from the '85.0' in userAgent above and
    // looks like a typo; left untouched pending confirmation.
    browserVersion: '85.0f.4183.83',
    screenWidthPoints: 1440,
    screenHeightPoints: 770,
    screenPixelDensity: 2,
    utcOffsetMinutes: 120,
    userInterfaceTheme: 'USER_INTERFACE_THEME_LIGHT',
    connectionType: 'CONN_CELLULAR_3G',
  };

  const request = {
    sessionId: p.sessionId,
    internalExperimentFlags: [],
    consistencyTokenJars: [],
  };

  return {
    context: {
      client,
      request,
      user: {},
      clientScreenNonce: generateNonce(),
      clickTracking: {
        clickTrackingParams: p.clickTrackingParams,
      },
    },
    params: p.serializedShareEntity,
  };
};

/**
 * Requests the transcript of a video and flattens it into transcript entries.
 *
 * @param pageData - Values scraped from the video page; `innerTubeApiKey` is required.
 * @param config - Optional locale for the request.
 * @returns One entry per cue group, built from the first cue of each group.
 *   Cue groups that carry no cue renderer are skipped.
 * @throws InnerTubeApiKeyMissingError when `pageData` has no API key.
 * @throws FetchTranscriptRequestFailureError when the response status is not OK.
 * @throws InvalidTranscriptResponseContextError when the response has no `responseContext`.
 * @throws InvalidTranscriptActionsError when the response has no `actions`, or
 *   the first action does not contain a list of cue groups.
 */
export const fetchTranscriptItems = async (
  pageData: VideoPageData,
  config?: TranscriptConfig,
): Promise<TranscriptResponse[]> => {
  const { innerTubeApiKey } = pageData;
  if (!innerTubeApiKey) {
    throw new InnerTubeApiKeyMissingError('InnerTube API key not found on video page.');
  }

  const getTranscriptUrl = new URL('/youtubei/v1/get_transcript', 'https://www.youtube.com');
  getTranscriptUrl.search = new URLSearchParams({ key: innerTubeApiKey }).toString();

  const transcriptResponse = await f(getTranscriptUrl.toString(), {
    method: 'POST',
    headers: {
      Accept: 'application/json',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(generateGetTranscriptRequestBody(pageData, config)),
  });

  if (!transcriptResponse.ok) {
    throw new FetchTranscriptRequestFailureError(`Fetching transcript failed with status ${transcriptResponse.status}.`);
  }

  const transcriptBody = await transcriptResponse.json();
  if (!transcriptBody.responseContext) {
    throw new InvalidTranscriptResponseContextError('No responseContext found on get transcript response.');
  }

  if (!transcriptBody.actions) {
    throw new InvalidTranscriptActionsError('No actions found on get transcript response.');
  }

  // Walk the nested payload defensively: an unexpected shape should surface
  // as a typed error, not as "cannot read properties of undefined".
  const cueGroups: unknown = transcriptBody
    .actions[0]
    ?.updateEngagementPanelAction
    ?.content
    ?.transcriptRenderer
    ?.body
    ?.transcriptBodyRenderer
    ?.cueGroups;

  if (!Array.isArray(cueGroups)) {
    throw new InvalidTranscriptActionsError('No transcript cue groups found on get transcript response.');
  }

  const items: TranscriptResponse[] = [];
  for (const cueGroup of cueGroups) {
    const renderer = cueGroup?.transcriptCueGroupRenderer?.cues?.[0]?.transcriptCueRenderer;
    if (renderer) {
      items.push({
        text: renderer.cue?.simpleText ?? '',
        // Explicit radix: the values are decimal millisecond strings.
        duration: parseInt(renderer.durationMs, 10),
        offset: parseInt(renderer.startOffsetMs, 10),
      });
    }
  }
  return items;
};

+ 39
- 0
test/index.test.ts View File

@@ -0,0 +1,39 @@
import { config } from 'dotenv';
import { beforeAll, describe, it } from 'vitest';
import { createSummarizer, VideoType } from '../src';
import { writeFileSync } from 'fs';

// End-to-end check: summarizes a real video and writes the transcript and
// summary to disk. Requires OPENAI_API_KEY in the environment (loaded via dotenv).
describe('blah', () => {
  beforeAll(() => {
    config();
  });

  it('works', () => new Promise<void>((resolve, reject) => {
    const summarizer = createSummarizer({
      type: VideoType.YOUTUBE,
      url: 'https://www.youtube.com/watch?v=WeNgDxtBiyw',
      openaiApiKey: process.env.OPENAI_API_KEY as string,
    });

    summarizer.on('success', (data) => {
      const transcripts = JSON.parse(data.content as string) as {
        transcripts: { text: string }[],
        summary: string,
      };
      const transcriptText = transcripts.transcripts.map((t) => t.text).join('\n');
      writeFileSync('transcript.txt', transcriptText);
      writeFileSync('summary.txt', transcripts.summary);
    });

    // Fail the test on error. Resolving here made the test pass even when
    // summarization failed.
    summarizer.on('error', (err) => {
      reject(err);
    });

    // 'end' also fires after 'error'; resolving an already rejected promise is a no-op.
    summarizer.on('end', () => {
      resolve();
    });

    summarizer.process();
  }), { timeout: 180000 });
});

+ 21
- 0
tsconfig.eslint.json View File

@@ -0,0 +1,21 @@
{
"exclude": ["node_modules"],
"include": ["src", "types", "test"],
"compilerOptions": {
"module": "ESNext",
"lib": ["ESNext"],
"importHelpers": true,
"declaration": true,
"sourceMap": true,
"rootDir": "./",
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"moduleResolution": "node",
"jsx": "react",
"esModuleInterop": true,
"target": "es2018"
}
}

+ 21
- 0
tsconfig.json View File

@@ -0,0 +1,21 @@
{
"exclude": ["node_modules"],
"include": ["src", "types"],
"compilerOptions": {
"module": "ESNext",
"lib": ["ESNext", "DOM"],
"importHelpers": true,
"declaration": true,
"sourceMap": true,
"rootDir": "./src",
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"moduleResolution": "node",
"jsx": "react",
"esModuleInterop": true,
"target": "es2018"
}
}

+ 3627
- 0
yarn.lock
File diff suppressed because it is too large
View File


Loading…
Cancel
Save