Procházet zdrojové kódy

Initial commit

Add files from pridepack.
master
TheoryOfNekomata před 1 rokem
revize
d82031ab4a
17 změnil soubory, kde provedl 4397 přidání a 0 odebrání
  1. +1
    -0
      .env.example
  2. +9
    -0
      .eslintrc
  3. +112
    -0
      .gitignore
  4. +7
    -0
      LICENSE
  5. +53
    -0
      package.json
  6. +3
    -0
      pridepack.json
  7. +15
    -0
      prompt-template.hbs
  8. +25
    -0
      src/common.ts
  9. +39
    -0
      src/index.ts
  10. +84
    -0
      src/summarizer.ts
  11. +13
    -0
      src/video-types/youtube/errors.ts
  12. +96
    -0
      src/video-types/youtube/index.ts
  13. +232
    -0
      src/video-types/youtube/transcript.ts
  14. +39
    -0
      test/index.test.ts
  15. +21
    -0
      tsconfig.eslint.json
  16. +21
    -0
      tsconfig.json
  17. +3627
    -0
      yarn.lock

+ 1
- 0
.env.example Zobrazit soubor

@@ -0,0 +1 @@
OPENAI_API_KEY=

+ 9
- 0
.eslintrc Zobrazit soubor

@@ -0,0 +1,9 @@
{
"root": true,
"extends": [
"lxsmnsyc/typescript"
],
"parserOptions": {
"project": "./tsconfig.eslint.json"
}
}

+ 112
- 0
.gitignore Zobrazit soubor

@@ -0,0 +1,112 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.production
.env.development

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and *not* Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

.npmrc
.idea/
transcript.txt
summary.txt
results/
prompts/

+ 7
- 0
LICENSE Zobrazit soubor

@@ -0,0 +1,7 @@
MIT License Copyright (c) 2023 TheoryOfNekomata <allan.crisostomo@outlook.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 53
- 0
package.json Zobrazit soubor

@@ -0,0 +1,53 @@
{
"name": "@modal-sh/webvideo-transcript-summary-core",
"version": "0.0.0",
"files": [
"dist",
"src"
],
"engines": {
"node": ">=12"
},
"license": "MIT",
"keywords": [
"pridepack"
],
"devDependencies": {
"@types/node": "^18.14.1",
"eslint": "^8.35.0",
"eslint-config-lxsmnsyc": "^0.5.0",
"pridepack": "2.4.4",
"tslib": "^2.5.0",
"typescript": "^4.9.5",
"vitest": "^0.28.1"
},
"scripts": {
"prepublishOnly": "pridepack clean && pridepack build",
"build": "pridepack build",
"type-check": "pridepack check",
"lint": "pridepack lint",
"clean": "pridepack clean",
"watch": "pridepack watch",
"start": "pridepack start",
"dev": "pridepack dev",
"test": "vitest"
},
"private": false,
"description": "Get transcript summaries of Web videos.",
"repository": {
"url": "https://code.modal.sh/modal-soft/webvideo-transcript-summary-core",
"type": "git"
},
"homepage": "https://code.modal.sh/modal-soft/webvideo-transcript-summary-core",
"bugs": {
"url": "https://code.modal.sh/modal-soft/webvideo-transcript-summary-core/issues"
},
"author": "TheoryOfNekomata <allan.crisostomo@outlook.com>",
"publishConfig": {
"access": "public"
},
"dependencies": {
"fetch-ponyfill": "^7.1.0",
"handlebars": "^4.7.7"
}
}

+ 3
- 0
pridepack.json Zobrazit soubor

@@ -0,0 +1,3 @@
{
"target": "es2018"
}

+ 15
- 0
prompt-template.hbs Zobrazit soubor

@@ -0,0 +1,15 @@
Prompt text 1

---

Prompt text 2

{{transcript}}

---

Prompt text 3

{{transcript}}

Next line of text

+ 25
- 0
src/common.ts Zobrazit soubor

@@ -0,0 +1,25 @@
/**
 * Payload of a `process` event: `type` names the step being performed and
 * `phase` says where that step currently is. `command` is optional; nothing
 * in this file shows how it is used.
 */
type ProcessEvent = { type: string, phase: string, command?: string };

/** Listener signature for `process` events. */
type ProcessEventCallback = (event: ProcessEvent) => void;

/** Listener signature for `error` events; receives the error that occurred. */
type ErrorEventCallback = (event: Error) => void;

/**
 * Payload of a `success` event: the result in `content`, with `contentType`
 * describing it (presumably a MIME type; confirm against the emitters).
 */
type SuccessEvent = { contentType: string, content: unknown };

/** Listener signature for `success` events. */
type SuccessEventCallback = (event: SuccessEvent) => void;

/**
 * Contract for a summarizer: an event emitter with a `process()` method and
 * typed `on()` overloads for the `process`, `error`, `success` and `end`
 * events.
 */
export interface SummarizerEventEmitter extends NodeJS.EventEmitter {
// Returns nothing; results are expected to arrive through the events below.
process(): void;
// Progress notifications.
on(eventType: 'process', callback: ProcessEventCallback): this;
// Failure notification.
on(eventType: 'error', callback: ErrorEventCallback): this;
// Final result.
on(eventType: 'success', callback: SuccessEventCallback): this;
// Signals that no further events will follow (listener takes no arguments).
on(eventType: 'end', callback: () => void): this;
}

/** Options common to every summarizer, regardless of the video source. */
export interface CreateBaseSummarizerParams {
// Location of the video to summarize.
url: string;
// Optional language preference for the transcript request.
language?: string;
// Optional country preference for the transcript request.
country?: string;
// API key used to authenticate calls to OpenAI.
openaiApiKey: string;
// Optional OpenAI organization to attribute the calls to.
openaiOrganizationId?: string;
}

+ 39
- 0
src/index.ts Zobrazit soubor

@@ -0,0 +1,39 @@
import { SummarizerEventEmitter } from './common';
import {
CreateYouTubeSummarizerParams,
YouTubeSummarizerEventEmitter,
} from './video-types/youtube';

/** Video sources that `createSummarizer` knows how to handle. */
export enum VideoType {
YOUTUBE = 'youtube',
}

/** Parameters for `createSummarizer`: the per-source options plus the source selector. */
export interface CreateSummarizerParams extends CreateYouTubeSummarizerParams {
// Selects which summarizer implementation is created.
type: VideoType;
}

/**
 * Creates the summarizer that corresponds to `params.type`.
 *
 * Only the known summarizer options are forwarded to the implementation; any
 * other properties present on `params` are dropped.
 *
 * @param params - Video type plus the summarizer options.
 * @returns An event emitter whose `process()` method starts the work.
 * @throws TypeError when `params.type` is not a supported video type.
 */
export const createSummarizer = (params: CreateSummarizerParams): SummarizerEventEmitter => {
  // Widened to string so that unsupported values coming from untyped callers
  // can still be compared and reported.
  const requestedType: string = params.type;

  if (requestedType === VideoType.YOUTUBE) {
    return new YouTubeSummarizerEventEmitter({
      url: params.url,
      openaiOrganizationId: params.openaiOrganizationId,
      openaiApiKey: params.openaiApiKey,
      language: params.language,
      country: params.country,
    });
  }

  const validTypes = JSON.stringify(Object.values(VideoType));
  throw new TypeError(`Invalid video type: "${requestedType}". Valid values are: ${validTypes}`);
};

+ 84
- 0
src/summarizer.ts Zobrazit soubor

@@ -0,0 +1,84 @@
import fetchPonyfill from 'fetch-ponyfill';
import Handlebars from 'handlebars';
import { resolve } from 'path';
import { readFile } from 'fs/promises';

/**
 * Sends one prompt, picked at random from `prompts`, to the OpenAI chat
 * completions endpoint and returns the text of the first completion.
 *
 * @param prompts - Candidate prompts; exactly one of them is sent.
 * @param apiKey - OpenAI API key, sent as a bearer token.
 * @param organizationId - Optional value for the `OpenAI-Organization` header.
 * @returns The `content` of the first choice in the response.
 * @throws Error when `prompts` is empty, when the API responds with a non-2xx
 *   status, or when the response carries no completion text.
 */
const makeAiCall = async (prompts: string[], apiKey: string, organizationId?: string): Promise<string> => {
  // Without this guard an empty list fails later with an opaque
  // "cannot read properties of undefined" error.
  if (prompts.length === 0) {
    throw new Error('No prompts available for the OpenAI API call.');
  }

  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
    Accept: 'application/json',
    Authorization: `Bearer ${apiKey}`,
  };

  if (organizationId) {
    headers['OpenAI-Organization'] = organizationId;
  }

  const prompt = prompts[Math.floor(Math.random() * prompts.length)].trim();

  const { fetch } = fetchPonyfill();
  const response = await fetch(
    new URL('/v1/chat/completions', 'https://api.openai.com'),
    {
      method: 'POST',
      headers,
      body: JSON.stringify({
        //model: 'gpt-4',
        model: 'gpt-3.5-turbo',
        temperature: 0.6,
        messages: [
          {
            role: 'user',
            content: prompt,
          },
        ],
      }),
    },
  );

  if (!response.ok) {
    // The response body usually explains the failure, so it is logged before throwing.
    const responseText = await response.text();
    console.log(responseText);
    throw new Error(`OpenAI API call failed with status ${response.status}`);
  }

  const responseBody = await response.json();

  // should we use all the response choices?
  const content: unknown = responseBody?.choices?.[0]?.message?.content;
  if (typeof content !== 'string') {
    throw new Error('OpenAI API response did not contain a completion.');
  }
  return content;
};

/**
 * Loads a Handlebars prompt file and returns its filled-in prompt variants.
 *
 * The file holds one or more prompts separated by `---`. The template is
 * split into sections BEFORE the values are interpolated, so a value that
 * itself contains `---` (for example a transcript) cannot be mistaken for a
 * separator and cut into bogus prompts.
 *
 * @param filename - Template path, resolved relative to this module's directory.
 * @param params - Values made available to the template.
 * @returns The trimmed text of every prompt section, in file order.
 */
const compilePrompts = async (filename: string, params: Record<string, unknown>): Promise<string[]> => {
  const rawPromptText = await readFile(resolve(__dirname, filename), 'utf-8');
  return rawPromptText
    .split('---')
    .map((section) => Handlebars.compile(section, { noEscape: true })(params).trim());
};

/**
 * Runs the raw transcript text through the "normalize transcript" prompt.
 *
 * @param rawTranscriptText - Transcript text exposed to the template as `transcript`.
 * @param apiKey - OpenAI API key.
 * @param organizationId - Optional OpenAI organization ID.
 * @returns The model's reply.
 */
export const normalizeTranscriptText = async (
  rawTranscriptText: string,
  apiKey: string,
  organizationId?: string,
) => {
  const templateParams = { transcript: rawTranscriptText };
  const candidatePrompts = await compilePrompts('../prompts/normalize-transcript-text.hbs', templateParams);
  return makeAiCall(candidatePrompts, apiKey, organizationId);
};

/**
 * Runs the transcript through the "summarize transcript" prompt.
 *
 * @param transcript - Transcript text exposed to the template as `transcript`.
 * @param apiKey - OpenAI API key.
 * @param organizationId - Optional OpenAI organization ID.
 * @returns The model's reply.
 */
export const summarizeTranscript = async (
  transcript: string,
  apiKey: string,
  organizationId?: string,
) => {
  const templateParams = { transcript };
  const candidatePrompts = await compilePrompts('../prompts/summarize-transcript.hbs', templateParams);
  return makeAiCall(candidatePrompts, apiKey, organizationId);
};

+ 13
- 0
src/video-types/youtube/errors.ts Zobrazit soubor

@@ -0,0 +1,13 @@
/**
 * Base class for every failure raised while fetching a YouTube transcript.
 *
 * Sets `name` to the concrete class name so that logs and `error.name`
 * checks can tell the subclasses apart; without this every instance reports
 * the inherited name `Error`.
 */
export class FetchTranscriptError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args);
    // `new.target` is the class actually being instantiated, so subclasses
    // get their own name without declaring a constructor.
    this.name = new.target.name;
  }
}

/** Raised when no video ID can be extracted from the given URL or ID string. */
export class InvalidVideoIdError extends FetchTranscriptError {}

/** Raised when the request for the video's watch page does not succeed. */
export class CannotRetrieveVideoPageError extends FetchTranscriptError {}

/** Raised when the InnerTube API key is not present in the data taken from the video page. */
export class InnerTubeApiKeyMissingError extends FetchTranscriptError {}

/** Raised when the transcript request returns a non-success HTTP status. */
export class FetchTranscriptRequestFailureError extends FetchTranscriptError {}

/** Raised when the transcript response has no `responseContext`. */
export class InvalidTranscriptResponseContextError extends FetchTranscriptError {}

/** Raised when the transcript response has no `actions`. */
export class InvalidTranscriptActionsError extends FetchTranscriptError {}

+ 96
- 0
src/video-types/youtube/index.ts Zobrazit soubor

@@ -0,0 +1,96 @@
import { EventEmitter } from 'events';
import { CreateBaseSummarizerParams, SummarizerEventEmitter } from '../../common';
import {
retrieveVideoId,
getVideoPage,
extractDataFromPage,
fetchTranscriptItems, TranscriptResponse,
} from './transcript';
import { normalizeTranscriptText, summarizeTranscript } from '../../summarizer';

/** Options for the YouTube summarizer; currently identical to the base options. */
export interface CreateYouTubeSummarizerParams extends CreateBaseSummarizerParams {}

/**
 * Summarizer for YouTube videos.
 *
 * `process()` downloads the video page, fetches the transcript, normalizes
 * it and summarizes it. Progress is reported through `process` events; the
 * outcome is a single `success` or `error` event, always followed by `end`.
 */
export class YouTubeSummarizerEventEmitter extends EventEmitter implements SummarizerEventEmitter {
  constructor(private readonly params: CreateYouTubeSummarizerParams) {
    super();
  }

  /**
   * Starts the pipeline. Never throws: every failure, including an invalid
   * video URL, is delivered as an `error` event followed by `end`.
   */
  process(): void {
    this.runPipeline().catch((error) => {
      this.emit('error', error);
      this.emit('end');
    });
  }

  /**
   * Runs all steps in order and emits `success` and `end` when they complete.
   * Any rejection is left to `process()` so that errors are reported in
   * exactly one place.
   */
  private async runPipeline(): Promise<void> {
    const {
      url,
      openaiApiKey,
      openaiOrganizationId,
      ...config
    } = this.params;

    // Resolving the ID inside the async flow turns an invalid URL into an
    // `error` event instead of a synchronous exception out of `process()`.
    const identifier = retrieveVideoId(url);

    this.emit('process', { type: 'extract-data', phase: 'download-page' });
    const videoPageBody = await getVideoPage(identifier);
    const pageData = extractDataFromPage(videoPageBody);
    this.emit('process', { type: 'extract-data', phase: 'success' });

    this.emit('process', { type: 'fetch-transcript', phase: 'start' });
    const transcripts: TranscriptResponse[] = await fetchTranscriptItems(pageData, config);
    this.emit('process', { type: 'fetch-transcript', phase: 'success' });

    this.emit('process', { type: 'normalize-caption', phase: 'start' });
    const normalizedCaption = await normalizeTranscriptText(
      transcripts.map((item) => item.text).join(' '),
      openaiApiKey,
      openaiOrganizationId,
    );
    this.emit('process', { type: 'normalize-caption', phase: 'success' });

    this.emit('process', { type: 'summarize-caption', phase: 'start' });
    const summary = await summarizeTranscript(normalizedCaption, openaiApiKey, openaiOrganizationId);
    this.emit('process', { type: 'summarize-caption', phase: 'success' });

    this.emit('success', {
      contentType: 'application/json',
      content: JSON.stringify({ transcripts, summary }),
    });
    this.emit('end');
  }
}

+ 232
- 0
src/video-types/youtube/transcript.ts Zobrazit soubor

@@ -0,0 +1,232 @@
// Based on https://github.com/Kakulukian/youtube-transcript.
//
// We refactored it to make it more maintainable.

import fetchPonyfill from 'fetch-ponyfill';
import {
InvalidVideoIdError,
CannotRetrieveVideoPageError,
FetchTranscriptRequestFailureError,
InnerTubeApiKeyMissingError,
InvalidTranscriptActionsError,
InvalidTranscriptResponseContextError,
} from './errors';

const RE_YOUTUBE = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*/im;

/** Optional locale settings for the transcript request. */
export interface TranscriptConfig {
// Sent as the client's `hl` value; 'en' is used when omitted.
language?: string;
// Sent as the client's `gl` value; 'PH' is used when omitted.
country?: string;
}
/** One transcript cue as returned by `fetchTranscriptItems`. */
export interface TranscriptResponse {
// The cue's text.
text: string;
// Parsed from the cue's `durationMs` (presumably milliseconds, going by the field name).
duration: number;
// Parsed from the cue's `startOffsetMs` (presumably milliseconds, going by the field name).
offset: number;
}

const { fetch: f } = fetchPonyfill();

/**
 * Extracts the video ID from a YouTube URL.
 *
 * @param videoId - A YouTube URL, or a string that is already a video ID.
 * @returns The input itself when it is exactly 11 characters long, otherwise
 *   the ID captured from the URL.
 * @throws InvalidVideoIdError when the input does not match any known URL shape.
 */
export const retrieveVideoId = (videoId: string): string => {
  // Any 11-character input is taken to be a bare ID and returned untouched.
  if (videoId.length === 11) {
    return videoId;
  }

  const urlMatch = RE_YOUTUBE.exec(videoId);
  if (urlMatch === null || urlMatch.length === 0) {
    throw new InvalidVideoIdError('Impossible to retrieve Youtube video ID.');
  }
  return urlMatch[1];
};

/**
 * Downloads the HTML of a video's watch page.
 *
 * @param videoId - A YouTube URL or video ID; normalized with `retrieveVideoId`.
 * @returns The response body as text.
 * @throws InvalidVideoIdError when no ID can be derived from `videoId`.
 * @throws CannotRetrieveVideoPageError when the response status is not OK.
 */
export const getVideoPage = async (videoId: string): Promise<string> => {
  const query = new URLSearchParams({ v: retrieveVideoId(videoId) });
  const watchUrl = new URL('/watch', 'https://www.youtube.com');
  watchUrl.search = query.toString();

  const pageResponse = await f(watchUrl.toString());
  if (pageResponse.ok) {
    return pageResponse.text();
  }
  throw new CannotRetrieveVideoPageError('Unable to get video page.');
};

/**
 * Generates a random client screen nonce.
 *
 * The decimal string of `Math.random()`, minus its last character, is
 * encoded as URL-safe base64 using `.` as the padding character.
 *
 * Fixes over the previous implementation:
 * - character codes are read with `seed.charCodeAt(i)`; the old
 *   `seed[i].charCodeAt(i)` produced NaN for every index above 0, which made
 *   the nonce almost constant;
 * - the third output character is padded when the second input byte is
 *   missing (the old check tested a value that had just been set to 64 and
 *   could never fire);
 * - the alphabet is the real base64 alphabet (it lacked `N` and contained
 *   `j` twice).
 *
 * @returns The encoded nonce; empty when the random string has a single character.
 */
const generateNonce = (): string => {
  const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_';
  const padding = '.';

  const seed = Math.random().toString();
  const bytes: number[] = [];
  // The final character is deliberately left out, as in the original algorithm.
  for (let i = 0; i < seed.length - 1; i += 1) {
    bytes.push(seed.charCodeAt(i));
  }

  let nonce = '';
  // Standard base64: every group of up to 3 bytes becomes 4 output characters.
  for (let i = 0; i < bytes.length; i += 3) {
    const hasSecond = i + 1 < bytes.length;
    const hasThird = i + 2 < bytes.length;
    const first = bytes[i];
    const second = hasSecond ? bytes[i + 1] : 0;
    const third = hasThird ? bytes[i + 2] : 0;

    nonce += alphabet[first >> 2];
    nonce += alphabet[((first & 3) << 4) | (second >> 4)];
    nonce += hasSecond ? alphabet[((second & 15) << 2) | (third >> 6)] : padding;
    nonce += hasThird ? alphabet[third & 63] : padding;
  }
  return nonce;
};

/**
 * Reads the value that follows `"INNERTUBE_API_KEY":"` in the page source.
 *
 * Returns `undefined` when the marker is absent instead of throwing a
 * TypeError, so that callers can report the missing key themselves.
 *
 * @param videoPageBody - HTML of the video page.
 * @returns The key, or `undefined` when the page does not contain one.
 */
const extractInnterTubeApiKeyFromPage = (videoPageBody: string): string | undefined => videoPageBody
  .split('"INNERTUBE_API_KEY":"')[1]
  ?.split('"')[0];

/**
 * Reads the value that follows the first `"serializedShareEntity":"` marker,
 * up to the next double quote. Yields `undefined` when the marker is absent.
 */
const extractSerializedShareEntityFromPage = (page: string) => {
  const [, afterMarker] = page.split('"serializedShareEntity":"');
  return afterMarker?.split('"')[0];
};

/**
 * Reads the value that follows the first `"VISITOR_DATA":"` marker, up to the
 * next double quote. Yields `undefined` when the marker is absent.
 */
const extractVisitorDataFromPage = (page: string) => {
  const [, afterMarker] = page.split('"VISITOR_DATA":"');
  return afterMarker?.split('"')[0];
};

/**
 * Reads the value that follows the first `"sessionId":"` marker, up to the
 * next double quote. Yields `undefined` when the marker is absent.
 */
const extractSessionIdFromPage = (page: string) => {
  const [, afterMarker] = page.split('"sessionId":"');
  return afterMarker?.split('"')[0];
};

/**
 * Reads the value that follows the first `"clickTrackingParams":"` marker, up
 * to the next double quote. Yields `undefined` when the marker is absent or
 * when `page` itself is nullish.
 */
const extractClickTrackingParamsFromPage = (page: string) => {
  const segments = page?.split('"clickTrackingParams":"');
  return segments?.[1]?.split('"')[0];
};

/**
 * Values scraped from a video page that are needed to request its
 * transcript. Each field is absent when its marker is not found in the page.
 */
interface VideoPageData {
// Sent as the `key` query parameter of the transcript request.
innerTubeApiKey?: string;
// Sent as the `params` field of the transcript request body.
serializedShareEntity?: string;
// Sent as `context.client.visitorData`.
visitorData?: string;
// Sent as `context.request.sessionId`.
sessionId?: string;
// Sent as `context.clickTracking.clickTrackingParams`.
clickTrackingParams?: string;
}

/**
 * Collects every value the transcript request needs from the page source.
 *
 * @param page - HTML of the video page.
 * @returns The scraped values, one per extractor.
 */
export const extractDataFromPage = (page: string): VideoPageData => {
  const innerTubeApiKey = extractInnterTubeApiKeyFromPage(page);
  const serializedShareEntity = extractSerializedShareEntityFromPage(page);
  const visitorData = extractVisitorDataFromPage(page);
  const sessionId = extractSessionIdFromPage(page);
  const clickTrackingParams = extractClickTrackingParamsFromPage(page);

  return {
    innerTubeApiKey,
    serializedShareEntity,
    visitorData,
    sessionId,
    clickTrackingParams,
  };
};

/**
 * Builds the JSON body of the `get_transcript` request.
 *
 * The client description is a fixed desktop Chrome profile; only the locale,
 * the values scraped from the page and a fresh nonce vary between calls.
 *
 * @param p - Values scraped from the video page.
 * @param config - Optional locale; defaults to language 'en' and country 'PH'.
 * @returns The request body, ready to be serialized.
 */
const generateGetTranscriptRequestBody = (
  p: Partial<VideoPageData>,
  config?: TranscriptConfig,
) => {
  const client = {
    hl: config?.language ?? 'en',
    gl: config?.country ?? 'PH',
    visitorData: p.visitorData,
    userAgent:
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)',
    clientName: 'WEB',
    clientVersion: '2.20200925.01.00',
    osName: 'Macintosh',
    osVersion: '10_15_4',
    browserName: 'Chrome',
    // NOTE(review): the "0f" differs from the version in userAgent and may be a typo; kept as-is.
    browserVersion: '85.0f.4183.83',
    screenWidthPoints: 1440,
    screenHeightPoints: 770,
    screenPixelDensity: 2,
    utcOffsetMinutes: 120,
    userInterfaceTheme: 'USER_INTERFACE_THEME_LIGHT',
    connectionType: 'CONN_CELLULAR_3G',
  };

  const request = {
    sessionId: p.sessionId,
    internalExperimentFlags: [],
    consistencyTokenJars: [],
  };

  return {
    context: {
      client,
      request,
      user: {},
      clientScreenNonce: generateNonce(),
      clickTracking: {
        clickTrackingParams: p.clickTrackingParams,
      },
    },
    params: p.serializedShareEntity,
  };
};

/**
 * Requests the transcript of a video and converts it into cue objects.
 *
 * @param pageData - Values scraped from the video page; must include the InnerTube API key.
 * @param config - Optional locale settings for the request.
 * @returns One entry per cue group, built from the first cue of each group.
 * @throws InnerTubeApiKeyMissingError when `pageData` carries no API key.
 * @throws FetchTranscriptRequestFailureError when the response status is not OK.
 * @throws InvalidTranscriptResponseContextError when the response has no `responseContext`.
 * @throws InvalidTranscriptActionsError when the response has no `actions`, or
 *   when no cue groups can be found inside them.
 */
export const fetchTranscriptItems = async (
  pageData: VideoPageData,
  config?: TranscriptConfig,
): Promise<TranscriptResponse[]> => {
  const { innerTubeApiKey } = pageData;
  if (!innerTubeApiKey) {
    throw new InnerTubeApiKeyMissingError('InnerTube API key not found on video page.');
  }
  const getTranscriptUrl = new URL('/youtubei/v1/get_transcript', 'https://www.youtube.com');
  const getTranscriptParams = new URLSearchParams({
    key: innerTubeApiKey,
  });
  getTranscriptUrl.search = getTranscriptParams.toString();
  const transcriptResponse = await f(getTranscriptUrl.toString(), {
    method: 'POST',
    headers: {
      Accept: 'application/json',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(generateGetTranscriptRequestBody(pageData, config)),
  });

  if (!transcriptResponse.ok) {
    throw new FetchTranscriptRequestFailureError(`Fetching transcript failed with status ${transcriptResponse.status}.`);
  }

  const transcriptBody = await transcriptResponse.json();
  if (!transcriptBody.responseContext) {
    throw new InvalidTranscriptResponseContextError('No responseContext found on get transcript response.');
  }

  if (!transcriptBody.actions) {
    throw new InvalidTranscriptActionsError('No actions found on get transcript response.');
  }

  // Walk the nested response defensively: an empty `actions` list or a changed
  // response shape should surface as a transcript error, not as a TypeError.
  const cueGroups: unknown = transcriptBody
    .actions[0]
    ?.updateEngagementPanelAction
    ?.content
    ?.transcriptRenderer
    ?.body
    ?.transcriptBodyRenderer
    ?.cueGroups;

  if (!Array.isArray(cueGroups)) {
    throw new InvalidTranscriptActionsError('No transcript cues found on get transcript response.');
  }

  return cueGroups.map((cueGroup): TranscriptResponse => {
    // Only the first cue of each group is used.
    const { transcriptCueRenderer } = cueGroup.transcriptCueGroupRenderer.cues[0];
    return {
      text: transcriptCueRenderer.cue.simpleText,
      duration: parseInt(transcriptCueRenderer.durationMs, 10),
      offset: parseInt(transcriptCueRenderer.startOffsetMs, 10),
    };
  });
};

+ 39
- 0
test/index.test.ts Zobrazit soubor

@@ -0,0 +1,39 @@
import { config } from 'dotenv';
import { beforeAll, describe, it } from 'vitest';
import { createSummarizer, VideoType } from '../src';
import { writeFileSync } from 'fs';

// End-to-end smoke test: summarizes a real video and writes the transcript
// and the summary to files in the working directory. Requires network access
// and an OPENAI_API_KEY (loaded from .env).
describe('blah', () => {
  beforeAll(() => {
    config();
  });

  it('works', () => new Promise<void>((done, fail) => {
    const summarizer = createSummarizer({
      type: VideoType.YOUTUBE,
      url: 'https://www.youtube.com/watch?v=WeNgDxtBiyw',
      openaiApiKey: process.env.OPENAI_API_KEY as string,
    });

    summarizer.on('success', (data) => {
      const transcripts = JSON.parse(data.content as string) as {
        transcripts: { text: string }[],
        summary: string,
      };
      const transcriptText = transcripts.transcripts.map((t) => t.text).join('\n');
      writeFileSync('transcript.txt', transcriptText);
      writeFileSync('summary.txt', transcripts.summary);
    });

    summarizer.on('error', (err) => {
      // Reject so that a failed run fails the test; resolving here made the
      // test pass no matter what happened.
      fail(err);
    });

    // `end` follows both outcomes; after a rejection this call is a no-op.
    summarizer.on('end', () => {
      done();
    });

    summarizer.process();
  }), { timeout: 180000 });
});

+ 21
- 0
tsconfig.eslint.json Zobrazit soubor

@@ -0,0 +1,21 @@
{
"exclude": ["node_modules"],
"include": ["src", "types", "test"],
"compilerOptions": {
"module": "ESNext",
"lib": ["ESNext"],
"importHelpers": true,
"declaration": true,
"sourceMap": true,
"rootDir": "./",
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"moduleResolution": "node",
"jsx": "react",
"esModuleInterop": true,
"target": "es2018"
}
}

+ 21
- 0
tsconfig.json Zobrazit soubor

@@ -0,0 +1,21 @@
{
"exclude": ["node_modules"],
"include": ["src", "types"],
"compilerOptions": {
"module": "ESNext",
"lib": ["ESNext", "DOM"],
"importHelpers": true,
"declaration": true,
"sourceMap": true,
"rootDir": "./src",
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"moduleResolution": "node",
"jsx": "react",
"esModuleInterop": true,
"target": "es2018"
}
}

+ 3627
- 0
yarn.lock
Diff nebyl zobrazen, protože je příliš veliký
Zobrazit soubor


Načítá se…
Zrušit
Uložit