diff --git a/package.json b/package.json index f1f427c..ae91788 100644 --- a/package.json +++ b/package.json @@ -45,5 +45,9 @@ "author": "TheoryOfNekomata ", "publishConfig": { "access": "public" + }, + "dependencies": { + "fetch-ponyfill": "^7.1.0", + "xml-js": "^1.6.11" } } diff --git a/src/index.ts b/src/index.ts index f0e1a4e..b26a1e8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,30 @@ -export default function add(a: number, b: number): number { - if (process.env.NODE_ENV !== 'production') { - console.log('This code would not appear on production builds'); +import * as KanjidicImpl from './sources/kanjidic'; +import * as JmdictImpl from './sources/jmdict'; + +const SUPPORTED_SOURCES = [ + KanjidicImpl, + JmdictImpl, +] as const; + +export type CreateDownloaderParams = ( + KanjidicImpl.CreateDownloaderParams + | JmdictImpl.CreateDownloaderParams +); + +export * as Kanjidic from './sources/kanjidic'; +export * as Jmdict from './sources/jmdict'; +export * from './streams'; + +export const createDownloader = (params: CreateDownloaderParams) => { + const { type: sourceType, ...etcParams } = params; + + const theSourceModule = SUPPORTED_SOURCES + .find((videoTypeModule) => videoTypeModule.SOURCE_ID === sourceType); + + if (!theSourceModule) { + const validSourceTypes = SUPPORTED_SOURCES.map((videoTypeModule) => videoTypeModule.SOURCE_ID).join(', '); + throw new TypeError(`Invalid source type: "${sourceType}". Valid values are: ${validSourceTypes}`); } - return a + b; -} + + return theSourceModule.createDownloader(etcParams); +}; diff --git a/src/sources/jmdict/common.ts b/src/sources/jmdict/common.ts new file mode 100644 index 0000000..101835a --- /dev/null +++ b/src/sources/jmdict/common.ts @@ -0,0 +1 @@ +export const SOURCE_ID = 'jmdict' as const; diff --git a/src/sources/jmdict/downloader.ts b/src/sources/jmdict/downloader.ts new file mode 100644 index 0000000..aac07ef --- /dev/null +++ b/src/sources/jmdict/downloader.ts @@ -0,0 +1,25 @@ +import fetchPonyfill from 'fetch-ponyfill'; +import { PassThrough } from 'stream'; +import { createGunzip } from 'zlib'; +import { SOURCE_ID } from './common'; + +export interface CreateDownloaderParams { + type: typeof SOURCE_ID; + url?: string; +} + +const DEFAULT_SOURCE_URL = 'http://ftp.edrdg.org/pub/Nihongo/JMdict.gz' as const; + +export const createDownloader = async (params: Omit) => { + const { url = DEFAULT_SOURCE_URL } = params; + const { fetch } = fetchPonyfill(); + + const response = await fetch(url); + if (!response.ok) { + throw new Error(`Failed to download: ${url}`); + } + + const rawStream = response.body as unknown as PassThrough; + return rawStream + .pipe(createGunzip()); +}; diff --git a/src/sources/jmdict/index.ts b/src/sources/jmdict/index.ts new file mode 100644 index 0000000..22fa288 --- /dev/null +++ b/src/sources/jmdict/index.ts @@ -0,0 +1,2 @@ +export * from './common'; +export * from './downloader'; diff --git a/src/sources/kanjidic/common.ts b/src/sources/kanjidic/common.ts new file mode 100644 index 0000000..890f00d --- /dev/null +++ b/src/sources/kanjidic/common.ts @@ -0,0 +1 @@ +export const SOURCE_ID = 'kanjidic' as const; diff --git a/src/sources/kanjidic/downloader.ts b/src/sources/kanjidic/downloader.ts new file mode 100644 index 0000000..da22f0c --- /dev/null +++ b/src/sources/kanjidic/downloader.ts @@ -0,0 +1,25 @@ +import fetchPonyfill from 'fetch-ponyfill'; +import { PassThrough } from 'stream'; +import { createGunzip } from 'zlib'; +import { SOURCE_ID } from './common'; + +export interface CreateDownloaderParams { + type: typeof SOURCE_ID; + url?: string; +} + +const DEFAULT_SOURCE_URL = 'http://www.edrdg.org/kanjidic/kanjidic2.xml.gz' as const; + +export const createDownloader = async (params: Omit) => { + const { url = DEFAULT_SOURCE_URL } = params; + const { fetch } = fetchPonyfill(); + + const response = await fetch(url); + if (!response.ok) { + throw new Error(`Failed to download: ${url}`); + } + + const rawStream = response.body as unknown as PassThrough; + return rawStream + .pipe(createGunzip()); +}; diff --git a/src/sources/kanjidic/index.ts b/src/sources/kanjidic/index.ts new file mode 100644 index 0000000..c888de8 --- /dev/null +++ b/src/sources/kanjidic/index.ts @@ -0,0 +1,3 @@ +export * from './common'; +export * from './downloader'; + diff --git a/src/streams.ts b/src/streams.ts new file mode 100644 index 0000000..7d554c2 --- /dev/null +++ b/src/streams.ts @@ -0,0 +1,50 @@ +import { Transform, TransformCallback } from 'stream'; +import { xml2json } from 'xml-js'; + +export interface XmlToJsonLinesTransformStreamOptions { + entryTagName: string; +} + +class XmlToJsonLinesTransformStream extends Transform { + private charactersToParse?: string; + + constructor(private readonly options: XmlToJsonLinesTransformStreamOptions) { + super(); + // noop + } + + // eslint-disable-next-line no-underscore-dangle + _transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback) { + const chunkStr = chunk.toString('utf-8'); + if (typeof this.charactersToParse !== 'string') { + const firstEntryIndex = chunkStr.indexOf(`<${this.options.entryTagName}>`); + this.charactersToParse = chunkStr.slice(firstEntryIndex); + } else { + this.charactersToParse += chunkStr; + } + + let theCharacters = `${this.charactersToParse}`; + let nextOpenTagIndex = theCharacters.indexOf(`<${this.options.entryTagName}>`); + let nextCloseTagIndex = theCharacters.indexOf(``); + do { + const xml = theCharacters + .slice( + nextOpenTagIndex, + nextCloseTagIndex + this.options.entryTagName.length + 3, + ) + .replace(/&(.+?);/g, '$1'); // FIXME better handling of XML entities??? This makes the pipe hang for some reason + const json = xml2json(xml, { compact: true }); + this.push(`${json}\n`); + theCharacters = theCharacters.slice(nextCloseTagIndex + this.options.entryTagName.length + 3); + nextOpenTagIndex = theCharacters.indexOf(`<${this.options.entryTagName}>`); + nextCloseTagIndex = theCharacters.indexOf(``); + } while (nextOpenTagIndex !== -1 && nextCloseTagIndex !== -1); + this.charactersToParse = theCharacters; + + callback(null, ''); + } +} + +export const createXmlToJsonLines = (options = { + entryTagName: 'entry', +} as XmlToJsonLinesTransformStreamOptions) => new XmlToJsonLinesTransformStream(options); diff --git a/test/index.test.ts b/test/index.test.ts index 441ca94..b36868b 100644 --- a/test/index.test.ts +++ b/test/index.test.ts @@ -1,8 +1,73 @@ +import { createReadStream, createWriteStream } from 'fs'; import { describe, it, expect } from 'vitest'; -import add from '../src'; +import { createDownloader, Kanjidic, Jmdict, createXmlToJsonLines } from '../src'; -describe('blah', () => { - it('works', () => { - expect(add(1, 1)).toEqual(2); +describe('downloader', () => { + describe.skip('kanjidic', () => { + it.skip('works', async () => { + const readStream = await createDownloader({ + type: Kanjidic.SOURCE_ID, + }); + + return new Promise((resolve) => { + const out = createWriteStream('kanjidic2.xml'); + + out.on('finish', () => { + resolve(); + }); + + readStream.pipe(out); + }); + }); + + it.skip('converts XML to JSON', () => new Promise((resolve) => { + const readStream = createReadStream('kanjidic2.xml', { encoding: 'utf-8' }); + const transform = createXmlToJsonLines({ + entryTagName: 'character', + }); + const out = createWriteStream('kanjidic2.jsonl'); + + out.on('finish', () => { + resolve(); + }); + + readStream + .pipe(transform) + .pipe(out); + })); + }); + + describe.skip('jmdict', () => { + it('works', async () => { + const readStream = await createDownloader({ + type: Jmdict.SOURCE_ID, + }); + + return new Promise((resolve) => { + const out = createWriteStream('jmdict.xml'); + + out.on('finish', () => { + resolve(); + }); + + readStream.pipe(out); + }); + }, { timeout: 300000 }); + + it('converts XML to JSON', () => new Promise((resolve) => { + const readStream = createReadStream('jmdict.full.xml', { encoding: 'utf-8' }); + const transform = createXmlToJsonLines({ + entryTagName: 'entry', + }); + const out = createWriteStream('jmdict.jsonl'); + + out.on('finish', () => { + resolve(); + }); + + readStream + .pipe(transform) + .pipe(out); + }), { timeout: 300000 }); }); }); diff --git a/tsconfig.eslint.json b/tsconfig.eslint.json index 459f2a1..f654232 100644 --- a/tsconfig.eslint.json +++ b/tsconfig.eslint.json @@ -3,7 +3,7 @@ "include": ["src", "types", "test"], "compilerOptions": { "module": "ESNext", - "lib": ["ESNext"], + "lib": ["ESNext", "DOM"], "importHelpers": true, "declaration": true, "sourceMap": true, diff --git a/tsconfig.json b/tsconfig.json index e210542..ebe2905 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -3,7 +3,7 @@ "include": ["src", "types"], "compilerOptions": { "module": "ESNext", - "lib": ["ESNext"], + "lib": ["ESNext", "DOM"], "importHelpers": true, "declaration": true, "sourceMap": true, diff --git a/yarn.lock b/yarn.lock index d021297..4bce601 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1678,6 +1678,13 @@ fastq@^1.6.0: dependencies: reusify "^1.0.4" +fetch-ponyfill@^7.1.0: + version "7.1.0" + resolved "https://registry.yarnpkg.com/fetch-ponyfill/-/fetch-ponyfill-7.1.0.tgz#4266ed48b4e64663a50ab7f7fcb8e76f990526d0" + integrity sha512-FhbbL55dj/qdVO3YNK7ZEkshvj3eQ7EuIGV2I6ic/2YiocvyWv+7jg2s4AyS0wdRU75s3tA8ZxI/xPigb0v5Aw== + dependencies: + node-fetch "~2.6.1" + file-entry-cache@^6.0.1: version "6.0.1" resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" @@ -2474,6 +2481,13 @@ natural-compare@^1.4.0: resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7" integrity sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw== +node-fetch@~2.6.1: + version "2.6.9" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.9.tgz#7c7f744b5cc6eb5fd404e0c7a9fec630a55657e6" + integrity sha512-DJm/CJkZkRjKKj4Zi4BsKVZh3ValV5IR5s7LVZnW+6YMh0W1BfNA8XSs6DLMGYlId5F3KnA70uu2qepcR08Qqg== + dependencies: + whatwg-url "^5.0.0" + node-releases@^2.0.8: version "2.0.10" resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.10.tgz#c311ebae3b6a148c89b1813fd7c4d3c024ef537f" @@ -2919,6 +2933,11 @@ safe-regex-test@^1.0.0: get-intrinsic "^1.1.3" is-regex "^1.1.4" +sax@^1.2.4: + version "1.2.4" + resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" + integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== + semver@7.3.8: version "7.3.8" resolved "https://registry.yarnpkg.com/semver/-/semver-7.3.8.tgz#07a78feafb3f7b32347d725e33de7e2a2df67798" @@ -3213,6 +3232,11 @@ to-regex-range@^5.0.1: dependencies: is-number "^7.0.0" +tr46@~0.0.3: + version "0.0.3" + resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" + integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw== + tsconfig-paths@^3.14.1: version "3.14.2" resolved "https://registry.yarnpkg.com/tsconfig-paths/-/tsconfig-paths-3.14.2.tgz#6e32f1f79412decd261f92d633a9dc1cfa99f088" @@ -3395,6 +3419,19 @@ wcwidth@^1.0.1: dependencies: defaults "^1.0.3" +webidl-conversions@^3.0.0: + version "3.0.1" + resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871" + integrity sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ== + +whatwg-url@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d" + integrity sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw== + dependencies: + tr46 "~0.0.3" + webidl-conversions "^3.0.0" + which-boxed-primitive@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6" @@ -3496,6 +3533,13 @@ xdg-basedir@^4.0.0: resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-4.0.0.tgz#4bc8d9984403696225ef83a1573cbbcb4e79db13" integrity sha512-PSNhEJDejZYV7h50BohL09Er9VaIefr2LMAf3OEmpCkjOi34eYyQYAXUTjEQtZJTKcF0E2UKTh+osDLsgNim9Q== +xml-js@^1.6.11: + version "1.6.11" + resolved "https://registry.yarnpkg.com/xml-js/-/xml-js-1.6.11.tgz#927d2f6947f7f1c19a316dd8eea3614e8b18f8e9" + integrity sha512-7rVi2KMfwfWFl+GpPg6m80IVMWXLRjO+PxTq7V2CDhoGak0wzYzFgUY2m4XJ47OGdXd8eLE8EmwfAmdjw7lC1g== + dependencies: + sax "^1.2.4" + xml-name-validator@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz#79a006e2e63149a8600f15430f0a4725d1524835"