Add most common sources for Japanese language datasets.master
@@ -45,5 +45,9 @@ | |||||
"author": "TheoryOfNekomata <allan.crisostomo@outlook.com>", | "author": "TheoryOfNekomata <allan.crisostomo@outlook.com>", | ||||
"publishConfig": { | "publishConfig": { | ||||
"access": "public" | "access": "public" | ||||
}, | |||||
"dependencies": { | |||||
"fetch-ponyfill": "^7.1.0", | |||||
"xml-js": "^1.6.11" | |||||
} | } | ||||
} | } |
@@ -1,6 +1,30 @@ | |||||
export default function add(a: number, b: number): number { | |||||
if (process.env.NODE_ENV !== 'production') { | |||||
console.log('This code would not appear on production builds'); | |||||
import * as KanjidicImpl from './sources/kanjidic'; | |||||
import * as JmdictImpl from './sources/jmdict'; | |||||
// Source modules that `createDownloader` can dispatch to. Each one is looked
// up by its `SOURCE_ID` and asked for its own `createDownloader`.
const SUPPORTED_SOURCES = [ | |||||
KanjidicImpl, | |||||
JmdictImpl, | |||||
] as const; | |||||
/**
 * Parameters accepted by `createDownloader`: the union of the parameter
 * shapes declared by every supported source module.
 */
export type CreateDownloaderParams = ( | |||||
KanjidicImpl.CreateDownloaderParams | |||||
| JmdictImpl.CreateDownloaderParams | |||||
); | |||||
export * as Kanjidic from './sources/kanjidic'; | |||||
export * as Jmdict from './sources/jmdict'; | |||||
export * from './streams'; | |||||
/**
 * Creates a downloader for the source selected by `params.type`.
 *
 * The `type` field is compared against the `SOURCE_ID` of each module in
 * `SUPPORTED_SOURCES`; every other field is forwarded unchanged to the
 * matching module's own `createDownloader`.
 *
 * @param params - The source type plus the options for that source.
 * @returns Whatever the matching source module's `createDownloader` returns.
 * @throws TypeError when `params.type` matches none of the supported sources;
 *   the message lists the valid source IDs.
 */
export const createDownloader = (params: CreateDownloaderParams) => { | |||||
// Split the discriminating `type` from the options handed to the source module.
const { type: sourceType, ...etcParams } = params; | |||||
// NOTE(review): `videoTypeModule` looks like a leftover name; the elements are
// the dictionary source modules listed in SUPPORTED_SOURCES.
const theSourceModule = SUPPORTED_SOURCES | |||||
.find((videoTypeModule) => videoTypeModule.SOURCE_ID === sourceType); | |||||
if (!theSourceModule) { | |||||
const validSourceTypes = SUPPORTED_SOURCES.map((videoTypeModule) => videoTypeModule.SOURCE_ID).join(', '); | |||||
throw new TypeError(`Invalid source type: "${sourceType}". Valid values are: ${validSourceTypes}`); | |||||
} | } | ||||
// NOTE(review): the next two lines appear to be removed lines of the previous
// `add` function carried over by the diff rendering — confirm before relying on them.
return a + b; | |||||
} | |||||
return theSourceModule.createDownloader(etcParams); | |||||
}; |
@@ -0,0 +1 @@ | |||||
/** Identifier of this source, kept as the string literal type `'jmdict'` via `as const`. */
export const SOURCE_ID = 'jmdict' as const; |
@@ -0,0 +1,25 @@ | |||||
import fetchPonyfill from 'fetch-ponyfill'; | |||||
import { PassThrough } from 'stream'; | |||||
import { createGunzip } from 'zlib'; | |||||
import { SOURCE_ID } from './common'; | |||||
/** Parameters describing a download from this source. */
export interface CreateDownloaderParams { | |||||
/** Discriminator; must be this module's `SOURCE_ID`. */
type: typeof SOURCE_ID; | |||||
/** Location of the gzipped archive; `createDownloader` falls back to `DEFAULT_SOURCE_URL` when omitted. */
url?: string; | |||||
} | |||||
// Archive fetched when the caller does not supply `url`.
const DEFAULT_SOURCE_URL = 'http://ftp.edrdg.org/pub/Nihongo/JMdict.gz' as const; | |||||
/**
 * Downloads the gzipped JMdict archive and returns a stream of its
 * decompressed contents.
 *
 * @param params - Downloader options; `url` overrides `DEFAULT_SOURCE_URL`.
 * @returns A stream producing the gunzipped bytes of the response body.
 * @throws Error when the response status is not OK or the response has no body.
 */
export const createDownloader = async (params: Omit<CreateDownloaderParams, 'type'>) => {
  const { url = DEFAULT_SOURCE_URL } = params;
  const { fetch } = fetchPonyfill();
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download: ${url}`);
  }
  if (!response.body) {
    // Without this guard the cast below would hide a null body until `.pipe` blows up.
    throw new Error(`Response has no body: ${url}`);
  }

  // NOTE(review): assumes the ponyfill hands back a Node.js readable stream
  // (the DOM typings describe a WHATWG stream, hence the cast) — confirm for
  // non-Node targets.
  const rawStream = response.body as unknown as PassThrough;
  const gunzip = createGunzip();

  // `pipe()` does not forward errors. Without this handler a failure in the
  // middle of the download would be an unhandled 'error' event on the body
  // stream, and consumers of the returned stream would never be notified.
  rawStream.on('error', (error: Error) => {
    gunzip.destroy(error);
  });

  return rawStream.pipe(gunzip);
};
@@ -0,0 +1,2 @@ | |||||
export * from './common'; | |||||
export * from './downloader'; |
@@ -0,0 +1 @@ | |||||
/** Identifier of this source, kept as the string literal type `'kanjidic'` via `as const`. */
export const SOURCE_ID = 'kanjidic' as const; |
@@ -0,0 +1,25 @@ | |||||
import fetchPonyfill from 'fetch-ponyfill'; | |||||
import { PassThrough } from 'stream'; | |||||
import { createGunzip } from 'zlib'; | |||||
import { SOURCE_ID } from './common'; | |||||
/** Parameters describing a download from this source. */
export interface CreateDownloaderParams { | |||||
/** Discriminator; must be this module's `SOURCE_ID`. */
type: typeof SOURCE_ID; | |||||
/** Location of the gzipped archive; `createDownloader` falls back to `DEFAULT_SOURCE_URL` when omitted. */
url?: string; | |||||
} | |||||
// Archive fetched when the caller does not supply `url`.
const DEFAULT_SOURCE_URL = 'http://www.edrdg.org/kanjidic/kanjidic2.xml.gz' as const; | |||||
/**
 * Downloads the gzipped KANJIDIC2 archive and returns a stream of its
 * decompressed contents.
 *
 * @param params - Downloader options; `url` overrides `DEFAULT_SOURCE_URL`.
 * @returns A stream producing the gunzipped bytes of the response body.
 * @throws Error when the response status is not OK or the response has no body.
 */
export const createDownloader = async (params: Omit<CreateDownloaderParams, 'type'>) => {
  const { url = DEFAULT_SOURCE_URL } = params;
  const { fetch } = fetchPonyfill();
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download: ${url}`);
  }
  if (!response.body) {
    // Without this guard the cast below would hide a null body until `.pipe` blows up.
    throw new Error(`Response has no body: ${url}`);
  }

  // NOTE(review): assumes the ponyfill hands back a Node.js readable stream
  // (the DOM typings describe a WHATWG stream, hence the cast) — confirm for
  // non-Node targets.
  const rawStream = response.body as unknown as PassThrough;
  const gunzip = createGunzip();

  // `pipe()` does not forward errors. Without this handler a failure in the
  // middle of the download would be an unhandled 'error' event on the body
  // stream, and consumers of the returned stream would never be notified.
  rawStream.on('error', (error: Error) => {
    gunzip.destroy(error);
  });

  return rawStream.pipe(gunzip);
};
@@ -0,0 +1,3 @@ | |||||
export * from './common'; | |||||
export * from './downloader'; | |||||
@@ -0,0 +1,50 @@ | |||||
import { Transform, TransformCallback } from 'stream'; | |||||
import { xml2json } from 'xml-js'; | |||||
/** Options for the XML-to-JSON-Lines transform stream. */
export interface XmlToJsonLinesTransformStreamOptions { | |||||
/** Name of the XML element that delimits one record; matched as `<name>` … `</name>`. */
entryTagName: string; | |||||
} | |||||
/**
 * Transform stream that turns a streamed XML document into JSON Lines.
 *
 * Every `<entryTagName>…</entryTagName>` element found in the input is
 * converted with `xml2json` (compact form) and emitted as one line of output.
 * Text outside of those elements (prolog, DTD, root element, whitespace
 * between entries) is discarded.
 *
 * Only opening tags without attributes (`<entryTagName>`) are recognized.
 */
class XmlToJsonLinesTransformStream extends Transform {
  /** Entities the XML parser resolves on its own; they must reach it untouched. */
  private static readonly PREDEFINED_ENTITIES = new Set(['amp', 'lt', 'gt', 'quot', 'apos']);

  /** Matches one `&name;` entity reference. */
  private static readonly ENTITY_REFERENCE = /&([^;&\s]+);/g;

  /** Decoded input that has been received but not yet emitted as an entry. */
  private charactersToParse?: string;

  /**
   * Streaming decoder: keeps the trailing bytes of a multi-byte character that
   * was cut by a chunk boundary until the rest arrives, instead of turning
   * both halves into replacement characters.
   */
  private readonly decoder = new TextDecoder('utf-8');

  constructor(private readonly options: XmlToJsonLinesTransformStreamOptions) {
    super();
  }

  /**
   * Buffers the incoming chunk and emits one JSON line per complete entry.
   *
   * @param chunk - Next piece of the XML document.
   * @param _encoding - Unused; input is always decoded as UTF-8.
   * @param callback - Called once the chunk is consumed, or with the error
   *   raised while converting an entry.
   */
  // eslint-disable-next-line no-underscore-dangle
  _transform(chunk: Buffer | string, _encoding: BufferEncoding, callback: TransformCallback) {
    const openTag = `<${this.options.entryTagName}>`;
    const closeTag = `</${this.options.entryTagName}>`;
    const chunkStr = typeof chunk === 'string'
      ? chunk
      : this.decoder.decode(chunk, { stream: true });
    const buffered = (this.charactersToParse ?? '') + chunkStr;

    // Everything before `cursor` has been emitted or is known to be discardable.
    let cursor = 0;

    try {
      for (;;) {
        const openIndex = buffered.indexOf(openTag, cursor);
        if (openIndex === -1) {
          // No entry starts here. Keep only the tail that could be the
          // beginning of an opening tag split across two chunks.
          cursor = Math.max(cursor, buffered.length - (openTag.length - 1));
          break;
        }
        cursor = openIndex;

        const closeIndex = buffered.indexOf(closeTag, openIndex + openTag.length);
        if (closeIndex === -1) {
          // The entry is incomplete; wait for more input.
          break;
        }
        const entryEnd = closeIndex + closeTag.length;

        // Custom (DTD-declared) entities are unknown to the parser once an
        // entry is parsed on its own, so they are reduced to their bare name.
        // Predefined entities and numeric character references are left for
        // the parser to decode.
        const xml = buffered
          .slice(openIndex, entryEnd)
          .replace(
            XmlToJsonLinesTransformStream.ENTITY_REFERENCE,
            (reference: string, name: string) => (
              XmlToJsonLinesTransformStream.PREDEFINED_ENTITIES.has(name) || name.startsWith('#')
                ? reference
                : name
            ),
          );
        const json = xml2json(xml, { compact: true });
        this.push(`${json}\n`);
        cursor = entryEnd;
      }
    } catch (error) {
      // Report conversion failures through the stream instead of throwing
      // out of the writer's call stack.
      callback(error instanceof Error ? error : new Error(String(error)));
      return;
    }

    this.charactersToParse = buffered.slice(cursor);
    callback();
  }
}
/**
 * Creates a transform stream that converts XML input into JSON Lines, one
 * line per element named `options.entryTagName`.
 *
 * @param options - Stream options; defaults to treating `<entry>` elements as records.
 * @returns A new transform stream instance.
 */
export const createXmlToJsonLines = (
  // Typed parameter instead of an `as` assertion so the default is checked against the interface.
  options: XmlToJsonLinesTransformStreamOptions = { entryTagName: 'entry' },
) => new XmlToJsonLinesTransformStream(options);
@@ -1,8 +1,73 @@ | |||||
import { createReadStream, createWriteStream } from 'fs'; | |||||
import { describe, it, expect } from 'vitest'; | import { describe, it, expect } from 'vitest'; | ||||
import add from '../src'; | |||||
import { createDownloader, Kanjidic, Jmdict, createXmlToJsonLines } from '../src'; | |||||
describe('blah', () => { | |||||
it('works', () => { | |||||
expect(add(1, 1)).toEqual(2); | |||||
describe('downloader', () => {
  /**
   * Pipes `source` into a file at `path`.
   *
   * Resolves when the file has been fully written and rejects as soon as
   * either side fails, so a broken stream fails the test right away instead
   * of leaving it hanging until the timeout.
   */
  const streamToFile = (source: NodeJS.ReadableStream, path: string) => (
    new Promise<void>((resolve, reject) => {
      const out = createWriteStream(path);
      out.on('finish', () => {
        resolve();
      });
      out.on('error', reject);
      source.on('error', reject);
      source.pipe(out);
    })
  );

  /**
   * Converts the XML file at `inputPath` to JSON Lines at `outputPath`, one
   * line per `entryTagName` element.
   */
  const convertXmlFile = (inputPath: string, outputPath: string, entryTagName: string) => {
    const readStream = createReadStream(inputPath, { encoding: 'utf-8' });
    const transform = createXmlToJsonLines({
      entryTagName,
    });
    // `pipe()` does not forward errors; hand read failures to the transform
    // so they reach the promise returned by `streamToFile`.
    readStream.on('error', (error: Error) => {
      transform.destroy(error);
    });
    return streamToFile(readStream.pipe(transform), outputPath);
  };

  describe.skip('kanjidic', () => {
    it.skip('works', async () => {
      const readStream = await createDownloader({
        type: Kanjidic.SOURCE_ID,
      });

      await streamToFile(readStream, 'kanjidic2.xml');
    });

    it.skip('converts XML to JSON', () => convertXmlFile('kanjidic2.xml', 'kanjidic2.jsonl', 'character'));
  });

  describe.skip('jmdict', () => {
    it('works', async () => {
      const readStream = await createDownloader({
        type: Jmdict.SOURCE_ID,
      });

      await streamToFile(readStream, 'jmdict.xml');
    }, { timeout: 300000 });

    // NOTE(review): reads `jmdict.full.xml`, while the test above writes
    // `jmdict.xml` — confirm which file this conversion is meant to use.
    it('converts XML to JSON', () => convertXmlFile('jmdict.full.xml', 'jmdict.jsonl', 'entry'), { timeout: 300000 });
  });
});
@@ -3,7 +3,7 @@ | |||||
"include": ["src", "types", "test"], | "include": ["src", "types", "test"], | ||||
"compilerOptions": { | "compilerOptions": { | ||||
"module": "ESNext", | "module": "ESNext", | ||||
"lib": ["ESNext"], | |||||
"lib": ["ESNext", "DOM"], | |||||
"importHelpers": true, | "importHelpers": true, | ||||
"declaration": true, | "declaration": true, | ||||
"sourceMap": true, | "sourceMap": true, | ||||
@@ -3,7 +3,7 @@ | |||||
"include": ["src", "types"], | "include": ["src", "types"], | ||||
"compilerOptions": { | "compilerOptions": { | ||||
"module": "ESNext", | "module": "ESNext", | ||||
"lib": ["ESNext"], | |||||
"lib": ["ESNext", "DOM"], | |||||
"importHelpers": true, | "importHelpers": true, | ||||
"declaration": true, | "declaration": true, | ||||
"sourceMap": true, | "sourceMap": true, | ||||
@@ -1678,6 +1678,13 @@ fastq@^1.6.0: | |||||
dependencies: | dependencies: | ||||
reusify "^1.0.4" | reusify "^1.0.4" | ||||
fetch-ponyfill@^7.1.0: | |||||
version "7.1.0" | |||||
resolved "https://registry.yarnpkg.com/fetch-ponyfill/-/fetch-ponyfill-7.1.0.tgz#4266ed48b4e64663a50ab7f7fcb8e76f990526d0" | |||||
integrity sha512-FhbbL55dj/qdVO3YNK7ZEkshvj3eQ7EuIGV2I6ic/2YiocvyWv+7jg2s4AyS0wdRU75s3tA8ZxI/xPigb0v5Aw== | |||||
dependencies: | |||||
node-fetch "~2.6.1" | |||||
file-entry-cache@^6.0.1: | file-entry-cache@^6.0.1: | ||||
version "6.0.1" | version "6.0.1" | ||||
resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" | resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" | ||||
@@ -2474,6 +2481,13 @@ natural-compare@^1.4.0: | |||||
resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7" | resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7" | ||||
integrity sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw== | integrity sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw== | ||||
node-fetch@~2.6.1: | |||||
version "2.6.9" | |||||
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.9.tgz#7c7f744b5cc6eb5fd404e0c7a9fec630a55657e6" | |||||
integrity sha512-DJm/CJkZkRjKKj4Zi4BsKVZh3ValV5IR5s7LVZnW+6YMh0W1BfNA8XSs6DLMGYlId5F3KnA70uu2qepcR08Qqg== | |||||
dependencies: | |||||
whatwg-url "^5.0.0" | |||||
node-releases@^2.0.8: | node-releases@^2.0.8: | ||||
version "2.0.10" | version "2.0.10" | ||||
resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.10.tgz#c311ebae3b6a148c89b1813fd7c4d3c024ef537f" | resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.10.tgz#c311ebae3b6a148c89b1813fd7c4d3c024ef537f" | ||||
@@ -2919,6 +2933,11 @@ safe-regex-test@^1.0.0: | |||||
get-intrinsic "^1.1.3" | get-intrinsic "^1.1.3" | ||||
is-regex "^1.1.4" | is-regex "^1.1.4" | ||||
sax@^1.2.4: | |||||
version "1.2.4" | |||||
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" | |||||
integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== | |||||
semver@7.3.8: | semver@7.3.8: | ||||
version "7.3.8" | version "7.3.8" | ||||
resolved "https://registry.yarnpkg.com/semver/-/semver-7.3.8.tgz#07a78feafb3f7b32347d725e33de7e2a2df67798" | resolved "https://registry.yarnpkg.com/semver/-/semver-7.3.8.tgz#07a78feafb3f7b32347d725e33de7e2a2df67798" | ||||
@@ -3213,6 +3232,11 @@ to-regex-range@^5.0.1: | |||||
dependencies: | dependencies: | ||||
is-number "^7.0.0" | is-number "^7.0.0" | ||||
tr46@~0.0.3: | |||||
version "0.0.3" | |||||
resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" | |||||
integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw== | |||||
tsconfig-paths@^3.14.1: | tsconfig-paths@^3.14.1: | ||||
version "3.14.2" | version "3.14.2" | ||||
resolved "https://registry.yarnpkg.com/tsconfig-paths/-/tsconfig-paths-3.14.2.tgz#6e32f1f79412decd261f92d633a9dc1cfa99f088" | resolved "https://registry.yarnpkg.com/tsconfig-paths/-/tsconfig-paths-3.14.2.tgz#6e32f1f79412decd261f92d633a9dc1cfa99f088" | ||||
@@ -3395,6 +3419,19 @@ wcwidth@^1.0.1: | |||||
dependencies: | dependencies: | ||||
defaults "^1.0.3" | defaults "^1.0.3" | ||||
webidl-conversions@^3.0.0: | |||||
version "3.0.1" | |||||
resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871" | |||||
integrity sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ== | |||||
whatwg-url@^5.0.0: | |||||
version "5.0.0" | |||||
resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d" | |||||
integrity sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw== | |||||
dependencies: | |||||
tr46 "~0.0.3" | |||||
webidl-conversions "^3.0.0" | |||||
which-boxed-primitive@^1.0.2: | which-boxed-primitive@^1.0.2: | ||||
version "1.0.2" | version "1.0.2" | ||||
resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6" | resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6" | ||||
@@ -3496,6 +3533,13 @@ xdg-basedir@^4.0.0: | |||||
resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-4.0.0.tgz#4bc8d9984403696225ef83a1573cbbcb4e79db13" | resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-4.0.0.tgz#4bc8d9984403696225ef83a1573cbbcb4e79db13" | ||||
integrity sha512-PSNhEJDejZYV7h50BohL09Er9VaIefr2LMAf3OEmpCkjOi34eYyQYAXUTjEQtZJTKcF0E2UKTh+osDLsgNim9Q== | integrity sha512-PSNhEJDejZYV7h50BohL09Er9VaIefr2LMAf3OEmpCkjOi34eYyQYAXUTjEQtZJTKcF0E2UKTh+osDLsgNim9Q== | ||||
xml-js@^1.6.11: | |||||
version "1.6.11" | |||||
resolved "https://registry.yarnpkg.com/xml-js/-/xml-js-1.6.11.tgz#927d2f6947f7f1c19a316dd8eea3614e8b18f8e9" | |||||
integrity sha512-7rVi2KMfwfWFl+GpPg6m80IVMWXLRjO+PxTq7V2CDhoGak0wzYzFgUY2m4XJ47OGdXd8eLE8EmwfAmdjw7lC1g== | |||||
dependencies: | |||||
sax "^1.2.4" | |||||
xml-name-validator@^4.0.0: | xml-name-validator@^4.0.0: | ||||
version "4.0.0" | version "4.0.0" | ||||
resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz#79a006e2e63149a8600f15430f0a4725d1524835" | resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz#79a006e2e63149a8600f15430f0a4725d1524835" | ||||