Add most common sources for Japanese language datasets.master
@@ -45,5 +45,9 @@ | |||
"author": "TheoryOfNekomata <allan.crisostomo@outlook.com>", | |||
"publishConfig": { | |||
"access": "public" | |||
}, | |||
"dependencies": { | |||
"fetch-ponyfill": "^7.1.0", | |||
"xml-js": "^1.6.11" | |||
} | |||
} |
@@ -1,6 +1,30 @@ | |||
export default function add(a: number, b: number): number { | |||
if (process.env.NODE_ENV !== 'production') { | |||
console.log('This code would not appear on production builds'); | |||
import * as KanjidicImpl from './sources/kanjidic'; | |||
import * as JmdictImpl from './sources/jmdict'; | |||
/**
 * Source modules that `createDownloader` can dispatch to. Each module is looked up by its
 * `SOURCE_ID` and provides its own `createDownloader` implementation.
 */
const SUPPORTED_SOURCES = [KanjidicImpl, JmdictImpl] as const;
/**
 * Parameters accepted by the top-level `createDownloader`: the union of the parameter
 * types declared by each supported source module.
 */
export type CreateDownloaderParams =
  | KanjidicImpl.CreateDownloaderParams
  | JmdictImpl.CreateDownloaderParams;
export * as Kanjidic from './sources/kanjidic'; | |||
export * as Jmdict from './sources/jmdict'; | |||
export * from './streams'; | |||
/**
 * Creates a downloader for one of the supported dataset sources.
 *
 * The `type` field selects the source module whose `SOURCE_ID` matches; every other field is
 * forwarded untouched to that module's own `createDownloader`.
 *
 * @param params - Source selector (`type`) plus the options understood by that source.
 * @returns Whatever the selected source module's `createDownloader` returns.
 * @throws TypeError when `params.type` does not match any supported source.
 */
export const createDownloader = (params: CreateDownloaderParams) => {
  const { type: sourceType, ...etcParams } = params;
  const theSourceModule = SUPPORTED_SOURCES
    .find((sourceModule) => sourceModule.SOURCE_ID === sourceType);

  if (!theSourceModule) {
    // List the accepted identifiers so the caller can correct the value without reading the code.
    const validSourceTypes = SUPPORTED_SOURCES.map((sourceModule) => sourceModule.SOURCE_ID).join(', ');
    throw new TypeError(`Invalid source type: "${sourceType}". Valid values are: ${validSourceTypes}`);
  }

  return theSourceModule.createDownloader(etcParams);
};
@@ -0,0 +1 @@ | |||
export const SOURCE_ID = 'jmdict' as const; |
@@ -0,0 +1,25 @@ | |||
import fetchPonyfill from 'fetch-ponyfill'; | |||
import { PassThrough } from 'stream'; | |||
import { createGunzip } from 'zlib'; | |||
import { SOURCE_ID } from './common'; | |||
/** Options for downloading the JMdict dataset. */
export interface CreateDownloaderParams {
  /** Discriminator selecting this source; always this module's `SOURCE_ID`. */
  type: typeof SOURCE_ID;

  /** Location of the gzip-compressed dataset. When omitted, the downloader's default URL is used. */
  url?: string;
}
const DEFAULT_SOURCE_URL = 'http://ftp.edrdg.org/pub/Nihongo/JMdict.gz' as const; | |||
/**
 * Downloads the gzip-compressed JMdict archive and returns a stream of its decompressed bytes.
 *
 * @param params - Downloader options; `url` overrides the default download location.
 * @returns A gunzip stream yielding the decompressed content of the response body.
 * @throws Error when the response status is not OK or the response carries no body.
 */
export const createDownloader = async (params: Omit<CreateDownloaderParams, 'type'>) => {
  const { url = DEFAULT_SOURCE_URL } = params;
  const { fetch } = fetchPonyfill();
  const response = await fetch(url);

  if (!response.ok) {
    throw new Error(`Failed to download: ${url}`);
  }

  if (!response.body) {
    throw new Error(`Empty response body: ${url}`);
  }

  // The DOM typings describe `body` as a WHATWG ReadableStream, but this code relies on it being
  // a Node.js stream (fetch-ponyfill resolves to node-fetch here), hence the double assertion.
  const rawStream = response.body as unknown as PassThrough;
  const gunzip = createGunzip();

  // `pipe()` does not forward errors from the source, so without this a connection failure
  // mid-download would leave consumers of the returned stream waiting forever.
  rawStream.once('error', (error: Error) => {
    gunzip.destroy(error);
  });

  return rawStream.pipe(gunzip);
};
@@ -0,0 +1,2 @@ | |||
export * from './common'; | |||
export * from './downloader'; |
@@ -0,0 +1 @@ | |||
export const SOURCE_ID = 'kanjidic' as const; |
@@ -0,0 +1,25 @@ | |||
import fetchPonyfill from 'fetch-ponyfill'; | |||
import { PassThrough } from 'stream'; | |||
import { createGunzip } from 'zlib'; | |||
import { SOURCE_ID } from './common'; | |||
/** Options for downloading the KANJIDIC dataset. */
export interface CreateDownloaderParams {
  /** Discriminator selecting this source; always this module's `SOURCE_ID`. */
  type: typeof SOURCE_ID;

  /** Location of the gzip-compressed dataset. When omitted, the downloader's default URL is used. */
  url?: string;
}
const DEFAULT_SOURCE_URL = 'http://www.edrdg.org/kanjidic/kanjidic2.xml.gz' as const; | |||
/**
 * Downloads the gzip-compressed KANJIDIC archive and returns a stream of its decompressed bytes.
 *
 * @param params - Downloader options; `url` overrides the default download location.
 * @returns A gunzip stream yielding the decompressed content of the response body.
 * @throws Error when the response status is not OK or the response carries no body.
 */
export const createDownloader = async (params: Omit<CreateDownloaderParams, 'type'>) => {
  const { url = DEFAULT_SOURCE_URL } = params;
  const { fetch } = fetchPonyfill();
  const response = await fetch(url);

  if (!response.ok) {
    throw new Error(`Failed to download: ${url}`);
  }

  if (!response.body) {
    throw new Error(`Empty response body: ${url}`);
  }

  // The DOM typings describe `body` as a WHATWG ReadableStream, but this code relies on it being
  // a Node.js stream (fetch-ponyfill resolves to node-fetch here), hence the double assertion.
  const rawStream = response.body as unknown as PassThrough;
  const gunzip = createGunzip();

  // `pipe()` does not forward errors from the source, so without this a connection failure
  // mid-download would leave consumers of the returned stream waiting forever.
  rawStream.once('error', (error: Error) => {
    gunzip.destroy(error);
  });

  return rawStream.pipe(gunzip);
};
@@ -0,0 +1,3 @@ | |||
export * from './common'; | |||
export * from './downloader'; | |||
@@ -0,0 +1,50 @@ | |||
import { Transform, TransformCallback } from 'stream'; | |||
import { xml2json } from 'xml-js'; | |||
/** Options for the XML → JSON Lines transform stream. */
export interface XmlToJsonLinesTransformStreamOptions {
  /** Name of the repeated element to extract, without angle brackets (e.g. `entry`). */
  entryTagName: string;
}

/**
 * Transform stream that cuts an XML byte stream into its repeated entry elements and emits each
 * one as a single line of compact JSON (JSON Lines).
 *
 * Entries are located by plain text search for `<entryTagName>` … `</entryTagName>`; everything
 * outside those elements (prolog, DTD, root element, whitespace between entries) is discarded.
 * Opening tags that carry attributes and nested elements with the same name are not supported.
 */
class XmlToJsonLinesTransformStream extends Transform {
  /** The five entities every XML parser knows; these must reach the parser untouched. */
  private static readonly PREDEFINED_ENTITIES = new Set(['amp', 'lt', 'gt', 'quot', 'apos']);

  /** Matches named entity references only; numeric references (`&#…;`) do not match. */
  private static readonly NAMED_ENTITY = /&([A-Za-z_:][\w.:-]*);/g;

  /** Decoded text received so far that has not yet been emitted as a complete entry. */
  private charactersToParse = '';

  /** Streaming decoder: holds back the bytes of a character split across two chunks. */
  private readonly decoder: TextDecoder;

  private readonly openTag: string;

  private readonly closeTag: string;

  constructor(private readonly options: XmlToJsonLinesTransformStreamOptions) {
    super();
    this.decoder = new TextDecoder('utf-8');
    this.openTag = `<${this.options.entryTagName}>`;
    this.closeTag = `</${this.options.entryTagName}>`;
  }

  /**
   * Replaces references to custom (DTD-declared) entities with the bare entity name, e.g.
   * `&n;` becomes `n`. Each entry is parsed on its own, without the document's DTD, so such
   * references could not be resolved by the parser. Predefined and numeric references are kept.
   */
  private static unwrapCustomEntities(xml: string): string {
    return xml.replace(
      XmlToJsonLinesTransformStream.NAMED_ENTITY,
      (reference: string, name: string) => (
        XmlToJsonLinesTransformStream.PREDEFINED_ENTITIES.has(name) ? reference : name
      ),
    );
  }

  /**
   * Pushes one JSON line for every complete entry found in `text`.
   *
   * @returns The unconsumed tail of `text` that must be kept for the next chunk.
   */
  private emitCompleteEntries(text: string): string {
    let cursor = 0;
    let openIndex = text.indexOf(this.openTag, cursor);

    while (openIndex !== -1) {
      // Search for the close tag only after the open tag, never before it.
      const closeIndex = text.indexOf(this.closeTag, openIndex + this.openTag.length);
      if (closeIndex === -1) {
        // The entry is still incomplete: keep it whole and wait for more data.
        return text.slice(openIndex);
      }

      const entryEnd = closeIndex + this.closeTag.length;
      const xml = XmlToJsonLinesTransformStream.unwrapCustomEntities(
        text.slice(openIndex, entryEnd),
      );
      const json = xml2json(xml, { compact: true });
      this.push(`${json}\n`);

      cursor = entryEnd;
      openIndex = text.indexOf(this.openTag, cursor);
    }

    // No further open tag. Only the last few characters can still be the beginning of an open
    // tag split across chunks, so everything before them is dropped to keep the buffer small.
    const keepFrom = Math.max(cursor, text.length - (this.openTag.length - 1));
    return text.slice(keepFrom);
  }

  // eslint-disable-next-line no-underscore-dangle
  _transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback) {
    try {
      this.charactersToParse += this.decoder.decode(chunk, { stream: true });
      this.charactersToParse = this.emitCompleteEntries(this.charactersToParse);
      callback();
    } catch (error) {
      // Report parser failures through the stream instead of throwing out of the write call.
      callback(error instanceof Error ? error : new Error(String(error)));
    }
  }
}
/**
 * Creates a transform stream that converts an XML byte stream into JSON Lines, emitting one
 * line of compact JSON per entry element.
 *
 * @param options - Stream options; by default entries are the `<entry>` elements.
 * @returns A new transform stream instance.
 */
export const createXmlToJsonLines = (
  // Annotated rather than asserted so the default literal is checked against the options type.
  options: XmlToJsonLinesTransformStreamOptions = { entryTagName: 'entry' },
) => new XmlToJsonLinesTransformStream(options);
@@ -1,8 +1,73 @@ | |||
import { createReadStream, createWriteStream } from 'fs'; | |||
import { describe, it, expect } from 'vitest'; | |||
import add from '../src'; | |||
import { createDownloader, Kanjidic, Jmdict, createXmlToJsonLines } from '../src'; | |||
describe('blah', () => { | |||
it('works', () => { | |||
expect(add(1, 1)).toEqual(2); | |||
// Manual integration checks: they hit the network and write files into the working directory,
// which is why the suites are skipped. Every promise rejects on a stream error so that a failure
// is reported immediately instead of surfacing as a timeout.
describe('downloader', () => {
  describe.skip('kanjidic', () => {
    it.skip('works', async () => {
      const readStream = await createDownloader({
        type: Kanjidic.SOURCE_ID,
      });

      return new Promise<void>((resolve, reject) => {
        const out = createWriteStream('kanjidic2.xml');
        out.on('finish', () => {
          resolve();
        });
        out.on('error', reject);
        readStream.on('error', reject);
        readStream.pipe(out);
      });
    });

    it.skip('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const readStream = createReadStream('kanjidic2.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'character',
      });
      const out = createWriteStream('kanjidic2.jsonl');
      out.on('finish', () => {
        resolve();
      });
      out.on('error', reject);
      transform.on('error', reject);
      readStream.on('error', reject);
      readStream
        .pipe(transform)
        .pipe(out);
    }));
  });

  describe.skip('jmdict', () => {
    it('works', async () => {
      const readStream = await createDownloader({
        type: Jmdict.SOURCE_ID,
      });

      return new Promise<void>((resolve, reject) => {
        const out = createWriteStream('jmdict.xml');
        out.on('finish', () => {
          resolve();
        });
        out.on('error', reject);
        readStream.on('error', reject);
        readStream.pipe(out);
      });
    }, { timeout: 300000 });

    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      // NOTE(review): reads 'jmdict.full.xml' while the download test above writes 'jmdict.xml';
      // confirm whether this input file is meant to be provided separately.
      const readStream = createReadStream('jmdict.full.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'entry',
      });
      const out = createWriteStream('jmdict.jsonl');
      out.on('finish', () => {
        resolve();
      });
      out.on('error', reject);
      transform.on('error', reject);
      readStream.on('error', reject);
      readStream
        .pipe(transform)
        .pipe(out);
    }), { timeout: 300000 });
  });
});
@@ -3,7 +3,7 @@ | |||
"include": ["src", "types", "test"], | |||
"compilerOptions": { | |||
"module": "ESNext", | |||
"lib": ["ESNext"], | |||
"lib": ["ESNext", "DOM"], | |||
"importHelpers": true, | |||
"declaration": true, | |||
"sourceMap": true, | |||
@@ -3,7 +3,7 @@ | |||
"include": ["src", "types"], | |||
"compilerOptions": { | |||
"module": "ESNext", | |||
"lib": ["ESNext"], | |||
"lib": ["ESNext", "DOM"], | |||
"importHelpers": true, | |||
"declaration": true, | |||
"sourceMap": true, | |||
@@ -1678,6 +1678,13 @@ fastq@^1.6.0: | |||
dependencies: | |||
reusify "^1.0.4" | |||
fetch-ponyfill@^7.1.0: | |||
version "7.1.0" | |||
resolved "https://registry.yarnpkg.com/fetch-ponyfill/-/fetch-ponyfill-7.1.0.tgz#4266ed48b4e64663a50ab7f7fcb8e76f990526d0" | |||
integrity sha512-FhbbL55dj/qdVO3YNK7ZEkshvj3eQ7EuIGV2I6ic/2YiocvyWv+7jg2s4AyS0wdRU75s3tA8ZxI/xPigb0v5Aw== | |||
dependencies: | |||
node-fetch "~2.6.1" | |||
file-entry-cache@^6.0.1: | |||
version "6.0.1" | |||
resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" | |||
@@ -2474,6 +2481,13 @@ natural-compare@^1.4.0: | |||
resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7" | |||
integrity sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw== | |||
node-fetch@~2.6.1: | |||
version "2.6.9" | |||
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.9.tgz#7c7f744b5cc6eb5fd404e0c7a9fec630a55657e6" | |||
integrity sha512-DJm/CJkZkRjKKj4Zi4BsKVZh3ValV5IR5s7LVZnW+6YMh0W1BfNA8XSs6DLMGYlId5F3KnA70uu2qepcR08Qqg== | |||
dependencies: | |||
whatwg-url "^5.0.0" | |||
node-releases@^2.0.8: | |||
version "2.0.10" | |||
resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.10.tgz#c311ebae3b6a148c89b1813fd7c4d3c024ef537f" | |||
@@ -2919,6 +2933,11 @@ safe-regex-test@^1.0.0: | |||
get-intrinsic "^1.1.3" | |||
is-regex "^1.1.4" | |||
sax@^1.2.4: | |||
version "1.2.4" | |||
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" | |||
integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== | |||
semver@7.3.8: | |||
version "7.3.8" | |||
resolved "https://registry.yarnpkg.com/semver/-/semver-7.3.8.tgz#07a78feafb3f7b32347d725e33de7e2a2df67798" | |||
@@ -3213,6 +3232,11 @@ to-regex-range@^5.0.1: | |||
dependencies: | |||
is-number "^7.0.0" | |||
tr46@~0.0.3: | |||
version "0.0.3" | |||
resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" | |||
integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw== | |||
tsconfig-paths@^3.14.1: | |||
version "3.14.2" | |||
resolved "https://registry.yarnpkg.com/tsconfig-paths/-/tsconfig-paths-3.14.2.tgz#6e32f1f79412decd261f92d633a9dc1cfa99f088" | |||
@@ -3395,6 +3419,19 @@ wcwidth@^1.0.1: | |||
dependencies: | |||
defaults "^1.0.3" | |||
webidl-conversions@^3.0.0: | |||
version "3.0.1" | |||
resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871" | |||
integrity sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ== | |||
whatwg-url@^5.0.0: | |||
version "5.0.0" | |||
resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d" | |||
integrity sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw== | |||
dependencies: | |||
tr46 "~0.0.3" | |||
webidl-conversions "^3.0.0" | |||
which-boxed-primitive@^1.0.2: | |||
version "1.0.2" | |||
resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6" | |||
@@ -3496,6 +3533,13 @@ xdg-basedir@^4.0.0: | |||
resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-4.0.0.tgz#4bc8d9984403696225ef83a1573cbbcb4e79db13" | |||
integrity sha512-PSNhEJDejZYV7h50BohL09Er9VaIefr2LMAf3OEmpCkjOi34eYyQYAXUTjEQtZJTKcF0E2UKTh+osDLsgNim9Q== | |||
xml-js@^1.6.11: | |||
version "1.6.11" | |||
resolved "https://registry.yarnpkg.com/xml-js/-/xml-js-1.6.11.tgz#927d2f6947f7f1c19a316dd8eea3614e8b18f8e9" | |||
integrity sha512-7rVi2KMfwfWFl+GpPg6m80IVMWXLRjO+PxTq7V2CDhoGak0wzYzFgUY2m4XJ47OGdXd8eLE8EmwfAmdjw7lC1g== | |||
dependencies: | |||
sax "^1.2.4" | |||
xml-name-validator@^4.0.0: | |||
version "4.0.0" | |||
resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz#79a006e2e63149a8600f15430f0a4725d1524835" | |||