Browse Source

Implement JMdict and Kanjidic sources

Add most common sources for Japanese language datasets.
master
TheoryOfNekomata 1 year ago
parent
commit
9e1355f746
13 changed files with 255 additions and 11 deletions
  1. +4
    -0
      package.json
  2. +29
    -5
      src/index.ts
  3. +1
    -0
      src/sources/jmdict/common.ts
  4. +25
    -0
      src/sources/jmdict/downloader.ts
  5. +2
    -0
      src/sources/jmdict/index.ts
  6. +1
    -0
      src/sources/kanjidic/common.ts
  7. +25
    -0
      src/sources/kanjidic/downloader.ts
  8. +3
    -0
      src/sources/kanjidic/index.ts
  9. +50
    -0
      src/streams.ts
  10. +69
    -4
      test/index.test.ts
  11. +1
    -1
      tsconfig.eslint.json
  12. +1
    -1
      tsconfig.json
  13. +44
    -0
      yarn.lock

+ 4
- 0
package.json View File

@@ -45,5 +45,9 @@
"author": "TheoryOfNekomata <allan.crisostomo@outlook.com>",
"publishConfig": {
"access": "public"
},
"dependencies": {
"fetch-ponyfill": "^7.1.0",
"xml-js": "^1.6.11"
}
}

+ 29
- 5
src/index.ts View File

@@ -1,6 +1,30 @@
export default function add(a: number, b: number): number {
if (process.env.NODE_ENV !== 'production') {
console.log('This code would not appear on production builds');
import * as KanjidicImpl from './sources/kanjidic';
import * as JmdictImpl from './sources/jmdict';

/**
 * Source modules that `createDownloader` can dispatch to.
 * Each module is looked up by its `SOURCE_ID` and asked for its own `createDownloader`.
 */
const SUPPORTED_SOURCES = [
KanjidicImpl,
JmdictImpl,
] as const;

/**
 * Union of the downloader parameter shapes of every supported source.
 * The `type` property of each member is that source's `SOURCE_ID` literal.
 */
export type CreateDownloaderParams = (
KanjidicImpl.CreateDownloaderParams
| JmdictImpl.CreateDownloaderParams
);

export * as Kanjidic from './sources/kanjidic';
export * as Jmdict from './sources/jmdict';
export * from './streams';

/**
 * Creates a downloader for the source selected by `params.type`.
 *
 * The `type` property is stripped and the remaining properties are forwarded to the
 * `createDownloader` function of the matching source module.
 *
 * @param params - Source selector (`type`) plus that source's downloader options.
 * @returns Whatever the selected source module's `createDownloader` returns.
 * @throws TypeError when `params.type` matches none of the supported `SOURCE_ID` values.
 */
export const createDownloader = (params: CreateDownloaderParams) => {
// Separate the discriminator from the options handed to the source module.
const { type: sourceType, ...etcParams } = params;

// Pick the module whose SOURCE_ID equals the requested type.
const theSourceModule = SUPPORTED_SOURCES
.find((videoTypeModule) => videoTypeModule.SOURCE_ID === sourceType);

if (!theSourceModule) {
// List every valid identifier in the message so the caller can correct the value.
const validSourceTypes = SUPPORTED_SOURCES.map((videoTypeModule) => videoTypeModule.SOURCE_ID).join(', ');
throw new TypeError(`Invalid source type: "${sourceType}". Valid values are: ${validSourceTypes}`);
}
// NOTE(review): the next two lines look like removed-side residue of the old `add`
// function left in by the diff rendering — confirm they are absent from the real file.
return a + b;
}

return theSourceModule.createDownloader(etcParams);
};

+ 1
- 0
src/sources/jmdict/common.ts View File

@@ -0,0 +1 @@
/** String identifier of the JMdict source; used as the literal type of the downloader's `type` parameter. */
export const SOURCE_ID = 'jmdict' as const;

+ 25
- 0
src/sources/jmdict/downloader.ts View File

@@ -0,0 +1,25 @@
import fetchPonyfill from 'fetch-ponyfill';
import { PassThrough } from 'stream';
import { createGunzip } from 'zlib';
import { SOURCE_ID } from './common';

/** Parameters describing a JMdict download request. */
export interface CreateDownloaderParams {
/** Discriminator for this source; always the JMdict `SOURCE_ID` literal. */
type: typeof SOURCE_ID;
/** Optional download location; `createDownloader` falls back to `DEFAULT_SOURCE_URL` when omitted. */
url?: string;
}

/** Location of the gzipped JMdict file that is fetched when the caller supplies no `url`. */
const DEFAULT_SOURCE_URL = 'http://ftp.edrdg.org/pub/Nihongo/JMdict.gz' as const;

/**
 * Fetches the gzipped JMdict file and returns its contents as a decompressed stream.
 *
 * @param params - Downloader options without the `type` discriminator; `url` overrides
 *   the default download location.
 * @returns A promise for the gunzip stream that the response body is piped into.
 * @throws Error when the HTTP response is not OK.
 */
export const createDownloader = async (params: Omit<CreateDownloaderParams, 'type'>) => {
  const { url = DEFAULT_SOURCE_URL } = params;
  const { fetch } = fetchPonyfill();

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download: ${url}`);
  }

  // NOTE(review): the cast assumes the ponyfill hands back a Node.js stream as the
  // response body (true for node-fetch on Node) — confirm for other runtimes.
  const rawStream = response.body as unknown as PassThrough;
  const gunzip = createGunzip();

  // `.pipe()` does not forward errors from the source. Without this, a connection
  // that drops mid-download would leave the returned stream pending forever.
  rawStream.once('error', (error: Error) => {
    gunzip.destroy(error);
  });

  return rawStream.pipe(gunzip);
};

+ 2
- 0
src/sources/jmdict/index.ts View File

@@ -0,0 +1,2 @@
export * from './common';
export * from './downloader';

+ 1
- 0
src/sources/kanjidic/common.ts View File

@@ -0,0 +1 @@
/** String identifier of the KANJIDIC source; used as the literal type of the downloader's `type` parameter. */
export const SOURCE_ID = 'kanjidic' as const;

+ 25
- 0
src/sources/kanjidic/downloader.ts View File

@@ -0,0 +1,25 @@
import fetchPonyfill from 'fetch-ponyfill';
import { PassThrough } from 'stream';
import { createGunzip } from 'zlib';
import { SOURCE_ID } from './common';

/** Parameters describing a KANJIDIC download request. */
export interface CreateDownloaderParams {
/** Discriminator for this source; always the KANJIDIC `SOURCE_ID` literal. */
type: typeof SOURCE_ID;
/** Optional download location; `createDownloader` falls back to `DEFAULT_SOURCE_URL` when omitted. */
url?: string;
}

/** Location of the gzipped KANJIDIC2 XML file that is fetched when the caller supplies no `url`. */
const DEFAULT_SOURCE_URL = 'http://www.edrdg.org/kanjidic/kanjidic2.xml.gz' as const;

/**
 * Fetches the gzipped KANJIDIC2 file and returns its contents as a decompressed stream.
 *
 * @param params - Downloader options without the `type` discriminator; `url` overrides
 *   the default download location.
 * @returns A promise for the gunzip stream that the response body is piped into.
 * @throws Error when the HTTP response is not OK.
 */
export const createDownloader = async (params: Omit<CreateDownloaderParams, 'type'>) => {
  const { url = DEFAULT_SOURCE_URL } = params;
  const { fetch } = fetchPonyfill();

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download: ${url}`);
  }

  // NOTE(review): the cast assumes the ponyfill hands back a Node.js stream as the
  // response body (true for node-fetch on Node) — confirm for other runtimes.
  const rawStream = response.body as unknown as PassThrough;
  const gunzip = createGunzip();

  // `.pipe()` does not forward errors from the source. Without this, a connection
  // that drops mid-download would leave the returned stream pending forever.
  rawStream.once('error', (error: Error) => {
    gunzip.destroy(error);
  });

  return rawStream.pipe(gunzip);
};

+ 3
- 0
src/sources/kanjidic/index.ts View File

@@ -0,0 +1,3 @@
export * from './common';
export * from './downloader';


+ 50
- 0
src/streams.ts View File

@@ -0,0 +1,50 @@
import { Transform, TransformCallback } from 'stream';
import { xml2json } from 'xml-js';

/** Options for the XML-to-JSON-lines transform stream. */
export interface XmlToJsonLinesTransformStreamOptions {
/** Element name, without angle brackets, whose `<name>…</name>` spans are converted one at a time. */
entryTagName: string;
}

/**
 * Transform stream that scans incoming XML text for `<entryTagName>…</entryTagName>`
 * spans and emits each one as a line of compact JSON (one `xml2json` result per line).
 *
 * Text outside the entry elements (XML declaration, DOCTYPE, root tags, whitespace
 * between entries) is discarded. Entries may be split across chunks at any position,
 * including in the middle of a tag or of a multi-byte UTF-8 character.
 */
class XmlToJsonLinesTransformStream extends Transform {
  /** Entity names defined by XML itself; the parser resolves them, so they must stay intact. */
  private static readonly PREDEFINED_ENTITIES: readonly string[] = ['amp', 'lt', 'gt', 'quot', 'apos'];

  /** Matches a single `&name;` / `&#…;` reference without spanning markup or whitespace. */
  private static readonly ENTITY_REFERENCE = /&([^\s;&<>]+);/g;

  /** Streaming decoder, so a character split across two chunks is not corrupted. */
  private readonly decoder = new TextDecoder('utf-8', { ignoreBOM: true });

  /** Decoded text that has been received but not yet emitted as a complete entry. */
  private charactersToParse = '';

  constructor(private readonly options: XmlToJsonLinesTransformStreamOptions) {
    super();
  }

  /**
   * Replaces custom named entity references (e.g. the ones a dictionary DTD declares)
   * with their bare name, because the entry is parsed without its DTD and the parser
   * would otherwise reject them. Predefined XML entities and numeric character
   * references are left untouched so the parser can decode them to the right characters.
   */
  private static unwrapCustomEntities(xml: string): string {
    return xml.replace(
      XmlToJsonLinesTransformStream.ENTITY_REFERENCE,
      (reference: string, name: string) => (
        name.startsWith('#') || XmlToJsonLinesTransformStream.PREDEFINED_ENTITIES.includes(name)
          ? reference
          : name
      ),
    );
  }

  /**
   * Emits every complete entry currently buffered and keeps only the text that may
   * still belong to an entry that has not fully arrived.
   */
  private pushCompleteEntries(): void {
    const openTag = `<${this.options.entryTagName}>`;
    const closeTag = `</${this.options.entryTagName}>`;
    const buffered = this.charactersToParse;
    let consumed = 0;

    for (;;) {
      const openIndex = buffered.indexOf(openTag, consumed);
      if (openIndex === -1) {
        // Nothing here starts an entry. Drop the text, but keep a tail short enough
        // to hold an opening tag that was cut in half by the chunk boundary.
        consumed = Math.max(consumed, buffered.length - (openTag.length - 1));
        break;
      }

      const closeIndex = buffered.indexOf(closeTag, openIndex + openTag.length);
      if (closeIndex === -1) {
        // The entry is still incomplete: keep it from its opening tag onwards.
        consumed = openIndex;
        break;
      }

      const entryEnd = closeIndex + closeTag.length;
      const xml = XmlToJsonLinesTransformStream.unwrapCustomEntities(
        buffered.slice(openIndex, entryEnd),
      );
      this.push(`${xml2json(xml, { compact: true })}\n`);
      consumed = entryEnd;
    }

    this.charactersToParse = buffered.slice(consumed);
  }

  /**
   * Decodes the chunk, appends it to the pending text and emits the entries that
   * became complete. Parser failures are reported through `callback` so they surface
   * as a stream `error` event instead of an exception thrown from inside the pipe.
   */
  // eslint-disable-next-line no-underscore-dangle
  _transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback) {
    try {
      this.charactersToParse += this.decoder.decode(chunk, { stream: true });
      this.pushCompleteEntries();
    } catch (error) {
      callback(error instanceof Error ? error : new Error(String(error)));
      return;
    }

    callback();
  }

  /**
   * Flushes bytes still held by the decoder and emits any entry they complete.
   * A trailing entry that never received its closing tag is dropped.
   */
  // eslint-disable-next-line no-underscore-dangle
  _flush(callback: TransformCallback) {
    try {
      this.charactersToParse += this.decoder.decode();
      this.pushCompleteEntries();
    } catch (error) {
      callback(error instanceof Error ? error : new Error(String(error)));
      return;
    }

    callback();
  }
}

/**
 * Creates a transform stream that turns XML text into JSON lines, one line per
 * `entryTagName` element.
 *
 * @param options - Stream options; when omitted, elements named `entry` are converted.
 * @returns A new `XmlToJsonLinesTransformStream` configured with `options`.
 */
export const createXmlToJsonLines = (
  options: XmlToJsonLinesTransformStreamOptions = { entryTagName: 'entry' },
) => new XmlToJsonLinesTransformStream(options);

+ 69
- 4
test/index.test.ts View File

@@ -1,8 +1,73 @@
import { createReadStream, createWriteStream } from 'fs';
import { describe, it, expect } from 'vitest';
import add from '../src';
import { createDownloader, Kanjidic, Jmdict, createXmlToJsonLines } from '../src';

describe('blah', () => {
it('works', () => {
expect(add(1, 1)).toEqual(2);
describe('downloader', () => {
  /**
   * Resolves when `out` has flushed everything (`finish`) and rejects on the first
   * `error` emitted by `out` or by any stream in `upstream`. Without the rejection,
   * an I/O failure would only surface as a test timeout.
   */
  const settleOnFinish = (
    out: NodeJS.WritableStream,
    upstream: ReadonlyArray<NodeJS.ReadableStream | NodeJS.WritableStream>,
  ): Promise<void> => new Promise<void>((resolve, reject) => {
    [...upstream, out].forEach((stream) => {
      stream.once('error', reject);
    });
    out.once('finish', () => {
      resolve();
    });
  });

  describe.skip('kanjidic', () => {
    it.skip('works', async () => {
      const readStream = await createDownloader({
        type: Kanjidic.SOURCE_ID,
      });
      const out = createWriteStream('kanjidic2.xml');
      const done = settleOnFinish(out, [readStream]);

      readStream.pipe(out);
      return done;
    });

    it.skip('converts XML to JSON', () => {
      const readStream = createReadStream('kanjidic2.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'character',
      });
      const out = createWriteStream('kanjidic2.jsonl');
      const done = settleOnFinish(out, [readStream, transform]);

      readStream
        .pipe(transform)
        .pipe(out);
      return done;
    });
  });

  describe.skip('jmdict', () => {
    it('works', async () => {
      const readStream = await createDownloader({
        type: Jmdict.SOURCE_ID,
      });
      const out = createWriteStream('jmdict.xml');
      const done = settleOnFinish(out, [readStream]);

      readStream.pipe(out);
      return done;
    }, { timeout: 300000 });

    it('converts XML to JSON', () => {
      // NOTE(review): this reads 'jmdict.full.xml' while the download test above
      // writes 'jmdict.xml' — confirm which file is intended.
      const readStream = createReadStream('jmdict.full.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'entry',
      });
      const out = createWriteStream('jmdict.jsonl');
      const done = settleOnFinish(out, [readStream, transform]);

      readStream
        .pipe(transform)
        .pipe(out);
      return done;
    }, { timeout: 300000 });
  });
});

+ 1
- 1
tsconfig.eslint.json View File

@@ -3,7 +3,7 @@
"include": ["src", "types", "test"],
"compilerOptions": {
"module": "ESNext",
"lib": ["ESNext"],
"lib": ["ESNext", "DOM"],
"importHelpers": true,
"declaration": true,
"sourceMap": true,


+ 1
- 1
tsconfig.json View File

@@ -3,7 +3,7 @@
"include": ["src", "types"],
"compilerOptions": {
"module": "ESNext",
"lib": ["ESNext"],
"lib": ["ESNext", "DOM"],
"importHelpers": true,
"declaration": true,
"sourceMap": true,


+ 44
- 0
yarn.lock View File

@@ -1678,6 +1678,13 @@ fastq@^1.6.0:
dependencies:
reusify "^1.0.4"
fetch-ponyfill@^7.1.0:
version "7.1.0"
resolved "https://registry.yarnpkg.com/fetch-ponyfill/-/fetch-ponyfill-7.1.0.tgz#4266ed48b4e64663a50ab7f7fcb8e76f990526d0"
integrity sha512-FhbbL55dj/qdVO3YNK7ZEkshvj3eQ7EuIGV2I6ic/2YiocvyWv+7jg2s4AyS0wdRU75s3tA8ZxI/xPigb0v5Aw==
dependencies:
node-fetch "~2.6.1"
file-entry-cache@^6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027"
@@ -2474,6 +2481,13 @@ natural-compare@^1.4.0:
resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7"
integrity sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==
node-fetch@~2.6.1:
version "2.6.9"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.9.tgz#7c7f744b5cc6eb5fd404e0c7a9fec630a55657e6"
integrity sha512-DJm/CJkZkRjKKj4Zi4BsKVZh3ValV5IR5s7LVZnW+6YMh0W1BfNA8XSs6DLMGYlId5F3KnA70uu2qepcR08Qqg==
dependencies:
whatwg-url "^5.0.0"
node-releases@^2.0.8:
version "2.0.10"
resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.10.tgz#c311ebae3b6a148c89b1813fd7c4d3c024ef537f"
@@ -2919,6 +2933,11 @@ safe-regex-test@^1.0.0:
get-intrinsic "^1.1.3"
is-regex "^1.1.4"
sax@^1.2.4:
version "1.2.4"
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"
integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==
semver@7.3.8:
version "7.3.8"
resolved "https://registry.yarnpkg.com/semver/-/semver-7.3.8.tgz#07a78feafb3f7b32347d725e33de7e2a2df67798"
@@ -3213,6 +3232,11 @@ to-regex-range@^5.0.1:
dependencies:
is-number "^7.0.0"
tr46@~0.0.3:
version "0.0.3"
resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a"
integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==
tsconfig-paths@^3.14.1:
version "3.14.2"
resolved "https://registry.yarnpkg.com/tsconfig-paths/-/tsconfig-paths-3.14.2.tgz#6e32f1f79412decd261f92d633a9dc1cfa99f088"
@@ -3395,6 +3419,19 @@ wcwidth@^1.0.1:
dependencies:
defaults "^1.0.3"
webidl-conversions@^3.0.0:
version "3.0.1"
resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-3.0.1.tgz#24534275e2a7bc6be7bc86611cc16ae0a5654871"
integrity sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==
whatwg-url@^5.0.0:
version "5.0.0"
resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d"
integrity sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==
dependencies:
tr46 "~0.0.3"
webidl-conversions "^3.0.0"
which-boxed-primitive@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz#13757bc89b209b049fe5d86430e21cf40a89a8e6"
@@ -3496,6 +3533,13 @@ xdg-basedir@^4.0.0:
resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-4.0.0.tgz#4bc8d9984403696225ef83a1573cbbcb4e79db13"
integrity sha512-PSNhEJDejZYV7h50BohL09Er9VaIefr2LMAf3OEmpCkjOi34eYyQYAXUTjEQtZJTKcF0E2UKTh+osDLsgNim9Q==
xml-js@^1.6.11:
version "1.6.11"
resolved "https://registry.yarnpkg.com/xml-js/-/xml-js-1.6.11.tgz#927d2f6947f7f1c19a316dd8eea3614e8b18f8e9"
integrity sha512-7rVi2KMfwfWFl+GpPg6m80IVMWXLRjO+PxTq7V2CDhoGak0wzYzFgUY2m4XJ47OGdXd8eLE8EmwfAmdjw7lC1g==
dependencies:
sax "^1.2.4"
xml-name-validator@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz#79a006e2e63149a8600f15430f0a4725d1524835"


Loading…
Cancel
Save