diff --git a/package.json b/package.json index ae91788..7424586 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "name": "murasaki-core", + "name": "@modal-sh/murasaki-core", "version": "0.0.0", "files": [ "dist", @@ -49,5 +49,22 @@ "dependencies": { "fetch-ponyfill": "^7.1.0", "xml-js": "^1.6.11" + }, + "types": "./dist/types/index.d.ts", + "main": "./dist/cjs/production/index.js", + "module": "./dist/esm/production/index.js", + "exports": { + ".": { + "development": { + "require": "./dist/cjs/development/index.js", + "import": "./dist/esm/development/index.js" + }, + "require": "./dist/cjs/production/index.js", + "import": "./dist/esm/production/index.js", + "types": "./dist/types/index.d.ts" + } + }, + "typesVersions": { + "*": {} } } diff --git a/src/streams.ts b/src/streams.ts index 7d554c2..558bf73 100644 --- a/src/streams.ts +++ b/src/streams.ts @@ -15,33 +15,34 @@ class XmlToJsonLinesTransformStream extends Transform { // eslint-disable-next-line no-underscore-dangle _transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback) { - const chunkStr = chunk.toString('utf-8'); - if (typeof this.charactersToParse !== 'string') { - const firstEntryIndex = chunkStr.indexOf(`<${this.options.entryTagName}>`); - this.charactersToParse = chunkStr.slice(firstEntryIndex); - } else { - this.charactersToParse += chunkStr; - } - - let theCharacters = `${this.charactersToParse}`; - let nextOpenTagIndex = theCharacters.indexOf(`<${this.options.entryTagName}>`); - let nextCloseTagIndex = theCharacters.indexOf(``); - do { - const xml = theCharacters - .slice( - nextOpenTagIndex, - nextCloseTagIndex + this.options.entryTagName.length + 3, - ) - .replace(/&(.+?);/g, '$1'); // FIXME better handling of XML entities??? This makes the pipe hang for some reason - const json = xml2json(xml, { compact: true }); - this.push(`${json}\n`); - theCharacters = theCharacters.slice(nextCloseTagIndex + this.options.entryTagName.length + 3); - nextOpenTagIndex = theCharacters.indexOf(`<${this.options.entryTagName}>`); - nextCloseTagIndex = theCharacters.indexOf(``); - } while (nextOpenTagIndex !== -1 && nextCloseTagIndex !== -1); - this.charactersToParse = theCharacters; + try { + const chunkStr = chunk.toString('utf-8'); + if (typeof this.charactersToParse !== 'string') { + const firstEntryIndex = chunkStr.indexOf(`<${this.options.entryTagName}>`); + this.charactersToParse = chunkStr.slice(firstEntryIndex); + } else { + this.charactersToParse += chunkStr; + } - callback(null, ''); + let theCharacters = `${this.charactersToParse}`; + let nextOpenTagIndex = theCharacters.indexOf(`<${this.options.entryTagName}>`); + let nextCloseTagIndex = theCharacters.indexOf(``); + const closeTagLength = this.options.entryTagName.length + 3; + do { + const xml = theCharacters + .slice(nextOpenTagIndex, nextCloseTagIndex + closeTagLength) + .replace(/&(.+?);/g, '$1'); // FIXME better handling of XML entities??? This makes the pipe hang for some reason + const json = xml2json(xml, { compact: true }); + this.push(`${json}\n`); + theCharacters = theCharacters.slice(nextCloseTagIndex + closeTagLength); + nextOpenTagIndex = theCharacters.indexOf(`<${this.options.entryTagName}>`); + nextCloseTagIndex = theCharacters.indexOf(``); + } while (nextOpenTagIndex !== -1 && nextCloseTagIndex !== -1); + this.charactersToParse = theCharacters; + callback(null, ''); + } catch (err) { + callback(err as Error); + } } } diff --git a/test/index.test.ts b/test/index.test.ts deleted file mode 100644 index e75377e..0000000 --- a/test/index.test.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { createReadStream, createWriteStream } from 'fs'; -import { describe, it, expect } from 'vitest'; -import { createDownloader, Kanjidic, JMdict, createXmlToJsonLines } from '../src'; - -describe('downloader', () => { - describe.skip('kanjidic', () => { - it.skip('works', async () => { - const readStream = await createDownloader({ - type: Kanjidic.SOURCE_ID, - }); - - return new Promise((resolve) => { - const out = createWriteStream('kanjidic2.xml'); - - out.on('finish', () => { - resolve(); - }); - - readStream.pipe(out); - }); - }); - - it.skip('converts XML to JSON', () => new Promise((resolve) => { - const readStream = createReadStream('kanjidic2.xml', { encoding: 'utf-8' }); - const transform = createXmlToJsonLines({ - entryTagName: 'character', - }); - const out = createWriteStream('kanjidic2.jsonl'); - - out.on('finish', () => { - resolve(); - }); - - readStream - .pipe(transform) - .pipe(out); - })); - }); - - describe.skip('jmdict', () => { - it('works', async () => { - const readStream = await createDownloader({ - type: JMdict.SOURCE_ID, - }); - - return new Promise((resolve) => { - const out = createWriteStream('jmdict.xml'); - - out.on('finish', () => { - resolve(); - }); - - readStream.pipe(out); - }); - }, { timeout: 300000 }); - - it('converts XML to JSON', () => new Promise((resolve) => { - const readStream = createReadStream('jmdict.full.xml', { encoding: 'utf-8' }); - const transform = createXmlToJsonLines({ - entryTagName: 'entry', - }); - const out = createWriteStream('jmdict.jsonl'); - - out.on('finish', () => { - resolve(); - }); - - readStream - .pipe(transform) - .pipe(out); - }), { timeout: 300000 }); - }); -}); diff --git a/test/sources/jmdict.test.ts b/test/sources/jmdict.test.ts new file mode 100644 index 0000000..447b3f5 --- /dev/null +++ b/test/sources/jmdict.test.ts @@ -0,0 +1,85 @@ +import { afterAll, describe, it } from 'vitest'; +import { createReadStream, createWriteStream } from 'fs'; +import { unlink } from 'fs/promises'; +import { createDownloader, createXmlToJsonLines, JMdict } from '../../src'; + +const DOWNLOAD_FILENAME = 'jmdict.xml' as const; +const PROCESS_OUTPUT_FILENAME = 'jmdict.jsonl' as const; + +// FIXME use tests that do not download the source data + +describe.skip('jmdict', () => { + afterAll(async () => { + try { + await unlink(DOWNLOAD_FILENAME); + } catch { + // noop + } + }); + + afterAll(async () => { + try { + await unlink(PROCESS_OUTPUT_FILENAME); + } catch { + // noop + } + }); + + describe('downloader', () => { + it('downloads the source data', async () => { + const readStream = await createDownloader({ + type: JMdict.SOURCE_ID, + }); + + return new Promise((resolve, reject) => { + const out = createWriteStream(DOWNLOAD_FILENAME); + + readStream.on('error', (err) => { + reject(err); + }); + + out.on('error', (err) => { + reject(err); + }); + + out.on('finish', () => { + resolve(); + }); + + readStream.pipe(out); + }); + }, { timeout: 30000 }); + }); + + describe('converter', () => { + it('converts XML to JSON', () => new Promise((resolve, reject) => { + const readStream = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' }); + + readStream.on('error', (err) => { + reject(err); + }); + + const transform = createXmlToJsonLines({ + entryTagName: 'entry', + }); + + transform.on('error', (err) => { + reject(err); + }); + + const out = createWriteStream(PROCESS_OUTPUT_FILENAME); + + out.on('error', (err) => { + reject(err); + }); + + out.on('finish', () => { + resolve(); + }); + + readStream + .pipe(transform) + .pipe(out); + }), { timeout: 30000 }); + }); +}); diff --git a/test/sources/jmnedict.test.ts b/test/sources/jmnedict.test.ts new file mode 100644 index 0000000..8a4bac1 --- /dev/null +++ b/test/sources/jmnedict.test.ts @@ -0,0 +1,85 @@ +import { afterAll, describe, it } from 'vitest'; +import { createReadStream, createWriteStream } from 'fs'; +import { unlink } from 'fs/promises'; +import { createDownloader, createXmlToJsonLines, JMnedict } from '../../src'; + +const DOWNLOAD_FILENAME = 'jmnedict.xml' as const; +const PROCESS_OUTPUT_FILENAME = 'jmnedict.jsonl' as const; + +// FIXME use tests that do not download the source data + +describe.skip('jmnedict', () => { + afterAll(async () => { + try { + await unlink(DOWNLOAD_FILENAME); + } catch { + // noop + } + }); + + afterAll(async () => { + try { + await unlink(PROCESS_OUTPUT_FILENAME); + } catch { + // noop + } + }); + + describe('downloader', () => { + it('downloads the source data', async () => { + const readStream = await createDownloader({ + type: JMnedict.SOURCE_ID, + }); + + return new Promise((resolve, reject) => { + const out = createWriteStream(DOWNLOAD_FILENAME); + + readStream.on('error', (err) => { + reject(err); + }); + + out.on('error', (err) => { + reject(err); + }); + + out.on('finish', () => { + resolve(); + }); + + readStream.pipe(out); + }); + }, { timeout: 30000 }); + }); + + describe('converter', () => { + it('converts XML to JSON', () => new Promise((resolve, reject) => { + const readStream = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' }); + + readStream.on('error', (err) => { + reject(err); + }); + + const transform = createXmlToJsonLines({ + entryTagName: 'entry', + }); + + transform.on('error', (err) => { + reject(err); + }); + + const out = createWriteStream(PROCESS_OUTPUT_FILENAME); + + out.on('error', (err) => { + reject(err); + }); + + out.on('finish', () => { + resolve(); + }); + + readStream + .pipe(transform) + .pipe(out); + }), { timeout: 45000 }); + }); +}); diff --git a/test/sources/kanjidic.test.ts b/test/sources/kanjidic.test.ts new file mode 100644 index 0000000..6277db0 --- /dev/null +++ b/test/sources/kanjidic.test.ts @@ -0,0 +1,85 @@ +import { describe, it, afterAll } from 'vitest'; +import { createReadStream, createWriteStream } from 'fs'; +import { unlink } from 'fs/promises'; +import { createDownloader, createXmlToJsonLines, Kanjidic } from '../../src'; + +const DOWNLOAD_FILENAME = 'kanjidic2.xml' as const; +const PROCESS_OUTPUT_FILENAME = 'kanjidic2.jsonl' as const; + +// FIXME use tests that do not download the source data + +describe.skip('kanjidic', () => { + afterAll(async () => { + try { + await unlink(DOWNLOAD_FILENAME); + } catch { + // noop + } + }); + + afterAll(async () => { + try { + await unlink(PROCESS_OUTPUT_FILENAME); + } catch { + // noop + } + }); + + describe('downloader', () => { + it('downloads the source data', async () => { + const readStream = await createDownloader({ + type: Kanjidic.SOURCE_ID, + }); + + return new Promise((resolve, reject) => { + const out = createWriteStream(DOWNLOAD_FILENAME); + + readStream.on('error', (err) => { + reject(err); + }); + + out.on('error', (err) => { + reject(err); + }); + + out.on('finish', () => { + resolve(); + }); + + readStream.pipe(out); + }); + }); + }); + + describe('converter', () => { + it('converts XML to JSON', () => new Promise((resolve, reject) => { + const readStream = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' }); + + readStream.on('error', (err) => { + reject(err); + }); + + const transform = createXmlToJsonLines({ + entryTagName: 'character', + }); + + transform.on('error', (err) => { + reject(err); + }); + + const out = createWriteStream(PROCESS_OUTPUT_FILENAME); + + out.on('error', (err) => { + reject(err); + }); + + out.on('finish', () => { + resolve(); + }); + + readStream + .pipe(transform) + .pipe(out); + })); + }); +});