@@ -1,5 +1,5 @@ | |||
{ | |||
"name": "murasaki-core", | |||
"name": "@modal-sh/murasaki-core", | |||
"version": "0.0.0", | |||
"files": [ | |||
"dist", | |||
@@ -49,5 +49,22 @@ | |||
"dependencies": { | |||
"fetch-ponyfill": "^7.1.0", | |||
"xml-js": "^1.6.11" | |||
}, | |||
"types": "./dist/types/index.d.ts", | |||
"main": "./dist/cjs/production/index.js", | |||
"module": "./dist/esm/production/index.js", | |||
"exports": {
  ".": {
    "types": "./dist/types/index.d.ts",
    "development": {
      "require": "./dist/cjs/development/index.js",
      "import": "./dist/esm/development/index.js"
    },
    "require": "./dist/cjs/production/index.js",
    "import": "./dist/esm/production/index.js"
  }
},
"typesVersions": { | |||
"*": {} | |||
} | |||
} |
@@ -15,33 +15,34 @@ class XmlToJsonLinesTransformStream extends Transform { | |||
/**
 * Accumulates incoming XML text and pushes one JSON line for every complete
 * `<entryTagName>…</entryTagName>` element found in the buffered text.
 *
 * Text outside entry elements is dropped; an entry whose closing tag has not arrived yet stays in
 * `this.charactersToParse` until a later chunk completes it.
 *
 * @param chunk - Upstream data, decoded as UTF-8.
 * @param _encoding - Unused.
 * @param callback - Invoked exactly once: with no arguments on success, or with the error raised
 *   while converting an entry.
 */
// eslint-disable-next-line no-underscore-dangle
_transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback) {
  try {
    const openTag = `<${this.options.entryTagName}>`;
    const closeTag = `</${this.options.entryTagName}>`;
    const chunkStr = chunk.toString('utf-8');

    let theCharacters: string;
    if (typeof this.charactersToParse !== 'string') {
      // First chunk: skip everything before the first entry. When no opening tag is present yet,
      // keep the whole chunk so a tag split across two chunks is not lost.
      const firstEntryIndex = chunkStr.indexOf(openTag);
      theCharacters = firstEntryIndex === -1 ? chunkStr : chunkStr.slice(firstEntryIndex);
    } else {
      theCharacters = this.charactersToParse + chunkStr;
    }

    let nextOpenTagIndex = theCharacters.indexOf(openTag);
    // Search for the closing tag only after the opening tag so a stray closer cannot pair with it.
    let nextCloseTagIndex = nextOpenTagIndex === -1
      ? -1
      : theCharacters.indexOf(closeTag, nextOpenTagIndex);

    // Only complete entries are converted; a partial entry waits for the next chunk.
    while (nextOpenTagIndex !== -1 && nextCloseTagIndex !== -1) {
      const entryEnd = nextCloseTagIndex + closeTag.length;
      const xml = theCharacters
        .slice(nextOpenTagIndex, entryEnd)
        // FIXME better handling of XML entities: this just strips the `&` and `;` delimiters.
        .replace(/&(.+?);/g, '$1');
      const json = xml2json(xml, { compact: true });
      this.push(`${json}\n`);

      theCharacters = theCharacters.slice(entryEnd);
      nextOpenTagIndex = theCharacters.indexOf(openTag);
      nextCloseTagIndex = nextOpenTagIndex === -1
        ? -1
        : theCharacters.indexOf(closeTag, nextOpenTagIndex);
    }

    this.charactersToParse = theCharacters;
    // Signal completion once; calling the callback a second time is a stream error.
    callback();
  } catch (err) {
    callback(err instanceof Error ? err : new Error(String(err)));
  }
}
} | |||
@@ -1,73 +0,0 @@ | |||
import { createReadStream, createWriteStream } from 'fs'; | |||
import { describe, it, expect } from 'vitest'; | |||
import { createDownloader, Kanjidic, JMdict, createXmlToJsonLines } from '../src'; | |||
// Download and conversion tests for the dictionary sources. Each promise rejects on any stream
// error so a failure is reported immediately instead of surfacing as a timeout.
describe('downloader', () => {
  describe.skip('kanjidic', () => {
    it.skip('works', async () => {
      const readStream = await createDownloader({
        type: Kanjidic.SOURCE_ID,
      });
      return new Promise<void>((resolve, reject) => {
        const out = createWriteStream('kanjidic2.xml');
        readStream.on('error', reject);
        out.on('error', reject);
        out.on('finish', () => {
          resolve();
        });
        readStream.pipe(out);
      });
    });

    it.skip('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const readStream = createReadStream('kanjidic2.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'character',
      });
      const out = createWriteStream('kanjidic2.jsonl');
      readStream.on('error', reject);
      transform.on('error', reject);
      out.on('error', reject);
      out.on('finish', () => {
        resolve();
      });
      readStream
        .pipe(transform)
        .pipe(out);
    }));
  });

  describe.skip('jmdict', () => {
    it('works', async () => {
      const readStream = await createDownloader({
        type: JMdict.SOURCE_ID,
      });
      return new Promise<void>((resolve, reject) => {
        const out = createWriteStream('jmdict.xml');
        readStream.on('error', reject);
        out.on('error', reject);
        out.on('finish', () => {
          resolve();
        });
        readStream.pipe(out);
      });
    }, { timeout: 300000 });

    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const readStream = createReadStream('jmdict.full.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'entry',
      });
      const out = createWriteStream('jmdict.jsonl');
      readStream.on('error', reject);
      transform.on('error', reject);
      out.on('error', reject);
      out.on('finish', () => {
        resolve();
      });
      readStream
        .pipe(transform)
        .pipe(out);
    }), { timeout: 300000 });
  });
});
@@ -0,0 +1,85 @@ | |||
import { afterAll, describe, it } from 'vitest'; | |||
import { createReadStream, createWriteStream } from 'fs'; | |||
import { unlink } from 'fs/promises'; | |||
import { createDownloader, createXmlToJsonLines, JMdict } from '../../src'; | |||
const DOWNLOAD_FILENAME = 'jmdict.xml' as const;
const PROCESS_OUTPUT_FILENAME = 'jmdict.jsonl' as const;

// FIXME use tests that do not download the source data
describe.skip('jmdict', () => {
  // One cleanup hook per generated file; any unlink failure is deliberately ignored.
  [DOWNLOAD_FILENAME, PROCESS_OUTPUT_FILENAME].forEach((filename) => {
    afterAll(async () => {
      try {
        await unlink(filename);
      } catch {
        // noop
      }
    });
  });

  describe('downloader', () => {
    it('downloads the source data', async () => {
      const source = await createDownloader({ type: JMdict.SOURCE_ID });

      // Settles when the file has been fully written, or as soon as either stream errors.
      await new Promise<void>((resolve, reject) => {
        const sink = createWriteStream(DOWNLOAD_FILENAME);
        source.on('error', reject);
        sink.on('error', reject);
        sink.on('finish', () => resolve());
        source.pipe(sink);
      });
    }, { timeout: 30000 });
  });

  describe('converter', () => {
    // Reads the file written by the downloader test and pipes it through the XML-to-JSON-lines
    // transform into the output file.
    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const xmlInput = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' });
      xmlInput.on('error', reject);

      const toJsonLines = createXmlToJsonLines({ entryTagName: 'entry' });
      toJsonLines.on('error', reject);

      const jsonlOutput = createWriteStream(PROCESS_OUTPUT_FILENAME);
      jsonlOutput.on('error', reject);
      jsonlOutput.on('finish', () => resolve());

      xmlInput.pipe(toJsonLines).pipe(jsonlOutput);
    }), { timeout: 30000 });
  });
});
@@ -0,0 +1,85 @@ | |||
import { afterAll, describe, it } from 'vitest'; | |||
import { createReadStream, createWriteStream } from 'fs'; | |||
import { unlink } from 'fs/promises'; | |||
import { createDownloader, createXmlToJsonLines, JMnedict } from '../../src'; | |||
const DOWNLOAD_FILENAME = 'jmnedict.xml' as const;
const PROCESS_OUTPUT_FILENAME = 'jmnedict.jsonl' as const;

// FIXME use tests that do not download the source data
describe.skip('jmnedict', () => {
  // One cleanup hook per generated file; any unlink failure is deliberately ignored.
  [DOWNLOAD_FILENAME, PROCESS_OUTPUT_FILENAME].forEach((filename) => {
    afterAll(async () => {
      try {
        await unlink(filename);
      } catch {
        // noop
      }
    });
  });

  describe('downloader', () => {
    it('downloads the source data', async () => {
      const source = await createDownloader({ type: JMnedict.SOURCE_ID });

      // Settles when the file has been fully written, or as soon as either stream errors.
      await new Promise<void>((resolve, reject) => {
        const sink = createWriteStream(DOWNLOAD_FILENAME);
        source.on('error', reject);
        sink.on('error', reject);
        sink.on('finish', () => resolve());
        source.pipe(sink);
      });
    }, { timeout: 30000 });
  });

  describe('converter', () => {
    // Reads the file written by the downloader test and pipes it through the XML-to-JSON-lines
    // transform into the output file.
    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const xmlInput = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' });
      xmlInput.on('error', reject);

      const toJsonLines = createXmlToJsonLines({ entryTagName: 'entry' });
      toJsonLines.on('error', reject);

      const jsonlOutput = createWriteStream(PROCESS_OUTPUT_FILENAME);
      jsonlOutput.on('error', reject);
      jsonlOutput.on('finish', () => resolve());

      xmlInput.pipe(toJsonLines).pipe(jsonlOutput);
    }), { timeout: 45000 });
  });
});
@@ -0,0 +1,85 @@ | |||
import { describe, it, afterAll } from 'vitest'; | |||
import { createReadStream, createWriteStream } from 'fs'; | |||
import { unlink } from 'fs/promises'; | |||
import { createDownloader, createXmlToJsonLines, Kanjidic } from '../../src'; | |||
const DOWNLOAD_FILENAME = 'kanjidic2.xml' as const;
const PROCESS_OUTPUT_FILENAME = 'kanjidic2.jsonl' as const;

// FIXME use tests that do not download the source data
describe.skip('kanjidic', () => {
  // Remove the downloaded file after the suite; any unlink failure is deliberately ignored.
  afterAll(async () => {
    try {
      await unlink(DOWNLOAD_FILENAME);
    } catch {
      // noop
    }
  });

  // Remove the converted output after the suite; any unlink failure is deliberately ignored.
  afterAll(async () => {
    try {
      await unlink(PROCESS_OUTPUT_FILENAME);
    } catch {
      // noop
    }
  });

  describe('downloader', () => {
    it('downloads the source data', async () => {
      const readStream = await createDownloader({
        type: Kanjidic.SOURCE_ID,
      });

      return new Promise<void>((resolve, reject) => {
        const out = createWriteStream(DOWNLOAD_FILENAME);
        readStream.on('error', (err) => {
          reject(err);
        });
        out.on('error', (err) => {
          reject(err);
        });
        out.on('finish', () => {
          resolve();
        });
        readStream.pipe(out);
      });
      // Explicit timeout, matching the other dictionary suites, instead of the runner default.
    }, { timeout: 30000 });
  });

  describe('converter', () => {
    // Reads the file written by the downloader test and converts each `character` element.
    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const readStream = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' });
      readStream.on('error', (err) => {
        reject(err);
      });

      const transform = createXmlToJsonLines({
        entryTagName: 'character',
      });
      transform.on('error', (err) => {
        reject(err);
      });

      const out = createWriteStream(PROCESS_OUTPUT_FILENAME);
      out.on('error', (err) => {
        reject(err);
      });
      out.on('finish', () => {
        resolve();
      });

      readStream
        .pipe(transform)
        .pipe(out);
      // Explicit timeout, matching the other dictionary suites, instead of the runner default.
    }), { timeout: 30000 });
  });
});