@@ -1,5 +1,5 @@ | |||||
{ | { | ||||
"name": "murasaki-core", | |||||
"name": "@modal-sh/murasaki-core", | |||||
"version": "0.0.0", | "version": "0.0.0", | ||||
"files": [ | "files": [ | ||||
"dist", | "dist", | ||||
@@ -49,5 +49,22 @@ | |||||
"dependencies": { | "dependencies": { | ||||
"fetch-ponyfill": "^7.1.0", | "fetch-ponyfill": "^7.1.0", | ||||
"xml-js": "^1.6.11" | "xml-js": "^1.6.11" | ||||
}, | |||||
"types": "./dist/types/index.d.ts", | |||||
"main": "./dist/cjs/production/index.js", | |||||
"module": "./dist/esm/production/index.js", | |||||
"exports": { | |||||
".": { | |||||
"development": { | |||||
"require": "./dist/cjs/development/index.js", | |||||
"import": "./dist/esm/development/index.js" | |||||
}, | |||||
"require": "./dist/cjs/production/index.js", | |||||
"import": "./dist/esm/production/index.js", | |||||
"types": "./dist/types/index.d.ts" | |||||
} | |||||
}, | |||||
"typesVersions": { | |||||
"*": {} | |||||
} | } | ||||
} | } |
@@ -15,33 +15,34 @@ class XmlToJsonLinesTransformStream extends Transform { | |||||
/**
 * Buffers incoming XML text and pushes one JSON line for every complete
 * `<entryTagName>…</entryTagName>` element found so far.
 *
 * Text that does not yet form a complete entry is kept in
 * `this.charactersToParse` and retried when the next chunk arrives, so an
 * entry may span any number of chunks. `callback` is invoked exactly once per
 * chunk: without arguments on success, or with the error when conversion throws.
 *
 * NOTE(review): `chunk.toString('utf-8')` decodes every chunk on its own, so a
 * multi-byte character split across two Buffer chunks would be corrupted unless
 * upstream already delivers whole characters — confirm against the callers.
 *
 * @param chunk - The next portion of the XML document.
 * @param _encoding - Unused; the chunk is always decoded as UTF-8.
 * @param callback - Signals that this chunk has been fully processed.
 */
// eslint-disable-next-line no-underscore-dangle
_transform(chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback) {
  try {
    const openTag = `<${this.options.entryTagName}>`;
    const closeTag = `</${this.options.entryTagName}>`;
    const pending = typeof this.charactersToParse === 'string' ? this.charactersToParse : '';
    let theCharacters = pending + chunk.toString('utf-8');

    let nextOpenTagIndex = theCharacters.indexOf(openTag);
    while (nextOpenTagIndex !== -1) {
      // Only look for the close tag after the open tag, so a stray close tag
      // in front of the entry can never produce an inverted slice.
      const nextCloseTagIndex = theCharacters.indexOf(
        closeTag,
        nextOpenTagIndex + openTag.length,
      );
      if (nextCloseTagIndex === -1) {
        // The entry is still incomplete; wait for the next chunk.
        break;
      }
      const entryEndIndex = nextCloseTagIndex + closeTag.length;
      const xml = theCharacters
        .slice(nextOpenTagIndex, entryEndIndex)
        // FIXME better handling of XML entities: this only strips the
        // surrounding "&" and ";" instead of decoding the entity.
        .replace(/&(.+?);/g, '$1');
      const json = xml2json(xml, { compact: true });
      this.push(`${json}\n`);
      theCharacters = theCharacters.slice(entryEndIndex);
      nextOpenTagIndex = theCharacters.indexOf(openTag);
    }

    // Keep the unfinished entry, or — when no open tag is buffered — just
    // enough trailing characters to recognise an open tag that was split
    // across two chunks.
    this.charactersToParse = nextOpenTagIndex !== -1
      ? theCharacters.slice(nextOpenTagIndex)
      : theCharacters.slice(-(openTag.length - 1));
  } catch (err) {
    callback(err instanceof Error ? err : new Error(String(err)));
    return;
  }
  // Called outside the try block so the callback can never fire twice.
  callback();
}
} | } | ||||
@@ -1,73 +0,0 @@ | |||||
import { createReadStream, createWriteStream } from 'fs'; | |||||
import { describe, it, expect } from 'vitest'; | |||||
import { createDownloader, Kanjidic, JMdict, createXmlToJsonLines } from '../src'; | |||||
// Manual, network- and disk-bound checks for the downloader and the
// XML-to-JSON-lines converter. Both suites are skipped by default.
// Every promise rejects on a stream error so that an I/O failure is reported
// immediately instead of surfacing as a test timeout.
describe('downloader', () => {
  describe.skip('kanjidic', () => {
    it.skip('works', async () => {
      const readStream = await createDownloader({
        type: Kanjidic.SOURCE_ID,
      });

      return new Promise<void>((resolve, reject) => {
        const out = createWriteStream('kanjidic2.xml');
        readStream.on('error', reject);
        out.on('error', reject);
        out.on('finish', () => {
          resolve();
        });
        readStream.pipe(out);
      });
    });

    it.skip('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const readStream = createReadStream('kanjidic2.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'character',
      });
      const out = createWriteStream('kanjidic2.jsonl');
      // pipe() does not forward errors, so each stage needs its own listener.
      readStream.on('error', reject);
      transform.on('error', reject);
      out.on('error', reject);
      out.on('finish', () => {
        resolve();
      });
      readStream
        .pipe(transform)
        .pipe(out);
    }));
  });

  describe.skip('jmdict', () => {
    it('works', async () => {
      const readStream = await createDownloader({
        type: JMdict.SOURCE_ID,
      });

      return new Promise<void>((resolve, reject) => {
        const out = createWriteStream('jmdict.xml');
        readStream.on('error', reject);
        out.on('error', reject);
        out.on('finish', () => {
          resolve();
        });
        readStream.pipe(out);
      });
    }, { timeout: 300000 });

    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const readStream = createReadStream('jmdict.full.xml', { encoding: 'utf-8' });
      const transform = createXmlToJsonLines({
        entryTagName: 'entry',
      });
      const out = createWriteStream('jmdict.jsonl');
      // pipe() does not forward errors, so each stage needs its own listener.
      readStream.on('error', reject);
      transform.on('error', reject);
      out.on('error', reject);
      out.on('finish', () => {
        resolve();
      });
      readStream
        .pipe(transform)
        .pipe(out);
    }), { timeout: 300000 });
  });
});
@@ -0,0 +1,85 @@ | |||||
import { afterAll, describe, it } from 'vitest'; | |||||
import { createReadStream, createWriteStream } from 'fs'; | |||||
import { unlink } from 'fs/promises'; | |||||
import { createDownloader, createXmlToJsonLines, JMdict } from '../../src'; | |||||
const DOWNLOAD_FILENAME = 'jmdict.xml' as const;
const PROCESS_OUTPUT_FILENAME = 'jmdict.jsonl' as const;

// FIXME use tests that do not download the source data
describe.skip('jmdict', () => {
  // Best-effort cleanup of the downloaded XML; a failed unlink is ignored.
  afterAll(async () => {
    await unlink(DOWNLOAD_FILENAME).catch(() => {
      // noop
    });
  });

  // Best-effort cleanup of the converted output; a failed unlink is ignored.
  afterAll(async () => {
    await unlink(PROCESS_OUTPUT_FILENAME).catch(() => {
      // noop
    });
  });

  describe('downloader', () => {
    it('downloads the source data', async () => {
      const source = await createDownloader({ type: JMdict.SOURCE_ID });

      // Settles once the file is fully written, or as soon as either stream fails.
      await new Promise<void>((resolve, reject) => {
        const sink = createWriteStream(DOWNLOAD_FILENAME);
        source.on('error', reject);
        sink.on('error', reject);
        sink.on('finish', () => resolve());
        source.pipe(sink);
      });
    }, { timeout: 30000 });
  });

  describe('converter', () => {
    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const source = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' });
      source.on('error', reject);

      const converter = createXmlToJsonLines({ entryTagName: 'entry' });
      converter.on('error', reject);

      const sink = createWriteStream(PROCESS_OUTPUT_FILENAME);
      sink.on('error', reject);
      sink.on('finish', () => resolve());

      source.pipe(converter).pipe(sink);
    }), { timeout: 30000 });
  });
});
@@ -0,0 +1,85 @@ | |||||
import { afterAll, describe, it } from 'vitest'; | |||||
import { createReadStream, createWriteStream } from 'fs'; | |||||
import { unlink } from 'fs/promises'; | |||||
import { createDownloader, createXmlToJsonLines, JMnedict } from '../../src'; | |||||
const DOWNLOAD_FILENAME = 'jmnedict.xml' as const;
const PROCESS_OUTPUT_FILENAME = 'jmnedict.jsonl' as const;

// FIXME use tests that do not download the source data
describe.skip('jmnedict', () => {
  // Best-effort cleanup of the downloaded XML; a failed unlink is ignored.
  afterAll(async () => {
    await unlink(DOWNLOAD_FILENAME).catch(() => {
      // noop
    });
  });

  // Best-effort cleanup of the converted output; a failed unlink is ignored.
  afterAll(async () => {
    await unlink(PROCESS_OUTPUT_FILENAME).catch(() => {
      // noop
    });
  });

  describe('downloader', () => {
    it('downloads the source data', async () => {
      const source = await createDownloader({ type: JMnedict.SOURCE_ID });

      // Settles once the file is fully written, or as soon as either stream fails.
      await new Promise<void>((resolve, reject) => {
        const sink = createWriteStream(DOWNLOAD_FILENAME);
        source.on('error', reject);
        sink.on('error', reject);
        sink.on('finish', () => resolve());
        source.pipe(sink);
      });
    }, { timeout: 30000 });
  });

  describe('converter', () => {
    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const source = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' });
      source.on('error', reject);

      const converter = createXmlToJsonLines({ entryTagName: 'entry' });
      converter.on('error', reject);

      const sink = createWriteStream(PROCESS_OUTPUT_FILENAME);
      sink.on('error', reject);
      sink.on('finish', () => resolve());

      source.pipe(converter).pipe(sink);
    }), { timeout: 45000 });
  });
});
@@ -0,0 +1,85 @@ | |||||
import { describe, it, afterAll } from 'vitest'; | |||||
import { createReadStream, createWriteStream } from 'fs'; | |||||
import { unlink } from 'fs/promises'; | |||||
import { createDownloader, createXmlToJsonLines, Kanjidic } from '../../src'; | |||||
const DOWNLOAD_FILENAME = 'kanjidic2.xml' as const;
const PROCESS_OUTPUT_FILENAME = 'kanjidic2.jsonl' as const;

// FIXME use tests that do not download the source data
describe.skip('kanjidic', () => {
  // Best-effort cleanup of the downloaded XML; a failed unlink is ignored.
  afterAll(async () => {
    await unlink(DOWNLOAD_FILENAME).catch(() => {
      // noop
    });
  });

  // Best-effort cleanup of the converted output; a failed unlink is ignored.
  afterAll(async () => {
    await unlink(PROCESS_OUTPUT_FILENAME).catch(() => {
      // noop
    });
  });

  describe('downloader', () => {
    it('downloads the source data', async () => {
      const source = await createDownloader({ type: Kanjidic.SOURCE_ID });

      // Settles once the file is fully written, or as soon as either stream fails.
      await new Promise<void>((resolve, reject) => {
        const sink = createWriteStream(DOWNLOAD_FILENAME);
        source.on('error', reject);
        sink.on('error', reject);
        sink.on('finish', () => resolve());
        source.pipe(sink);
      });
    });
  });

  describe('converter', () => {
    it('converts XML to JSON', () => new Promise<void>((resolve, reject) => {
      const source = createReadStream(DOWNLOAD_FILENAME, { encoding: 'utf-8' });
      source.on('error', reject);

      const converter = createXmlToJsonLines({ entryTagName: 'character' });
      converter.on('error', reject);

      const sink = createWriteStream(PROCESS_OUTPUT_FILENAME);
      sink.on('error', reject);
      sink.on('finish', () => resolve());

      source.pipe(converter).pipe(sink);
    }));
  });
});