diff --git a/.gitignore b/.gitignore index bea7415..b92f0bb 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,5 @@ dist .npmrc types/ +.murasaki.json +*.jsonl diff --git a/src/modules/init/InitController.ts b/src/modules/init/InitController.ts index c9a5337..fae50d4 100644 --- a/src/modules/init/InitController.ts +++ b/src/modules/init/InitController.ts @@ -1,6 +1,7 @@ import {RouteHandlerMethod} from 'fastify'; import {CreateDownloaderParams} from '@modal-sh/murasaki-core'; import {InitService, InitServiceImpl} from './InitService'; +import {constants} from 'http2'; export interface InitController { downloadDataset: RouteHandlerMethod; @@ -12,7 +13,11 @@ export class InitControllerImpl implements InitController { } readonly downloadDataset: RouteHandlerMethod = async (request, reply) => { - const result = await this.initService.downloadDataset(request.body as CreateDownloaderParams); - reply.send(result); + try { + const result = await this.initService.downloadDataset(request.body as CreateDownloaderParams); + reply.send(result); + } catch (err) { + reply.status(constants.HTTP_STATUS_INTERNAL_SERVER_ERROR).send(err); + } }; } diff --git a/src/modules/init/InitService.ts b/src/modules/init/InitService.ts index 581f9e3..2e78f0f 100644 --- a/src/modules/init/InitService.ts +++ b/src/modules/init/InitService.ts @@ -8,7 +8,8 @@ import { RadKFile, createXmlToJsonLines, } from '@modal-sh/murasaki-core'; -import { createWriteStream, readFileSync } from 'fs'; +import { createWriteStream, readFileSync, writeFileSync } from 'fs'; +import { PassThrough } from 'stream'; export interface InitService { downloadDataset(params: CreateDownloaderParams): Promise; @@ -22,7 +23,6 @@ interface ManifestDatasetEntry { type ManifestData = Record; export class InitServiceImpl implements InitService { - private manifestWriteStream?: NodeJS.WritableStream; private readonly manifestFilename = '.murasaki.json' as const; @@ -40,71 +40,66 @@ export class InitServiceImpl implements InitService { } } - private async commitDatasetMetadata(): Promise { - return new Promise((resolve, reject) => { - this.manifestWriteStream = createWriteStream(this.manifestFilename, { - flags: 'w', - }); - this.manifestWriteStream.on('error', reject); - this.manifestWriteStream.write(JSON.stringify(this.data)); - this.manifestWriteStream.end(() => { - resolve(this.data); - }); - }); + private commitDatasetMetadata(): ManifestData { + writeFileSync(this.manifestFilename, JSON.stringify(this.data), { flag: 'w' }); + return this.data; + } + + private static getParserStreams(type: CreateDownloaderParams['type']) { + switch (type) { + case Kanjidic.SOURCE_ID: + return createXmlToJsonLines({ + entryTagName: 'character', + }); + case JMnedict.SOURCE_ID: + case JMdict.SOURCE_ID: + return createXmlToJsonLines({ + entryTagName: 'entry', + }); + default: + break; + } + + return new PassThrough(); } async downloadDataset(params: CreateDownloaderParams): Promise { const downloader = await createDownloader(params); return new Promise((resolve, reject) => { + const parserStream = InitServiceImpl.getParserStreams(params.type); + const out = createWriteStream(`${params.type}.jsonl`); + parserStream.on('error', (error) => { + try { + this.commitDatasetMetadata(); + } catch { + // noop + } + + reject(new Error(`Error when parsing dataset: ${params.type as unknown as string}`, { cause: error })); + }); + out.on('finish', () => { const now = Date.now(); this.data[params.type] = { - ...this.data[params.type], - createdAt: this.data[params.type].createdAt ?? now, + ...(this.data[params.type] ?? {}), + createdAt: this.data[params.type]?.createdAt ?? now, lastUpdatedAt: now, }; - this.commitDatasetMetadata() - .then(resolve) - .catch(reject); + try { + this.commitDatasetMetadata(); + resolve(this.data); + } catch { + reject(new Error(`Error when committing metadata for dataset: ${params.type as unknown as string}`)); + } }); - switch (params.type) { - case Kanjidic.SOURCE_ID: { - const jsonlParser = createXmlToJsonLines({ - entryTagName: 'character', - }); - - downloader - .pipe(jsonlParser) - .pipe(out); - } return; - case JMnedict.SOURCE_ID: - case JMdict.SOURCE_ID: { - const jsonlParser = createXmlToJsonLines({ - entryTagName: 'entry', - }); - - downloader - .pipe(jsonlParser) - .pipe(out); - } return; - case KRadFile.SOURCE_ID: - case RadKFile.SOURCE_ID: - downloader.pipe(out); - return; - default: - break; - } - - this.commitDatasetMetadata() - .then(() => { - reject(new Error(`Unknown dataset: ${params.type as unknown as string}`)); - }) - .catch(reject); + downloader + .pipe(parserStream) + .pipe(out); }); } }