|
- import {
- createDownloader,
- CreateDownloaderParams,
- Kanjidic,
- JMdict,
- JMnedict,
- KRadFile,
- RadKFile,
- createXmlToJsonLines,
- } from '@modal-sh/murasaki-core';
- import { createWriteStream, readFileSync, writeFileSync } from 'fs';
- import { PassThrough } from 'stream';
-
/**
 * Service contract for initializing local datasets.
 */
export interface InitService {
  /**
   * Downloads the dataset described by `params`.
   *
   * @param params - Downloader parameters; `params.type` identifies the dataset.
   * @returns A promise for the manifest data after the download has been recorded.
   */
  downloadDataset(params: CreateDownloaderParams): Promise<ManifestData>;
}
-
/**
 * Bookkeeping stored in the manifest for a single dataset.
 *
 * Both fields hold `Date.now()` values, i.e. milliseconds since the Unix epoch.
 */
interface ManifestDatasetEntry {
  /** Time at which the dataset was first written successfully. */
  createdAt: number;
  /** Time at which the dataset was most recently written successfully. */
  lastUpdatedAt: number;
}
-
/**
 * Contents of the manifest file: one {@link ManifestDatasetEntry} per dataset,
 * keyed by the dataset's type identifier.
 */
type ManifestData = Record<string, ManifestDatasetEntry>;
-
/**
 * {@link InitService} implementation that tracks downloaded datasets in a JSON
 * manifest file (`.murasaki.json`).
 *
 * The manifest is read once at construction time and rewritten after every
 * dataset download attempt that reaches the commit step.
 */
export class InitServiceImpl implements InitService {
  /** Manifest path; being relative, it is resolved against the current working directory. */
  private readonly manifestFilename = '.murasaki.json' as const;

  /** Text encoding used when reading and writing the manifest. */
  private readonly manifestFileEncoding = 'utf-8' as const;

  /** In-memory copy of the manifest, keyed by dataset type. */
  private readonly data: ManifestData;

  /**
   * Loads the manifest from disk.
   *
   * Falls back to an empty manifest when the file is missing, unreadable, not
   * valid JSON, or valid JSON that is not a plain object (e.g. `null`, an
   * array, or a primitive). Without the shape check, such a value would later
   * break the per-dataset bookkeeping in {@link downloadDataset}.
   */
  constructor() {
    let manifest: ManifestData = {};

    try {
      const parsed: unknown = JSON.parse(
        readFileSync(this.manifestFilename, this.manifestFileEncoding),
      );

      if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
        // Only the top-level shape is verified; entries are trusted as written.
        manifest = parsed as ManifestData;
      }
    } catch {
      // Missing or malformed manifest: deliberately start from an empty one.
    }

    this.data = manifest;
  }

  /**
   * Serializes the in-memory manifest to disk, replacing the previous file.
   *
   * @returns The manifest data that was written.
   * @throws Whatever `writeFileSync` throws when the file cannot be written.
   */
  private commitDatasetMetadata(): ManifestData {
    writeFileSync(this.manifestFilename, JSON.stringify(this.data), {
      encoding: this.manifestFileEncoding,
      flag: 'w',
    });
    return this.data;
  }

  /**
   * Selects the stream that sits between the downloader and the output file.
   *
   * @param type - Dataset type identifier.
   * @returns An XML-to-JSON-lines stream for the XML-based datasets (Kanjidic,
   *   JMdict, JMnedict), or a `PassThrough` for every other type.
   */
  private static getParserStreams(type: CreateDownloaderParams['type']) {
    switch (type) {
      case Kanjidic.SOURCE_ID:
        return createXmlToJsonLines({ entryTagName: 'character' });
      case JMnedict.SOURCE_ID:
      case JMdict.SOURCE_ID:
        return createXmlToJsonLines({ entryTagName: 'entry' });
      default:
        // No parser for this type: forward the bytes unchanged.
        return new PassThrough();
    }
  }

  /**
   * Downloads a dataset, pipes it through the matching parser stream into
   * `<type>.jsonl`, and records the download in the manifest.
   *
   * @param params - Downloader parameters; `params.type` selects the parser
   *   and names both the output file and the manifest entry.
   * @returns The manifest data once the output file has been fully written
   *   and the manifest has been committed.
   * @throws Rejects with an `Error` (carrying the original error as `cause`)
   *   when parsing fails, when the output file cannot be written, or when the
   *   manifest cannot be committed. Also rejects if `createDownloader` does.
   */
  async downloadDataset(params: CreateDownloaderParams): Promise<ManifestData> {
    const downloader = await createDownloader(params);
    const datasetName = String(params.type);

    return new Promise<ManifestData>((resolve, reject) => {
      const parserStream = InitServiceImpl.getParserStreams(params.type);
      const out = createWriteStream(`${datasetName}.jsonl`);

      parserStream.on('error', (error) => {
        // pipe() does not close the destination when an upstream stage fails,
        // so release the output file handle explicitly.
        out.destroy();

        try {
          // Best-effort: a failure to persist the manifest must not mask the parse error.
          this.commitDatasetMetadata();
        } catch {
          // noop
        }

        reject(new Error(`Error when parsing dataset: ${datasetName}`, { cause: error }));
      });

      // pipe() does not propagate destination errors either; without this
      // handler a failed write would leave the promise pending forever.
      out.on('error', (error) => {
        reject(new Error(`Error when writing dataset: ${datasetName}`, { cause: error }));
      });

      out.on('finish', () => {
        const now = Date.now();
        const previous = this.data[params.type];

        this.data[params.type] = {
          ...(previous ?? {}),
          // Keep the original creation time across re-downloads.
          createdAt: previous?.createdAt ?? now,
          lastUpdatedAt: now,
        };

        try {
          this.commitDatasetMetadata();
          resolve(this.data);
        } catch (error) {
          reject(
            new Error(`Error when committing metadata for dataset: ${datasetName}`, {
              cause: error,
            }),
          );
        }
      });

      // NOTE(review): errors emitted by `downloader` itself are not observed
      // here; its type is declared outside this file — confirm whether it can
      // emit 'error' and, if so, attach a handler or switch to stream.pipeline.
      downloader.pipe(parserStream).pipe(out);
    });
  }
}
|