|
import { createWriteStream, readFileSync } from 'fs';
import { pipeline } from 'stream';

import {
  createDownloader,
  CreateDownloaderParams,
  Kanjidic,
  JMdict,
  JMnedict,
  KRadFile,
  RadKFile,
  createXmlToJsonLines,
} from '@modal-sh/murasaki-core';
-
/**
 * Contract for the service that fetches a dataset and records it in the
 * local manifest.
 */
export interface InitService {
  /**
   * Downloads the dataset described by `params`.
   *
   * @param params - Downloader parameters; `params.type` identifies the dataset.
   * @returns The manifest data after the download has been recorded.
   */
  downloadDataset(params: CreateDownloaderParams): Promise<ManifestData>;
}

/**
 * Bookkeeping stored in the manifest for a single dataset.
 *
 * Exported so that callers of {@link InitService.downloadDataset} can name
 * the shape of the value they receive.
 */
export interface ManifestDatasetEntry {
  /** Timestamp of the first recorded download (set from `Date.now()`). */
  createdAt: number;
  /** Timestamp of the most recent recorded download (set from `Date.now()`). */
  lastUpdatedAt: number;
}

/**
 * Manifest contents: one {@link ManifestDatasetEntry} per dataset, keyed by
 * the dataset's `type` identifier.
 */
export type ManifestData = Record<string, ManifestDatasetEntry>;
-
/**
 * Downloads datasets into `<type>.jsonl` files in the current working
 * directory and tracks them in a JSON manifest file (`.murasaki.json`).
 */
export class InitServiceImpl implements InitService {
  /** Stream used by the most recent manifest write, if any. */
  private manifestWriteStream?: NodeJS.WritableStream;

  private readonly manifestFilename = '.murasaki.json' as const;

  private readonly manifestFileEncoding = 'utf-8' as const;

  /** In-memory copy of the manifest; persisted by `commitDatasetMetadata`. */
  private readonly data: ManifestData;

  /**
   * Loads the existing manifest from disk, or starts with an empty one when
   * it is missing, unreadable, or does not contain a JSON object.
   */
  constructor() {
    this.data = this.loadManifest();
  }

  /**
   * Reads and parses the manifest file.
   *
   * @returns The parsed manifest, or `{}` when no usable manifest exists.
   */
  private loadManifest(): ManifestData {
    try {
      const raw = readFileSync(this.manifestFilename, this.manifestFileEncoding);
      const parsed: unknown = JSON.parse(raw);
      // Valid JSON such as `null`, `[]` or `42` must not become the manifest:
      // later keyed lookups and assignments rely on a plain object.
      if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
        return parsed as ManifestData;
      }
    } catch {
      // Best effort: a missing or malformed manifest means "nothing recorded yet".
    }
    return {};
  }

  /**
   * Writes the in-memory manifest to disk, replacing the previous file.
   *
   * @returns The manifest data once the write has been flushed.
   * @throws Rejects with the stream error when the file cannot be written.
   */
  private async commitDatasetMetadata(): Promise<ManifestData> {
    return new Promise<ManifestData>((resolve, reject) => {
      const stream = createWriteStream(this.manifestFilename, {
        flags: 'w',
        encoding: this.manifestFileEncoding,
      });
      this.manifestWriteStream = stream;
      stream.on('error', reject);
      stream.end(JSON.stringify(this.data), () => {
        resolve(this.data);
      });
    });
  }

  /**
   * Downloads the dataset identified by `params.type` into `<type>.jsonl`
   * and records the download in the manifest.
   *
   * XML datasets are converted to JSON lines on the fly; the radical files
   * are written as received.
   *
   * @param params - Downloader parameters; `params.type` selects the dataset.
   * @returns The updated manifest data.
   * @throws Error `Unknown dataset: <type>` for an unsupported dataset type,
   *   or the underlying stream error when downloading, parsing or writing fails.
   */
  async downloadDataset(params: CreateDownloaderParams): Promise<ManifestData> {
    const downloader = await createDownloader(params);

    // Decide how to process the dataset *before* opening the output file so
    // that an unsupported type does not leave an empty, unclosed file behind.
    let entryTagName: 'character' | 'entry' | undefined;
    switch (params.type) {
      case Kanjidic.SOURCE_ID:
        entryTagName = 'character';
        break;
      case JMnedict.SOURCE_ID:
      case JMdict.SOURCE_ID:
        entryTagName = 'entry';
        break;
      case KRadFile.SOURCE_ID:
      case RadKFile.SOURCE_ID:
        // Not XML: copied through unchanged.
        entryTagName = undefined;
        break;
      default:
        // Kept from the original behaviour: the manifest is (re)written even
        // when the requested dataset is rejected.
        await this.commitDatasetMetadata();
        throw new Error(`Unknown dataset: ${String(params.type)}`);
    }

    const out = createWriteStream(`${params.type}.jsonl`);

    // `pipeline` (unlike chained `.pipe`) forwards an error from any stage and
    // destroys every stream, so a failure rejects instead of hanging forever.
    await new Promise<void>((resolve, reject) => {
      const onDone = (err: NodeJS.ErrnoException | null): void => {
        if (err) {
          reject(err);
        } else {
          resolve();
        }
      };

      if (entryTagName === undefined) {
        pipeline(downloader, out, onDone);
      } else {
        pipeline(downloader, createXmlToJsonLines({ entryTagName }), out, onDone);
      }
    });

    const now = Date.now();
    // The entry does not exist on the first download of a dataset.
    const previous: ManifestDatasetEntry | undefined = this.data[params.type];
    this.data[params.type] = {
      ...previous,
      createdAt: previous?.createdAt ?? now,
      lastUpdatedAt: now,
    };

    return this.commitDatasetMetadata();
  }
}
|