|
@@ -8,7 +8,8 @@ import { |
|
|
RadKFile, |
|
|
RadKFile, |
|
|
createXmlToJsonLines, |
|
|
createXmlToJsonLines, |
|
|
} from '@modal-sh/murasaki-core'; |
|
|
} from '@modal-sh/murasaki-core'; |
|
|
import { createWriteStream, readFileSync } from 'fs'; |
|
|
|
|
|
|
|
|
import { createWriteStream, readFileSync, writeFileSync } from 'fs'; |
|
|
|
|
|
import { PassThrough } from 'stream'; |
|
|
|
|
|
|
|
|
export interface InitService { |
|
|
export interface InitService { |
|
|
downloadDataset(params: CreateDownloaderParams): Promise<ManifestData>; |
|
|
downloadDataset(params: CreateDownloaderParams): Promise<ManifestData>; |
|
@@ -22,7 +23,6 @@ interface ManifestDatasetEntry { |
|
|
type ManifestData = Record<string, ManifestDatasetEntry>; |
|
|
type ManifestData = Record<string, ManifestDatasetEntry>; |
|
|
|
|
|
|
|
|
export class InitServiceImpl implements InitService { |
|
|
export class InitServiceImpl implements InitService { |
|
|
private manifestWriteStream?: NodeJS.WritableStream; |
|
|
|
|
|
|
|
|
|
|
|
private readonly manifestFilename = '.murasaki.json' as const; |
|
|
private readonly manifestFilename = '.murasaki.json' as const; |
|
|
|
|
|
|
|
@@ -40,71 +40,66 @@ export class InitServiceImpl implements InitService { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
private async commitDatasetMetadata(): Promise<ManifestData> { |
|
|
|
|
|
return new Promise<ManifestData>((resolve, reject) => { |
|
|
|
|
|
this.manifestWriteStream = createWriteStream(this.manifestFilename, { |
|
|
|
|
|
flags: 'w', |
|
|
|
|
|
}); |
|
|
|
|
|
this.manifestWriteStream.on('error', reject); |
|
|
|
|
|
this.manifestWriteStream.write(JSON.stringify(this.data)); |
|
|
|
|
|
this.manifestWriteStream.end(() => { |
|
|
|
|
|
resolve(this.data); |
|
|
|
|
|
}); |
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
private commitDatasetMetadata(): ManifestData { |
|
|
|
|
|
writeFileSync(this.manifestFilename, JSON.stringify(this.data), { flag: 'w' }); |
|
|
|
|
|
return this.data; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
private static getParserStreams(type: CreateDownloaderParams['type']) { |
|
|
|
|
|
switch (type) { |
|
|
|
|
|
case Kanjidic.SOURCE_ID: |
|
|
|
|
|
return createXmlToJsonLines({ |
|
|
|
|
|
entryTagName: 'character', |
|
|
|
|
|
}); |
|
|
|
|
|
case JMnedict.SOURCE_ID: |
|
|
|
|
|
case JMdict.SOURCE_ID: |
|
|
|
|
|
return createXmlToJsonLines({ |
|
|
|
|
|
entryTagName: 'entry', |
|
|
|
|
|
}); |
|
|
|
|
|
default: |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return new PassThrough(); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
async downloadDataset(params: CreateDownloaderParams): Promise<ManifestData> { |
|
|
async downloadDataset(params: CreateDownloaderParams): Promise<ManifestData> { |
|
|
const downloader = await createDownloader(params); |
|
|
const downloader = await createDownloader(params); |
|
|
|
|
|
|
|
|
return new Promise<ManifestData>((resolve, reject) => { |
|
|
return new Promise<ManifestData>((resolve, reject) => { |
|
|
|
|
|
const parserStream = InitServiceImpl.getParserStreams(params.type); |
|
|
|
|
|
|
|
|
const out = createWriteStream(`${params.type}.jsonl`); |
|
|
const out = createWriteStream(`${params.type}.jsonl`); |
|
|
|
|
|
|
|
|
|
|
|
parserStream.on('error', (error) => { |
|
|
|
|
|
try { |
|
|
|
|
|
this.commitDatasetMetadata(); |
|
|
|
|
|
} catch { |
|
|
|
|
|
// noop |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
reject(new Error(`Error when parsing dataset: ${params.type as unknown as string}`, { cause: error })); |
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
out.on('finish', () => { |
|
|
out.on('finish', () => { |
|
|
const now = Date.now(); |
|
|
const now = Date.now(); |
|
|
this.data[params.type] = { |
|
|
this.data[params.type] = { |
|
|
...this.data[params.type], |
|
|
|
|
|
createdAt: this.data[params.type].createdAt ?? now, |
|
|
|
|
|
|
|
|
...(this.data[params.type] ?? {}), |
|
|
|
|
|
createdAt: this.data[params.type]?.createdAt ?? now, |
|
|
lastUpdatedAt: now, |
|
|
lastUpdatedAt: now, |
|
|
}; |
|
|
}; |
|
|
|
|
|
|
|
|
this.commitDatasetMetadata() |
|
|
|
|
|
.then(resolve) |
|
|
|
|
|
.catch(reject); |
|
|
|
|
|
|
|
|
try { |
|
|
|
|
|
this.commitDatasetMetadata(); |
|
|
|
|
|
resolve(this.data); |
|
|
|
|
|
} catch { |
|
|
|
|
|
reject(new Error(`Error when committing metadata for dataset: ${params.type as unknown as string}`)); |
|
|
|
|
|
} |
|
|
}); |
|
|
}); |
|
|
|
|
|
|
|
|
switch (params.type) { |
|
|
|
|
|
case Kanjidic.SOURCE_ID: { |
|
|
|
|
|
const jsonlParser = createXmlToJsonLines({ |
|
|
|
|
|
entryTagName: 'character', |
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
downloader |
|
|
|
|
|
.pipe(jsonlParser) |
|
|
|
|
|
.pipe(out); |
|
|
|
|
|
} return; |
|
|
|
|
|
case JMnedict.SOURCE_ID: |
|
|
|
|
|
case JMdict.SOURCE_ID: { |
|
|
|
|
|
const jsonlParser = createXmlToJsonLines({ |
|
|
|
|
|
entryTagName: 'entry', |
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
downloader |
|
|
|
|
|
.pipe(jsonlParser) |
|
|
|
|
|
.pipe(out); |
|
|
|
|
|
} return; |
|
|
|
|
|
case KRadFile.SOURCE_ID: |
|
|
|
|
|
case RadKFile.SOURCE_ID: |
|
|
|
|
|
downloader.pipe(out); |
|
|
|
|
|
return; |
|
|
|
|
|
default: |
|
|
|
|
|
break; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
this.commitDatasetMetadata() |
|
|
|
|
|
.then(() => { |
|
|
|
|
|
reject(new Error(`Unknown dataset: ${params.type as unknown as string}`)); |
|
|
|
|
|
}) |
|
|
|
|
|
.catch(reject); |
|
|
|
|
|
|
|
|
downloader |
|
|
|
|
|
.pipe(parserStream) |
|
|
|
|
|
.pipe(out); |
|
|
}); |
|
|
}); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |