Tools for learning Japanese.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

106 lines
2.7 KiB

  1. import {
  2. createDownloader,
  3. CreateDownloaderParams,
  4. Kanjidic,
  5. JMdict,
  6. JMnedict,
  7. KRadFile,
  8. RadKFile,
  9. createXmlToJsonLines,
  10. } from '@modal-sh/murasaki-core';
  11. import { createWriteStream, readFileSync, writeFileSync } from 'fs';
  12. import { PassThrough } from 'stream';
  13. export interface InitService {
  14. downloadDataset(params: CreateDownloaderParams): Promise<ManifestData>;
  15. }
  16. interface ManifestDatasetEntry {
  17. createdAt: number;
  18. lastUpdatedAt: number;
  19. }
  20. type ManifestData = Record<string, ManifestDatasetEntry>;
  21. export class InitServiceImpl implements InitService {
  22. private readonly manifestFilename = '.murasaki.json' as const;
  23. private readonly manifestFileEncoding = 'utf-8' as const;
  24. private readonly data: ManifestData;
  25. constructor() {
  26. try {
  27. const dataBuffer = readFileSync(this.manifestFilename);
  28. const dataJsonString = dataBuffer.toString(this.manifestFileEncoding);
  29. this.data = JSON.parse(dataJsonString) as Record<string, ManifestDatasetEntry>;
  30. } catch {
  31. this.data = {};
  32. }
  33. }
  34. private commitDatasetMetadata(): ManifestData {
  35. writeFileSync(this.manifestFilename, JSON.stringify(this.data), { flag: 'w' });
  36. return this.data;
  37. }
  38. private static getParserStreams(type: CreateDownloaderParams['type']) {
  39. switch (type) {
  40. case Kanjidic.SOURCE_ID:
  41. return createXmlToJsonLines({
  42. entryTagName: 'character',
  43. });
  44. case JMnedict.SOURCE_ID:
  45. case JMdict.SOURCE_ID:
  46. return createXmlToJsonLines({
  47. entryTagName: 'entry',
  48. });
  49. default:
  50. break;
  51. }
  52. return new PassThrough();
  53. }
  54. async downloadDataset(params: CreateDownloaderParams): Promise<ManifestData> {
  55. const downloader = await createDownloader(params);
  56. return new Promise<ManifestData>((resolve, reject) => {
  57. const parserStream = InitServiceImpl.getParserStreams(params.type);
  58. const out = createWriteStream(`${params.type}.jsonl`);
  59. parserStream.on('error', (error) => {
  60. try {
  61. this.commitDatasetMetadata();
  62. } catch {
  63. // noop
  64. }
  65. reject(new Error(`Error when parsing dataset: ${params.type as unknown as string}`, { cause: error }));
  66. });
  67. out.on('finish', () => {
  68. const now = Date.now();
  69. this.data[params.type] = {
  70. ...(this.data[params.type] ?? {}),
  71. createdAt: this.data[params.type]?.createdAt ?? now,
  72. lastUpdatedAt: now,
  73. };
  74. try {
  75. this.commitDatasetMetadata();
  76. resolve(this.data);
  77. } catch {
  78. reject(new Error(`Error when committing metadata for dataset: ${params.type as unknown as string}`));
  79. }
  80. });
  81. downloader
  82. .pipe(parserStream)
  83. .pipe(out);
  84. });
  85. }
  86. }