Tools for learning Japanese.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

111 lines
2.9 KiB

  1. import {
  2. createDownloader,
  3. CreateDownloaderParams,
  4. Kanjidic,
  5. JMdict,
  6. JMnedict,
  7. KRadFile,
  8. RadKFile,
  9. createXmlToJsonLines,
  10. } from '@modal-sh/murasaki-core';
  11. import { createWriteStream, readFileSync } from 'fs';
  /**
   * Contract for the service that fetches a dataset.
   */
  export interface InitService {
    /**
     * Downloads the dataset described by `params`.
     *
     * @param params - Downloader options; passed through to the downloader factory.
     * @returns A promise for the manifest data (per-dataset metadata entries).
     */
    downloadDataset(params: CreateDownloaderParams): Promise<ManifestData>;
  }
  /**
   * Metadata kept in the manifest for a single dataset.
   */
  interface ManifestDatasetEntry {
    /** Timestamp recorded the first time the dataset was written (a `Date.now()` value). */
    createdAt: number;
    /** Timestamp recorded on the most recent write of the dataset (a `Date.now()` value). */
    lastUpdatedAt: number;
  }
  /**
   * Whole manifest: one {@link ManifestDatasetEntry} per dataset, keyed by the
   * dataset identifier (the `type` of the download parameters).
   */
  type ManifestData = Record<string, ManifestDatasetEntry>;
  /**
   * File-backed {@link InitService}.
   *
   * Streams a dataset into `<type>.jsonl` (relative to the current working
   * directory) and records per-dataset timestamps in the `.murasaki.json`
   * manifest, which is loaded once when the instance is constructed.
   */
  export class InitServiceImpl implements InitService {
    /** Stream most recently opened to write the manifest, if any. */
    private manifestWriteStream?: NodeJS.WritableStream;
    private readonly manifestFilename = '.murasaki.json' as const;
    private readonly manifestFileEncoding = 'utf-8' as const;
    /** In-memory copy of the manifest; mutated after each successful download. */
    private readonly data: ManifestData;

    constructor() {
      this.data = this.readManifest();
    }

    /**
     * Loads the manifest from disk.
     *
     * Best-effort by design: a missing, unreadable or malformed manifest yields
     * an empty one instead of failing construction. Valid JSON that is not a
     * plain object (e.g. `null` or an array) is also treated as empty so later
     * keyed reads and writes cannot crash.
     */
    private readManifest(): ManifestData {
      try {
        const raw = readFileSync(this.manifestFilename, this.manifestFileEncoding);
        const parsed: unknown = JSON.parse(raw);
        if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
          // NOTE(review): individual entries are not validated — confirm whether
          // a stricter schema check is wanted here.
          return parsed as ManifestData;
        }
      } catch {
        // Deliberately ignored: fall through to an empty manifest.
      }
      return {};
    }

    /**
     * Overwrites the manifest file with the current in-memory data.
     *
     * @returns A promise resolved with the manifest data once the file stream
     *   has finished, or rejected with the stream's error.
     */
    private commitDatasetMetadata(): Promise<ManifestData> {
      return new Promise<ManifestData>((resolve, reject) => {
        const stream = createWriteStream(this.manifestFilename, {
          flags: 'w',
          encoding: this.manifestFileEncoding,
        });
        this.manifestWriteStream = stream;
        stream.on('error', reject);
        stream.on('finish', () => {
          resolve(this.data);
        });
        stream.end(JSON.stringify(this.data));
      });
    }

    /**
     * Downloads the dataset identified by `params.type` into `<type>.jsonl`
     * and updates its manifest entry.
     *
     * Kanjidic, JMdict and JMnedict downloads are piped through the
     * XML-to-JSON-lines converter; KRadFile and RadKFile downloads are written
     * as received.
     *
     * @param params - Downloader options; `params.type` selects the dataset.
     * @returns The manifest data after the dataset's entry has been committed.
     * @throws Rejects with `Error("Unknown dataset: …")` for an unrecognised
     *   `params.type`, or with the underlying error when the download,
     *   conversion, output file or manifest write fails.
     */
    async downloadDataset(params: CreateDownloaderParams): Promise<ManifestData> {
      const downloader = await createDownloader(params);
      return new Promise<ManifestData>((resolve, reject) => {
        // Work out how to process the dataset BEFORE opening the output file,
        // so an unknown type does not leave an empty, never-closed `.jsonl`.
        let entryTagName: string | undefined;
        switch (params.type) {
          case Kanjidic.SOURCE_ID:
            entryTagName = 'character';
            break;
          case JMnedict.SOURCE_ID:
          case JMdict.SOURCE_ID:
            entryTagName = 'entry';
            break;
          case KRadFile.SOURCE_ID:
          case RadKFile.SOURCE_ID:
            // Written as received; no conversion step.
            break;
          default:
            // Kept from the original behaviour: the manifest is still
            // (re)written before the unknown type is reported.
            this.commitDatasetMetadata()
              .then(() => {
                reject(new Error(`Unknown dataset: ${String(params.type)}`));
              })
              .catch(reject);
            return;
        }

        const out = createWriteStream(`${params.type}.jsonl`);

        // `.pipe()` does not forward errors, so every stage needs its own
        // listener; otherwise a failure would leave this promise pending
        // forever and the output file handle open.
        const fail = (error: Error): void => {
          out.destroy();
          reject(error);
        };
        out.on('error', fail);
        // Assumes the downloader (and the parser below) are Node.js streams,
        // as their use of `.pipe()` suggests — TODO confirm.
        downloader.on('error', fail);

        out.on('finish', () => {
          const now = Date.now();
          // There is no entry yet on the first download of a dataset.
          const previous: ManifestDatasetEntry | undefined = this.data[params.type];
          this.data[params.type] = {
            ...previous,
            createdAt: previous?.createdAt ?? now,
            lastUpdatedAt: now,
          };
          this.commitDatasetMetadata()
            .then(resolve)
            .catch(reject);
        });

        if (entryTagName === undefined) {
          downloader.pipe(out);
          return;
        }
        const jsonlParser = createXmlToJsonLines({ entryTagName });
        jsonlParser.on('error', fail);
        downloader
          .pipe(jsonlParser)
          .pipe(out);
      });
    }
  }