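feat: update html-reddit-to-markdown script to take positional input/output arguments and pick the output format from the file extension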

2025-12-28 23:02:09 +07:00 · 2025-12-28 23:02:09 +07:00 · d82393a47f
parent ab07646ef2
commit d82393a47f
1 changed file with 50 additions and 48 deletions
--- a/scripts/html-reddit-to-markdown.js
+++ b/scripts/html-reddit-to-markdown.js
@ -4,15 +4,21 @@
 * HTML Reddit to Markdown Converter
 *
 * Парсит HTML файлы Reddit постов и конвертирует их в Markdown.
- * Поддерживает извлечение основного контента и разделение на секции.
+ * Автоматически находит контент поста и конвертирует.
 *
 * Usage:
- *   node scripts/html-reddit-to-markdown.js \
+ *   # Простая конвертация в Markdown
- *     --input 0-inbox/post.htm \
+ *   node scripts/html-reddit-to-markdown.js source.html output.md
- *     --start 249 \
+ *
- *     --end 1219 \
+ *   # Конвертация с разделением на секции (JSON)
- *     --sections 6 \
+ *   node scripts/html-reddit-to-markdown.js source.html output.json
- *     --output /tmp/sections.json
+ *
 *   # Продвинутое использование (извлечь определенные строки)
 *   node scripts/html-reddit-to-markdown.js source.html output.md --start 249 --end 1219
 *
 * Формат определяется автоматически по расширению выходного файла:
 *   .md  → чистый Markdown
 *   .json → JSON с секциями и метаданными
 */
 const fs = require('fs');
@ -23,22 +29,16 @@ const TurndownService = require('turndown');
 // Настройка CLI
 program
-  .option('-i, --input <file>', 'Input HTML file')
+  .argument('<input>', 'Input HTML file')
-  .option('-s, --start <line>', 'Start line number', parseInt)
+  .argument('[output]', 'Output file (.md or .json)', '/tmp/output.md')
-  .option('-e, --end <line>', 'End line number', parseInt)
+  .option('-s, --sections <number>', 'Number of sections to split into (only for JSON output)', parseInt, 6)
-  .option('-n, --sections <number>', 'Number of sections to split into', parseInt, 6)
+  .option('--start <line>', 'Start line number (advanced)', parseInt)
-  .option('-o, --output <file>', 'Output JSON file')
+  .option('--end <line>', 'End line number (advanced)', parseInt)
  .option('--markdown-only', 'Output single markdown file instead of JSON')
  .parse(process.argv);
 const [inputFile, outputFile] = program.args;
 const options = program.opts();
 // Валидация параметров
 if (!options.input) {
  console.error('Error: --input is required');
  process.exit(1);
 }
 /**
 * Извлекает строки из файла
 */
@ -61,7 +61,8 @@ function parseRedditPost(html) {
  // Найти основной контейнер с постом
  // Reddit использует id формата "t3_xxxxx-post-rtjson-content"
-  const postContent = $('[id$="-post-rtjson-content"]');
+  // Ищем элемент, который НАЧИНАЕТСЯ с "t3_" И заканчивается на "-post-rtjson-content"
  const postContent = $('[id^="t3_"][id$="-post-rtjson-content"]');
  if (postContent.length === 0) {
    // Попробовать альтернативный селектор
@ -177,11 +178,11 @@ function splitIntoSections(markdown, numSections) {
 */
 async function main() {
  try {
-    console.log('🔍 Reading HTML file:', options.input);
+    console.log('🔍 Reading HTML file:', inputFile);
-    // Извлечь нужные строки
+    // Извлечь нужные строки (если указаны --start и --end)
    const html = extractLines(
-      options.input,
+      inputFile,
      options.start,
      options.end
    );
@ -192,6 +193,16 @@ async function main() {
    console.log('🔄 Converting to Markdown...');
    const markdown = convertToMarkdown(postHtml);
    // Определить формат вывода по расширению файла
    const isMarkdownOutput = outputFile.endsWith('.md');
    if (isMarkdownOutput) {
      // Простой вывод в Markdown
      fs.writeFileSync(outputFile, markdown, 'utf-8');
      console.log(`\n✅ Markdown saved to: ${outputFile}`);
      console.log(`📊 Size: ${(markdown.length / 1024).toFixed(1)} KB`);
    } else {
      // Вывод в JSON с секциями
      console.log('✂️  Splitting into sections...');
      const sections = splitIntoSections(markdown, options.sections);
@ -200,41 +211,32 @@ async function main() {
        console.log(`   Section ${s.number}: "${s.title}" (${s.headerCount} headers)`);
      });
    // Вывод результата
    if (options.markdownOnly) {
      const outputPath = options.output || '/tmp/output.md';
      fs.writeFileSync(outputPath, markdown, 'utf-8');
      console.log(`\n💾 Saved markdown to: ${outputPath}`);
    } else {
      const result = {
        metadata: {
-          inputFile: options.input,
+          inputFile: inputFile,
          totalSections: sections.length,
          extractedLines: options.start && options.end
            ? `${options.start}-${options.end}`
-            : 'all',
+            : 'auto-detected',
          generatedAt: new Date().toISOString()
        },
        fullMarkdown: markdown,
        sections: sections
      };
      if (options.output) {
      fs.writeFileSync(
-          options.output,
+        outputFile,
        JSON.stringify(result, null, 2),
        'utf-8'
      );
-        console.log(`\n💾 Saved JSON to: ${options.output}`);
+      console.log(`\n✅ JSON saved to: ${outputFile}`);
      } else {
        console.log('\n📄 JSON Output:');
        console.log(JSON.stringify(result, null, 2));
      }
    }
    console.log('\n✨ Done!');
  } catch (error) {
    console.error('❌ Error:', error.message);
    console.error('\nStack trace:', error.stack);
    process.exit(1);
  }
 }