feat: add parsing script

2025-12-28 22:32:32 +07:00 · 2025-12-28 22:32:32 +07:00 · 8f8c27965e
parent 47072a5a04
commit 8f8c27965e
1 changed file with 243 additions and 0 deletions
--- a/scripts/html-reddit-to-markdown.js
+++ b/scripts/html-reddit-to-markdown.js
@@ -0,0 +1,243 @@
+#!/usr/bin/env node
+
+/**
+ * HTML Reddit to Markdown Converter
+ *
+ * Парсит HTML файлы Reddit постов и конвертирует их в Markdown.
+ * Поддерживает извлечение основного контента и разделение на секции.
+ *
+ * Usage:
+ *   node scripts/html-reddit-to-markdown.js \
+ *     --input 0-inbox/post.htm \
+ *     --start 249 \
+ *     --end 1219 \
+ *     --sections 6 \
+ *     --output /tmp/sections.json
+ */
+
+const fs = require('fs');
+const path = require('path');
+const { program } = require('commander');
+const cheerio = require('cheerio');
+const TurndownService = require('turndown');
+
+// Настройка CLI
+program
+  .option('-i, --input <file>', 'Input HTML file')
+  .option('-s, --start <line>', 'Start line number', parseInt)
+  .option('-e, --end <line>', 'End line number', parseInt)
+  .option('-n, --sections <number>', 'Number of sections to split into', parseInt, 6)
+  .option('-o, --output <file>', 'Output JSON file')
+  .option('--markdown-only', 'Output single markdown file instead of JSON')
+  .parse(process.argv);
+
+const options = program.opts();
+
+// Валидация параметров
+if (!options.input) {
+  console.error('Error: --input is required');
+  process.exit(1);
+}
+
+/**
+ * Извлекает строки из файла
+ */
+function extractLines(filePath, startLine, endLine) {
+  const content = fs.readFileSync(filePath, 'utf-8');
+  const lines = content.split('\n');
+
+  if (startLine && endLine) {
+    return lines.slice(startLine - 1, endLine).join('\n');
+  }
+
+  return content;
+}
+
+/**
+ * Парсит Reddit HTML и извлекает основной контент
+ */
+function parseRedditPost(html) {
+  const $ = cheerio.load(html);
+
+  // Найти основной контейнер с постом
+  // Reddit использует id формата "t3_xxxxx-post-rtjson-content"
+  const postContent = $('[id$="-post-rtjson-content"]');
+
+  if (postContent.length === 0) {
+    // Попробовать альтернативный селектор
+    const altContent = $('.md[property="schema:articleBody"]');
+    if (altContent.length > 0) {
+      return altContent.html();
+    }
+
+    throw new Error('Could not find post content container');
+  }
+
+  return postContent.html();
+}
+
+/**
+ * Конвертирует HTML в Markdown
+ */
+function convertToMarkdown(html) {
+  const turndownService = new TurndownService({
+    headingStyle: 'atx',
+    codeBlockStyle: 'fenced',
+    fence: '```',
+    emDelimiter: '*',
+    strongDelimiter: '**',
+    linkStyle: 'inlined'
+  });
+
+  // Кастомное правило для inline code
+  turndownService.addRule('inlineCode', {
+    filter: function (node) {
+      return node.nodeName === 'CODE' && node.parentNode.nodeName !== 'PRE';
+    },
+    replacement: function (content) {
+      return '`' + content + '`';
+    }
+  });
+
+  // Кастомное правило для code blocks
+  turndownService.addRule('codeBlock', {
+    filter: function (node) {
+      return node.nodeName === 'PRE';
+    },
+    replacement: function (content, node) {
+      const code = node.querySelector('code');
+      if (code) {
+        return '\n```\n' + code.textContent + '\n```\n';
+      }
+      return '\n```\n' + node.textContent + '\n```\n';
+    }
+  });
+
+  const markdown = turndownService.turndown(html);
+
+  // Cleanup: удалить лишние HTML комментарии и артефакты
+  return markdown
+    .replace(/<!--\?lit\$[^>]*-->/g, '')
+    .replace(/<!--\?-->/g, '')
+    .replace(/\n{3,}/g, '\n\n') // Убрать множественные переносы строк
+    .trim();
+}
+
+/**
+ * Разделяет markdown на секции по заголовкам H1
+ */
+function splitIntoSections(markdown, numSections) {
+  // Разделить по заголовкам H1
+  const h1Pattern = /^# .+$/gm;
+  const headers = [];
+  let match;
+
+  while ((match = h1Pattern.exec(markdown)) !== null) {
+    headers.push({
+      text: match[0],
+      index: match.index
+    });
+  }
+
+  if (headers.length === 0) {
+    return [{ number: 1, title: 'Full Content', content: markdown }];
+  }
+
+  // Если запрошено больше секций чем заголовков, использовать количество заголовков
+  const actualSections = Math.min(numSections, headers.length);
+  const headersPerSection = Math.ceil(headers.length / actualSections);
+
+  const sections = [];
+
+  for (let i = 0; i < actualSections; i++) {
+    const startHeaderIdx = i * headersPerSection;
+    const endHeaderIdx = Math.min((i + 1) * headersPerSection, headers.length);
+
+    const startPos = headers[startHeaderIdx].index;
+    const endPos = endHeaderIdx < headers.length
+      ? headers[endHeaderIdx].index
+      : markdown.length;
+
+    const sectionContent = markdown.substring(startPos, endPos).trim();
+    const firstHeader = sectionContent.match(/^# (.+)$/m);
+
+    sections.push({
+      number: i + 1,
+      title: firstHeader ? firstHeader[1] : `Section ${i + 1}`,
+      headerCount: endHeaderIdx - startHeaderIdx,
+      content: sectionContent
+    });
+  }
+
+  return sections;
+}
+
+/**
+ * Основная функция
+ */
+async function main() {
+  try {
+    console.log('🔍 Reading HTML file:', options.input);
+
+    // Извлечь нужные строки
+    const html = extractLines(
+      options.input,
+      options.start,
+      options.end
+    );
+
+    console.log('📝 Parsing Reddit HTML...');
+    const postHtml = parseRedditPost(html);
+
+    console.log('🔄 Converting to Markdown...');
+    const markdown = convertToMarkdown(postHtml);
+
+    console.log('✂️  Splitting into sections...');
+    const sections = splitIntoSections(markdown, options.sections);
+
+    console.log(`✅ Created ${sections.length} sections:`);
+    sections.forEach(s => {
+      console.log(`   Section ${s.number}: "${s.title}" (${s.headerCount} headers)`);
+    });
+
+    // Вывод результата
+    if (options.markdownOnly) {
+      const outputPath = options.output || '/tmp/output.md';
+      fs.writeFileSync(outputPath, markdown, 'utf-8');
+      console.log(`\n💾 Saved markdown to: ${outputPath}`);
+    } else {
+      const result = {
+        metadata: {
+          inputFile: options.input,
+          totalSections: sections.length,
+          extractedLines: options.start && options.end
+            ? `${options.start}-${options.end}`
+            : 'all',
+          generatedAt: new Date().toISOString()
+        },
+        sections: sections
+      };
+
+      if (options.output) {
+        fs.writeFileSync(
+          options.output,
+          JSON.stringify(result, null, 2),
+          'utf-8'
+        );
+        console.log(`\n💾 Saved JSON to: ${options.output}`);
+      } else {
+        console.log('\n📄 JSON Output:');
+        console.log(JSON.stringify(result, null, 2));
+      }
+    }
+
+    console.log('\n✨ Done!');
+
+  } catch (error) {
+    console.error('❌ Error:', error.message);
+    process.exit(1);
+  }
+}
+
+// Entry point: run the CLI.
+main();