banatie-content/scripts/html-reddit-to-markdown.js

#!/usr/bin/env node

/**
 * HTML Reddit to Markdown Converter
 *
 * Парсит HTML файлы Reddit постов и конвертирует их в Markdown.
 * Автоматически находит контент поста и конвертирует.
 *
 * Usage:
 *   # Простая конвертация в Markdown
 *   node scripts/html-reddit-to-markdown.js source.html output.md
 *
 *   # Конвертация с разделением на секции (JSON)
 *   node scripts/html-reddit-to-markdown.js source.html output.json
 *
 *   # Продвинутое использование (извлечь определенные строки)
 *   node scripts/html-reddit-to-markdown.js source.html output.md --start 249 --end 1219
 *
 * Формат определяется автоматически по расширению выходного файла:
 *   .md  → чистый Markdown
 *   .json → JSON с секциями и метаданными
 */

const fs = require('fs');
const path = require('path');
const { program } = require('commander');
const cheerio = require('cheerio');
const TurndownService = require('turndown');

// CLI setup

// Fallback output path, used when the optional [output] operand is omitted.
const DEFAULT_OUTPUT_FILE = '/tmp/output.md';

// Commander calls option parsers as (value, previousValue). Passing the bare
// `parseInt` would therefore use the previous/default value as the radix
// (e.g. `--sections 8` with default 6 became parseInt('8', 6) === NaN),
// so wrap it and force base 10.
const parseDecimalInt = (value) => Number.parseInt(value, 10);

program
  .argument('<input>', 'Input HTML file')
  .argument('[output]', 'Output file (.md or .json)', DEFAULT_OUTPUT_FILE)
  .option('-s, --sections <number>', 'Number of sections to split into (only for JSON output)', parseDecimalInt, 6)
  .option('--start <line>', 'Start line number (advanced)', parseDecimalInt)
  .option('--end <line>', 'End line number (advanced)', parseDecimalInt)
  .parse(process.argv);

// `program.args` lists the operands as typed on the command line, so the
// default is applied here to guarantee `outputFile` is always defined.
const [inputFile, outputFile = DEFAULT_OUTPUT_FILE] = program.args;
const options = program.opts();

/**
 * Reads a file and optionally narrows it to a 1-based, inclusive line range.
 *
 * @param {string} filePath - Path of the file to read (decoded as UTF-8).
 * @param {number} [startLine] - First line to keep (1-based).
 * @param {number} [endLine] - Last line to keep (inclusive).
 * @returns {string} The selected lines joined with '\n', or the whole file
 *   content when either bound is missing or falsy.
 */
function extractLines(filePath, startLine, endLine) {
  const text = fs.readFileSync(filePath, 'utf-8');

  // Both bounds are required; with only one (or neither) the file is returned untouched.
  const hasRange = Boolean(startLine && endLine);
  if (!hasRange) {
    return text;
  }

  return text
    .split('\n')
    .slice(startLine - 1, endLine)
    .join('\n');
}

/**
 * Locates the post body inside a Reddit HTML page and returns its inner HTML.
 *
 * Selectors are tried in order and the first one that matches wins:
 *   1. an element whose id starts with "t3_" and ends with
 *      "-post-rtjson-content" (ids look like "t3_xxxxx-post-rtjson-content");
 *   2. the alternative `.md[property="schema:articleBody"]` container.
 *
 * @param {string} html - HTML source of the page (or a fragment of it).
 * @returns {string} Inner HTML of the matched container.
 * @throws {Error} When neither selector matches anything.
 */
function parseRedditPost(html) {
  const $ = cheerio.load(html);

  const containerSelectors = [
    '[id^="t3_"][id$="-post-rtjson-content"]',
    '.md[property="schema:articleBody"]'
  ];

  for (const selector of containerSelectors) {
    const matched = $(selector);
    if (matched.length > 0) {
      return matched.html();
    }
  }

  throw new Error('Could not find post content container');
}

/**
 * Returns the length of the longest run of consecutive backticks in `text`
 * (0 when there are none).
 */
function longestBacktickRun(text) {
  const runs = text.match(/`+/g) || [];
  return runs.reduce((longest, run) => Math.max(longest, run.length), 0);
}

/**
 * Collapses runs of empty lines to a single empty line, but leaves the
 * contents of backtick-fenced code blocks untouched so code is never altered.
 */
function collapseBlankLinesOutsideCode(markdown) {
  const kept = [];
  let openFence = null; // backtick run that opened the current code block
  let previousBlank = false;

  for (const line of markdown.split('\n')) {
    // A fence may be indented or quoted when the code block sits in a list/blockquote.
    const fence = /^[ \t>]*(`{3,})([\s\S]*)$/.exec(line);

    if (openFence !== null) {
      kept.push(line);
      // A closing fence is at least as long as the opener and carries no other text.
      if (fence && fence[1].length >= openFence.length && fence[2].trim() === '') {
        openFence = null;
        previousBlank = false;
      }
      continue;
    }

    // Backticks after the run mean inline code (e.g. ```a``b```), not a fence.
    if (fence && !fence[2].includes('`')) {
      openFence = fence[1];
    }

    const isBlank = line === '';
    if (isBlank && previousBlank) {
      continue;
    }
    kept.push(line);
    previousBlank = isBlank;
  }

  return kept.join('\n');
}

/**
 * Converts an HTML fragment to Markdown with Turndown.
 *
 * Custom rules render inline `<code>` and `<pre>` blocks; their delimiters are
 * sized from the code itself so backticks inside the code cannot terminate the
 * span/fence early. After conversion, lit-html marker comments are stripped and
 * repeated blank lines are collapsed everywhere except inside fenced code.
 *
 * @param {string} html - HTML fragment to convert.
 * @returns {string} Trimmed Markdown.
 */
function convertToMarkdown(html) {
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    fence: '```',
    emDelimiter: '*',
    strongDelimiter: '**',
    linkStyle: 'inlined'
  });

  // Inline code: any <code> that is not a direct child of <pre>
  turndownService.addRule('inlineCode', {
    filter: function (node) {
      return node.nodeName === 'CODE' &&
        !(node.parentNode && node.parentNode.nodeName === 'PRE');
    },
    replacement: function (content) {
      if (!content) {
        return '';
      }
      // The delimiter must be longer than any backtick run inside the span.
      const delimiter = '`'.repeat(longestBacktickRun(content) + 1);
      // Pad when the content touches the delimiter with a backtick of its own.
      const pad = /^`|`$/.test(content) ? ' ' : '';
      return delimiter + pad + content + pad + delimiter;
    }
  });

  // Code blocks: <pre>, preferring the text of a nested <code> when present
  turndownService.addRule('codeBlock', {
    filter: function (node) {
      return node.nodeName === 'PRE';
    },
    replacement: function (content, node) {
      const codeNode = node.querySelector('code') || node;
      // Drop a single trailing newline so no empty line precedes the closing fence.
      const code = codeNode.textContent.replace(/\n$/, '');
      // The fence must be longer than any backtick run inside the block.
      const fence = '`'.repeat(Math.max(3, longestBacktickRun(code) + 1));
      return '\n' + fence + '\n' + code + '\n' + fence + '\n';
    }
  });

  // Cleanup: remove leftover lit-html marker comments
  const markdown = turndownService.turndown(html)
    .replace(/<!--\?lit\$[^>]*-->/g, '')
    .replace(/<!--\?-->/g, '');

  return collapseBlankLinesOutsideCode(markdown).trim();
}

/**
 * Finds H1 headers ("# Title" at the start of a line) together with their
 * character offsets, skipping lines inside fenced code blocks so that
 * "# comment" lines in code are not mistaken for headers.
 */
function findH1Headers(markdown) {
  const headers = [];
  let openFence = null; // marker (``` or ~~~ run) of the code block being skipped
  let offset = 0;

  for (const line of markdown.split('\n')) {
    const fence = /^[ \t>]*(`{3,}|~{3,})([\s\S]*)$/.exec(line);

    if (openFence !== null) {
      // Closing fence: same character, at least as long, nothing else on the line.
      if (
        fence &&
        fence[1][0] === openFence[0] &&
        fence[1].length >= openFence.length &&
        fence[2].trim() === ''
      ) {
        openFence = null;
      }
    } else if (fence && !(fence[1][0] === '`' && fence[2].includes('`'))) {
      // Backticks after a backtick run mean inline code, not a fence opener.
      openFence = fence[1];
    } else {
      const header = /^# .+/.exec(line);
      if (header) {
        headers.push({ text: header[0], index: offset });
      }
    }

    offset += line.length + 1; // +1 for the '\n' removed by split
  }

  return headers;
}

/**
 * Splits Markdown into sections along H1 headers.
 *
 * Headers are grouped in document order, ceil(headers / sections) per section,
 * so the result may contain fewer sections than requested. Text that precedes
 * the first H1 is kept at the start of the first section.
 *
 * @param {string} markdown - Markdown document to split.
 * @param {number} numSections - Desired number of sections; values that are
 *   not a finite number >= 1 are treated as 1.
 * @returns {Array<{number: number, title: string, headerCount: number, content: string}>}
 *   Sections in order. Without any H1 a single "Full Content" section holding
 *   the whole document is returned.
 */
function splitIntoSections(markdown, numSections) {
  const headers = findH1Headers(markdown);

  if (headers.length === 0) {
    return [{ number: 1, title: 'Full Content', headerCount: 0, content: markdown }];
  }

  const requested = Number.isFinite(numSections) && numSections >= 1
    ? Math.floor(numSections)
    : 1;

  // Never ask for more sections than there are headers
  const headersPerSection = Math.ceil(headers.length / Math.min(requested, headers.length));
  // Rounding up can use up the headers early (7 headers / 6 sections -> 2 per
  // section -> only 4 sections), so derive the real count from the chunk size.
  const sectionCount = Math.ceil(headers.length / headersPerSection);

  const sections = [];

  for (let i = 0; i < sectionCount; i++) {
    const startHeaderIdx = i * headersPerSection;
    const endHeaderIdx = Math.min(startHeaderIdx + headersPerSection, headers.length);

    // The first section starts at 0 so any preamble before the first H1 is kept.
    const startPos = i === 0 ? 0 : headers[startHeaderIdx].index;
    const endPos = endHeaderIdx < headers.length
      ? headers[endHeaderIdx].index
      : markdown.length;

    sections.push({
      number: i + 1,
      title: headers[startHeaderIdx].text.slice(2), // drop the leading "# "
      headerCount: endHeaderIdx - startHeaderIdx,
      content: markdown.substring(startPos, endPos).trim()
    });
  }

  return sections;
}

/**
 * Script entry point: reads the input HTML, extracts the Reddit post body,
 * converts it to Markdown and writes the result to `outputFile`.
 *
 * An output path ending in ".md" (case-insensitive) receives plain Markdown;
 * any other path receives JSON with metadata, the full Markdown and the
 * Markdown split into sections.
 *
 * Every failure is reported on stderr and ends the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    console.log('🔍 Reading HTML file:', inputFile);

    // Restrict the input to the requested lines (when --start and --end are given)
    const html = extractLines(
      inputFile,
      options.start,
      options.end
    );

    console.log('📝 Parsing Reddit HTML...');
    const postHtml = parseRedditPost(html);

    console.log('🔄 Converting to Markdown...');
    const markdown = convertToMarkdown(postHtml);

    // Pick the output format from the file extension (".MD" counts as Markdown too)
    const isMarkdownOutput = outputFile.toLowerCase().endsWith('.md');

    if (isMarkdownOutput) {
      // Plain Markdown output
      fs.writeFileSync(outputFile, markdown, 'utf-8');
      // Report bytes as written: string length undercounts non-ASCII text.
      const sizeKb = Buffer.byteLength(markdown, 'utf-8') / 1024;
      console.log(`\n✅ Markdown saved to: ${outputFile}`);
      console.log(`📊 Size: ${sizeKb.toFixed(1)} KB`);
    } else {
      // JSON output with sections
      console.log('✂️  Splitting into sections...');
      const sections = splitIntoSections(markdown, options.sections);

      console.log(`✅ Created ${sections.length} sections:`);
      sections.forEach(s => {
        // A section may come without headerCount; print 0 rather than "undefined".
        const headerCount = typeof s.headerCount === 'number' ? s.headerCount : 0;
        console.log(`   Section ${s.number}: "${s.title}" (${headerCount} headers)`);
      });

      const result = {
        metadata: {
          inputFile: inputFile,
          totalSections: sections.length,
          extractedLines: options.start && options.end
            ? `${options.start}-${options.end}`
            : 'auto-detected',
          generatedAt: new Date().toISOString()
        },
        fullMarkdown: markdown,
        sections: sections
      };

      fs.writeFileSync(
        outputFile,
        JSON.stringify(result, null, 2),
        'utf-8'
      );
      console.log(`\n✅ JSON saved to: ${outputFile}`);
    }

    console.log('\n✨ Done!');

  } catch (error) {
    console.error('❌ Error:', error.message);
    console.error('\nStack trace:', error.stack);
    process.exit(1);
  }
}

// Run the script
main();