htmlToMarkdown function

String htmlToMarkdown(String html)

Convert HTML content to clean markdown.

Implementation

/// Converts [html] to Markdown using a fixed sequence of regular-expression
/// passes.
///
/// Scripts, styles and comments are removed; headings, links, bold/italic,
/// inline and fenced code, list items, line breaks, paragraphs, blockquotes
/// and table rows are rewritten to Markdown; any remaining tags are removed
/// with `_stripTags` and entities are decoded with `_decodeHtmlEntities`.
///
/// This is a best-effort textual conversion, not an HTML parser: every
/// element is matched non-greedily from an opening tag to the first matching
/// closing tag, so same-type nested elements are not handled.
///
/// Returns the trimmed Markdown text, with runs of three or more newlines
/// collapsed to a single blank line outside fenced code blocks.
String htmlToMarkdown(String html) {
  var text = html;

  // Remove scripts, styles and comments together with their content.
  text = text.replaceAll(
    RegExp(r'<script[^>]*>[\s\S]*?</script>', caseSensitive: false),
    '',
  );
  text = text.replaceAll(
    RegExp(r'<style[^>]*>[\s\S]*?</style>', caseSensitive: false),
    '',
  );
  text = text.replaceAll(RegExp(r'<!--[\s\S]*?-->'), '');

  // Extract fenced code blocks first and park them behind NUL-delimited
  // placeholders. Otherwise the inline passes below would rewrite markup
  // inside the code, and the final tag-strip/entity-decode would run a second
  // time over already-decoded code (turning `&lt;div&gt;` into nothing).
  // NOTE(review): assumes `_stripTags` and `_decodeHtmlEntities` leave NUL
  // characters and digits untouched - confirm against their implementation.
  final codeBlocks = <String>[];
  text = text.replaceAllMapped(
    RegExp(
      r'<pre(?:\s[^>]*)?>\s*<code(?:\s[^>]*)?>(.*?)</code>\s*</pre>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) {
      // Drop real markup (e.g. syntax-highlighting spans) before decoding so
      // that decoded angle brackets survive as literal code.
      final code = _decodeHtmlEntities(_stripTags(m.group(1) ?? ''));
      codeBlocks.add('```\n$code\n```');
      return '\n\u0000${codeBlocks.length - 1}\u0000\n';
    },
  );

  // Every opening-tag pattern below uses `(?:\s[^>]*)?>` rather than
  // `[^>]*>` so that a tag name only matches when it is followed by
  // whitespace or `>`; `<b` must not match `<br>` or `<blockquote>`, `<i`
  // must not match `<img>`, `<p` must not match `<pre>`, and so on.

  // Convert headers.
  for (var i = 6; i >= 1; i--) {
    text = text.replaceAllMapped(
      RegExp(
        '<h$i(?:\\s[^>]*)?>(.*?)</h$i>',
        caseSensitive: false,
        dotAll: true,
      ),
      (m) => '\n${'#' * i} ${_stripTags(m.group(1) ?? '')}\n',
    );
  }

  // Convert links. The href value may be double- or single-quoted, and must
  // be preceded by whitespace so attributes such as `data-href` are skipped.
  text = text.replaceAllMapped(
    RegExp(
      r'''<a(?:\s[^>]*?)?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)</a>''',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) {
      final url = m.group(1) ?? m.group(2) ?? '';
      return '[${_stripTags(m.group(3) ?? '')}]($url)';
    },
  );

  // Convert bold/italic.
  text = text.replaceAllMapped(
    RegExp(
      r'<(strong|b)(?:\s[^>]*)?>(.*?)</\1>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) => '**${m.group(2)}**',
  );
  text = text.replaceAllMapped(
    RegExp(
      r'<(em|i)(?:\s[^>]*)?>(.*?)</\1>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) => '*${m.group(2)}*',
  );

  // Convert inline code (fenced blocks were already extracted above).
  text = text.replaceAllMapped(
    RegExp(
      r'<code(?:\s[^>]*)?>(.*?)</code>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) => '`${m.group(1)}`',
  );

  // Convert list items; ordered and unordered lists both become `- ` bullets.
  text = text.replaceAllMapped(
    RegExp(r'<li(?:\s[^>]*)?>(.*?)</li>', caseSensitive: false, dotAll: true),
    (m) => '- ${_stripTags(m.group(1) ?? '').trim()}\n',
  );

  // Convert line breaks and paragraphs.
  text = text.replaceAll(RegExp(r'<br\s*/?\s*>', caseSensitive: false), '\n');
  text = text.replaceAllMapped(
    RegExp(r'<p(?:\s[^>]*)?>(.*?)</p>', caseSensitive: false, dotAll: true),
    (m) => '\n${_stripTags(m.group(1) ?? '').trim()}\n',
  );

  // Convert blockquotes: every line of the quoted content gets a `> ` prefix.
  text = text.replaceAllMapped(
    RegExp(
      r'<blockquote(?:\s[^>]*)?>(.*?)</blockquote>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) {
      final content = _stripTags(m.group(1) ?? '').trim();
      return content.split('\n').map((line) => '> $line').join('\n');
    },
  );

  // Convert tables (basic): one pipe-delimited line per row, no header
  // separator row is generated.
  text = text.replaceAllMapped(
    RegExp(r'<tr(?:\s[^>]*)?>(.*?)</tr>', caseSensitive: false, dotAll: true),
    (m) {
      final cells = RegExp(
        r'<t[dh](?:\s[^>]*)?>(.*?)</t[dh]>',
        caseSensitive: false,
        dotAll: true,
      )
          .allMatches(m.group(1) ?? '')
          .map((c) => _stripTags(c.group(1) ?? '').trim())
          .toList();
      return '| ${cells.join(' | ')} |\n';
    },
  );

  // Remove remaining HTML tags.
  text = _stripTags(text);

  // Decode HTML entities.
  text = _decodeHtmlEntities(text);

  // Collapse excess blank lines before code is restored, so blank lines that
  // are part of a code sample are preserved.
  text = text.replaceAll(RegExp(r'\n{3,}'), '\n\n');

  // Restore the fenced code blocks. If a placeholder sits at the start of a
  // blockquote line, the `> ` prefix is repeated on every line of the block
  // so the code stays inside the quote.
  text = text.replaceAllMapped(
    RegExp(r'(^> )?\u0000(\d+)\u0000', multiLine: true),
    (m) {
      final index = int.tryParse(m.group(2) ?? '');
      // Not one of our placeholders (e.g. NUL bytes present in the input):
      // leave the text exactly as it was.
      if (index == null || index >= codeBlocks.length) {
        return m.group(0) ?? '';
      }
      final quote = m.group(1) ?? '';
      return codeBlocks[index]
          .split('\n')
          .map((line) => '$quote$line')
          .join('\n');
    },
  );

  return text.trim();
}