htmlToMarkdown function
Convert HTML content to clean markdown.
Implementation
/// Converts [html] into Markdown text using an ordered series of
/// regular-expression passes.
///
/// Passes, in order: drop `<script>`, `<style>` and HTML comments; convert
/// headings (`h6` down to `h1`), links, bold/italic, fenced and inline code,
/// list items, line breaks, paragraphs, blockquotes and table rows; then
/// strip whatever tags remain, decode HTML entities and collapse runs of
/// three or more newlines into a single blank line.
///
/// Tag stripping and entity decoding are delegated to the `_stripTags` and
/// `_decodeHtmlEntities` helpers, which are defined outside this function.
///
/// Returns the trimmed Markdown string; an empty input yields an empty
/// string. This is a best-effort converter, not an HTML parser: nested or
/// malformed markup may not round-trip exactly.
String htmlToMarkdown(String html) {
  var text = html;

  // Remove scripts, styles and comments, including their contents.
  text = text.replaceAll(
    RegExp(r'<script[^>]*>[\s\S]*?</script>', caseSensitive: false),
    '',
  );
  text = text.replaceAll(
    RegExp(r'<style[^>]*>[\s\S]*?</style>', caseSensitive: false),
    '',
  );
  text = text.replaceAll(RegExp(r'<!--[\s\S]*?-->'), '');

  // Convert headers: <hN> becomes N leading '#' characters.
  for (var i = 6; i >= 1; i--) {
    text = text.replaceAllMapped(
      RegExp('<h$i[^>]*>(.*?)</h$i>', caseSensitive: false, dotAll: true),
      (m) => '\n${'#' * i} ${_stripTags(m.group(1) ?? '')}\n',
    );
  }

  // Convert links. The href value may be double- or single-quoted; exactly
  // one of groups 1 and 2 participates in a match, group 3 is the link text.
  text = text.replaceAllMapped(
    RegExp(
      r'''<a\s+[^>]*href\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)</a>''',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) {
      final url = m.group(1) ?? m.group(2) ?? '';
      return '[${_stripTags(m.group(3) ?? '')}]($url)';
    },
  );

  // Convert bold/italic. The `\b` after the tag name keeps `<b` from also
  // matching `<br>`, `<body>` or `<blockquote>`, and `<i`/`<em` from
  // matching `<img>`/`<embed>`.
  text = text.replaceAllMapped(
    RegExp(
      r'<(strong|b)\b[^>]*>(.*?)</\1>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) => '**${m.group(2)}**',
  );
  text = text.replaceAllMapped(
    RegExp(r'<(em|i)\b[^>]*>(.*?)</\1>', caseSensitive: false, dotAll: true),
    (m) => '*${m.group(2)}*',
  );

  // Convert code blocks. Entities are deliberately left encoded here: the
  // final passes strip tags and then decode entities once for the whole
  // document, so escaped markup in a code sample (e.g. `&lt;div&gt;`) is not
  // exposed to the later tag-matching passes and is not decoded twice.
  text = text.replaceAllMapped(
    RegExp(
      r'<pre[^>]*><code[^>]*>(.*?)</code></pre>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) => '\n```\n${m.group(1) ?? ''}\n```\n',
  );
  text = text.replaceAllMapped(
    RegExp(r'<code[^>]*>(.*?)</code>', caseSensitive: false, dotAll: true),
    (m) => '`${m.group(1)}`',
  );

  // Convert list items (`\b` avoids matching `<link>`).
  text = text.replaceAllMapped(
    RegExp(r'<li\b[^>]*>(.*?)</li>', caseSensitive: false, dotAll: true),
    (m) => '- ${_stripTags(m.group(1) ?? '').trim()}\n',
  );

  // Convert line breaks and paragraphs (`\b` avoids matching `<pre>`).
  text = text.replaceAll(RegExp(r'<br\s*/?\s*>', caseSensitive: false), '\n');
  text = text.replaceAllMapped(
    RegExp(r'<p\b[^>]*>(.*?)</p>', caseSensitive: false, dotAll: true),
    (m) => '\n${_stripTags(m.group(1) ?? '').trim()}\n',
  );

  // Convert blockquotes: every line of the quoted content gets a "> " prefix.
  text = text.replaceAllMapped(
    RegExp(
      r'<blockquote[^>]*>(.*?)</blockquote>',
      caseSensitive: false,
      dotAll: true,
    ),
    (m) {
      final content = _stripTags(m.group(1) ?? '').trim();
      return content.split('\n').map((l) => '> $l').join('\n');
    },
  );

  // Convert tables (basic): each <tr> becomes one pipe-delimited row. No
  // header separator row is emitted. `\b` avoids matching `<track>` and
  // `<thead>`.
  text = text.replaceAllMapped(
    RegExp(r'<tr\b[^>]*>(.*?)</tr>', caseSensitive: false, dotAll: true),
    (m) {
      final cells = RegExp(
        r'<t[dh]\b[^>]*>(.*?)</t[dh]>',
        caseSensitive: false,
        dotAll: true,
      )
          .allMatches(m.group(1) ?? '')
          .map((c) => _stripTags(c.group(1) ?? '').trim())
          .toList();
      return '| ${cells.join(' | ')} |\n';
    },
  );

  // Remove remaining HTML tags, then decode entities. Decoding must come
  // last so that escaped angle brackets are not treated as tags.
  text = _stripTags(text);
  text = _decodeHtmlEntities(text);

  // Clean up whitespace: at most one blank line in a row, no outer padding.
  text = text.replaceAll(RegExp(r'\n{3,}'), '\n\n');
  text = text.trim();
  return text;
}