toCleanHtml static method

String toCleanHtml(
  1. String html
)

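Cleans HTML while preserving its basic structure: removes script, style, nav, header, footer, aside, and menu elements and HTML comments, and strips all attributes except href on links and src/alt on images.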

Implementation

/// Cleans [html] while preserving its basic element structure.
///
/// The cleaning is regex-based and runs in three passes:
///
/// 1. `script`, `style`, `nav`, `header`, `footer`, `aside` and `menu`
///    elements are removed together with their content.
/// 2. HTML comments are removed.
/// 3. Every opening tag is rewritten without attributes, except that
///    `<a>` keeps its `href` and `<img>` keeps its `src` and/or `alt`
///    (whichever are present, in any order).
///
/// Closing tags and text content are left untouched. Returns the cleaned
/// markup; an empty input yields an empty string.
///
/// Because this does not use a real HTML parser, attribute values that
/// contain a literal `>` are not handled.
static String toCleanHtml(String html) {
  // Remove unwanted elements. The lookahead after the tag name prevents
  // `<menuitem>` / `<navigation>` from being mistaken for `<menu>` / `<nav>`.
  String cleaned = html.replaceAll(RegExp(
    r'<(script|style|nav|header|footer|aside|menu)(?=[\s/>])[^>]*>.*?</\1\s*>',
    caseSensitive: false, dotAll: true
  ), '');

  // Remove comments
  cleaned = cleaned.replaceAll(RegExp(r'<!--.*?-->', dotAll: true), '');

  // Attribute extractors are built once rather than once per matched tag.
  // The backreference \1 forces the closing quote to equal the opening one,
  // so a value such as href="it's" is captured in full. The leading \s keeps
  // `data-href` / `data-src` from being picked up as `href` / `src`.
  final hrefPattern = RegExp(r'\shref\s*=\s*(["\x27])(.*?)\1',
      caseSensitive: false, dotAll: true);
  final srcPattern = RegExp(r'\ssrc\s*=\s*(["\x27])(.*?)\1',
      caseSensitive: false, dotAll: true);
  final altPattern = RegExp(r'\salt\s*=\s*(["\x27])(.*?)\1',
      caseSensitive: false, dotAll: true);

  // Values are always re-emitted inside double quotes, so a double quote
  // taken from a single-quoted original value must be escaped.
  String escapeQuotes(String value) => value.replaceAll('"', '&quot;');

  // Clean up attributes but keep essential ones. The tag-name group accepts
  // digits and hyphens so that `<h1 ...>` stays `<h1>` (not `<h>`) and
  // custom elements keep their full name, matching their closing tags.
  cleaned = cleaned.replaceAllMapped(
    RegExp(r'<([a-z][a-z0-9-]*)[^>]*>', caseSensitive: false),
    (match) {
      final element = match.group(0)!;
      final tag = match.group(1)!.toLowerCase();
      switch (tag) {
        case 'a':
          // Keep href attribute for links
          final href = hrefPattern.firstMatch(element)?.group(2);
          return href != null ? '<a href="${escapeQuotes(href)}">' : '<a>';
        case 'img':
          // Keep src and alt for images; each is looked up independently so
          // an image with only one of them (or alt before src) keeps it.
          final src = srcPattern.firstMatch(element)?.group(2);
          final alt = altPattern.firstMatch(element)?.group(2);
          final buffer = StringBuffer('<img');
          if (src != null) {
            buffer.write(' src="${escapeQuotes(src)}"');
          }
          if (alt != null) {
            buffer.write(' alt="${escapeQuotes(alt)}"');
          }
          buffer.write('>');
          return buffer.toString();
        default:
          return '<$tag>';
      }
    }
  );

  return cleaned;
}