detectCorruptionPatterns method - ImageTokenizer class - image_tokenizer library

detectCorruptionPatterns static method

bool detectCorruptionPatterns(

String response

)

Detects potential image corruption patterns in model responses

Implementation

static bool detectCorruptionPatterns(String response) {
  try {
    if (response.isEmpty) return false;

    // Patterns that indicate image corruption
    final corruptionPatterns = [
      RegExp(r'describe\.describe\.describe\.+'), // Infinite "describe" repetition
      RegExp(r'^[₹]{10,}'), // Rupee symbol repetition
      RegExp(r'\bph\b.*\bph\b.*\bph\b'), // Repeating "ph" pattern
      RegExp(r'^(.)\1{10,}'), // Any single character repeated 10+ times
      RegExp(r'\b\w+\.\w+\.\w+\.+'), // Word repetition with dots
      RegExp(r'\b[a-zA-Z]{1,2}\s+[a-zA-Z]{1,2}\s+[a-zA-Z]{1,2}\b'), // Short letter sequences as words
    ];

    for (final pattern in corruptionPatterns) {
      if (pattern.hasMatch(response)) {
        debugPrint('ImageTokenizer: Detected corruption pattern - ${pattern.pattern}');
        return true;
      }
    }

    // Check for excessive repetition of short sequences
    final words = response.split(RegExp(r'\s+'));
    if (words.length > 10) {
      final wordCounts = <String, int>{};
      for (final word in words) {
        if (word.length <= 3) { // Focus on short words that might be corrupted data
          wordCounts[word] = (wordCounts[word] ?? 0) + 1;
        }
      }

      // If any short word appears too frequently, it might be corruption
      for (final entry in wordCounts.entries) {
        if (entry.value > words.length * 0.3) { // More than 30% of words
          debugPrint('ImageTokenizer: Detected excessive repetition of "${entry.key}" (${entry.value} times)');
          return true;
        }
      }
    }

    return false;
  } catch (e) {
    debugPrint('ImageTokenizer: Error detecting corruption patterns - $e');
    return false;
  }
}