scrape static method

Future<List<String>> scrape(
  String url, {
  dynamic parseHead = true,
  dynamic parseBody = true,
  dynamic verifyCandidates = true,
})

Returns the feed URLs found on the web page at `url`.

Implementation

/// Returns the feed URLs discovered on the web page at [url].
///
/// The page is downloaded and parsed as HTML. When [parseHead] is true,
/// `link[rel='alternate']` elements whose `type` contains `rss` or `xml`
/// contribute their `href`. When [parseBody] is true, `a` elements whose
/// `href` contains `rss`, `xml` or `feed` contribute theirs, with a single
/// trailing `/` removed.
///
/// Every `href` is resolved against [url] (with its fragment removed), so
/// root-relative (`/feed`), path-relative (`feed.xml`) and protocol-relative
/// (`//host/feed`) links all become absolute. Links that do not resolve to an
/// `http` or `https` URL (for example `mailto:`) are dropped. Duplicates are
/// removed, keeping first-seen order.
///
/// When [verifyCandidates] is true, each candidate is requested and kept only
/// if the request completes without throwing; the response status is not
/// inspected. When it is false, the candidates are returned unverified.
///
/// Returns an empty list if the page cannot be fetched or parsed.
static Future<List<String>> scrape(String url,
    {parseHead = true, parseBody = true, verifyCandidates = true}) async {
  // Fetch and parse the page. This is deliberately best-effort: any failure
  // (malformed URL, network error, ...) means there is nothing to report.
  var document;
  try {
    final response = await http.get(Uri.parse(url));
    document = parse(response.body);
  } catch (e) {
    return <String>[];
  }

  // Relative references are resolved against the page URI rather than by
  // string concatenation, which keeps the port and handles every reference
  // form ('/path', 'path', '//host/path', absolute).
  final pageUri = Uri.parse(url).removeFragment();

  // A set literal is insertion-ordered, so it de-duplicates while keeping
  // the order in which candidates were found.
  final candidates = <String>{};

  // Resolves [href] against the page and records it as a candidate.
  void addCandidate(String href, {bool stripTrailingSlash = false}) {
    final trimmed = href.trim();
    if (trimmed.isEmpty) return;

    final reference = Uri.tryParse(trimmed);
    if (reference == null) return;

    final resolved = pageUri.resolveUri(reference);
    // Only HTTP(S) URLs can be fetched as feeds; skip mailto:, javascript:...
    if (resolved.scheme != 'http' && resolved.scheme != 'https') return;

    var candidate = resolved.toString();
    if (stripTrailingSlash && candidate.endsWith('/')) {
      // Drop exactly one character: the trailing slash itself.
      candidate = candidate.substring(0, candidate.length - 1);
    }
    candidates.add(candidate);
  }

  // Look for feed candidates in head
  if (parseHead) {
    for (var link in document.querySelectorAll("link[rel='alternate']")) {
      final type = link.attributes['type'];
      final href = link.attributes['href'];
      if (type == null || href == null) continue;

      if (type.contains('rss') || type.contains('xml')) {
        addCandidate(href);
      }
    }
  }

  // Look for feed candidates in body
  if (parseBody) {
    for (var anchor in document.querySelectorAll('a')) {
      final href = anchor.attributes['href'];
      if (href == null) continue;

      if (href.contains('rss') ||
          href.contains('xml') ||
          href.contains('feed')) {
        addCandidate(href, stripTrailingSlash: true);
      }
    }
  }

  // Verify candidates: keep those whose request completes without throwing.
  if (verifyCandidates) {
    final results = <String>[];
    for (final candidate in candidates) {
      try {
        await http.get(Uri.parse(candidate));
      } catch (e) {
        continue;
      }

      results.add(candidate);
    }
    return results;
  }

  // Verification disabled: hand back every candidate as found.
  return candidates.toList();
}