scrape static method
Future<List<String>> scrape(
  String url, {
  dynamic parseHead = true,
  dynamic parseBody = true,
  dynamic verifyCandidates = true,
})
Returns the URLs of the feeds found on the page at `url`.
Implementation
/// Returns the feed URLs discovered on the page at [url].
///
/// The page is downloaded and parsed as HTML. When [parseHead] is true,
/// `<link rel="alternate">` elements whose `type` mentions `rss` or `xml`
/// contribute their `href`. When [parseBody] is true, every `<a>` whose
/// `href` contains `rss`, `xml` or `feed` contributes as well. Hrefs are
/// resolved against [url], only `http`/`https` results are kept, and
/// duplicates are dropped while preserving discovery order.
///
/// When [verifyCandidates] is true, each candidate is requested and kept
/// only if the request completes without throwing (the HTTP status is not
/// inspected). Otherwise all candidates are returned unverified.
///
/// Returns an empty list if the page cannot be fetched or parsed.
static Future<List<String>> scrape(String url,
    {parseHead = true, parseBody = true, verifyCandidates = true}) async {
  // Fetch and parse the page. Failures are deliberately swallowed: an
  // unreachable or unparsable page simply has no feeds.
  var document;
  try {
    final response = await http.get(Uri.parse(url));
    document = parse(response.body);
  } catch (e) {
    return <String>[];
  }

  // Base for resolving relative hrefs; the fragment is irrelevant to that.
  final pageUri = Uri.parse(url).removeFragment();

  // A set literal keeps insertion order and removes duplicates as we go.
  final candidates = <String>{};

  // Resolves [href] against the page URL and records it as a candidate.
  // Uri.resolve handles root-relative, document-relative, protocol-relative
  // and absolute references, and keeps a non-default port.
  void addCandidate(String href, {bool stripTrailingSlash = false}) {
    Uri resolved;
    try {
      resolved = pageUri.resolve(href.trim());
    } on FormatException {
      return; // Malformed link: cannot be a usable feed URL.
    }
    // Skip mailto:, javascript:, etc. - only HTTP(S) can be fetched.
    if (resolved.scheme != 'http' && resolved.scheme != 'https') return;
    var target = resolved.toString();
    if (stripTrailingSlash && target.endsWith('/')) {
      // Drop exactly one character (the slash itself).
      target = target.substring(0, target.length - 1);
    }
    candidates.add(target);
  }

  // Look for feed candidates in head
  if (parseHead) {
    for (var link in document.querySelectorAll("link[rel='alternate']")) {
      final type = link.attributes['type'];
      final href = link.attributes['href'];
      if (type == null || href == null) continue;
      if (type.contains('rss') || type.contains('xml')) {
        addCandidate(href);
      }
    }
  }

  // Look for feed candidates in body
  if (parseBody) {
    for (var anchor in document.querySelectorAll('a')) {
      final href = anchor.attributes['href'];
      if (href == null) continue;
      if (href.contains('rss') ||
          href.contains('xml') ||
          href.contains('feed')) {
        // Trailing slashes are trimmed so `/feed` and `/feed/` deduplicate.
        addCandidate(href, stripTrailingSlash: true);
      }
    }
  }

  final results = <String>[];
  if (verifyCandidates) {
    // Keep only candidates that can actually be requested.
    for (var candidate in candidates) {
      try {
        await http.get(Uri.parse(candidate));
      } catch (e) {
        continue;
      }
      results.add(candidate);
    }
  } else {
    // Verification disabled: hand back every candidate as found.
    results.addAll(candidates);
  }
  return results;
}