scrapingHandler static method
Implementation
static Future<dynamic> scrapingHandler(
  String url,
  Config config, {
  String? html,
}) async {
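  // Flow: build headers -> resolve target -> clean URL -> obtain the DOM
  // (fetched or caller-supplied) -> minify -> parse -> attach metadata.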
  // Build the request headers, starting with a generated User-Agent and
  // merging any custom headers from the config.
  Map<String, String> headers = {
    HttpHeaders.userAgentHeader: generateUserAgent(config.userAgent),
  };
  if (config.headers != null) {
    headers.addAll(config.headers!);
  }
  // Look up a target that matches this URL; bail out if none is configured.
  Target? target = fetchTarget(config.targets, url);
  if (target == null) {
    return SpiderError(
      500,
      'This URL is not supported. Please try another URL.',
    );
  }
  // Use the configured proxy, if any.
  String? proxy = config.proxy;
  // Clean the URL before any further processing.
  url = runCleaner(url, target.cleaner);
  String dom;
  if (target.needsHtml) {
    // An HTML document is required. Fetch it unless the caller already
    // supplied one and forceFetch is off.
    if (config.forceFetch || html == null || html.isEmpty) {
      String? data = await getRequest(url, proxy, headers);
      if (data == null) {
        return SpiderError(500, 'Unable to find data from url.');
      }
      dom = data;
    } else {
      dom = html;
    }
  } else {
    // When needsHtml is false, the URL itself serves as the DOM input.
    dom = url;
  }
  // Collect every parser configured for this target.
  List<Parser> allParsers = [...config.parsers[target.name]!];
  // Root parsers (children of '_root') are the entry points for parsing.
  List<Parser> rootParsers = childFinder('_root', allParsers);
  // Minify the (potentially huge) HTML page before parsing.
  dom = htmlMinify(url, dom, target);
  // Run the parsers and attach metadata to the result.
  Map<String, dynamic> result =
      await parsingHandler(allParsers, rootParsers, dom);
  // Keep a 'url' key produced by the parsers; otherwise fall back to the
  // request URL. Always record which target handled the request.
  if (!result.containsKey('url')) {
    result['url'] = url;
  }
  result['target'] = target.name;
  return result;
}
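
A minimal usage sketch follows. Because scrapingHandler is static, it would normally be called through its enclosing class (not shown on this page). The example URL and the buildConfig helper are hypothetical, and Config construction is elided since its constructor is not part of this section.

Future<void> main() async {
  // Hypothetical helper: build a Config with targets and parsers set up.
  final Config config = buildConfig();
  // Prefix the call with the enclosing class name in real code.
  final dynamic result = await scrapingHandler(
    'https://example.com/article/123', // hypothetical URL
    config,
    // html: prefetchedHtml, // optional: skips the fetch unless forceFetch is on
  );
  if (result is SpiderError) {
    print('Scraping failed: $result');
  } else {
    // On success the result is a Map<String, dynamic> with the parsed
    // fields plus 'url' and 'target' keys.
    print(result);
  }
}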