dart_spider 0.0.9 copy "dart_spider: ^0.0.9" to clipboard
dart_spider: ^0.0.9 copied to clipboard

Simple way to scrape and extract data from websites in a structured way.

example/example.dart

import 'package:dart_spider/dart_spider.dart';
import 'dart:convert';

/// Scrapes a single books.toscrape.com page and prints the result.
///
/// Builds a per-domain [Config], passes it together with the target URL to
/// [Spider.scrapeFromUrl], then prints either the reported error or the
/// JSON-encoded result.
///
/// Returns a [Future] so that callers can await completion and observe
/// failures instead of having them dropped by a `void` async function.
Future<void> main() async {
  // Map of Config objects keyed by domain, so that several domains can be
  // configured side by side.
  final config = <String, Config>{
    "books.toscrape.com": Config(
      parsers: {
        // Parsers for the "homepage" target. "books" selects one element per
        // `ol.row > li`; every other parser in this list names "books" as its
        // parent.
        "homepage": [
          Parser(
            id: "books",
            parent: ["_root"],
            type: ParserType.element,
            selector: "ol.row > li",
            // NOTE(review): presumably keeps this intermediate element out of
            // the emitted result — confirm against the package docs.
            isPrivate: true,
            multiple: true,
          ),
          Parser(
            id: "name",
            parent: ["books"],
            type: ParserType.attribute,
            // NOTE(review): the `::title` suffix looks like it names the
            // attribute to read — confirm against the package docs.
            selector: "h3 a::title",
          ),
          Parser(
            id: "link",
            parent: ["books"],
            type: ParserType.url,
            selector: "a",
            optional: Optional(
              prepend: "http://books.toscrape.com/",
            ),
          ),
          Parser(
            id: "image",
            parent: ["books"],
            type: ParserType.image,
            selector: "img",
            optional: Optional(
              prepend: "http://books.toscrape.com/",
            ),
          ),
          Parser(
            id: "price",
            parent: ["books"],
            type: ParserType.text,
            selector: "p.price_color",
            optional: Optional(
              // The pattern matches every non-ASCII character; each match is
              // replaced with the empty string.
              regexReplace: r"[^\x00-\x7F]",
              regexReplaceWith: "",
            ),
          ),
          Parser(
            id: "in_stock",
            parent: ["books"],
            type: ParserType.text,
            selector: "p.availability",
            optional: Optional(
              match: ['In stock', 'In Stock'],
            ),
          ),
        ],
        // Parsers for the "product_page" target (a single book's page).
        "product_page": [
          Parser(
            id: "breadcrumb",
            parent: ["_root"],
            type: ParserType.text,
            selector: "ul.breadcrumb > li",
            multiple: true,
          ),
          Parser(
            id: "title",
            parent: ["_root"],
            type: ParserType.text,
            selector: "h1",
          ),
          Parser(
            id: "price",
            parent: ["_root"],
            type: ParserType.text,
            selector: "div.product_main > p.price_color",
            optional: Optional(
              // Same non-ASCII stripping as the homepage "price" parser.
              regexReplace: r"[^\x00-\x7F]",
              regexReplaceWith: "",
            ),
          ),
          Parser(
            id: "image",
            parent: ["_root"],
            type: ParserType.image,
            selector: "div#product_gallery img",
            optional: Optional(
              // Swaps the relative "../../" prefix for the site's base URL.
              replace: {
                "../../": "http://books.toscrape.com/",
              },
            ),
          ),
          Parser(
            id: "in_stock",
            parent: ["_root"],
            type: ParserType.text,
            selector: "div.product_main > p.availability",
            optional: Optional(
              match: ['In stock', 'In Stock'],
            ),
          ),
          // Intermediate step: the sibling following `div#product_description`
          // serves as the parent of the "description" parser below.
          Parser(
            id: "description_element",
            parent: ["_root"],
            type: ParserType.sibling,
            selector: "div#product_description",
            optional: SiblingOptional(
              direction: SiblingDirection.next,
            ),
            isPrivate: true,
          ),
          Parser(
            id: "description",
            parent: ["description_element"],
            type: ParserType.text,
            selector: "p",
          ),
          Parser(
            id: "product_information",
            parent: ["_root"],
            type: ParserType.table,
            selector: "table tr",
            optional: TableOptional(
              keys: "th",
              values: "td",
            ),
          ),
        ],
      },
      targets: [
        Target(
          name: "product_page",
          where: ["/catalogue"],
        ),
        // Target "homepage" is listed last because ["/"] matches any given
        // URL.
        Target(
          name: "homepage",
          where: ["/"],
        ),
      ],
    ),
  };

  // URL to scrape.

  // To scrape all books:
  // const url = "http://books.toscrape.com/";

  // To scrape a specific book:
  const url =
      "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html";

  // Scrape from the URL.
  final res = await Spider.scrapeFromUrl(config, url);

  // Check whether the scrape succeeded.
  if (res is SpiderError) {
    // Print out the error.
    print(res.error);
  } else {
    // Success: print the result as JSON.
    print(jsonEncode(res));
  }
}
4
likes
140
points
13
downloads

Publisher

Verified publisher: pricehistoryapp.com

Weekly Downloads

Simple way to scrape and extract data from websites in a structured way.

Homepage

Documentation

API reference

License

Apache-2.0 (license)

Dependencies

html, http

More

Packages that depend on dart_spider